Diffstat (limited to 'sys/contrib')
514 files changed, 16438 insertions, 4456 deletions
diff --git a/sys/contrib/dev/acpica/changes.txt b/sys/contrib/dev/acpica/changes.txt index 435540b254f1..4e3cf4f2f41c 100644 --- a/sys/contrib/dev/acpica/changes.txt +++ b/sys/contrib/dev/acpica/changes.txt @@ -1,11 +1,29 @@ ---------------------------------------- +7 August 2025. Summary of changes for version 20250807: + +Major changes: + +Added option to skip the global lock for SMM - Huacai Chen + +Fixed non-NUL terminated string implementations - Ahmed Salem + +Fixed CCEL and CDAT templates - Ahmed Salem + +Fixed a major Linux kernel bug (UAF) that was triggered by unequal number of method parameters (definition) vs arguments (invocation) in different places - Peter Williams, Hans de Goede, Rafael Wysocki + +Define distinct D3 states (D3Hot and D3Cold) that help clarify the device behavior support - Aymeric Wibo + +A few cleanups, improvements to existing table supports, small fixes, spelling corrections etc. + + +---------------------------------------- 4 April 2025. Summary of changes for version 20250404: Major changes: Update all the copyright continuation year to 2025 in the license header of all files -Add complete support for 3 new ACPI tables ? MRRM,ERDT and RIMT (Tony Luck & V L Sunil) +Add complete support for 3 new ACPI tables - MRRM,ERDT and RIMT (Tony Luck & V L Sunil) Add a license file to the project which is a great improvement (Dionna Glaze) @@ -21,11 +39,11 @@ Major changes: Fix 2 critical CVE addressing memory leaks - Seunghun Han -EINJ V2 updates ? Zaid Alali (Ampere Computing) +EINJ V2 updates - Zaid Alali (Ampere Computing) -CDAT updates ? Ira Weiny (Intel Corporation) +CDAT updates - Ira Weiny (Intel Corporation) -Fix mutex handling, don?t release ones that were never acquired ? Daniil Tatianin +Fix mutex handling, do not release ones that were never acquired - Daniil Tatianin Experiment with new tag name format Ryyyy_mm_dd to solve chronological sorting problems @@ -39,7 +57,7 @@ Fix the acpixf.h file which caused issues for the last release (before this) 202 Fix the pointer offset for the SLIC table -Verify the local environment and GitHub commits are all in sync which was a problem with the second from last release (before this)20240322 (aka 20240323 – date issue) +Verify the local environment and GitHub commits are all in sync which was a problem with the second from last release (before this)20240322 (aka 20240323 - date issue) diff --git a/sys/contrib/dev/acpica/common/adisasm.c b/sys/contrib/dev/acpica/common/adisasm.c index 96cd6c7f5d3c..83125098cbd1 100644 --- a/sys/contrib/dev/acpica/common/adisasm.c +++ b/sys/contrib/dev/acpica/common/adisasm.c @@ -481,12 +481,12 @@ AdDisassembleOneTable ( "FieldName : FieldValue (in hex)\n */\n\n"); AcpiDmDumpDataTable (Table); - fprintf (stderr, "Acpi Data Table [%4.4s] decoded\n", + fprintf (stdout, "Acpi Data Table [%4.4s] decoded\n", AcpiGbl_CDAT ? 
(char *) AcpiGbl_CDAT : Table->Signature); if (File) { - fprintf (stderr, "Formatted output: %s - %u bytes\n", + fprintf (stdout, "Formatted output: %s - %u bytes\n", DisasmFilename, CmGetFileSize (File)); } @@ -584,16 +584,16 @@ AdDisassembleOneTable ( AcpiDmDumpDataTable (Table); - fprintf (stderr, "Disassembly completed\n"); + fprintf (stdout, "Disassembly completed\n"); if (File) { - fprintf (stderr, "ASL Output: %s - %u bytes\n", + fprintf (stdout, "ASL Output: %s - %u bytes\n", DisasmFilename, CmGetFileSize (File)); } if (AslGbl_MapfileFlag) { - fprintf (stderr, "%14s %s - %u bytes\n", + fprintf (stdout, "%14s %s - %u bytes\n", AslGbl_FileDescs[ASL_FILE_MAP_OUTPUT].ShortDescription, AslGbl_Files[ASL_FILE_MAP_OUTPUT].Filename, FlGetFileSize (ASL_FILE_MAP_OUTPUT)); @@ -630,7 +630,7 @@ AdReparseOneTable ( ACPI_COMMENT_ADDR_NODE *AddrListHead; - fprintf (stderr, + fprintf (stdout, "\nFound %u external control methods, " "reparsing with new information\n", AcpiDmGetUnresolvedExternalMethodCount ()); diff --git a/sys/contrib/dev/acpica/common/ahtable.c b/sys/contrib/dev/acpica/common/ahtable.c index 898b2d09f609..587bf61016f0 100644 --- a/sys/contrib/dev/acpica/common/ahtable.c +++ b/sys/contrib/dev/acpica/common/ahtable.c @@ -265,6 +265,7 @@ const AH_TABLE AcpiGbl_SupportedTables[] = {ACPI_SIG_SSDT, "Secondary System Description Table (AML table)"}, {ACPI_SIG_STAO, "Status Override Table"}, {ACPI_SIG_SVKL, "Storage Volume Key Location Table"}, + {ACPI_SIG_SWFT, "SoundWire File Table"}, {ACPI_SIG_TCPA, "Trusted Computing Platform Alliance Table"}, {ACPI_SIG_TDEL, "TD-Event Log Table"}, {ACPI_SIG_TPM2, "Trusted Platform Module hardware interface Table"}, diff --git a/sys/contrib/dev/acpica/common/dmtable.c b/sys/contrib/dev/acpica/common/dmtable.c index fcff97a304ae..702f4f7965e4 100644 --- a/sys/contrib/dev/acpica/common/dmtable.c +++ b/sys/contrib/dev/acpica/common/dmtable.c @@ -721,6 +721,7 @@ const ACPI_DMTABLE_DATA AcpiDmTableData[] = {ACPI_SIG_SRAT, NULL, AcpiDmDumpSrat, DtCompileSrat, TemplateSrat}, {ACPI_SIG_STAO, NULL, AcpiDmDumpStao, DtCompileStao, TemplateStao}, {ACPI_SIG_SVKL, AcpiDmTableInfoSvkl, AcpiDmDumpSvkl, DtCompileSvkl, TemplateSvkl}, + {ACPI_SIG_SWFT, NULL, NULL, NULL, NULL}, {ACPI_SIG_TCPA, NULL, AcpiDmDumpTcpa, DtCompileTcpa, TemplateTcpa}, {ACPI_SIG_TDEL, AcpiDmTableInfoTdel, NULL, NULL, TemplateTdel}, {ACPI_SIG_TPM2, AcpiDmTableInfoTpm2, AcpiDmDumpTpm2, DtCompileTpm2, TemplateTpm2}, diff --git a/sys/contrib/dev/acpica/common/dmtbdump2.c b/sys/contrib/dev/acpica/common/dmtbdump2.c index 822920d2ea94..d29a60be0f67 100644 --- a/sys/contrib/dev/acpica/common/dmtbdump2.c +++ b/sys/contrib/dev/acpica/common/dmtbdump2.c @@ -2637,7 +2637,7 @@ AcpiDmDumpRhct ( RhctIsaString, RhctIsaString->IsaLength, AcpiDmTableInfoRhctIsa1); if (Subtable->Length > IsaPadOffset) { - Status = AcpiDmDumpTable (Table->Length, Offset + SubtableOffset, + Status = AcpiDmDumpTable (Table->Length, Offset + IsaPadOffset, ACPI_ADD_PTR (UINT8, Subtable, IsaPadOffset), (Subtable->Length - IsaPadOffset), AcpiDmTableInfoRhctIsaPad); } diff --git a/sys/contrib/dev/acpica/common/dmtbinfo2.c b/sys/contrib/dev/acpica/common/dmtbinfo2.c index 9ecf877fcfb0..b7c6d3b8d536 100644 --- a/sys/contrib/dev/acpica/common/dmtbinfo2.c +++ b/sys/contrib/dev/acpica/common/dmtbinfo2.c @@ -2180,7 +2180,7 @@ ACPI_DMTABLE_INFO AcpiDmTableInfoRhct[] = ACPI_DMTABLE_INFO AcpiDmTableInfoRhctNodeHdr[] = { {ACPI_DMT_RHCT, ACPI_RHCTH_OFFSET (Type), "Subtable Type", 0}, - {ACPI_DMT_UINT16, ACPI_RHCTH_OFFSET (Length), "Length", 0}, + 
{ACPI_DMT_UINT16, ACPI_RHCTH_OFFSET (Length), "Length", DT_LENGTH}, {ACPI_DMT_UINT16, ACPI_RHCTH_OFFSET (Revision), "Revision", 0}, ACPI_DMT_TERMINATOR }; diff --git a/sys/contrib/dev/acpica/common/dmtbinfo3.c b/sys/contrib/dev/acpica/common/dmtbinfo3.c index 75b580e0d890..0935fc86aff9 100644 --- a/sys/contrib/dev/acpica/common/dmtbinfo3.c +++ b/sys/contrib/dev/acpica/common/dmtbinfo3.c @@ -200,7 +200,7 @@ ACPI_DMTABLE_INFO AcpiDmTableInfoCcel[] = { {ACPI_DMT_UINT8, ACPI_CCEL_OFFSET (CCType), "CC Type", 0}, {ACPI_DMT_UINT8, ACPI_CCEL_OFFSET (CCSubType), "CC Sub Type", 0}, - {ACPI_DMT_UINT32, ACPI_CCEL_OFFSET (Reserved), "Reserved", 0}, + {ACPI_DMT_UINT16, ACPI_CCEL_OFFSET (Reserved), "Reserved", 0}, {ACPI_DMT_UINT64, ACPI_CCEL_OFFSET (LogAreaMinimumLength), "Log Area Minimum Length", 0}, {ACPI_DMT_UINT64, ACPI_CCEL_OFFSET (LogAreaStartAddress), "Log Area Start Address", 0}, ACPI_DMT_TERMINATOR diff --git a/sys/contrib/dev/acpica/compiler/aslanalyze.c b/sys/contrib/dev/acpica/compiler/aslanalyze.c index 17e2674817a9..625611a630de 100644 --- a/sys/contrib/dev/acpica/compiler/aslanalyze.c +++ b/sys/contrib/dev/acpica/compiler/aslanalyze.c @@ -572,10 +572,22 @@ ApCheckForGpeNameConflict ( ACPI_PARSE_OBJECT *NextOp; UINT32 GpeNumber; char Name[ACPI_NAMESEG_SIZE + 1]; - char Target[ACPI_NAMESEG_SIZE]; + char Target[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; - /* Need a null-terminated string version of NameSeg */ + /** + * Need a null-terminated string version of NameSeg + * + * NOTE: during a review on Name[ACPI_NAMESEG_SIZE + 1] having an extra + * byte[1], compiler testing exhibited a difference in behavior between + * GCC and Clang[2] (at least; MSVC may also exhibit the same) in + * how optimization is done. The extra byte is needed to ensure + * the signature does not get mangled, subsequently avoiding + * GpeNumber being a completely different return value from strtoul. 
+ * + * [1] https://github.com/acpica/acpica/pull/1019#discussion_r2058687704 + * [2] https://github.com/acpica/acpica/pull/1019#discussion_r2061953039 + */ ACPI_MOVE_32_TO_32 (Name, Op->Asl.NameSeg); Name[ACPI_NAMESEG_SIZE] = 0; diff --git a/sys/contrib/dev/acpica/compiler/aslrestype2s.c b/sys/contrib/dev/acpica/compiler/aslrestype2s.c index 096862290384..f47402d4e025 100644 --- a/sys/contrib/dev/acpica/compiler/aslrestype2s.c +++ b/sys/contrib/dev/acpica/compiler/aslrestype2s.c @@ -1469,7 +1469,7 @@ RsDoCsi2SerialBusDescriptor ( case 2: /* Local Port Instance [Integer] (_PRT) */ - RsSetFlagBits16 ((UINT16 *) &Descriptor->Csi2SerialBus.TypeSpecificFlags, InitializerOp, 0, 0); + RsSetFlagBits16 ((UINT16 *) &Descriptor->Csi2SerialBus.TypeSpecificFlags, InitializerOp, 2, 0); RsCreateMultiBitField (InitializerOp, ACPI_RESTAG_LOCALPORT, CurrentByteOffset + ASL_RESDESC_OFFSET (Csi2SerialBus.TypeSpecificFlags), 2, 6); break; diff --git a/sys/contrib/dev/acpica/compiler/dttable2.c b/sys/contrib/dev/acpica/compiler/dttable2.c index 6203a382ad62..754880346299 100644 --- a/sys/contrib/dev/acpica/compiler/dttable2.c +++ b/sys/contrib/dev/acpica/compiler/dttable2.c @@ -1929,24 +1929,30 @@ DtCompileRhct ( { ACPI_STATUS Status; ACPI_RHCT_NODE_HEADER *RhctHeader; - ACPI_RHCT_HART_INFO *RhctHartInfo = NULL; + ACPI_RHCT_HART_INFO *RhctHartInfo; DT_SUBTABLE *Subtable; DT_SUBTABLE *ParentTable; ACPI_DMTABLE_INFO *InfoTable; DT_FIELD **PFieldList = (DT_FIELD **) List; DT_FIELD *SubtableStart; + ACPI_TABLE_RHCT *Table; + BOOLEAN FirstNode = TRUE; /* Compile the main table */ + ParentTable = DtPeekSubtable (); Status = DtCompileTable (PFieldList, AcpiDmTableInfoRhct, &Subtable); if (ACPI_FAILURE (Status)) { return (Status); } + DtInsertSubtable (ParentTable, Subtable); + Table = ACPI_CAST_PTR (ACPI_TABLE_RHCT, ParentTable->Buffer); + Table->NodeCount = 0; + Table->NodeOffset = sizeof (ACPI_TABLE_RHCT); - ParentTable = DtPeekSubtable (); while (*PFieldList) { SubtableStart = *PFieldList; @@ -1961,7 +1967,10 @@ DtCompileRhct ( } DtInsertSubtable (ParentTable, Subtable); RhctHeader = ACPI_CAST_PTR (ACPI_RHCT_NODE_HEADER, Subtable->Buffer); - RhctHeader->Length = (UINT16)(Subtable->Length); + + DtPushSubtable (Subtable); + ParentTable = DtPeekSubtable (); + Table->NodeCount++; switch (RhctHeader->Type) { @@ -1999,37 +2008,54 @@ DtCompileRhct ( return (Status); } DtInsertSubtable (ParentTable, Subtable); - RhctHeader->Length += (UINT16)(Subtable->Length); + if (FirstNode) + { + Table->NodeOffset = ACPI_PTR_DIFF(ParentTable->Buffer, Table); + FirstNode = FALSE; + } /* Compile RHCT subtable additionals */ switch (RhctHeader->Type) { - case ACPI_RHCT_NODE_TYPE_HART_INFO: + case ACPI_RHCT_NODE_TYPE_ISA_STRING: - RhctHartInfo = ACPI_SUB_PTR (ACPI_RHCT_HART_INFO, - Subtable->Buffer, sizeof (ACPI_RHCT_NODE_HEADER)); - if (RhctHartInfo) + /* + * Padding - Variable-length data + * Optionally allows the padding of the ISA string to be used + * for filling this field. 
+ */ + Status = DtCompileTable (PFieldList, AcpiDmTableInfoRhctIsaPad, + &Subtable); + if (ACPI_FAILURE (Status)) + { + return (Status); + } + if (Subtable) { + DtInsertSubtable (ParentTable, Subtable); + } + break; - RhctHartInfo->NumOffsets = 0; - while (*PFieldList) - { - Status = DtCompileTable (PFieldList, - AcpiDmTableInfoRhctHartInfo2, &Subtable); - if (ACPI_FAILURE (Status)) - { - return (Status); - } - if (!Subtable) - { - break; - } + case ACPI_RHCT_NODE_TYPE_HART_INFO: - DtInsertSubtable (ParentTable, Subtable); - RhctHeader->Length += (UINT16)(Subtable->Length); - RhctHartInfo->NumOffsets++; + RhctHartInfo = ACPI_CAST_PTR (ACPI_RHCT_HART_INFO, + Subtable->Buffer); + RhctHartInfo->NumOffsets = 0; + while (*PFieldList) + { + Status = DtCompileTable (PFieldList, + AcpiDmTableInfoRhctHartInfo2, &Subtable); + if (ACPI_FAILURE (Status)) + { + return (Status); } + if (!Subtable) + { + break; + } + DtInsertSubtable (ParentTable, Subtable); + RhctHartInfo->NumOffsets++; } break; @@ -2037,6 +2063,9 @@ DtCompileRhct ( break; } + + DtPopSubtable (); + ParentTable = DtPeekSubtable (); } return (AE_OK); diff --git a/sys/contrib/dev/acpica/compiler/dttemplate.c b/sys/contrib/dev/acpica/compiler/dttemplate.c index 67b13bb82d1b..d7140712d4e6 100644 --- a/sys/contrib/dev/acpica/compiler/dttemplate.c +++ b/sys/contrib/dev/acpica/compiler/dttemplate.c @@ -255,7 +255,7 @@ DtCreateTemplates ( if (AcpiGbl_Optind < 3) { - fprintf (stderr, "Creating default template: [DSDT]\n"); + fprintf (stdout, "Creating default template: [DSDT]\n"); Status = DtCreateOneTemplateFile (ACPI_SIG_DSDT, 0); goto Exit; } @@ -411,7 +411,7 @@ DtCreateAllTemplates ( ACPI_STATUS Status; - fprintf (stderr, "Creating all supported Template files\n"); + fprintf (stdout, "Creating all supported Template files\n"); /* Walk entire ACPI table data structure */ @@ -421,8 +421,13 @@ DtCreateAllTemplates ( if (TableData->Template) { - Status = DtCreateOneTemplate (TableData->Signature, - 0, TableData); + if (ACPI_COMPARE_NAMESEG (TableData->Signature, ACPI_SIG_CDAT)) + /* Special handling of CDAT */ + Status = DtCreateOneTemplate (TableData->Signature, + 0, NULL); + else + Status = DtCreateOneTemplate (TableData->Signature, + 0, TableData); if (ACPI_FAILURE (Status)) { return (Status); @@ -563,7 +568,7 @@ DtCreateOneTemplate ( } else { - /* Special ACPI tables - DSDT, SSDT, OSDT, FACS, RSDP */ + /* Special ACPI tables - DSDT, SSDT, OSDT, FACS, RSDP, CDAT */ AcpiOsPrintf (" (AML byte code table)\n"); AcpiOsPrintf (" */\n"); @@ -621,6 +626,11 @@ DtCreateOneTemplate ( AcpiDmDumpDataTable (ACPI_CAST_PTR (ACPI_TABLE_HEADER, TemplateRsdp)); } + else if (ACPI_COMPARE_NAMESEG (Signature, ACPI_SIG_CDAT)) + { + AcpiDmDumpCdat (ACPI_CAST_PTR (ACPI_TABLE_HEADER, + TemplateCdat)); + } else { fprintf (stderr, @@ -632,14 +642,14 @@ DtCreateOneTemplate ( if (TableCount == 0) { - fprintf (stderr, + fprintf (stdout, "Created ACPI table template for [%4.4s], " "written to \"%s\"\n", Signature, DisasmFilename); } else { - fprintf (stderr, + fprintf (stdout, "Created ACPI table templates for [%4.4s] " "and %u [SSDT] in same file, written to \"%s\"\n", Signature, TableCount, DisasmFilename); diff --git a/sys/contrib/dev/acpica/compiler/dttemplate.h b/sys/contrib/dev/acpica/compiler/dttemplate.h index 0fdd90f73a23..51a34be5c36b 100644 --- a/sys/contrib/dev/acpica/compiler/dttemplate.h +++ b/sys/contrib/dev/acpica/compiler/dttemplate.h @@ -389,7 +389,7 @@ const unsigned char TemplateBoot[] = const unsigned char TemplateCcel[] = { 
0x43,0x43,0x45,0x4C,0x38,0x00,0x00,0x00, /* 00000000 "CCEL8..." */ - 0x04,0x1C,0x49,0x4E,0x54,0x45,0x4C,0x20, /* 00000008 "..INTEL " */ + 0x04,0x2E,0x49,0x4E,0x54,0x45,0x4C,0x20, /* 00000008 "..INTEL " */ 0x54,0x65,0x6D,0x70,0x6C,0x61,0x74,0x65, /* 00000010 "Template" */ 0x00,0x00,0x00,0x00,0x49,0x4E,0x54,0x4C, /* 00000018 "....INTL" */ 0x30,0x09,0x21,0x20,0x00,0x00,0x00,0x00, /* 00000020 "0.! ...." */ @@ -1951,25 +1951,25 @@ const unsigned char TemplateRgrt[] = const unsigned char TemplateRhct[] = { - 0x52,0x48,0x43,0x54,0x96,0x00,0x00,0x00, /* 00000000 "RHCT|..." */ - 0x01,0x24,0x4F,0x45,0x4D,0x43,0x41,0x00, /* 00000008 "..OEMCA." */ + 0x52,0x48,0x43,0x54,0x96,0x00,0x00,0x00, /* 00000000 "RHCT...." */ + 0x01,0x6D,0x4F,0x45,0x4D,0x43,0x41,0x00, /* 00000008 ".mOEMCA." */ 0x54,0x45,0x4D,0x50,0x4C,0x41,0x54,0x45, /* 00000010 "TEMPLATE" */ 0x01,0x00,0x00,0x00,0x49,0x4E,0x54,0x4C, /* 00000018 "....INTL" */ - 0x28,0x09,0x22,0x20,0x00,0x00,0x00,0x00, /* 00000020 "... ...." */ + 0x04,0x04,0x25,0x20,0x00,0x00,0x00,0x00, /* 00000020 "..% ...." */ 0x80,0x96,0x98,0x00,0x00,0x00,0x00,0x00, /* 00000028 "........" */ - 0x02,0x00,0x00,0x00,0x38,0x00,0x00,0x00, /* 00000030 "....8..." */ - 0x00,0x00,0x34,0x00,0x01,0x00,0x2B,0x00, /* 00000038 "..4...*." */ + 0x04,0x00,0x00,0x00,0x38,0x00,0x00,0x00, /* 00000030 "....8..." */ + 0x00,0x00,0x34,0x00,0x01,0x00,0x2B,0x00, /* 00000038 "..4...+." */ 0x72,0x76,0x36,0x34,0x69,0x6D,0x61,0x66, /* 00000040 "rv64imaf" */ 0x64,0x63,0x68,0x5F,0x7A,0x69,0x63,0x73, /* 00000048 "dch_zics" */ 0x72,0x5F,0x7A,0x69,0x66,0x65,0x6E,0x63, /* 00000050 "r_zifenc" */ 0x65,0x69,0x5F,0x7A,0x62,0x61,0x5F,0x7A, /* 00000058 "ei_zba_z" */ 0x62,0x62,0x5F,0x7A,0x62,0x63,0x5F,0x7A, /* 00000060 "bb_zbc_z" */ - 0x62,0x73,0x00,0x00,0xFF,0xFF,0x18,0x00, /* 00000068 "bs......" */ - 0x01,0x00,0x03,0x00,0x00,0x00,0x00,0x00, /* 00000070 "........" */ - 0x38,0x00,0x00,0x00,0x7c,0x00,0x00,0x00, /* 00000078 "........" */ - 0x8E,0x00,0x00,0x00,0x01,0x00,0x0A,0x00, /* 00000080 "........" */ - 0x01,0x00,0x00,0x06,0x06,0x06,0x02,0x00, /* 00000088 "........" */ - 0x08,0x00,0x01,0x00,0x00,0x02 /* 00000090 "........" */ + 0x62,0x73,0x00,0x00,0x01,0x00,0x0A,0x00, /* 00000068 "bs......" */ + 0x01,0x00,0x00,0x06,0x06,0x06,0x02,0x00, /* 00000070 "........" */ + 0x08,0x00,0x01,0x00,0x00,0x02,0xFF,0xFF, /* 00000078 "........" */ + 0x18,0x00,0x01,0x00,0x03,0x00,0x00,0x00, /* 00000080 "........" */ + 0x00,0x00,0x3B,0x00,0x00,0x00,0x6C,0x00, /* 00000088 "..;...l." */ + 0x00,0x00,0x76,0x00,0x00,0x00 /* 00000090 "..v..." 
*/ }; const unsigned char TemplateRimt[] = diff --git a/sys/contrib/dev/acpica/compiler/dtutils.c b/sys/contrib/dev/acpica/compiler/dtutils.c index f2463f74b8fc..18ea18cefdd6 100644 --- a/sys/contrib/dev/acpica/compiler/dtutils.c +++ b/sys/contrib/dev/acpica/compiler/dtutils.c @@ -623,6 +623,7 @@ DtGetFieldLength ( case ACPI_DMT_NFIT: case ACPI_DMT_PCI_PATH: case ACPI_DMT_PHAT: + case ACPI_DMT_RHCT: ByteLength = 2; break; diff --git a/sys/contrib/dev/acpica/components/disassembler/dmresrcl2.c b/sys/contrib/dev/acpica/components/disassembler/dmresrcl2.c index dd8cf4889885..551cf8178d94 100644 --- a/sys/contrib/dev/acpica/components/disassembler/dmresrcl2.c +++ b/sys/contrib/dev/acpica/components/disassembler/dmresrcl2.c @@ -778,7 +778,7 @@ AcpiDmCsi2SerialBusDescriptor ( AcpiOsPrintf (" 0x%2.2X, 0x%2.2X,\n", Resource->Csi2SerialBus.TypeSpecificFlags & 0x03, - Resource->Csi2SerialBus.TypeSpecificFlags & 0xFC); + (Resource->Csi2SerialBus.TypeSpecificFlags & 0xFC) >> 2); /* ResourceSource is a required field */ diff --git a/sys/contrib/dev/acpica/components/dispatcher/dsmethod.c b/sys/contrib/dev/acpica/components/dispatcher/dsmethod.c index 8b6efc070b1b..becdb95f8b83 100644 --- a/sys/contrib/dev/acpica/components/dispatcher/dsmethod.c +++ b/sys/contrib/dev/acpica/components/dispatcher/dsmethod.c @@ -646,8 +646,6 @@ AcpiDsCallControlMethod ( ACPI_WALK_STATE *NextWalkState = NULL; ACPI_OPERAND_OBJECT *ObjDesc; ACPI_EVALUATE_INFO *Info; - UINT32 i; - ACPI_FUNCTION_TRACE_PTR (DsCallControlMethod, ThisWalkState); @@ -670,6 +668,23 @@ AcpiDsCallControlMethod ( return_ACPI_STATUS (AE_NULL_OBJECT); } + if (ThisWalkState->NumOperands < ObjDesc->Method.ParamCount) + { + ACPI_ERROR ((AE_INFO, "Missing argument(s) for method [%4.4s]", + AcpiUtGetNodeName (MethodNode))); + + return_ACPI_STATUS (AE_AML_TOO_FEW_ARGUMENTS); + } + + else if (ThisWalkState->NumOperands > ObjDesc->Method.ParamCount) + { + ACPI_ERROR ((AE_INFO, "Too many arguments for method [%4.4s]", + AcpiUtGetNodeName (MethodNode))); + + return_ACPI_STATUS (AE_AML_TOO_MANY_ARGUMENTS); + } + + /* Init for new method, possibly wait on method mutex */ Status = AcpiDsBeginMethodExecution ( @@ -726,15 +741,7 @@ AcpiDsCallControlMethod ( * Delete the operands on the previous walkstate operand stack * (they were copied to new objects) */ - for (i = 0; i < ObjDesc->Method.ParamCount; i++) - { - AcpiUtRemoveReference (ThisWalkState->Operands [i]); - ThisWalkState->Operands [i] = NULL; - } - - /* Clear the operand stack */ - - ThisWalkState->NumOperands = 0; + AcpiDsClearOperands (ThisWalkState); ACPI_DEBUG_PRINT ((ACPI_DB_DISPATCH, "**** Begin nested execution of [%4.4s] **** WalkState=%p\n", diff --git a/sys/contrib/dev/acpica/components/dispatcher/dsmthdat.c b/sys/contrib/dev/acpica/components/dispatcher/dsmthdat.c index 42e1aa505d02..2c45e8c91f57 100644 --- a/sys/contrib/dev/acpica/components/dispatcher/dsmthdat.c +++ b/sys/contrib/dev/acpica/components/dispatcher/dsmthdat.c @@ -357,6 +357,7 @@ AcpiDsMethodDataInitArgs ( Index++; } + AcpiExTraceArgs(Params, Index); ACPI_DEBUG_PRINT ((ACPI_DB_EXEC, "%u args passed to method\n", Index)); return_ACPI_STATUS (AE_OK); diff --git a/sys/contrib/dev/acpica/components/events/evglock.c b/sys/contrib/dev/acpica/components/events/evglock.c index 872e7b499a8f..395ca14fb315 100644 --- a/sys/contrib/dev/acpica/components/events/evglock.c +++ b/sys/contrib/dev/acpica/components/events/evglock.c @@ -195,6 +195,11 @@ AcpiEvInitGlobalLockHandler ( return_ACPI_STATUS (AE_OK); } + if (!AcpiGbl_UseGlobalLock) + { + 
return_ACPI_STATUS (AE_OK); + } + /* Attempt installation of the global lock handler */ Status = AcpiInstallFixedEventHandler (ACPI_EVENT_GLOBAL, diff --git a/sys/contrib/dev/acpica/components/executer/extrace.c b/sys/contrib/dev/acpica/components/executer/extrace.c index 0eceb0ffccb1..b48a5fcb289b 100644 --- a/sys/contrib/dev/acpica/components/executer/extrace.c +++ b/sys/contrib/dev/acpica/components/executer/extrace.c @@ -269,6 +269,68 @@ AcpiExGetTraceEventName ( #endif +/******************************************************************************* + * + * FUNCTION: AcpiExTraceArgs + * + * PARAMETERS: Params - AML method arguments + * Count - numer of method arguments + * + * RETURN: None + * + * DESCRIPTION: Trace any arguments + * + ******************************************************************************/ + +void +AcpiExTraceArgs(ACPI_OPERAND_OBJECT **Params, UINT32 Count) +{ + UINT32 i; + + ACPI_FUNCTION_NAME(ExTraceArgs); + + for (i = 0; i < Count; i++) + { + ACPI_OPERAND_OBJECT *obj_desc = Params[i]; + + if (!i) + { + ACPI_DEBUG_PRINT((ACPI_DB_TRACE_POINT, " ")); + } + + switch (obj_desc->Common.Type) + { + case ACPI_TYPE_INTEGER: + ACPI_DEBUG_PRINT_RAW((ACPI_DB_TRACE_POINT, "%jx", (uintmax_t)obj_desc->Integer.Value)); + break; + + case ACPI_TYPE_STRING: + if (!obj_desc->String.Length) + { + ACPI_DEBUG_PRINT_RAW((ACPI_DB_TRACE_POINT, "NULL")); + break; + } + if (ACPI_IS_DEBUG_ENABLED(ACPI_LV_TRACE_POINT, _COMPONENT)) + { + AcpiUtPrintString(obj_desc->String.Pointer, ACPI_UINT8_MAX); + } + break; + + default: + ACPI_DEBUG_PRINT_RAW((ACPI_DB_TRACE_POINT, "Unknown")); + break; + } + + if ((i + 1) == Count) + { + ACPI_DEBUG_PRINT_RAW((ACPI_DB_TRACE_POINT, "\n")); + } + else + { + ACPI_DEBUG_PRINT_RAW((ACPI_DB_TRACE_POINT, ", ")); + } + } +} /******************************************************************************* * @@ -299,9 +361,9 @@ AcpiExTracePoint ( if (Pathname) { ACPI_DEBUG_PRINT ((ACPI_DB_TRACE_POINT, - "%s %s [0x%p:%s] execution.\n", + "%s %s [%s] execution.\n", AcpiExGetTraceEventName (Type), Begin ? 
"Begin" : "End", - Aml, Pathname)); + Pathname)); } else { diff --git a/sys/contrib/dev/acpica/components/parser/psopinfo.c b/sys/contrib/dev/acpica/components/parser/psopinfo.c index 21c2b831ef24..1db32f4e8246 100644 --- a/sys/contrib/dev/acpica/components/parser/psopinfo.c +++ b/sys/contrib/dev/acpica/components/parser/psopinfo.c @@ -180,8 +180,8 @@ const ACPI_OPCODE_INFO * AcpiPsGetOpcodeInfo ( UINT16 Opcode) { -#ifdef ACPI_DEBUG_OUTPUT - const char *OpcodeName = "Unknown AML opcode"; +#if defined ACPI_ASL_COMPILER && defined ACPI_DEBUG_OUTPUT + const char *OpcodeName = "Unknown AML opcode"; #endif ACPI_FUNCTION_NAME (PsGetOpcodeInfo); @@ -207,7 +207,7 @@ AcpiPsGetOpcodeInfo ( #if defined ACPI_ASL_COMPILER && defined ACPI_DEBUG_OUTPUT #include <contrib/dev/acpica/compiler/asldefine.h> - + switch (Opcode) { case AML_RAW_DATA_BYTE: @@ -249,12 +249,12 @@ AcpiPsGetOpcodeInfo ( default: break; } -#endif /* Unknown AML opcode */ ACPI_DEBUG_PRINT ((ACPI_DB_EXEC, "%s [%4.4X]\n", OpcodeName, Opcode)); +#endif return (&AcpiGbl_AmlOpInfo [_UNK]); } diff --git a/sys/contrib/dev/acpica/components/tables/tbprint.c b/sys/contrib/dev/acpica/components/tables/tbprint.c index 7211673c42a2..8b812a890a07 100644 --- a/sys/contrib/dev/acpica/components/tables/tbprint.c +++ b/sys/contrib/dev/acpica/components/tables/tbprint.c @@ -279,6 +279,14 @@ AcpiTbPrintTableHeader ( ACPI_CAST_PTR (ACPI_TABLE_RSDP, Header)->Revision, LocalHeader.OemId)); } + else if (AcpiGbl_CDAT && !AcpiUtValidNameseg (Header->Signature)) + { + /* CDAT does not use the common ACPI table header */ + + ACPI_INFO (("%-4.4s 0x%8.8X%8.8X %06X", + ACPI_SIG_CDAT, ACPI_FORMAT_UINT64 (Address), + ACPI_CAST_PTR (ACPI_TABLE_CDAT, Header)->Length)); + } else { /* Standard ACPI table with full common header */ diff --git a/sys/contrib/dev/acpica/components/utilities/utnonansi.c b/sys/contrib/dev/acpica/components/utilities/utnonansi.c index bfbe1194ceae..f8b3a29e3283 100644 --- a/sys/contrib/dev/acpica/components/utilities/utnonansi.c +++ b/sys/contrib/dev/acpica/components/utilities/utnonansi.c @@ -353,7 +353,7 @@ AcpiUtSafeStrncpy ( { /* Always terminate destination string */ - memcpy (Dest, Source, DestSize); + strncpy (Dest, Source, DestSize); Dest[DestSize - 1] = 0; } diff --git a/sys/contrib/dev/acpica/include/acdebug.h b/sys/contrib/dev/acpica/include/acdebug.h index e335752148b9..63f39051a8ac 100644 --- a/sys/contrib/dev/acpica/include/acdebug.h +++ b/sys/contrib/dev/acpica/include/acdebug.h @@ -187,7 +187,7 @@ typedef struct acpi_db_execute_walk { UINT32 Count; UINT32 MaxCount; - char NameSeg[ACPI_NAMESEG_SIZE + 1] ACPI_NONSTRING; + char NameSeg[ACPI_NAMESEG_SIZE + 1]; } ACPI_DB_EXECUTE_WALK; diff --git a/sys/contrib/dev/acpica/include/acexcep.h b/sys/contrib/dev/acpica/include/acexcep.h index 57f98ab4540f..7216e0d49148 100644 --- a/sys/contrib/dev/acpica/include/acexcep.h +++ b/sys/contrib/dev/acpica/include/acexcep.h @@ -322,8 +322,11 @@ typedef struct acpi_exception_info #define AE_AML_TARGET_TYPE EXCEP_AML (0x0023) #define AE_AML_PROTOCOL EXCEP_AML (0x0024) #define AE_AML_BUFFER_LENGTH EXCEP_AML (0x0025) +#define AE_AML_TOO_FEW_ARGUMENTS EXCEP_AML (0x0026) +#define AE_AML_TOO_MANY_ARGUMENTS EXCEP_AML (0x0027) -#define AE_CODE_AML_MAX 0x0025 + +#define AE_CODE_AML_MAX 0x0027 /* @@ -456,7 +459,9 @@ static const ACPI_EXCEPTION_INFO AcpiGbl_ExceptionNames_Aml[] = EXCEP_TXT ("AE_AML_UNINITIALIZED_NODE", "A namespace node is uninitialized or unresolved"), EXCEP_TXT ("AE_AML_TARGET_TYPE", "A target operand of an incorrect type was 
encountered"), EXCEP_TXT ("AE_AML_PROTOCOL", "Violation of a fixed ACPI protocol"), - EXCEP_TXT ("AE_AML_BUFFER_LENGTH", "The length of the buffer is invalid/incorrect") + EXCEP_TXT ("AE_AML_BUFFER_LENGTH", "The length of the buffer is invalid/incorrect"), + EXCEP_TXT ("AE_AML_TOO_FEW_ARGUMENTS", "There are fewer than expected method arguments"), + EXCEP_TXT ("AE_AML_TOO_MANY_ARGUMENTS", "There are too many arguments for this method") }; static const ACPI_EXCEPTION_INFO AcpiGbl_ExceptionNames_Ctrl[] = diff --git a/sys/contrib/dev/acpica/include/acinterp.h b/sys/contrib/dev/acpica/include/acinterp.h index 74166384f172..b7f9e8f615e4 100644 --- a/sys/contrib/dev/acpica/include/acinterp.h +++ b/sys/contrib/dev/acpica/include/acinterp.h @@ -280,6 +280,10 @@ AcpiExTracePoint ( UINT8 *Aml, char *Pathname); +void +AcpiExTraceArgs( + ACPI_OPERAND_OBJECT **Params, + UINT32 Count); /* * exfield - ACPI AML (p-code) execution - field manipulation diff --git a/sys/contrib/dev/acpica/include/acpixf.h b/sys/contrib/dev/acpica/include/acpixf.h index 193b0e6a70dc..b5961e21bb9b 100644 --- a/sys/contrib/dev/acpica/include/acpixf.h +++ b/sys/contrib/dev/acpica/include/acpixf.h @@ -154,7 +154,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20250404 +#define ACPI_CA_VERSION 0x20250807 #include <contrib/dev/acpica/include/acconfig.h> #include <contrib/dev/acpica/include/actypes.h> @@ -358,6 +358,12 @@ ACPI_INIT_GLOBAL (UINT8, AcpiGbl_OsiData, 0); ACPI_INIT_GLOBAL (BOOLEAN, AcpiGbl_ReducedHardware, FALSE); /* + * ACPI Global Lock is mainly used for systems with SMM, so no-SMM systems + * (such as LoongArch) may not have and not use Global Lock. + */ +ACPI_INIT_GLOBAL (BOOLEAN, AcpiGbl_UseGlobalLock, TRUE); + +/* * Maximum timeout for While() loop iterations before forced method abort. * This mechanism is intended to prevent infinite loops during interpreter * execution within a host kernel. 
diff --git a/sys/contrib/dev/acpica/include/actbl.h b/sys/contrib/dev/acpica/include/actbl.h index eafd5d8a0f8b..ae52bd452c90 100644 --- a/sys/contrib/dev/acpica/include/actbl.h +++ b/sys/contrib/dev/acpica/include/actbl.h @@ -220,7 +220,7 @@ typedef struct acpi_table_header char OemId[ACPI_OEM_ID_SIZE] ACPI_NONSTRING; /* ASCII OEM identification */ char OemTableId[ACPI_OEM_TABLE_ID_SIZE] ACPI_NONSTRING; /* ASCII OEM table identification */ UINT32 OemRevision; /* OEM revision number */ - char AslCompilerId[ACPI_NAMESEG_SIZE]; /* ASCII ASL compiler vendor ID */ + char AslCompilerId[ACPI_NAMESEG_SIZE] ACPI_NONSTRING; /* ASCII ASL compiler vendor ID */ UINT32 AslCompilerRevision; /* ASL compiler version */ } ACPI_TABLE_HEADER; diff --git a/sys/contrib/dev/acpica/include/actbl1.h b/sys/contrib/dev/acpica/include/actbl1.h index 876b721068c6..ec04f0a0ab9f 100644 --- a/sys/contrib/dev/acpica/include/actbl1.h +++ b/sys/contrib/dev/acpica/include/actbl1.h @@ -262,7 +262,7 @@ typedef struct acpi_whea_header /* Larger subtable header (when Length can exceed 255) */ -typedef struct acpi_subtable_header_16 +typedef struct acpi_subtbl_hdr_16 { UINT16 Type; UINT16 Length; diff --git a/sys/contrib/dev/acpica/include/actbl2.h b/sys/contrib/dev/acpica/include/actbl2.h index 4899929b2d45..a74b6d555a3a 100644 --- a/sys/contrib/dev/acpica/include/actbl2.h +++ b/sys/contrib/dev/acpica/include/actbl2.h @@ -201,6 +201,7 @@ #define ACPI_SIG_SDEI "SDEI" /* Software Delegated Exception Interface Table */ #define ACPI_SIG_SDEV "SDEV" /* Secure Devices table */ #define ACPI_SIG_SVKL "SVKL" /* Storage Volume Key Location Table */ +#define ACPI_SIG_SWFT "SWFT" /* SoundWire File Table */ #define ACPI_SIG_TDEL "TDEL" /* TD Event Log Table */ @@ -4094,6 +4095,30 @@ enum acpi_svkl_format ACPI_SVKL_FORMAT_RESERVED = 1 /* 1 and greater are reserved */ }; +/******************************************************************************* + * + * SWFT - SoundWire File Table + * as described in Discovery and Configuration (DisCo) Specification + * for SoundWire® + * Version 1 + * + ******************************************************************************/ + +typedef struct acpi_table_swft +{ + ACPI_TABLE_HEADER Header; /* Common ACPI table header */ + +} ACPI_TABLE_SWFT; + +typedef struct acpi_swft_file +{ + UINT16 VendorID; + UINT32 FileID; + UINT16 FileVersion; + UINT16 FileLength; + UINT8 FileData[]; + +} ACPI_SWFT_FILE; /******************************************************************************* * diff --git a/sys/contrib/dev/acpica/include/actypes.h b/sys/contrib/dev/acpica/include/actypes.h index 66333243907b..3c95887b1678 100644 --- a/sys/contrib/dev/acpica/include/actypes.h +++ b/sys/contrib/dev/acpica/include/actypes.h @@ -741,9 +741,11 @@ typedef UINT64 ACPI_INTEGER; #define ACPI_STATE_D0 (UINT8) 0 #define ACPI_STATE_D1 (UINT8) 1 #define ACPI_STATE_D2 (UINT8) 2 -#define ACPI_STATE_D3 (UINT8) 3 -#define ACPI_D_STATES_MAX ACPI_STATE_D3 -#define ACPI_D_STATE_COUNT 4 +#define ACPI_STATE_D3_HOT (UINT8) 3 +#define ACPI_STATE_D3_COLD (UINT8) 4 +#define ACPI_STATE_D3 ACPI_STATE_D3_COLD +#define ACPI_D_STATES_MAX ACPI_STATE_D3_COLD +#define ACPI_D_STATE_COUNT 5 #define ACPI_STATE_C0 (UINT8) 0 #define ACPI_STATE_C1 (UINT8) 1 diff --git a/sys/contrib/dev/qat/qat_402xx.bin b/sys/contrib/dev/qat/qat_402xx.bin Binary files differnew file mode 100644 index 000000000000..74151547edce --- /dev/null +++ b/sys/contrib/dev/qat/qat_402xx.bin diff --git a/sys/contrib/dev/qat/qat_402xx_mmp.bin b/sys/contrib/dev/qat/qat_402xx_mmp.bin 
Binary files differnew file mode 100644 index 000000000000..6404eb009d2f --- /dev/null +++ b/sys/contrib/dev/qat/qat_402xx_mmp.bin diff --git a/sys/contrib/dev/rtw88/main.c b/sys/contrib/dev/rtw88/main.c index 021d076808e0..963b73f35350 100644 --- a/sys/contrib/dev/rtw88/main.c +++ b/sys/contrib/dev/rtw88/main.c @@ -57,6 +57,62 @@ module_param_named(support_vht, rtw_vht_support, bool, 0644); MODULE_PARM_DESC(support_vht, "Set to Y to enable VHT support"); #endif +#if defined(__FreeBSD__) +/* Macros based on rtw89::core.c. */ +#define RTW88_DEF_CHAN(_freq, _hw_val, _flags, _band) \ + { .center_freq = _freq, .hw_value = _hw_val, .flags = _flags, .band = _band, } +#define RTW88_DEF_CHAN_2G(_freq, _hw_val) \ + RTW88_DEF_CHAN(_freq, _hw_val, 0, NL80211_BAND_2GHZ) +#define RTW88_DEF_CHAN_5G(_freq, _hw_val) \ + RTW88_DEF_CHAN(_freq, _hw_val, 0, NL80211_BAND_5GHZ) +#define RTW88_DEF_CHAN_5G_NO_HT40MINUS(_freq, _hw_val) \ + RTW88_DEF_CHAN(_freq, _hw_val, IEEE80211_CHAN_NO_HT40MINUS, NL80211_BAND_5GHZ) + +static struct ieee80211_channel rtw_channeltable_2g[] = { + RTW88_DEF_CHAN_2G(2412, 1), + RTW88_DEF_CHAN_2G(2417, 2), + RTW88_DEF_CHAN_2G(2422, 3), + RTW88_DEF_CHAN_2G(2427, 4), + RTW88_DEF_CHAN_2G(2432, 5), + RTW88_DEF_CHAN_2G(2437, 6), + RTW88_DEF_CHAN_2G(2442, 7), + RTW88_DEF_CHAN_2G(2447, 8), + RTW88_DEF_CHAN_2G(2452, 9), + RTW88_DEF_CHAN_2G(2457, 10), + RTW88_DEF_CHAN_2G(2462, 11), + RTW88_DEF_CHAN_2G(2467, 12), + RTW88_DEF_CHAN_2G(2472, 13), + RTW88_DEF_CHAN_2G(2484, 14), +}; + +static struct ieee80211_channel rtw_channeltable_5g[] = { + RTW88_DEF_CHAN_5G(5180, 36), + RTW88_DEF_CHAN_5G(5200, 40), + RTW88_DEF_CHAN_5G(5220, 44), + RTW88_DEF_CHAN_5G(5240, 48), + RTW88_DEF_CHAN_5G(5260, 52), + RTW88_DEF_CHAN_5G(5280, 56), + RTW88_DEF_CHAN_5G(5300, 60), + RTW88_DEF_CHAN_5G(5320, 64), + RTW88_DEF_CHAN_5G(5500, 100), + RTW88_DEF_CHAN_5G(5520, 104), + RTW88_DEF_CHAN_5G(5540, 108), + RTW88_DEF_CHAN_5G(5560, 112), + RTW88_DEF_CHAN_5G(5580, 116), + RTW88_DEF_CHAN_5G(5600, 120), + RTW88_DEF_CHAN_5G(5620, 124), + RTW88_DEF_CHAN_5G(5640, 128), + RTW88_DEF_CHAN_5G(5660, 132), + RTW88_DEF_CHAN_5G(5680, 136), + RTW88_DEF_CHAN_5G(5700, 140), + RTW88_DEF_CHAN_5G(5720, 144), + RTW88_DEF_CHAN_5G(5745, 149), + RTW88_DEF_CHAN_5G(5765, 153), + RTW88_DEF_CHAN_5G(5785, 157), + RTW88_DEF_CHAN_5G(5805, 161), + RTW88_DEF_CHAN_5G_NO_HT40MINUS(5825, 165), +}; +#elif defined(__linux__) static struct ieee80211_channel rtw_channeltable_2g[] = { {.center_freq = 2412, .hw_value = 1,}, {.center_freq = 2417, .hw_value = 2,}, @@ -102,6 +158,7 @@ static struct ieee80211_channel rtw_channeltable_5g[] = { {.center_freq = 5825, .hw_value = 165, .flags = IEEE80211_CHAN_NO_HT40MINUS}, }; +#endif static struct ieee80211_rate rtw_ratetable[] = { {.bitrate = 10, .hw_value = 0x00,}, diff --git a/sys/contrib/dev/rtw89/fw.c b/sys/contrib/dev/rtw89/fw.c index e360f27c2ade..b4c0f864bc75 100644 --- a/sys/contrib/dev/rtw89/fw.c +++ b/sys/contrib/dev/rtw89/fw.c @@ -908,11 +908,7 @@ int rtw89_build_phy_tbl_from_elm(struct rtw89_dev *rtwdev, case RTW89_FW_ELEMENT_ID_RADIO_B: case RTW89_FW_ELEMENT_ID_RADIO_C: case RTW89_FW_ELEMENT_ID_RADIO_D: -#if defined(__linux__) rf_path = arg.rf_path; -#elif defined(__FreeBSD__) - rf_path = __DECONST(enum rtw89_rf_path, arg.rf_path); -#endif idx = elm->u.reg2.idx; elm_info->rf_radio[idx] = tbl; diff --git a/sys/contrib/libnv/nvlist.c b/sys/contrib/libnv/nvlist.c index 41edc72322c3..73226ee51a78 100644 --- a/sys/contrib/libnv/nvlist.c +++ b/sys/contrib/libnv/nvlist.c @@ -478,7 +478,7 @@ 
nvlist_dump_error_check(const nvlist_t *nvl, int fd, int level) void nvlist_dump(const nvlist_t *nvl, int fd) { - const nvlist_t *tmpnvl; + const nvlist_t *tmpnvl, *top; nvpair_t *nvp, *tmpnvp; void *cookie; int level; @@ -487,6 +487,7 @@ nvlist_dump(const nvlist_t *nvl, int fd) if (nvlist_dump_error_check(nvl, fd, level)) return; + top = nvl; nvp = nvlist_first_nvpair(nvl); while (nvp != NULL) { dprintf(fd, "%*s%s (%s):", level * 4, "", nvpair_name(nvp), @@ -645,6 +646,8 @@ nvlist_dump(const nvlist_t *nvl, int fd) while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) { do { + if (nvl == top) + return; cookie = NULL; if (nvlist_in_array(nvl)) dprintf(fd, "%*s,\n", level * 4, ""); @@ -847,7 +850,7 @@ nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep) { unsigned char *buf, *ptr; size_t left, size; - const nvlist_t *tmpnvl; + const nvlist_t *tmpnvl, *top; nvpair_t *nvp, *tmpnvp; void *cookie; @@ -868,6 +871,7 @@ nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep) ptr = nvlist_pack_header(nvl, ptr, &left); + top = nvl; nvp = nvlist_first_nvpair(nvl); while (nvp != NULL) { NVPAIR_ASSERT(nvp); @@ -958,6 +962,8 @@ nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep) goto fail; while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) { do { + if (nvl == top) + goto out; cookie = NULL; if (nvlist_in_array(nvl)) { ptr = nvpair_pack_nvlist_array_next(ptr, diff --git a/sys/contrib/openzfs/.cirrus.yml b/sys/contrib/openzfs/.cirrus.yml deleted file mode 100644 index 366bb87fbb14..000000000000 --- a/sys/contrib/openzfs/.cirrus.yml +++ /dev/null @@ -1,21 +0,0 @@ -env: - CIRRUS_CLONE_DEPTH: 1 - ARCH: amd64 - -build_task: - matrix: - freebsd_instance: - image_family: freebsd-13-5 - freebsd_instance: - image_family: freebsd-14-2 - freebsd_instance: - image_family: freebsd-15-0-snap - prepare_script: - - pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py311-packaging py311-cffi py311-sysctl - configure_script: - - env MAKE=gmake ./autogen.sh - - env MAKE=gmake ./configure --with-config="user" --with-python=3.11 - build_script: - - gmake -j `sysctl -n kern.smp.cpus` - install_script: - - gmake install diff --git a/sys/contrib/openzfs/.github/codeql-cpp.yml b/sys/contrib/openzfs/.github/codeql-cpp.yml index 88b8c6086025..d99cdb559244 100644 --- a/sys/contrib/openzfs/.github/codeql-cpp.yml +++ b/sys/contrib/openzfs/.github/codeql-cpp.yml @@ -2,3 +2,4 @@ name: "Custom CodeQL Analysis" queries: - uses: ./.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql + - uses: ./.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql diff --git a/sys/contrib/openzfs/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql b/sys/contrib/openzfs/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql new file mode 100644 index 000000000000..fb5dae35092f --- /dev/null +++ b/sys/contrib/openzfs/.github/codeql/custom-queries/cpp/dslDatasetHoldReleMismatch.ql @@ -0,0 +1,34 @@ +/** + * @name Detect mismatched dsl_dataset_hold/_rele pairs + * @description Flags instances of issue #12014 where + * - a dataset held with dsl_dataset_hold_obj() ends up in dsl_dataset_rele_flags(), or + * - a dataset held with dsl_dataset_hold_obj_flags() ends up in dsl_dataset_rele(). 
+ * @kind problem + * @severity error + * @tags correctness + * @id cpp/dslDatasetHoldReleMismatch + */ + +import cpp + +from Variable ds, Call holdCall, Call releCall, string message +where + ds.getType().toString() = "dsl_dataset_t *" and + holdCall.getASuccessor*() = releCall and + ( + (holdCall.getTarget().getName() = "dsl_dataset_hold_obj_flags" and + holdCall.getArgument(4).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and + releCall.getTarget().getName() = "dsl_dataset_rele" and + releCall.getArgument(0).(VariableAccess).getTarget() = ds and + message = "Held with dsl_dataset_hold_obj_flags but released with dsl_dataset_rele") + or + (holdCall.getTarget().getName() = "dsl_dataset_hold_obj" and + holdCall.getArgument(3).(AddressOfExpr).getOperand().(VariableAccess).getTarget() = ds and + releCall.getTarget().getName() = "dsl_dataset_rele_flags" and + releCall.getArgument(0).(VariableAccess).getTarget() = ds and + message = "Held with dsl_dataset_hold_obj but released with dsl_dataset_rele_flags") + ) +select releCall, + "Mismatched release: held with $@ but released with " + releCall.getTarget().getName() + " for dataset $@", + holdCall, holdCall.getTarget().getName(), + ds, ds.toString() diff --git a/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py b/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py index b49255e8381d..08021aabcb61 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py +++ b/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py @@ -65,7 +65,7 @@ if __name__ == '__main__': # check last (HEAD) commit message last_commit_message_raw = subprocess.run([ - 'git', 'show', '-s', '--format=%B', 'HEAD' + 'git', 'show', '-s', '--format=%B', head ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) for line in last_commit_message_raw.stdout.decode().splitlines(): diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-1-setup.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-1-setup.sh index de29ad1f57b6..0278264d9279 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-1-setup.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-1-setup.sh @@ -6,6 +6,13 @@ set -eu +# We've been seeing this script take over 15min to run. This may or +# may not be normal. Just to get a little more insight, print out +# a message to stdout with the top running process, and do this every +# 30 seconds. We can delete this watchdog later once we get a better +# handle on what the timeout value should be. 
+(while [ 1 ] ; do sleep 30 && echo "[watchdog: $(ps -eo cmd --sort=-pcpu | head -n 2 | tail -n 1)}')]"; done) & + # install needed packages export DEBIAN_FRONTEND="noninteractive" sudo apt-get -y update @@ -65,3 +72,6 @@ sudo zpool create -f -o ashift=12 zpool $SSD1 $SSD2 -O relatime=off \ for i in /sys/block/s*/queue/scheduler; do echo "none" | sudo tee $i done + +# Kill off our watchdog +kill $(jobs -p) diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh index 7e20a98c2faf..8439942c5a41 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-2-start.sh @@ -25,6 +25,10 @@ UBMIRROR="https://cloud-images.ubuntu.com" # default nic model for vm's NIC="virtio" +# additional options for virt-install +OPTS[0]="" +OPTS[1]="" + case "$OS" in almalinux8) OSNAME="AlmaLinux 8" @@ -61,6 +65,14 @@ case "$OS" in OSNAME="Debian 12" URL="https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2" ;; + debian13) + OSNAME="Debian 13" + # TODO: Overwrite OSv to debian13 for virt-install until it's added to osinfo + OSv="debian12" + URL="https://cloud.debian.org/images/cloud/trixie/latest/debian-13-generic-amd64.qcow2" + OPTS[0]="--boot" + OPTS[1]="uefi=on" + ;; fedora41) OSNAME="Fedora 41" OSv="fedora-unknown" @@ -71,14 +83,6 @@ case "$OS" in OSv="fedora-unknown" URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2" ;; - freebsd13-4r) - FreeBSD="13.4-RELEASE" - OSNAME="FreeBSD $FreeBSD" - OSv="freebsd13.0" - URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" - KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" - NIC="rtl8139" - ;; freebsd13-5r) FreeBSD="13.5-RELEASE" OSNAME="FreeBSD $FreeBSD" @@ -117,7 +121,7 @@ case "$OS" in KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; freebsd15-0c) - FreeBSD="15.0-CURRENT" + FreeBSD="15.0-ALPHA2" OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" @@ -250,7 +254,7 @@ sudo virt-install \ --network bridge=virbr0,model=$NIC,mac='52:54:00:83:79:00' \ --cloud-init user-data=/tmp/user-data \ --disk $DISK,bus=virtio,cache=none,format=raw,driver.discard=unmap \ - --import --noautoconsole >/dev/null + --import --noautoconsole ${OPTS[0]} ${OPTS[1]} >/dev/null # Give the VMs hostnames so we don't have to refer to them with # hardcoded IP addresses. 
diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh index a581b13c2f58..ee058b488088 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-3-deps-vm.sh @@ -28,6 +28,7 @@ function debian() { export DEBIAN_FRONTEND="noninteractive" echo "##[group]Running apt-get update+upgrade" + sudo sed -i '/[[:alpha:]]-backports/d' /etc/apt/sources.list sudo apt-get update -y sudo apt-get upgrade -y echo "##[endgroup]" @@ -40,7 +41,7 @@ function debian() { libelf-dev libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev \ libtool libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \ lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \ - python3-cffi python3-dev python3-distlib python3-packaging \ + python3-cffi python3-dev python3-distlib python3-packaging libtirpc-dev \ python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \ rsync samba sysstat uuid-dev watchdog wget xfslibs-dev xxhash zlib1g-dev echo "##[endgroup]" @@ -51,7 +52,7 @@ function freebsd() { echo "##[group]Install Development Tools" sudo pkg install -y autoconf automake autotools base64 checkbashisms fio \ - gdb gettext gettext-runtime git gmake gsed jq ksh93 lcov libtool lscpu \ + gdb gettext gettext-runtime git gmake gsed jq ksh lcov libtool lscpu \ pkgconf python python3 pamtester pamtester qemu-guest-agent rsync xxhash sudo pkg install -xy \ '^samba4[[:digit:]]+$' \ diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-4-build-vm.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-4-build-vm.sh index 17e976ebcc39..2807d9e77127 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-4-build-vm.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-4-build-vm.sh @@ -5,12 +5,13 @@ # # Usage: # -# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--poweroff] -# [--release][--repo][--tarball] +# qemu-4-build-vm.sh OS [--enable-debug][--dkms][--patch-level NUM] +# [--poweroff][--release][--repo][--tarball] # # OS: OS name like 'fedora41' # --enable-debug: Build RPMs with '--enable-debug' (for testing) # --dkms: Build DKMS RPMs as well +# --patch-level NUM: Use a custom patch level number for packages. 
# --poweroff: Power-off the VM after building # --release Build zfs-release*.rpm as well # --repo After building everything, copy RPMs into /tmp/repo @@ -21,6 +22,7 @@ ENABLE_DEBUG="" DKMS="" +PATCH_LEVEL="" POWEROFF="" RELEASE="" REPO="" @@ -35,6 +37,11 @@ while [[ $# -gt 0 ]]; do DKMS=1 shift ;; + --patch-level) + PATCH_LEVEL=$2 + shift + shift + ;; --poweroff) POWEROFF=1 shift @@ -215,6 +222,10 @@ function rpm_build_and_install() { run ./autogen.sh echo "##[endgroup]" + if [ -n "$PATCH_LEVEL" ] ; then + sed -i -E 's/(Release:\s+)1/\1'$PATCH_LEVEL'/g' META + fi + echo "##[group]Configure" run ./configure --enable-debuginfo $extra echo "##[endgroup]" @@ -328,7 +339,13 @@ fi # almalinux9.5 # fedora42 source /etc/os-release -sudo hostname "$ID$VERSION_ID" + if which hostnamectl &> /dev/null ; then + # Fedora 42+ use hostnamectl + sudo hostnamectl set-hostname "$ID$VERSION_ID" + sudo hostnamectl set-hostname --pretty "$ID$VERSION_ID" +else + sudo hostname "$ID$VERSION_ID" +fi # save some sysinfo uname -a > /var/tmp/uname.txt diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh index 6bf10024a1a6..0adcad2a99bc 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-5-setup.sh @@ -12,16 +12,26 @@ source /var/tmp/env.txt # wait for poweroff to succeed PID=$(pidof /usr/bin/qemu-system-x86_64) tail --pid=$PID -f /dev/null -sudo virsh undefine openzfs +sudo virsh undefine --nvram openzfs # cpu pinning CPUSET=("0,1" "2,3") +# additional options for virt-install +OPTS[0]="" +OPTS[1]="" + case "$OS" in freebsd*) # FreeBSD needs only 6GiB RAM=6 ;; + debian13) + RAM=8 + # Boot Debian 13 with uefi=on and secureboot=off (ZFS Kernel Module not signed) + OPTS[0]="--boot" + OPTS[1]="firmware=efi,firmware.feature0.name=secure-boot,firmware.feature0.enabled=no" + ;; *) # Linux needs more memory, but can be optimized to share it via KSM RAM=8 @@ -79,7 +89,7 @@ EOF --network bridge=virbr0,model=$NIC,mac="52:54:00:83:79:0$i" \ --disk $DISK-system,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \ --disk $DISK-tests,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \ - --import --noautoconsole >/dev/null + --import --noautoconsole ${OPTS[0]} ${OPTS[1]} done # generate some memory stats diff --git a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh index e8e6adecd62f..5ab822f4f076 100755 --- a/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh +++ b/sys/contrib/openzfs/.github/workflows/scripts/qemu-6-tests.sh @@ -21,11 +21,13 @@ function prefix() { S=$((DIFF-(M*60))) CTR=$(cat /tmp/ctr) - echo $LINE| grep -q "^Test[: ]" && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr + echo $LINE| grep -q '^\[.*] Test[: ]' && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr BASE="$HOME/work/zfs/zfs" COLOR="$BASE/scripts/zfs-tests-color.sh" - CLINE=$(echo $LINE| grep "^Test[ :]" | sed -e 's|/usr/local|/usr|g' \ + CLINE=$(echo $LINE| grep '^\[.*] Test[: ]' \ + | sed -e 's|^\[.*] Test|Test|g' \ + | sed -e 's|/usr/local|/usr|g' \ | sed -e 's| /usr/share/zfs/zfs-tests/tests/| |g' | $COLOR) if [ -z "$CLINE" ]; then printf "vm${ID}: %s\n" "$LINE" diff --git a/sys/contrib/openzfs/.github/workflows/zfs-qemu-packages.yml b/sys/contrib/openzfs/.github/workflows/zfs-qemu-packages.yml index 5b5afe746859..d8a95954fe1a 100644 --- a/sys/contrib/openzfs/.github/workflows/zfs-qemu-packages.yml +++ 
b/sys/contrib/openzfs/.github/workflows/zfs-qemu-packages.yml @@ -32,6 +32,11 @@ on: options: - "Build RPMs" - "Test repo" + patch_level: + type: string + required: false + default: "" + description: "(optional) patch level number" repo_url: type: string required: false @@ -78,7 +83,13 @@ jobs: mkdir -p /tmp/repo ssh zfs@vm0 '$HOME/zfs/.github/workflows/scripts/qemu-test-repo-vm.sh' ${{ github.event.inputs.repo_url }} else - .github/workflows/scripts/qemu-4-build.sh --repo --release --dkms --tarball ${{ matrix.os }} + EXTRA="" + if [ -n "${{ github.event.inputs.patch_level }}" ] ; then + EXTRA="--patch-level ${{ github.event.inputs.patch_level }}" + fi + + .github/workflows/scripts/qemu-4-build.sh $EXTRA \ + --repo --release --dkms --tarball ${{ matrix.os }} fi - name: Prepare artifacts diff --git a/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml b/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml index 035d8be7e227..69349678d84c 100644 --- a/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml +++ b/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml @@ -5,16 +5,6 @@ on: pull_request: workflow_dispatch: inputs: - include_stream9: - type: boolean - required: false - default: false - description: 'Test on CentOS 9 stream' - include_stream10: - type: boolean - required: false - default: false - description: 'Test on CentOS 10 stream' fedora_kernel_ver: type: string required: false @@ -39,7 +29,7 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-4r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]' # determine CI type when running on PR ci_type="full" @@ -54,7 +44,7 @@ jobs: os_selection="$FULL_OS" fi - if [ ${{ github.event.inputs.fedora_kernel_ver }} != "" ] ; then + if ${{ github.event.inputs.fedora_kernel_ver != '' }}; then # They specified a custom kernel version for Fedora. Use only # Fedora runners. os_json=$(echo ${os_selection} | jq -c '[.[] | select(startswith("fedora"))]') @@ -63,17 +53,8 @@ jobs: os_json=$(echo ${os_selection} | jq -c) fi - # Add optional runners - if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then - os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]') - fi - if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then - os_json=$(echo $os_json | jq -c '. 
+= ["centos-stream10"]') - fi - - echo $os_json - echo "os=$os_json" >> $GITHUB_OUTPUT - echo "ci_type=$ci_type" >> $GITHUB_OUTPUT + echo "os=$os_json" | tee -a $GITHUB_OUTPUT + echo "ci_type=$ci_type" | tee -a $GITHUB_OUTPUT qemu-vm: name: qemu-x86 @@ -81,11 +62,11 @@ jobs: strategy: fail-fast: false matrix: - # rhl: almalinux8, almalinux9, centos-stream9, fedora41 - # debian: debian11, debian12, ubuntu22, ubuntu24 + # rhl: almalinux8, almalinux9, centos-stream9, fedora4x + # debian: debian12, debian13, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed # FreeBSD variants of 2025-06: - # FreeBSD Release: freebsd13-4r, freebsd13-5r, freebsd14-1r, freebsd14-2r, freebsd14-3r + # FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r # FreeBSD Stable: freebsd13-5s, freebsd14-3s # FreeBSD Current: freebsd15-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} @@ -96,8 +77,12 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} - name: Setup QEMU - timeout-minutes: 10 - run: .github/workflows/scripts/qemu-1-setup.sh + timeout-minutes: 20 + run: | + # Add a timestamp to each line to debug timeouts + while IFS=$'\n' read -r line; do + echo "$(date +'%H:%M:%S') $line" + done < <(.github/workflows/scripts/qemu-1-setup.sh) - name: Start build machine timeout-minutes: 10 diff --git a/sys/contrib/openzfs/.github/workflows/zloop.yml b/sys/contrib/openzfs/.github/workflows/zloop.yml index 7b3bf49d90d5..4ae3ccdc5484 100644 --- a/sys/contrib/openzfs/.github/workflows/zloop.yml +++ b/sys/contrib/openzfs/.github/workflows/zloop.yml @@ -12,7 +12,8 @@ jobs: zloop: runs-on: ubuntu-24.04 env: - TEST_DIR: /var/tmp/zloop + WORK_DIR: /mnt/zloop + CORE_DIR: /mnt/zloop/cores steps: - uses: actions/checkout@v4 with: @@ -40,38 +41,37 @@ jobs: sudo modprobe zfs - name: Tests run: | - sudo mkdir -p $TEST_DIR - # run for 10 minutes or at most 6 iterations for a maximum runner - # time of 60 minutes. - sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -- -T 120 -P 60 + sudo truncate -s 256G /mnt/vdev + sudo zpool create cipool -m $WORK_DIR -O compression=on -o autotrim=on /mnt/vdev + sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -c $CORE_DIR -f $WORK_DIR -- -T 120 -P 60 - name: Prepare artifacts if: failure() run: | - sudo chmod +r -R $TEST_DIR/ + sudo chmod +r -R $WORK_DIR/ - name: Ztest log if: failure() run: | - grep -B10 -A1000 'ASSERT' $TEST_DIR/*/ztest.out || tail -n 1000 $TEST_DIR/*/ztest.out + grep -B10 -A1000 'ASSERT' $CORE_DIR/*/ztest.out || tail -n 1000 $CORE_DIR/*/ztest.out - name: Gdb log if: failure() run: | - sed -n '/Backtraces (full)/q;p' $TEST_DIR/*/ztest.gdb + sed -n '/Backtraces (full)/q;p' $CORE_DIR/*/ztest.gdb - name: Zdb log if: failure() run: | - cat $TEST_DIR/*/ztest.zdb + cat $CORE_DIR/*/ztest.zdb - uses: actions/upload-artifact@v4 if: failure() with: name: Logs path: | - /var/tmp/zloop/*/ - !/var/tmp/zloop/*/vdev/ + /mnt/zloop/*/ + !/mnt/zloop/cores/*/vdev/ if-no-files-found: ignore - uses: actions/upload-artifact@v4 if: failure() with: name: Pool files path: | - /var/tmp/zloop/*/vdev/ + /mnt/zloop/cores/*/vdev/ if-no-files-found: ignore diff --git a/sys/contrib/openzfs/.mailmap b/sys/contrib/openzfs/.mailmap index b6d942c000b8..e6f09c6c9d43 100644 --- a/sys/contrib/openzfs/.mailmap +++ b/sys/contrib/openzfs/.mailmap @@ -23,6 +23,7 @@ # These maps are making names consistent where they have varied but the email # address has never changed. In most cases, the full name is in the # Signed-off-by of a commit with a matching author. 
+Achill Gilgenast <achill@achill.org> Ahelenia Ziemiańska <nabijaczleweli@gmail.com> Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz> Alex John <alex@stty.io> @@ -37,6 +38,7 @@ Crag Wang <crag0715@gmail.com> Damian Szuberski <szuberskidamian@gmail.com> Daniel Kolesa <daniel@octaforge.org> Debabrata Banerjee <dbavatar@gmail.com> +Diwakar Kristappagari <diwakar-k@hpe.com> Finix Yan <yanchongwen@hotmail.com> Gaurav Kumar <gauravk.18@gmail.com> Gionatan Danti <g.danti@assyoma.it> @@ -145,6 +147,7 @@ Gaurav Kumar <gauravk.18@gmail.com> <gaurkuma@users.noreply.github.com> George Gaydarov <git@gg7.io> <gg7@users.noreply.github.com> Georgy Yakovlev <gyakovlev@gentoo.org> <168902+gyakovlev@users.noreply.github.com> Gerardwx <gerardw@alum.mit.edu> <Gerardwx@users.noreply.github.com> +Germano Massullo <germano.massullo@gmail.com> <Germano0@users.noreply.github.com> Gian-Carlo DeFazio <defazio1@llnl.gov> <defaziogiancarlo@users.noreply.github.com> Giuseppe Di Natale <dinatale2@llnl.gov> <dinatale2@users.noreply.github.com> Hajo Möller <dasjoe@gmail.com> <dasjoe@users.noreply.github.com> @@ -164,6 +167,7 @@ John Ramsden <johnramsden@riseup.net> <johnramsden@users.noreply.github.com> Jonathon Fernyhough <jonathon@m2x.dev> <559369+jonathonf@users.noreply.github.com> Jose Luis Duran <jlduran@gmail.com> <jlduran@users.noreply.github.com> Justin Hibbits <chmeeedalf@gmail.com> <chmeeedalf@users.noreply.github.com> +Kaitlin Hoang <kthoang@amazon.com> <khoang98@users.noreply.github.com> Kevin Greene <kevin.greene@delphix.com> <104801862+kxgreene@users.noreply.github.com> Kevin Jin <lostking2008@hotmail.com> <33590050+jxdking@users.noreply.github.com> Kevin P. Fleming <kevin@km6g.us> <kpfleming@users.noreply.github.com> diff --git a/sys/contrib/openzfs/AUTHORS b/sys/contrib/openzfs/AUTHORS index a9d249a66f1e..6c34c07f39ef 100644 --- a/sys/contrib/openzfs/AUTHORS +++ b/sys/contrib/openzfs/AUTHORS @@ -10,6 +10,7 @@ PAST MAINTAINERS: CONTRIBUTORS: Aaron Fineman <abyxcos@gmail.com> + Achill Gilgenast <achill@achill.org> Adam D. 
Moss <c@yotes.com> Adam Leventhal <ahl@delphix.com> Adam Stevko <adam.stevko@gmail.com> @@ -59,6 +60,7 @@ CONTRIBUTORS: Andreas Buschmann <andreas.buschmann@tech.net.de> Andreas Dilger <adilger@intel.com> Andreas Vögele <andreas@andreasvoegele.com> + Andres <a-d-j-i@users.noreply.github.com> Andrew Barnes <barnes333@gmail.com> Andrew Hamilton <ahamilto@tjhsst.edu> Andrew Innes <andrew.c12@gmail.com> @@ -72,6 +74,7 @@ CONTRIBUTORS: Andrey Prokopenko <job@terem.fr> Andrey Vesnovaty <andrey.vesnovaty@gmail.com> Andriy Gapon <avg@freebsd.org> + Andriy Tkachuk <andriy.tkachuk@seagate.com> Andy Bakun <github@thwartedefforts.org> Andy Fiddaman <omnios@citrus-it.co.uk> Aniruddha Shankar <k@191a.net> @@ -120,6 +123,7 @@ CONTRIBUTORS: Caleb James DeLisle <calebdelisle@lavabit.com> Cameron Harr <harr1@llnl.gov> Cao Xuewen <cao.xuewen@zte.com.cn> + Carl George <carlwgeorge@gmail.com> Carlo Landmeter <clandmeter@gmail.com> Carlos Alberto Lopez Perez <clopez@igalia.com> Cedric Maunoury <cedric.maunoury@gmail.com> @@ -200,6 +204,7 @@ CONTRIBUTORS: Dimitri John Ledkov <xnox@ubuntu.com> Dimitry Andric <dimitry@andric.com> Dirkjan Bussink <d.bussink@gmail.com> + Diwakar Kristappagari <diwakar-k@hpe.com> Dmitry Khasanov <pik4ez@gmail.com> Dominic Pearson <dsp@technoanimal.net> Dominik Hassler <hadfl@omniosce.org> @@ -250,6 +255,7 @@ CONTRIBUTORS: George Wilson <gwilson@delphix.com> Georgy Yakovlev <ya@sysdump.net> Gerardwx <gerardw@alum.mit.edu> + Germano Massullo <germano.massullo@gmail.com> Gian-Carlo DeFazio <defazio1@llnl.gov> Gionatan Danti <g.danti@assyoma.it> Giuseppe Di Natale <guss80@gmail.com> @@ -287,6 +293,7 @@ CONTRIBUTORS: Igor K <igor@dilos.org> Igor Kozhukhov <ikozhukhov@gmail.com> Igor Lvovsky <ilvovsky@gmail.com> + Igor Ostapenko <pm@igoro.pro> ilbsmart <wgqimut@gmail.com> Ilkka Sovanto <github@ilkka.kapsi.fi> illiliti <illiliti@protonmail.com> @@ -326,6 +333,7 @@ CONTRIBUTORS: Jinshan Xiong <jinshan.xiong@intel.com> Jitendra Patidar <jitendra.patidar@nutanix.com> JK Dingwall <james@dingwall.me.uk> + Joel Low <joel@joelsplace.sg> Joe Stein <joe.stein@delphix.com> John-Mark Gurney <jmg@funkthat.com> John Albietz <inthecloud247@gmail.com> @@ -374,6 +382,7 @@ CONTRIBUTORS: Kevin Jin <lostking2008@hotmail.com> Kevin P. Fleming <kevin@km6g.us> Kevin Tanguy <kevin.tanguy@ovh.net> + khoang98 <khoang98@users.noreply.github.com> KireinaHoro <i@jsteward.moe> Kjeld Schouten-Lebbing <kjeld@schouten-lebbing.nl> Kleber Tarcísio <klebertarcisio@yahoo.com.br> @@ -447,6 +456,7 @@ CONTRIBUTORS: Max Zettlmeißl <max@zettlmeissl.de> Md Islam <mdnahian@outlook.com> megari <megari@iki.fi> + Meriel Luna Mittelbach <lunarlambda@gmail.com> Michael D Labriola <michael.d.labriola@gmail.com> Michael Franzl <michael@franzl.name> Michael Gebetsroither <michael@mgeb.org> @@ -494,6 +504,7 @@ CONTRIBUTORS: Orivej Desh <orivej@gmx.fr> Pablo Correa Gómez <ablocorrea@hotmail.com> Palash Gandhi <pbg4930@rit.edu> + Patrick Fasano <patrick@patrickfasano.com> Patrick Mooney <pmooney@pfmooney.com> Patrik Greco <sikevux@sikevux.se> Paul B. Henson <henson@acm.org> @@ -535,6 +546,7 @@ CONTRIBUTORS: Remy Blank <remy.blank@pobox.com> renelson <bnelson@nelsonbe.com> Reno Reckling <e-github@wthack.de> + René Wirnata <rene.wirnata@pandascience.net> Ricardo M. 
Correia <ricardo.correia@oracle.com> Riccardo Schirone <rschirone91@gmail.com> Richard Allen <belperite@gmail.com> @@ -640,6 +652,7 @@ CONTRIBUTORS: tleydxdy <shironeko.github@tesaguri.club> Tobin Harding <me@tobin.cc> Todd Seidelmann <seidelma@users.noreply.github.com> + Todd Zullinger <tmz@pobox.com> Tom Caputi <tcaputi@datto.com> Tom Matthews <tom@axiom-partners.com> Tomohiro Kusumi <kusumi.tomohiro@gmail.com> diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 47f0795bfa11..5704b5c6de8a 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.3.99 +Version: 2.4.99 Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.15 +Linux-Maximum: 6.16 Linux-Minimum: 4.18 diff --git a/sys/contrib/openzfs/Makefile.am b/sys/contrib/openzfs/Makefile.am index 5f09d170e730..30f78e490b78 100644 --- a/sys/contrib/openzfs/Makefile.am +++ b/sys/contrib/openzfs/Makefile.am @@ -1,6 +1,7 @@ CLEANFILES = dist_noinst_DATA = INSTALL_DATA_HOOKS = +INSTALL_EXEC_HOOKS = ALL_LOCAL = CLEAN_LOCAL = CHECKS = shellcheck checkbashisms @@ -71,6 +72,9 @@ all: gitrev PHONY += install-data-hook $(INSTALL_DATA_HOOKS) install-data-hook: $(INSTALL_DATA_HOOKS) +PHONY += install-exec-hook $(INSTALL_EXEC_HOOKS) +install-exec-hook: $(INSTALL_EXEC_HOOKS) + PHONY += maintainer-clean-local maintainer-clean-local: -$(RM) $(GITREV) diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am index 96040976e53e..ca94f6b77e06 100644 --- a/sys/contrib/openzfs/cmd/Makefile.am +++ b/sys/contrib/openzfs/cmd/Makefile.am @@ -98,17 +98,16 @@ endif if USING_PYTHON -bin_SCRIPTS += arc_summary arcstat dbufstat zilstat -CLEANFILES += arc_summary arcstat dbufstat zilstat -dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in +bin_SCRIPTS += zarcsummary zarcstat dbufstat zilstat +CLEANFILES += zarcsummary zarcstat dbufstat zilstat +dist_noinst_DATA += %D%/zarcsummary %D%/zarcstat.in %D%/dbufstat.in %D%/zilstat.in -$(call SUBST,arcstat,%D%/) +$(call SUBST,zarcstat,%D%/) $(call SUBST,dbufstat,%D%/) $(call SUBST,zilstat,%D%/) -arc_summary: %D%/arc_summary +zarcsummary: %D%/zarcsummary $(AM_V_at)cp $< $@ endif - PHONY += cmd cmd: $(bin_SCRIPTS) $(bin_PROGRAMS) $(sbin_SCRIPTS) $(sbin_PROGRAMS) $(dist_bin_SCRIPTS) $(zfsexec_PROGRAMS) $(mounthelper_PROGRAMS) diff --git a/sys/contrib/openzfs/cmd/arcstat.in b/sys/contrib/openzfs/cmd/zarcstat.in index 6f9abb39c3fb..8ffd20481166 100755 --- a/sys/contrib/openzfs/cmd/arcstat.in +++ b/sys/contrib/openzfs/cmd/zarcstat.in @@ -2,7 +2,7 @@ # SPDX-License-Identifier: CDDL-1.0 # # Print out ZFS ARC Statistics exported via kstat(1) -# For a definition of fields, or usage, use arcstat -v +# For a definition of fields, or usage, use zarcstat -v # # This script was originally a fork of the original arcstat.pl (0.1) # by Neelakanth Nadgir, originally published on his Sun blog on @@ -56,6 +56,7 @@ import time import getopt import re import copy +import os from signal import signal, SIGINT, SIGWINCH, SIG_DFL @@ -171,7 +172,7 @@ cols = { "zactive": [7, 1000, "zfetch prefetches active per second"], } -# ARC structural breakdown from arc_summary +# ARC structural breakdown from zarcsummary structfields = { "cmp": ["compressed", "Compressed"], "ovh": ["overhead", "Overhead"], @@ -187,7 +188,7 @@ structstats = { # size stats "sz": ["_size", "size"], } -# ARC types breakdown from arc_summary +# ARC types breakdown from zarcsummary typefields = { "data": ["data", 
"ARC data"], "meta": ["metadata", "ARC metadata"], @@ -198,7 +199,7 @@ typestats = { # size stats "sz": ["_size", "size"], } -# ARC states breakdown from arc_summary +# ARC states breakdown from zarcsummary statefields = { "ano": ["anon", "Anonymous"], "mfu": ["mfu", "MFU"], @@ -261,7 +262,7 @@ hdr_intr = 20 # Print header every 20 lines of output opfile = None sep = " " # Default separator is 2 spaces l2exist = False -cmd = ("Usage: arcstat [-havxp] [-f fields] [-o file] [-s string] [interval " +cmd = ("Usage: zarcstat [-havxp] [-f fields] [-o file] [-s string] [interval " "[count]]\n") cur = {} d = {} @@ -348,10 +349,10 @@ def usage(): "character or string\n") sys.stderr.write("\t -p : Disable auto-scaling of numerical fields\n") sys.stderr.write("\nExamples:\n") - sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") - sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") - sys.stderr.write("\tarcstat -v\n") - sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") + sys.stderr.write("\tzarcstat -o /tmp/a.log 2 10\n") + sys.stderr.write("\tzarcstat -s \",\" -o /tmp/a.log 2 10\n") + sys.stderr.write("\tzarcstat -v\n") + sys.stderr.write("\tzarcstat -f time,hit%,dh%,ph%,mh% 1\n") sys.stderr.write("\n") sys.exit(1) @@ -366,7 +367,7 @@ def snap_stats(): cur = kstat - # fill in additional values from arc_summary + # fill in additional values from zarcsummary cur["caches_size"] = caches_size = cur["anon_data"]+cur["anon_metadata"]+\ cur["mfu_data"]+cur["mfu_metadata"]+cur["mru_data"]+cur["mru_metadata"]+\ cur["uncached_data"]+cur["uncached_metadata"] @@ -766,6 +767,7 @@ def calculate(): def main(): + global sint global count global hdr_intr diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/zarcsummary index c1319573220c..24a129d9ca70 100755 --- a/sys/contrib/openzfs/cmd/arc_summary +++ b/sys/contrib/openzfs/cmd/zarcsummary @@ -34,7 +34,7 @@ Provides basic information on the ARC, its efficiency, the L2ARC (if present), the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the in-source documentation and code at https://github.com/openzfs/zfs/blob/master/module/zfs/arc.c for details. -The original introduction to arc_summary can be found at +The original introduction to zarcsummary can be found at http://cuddletech.com/?p=454 """ @@ -161,7 +161,7 @@ elif sys.platform.startswith('linux'): return get_params(TUNABLES_PATH) def get_version_impl(request): - # The original arc_summary called /sbin/modinfo/{spl,zfs} to get + # The original zarcsummary called /sbin/modinfo/{spl,zfs} to get # the version information. 
We switch to /sys/module/{spl,zfs}/version # to make sure we get what is really loaded in the kernel try: @@ -439,7 +439,7 @@ def print_header(): """ # datetime is now recommended over time but we keep the exact formatting - # from the older version of arc_summary in case there are scripts + # from the older version of zarcsummary in case there are scripts # that expect it in this way daydate = time.strftime(DATE_FORMAT) spc_date = LINE_LENGTH-len(daydate) @@ -559,6 +559,7 @@ def section_arc(kstats_dict): print() compressed_size = arc_stats['compressed_size'] + uncompressed_size = arc_stats['uncompressed_size'] overhead_size = arc_stats['overhead_size'] bonus_size = arc_stats['bonus_size'] dnode_size = arc_stats['dnode_size'] @@ -671,6 +672,8 @@ def section_arc(kstats_dict): print() print('ARC misc:') + prt_i2('Uncompressed size:', f_perc(uncompressed_size, compressed_size), + f_bytes(uncompressed_size)) prt_i1('Memory throttles:', arc_stats['memory_throttle_count']) prt_i1('Memory direct reclaims:', arc_stats['memory_direct_count']) prt_i1('Memory indirect reclaims:', arc_stats['memory_indirect_count']) diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 45eb9c783659..d655fa715e15 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -107,7 +107,9 @@ extern uint_t zfs_reconstruct_indirect_combinations_max; extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; -uint8_t dump_opt[256]; +uint8_t dump_opt[512]; + +#define ALLOCATED_OPT 256 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); @@ -127,6 +129,7 @@ static zfs_range_tree_t *mos_refd_objs; static spa_t *spa; static objset_t *os; static boolean_t kernel_init_done; +static boolean_t corruption_found = B_FALSE; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); @@ -176,7 +179,7 @@ static int sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx) { - ASSERT3P(tx, ==, NULL); + ASSERT0P(tx); struct sublivelist_verify *sv = arg; sublivelist_verify_block_refcnt_t current = { .svbr_blk = *bp, @@ -208,7 +211,7 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, sublivelist_verify_block_t svb = { .svb_dva = bp->blk_dva[i], .svb_allocated_txg = - BP_GET_LOGICAL_BIRTH(bp) + BP_GET_BIRTH(bp) }; if (zfs_btree_find(&sv->sv_leftover, &svb, @@ -250,6 +253,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) &e->svbr_blk, B_TRUE); (void) printf("\tERROR: %d unmatched FREE(s): %s\n", e->svbr_refcnt, blkbuf); + corruption_found = B_TRUE; } zfs_btree_destroy(&sv->sv_pair); @@ -381,7 +385,7 @@ verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, sublivelist_verify_block_t svb = {{{0}}}; DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); DVA_SET_OFFSET(&svb.svb_dva, offset); - DVA_SET_ASIZE(&svb.svb_dva, size); + DVA_SET_ASIZE(&svb.svb_dva, 0); zfs_btree_index_t where; uint64_t end_offset = offset + size; @@ -405,6 +409,7 @@ verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), (u_longlong_t)found->svb_allocated_txg, (u_longlong_t)txg); + corruption_found = B_TRUE; } } } @@ -426,6 +431,7 @@ metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); + corruption_found = B_TRUE; } else { zfs_range_tree_add(mv->mv_allocated, offset, size); @@ -439,6 +445,7 @@ 
metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); + corruption_found = B_TRUE; } else { zfs_range_tree_remove(mv->mv_allocated, offset, size); @@ -526,6 +533,7 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + corruption_found = B_TRUE; continue; } @@ -542,6 +550,7 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + corruption_found = B_TRUE; continue; } @@ -619,8 +628,9 @@ livelist_metaslab_validate(spa_t *spa) metaslab_calculate_range_tree_type(vd, m, &start, &shift); metaslab_verify_t mv; - mv.mv_allocated = zfs_range_tree_create(NULL, - type, NULL, start, shift); + mv.mv_allocated = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + 0, "livelist_metaslab_validate:mv_allocated"); mv.mv_vdid = vd->vdev_id; mv.mv_msid = m->ms_id; mv.mv_start = m->ms_start; @@ -654,6 +664,7 @@ livelist_metaslab_validate(spa_t *spa) } (void) printf("ERROR: Found livelist blocks marked as allocated " "for indirect vdevs:\n"); + corruption_found = B_TRUE; zfs_btree_index_t *where = NULL; sublivelist_verify_block_t *svb; @@ -797,8 +808,8 @@ usage(void) "[default is 200]\n"); (void) fprintf(stderr, " -K --key=KEY " "decryption key for encrypted dataset\n"); - (void) fprintf(stderr, " -o --option=\"OPTION=INTEGER\" " - "set global variable to an unsigned 32-bit integer\n"); + (void) fprintf(stderr, " -o --option=\"NAME=VALUE\" " + "set the named tunable to the given value\n"); (void) fprintf(stderr, " -p --path==PATH " "use one or more with -e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P --parseable " @@ -826,7 +837,7 @@ usage(void) (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); - zdb_exit(1); + zdb_exit(2); } static void @@ -891,9 +902,9 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); - VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); + VERIFY0(dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); - VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); + VERIFY0(nvlist_unpack(packed, nvsize, &nv, 0)); umem_free(packed, nvsize); @@ -1454,8 +1465,8 @@ get_obsolete_refcount(vdev_t *vd) refcount++; } } else { - ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); - ASSERT3U(obsolete_sm_object, ==, 0); + ASSERT0P(vd->vdev_obsolete_sm); + ASSERT0(obsolete_sm_object); } for (unsigned c = 0; c < vd->vdev_children; c++) { refcount += get_obsolete_refcount(vd->vdev_child[c]); @@ -1577,9 +1588,8 @@ dump_spacemap(objset_t *os, space_map_t *sm) continue; } - uint8_t words; char entry_type; - uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; + uint64_t entry_off, entry_run, entry_vdev; if (sm_entry_is_single_word(word)) { entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? 
@@ -1587,35 +1597,43 @@ dump_spacemap(objset_t *os, space_map_t *sm) entry_off = (SM_OFFSET_DECODE(word) << mapshift) + sm->sm_start; entry_run = SM_RUN_DECODE(word) << mapshift; - words = 1; + + (void) printf("\t [%6llu] %c " + "range: %012llx-%012llx size: %08llx\n", + (u_longlong_t)entry_id, entry_type, + (u_longlong_t)entry_off, + (u_longlong_t)(entry_off + entry_run - 1), + (u_longlong_t)entry_run); } else { /* it is a two-word entry so we read another word */ ASSERT(sm_entry_is_double_word(word)); uint64_t extra_word; offset += sizeof (extra_word); + ASSERT3U(offset, <, space_map_length(sm)); VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (extra_word), &extra_word, DMU_READ_PREFETCH)); - ASSERT3U(offset, <=, space_map_length(sm)); - entry_run = SM2_RUN_DECODE(word) << mapshift; entry_vdev = SM2_VDEV_DECODE(word); entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM2_OFFSET_DECODE(extra_word) << mapshift) + sm->sm_start; - words = 2; - } - (void) printf("\t [%6llu] %c range:" - " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", - (u_longlong_t)entry_id, - entry_type, (u_longlong_t)entry_off, - (u_longlong_t)(entry_off + entry_run), - (u_longlong_t)entry_run, - (u_longlong_t)entry_vdev, words); + if (zopt_metaslab_args == 0 || + zopt_metaslab[0] == entry_vdev) { + (void) printf("\t [%6llu] %c " + "range: %012llx-%012llx size: %08llx " + "vdev: %llu\n", + (u_longlong_t)entry_id, entry_type, + (u_longlong_t)entry_off, + (u_longlong_t)(entry_off + entry_run - 1), + (u_longlong_t)entry_run, + (u_longlong_t)entry_vdev); + } + } if (entry_type == 'A') alloc += entry_run; @@ -1651,6 +1669,16 @@ dump_metaslab_stats(metaslab_t *msp) } static void +dump_allocated(void *arg, uint64_t start, uint64_t size) +{ + uint64_t *off = arg; + if (*off != start) + (void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", *off, + start - *off); + *off = start + size; +} + +static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; @@ -1666,13 +1694,24 @@ dump_metaslab(metaslab_t *msp) (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); - if (dump_opt['m'] > 2 && !dump_opt['L']) { + if (dump_opt[ALLOCATED_OPT] || + (dump_opt['m'] > 2 && !dump_opt['L'])) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); + } + + if (dump_opt['m'] > 2 && !dump_opt['L']) { zfs_range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); - metaslab_unload(msp); - mutex_exit(&msp->ms_lock); + } + + if (dump_opt[ALLOCATED_OPT]) { + uint64_t off = msp->ms_start; + zfs_range_tree_walk(msp->ms_allocatable, dump_allocated, + &off); + if (off != msp->ms_start + msp->ms_size) + (void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off, + msp->ms_size - off); } if (dump_opt['m'] > 1 && sm != NULL && @@ -1687,6 +1726,12 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } + if (dump_opt[ALLOCATED_OPT] || + (dump_opt['m'] > 2 && !dump_opt['L'])) { + metaslab_unload(msp); + mutex_exit(&msp->ms_lock); + } + if (vd->vdev_ops == &vdev_draid_ops) ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); else @@ -1723,8 +1768,9 @@ print_vdev_metaslab_header(vdev_t *vd) } } - (void) printf("\tvdev %10llu %s", - (u_longlong_t)vd->vdev_id, bias_str); + (void) printf("\tvdev %10llu\t%s metaslab shift %4llu", + (u_longlong_t)vd->vdev_id, bias_str, + (u_longlong_t)vd->vdev_ms_shift); if (ms_flush_data_obj != 0) { (void) printf(" ms_unflushed_phys object %llu", @@ -1791,7 +1837,7 @@ print_vdev_indirect(vdev_t *vd) 
vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vim == NULL) { - ASSERT3P(vib, ==, NULL); + ASSERT0P(vib); return; } @@ -1864,7 +1910,7 @@ dump_metaslabs(spa_t *spa) (void) printf("\nMetaslabs:\n"); - if (!dump_opt['d'] && zopt_metaslab_args > 0) { + if (zopt_metaslab_args > 0) { c = zopt_metaslab[0]; if (c >= children) @@ -1991,7 +2037,7 @@ dump_ddt_log(ddt_t *ddt) c += strlcpy(&flagstr[c], " UNKNOWN", sizeof (flagstr) - c); flagstr[1] = '['; - flagstr[c++] = ']'; + flagstr[c] = ']'; } uint64_t count = avl_numnodes(&ddl->ddl_tree); @@ -2042,10 +2088,10 @@ dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class) if (error == ENOENT) return; - ASSERT(error == 0); + ASSERT0(error); error = ddt_object_count(ddt, type, class, &count); - ASSERT(error == 0); + ASSERT0(error); if (count == 0) return; @@ -2568,7 +2614,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), - (u_longlong_t)BP_GET_BIRTH(bp)); + (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp)); if (bp_freed) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); @@ -2582,19 +2628,17 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, } } -static void +static u_longlong_t print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; + u_longlong_t offset; int l; - if (!BP_IS_EMBEDDED(bp)) { - ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); - ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - } + offset = (u_longlong_t)blkid2offset(dnp, bp, zb); - (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); + (void) printf("%16llx ", offset); ASSERT(zb->zb_level >= 0); @@ -2609,19 +2653,38 @@ print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); - (void) printf("%s\n", blkbuf); + (void) printf("%s", blkbuf); + + if (!BP_IS_EMBEDDED(bp)) { + if (BP_GET_TYPE(bp) != dnp->dn_type) { + (void) printf(" (ERROR: Block pointer type " + "(%llu) does not match dnode type (%hhu))", + BP_GET_TYPE(bp), dnp->dn_type); + corruption_found = B_TRUE; + } + if (BP_GET_LEVEL(bp) != zb->zb_level) { + (void) printf(" (ERROR: Block pointer level " + "(%llu) does not match bookmark level (%lld))", + BP_GET_LEVEL(bp), (longlong_t)zb->zb_level); + corruption_found = B_TRUE; + } + } + (void) printf("\n"); + + return (offset); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { + u_longlong_t offset; int err = 0; - if (BP_GET_LOGICAL_BIRTH(bp) == 0) + if (BP_GET_BIRTH(bp) == 0) return (0); - print_indirect(spa, bp, zb, dnp); + offset = print_indirect(spa, bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; @@ -2651,8 +2714,15 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, break; fill += BP_GET_FILL(cbp); } - if (!err) - ASSERT3U(fill, ==, BP_GET_FILL(bp)); + if (!err) { + if (fill != BP_GET_FILL(bp)) { + (void) printf("%16llx: Block pointer " + "fill (%llu) does not match calculated " + "value (%llu)\n", offset, BP_GET_FILL(bp), + (u_longlong_t)fill); + corruption_found = B_TRUE; + } + } arc_buf_destroy(buf, &buf); } @@ -2806,7 +2876,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) (void) arg, (void) 
tx; char blkbuf[BP_SPRINTF_LEN]; - if (BP_GET_LOGICAL_BIRTH(bp) != 0) { + if (BP_GET_BIRTH(bp) != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } @@ -2847,7 +2917,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; - ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0); + ASSERT(BP_GET_BIRTH(bp) != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); @@ -2908,6 +2978,7 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); + corruption_found = B_TRUE; continue; } dump_full_bpobj(&subbpo, "subobj", indent + 1); @@ -3087,6 +3158,7 @@ bpobj_count_refd(bpobj_t *bpo) (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); + corruption_found = B_TRUE; continue; } bpobj_count_refd(&subbpo); @@ -3108,7 +3180,7 @@ dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) static int dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) { - ASSERT(arg == NULL); + ASSERT0P(arg); if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), @@ -3346,7 +3418,7 @@ open_objset(const char *path, const void *tag, objset_t **osp) uint64_t sa_attrs = 0; uint64_t version = 0; - VERIFY3P(sa_os, ==, NULL); + VERIFY0P(sa_os); /* * We can't own an objset if it's redacted. Therefore, we do this @@ -3519,8 +3591,8 @@ dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) uint64_t fuid_obj; /* first find the fuid object. It lives in the master node */ - VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, - 8, 1, &fuid_obj) == 0); + VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, + 8, 1, &fuid_obj)); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); @@ -5921,11 +5993,11 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, * entry back to the block pointer before we claim it. 
*/ if (v == DDT_PHYS_FLAT) { - ASSERT3U(BP_GET_BIRTH(bp), ==, + ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==, ddt_phys_birth(dde->dde_phys, v)); tempbp = *bp; ddt_bp_fill(dde->dde_phys, v, &tempbp, - BP_GET_BIRTH(bp)); + BP_GET_PHYSICAL_BIRTH(bp)); bp = &tempbp; } @@ -6151,7 +6223,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (zb->zb_level == ZB_DNODE_LEVEL) return (0); - if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) { + if (dump_opt['b'] >= 5 && BP_GET_BIRTH(bp) > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " @@ -6322,8 +6394,9 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); - zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *allocs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + 0, "zdb_claim_removing:allocs"); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; @@ -6750,6 +6823,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; + spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), @@ -6887,7 +6961,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == - spa_embedded_log_class(spa)) ? + spa_embedded_log_class(spa) || + msp->ms_group->mg_class == + spa_special_embedded_log_class(spa)) ? vd->vdev_log_mg : vd->vdev_mg); /* @@ -7011,7 +7087,7 @@ deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) static void dump_livelist_cb(dsl_deadlist_t *ll, void *arg) { - ASSERT3P(arg, ==, NULL); + ASSERT0P(arg); global_feature_count[SPA_FEATURE_LIVELIST]++; dump_blkptr_list(ll, "Deleted Livelist"); dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); @@ -7121,6 +7197,8 @@ dump_block_stats(spa_t *spa) zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_embedded_log_class(spa)); + zcb->zcb_totalasize += + metaslab_class_get_alloc(spa_special_embedded_log_class(spa)); zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); @@ -7169,6 +7247,7 @@ dump_block_stats(spa_t *spa) total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + + metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); @@ -7252,6 +7331,18 @@ dump_block_stats(spa_t *spa) 100.0 * alloc / space); } + if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor + != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_special_embedded_log_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_special_embedded_log_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Special embedded log", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb->zcb_embedded_blocks[i] == 0) continue; @@ -7706,7 +7797,8 @@ zdb_set_skip_mmp(char *target) * applies to the new_path parameter 
if allocated. */ static char * -import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) +import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa, + char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; @@ -7714,11 +7806,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); - if (path_start != NULL) { + if (target_is_spa || path_start == NULL) { + poolname = target; + } else { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); - } else { - poolname = target; } if (cfg == NULL) { @@ -7749,10 +7841,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) "with error %d\n", bogus_name, error); } - if (new_path != NULL && path_start != NULL) { - if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { + if (new_path != NULL && !target_is_spa) { + if (asprintf(new_path, "%s%s", bogus_name, + path_start != NULL ? path_start : "") == -1) { free(bogus_name); - if (path_start != NULL) + if (!target_is_spa && path_start != NULL) free(poolname); return (NULL); } @@ -7891,7 +7984,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) for (uint64_t c = ckpoint_rvd->vdev_children; c < current_rvd->vdev_children; c++) { vdev_t *current_vd = current_rvd->vdev_child[c]; - VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL); + VERIFY0P(current_vd->vdev_checkpoint_sm); } } @@ -7981,7 +8074,7 @@ verify_checkpoint_blocks(spa_t *spa) * name) so we can do verification on it against the current state * of the pool. */ - checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, + checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); @@ -8451,8 +8544,9 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; - mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + mos_refd_objs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + 0, "dump_zpool:mos_refd_objs"); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { @@ -8588,9 +8682,9 @@ zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) } static void -zdb_dump_gbh(void *buf, int flags) +zdb_dump_gbh(void *buf, uint64_t size, int flags) { - zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); + zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags); } static void @@ -8780,7 +8874,6 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, (void) buf; uint64_t orig_lsize = lsize; boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL)); - boolean_t found = B_FALSE; /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. 
@@ -8823,20 +8916,19 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, for (cfuncp = cfuncs; *cfuncp; cfuncp++) { if (try_decompress_block(pabd, lsize, psize, flags, *cfuncp, lbuf, lbuf2)) { - found = B_TRUE; + tryzle = B_FALSE; break; } } if (*cfuncp != 0) break; } - if (!found && tryzle) { + if (tryzle) { for (lsize = orig_lsize; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { if (try_decompress_block(pabd, lsize, psize, flags, ZIO_COMPRESS_ZLE, lbuf, lbuf2)) { *cfuncp = ZIO_COMPRESS_ZLE; - found = B_TRUE; break; } } @@ -9073,7 +9165,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_indirect((blkptr_t *)buf, orig_lsize / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) - zdb_dump_gbh(buf, flags); + zdb_dump_gbh(buf, lsize, flags); else zdb_dump_block(thing, buf, lsize, flags); @@ -9120,7 +9212,7 @@ zdb_read_block(char *thing, spa_t *spa) ck_zio->io_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; - zio_checksum_compute(ck_zio, ck, pabd, lsize); + zio_checksum_compute(ck_zio, ck, pabd, psize); printf( "%12s\t" "cksum=%016llx:%016llx:%016llx:%016llx\n", @@ -9313,6 +9405,8 @@ main(int argc, char **argv) {"all-reconstruction", no_argument, NULL, 'Y'}, {"livelist", no_argument, NULL, 'y'}, {"zstd-headers", no_argument, NULL, 'Z'}, + {"allocated-map", no_argument, NULL, + ALLOCATED_OPT}, {0, 0, 0, 0} }; @@ -9343,6 +9437,7 @@ main(int argc, char **argv) case 'u': case 'y': case 'Z': + case ALLOCATED_OPT: dump_opt[c]++; dump_all = 0; break; @@ -9377,9 +9472,11 @@ main(int argc, char **argv) while (*optarg != '\0') { *optarg++ = '*'; } break; case 'o': - error = set_global_var(optarg); + dump_opt[c]++; + dump_all = 0; + error = handle_tunable_option(optarg, B_FALSE); if (error != 0) - usage(); + zdb_exit(1); break; case 'p': if (searchdirs == NULL) { @@ -9545,6 +9642,12 @@ main(int argc, char **argv) error = 0; goto fini; } + if (dump_opt['o']) + /* + * Avoid blasting tunable options off the top of the + * screen. 
+ */ + zdb_exit(1); usage(); } @@ -9605,7 +9708,7 @@ main(int argc, char **argv) } else if (objset_str && !zdb_numeric(objset_str + 1) && dump_opt['N']) { printf("Supply a numeric objset ID with -N\n"); - error = 1; + error = 2; goto fini; } } else { @@ -9697,7 +9800,7 @@ main(int argc, char **argv) char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, - &checkpoint_target); + target_is_spa, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; @@ -9714,7 +9817,7 @@ main(int argc, char **argv) if (error == 0) { if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { ASSERT(checkpoint_pool != NULL); - ASSERT(checkpoint_target == NULL); + ASSERT0P(checkpoint_target); error = spa_open(checkpoint_pool, &spa, FTAG); if (error != 0) { @@ -9907,5 +10010,8 @@ fini: if (kernel_init_done) kernel_fini(); + if (corruption_found && error == 0) + error = 3; + return (error); } diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.h b/sys/contrib/openzfs/cmd/zdb/zdb.h index 6b6c9169816b..48b561eb202c 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.h +++ b/sys/contrib/openzfs/cmd/zdb/zdb.h @@ -29,6 +29,6 @@ #define _ZDB_H void dump_intent_log(zilog_t *); -extern uint8_t dump_opt[256]; +extern uint8_t dump_opt[512]; #endif /* _ZDB_H */ diff --git a/sys/contrib/openzfs/cmd/zdb/zdb_il.c b/sys/contrib/openzfs/cmd/zdb/zdb_il.c index 6b90b08ca1b1..3d91fb28a4c7 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb_il.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb_il.c @@ -48,8 +48,6 @@ #include "zdb.h" -extern uint8_t dump_opt[256]; - static char tab_prefix[4] = "\t\t\t"; static void @@ -176,7 +174,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", tab_prefix, - !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >= + !BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); @@ -189,7 +187,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) (void) printf("%s<hole>\n", tab_prefix); return; } - if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) { + if (BP_GET_BIRTH(bp) < zilog->zl_header->zh_claim_txg) { (void) printf("%s<block already committed>\n", tab_prefix); return; @@ -240,7 +238,7 @@ zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg) if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", tab_prefix, - !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >= + !BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); @@ -476,7 +474,7 @@ print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg, if (claim_txg != 0) claim = "already claimed"; - else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa)) + else if (BP_GET_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa)) claim = "will claim"; else claim = "won't claim"; diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c index 8718dbde03b6..c0590edc7516 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c @@ -134,11 +134,13 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) * of blkid cache and L2ARC VDEV does not contain pool guid in its * blkid, so this is a special case for L2ARC VDEV. 
*/ - else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL && + else if (gsp->gs_vdev_guid != 0 && nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 && gsp->gs_vdev_guid == vdev_guid) { - (void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, - &gsp->gs_devid); + if (gsp->gs_devid == NULL) { + (void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, + &gsp->gs_devid); + } (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME, &gsp->gs_vdev_expandtime); return (B_TRUE); @@ -156,22 +158,28 @@ zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) /* * For each vdev in this pool, look for a match by devid */ - if ((config = zpool_get_config(zhp, NULL)) != NULL) { - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvl) == 0) { - (void) zfs_agent_iter_vdev(zhp, nvl, gsp); - } - } - /* - * if a match was found then grab the pool guid - */ - if (gsp->gs_vdev_guid && gsp->gs_devid) { - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &gsp->gs_pool_guid); - } + boolean_t found = B_FALSE; + uint64_t pool_guid; + /* Get pool configuration and extract pool GUID */ + if ((config = zpool_get_config(zhp, NULL)) == NULL || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0) + goto out; + + /* Skip this pool if we're looking for a specific pool */ + if (gsp->gs_pool_guid != 0 && pool_guid != gsp->gs_pool_guid) + goto out; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) == 0) + found = zfs_agent_iter_vdev(zhp, nvl, gsp); + + if (found && gsp->gs_pool_guid == 0) + gsp->gs_pool_guid = pool_guid; + +out: zpool_close(zhp); - return (gsp->gs_devid != NULL && gsp->gs_vdev_guid != 0); + return (found); } void @@ -233,20 +241,17 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or * ZFS_EV_POOL_GUID may be missing so find them. 
*/ - if (devid == NULL || pool_guid == 0 || vdev_guid == 0) { - if (devid == NULL) - search.gs_vdev_guid = vdev_guid; - else - search.gs_devid = devid; - zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); - if (devid == NULL) - devid = search.gs_devid; - if (pool_guid == 0) - pool_guid = search.gs_pool_guid; - if (vdev_guid == 0) - vdev_guid = search.gs_vdev_guid; - devtype = search.gs_vdev_type; - } + search.gs_devid = devid; + search.gs_vdev_guid = vdev_guid; + search.gs_pool_guid = pool_guid; + zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); + if (devid == NULL) + devid = search.gs_devid; + if (pool_guid == 0) + pool_guid = search.gs_pool_guid; + if (vdev_guid == 0) + vdev_guid = search.gs_vdev_guid; + devtype = search.gs_vdev_type; /* * We want to avoid reporting "remove" events coming from diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am index 093a04c4636a..c0b161ecf248 100644 --- a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am +++ b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am @@ -9,18 +9,18 @@ dist_zedexec_SCRIPTS = \ %D%/all-debug.sh \ %D%/all-syslog.sh \ %D%/data-notify.sh \ - %D%/deadman-slot_off.sh \ + %D%/deadman-sync-slot_off.sh \ %D%/generic-notify.sh \ - %D%/pool_import-led.sh \ + %D%/pool_import-sync-led.sh \ %D%/resilver_finish-notify.sh \ %D%/resilver_finish-start-scrub.sh \ %D%/scrub_finish-notify.sh \ - %D%/statechange-led.sh \ + %D%/statechange-sync-led.sh \ %D%/statechange-notify.sh \ - %D%/statechange-slot_off.sh \ + %D%/statechange-sync-slot_off.sh \ %D%/trim_finish-notify.sh \ - %D%/vdev_attach-led.sh \ - %D%/vdev_clear-led.sh + %D%/vdev_attach-sync-led.sh \ + %D%/vdev_clear-sync-led.sh nodist_zedexec_SCRIPTS = \ %D%/history_event-zfs-list-cacher.sh @@ -30,17 +30,17 @@ SUBSTFILES += $(nodist_zedexec_SCRIPTS) zedconfdefaults = \ all-syslog.sh \ data-notify.sh \ - deadman-slot_off.sh \ + deadman-sync-slot_off.sh \ history_event-zfs-list-cacher.sh \ - pool_import-led.sh \ + pool_import-sync-led.sh \ resilver_finish-notify.sh \ resilver_finish-start-scrub.sh \ scrub_finish-notify.sh \ - statechange-led.sh \ + statechange-sync-led.sh \ statechange-notify.sh \ - statechange-slot_off.sh \ - vdev_attach-led.sh \ - vdev_clear-led.sh + statechange-sync-slot_off.sh \ + vdev_attach-sync-led.sh \ + vdev_clear-sync-led.sh dist_noinst_DATA += %D%/README diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/deadman-slot_off.sh b/sys/contrib/openzfs/cmd/zed/zed.d/deadman-sync-slot_off.sh index 7b339b3add01..7b339b3add01 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/deadman-slot_off.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/deadman-sync-slot_off.sh diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh deleted file mode 120000 index 7d7404398a4a..000000000000 --- a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-sync-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-sync-led.sh new file mode 120000 index 000000000000..8b9c10c11ebb --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-sync-led.sh index 40cb61f17307..40cb61f17307 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-sync-led.sh diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-slot_off.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-sync-slot_off.sh index 06acce93b8aa..06acce93b8aa 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-slot_off.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-sync-slot_off.sh diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh deleted file mode 120000 index 7d7404398a4a..000000000000 --- a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-sync-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-sync-led.sh new file mode 120000 index 000000000000..8b9c10c11ebb --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh deleted file mode 120000 index 7d7404398a4a..000000000000 --- a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-sync-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-sync-led.sh new file mode 120000 index 000000000000..8b9c10c11ebb --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh index 6e00f153be1c..78d8f658ddd8 100644 --- a/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh @@ -441,8 +441,9 @@ zed_notify_slack_webhook() "${pathname}")" # Construct the JSON message for posting. + # shellcheck disable=SC2016 # - msg_json="$(printf '{"text": "*%s*\\n%s"}' "${subject}" "${msg_body}" )" + msg_json="$(printf '{"text": "*%s*\\n```%s```"}' "${subject}" "${msg_body}" )" # Send the POST request and check for errors. # diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.c b/sys/contrib/openzfs/cmd/zed/zed_event.c index 296c222ca382..ba7cba304b1d 100644 --- a/sys/contrib/openzfs/cmd/zed/zed_event.c +++ b/sys/contrib/openzfs/cmd/zed/zed_event.c @@ -110,7 +110,7 @@ zed_event_fini(struct zed_conf *zcp) static void _bump_event_queue_length(void) { - int zzlm = -1, wr; + int zzlm, wr; char qlen_buf[12] = {0}; /* parameter is int => max "-2147483647\n" */ long int qlen, orig_qlen; diff --git a/sys/contrib/openzfs/cmd/zed/zed_exec.c b/sys/contrib/openzfs/cmd/zed/zed_exec.c index 036081decd64..a14af4f20a85 100644 --- a/sys/contrib/openzfs/cmd/zed/zed_exec.c +++ b/sys/contrib/openzfs/cmd/zed/zed_exec.c @@ -196,37 +196,29 @@ _nop(int sig) (void) sig; } -static void * -_reap_children(void *arg) +static void +wait_for_children(boolean_t do_pause, boolean_t wait) { - (void) arg; - struct launched_process_node node, *pnode; pid_t pid; - int status; struct rusage usage; - struct sigaction sa = {}; - - (void) sigfillset(&sa.sa_mask); - (void) sigdelset(&sa.sa_mask, SIGCHLD); - (void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); - - (void) sigemptyset(&sa.sa_mask); - sa.sa_handler = _nop; - sa.sa_flags = SA_NOCLDSTOP; - (void) sigaction(SIGCHLD, &sa, NULL); + int status; + struct launched_process_node node, *pnode; for (_reap_children_stop = B_FALSE; !_reap_children_stop; ) { (void) pthread_mutex_lock(&_launched_processes_lock); - pid = wait4(0, &status, WNOHANG, &usage); - + pid = wait4(0, &status, wait ? 0 : WNOHANG, &usage); if (pid == 0 || pid == (pid_t)-1) { (void) pthread_mutex_unlock(&_launched_processes_lock); - if (pid == 0 || errno == ECHILD) - pause(); - else if (errno != EINTR) + if ((pid == 0) || (errno == ECHILD)) { + if (do_pause) + pause(); + } else if (errno != EINTR) zed_log_msg(LOG_WARNING, "Failed to wait for children: %s", strerror(errno)); + if (!do_pause) + return; + } else { memset(&node, 0, sizeof (node)); node.pid = pid; @@ -278,6 +270,25 @@ _reap_children(void *arg) } } +} + +static void * +_reap_children(void *arg) +{ + (void) arg; + struct sigaction sa = {}; + + (void) sigfillset(&sa.sa_mask); + (void) sigdelset(&sa.sa_mask, SIGCHLD); + (void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); + + (void) sigemptyset(&sa.sa_mask); + sa.sa_handler = _nop; + sa.sa_flags = SA_NOCLDSTOP; + (void) sigaction(SIGCHLD, &sa, NULL); + + wait_for_children(B_TRUE, B_FALSE); + return (NULL); } @@ -307,6 +318,45 @@ zed_exec_fini(void) } /* + * Check if the zedlet name indicates if it is a synchronous zedlet + * + * Synchronous zedlets have a "-sync-" immediately following the event name in + * their zedlet filename, like: + * + * EVENT_NAME-sync-ZEDLETNAME.sh + * + * For example, if you wanted a synchronous statechange script: + * + * statechange-sync-myzedlet.sh + * + * Synchronous zedlets are guaranteed to be the only zedlet running. 
No other + * zedlets may run in parallel with a synchronous zedlet. A synchronous + * zedlet will wait for all previously spawned zedlets to finish before running. + * Users should be careful to only use synchronous zedlets when needed, since + * they decrease parallelism. + */ +static boolean_t +zedlet_is_sync(const char *zedlet, const char *event) +{ + const char *sync_str = "-sync-"; + size_t sync_str_len; + size_t zedlet_len; + size_t event_len; + + sync_str_len = strlen(sync_str); + zedlet_len = strlen(zedlet); + event_len = strlen(event); + + if (event_len + sync_str_len >= zedlet_len) + return (B_FALSE); + + if (strncmp(&zedlet[event_len], sync_str, sync_str_len) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +/* * Process the event [eid] by synchronously invoking all zedlets with a * matching class prefix. * @@ -368,9 +418,28 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass, z = zed_strings_next(zcp->zedlets)) { for (csp = class_strings; *csp; csp++) { n = strlen(*csp); - if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) + if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) { + boolean_t is_sync = zedlet_is_sync(z, *csp); + + if (is_sync) { + /* + * Wait for previous zedlets to + * finish + */ + wait_for_children(B_FALSE, B_TRUE); + } + _zed_exec_fork_child(eid, zcp->zedlet_dir, z, e, zcp->zevent_fd, zcp->do_foreground); + + if (is_sync) { + /* + * Wait for sync zedlet we just launched + * to finish. + */ + wait_for_children(B_FALSE, B_TRUE); + } + } } } free(e); diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c index 841c356508a5..484986bde719 100644 --- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c +++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c @@ -440,7 +440,7 @@ get_usage(zfs_help_t idx) return (gettext("\tredact <snapshot> <bookmark> " "<redaction_snapshot> ...\n")); case HELP_REWRITE: - return (gettext("\trewrite [-rvx] [-o <offset>] [-l <length>] " + return (gettext("\trewrite [-Prvx] [-o <offset>] [-l <length>] " "<directory|file ...>\n")); case HELP_JAIL: return (gettext("\tjail <jailid|jailname> <filesystem>\n")); @@ -923,7 +923,7 @@ zfs_do_clone(int argc, char **argv) return (!!ret); usage: - ASSERT3P(zhp, ==, NULL); + ASSERT0P(zhp); nvlist_free(props); usage(B_FALSE); return (-1); @@ -1974,9 +1974,8 @@ fill_dataset_info(nvlist_t *list, zfs_handle_t *zhp, boolean_t as_int) } if (type == ZFS_TYPE_SNAPSHOT) { - char *ds, *snap; - ds = snap = strdup(zfs_get_name(zhp)); - ds = strsep(&snap, "@"); + char *snap = strdup(zfs_get_name(zhp)); + char *ds = strsep(&snap, "@"); fnvlist_add_string(list, "dataset", ds); fnvlist_add_string(list, "snapshot_name", snap); free(ds); @@ -2019,8 +2018,7 @@ get_callback(zfs_handle_t *zhp, void *data) nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; - nvlist_t *item, *d, *props; - item = d = props = NULL; + nvlist_t *item, *d = NULL, *props = NULL; const char *strval; const char *sourceval; boolean_t received = is_recvd_column(cbp); @@ -5305,6 +5303,7 @@ zfs_do_receive(int argc, char **argv) #define ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_SEND_RAW "send:raw" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_RECEIVE_APPEND "receive:append" #define ZFS_DELEG_PERM_ALLOW "allow" @@ -5347,6 +5346,7 @@ static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, { ZFS_DELEG_PERM_ROLLBACK, 
ZFS_DELEG_NOTE_ROLLBACK }, { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, + { ZFS_DELEG_PERM_SEND_RAW, ZFS_DELEG_NOTE_SEND_RAW }, { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, @@ -5879,7 +5879,7 @@ parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) static inline const char * deleg_perm_comment(zfs_deleg_note_t note) { - const char *str = ""; + const char *str; /* subcommands */ switch (note) { @@ -5931,6 +5931,10 @@ deleg_perm_comment(zfs_deleg_note_t note) case ZFS_DELEG_NOTE_SEND: str = gettext(""); break; + case ZFS_DELEG_NOTE_SEND_RAW: + str = gettext("Allow sending ONLY encrypted (raw) replication" + "\n\t\t\t\tstreams"); + break; case ZFS_DELEG_NOTE_SHARE: str = gettext("Allows sharing file systems over NFS or SMB" "\n\t\t\t\tprotocols"); @@ -6860,17 +6864,17 @@ print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl, if (scripted) { if (parsable) { - (void) printf("%s\t%s\t%ld\n", zname, - tagname, (unsigned long)time); + (void) printf("%s\t%s\t%lld\n", zname, + tagname, (long long)time); } else { (void) printf("%s\t%s\t%s\n", zname, tagname, tsbuf); } } else { if (parsable) { - (void) printf("%-*s %-*s %ld\n", + (void) printf("%-*s %-*s %lld\n", nwidth, zname, tagwidth, - tagname, (unsigned long)time); + tagname, (long long)time); } else { (void) printf("%-*s %-*s %s\n", nwidth, zname, tagwidth, @@ -7729,6 +7733,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; + char *zfs_mntpnt, *entry_mntpnt; /* * Search for the given (major,minor) pair in the mount table. @@ -7770,6 +7775,24 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) goto out; } + /* + * If the filesystem is mounted, check that the mountpoint matches + * the one in the mnttab entry w.r.t. provided path. If it doesn't, + * then we should not proceed further. 
+ */ + entry_mntpnt = strdup(entry.mnt_mountp); + if (zfs_is_mounted(zhp, &zfs_mntpnt)) { + if (strcmp(zfs_mntpnt, entry_mntpnt) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': " + "not an original mountpoint\n"), cmdname, path); + free(zfs_mntpnt); + free(entry_mntpnt); + goto out; + } + free(zfs_mntpnt); + } + free(entry_mntpnt); + if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; @@ -9160,8 +9183,11 @@ zfs_do_rewrite(int argc, char **argv) zfs_rewrite_args_t args; memset(&args, 0, sizeof (args)); - while ((c = getopt(argc, argv, "l:o:rvx")) != -1) { + while ((c = getopt(argc, argv, "Pl:o:rvx")) != -1) { switch (c) { + case 'P': + args.flags |= ZFS_REWRITE_PHYSICAL; + break; case 'l': args.len = strtoll(optarg, NULL, 0); break; diff --git a/sys/contrib/openzfs/cmd/zhack.c b/sys/contrib/openzfs/cmd/zhack.c index 8244bc83fa0d..8ffbf91ffb30 100644 --- a/sys/contrib/openzfs/cmd/zhack.c +++ b/sys/contrib/openzfs/cmd/zhack.c @@ -54,6 +54,7 @@ #include <sys/dmu_tx.h> #include <zfeature_common.h> #include <libzutil.h> +#include <sys/metaslab_impl.h> static importargs_t g_importargs; static char *g_pool; @@ -69,7 +70,8 @@ static __attribute__((noreturn)) void usage(void) { (void) fprintf(stderr, - "Usage: zhack [-c cachefile] [-d dir] <subcommand> <args> ...\n" + "Usage: zhack [-o tunable] [-c cachefile] [-d dir] <subcommand> " + "<args> ...\n" "where <subcommand> <args> is one of the following:\n" "\n"); @@ -93,7 +95,10 @@ usage(void) " -c repair corrupted label checksums\n" " -u restore the label on a detached device\n" "\n" - " <device> : path to vdev\n"); + " <device> : path to vdev\n" + "\n" + " metaslab leak <pool>\n" + " apply allocation map from zdb to specified pool\n"); exit(1); } @@ -162,9 +167,9 @@ zhack_import(char *target, boolean_t readonly) props = NULL; if (readonly) { - VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_uint64(props, - zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0); + VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), 1)); } zfeature_checks_disable = B_TRUE; @@ -218,8 +223,8 @@ dump_obj(objset_t *os, uint64_t obj, const char *name) } else { ASSERT(za->za_integer_length == 1); char val[1024]; - VERIFY(zap_lookup(os, obj, za->za_name, - 1, sizeof (val), val) == 0); + VERIFY0(zap_lookup(os, obj, za->za_name, + 1, sizeof (val), val)); (void) printf("\t%s = %s\n", za->za_name, val); } } @@ -363,10 +368,12 @@ feature_incr_sync(void *arg, dmu_tx_t *tx) zfeature_info_t *feature = arg; uint64_t refcount; + mutex_enter(&spa->spa_feat_stats_lock); VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); feature_sync(spa, feature, refcount + 1, tx); spa_history_log_internal(spa, "zhack feature incr", tx, "name=%s", feature->fi_guid); + mutex_exit(&spa->spa_feat_stats_lock); } static void @@ -376,10 +383,12 @@ feature_decr_sync(void *arg, dmu_tx_t *tx) zfeature_info_t *feature = arg; uint64_t refcount; + mutex_enter(&spa->spa_feat_stats_lock); VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); feature_sync(spa, feature, refcount - 1, tx); spa_history_log_internal(spa, "zhack feature decr", tx, "name=%s", feature->fi_guid); + mutex_exit(&spa->spa_feat_stats_lock); } static void @@ -496,6 +505,186 @@ zhack_do_feature(int argc, char **argv) return (0); } +static boolean_t +strstarts(const char *a, const char *b) +{ + return (strncmp(a, b, strlen(b)) == 0); +} + +static void 
+metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size, + dmu_tx_t *tx) +{ + ASSERT(msp->ms_disabled); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + uint64_t txg = dmu_tx_get_txg(tx); + + uint64_t off = start; + while (off < start + size) { + uint64_t ostart, osize; + boolean_t found = zfs_range_tree_find_in(msp->ms_allocatable, + off, start + size - off, &ostart, &osize); + if (!found) + break; + zfs_range_tree_remove(msp->ms_allocatable, ostart, osize); + + if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) + vdev_dirty(msp->ms_group->mg_vd, VDD_METASLAB, msp, + txg); + + zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], ostart, + osize); + msp->ms_allocating_total += osize; + off = ostart + osize; + } +} + +static void +zhack_do_metaslab_leak(int argc, char **argv) +{ + int c; + char *target; + spa_t *spa; + + optind = 1; + boolean_t force = B_FALSE; + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, "error: missing pool name\n"); + usage(); + } + target = argv[0]; + + zhack_spa_open(target, B_FALSE, FTAG, &spa); + spa_config_enter(spa, SCL_VDEV | SCL_ALLOC, FTAG, RW_READER); + + char *line = NULL; + size_t cap = 0; + + vdev_t *vd = NULL; + metaslab_t *prev = NULL; + dmu_tx_t *tx = NULL; + while (getline(&line, &cap, stdin) > 0) { + if (strstarts(line, "\tvdev ")) { + uint64_t vdev_id, ms_shift; + if (sscanf(line, + "\tvdev %10"PRIu64"\t%*s metaslab shift %4"PRIu64, + &vdev_id, &ms_shift) == 1) { + VERIFY3U(sscanf(line, "\tvdev %"PRIu64 + "\t metaslab shift %4"PRIu64, + &vdev_id, &ms_shift), ==, 2); + } + vd = vdev_lookup_top(spa, vdev_id); + if (vd == NULL) { + fprintf(stderr, "error: no such vdev with " + "id %"PRIu64"\n", vdev_id); + break; + } + if (tx) { + dmu_tx_commit(tx); + mutex_exit(&prev->ms_lock); + metaslab_enable(prev, B_FALSE, B_FALSE); + tx = NULL; + prev = NULL; + } + if (vd->vdev_ms_shift != ms_shift) { + fprintf(stderr, "error: ms_shift mismatch: %" + PRIu64" != %"PRIu64"\n", vd->vdev_ms_shift, + ms_shift); + break; + } + } else if (strstarts(line, "\tmetaslabs ")) { + uint64_t ms_count; + VERIFY3U(sscanf(line, "\tmetaslabs %"PRIu64, &ms_count), + ==, 1); + ASSERT(vd); + if (!force && vd->vdev_ms_count != ms_count) { + fprintf(stderr, "error: ms_count mismatch: %" + PRIu64" != %"PRIu64"\n", vd->vdev_ms_count, + ms_count); + break; + } + } else if (strstarts(line, "ALLOC:")) { + uint64_t start, size; + VERIFY3U(sscanf(line, "ALLOC: %"PRIu64" %"PRIu64"\n", + &start, &size), ==, 2); + + ASSERT(vd); + metaslab_t *cur = + vd->vdev_ms[start >> vd->vdev_ms_shift]; + if (prev != cur) { + if (prev) { + dmu_tx_commit(tx); + mutex_exit(&prev->ms_lock); + metaslab_enable(prev, B_FALSE, B_FALSE); + } + ASSERT(cur); + metaslab_disable(cur); + mutex_enter(&cur->ms_lock); + metaslab_load(cur); + prev = cur; + tx = dmu_tx_create_dd( + spa_get_dsl(vd->vdev_spa)->dp_root_dir); + dmu_tx_assign(tx, DMU_TX_WAIT); + } + + metaslab_force_alloc(cur, start, size, tx); + } else { + continue; + } + } + if (tx) { + dmu_tx_commit(tx); + mutex_exit(&prev->ms_lock); + metaslab_enable(prev, B_FALSE, B_FALSE); + tx = NULL; + prev = NULL; + } + if (line) + free(line); + + spa_config_exit(spa, SCL_VDEV | SCL_ALLOC, FTAG); + spa_close(spa, FTAG); +} + +static int +zhack_do_metaslab(int argc, char **argv) +{ + char *subcommand; + + argc--; + argv++; + if (argc == 0) { + (void) fprintf(stderr, + "error: no 
metaslab operation specified\n"); + usage(); + } + + subcommand = argv[0]; + if (strcmp(subcommand, "leak") == 0) { + zhack_do_metaslab_leak(argc, argv); + } else { + (void) fprintf(stderr, "error: unknown subcommand: %s\n", + subcommand); + usage(); + } + + return (0); +} + #define ASHIFT_UBERBLOCK_SHIFT(ashift) \ MIN(MAX(ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) @@ -525,6 +714,23 @@ zhack_repair_read_label(const int fd, vdev_label_t *vl, return (0); } +static int +zhack_repair_get_byteswap(const zio_eck_t *vdev_eck, const int l, int *byteswap) +{ + if (vdev_eck->zec_magic == ZEC_MAGIC) { + *byteswap = B_FALSE; + } else if (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)) { + *byteswap = B_TRUE; + } else { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum magic number but instead got " + "0x%" PRIx64 "\n", + l, vdev_eck->zec_magic); + return (1); + } + return (0); +} + static void zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum) @@ -551,33 +757,10 @@ zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, } static int -zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, - const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg, - uint64_t *ashift) +zhack_repair_get_ashift(nvlist_t *cfg, const int l, uint64_t *ashift) { int err; - - if (ub->ub_txg != 0) { - (void) fprintf(stderr, - "error: label %d: UB TXG of 0 expected, but got %" - PRIu64 "\n", - l, ub->ub_txg); - (void) fprintf(stderr, "It would appear the device was not " - "properly removed.\n"); - return (1); - } - - for (int i = 0; i < cfg_keys_len; i++) { - uint64_t val; - err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); - if (err) { - (void) fprintf(stderr, - "error: label %d, %d: " - "cannot find nvlist key %s\n", - l, i, cfg_keys[i]); - return (err); - } - } + nvlist_t *vdev_tree_cfg; err = nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg); @@ -601,7 +784,7 @@ zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, (void) fprintf(stderr, "error: label %d: nvlist key %s is zero\n", l, ZPOOL_CONFIG_ASHIFT); - return (err); + return (1); } return (0); @@ -616,30 +799,35 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l) */ if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) { const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp); + int err; + ub->ub_txg = txg; - if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { + err = nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG); + if (err) { (void) fprintf(stderr, "error: label %d: " "Failed to remove pool creation TXG\n", l); - return (1); + return (err); } - if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) { + err = nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG); + if (err) { (void) fprintf(stderr, "error: label %d: Failed to remove pool TXG to " "be replaced.\n", l); - return (1); + return (err); } - if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) { + err = nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg); + if (err) { (void) fprintf(stderr, "error: label %d: " "Failed to add pool TXG of %" PRIu64 "\n", l, txg); - return (1); + return (err); } } @@ -733,6 +921,7 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data, BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; const uint64_t actual_magic = vdev_eck->zec_magic; int err = 0; + if (actual_magic != expected_magic) { (void) fprintf(stderr, "error: label %d: " "Expected " @@ 
-754,6 +943,36 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data, return (err); } +static int +zhack_repair_unpack_cfg(vdev_label_t *vl, const int l, nvlist_t **cfg) +{ + const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, + ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; + int err; + + err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, + VDEV_PHYS_SIZE - sizeof (zio_eck_t), cfg, 0); + if (err) { + (void) fprintf(stderr, + "error: cannot unpack nvlist label %d\n", l); + return (err); + } + + for (int i = 0; i < ARRAY_SIZE(cfg_keys); i++) { + uint64_t val; + err = nvlist_lookup_uint64(*cfg, cfg_keys[i], &val); + if (err) { + (void) fprintf(stderr, + "error: label %d, %d: " + "cannot find nvlist key %s\n", + l, i, cfg_keys[i]); + return (err); + } + } + + return (0); +} + static void zhack_repair_one_label(const zhack_repair_op_t op, const int fd, vdev_label_t *vl, const uint64_t label_offset, const int l, @@ -767,10 +986,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1; const uint64_t vdev_phys_offset = label_offset + offsetof(vdev_label_t, vl_vdev_phys); - const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, - ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; nvlist_t *cfg; - nvlist_t *vdev_tree_cfg = NULL; uint64_t ashift; int byteswap; @@ -778,18 +994,9 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, if (err) return; - if (vdev_eck->zec_magic == 0) { - (void) fprintf(stderr, "error: label %d: " - "Expected the nvlist checksum magic number to not be zero" - "\n", - l); - (void) fprintf(stderr, "There should already be a checksum " - "for the label.\n"); + err = zhack_repair_get_byteswap(vdev_eck, l, &byteswap); + if (err) return; - } - - byteswap = - (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)); if (byteswap) { byteswap_uint64_array(&vdev_eck->zec_cksum, @@ -805,16 +1012,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, return; } - err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, - VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); - if (err) { - (void) fprintf(stderr, - "error: cannot unpack nvlist label %d\n", l); - return; - } - - err = zhack_repair_check_label(ub, - l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift); + err = zhack_repair_unpack_cfg(vl, l, &cfg); if (err) return; @@ -822,6 +1020,19 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd, char *buf; size_t buflen; + if (ub->ub_txg != 0) { + (void) fprintf(stderr, + "error: label %d: UB TXG of 0 expected, but got %" + PRIu64 "\n", l, ub->ub_txg); + (void) fprintf(stderr, "It would appear the device was " + "not properly detached.\n"); + return; + } + + err = zhack_repair_get_ashift(cfg, l, &ashift); + if (err) + return; + err = zhack_repair_undetach(ub, cfg, l); if (err) return; @@ -981,7 +1192,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); zfs_prop_init(); - while ((c = getopt(argc, argv, "+c:d:")) != -1) { + while ((c = getopt(argc, argv, "+c:d:o:")) != -1) { switch (c) { case 'c': g_importargs.cachefile = optarg; @@ -990,6 +1201,10 @@ main(int argc, char **argv) assert(g_importargs.paths < MAX_NUM_PATHS); g_importargs.path[g_importargs.paths++] = optarg; break; + case 'o': + if (handle_tunable_option(optarg, B_FALSE) != 0) + exit(1); + break; default: usage(); break; @@ -1011,6 +1226,8 @@ main(int argc, char **argv) rv = zhack_do_feature(argc, argv); } else if (strcmp(subcommand, "label") == 0) { return (zhack_do_label(argc, argv)); + } else if (strcmp(subcommand, 
"metaslab") == 0) { + rv = zhack_do_metaslab(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); diff --git a/sys/contrib/openzfs/cmd/zilstat.in b/sys/contrib/openzfs/cmd/zilstat.in index 4140398bf4a3..d01db9b0914b 100755 --- a/sys/contrib/openzfs/cmd/zilstat.in +++ b/sys/contrib/openzfs/cmd/zilstat.in @@ -47,6 +47,7 @@ cols = { "cec": [5, 1000, "zil_commit_error_count"], "csc": [5, 1000, "zil_commit_stall_count"], "cSc": [5, 1000, "zil_commit_suspend_count"], + "cCc": [5, 1000, "zil_commit_crash_count"], "ic": [5, 1000, "zil_itx_count"], "iic": [5, 1000, "zil_itx_indirect_count"], "iib": [5, 1024, "zil_itx_indirect_bytes"], diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am index 2f962408e5a3..5bb6d8160b18 100644 --- a/sys/contrib/openzfs/cmd/zpool/Makefile.am +++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am @@ -148,6 +148,7 @@ dist_zpoolcompat_DATA = \ %D%/compatibility.d/openzfs-2.1-linux \ %D%/compatibility.d/openzfs-2.2 \ %D%/compatibility.d/openzfs-2.3 \ + %D%/compatibility.d/openzfs-2.4 \ %D%/compatibility.d/openzfsonosx-1.7.0 \ %D%/compatibility.d/openzfsonosx-1.8.1 \ %D%/compatibility.d/openzfsonosx-1.9.3 \ @@ -187,7 +188,9 @@ zpoolcompatlinks = \ "openzfs-2.2 openzfs-2.2-linux" \ "openzfs-2.2 openzfs-2.2-freebsd" \ "openzfs-2.3 openzfs-2.3-linux" \ - "openzfs-2.3 openzfs-2.3-freebsd" + "openzfs-2.3 openzfs-2.3-freebsd" \ + "openzfs-2.4 openzfs-2.4-linux" \ + "openzfs-2.4 openzfs-2.4-freebsd" zpoolconfdir = $(sysconfdir)/zfs/zpool.d INSTALL_DATA_HOOKS += zpool-install-data-hook diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 new file mode 100644 index 000000000000..3fbd91014c95 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 @@ -0,0 +1,48 @@ +# Features supported by OpenZFS 2.4 on Linux and FreeBSD +allocation_classes +async_destroy +blake3 +block_cloning +block_cloning_endian +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +draid +dynamic_gang_header +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +fast_dedup +filesystem_limits +head_errlog +hole_birth +large_blocks +large_dnode +large_microzap +livelist +log_spacemap +longname +lz4_compress +multi_vdev_crash_dump +obsolete_counts +physical_rewrite +project_quota +raidz_expansion +redacted_datasets +redaction_bookmarks +redaction_list_spill +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +vdev_zaps_v2 +zilsaxattr +zpool_checkpoint +zstd_compress diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c index 2ec189b98653..2eec9a95e24c 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c @@ -379,8 +379,8 @@ process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl) static int vdev_process_cmd_output(vdev_cmd_data_t *data, char *line) { - char *col = NULL; - char *val = line; + char *col; + char *val; char *equals; char **tmp; @@ -397,6 +397,7 @@ vdev_process_cmd_output(vdev_cmd_data_t *data, char *line) col = line; val = equals + 1; } else { + col = NULL; val = line; } diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index e62441894cd7..2c46ad0df895 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -34,7 +34,7 @@ * Copyright (c) 2019, loli10K 
<ezomori.nozomu@gmail.com> * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> * Copyright (c) 2021, 2023, Klara Inc. - * Copyright [2021] Hewlett Packard Enterprise Development LP + * Copyright (c) 2021, 2025 Hewlett Packard Enterprise Development LP. */ #include <assert.h> @@ -456,7 +456,7 @@ get_usage(zpool_help_t idx) "<pool> <vdev> ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " - "<pool> <device> <new-device>\n")); + "<pool> <vdev> <new-device>\n")); case HELP_CLEAR: return (gettext("\tclear [[--power]|[-nF]] <pool> [device]\n")); case HELP_CREATE: @@ -510,16 +510,16 @@ get_usage(zpool_help_t idx) case HELP_REOPEN: return (gettext("\treopen [-n] <pool>\n")); case HELP_INITIALIZE: - return (gettext("\tinitialize [-c | -s | -u] [-w] <pool> " - "[<device> ...]\n")); + return (gettext("\tinitialize [-c | -s | -u] [-w] <-a | <pool> " + "[<device> ...]>\n")); case HELP_SCRUB: - return (gettext("\tscrub [-e | -s | -p | -C] [-w] " - "<pool> ...\n")); + return (gettext("\tscrub [-e | -s | -p | -C | -E | -S] [-w] " + "<-a | <pool> [<pool> ...]>\n")); case HELP_RESILVER: return (gettext("\tresilver <pool> ...\n")); case HELP_TRIM: - return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] <pool> " - "[<device> ...]\n")); + return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] " + "<-a | <pool> [<device> ...]>\n")); case HELP_STATUS: return (gettext("\tstatus [-DdegiLPpstvx] " "[-c script1[,script2,...]] ...\n" @@ -560,33 +560,6 @@ get_usage(zpool_help_t idx) } } -static void -zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) -{ - uint_t children = 0; - nvlist_t **child; - uint_t i; - - (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &child, &children); - - if (children == 0) { - char *path = zpool_vdev_name(g_zfs, zhp, nvroot, - VDEV_NAME_PATH); - - if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && - strcmp(path, VDEV_TYPE_HOLE) != 0) - fnvlist_add_boolean(res, path); - - free(path); - return; - } - - for (i = 0; i < children; i++) { - zpool_collect_leaves(zhp, child[i], res); - } -} - /* * Callback routine that will print out a pool property value. */ @@ -779,10 +752,11 @@ usage(boolean_t requested) } /* - * zpool initialize [-c | -s | -u] [-w] <pool> [<vdev> ...] + * zpool initialize [-c | -s | -u] [-w] <-a | pool> [<vdev> ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * + * -a Use all pools. * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. * -u Uninitialize. Clears initialization state. 
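
The hunks below add an -a (--all) option to zpool initialize; later hunks in this diff add the same option to zpool scrub and zpool trim. With -a the command iterates over every imported pool via for_each_pool() instead of requiring an explicit pool name, and combining -a with a named pool is rejected. A brief usage sketch, assuming only the semantics visible in the usage strings and error messages of this diff; the commands and the placeholder pool name 'tank' are illustrative and not part of the commit:

    # start or resume initialization on every imported pool
    zpool initialize -a

    # initialize a single pool and block until it completes
    zpool initialize -w tank

    # -a may not be combined with explicit pools or vdevs; the new checks
    # report "-a cannot be combined with individual pools or vdevs"
    zpool initialize -a tank
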
@@ -794,22 +768,26 @@ zpool_do_initialize(int argc, char **argv) int c; char *poolname; zpool_handle_t *zhp; - nvlist_t *vdevs; int err = 0; boolean_t wait = B_FALSE; + boolean_t initialize_all = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {"uninit", no_argument, NULL, 'u'}, {"wait", no_argument, NULL, 'w'}, + {"all", no_argument, NULL, 'a'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; - while ((c = getopt_long(argc, argv, "csuw", long_options, + while ((c = getopt_long(argc, argv, "acsuw", long_options, NULL)) != -1) { switch (c) { + case 'a': + initialize_all = B_TRUE; + break; case 'c': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_CANCEL) { @@ -856,7 +834,18 @@ zpool_do_initialize(int argc, char **argv) argc -= optind; argv += optind; - if (argc < 1) { + initialize_cbdata_t cbdata = { + .wait = wait, + .cmd_type = cmd_type + }; + + if (initialize_all && argc > 0) { + (void) fprintf(stderr, gettext("-a cannot be combined with " + "individual pools or vdevs\n")); + usage(B_FALSE); + } + + if (argc < 1 && !initialize_all) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); @@ -868,30 +857,35 @@ zpool_do_initialize(int argc, char **argv) usage(B_FALSE); } - poolname = argv[0]; - zhp = zpool_open(g_zfs, poolname); - if (zhp == NULL) - return (-1); - - vdevs = fnvlist_alloc(); - if (argc == 1) { - /* no individual leaf vdevs specified, so add them all */ - nvlist_t *config = zpool_get_config(zhp, NULL); - nvlist_t *nvroot = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE); - zpool_collect_leaves(zhp, nvroot, vdevs); + if (argc == 0 && initialize_all) { + /* Initilize each pool recursively */ + err = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, + B_FALSE, zpool_initialize_one, &cbdata); + return (err); + } else if (argc == 1) { + /* no individual leaf vdevs specified, initialize the pool */ + poolname = argv[0]; + zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + err = zpool_initialize_one(zhp, &cbdata); } else { + /* individual leaf vdevs specified, initialize them */ + poolname = argv[0]; + zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + nvlist_t *vdevs = fnvlist_alloc(); for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } + if (wait) + err = zpool_initialize_wait(zhp, cmd_type, vdevs); + else + err = zpool_initialize(zhp, cmd_type, vdevs); + fnvlist_free(vdevs); } - if (wait) - err = zpool_initialize_wait(zhp, cmd_type, vdevs); - else - err = zpool_initialize(zhp, cmd_type, vdevs); - - fnvlist_free(vdevs); zpool_close(zhp); return (err); @@ -1788,7 +1782,7 @@ zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; - int c, fd = -1, ret = 0; + int c, fd, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; @@ -6157,7 +6151,6 @@ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { - char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? 
*/ @@ -6181,7 +6174,7 @@ get_interval_count_filter_guids(int *argc, char **argv, float *interval, } /* Point to our list of possible intervals */ - tmpargv = &argv[*argc - argc_for_interval]; + char **tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, @@ -6377,7 +6370,6 @@ zpool_do_iostat(int argc, char **argv) int npools; float interval = 0; unsigned long count = 0; - int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; @@ -6673,7 +6665,7 @@ zpool_do_iostat(int argc, char **argv) * even when terminal window has its height * changed. */ - winheight = terminal_height(); + int winheight = terminal_height(); /* * Are we connected to TTY? If not, headers_once * should be true, to avoid breaking scripts. @@ -7652,7 +7644,7 @@ zpool_do_replace(int argc, char **argv) } /* - * zpool attach [-fsw] [-o property=value] <pool> <device>|<vdev> <new_device> + * zpool attach [-fsw] [-o property=value] <pool> <vdev> <new_device> * * -f Force attach, even if <new_device> appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. @@ -7660,9 +7652,9 @@ zpool_do_replace(int argc, char **argv) * -w Wait for resilvering (mirror) or expansion (raidz) to complete * before returning. * - * Attach <new_device> to a <device> or <vdev>, where the vdev can be of type - * mirror or raidz. If <device> is not part of a mirror, then <device> will - * be transformed into a mirror of <device> and <new_device>. When a mirror + * Attach <new_device> to a <vdev>, where the vdev can be of type + * device, mirror or raidz. If <vdev> is not part of a mirror, then <vdev> will + * be transformed into a mirror of <vdev> and <new_device>. When a mirror * is involved, <new_device> will begin life with a DTL of [0, now], and will * immediately begin to resilver itself. For the raidz case, a expansion will * commence and reflow the raidz data across all the disks including the @@ -8368,6 +8360,8 @@ zpool_do_reopen(int argc, char **argv) typedef struct scrub_cbdata { int cb_type; pool_scrub_cmd_t cb_scrub_cmd; + time_t cb_date_start; + time_t cb_date_end; } scrub_cbdata_t; static boolean_t @@ -8411,8 +8405,8 @@ scrub_callback(zpool_handle_t *zhp, void *data) return (1); } - err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); - + err = zpool_scan_range(zhp, cb->cb_type, cb->cb_scrub_cmd, + cb->cb_date_start, cb->cb_date_end); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " @@ -8430,10 +8424,35 @@ wait_callback(zpool_handle_t *zhp, void *data) return (zpool_wait(zhp, *act)); } +static time_t +date_string_to_sec(const char *timestr, boolean_t rounding) +{ + struct tm tm = {0}; + int adjustment = rounding ? 1 : 0; + + /* Allow mktime to determine timezone. */ + tm.tm_isdst = -1; + + if (strptime(timestr, "%Y-%m-%d %H:%M", &tm) == NULL) { + if (strptime(timestr, "%Y-%m-%d", &tm) == NULL) { + fprintf(stderr, gettext("Failed to parse the date.\n")); + usage(B_FALSE); + } + adjustment *= 24 * 60 * 60; + } else { + adjustment *= 60; + } + + return (mktime(&tm) + adjustment); +} + /* - * zpool scrub [-e | -s | -p | -C] [-w] <pool> ... + * zpool scrub [-e | -s | -p | -C | -E | -S] [-w] [-a | <pool> ...] * + * -a Scrub all pools. * -e Only scrub blocks in the error log. + * -E End date of scrub. + * -S Start date of scrub. * -s Stop. Stops any in-progress scrub. 
* -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. @@ -8449,21 +8468,36 @@ zpool_do_scrub(int argc, char **argv) cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; + cb.cb_date_start = cb.cb_date_end = 0; boolean_t is_error_scrub = B_FALSE; boolean_t is_pause = B_FALSE; boolean_t is_stop = B_FALSE; boolean_t is_txg_continue = B_FALSE; + boolean_t scrub_all = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "spweC")) != -1) { + while ((c = getopt(argc, argv, "aspweCE:S:")) != -1) { switch (c) { + case 'a': + scrub_all = B_TRUE; + break; case 'e': is_error_scrub = B_TRUE; break; + case 'E': + /* + * Round the date. It's better to scrub more data than + * less. This also makes the date inclusive. + */ + cb.cb_date_end = date_string_to_sec(optarg, B_TRUE); + break; case 's': is_stop = B_TRUE; break; + case 'S': + cb.cb_date_start = date_string_to_sec(optarg, B_FALSE); + break; case 'p': is_pause = B_TRUE; break; @@ -8511,6 +8545,19 @@ zpool_do_scrub(int argc, char **argv) } } + if ((cb.cb_date_start != 0 || cb.cb_date_end != 0) && + cb.cb_scrub_cmd != POOL_SCRUB_NORMAL) { + (void) fprintf(stderr, gettext("invalid option combination: " + "start/end date is available only with normal scrub\n")); + usage(B_FALSE); + } + if (cb.cb_date_start != 0 && cb.cb_date_end != 0 && + cb.cb_date_start > cb.cb_date_end) { + (void) fprintf(stderr, gettext("invalid arguments: " + "end date has to be later than start date\n")); + usage(B_FALSE); + } + if (wait && (cb.cb_type == POOL_SCAN_NONE || cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { (void) fprintf(stderr, gettext("invalid option combination: " @@ -8521,7 +8568,7 @@ zpool_do_scrub(int argc, char **argv) argc -= optind; argv += optind; - if (argc < 1) { + if (argc < 1 && !scrub_all) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } @@ -8551,6 +8598,7 @@ zpool_do_resilver(int argc, char **argv) cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; + cb.cb_date_start = cb.cb_date_end = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { @@ -8575,8 +8623,9 @@ zpool_do_resilver(int argc, char **argv) } /* - * zpool trim [-d] [-r <rate>] [-c | -s] <pool> [<device> ...] + * zpool trim [-d] [-r <rate>] [-c | -s] <-a | pool> [<device> ...] * + * -a Trim all pools. * -c Cancel. Ends any in-progress trim. * -d Secure trim. Requires kernel and device support. * -r <rate> Sets the TRIM rate in bytes (per second). 
Supports @@ -8593,6 +8642,7 @@ zpool_do_trim(int argc, char **argv) {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, + {"all", no_argument, NULL, 'a'}, {0, 0, 0, 0} }; @@ -8600,11 +8650,16 @@ zpool_do_trim(int argc, char **argv) uint64_t rate = 0; boolean_t secure = B_FALSE; boolean_t wait = B_FALSE; + boolean_t trimall = B_FALSE; + int error; int c; - while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) + while ((c = getopt_long(argc, argv, "acdr:sw", long_options, NULL)) != -1) { switch (c) { + case 'a': + trimall = B_TRUE; + break; case 'c': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_CANCEL) { @@ -8663,7 +8718,18 @@ zpool_do_trim(int argc, char **argv) argc -= optind; argv += optind; - if (argc < 1) { + trimflags_t trim_flags = { + .secure = secure, + .rate = rate, + .wait = wait, + }; + + trim_cbdata_t cbdata = { + .trim_flags = trim_flags, + .cmd_type = cmd_type + }; + + if (argc < 1 && !trimall) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); @@ -8671,41 +8737,46 @@ zpool_do_trim(int argc, char **argv) if (wait && (cmd_type != POOL_TRIM_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " - "-s\n")); + "-s options\n")); usage(B_FALSE); } - char *poolname = argv[0]; - zpool_handle_t *zhp = zpool_open(g_zfs, poolname); - if (zhp == NULL) - return (-1); - - trimflags_t trim_flags = { - .secure = secure, - .rate = rate, - .wait = wait, - }; + if (trimall && argc > 0) { + (void) fprintf(stderr, gettext("-a cannot be combined with " + "individual zpools or vdevs\n")); + usage(B_FALSE); + } - nvlist_t *vdevs = fnvlist_alloc(); - if (argc == 1) { + if (argc == 0 && trimall) { + cbdata.trim_flags.fullpool = B_TRUE; + /* Trim each pool recursively */ + error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, + B_FALSE, zpool_trim_one, &cbdata); + } else if (argc == 1) { + char *poolname = argv[0]; + zpool_handle_t *zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); /* no individual leaf vdevs specified, so add them all */ - nvlist_t *config = zpool_get_config(zhp, NULL); - nvlist_t *nvroot = fnvlist_lookup_nvlist(config, - ZPOOL_CONFIG_VDEV_TREE); - zpool_collect_leaves(zhp, nvroot, vdevs); - trim_flags.fullpool = B_TRUE; + error = zpool_trim_one(zhp, &cbdata); + zpool_close(zhp); } else { - trim_flags.fullpool = B_FALSE; + char *poolname = argv[0]; + zpool_handle_t *zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + /* leaf vdevs specified, trim only those */ + cbdata.trim_flags.fullpool = B_FALSE; + nvlist_t *vdevs = fnvlist_alloc(); for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } + error = zpool_trim(zhp, cbdata.cmd_type, vdevs, + &cbdata.trim_flags); + fnvlist_free(vdevs); + zpool_close(zhp); } - int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); - - fnvlist_free(vdevs); - zpool_close(zhp); - return (error); } @@ -10706,7 +10777,6 @@ status_callback_json(zpool_handle_t *zhp, void *data) uint_t c; vdev_stat_t *vs; nvlist_t *item, *d, *load_info, *vds; - item = d = NULL; /* If dedup stats were requested, also fetch dedupcached. 
*/ if (cbp->cb_dedup_stats > 1) @@ -11330,7 +11400,8 @@ upgrade_enable_all(zpool_handle_t *zhp, int *countp) const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; - if (!spa_feature_table[i].fi_zfs_mod_supported) + if (!spa_feature_table[i].fi_zfs_mod_supported || + (spa_feature_table[i].fi_flags & ZFEATURE_FLAG_NO_UPGRADE)) continue; if (!nvlist_exists(enabled, fguid) && requested_features[i]) { @@ -11485,7 +11556,11 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) "Note that the pool " "'compatibility' feature can be " "used to inhibit\nfeature " - "upgrades.\n\n")); + "upgrades.\n\n" + "Features marked with (*) are not " + "applied automatically on upgrade, " + "and\nmust be applied explicitly " + "with zpool-set(7).\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" @@ -11499,7 +11574,9 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) poolfirst = B_FALSE; } - (void) printf(gettext(" %s\n"), fname); + (void) printf(gettext(" %s%s\n"), fname, + spa_feature_table[i].fi_flags & + ZFEATURE_FLAG_NO_UPGRADE ? "(*)" : ""); } /* * If they did "zpool upgrade -a", then we could @@ -12300,7 +12377,7 @@ zpool_do_events_next(ev_opts_t *opts) nvlist_free(nvl); } - VERIFY(0 == close(zevent_fd)); + VERIFY0(close(zevent_fd)); return (ret); } diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c index 07868a30d7e7..684b46a2d673 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c @@ -574,7 +574,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) nvlist_t *cnv = child[c]; const char *path; struct stat64 statbuf; - int64_t size = -1LL; const char *childtype; int fd, err; @@ -656,7 +655,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) statbuf.st_size == MAXOFFSET_T) continue; - size = statbuf.st_size; + int64_t size = statbuf.st_size; /* * Also make sure that devices and @@ -876,6 +875,18 @@ check_replication(nvlist_t *config, nvlist_t *newroot) (u_longlong_t)mirror->zprl_children); ret = -1; } + } else if (is_raidz_draid(current, new)) { + if (current->zprl_parity != new->zprl_parity) { + vdev_error(gettext( + "mismatched replication level: pool and " + "new vdev with different redundancy, %s " + "and %s vdevs, %llu vs. %llu\n"), + current->zprl_type, + new->zprl_type, + (u_longlong_t)current->zprl_parity, + (u_longlong_t)new->zprl_parity); + ret = -1; + } } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { vdev_error(gettext( "mismatched replication level: pool uses %s " @@ -1353,7 +1364,7 @@ is_grouping(const char *type, int *mindev, int *maxdev) static int draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) { - uint64_t nparity = 1; + uint64_t nparity; uint64_t nspares = 0; uint64_t ndata = UINT64_MAX; uint64_t ngroups = 1; @@ -1581,13 +1592,12 @@ construct_spec(nvlist_t *props, int argc, char **argv) is_dedup = is_spare = B_FALSE; } - if (is_log || is_special || is_dedup) { + if (is_log) { if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { (void) fprintf(stderr, gettext("invalid vdev " - "specification: unsupported '%s' " - "device: %s\n"), is_log ? 
"log" : - "special", type); + "specification: unsupported 'log' " + "device: %s\n"), type); goto spec_out; } nlogs++; diff --git a/sys/contrib/openzfs/cmd/zstream/Makefile.am b/sys/contrib/openzfs/cmd/zstream/Makefile.am index be3539fe905d..80ef1ea7ca11 100644 --- a/sys/contrib/openzfs/cmd/zstream/Makefile.am +++ b/sys/contrib/openzfs/cmd/zstream/Makefile.am @@ -18,6 +18,7 @@ zstream_LDADD = \ libzpool.la \ libnvpair.la -PHONY += install-exec-hook -install-exec-hook: +cmd-zstream-install-exec-hook: cd $(DESTDIR)$(sbindir) && $(LN_S) -f zstream zstreamdump + +INSTALL_EXEC_HOOKS += cmd-zstream-install-exec-hook diff --git a/sys/contrib/openzfs/cmd/ztest.c b/sys/contrib/openzfs/cmd/ztest.c index 89264c97ff10..89752dcb0f0f 100644 --- a/sys/contrib/openzfs/cmd/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest.c @@ -273,7 +273,6 @@ extern int zfs_compressed_arc_enabled; extern int zfs_abd_scatter_enabled; extern uint_t dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; -extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; extern uint64_t raidz_expand_max_reflow_bytes; extern uint_t raidz_expand_pause_point; @@ -809,8 +808,8 @@ static ztest_option_t option_table[] = { { 'X', "raidz-expansion", NULL, "Perform a dedicated raidz expansion test", NO_DEFAULT, NULL}, - { 'o', "option", "\"OPTION=INTEGER\"", - "Set global variable to an unsigned 32-bit integer value", + { 'o', "option", "\"NAME=VALUE\"", + "Set the named tunable to the given value", NO_DEFAULT, NULL}, { 'G', "dump-debug-msg", NULL, "Dump zfs_dbgmsg buffer before exiting due to an error", @@ -829,8 +828,8 @@ static char *short_opts = NULL; static void init_options(void) { - ASSERT3P(long_opts, ==, NULL); - ASSERT3P(short_opts, ==, NULL); + ASSERT0P(long_opts); + ASSERT0P(short_opts); int count = sizeof (option_table) / sizeof (option_table[0]); long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); @@ -919,7 +918,7 @@ ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) { char name[32]; char *value; - int state = ZTEST_VDEV_CLASS_RND; + int state; (void) strlcpy(name, input, sizeof (name)); @@ -1686,7 +1685,7 @@ ztest_rll_init(rll_t *rll) static void ztest_rll_destroy(rll_t *rll) { - ASSERT3P(rll->rll_writer, ==, NULL); + ASSERT0P(rll->rll_writer); ASSERT0(rll->rll_readers); mutex_destroy(&rll->rll_lock); cv_destroy(&rll->rll_cv); @@ -1720,7 +1719,7 @@ ztest_rll_unlock(rll_t *rll) rll->rll_writer = NULL; } else { ASSERT3S(rll->rll_readers, >, 0); - ASSERT3P(rll->rll_writer, ==, NULL); + ASSERT0P(rll->rll_writer); rll->rll_readers--; } @@ -1996,7 +1995,7 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) { - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); write_state = WR_NEED_COPY; } @@ -2278,8 +2277,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) ztest_block_tag_t rbt; - VERIFY(dmu_read(os, lr->lr_foid, offset, - sizeof (rbt), &rbt, flags) == 0); + VERIFY0(dmu_read(os, lr->lr_foid, offset, + sizeof (rbt), &rbt, flags)); if (rbt.bt_magic == BT_MAGIC) { ztest_bt_verify(&rbt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); @@ -2966,7 +2965,7 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id) (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); - zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); + 
VERIFY0(zil_commit(zilog, ztest_random(ZTEST_OBJECTS))); /* * Remember the committed values in zd, which is in parent/child @@ -3882,7 +3881,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * If newvd is too small, it should fail with EOVERFLOW. * * If newvd is a distributed spare and it's being attached to a - * dRAID which is not its parent it should fail with EINVAL. + * dRAID which is not its parent it should fail with ENOTSUP. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || @@ -3901,7 +3900,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) - expected_error = EINVAL; + expected_error = ENOTSUP; else expected_error = 0; @@ -4007,7 +4006,7 @@ raidz_scratch_verify(void) * requested by user, but scratch object was not created. */ case RRSS_SCRATCH_NOT_IN_USE: - ASSERT3U(offset, ==, 0); + ASSERT0(offset); break; /* @@ -5537,8 +5536,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) } if (i == 1) { - VERIFY(dmu_buf_hold(os, bigobj, off, - FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); + VERIFY0(dmu_buf_hold(os, bigobj, off, + FTAG, &dbt, DMU_READ_NO_PREFETCH)); } if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, @@ -7069,7 +7068,7 @@ ztest_set_global_vars(void) char *kv = ztest_opts.zo_gvars[i]; VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); VERIFY3U(strlen(kv), >, 0); - int err = set_global_var(kv); + int err = handle_tunable_option(kv, B_TRUE); if (ztest_opts.zo_verbose > 0) { (void) printf("setting global var %s ... %s\n", kv, err ? "failed" : "ok"); @@ -7813,6 +7812,9 @@ ztest_dataset_open(int d) ztest_dataset_name(name, ztest_opts.zo_pool, d); + if (ztest_opts.zo_verbose >= 6) + (void) printf("Opening %s\n", name); + (void) pthread_rwlock_rdlock(&ztest_name_lock); error = ztest_dataset_create(name); @@ -7934,7 +7936,7 @@ ztest_freeze(void) */ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { ztest_dmu_object_alloc_free(zd, 0); - zil_commit(zd->zd_zilog, 0); + VERIFY0(zil_commit(zd->zd_zilog, 0)); } txg_wait_synced(spa_get_dsl(spa), 0); @@ -7976,7 +7978,7 @@ ztest_freeze(void) /* * Commit all of the changes we just generated. */ - zil_commit(zd->zd_zilog, 0); + VERIFY0(zil_commit(zd->zd_zilog, 0)); txg_wait_synced(spa_get_dsl(spa), 0); /* @@ -8308,41 +8310,44 @@ static void ztest_generic_run(ztest_shared_t *zs, spa_t *spa) { kthread_t **run_threads; - int t; + int i, ndatasets; run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), UMEM_NOFAIL); /* - * Kick off all the tests that run in parallel. + * Actual number of datasets to be used. */ - for (t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { - umem_free(run_threads, ztest_opts.zo_threads * - sizeof (kthread_t *)); - return; - } + ndatasets = MIN(ztest_opts.zo_datasets, ztest_opts.zo_threads); - run_threads[t] = thread_create(NULL, 0, ztest_thread, - (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, + /* + * Prepare the datasets first. + */ + for (i = 0; i < ndatasets; i++) + VERIFY0(ztest_dataset_open(i)); + + /* + * Kick off all the tests that run in parallel. + */ + for (i = 0; i < ztest_opts.zo_threads; i++) { + run_threads[i] = thread_create(NULL, 0, ztest_thread, + (void *)(uintptr_t)i, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } /* * Wait for all of the tests to complete. 
*/ - for (t = 0; t < ztest_opts.zo_threads; t++) - VERIFY0(thread_join(run_threads[t])); + for (i = 0; i < ztest_opts.zo_threads; i++) + VERIFY0(thread_join(run_threads[i])); /* * Close all datasets. This must be done after all the threads * are joined so we can be sure none of the datasets are in-use * by any of the threads. */ - for (t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets) - ztest_dataset_close(t); - } + for (i = 0; i < ndatasets; i++) + ztest_dataset_close(i); txg_wait_synced(spa_get_dsl(spa), 0); @@ -8465,6 +8470,7 @@ ztest_run(ztest_shared_t *zs) int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); + txg_wait_synced(spa_get_dsl(spa), 0); } zs->zs_enospc_count = 0; @@ -8972,7 +8978,7 @@ main(int argc, char **argv) exit(EXIT_FAILURE); } else { /* children should not be spawned if setting gvars fails */ - VERIFY3S(err, ==, 0); + VERIFY0(err); } /* Override location of zpool.cache */ diff --git a/sys/contrib/openzfs/config/Shellcheck.am b/sys/contrib/openzfs/config/Shellcheck.am index 1ab13516066c..87e6494056cf 100644 --- a/sys/contrib/openzfs/config/Shellcheck.am +++ b/sys/contrib/openzfs/config/Shellcheck.am @@ -16,10 +16,14 @@ SHELLCHECK_OPTS = $(call JUST_SHELLCHECK_OPTS,$(1)) $(call JUST_CHECKBAS PHONY += shellcheck +shellcheck_verbose = $(shellcheck_verbose_@AM_V@) +shellcheck_verbose_ = $(shellcheck_verbose_@AM_DEFAULT_V@) +shellcheck_verbose_0 = @echo SHELLCHECK $(_STGT); + _STGT = $(subst ^,/,$(subst shellcheck-here-,,$@)) shellcheck-here-%: if HAVE_SHELLCHECK - shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC2317,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)" + $(shellcheck_verbose)shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC2317,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)" else @echo "skipping shellcheck of" $(_STGT) "because shellcheck is not installed" endif @@ -29,11 +33,15 @@ shellcheck: $(SHELLCHECKSCRIPTS) $(call JUST_SHELLCHECK_OPTS,$(SHELLCHECKSCRIPTS PHONY += checkbashisms +checkbashisms_verbose = $(checkbashisms_verbose_@AM_V@) +checkbashisms_verbose_ = $(checkbashisms_verbose_@AM_DEFAULT_V@) +checkbashisms_verbose_0 = @echo CHECKBASHISMS $(_BTGT); + # command -v *is* specified by POSIX and every shell in existence supports it _BTGT = $(subst ^,/,$(subst checkbashisms-here-,,$@)) checkbashisms-here-%: if HAVE_CHECKBASHISMS - ! { [ -n "$(SHELLCHECK_SHELL)" ] && echo '#!/bin/$(SHELLCHECK_SHELL)'; cat "$$([ -e "$(_BTGT)" ] || echo "$(srcdir)/")$(_BTGT)"; } | \ + $(checkbashisms_verbose)! 
{ [ -n "$(SHELLCHECK_SHELL)" ] && echo '#!/bin/$(SHELLCHECK_SHELL)'; cat "$$([ -e "$(_BTGT)" ] || echo "$(srcdir)/")$(_BTGT)"; } | \ checkbashisms -npx 2>&1 | grep -vFe "'command' with option other than -p" -e 'command -v' -e 'any possible bashisms' $(CHECKBASHISMS_IGNORE) >&2 else @echo "skipping checkbashisms of" $(_BTGT) "because checkbashisms is not installed" diff --git a/sys/contrib/openzfs/config/always-arch.m4 b/sys/contrib/openzfs/config/always-arch.m4 index 9f413eeddf95..d73b878916cb 100644 --- a/sys/contrib/openzfs/config/always-arch.m4 +++ b/sys/contrib/openzfs/config/always-arch.m4 @@ -34,8 +34,26 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [ esac AM_CONDITIONAL([TARGET_CPU_AARCH64], test $TARGET_CPU = aarch64) + AM_CONDITIONAL([TARGET_CPU_I386], test $TARGET_CPU = i386) AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64) AM_CONDITIONAL([TARGET_CPU_POWERPC], test $TARGET_CPU = powerpc) AM_CONDITIONAL([TARGET_CPU_SPARC64], test $TARGET_CPU = sparc64) AM_CONDITIONAL([TARGET_CPU_ARM], test $TARGET_CPU = arm) ]) +dnl # +dnl # Check for conflicting environment variables +dnl # +dnl # If ARCH env variable is set up, then kernel Makefile in the /usr/src/kernel +dnl # can misbehave during the zfs ./configure test of the module compilation. +AC_DEFUN([ZFS_AC_CONFIG_CHECK_ARCH_VAR], [ + AC_MSG_CHECKING([for conflicting environment variables]) + if test -n "$ARCH"; then + AC_MSG_RESULT([warning]) + AC_MSG_WARN(m4_normalize([ARCH environment variable is set to "$ARCH". + This can cause build kernel modules support check failure. + Please unset it.])) + else + AC_MSG_RESULT([done]) + fi +]) + diff --git a/sys/contrib/openzfs/config/always-compiler-options.m4 b/sys/contrib/openzfs/config/always-compiler-options.m4 index 6383b12506ee..0e96435e3713 100644 --- a/sys/contrib/openzfs/config/always-compiler-options.m4 +++ b/sys/contrib/openzfs/config/always-compiler-options.m4 @@ -156,6 +156,34 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH], [ ]) dnl # +dnl # Check if kernel cc supports -Wno-format-zero-length option. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_NO_FORMAT_ZERO_LENGTH], [ + saved_cc="$CC" + AS_IF( + [ test -n "$KERNEL_CC" ], [ CC="$KERNEL_CC" ], + [ test -n "$KERNEL_LLVM" ], [ CC="clang" ], + [ CC="gcc" ] + ) + AC_MSG_CHECKING([whether $CC supports -Wno-format-zero-length]) + + saved_flags="$CFLAGS" + CFLAGS="$CFLAGS -Werror -Wno-format-zero-length" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ + KERNEL_NO_FORMAT_ZERO_LENGTH=-Wno-format-zero-length + AC_MSG_RESULT([yes]) + ], [ + KERNEL_NO_FORMAT_ZERO_LENGTH= + AC_MSG_RESULT([no]) + ]) + + CC="$saved_cc" + CFLAGS="$saved_flags" + AC_SUBST([KERNEL_NO_FORMAT_ZERO_LENGTH]) +]) + +dnl # dnl # Check if cc supports -Wno-clobbered option. dnl # dnl # We actually invoke it with the -Wclobbered option @@ -182,6 +210,27 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_CLOBBERED], [ ]) dnl # +dnl # Check if cc supports -Wno-atomic-alignment option. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_ATOMIC_ALIGNMENT], [ + AC_MSG_CHECKING([whether $CC supports -Wno-atomic-alignment]) + + saved_flags="$CFLAGS" + CFLAGS="$CFLAGS -Werror -Wno-atomic-alignment" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ + NO_ATOMIC_ALIGNMENT=-Wno-atomic-alignment + AC_MSG_RESULT([yes]) + ], [ + NO_ATOMIC_ALIGNMENT= + AC_MSG_RESULT([no]) + ]) + + CFLAGS="$saved_flags" + AC_SUBST([NO_ATOMIC_ALIGNMENT]) +]) + +dnl # dnl # Check if cc supports -Wimplicit-fallthrough option. 
dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_IMPLICIT_FALLTHROUGH], [ @@ -231,20 +280,17 @@ dnl # dnl # Check if kernel cc supports -Winfinite-recursion option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_INFINITE_RECURSION], [ - AC_MSG_CHECKING([whether $KERNEL_CC supports -Winfinite-recursion]) - saved_cc="$CC" + AS_IF( + [ test -n "$KERNEL_CC" ], [ CC="$KERNEL_CC" ], + [ test -n "$KERNEL_LLVM" ], [ CC="clang" ], + [ CC="gcc" ] + ) + AC_MSG_CHECKING([whether $CC supports -Winfinite-recursion]) + saved_flags="$CFLAGS" - CC="gcc" CFLAGS="$CFLAGS -Werror -Winfinite-recursion" - AS_IF([ test -n "$KERNEL_CC" ], [ - CC="$KERNEL_CC" - ]) - AS_IF([ test -n "$KERNEL_LLVM" ], [ - CC="clang" - ]) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ KERNEL_INFINITE_RECURSION=-Winfinite-recursion AC_DEFINE([HAVE_KERNEL_INFINITE_RECURSION], 1, @@ -329,20 +375,17 @@ dnl # dnl # Check if kernel cc supports -fno-ipa-sra option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_NO_IPA_SRA], [ - AC_MSG_CHECKING([whether $KERNEL_CC supports -fno-ipa-sra]) - saved_cc="$CC" + AS_IF( + [ test -n "$KERNEL_CC" ], [ CC="$KERNEL_CC" ], + [ test -n "$KERNEL_LLVM" ], [ CC="clang" ], + [ CC="gcc" ] + ) + AC_MSG_CHECKING([whether $CC supports -fno-ipa-sra]) + saved_flags="$CFLAGS" - CC="gcc" CFLAGS="$CFLAGS -Werror -fno-ipa-sra" - AS_IF([ test -n "$KERNEL_CC" ], [ - CC="$KERNEL_CC" - ]) - AS_IF([ test -n "$KERNEL_LLVM" ], [ - CC="clang" - ]) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ KERNEL_NO_IPA_SRA=-fno-ipa-sra AC_MSG_RESULT([yes]) diff --git a/sys/contrib/openzfs/config/ax_python_devel.m4 b/sys/contrib/openzfs/config/ax_python_devel.m4 index 1f480db6d233..935056cc4c0a 100644 --- a/sys/contrib/openzfs/config/ax_python_devel.m4 +++ b/sys/contrib/openzfs/config/ax_python_devel.m4 @@ -72,7 +72,7 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. -#serial 36 +#serial 37 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL]) AC_DEFUN([AX_PYTHON_DEVEL],[ @@ -316,7 +316,7 @@ EOD` PYTHON_LIBS="-L$ac_python_libdir -lpython$ac_python_version" fi - if test -z "PYTHON_LIBS"; then + if test -z "$PYTHON_LIBS"; then AC_MSG_WARN([ Cannot determine location of your Python DSO. Please check it was installed with dynamic libraries enabled, or try setting PYTHON_LIBS by hand. 
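
The ax_python_devel.m4 hunk above fixes a quoting bug: the old condition tested the literal string "PYTHON_LIBS" rather than the variable's value, and a non-empty literal always fails test -z, so the fallback warning about the Python DSO could never be reached. A minimal POSIX-shell sketch of the difference (illustrative only; only the variable name is taken from the diff):

    PYTHON_LIBS=""
    # literal string: length 11, so -z is false and nothing is printed
    test -z "PYTHON_LIBS"  && echo "never reached"
    # variable expansion: the value is empty, so the warning path is taken
    test -z "$PYTHON_LIBS" && echo "PYTHON_LIBS is empty - warn the user"
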
diff --git a/sys/contrib/openzfs/config/kernel-blkdev.m4 b/sys/contrib/openzfs/config/kernel-blkdev.m4 index 83190c6fbe3f..02011bf39fb2 100644 --- a/sys/contrib/openzfs/config/kernel-blkdev.m4 +++ b/sys/contrib/openzfs/config/kernel-blkdev.m4 @@ -29,9 +29,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [ const char *path = "path"; fmode_t mode = 0; void *holder = NULL; - struct blk_holder_ops h; - bdev = blkdev_get_by_path(path, mode, holder, &h); + bdev = blkdev_get_by_path(path, mode, holder, NULL); ]) ]) @@ -48,9 +47,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [ const char *path = "path"; fmode_t mode = 0; void *holder = NULL; - struct blk_holder_ops h; - bdh = bdev_open_by_path(path, mode, holder, &h); + bdh = bdev_open_by_path(path, mode, holder, NULL); ]) ]) @@ -68,9 +66,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [ const char *path = "path"; fmode_t mode = 0; void *holder = NULL; - struct blk_holder_ops h; - file = bdev_file_open_by_path(path, mode, holder, &h); + file = bdev_file_open_by_path(path, mode, holder, NULL); ]) ]) diff --git a/sys/contrib/openzfs/config/kernel-dentry-operations.m4 b/sys/contrib/openzfs/config/kernel-dentry-operations.m4 index aa5a9f2aff39..ce0e6e5be959 100644 --- a/sys/contrib/openzfs/config/kernel-dentry-operations.m4 +++ b/sys/contrib/openzfs/config/kernel-dentry-operations.m4 @@ -24,6 +24,9 @@ dnl # dnl # 2.6.38 API change dnl # Added d_set_d_op() helper function. dnl # +dnl # 6.17 API change +dnl # d_set_d_op() removed. No direct replacement. +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [ ZFS_LINUX_TEST_SRC([d_set_d_op], [ #include <linux/dcache.h> @@ -34,22 +37,46 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [ AC_DEFUN([ZFS_AC_KERNEL_D_SET_D_OP], [ AC_MSG_CHECKING([whether d_set_d_op() is available]) - ZFS_LINUX_TEST_RESULT_SYMBOL([d_set_d_op], - [d_set_d_op], [fs/dcache.c], [ + ZFS_LINUX_TEST_RESULT([d_set_d_op], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_SET_D_OP, 1, + [Define if d_set_d_op() is available]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 6.17 API change +dnl # sb->s_d_op removed; set_default_d_op(sb, dop) added +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_DEFAULT_D_OP], [ + ZFS_LINUX_TEST_SRC([set_default_d_op], [ + #include <linux/dcache.h> + ], [ + set_default_d_op(NULL, NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SET_DEFAULT_D_OP], [ + AC_MSG_CHECKING([whether set_default_d_op() is available]) + ZFS_LINUX_TEST_RESULT([set_default_d_op], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_DEFAULT_D_OP, 1, + [Define if set_default_d_op() is available]) ], [ - ZFS_LINUX_TEST_ERROR([d_set_d_op]) + AC_MSG_RESULT(no) ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_DENTRY], [ ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS ZFS_AC_KERNEL_SRC_D_SET_D_OP - ZFS_AC_KERNEL_SRC_S_D_OP + ZFS_AC_KERNEL_SRC_SET_DEFAULT_D_OP ]) AC_DEFUN([ZFS_AC_KERNEL_DENTRY], [ ZFS_AC_KERNEL_D_OBTAIN_ALIAS ZFS_AC_KERNEL_D_SET_D_OP - ZFS_AC_KERNEL_S_D_OP + ZFS_AC_KERNEL_SET_DEFAULT_D_OP ]) diff --git a/sys/contrib/openzfs/config/kernel-free-inode.m4 b/sys/contrib/openzfs/config/kernel-free-inode.m4 new file mode 100644 index 000000000000..baa1c34845bb --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-free-inode.m4 @@ -0,0 +1,24 @@ +dnl # +dnl # Linux 5.2 API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE], [ + ZFS_LINUX_TEST_SRC([super_operations_free_inode], [ + #include <linux/fs.h> + + static void free_inode(struct inode *) { } + + static struct super_operations sops __attribute__ ((unused)) = { + .free_inode = free_inode, + }; + ],[]) +]) + 
+AC_DEFUN([ZFS_AC_KERNEL_SOPS_FREE_INODE], [ + AC_MSG_CHECKING([whether sops->free_inode() exists]) + ZFS_LINUX_TEST_RESULT([super_operations_free_inode], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SOPS_FREE_INODE, 1, [sops->free_inode() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel-mkdir.m4 b/sys/contrib/openzfs/config/kernel-mkdir.m4 index c1aebc387abe..78b32447c593 100644 --- a/sys/contrib/openzfs/config/kernel-mkdir.m4 +++ b/sys/contrib/openzfs/config/kernel-mkdir.m4 @@ -84,6 +84,8 @@ AC_DEFUN([ZFS_AC_KERNEL_MKDIR], [ AC_DEFINE(HAVE_IOPS_MKDIR_DENTRY, 1, [iops->mkdir() returns struct dentry*]) ],[ + AC_MSG_RESULT(no) + dnl # dnl # 6.3 API change dnl # mkdir() takes struct mnt_idmap * as the first arg diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 index 7e6af62dede5..35819e4d68c5 100644 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -70,6 +70,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_COMMIT_METADATA ZFS_AC_KERNEL_SRC_SETATTR_PREPARE ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED + ZFS_AC_KERNEL_SRC_DENTRY ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FST_MOUNT @@ -134,6 +135,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_PIN_USER_PAGES ZFS_AC_KERNEL_SRC_TIMER ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR + ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -187,6 +189,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_COMMIT_METADATA ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED + ZFS_AC_KERNEL_DENTRY ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT @@ -252,6 +255,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_PIN_USER_PAGES ZFS_AC_KERNEL_TIMER ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR + ZFS_AC_KERNEL_SOPS_FREE_INODE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/sys/contrib/openzfs/config/toolchain-simd.m4 b/sys/contrib/openzfs/config/toolchain-simd.m4 index 061576fd94e3..f18c91007cde 100644 --- a/sys/contrib/openzfs/config/toolchain-simd.m4 +++ b/sys/contrib/openzfs/config/toolchain-simd.m4 @@ -24,6 +24,8 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [ ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VAES + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VPCLMULQDQ ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVEOPT ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVES @@ -38,9 +40,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE], [ AC_MSG_CHECKING([whether host toolchain supports SSE]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("xorps %xmm0, %xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE], 1, [Define if host toolchain supports SSE]) @@ -57,9 +60,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2], [ AC_MSG_CHECKING([whether host toolchain supports SSE2]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("pxor %xmm0, %xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE2], 1, [Define if host toolchain supports SSE2]) @@ -76,10 +80,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3], [ AC_MSG_CHECKING([whether host toolchain supports SSE3]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { char v[16]; __asm__ __volatile__("lddqu %0,%%xmm0" :: 
"m"(v[0])); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE3], 1, [Define if host toolchain supports SSE3]) @@ -96,9 +101,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3], [ AC_MSG_CHECKING([whether host toolchain supports SSSE3]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("pshufb %xmm0,%xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSSE3], 1, [Define if host toolchain supports SSSE3]) @@ -115,9 +121,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1], [ AC_MSG_CHECKING([whether host toolchain supports SSE4.1]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("pmaxsb %xmm0,%xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE4_1], 1, [Define if host toolchain supports SSE4.1]) @@ -134,9 +141,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2], [ AC_MSG_CHECKING([whether host toolchain supports SSE4.2]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("pcmpgtq %xmm0, %xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE4_2], 1, [Define if host toolchain supports SSE4.2]) @@ -153,10 +161,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX], [ AC_MSG_CHECKING([whether host toolchain supports AVX]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { char v[32]; __asm__ __volatile__("vmovdqa %0,%%ymm0" :: "m"(v[0])); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -174,9 +183,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpshufb %ymm0,%ymm1,%ymm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -194,9 +204,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpandd %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -214,9 +225,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vplzcntd %zmm0,%zmm1"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -234,9 +246,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vandpd %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -254,9 +267,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpshufb %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -274,9 +288,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpmadd52luq %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -294,9 +309,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpermb %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -314,9 +330,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vgatherpf0dps (%rsi,%zmm0,4){%k1}"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -334,9 +351,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vexp2pd %zmm0,%zmm1"); + return (0); } ]])], [ 
AC_MSG_RESULT([yes]) @@ -354,9 +372,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpabsq %zmm0,%zmm1"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -374,9 +393,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("aesenc %xmm0, %xmm1"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -394,9 +414,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("pclmulqdq %0, %%xmm0, %%xmm1" :: "i"(0)); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -414,9 +435,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("movbe 0(%eax), %eax"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -427,6 +449,48 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [ ]) dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VAES +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VAES], [ + AC_MSG_CHECKING([whether host toolchain supports VAES]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + int main() + { + __asm__ __volatile__("vaesenc %ymm0, %ymm1, %ymm0"); + return (0); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_VAES], 1, [Define if host toolchain supports VAES]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VPCLMULQDQ +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_VPCLMULQDQ], [ + AC_MSG_CHECKING([whether host toolchain supports VPCLMULQDQ]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + int main() + { + __asm__ __volatile__("vpclmulqdq %0, %%ymm4, %%ymm3, %%ymm5" :: "i"(0)); + return (0); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_VPCLMULQDQ], 1, [Define if host toolchain supports VPCLMULQDQ]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE dnl # AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE], [ @@ -434,10 +498,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { char b[4096] __attribute__ ((aligned (64))); __asm__ __volatile__("xsave %[b]\n" : : [b] "m" (*b) : "memory"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -455,10 +520,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVEOPT], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { char b[4096] __attribute__ ((aligned (64))); __asm__ __volatile__("xsaveopt %[b]\n" : : [b] "m" (*b) : "memory"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -476,10 +542,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVES], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { char b[4096] __attribute__ ((aligned (64))); __asm__ __volatile__("xsaves %[b]\n" : : [b] "m" (*b) : "memory"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) diff --git a/sys/contrib/openzfs/config/user-statx.m4 b/sys/contrib/openzfs/config/user-statx.m4 new file mode 100644 index 000000000000..1ba74a40e9b8 --- /dev/null +++ b/sys/contrib/openzfs/config/user-statx.m4 @@ -0,0 +1,34 @@ +dnl # +dnl # Check for statx() function and STATX_MNT_ID availability +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [ + AC_CHECK_HEADERS([sys/stat.h], + [have_stat_headers=yes], + [have_stat_headers=no]) + + AS_IF([test "x$have_stat_headers" = "xyes"], [ + AC_CHECK_FUNC([statx], [ + AC_DEFINE([HAVE_STATX], [1], [statx() is available]) 
+ + dnl Check for STATX_MNT_ID availability + AC_MSG_CHECKING([for STATX_MNT_ID]) + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([[ + #include <sys/stat.h> + ]], [[ + struct statx stx; + int mask = STATX_MNT_ID; + (void)mask; + (void)stx.stx_mnt_id; + ]]) + ], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_STATX_MNT_ID], [1], [STATX_MNT_ID is available]) + ], [ + AC_MSG_RESULT([no]) + ]) + ]) + ], [ + AC_MSG_WARN([sys/stat.h not found; skipping statx support]) + ]) +]) dnl end AC_DEFUN diff --git a/sys/contrib/openzfs/config/user.m4 b/sys/contrib/openzfs/config/user.m4 index badd920d2b8a..62e59ed94437 100644 --- a/sys/contrib/openzfs/config/user.m4 +++ b/sys/contrib/openzfs/config/user.m4 @@ -17,6 +17,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_LIBUDEV ZFS_AC_CONFIG_USER_LIBUUID ZFS_AC_CONFIG_USER_LIBBLKID + ZFS_AC_CONFIG_USER_STATX ]) ZFS_AC_CONFIG_USER_LIBTIRPC ZFS_AC_CONFIG_USER_LIBCRYPTO diff --git a/sys/contrib/openzfs/config/zfs-build.m4 b/sys/contrib/openzfs/config/zfs-build.m4 index 7cf1b02d8757..161d390466db 100644 --- a/sys/contrib/openzfs/config/zfs-build.m4 +++ b/sys/contrib/openzfs/config/zfs-build.m4 @@ -252,10 +252,12 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ ZFS_AC_CONFIG_ALWAYS_CC_NO_CLOBBERED ZFS_AC_CONFIG_ALWAYS_CC_INFINITE_RECURSION ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_INFINITE_RECURSION + ZFS_AC_CONFIG_ALWAYS_CC_NO_ATOMIC_ALIGNMENT ZFS_AC_CONFIG_ALWAYS_CC_IMPLICIT_FALLTHROUGH ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH + ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_NO_FORMAT_ZERO_LENGTH ZFS_AC_CONFIG_ALWAYS_CC_FORMAT_OVERFLOW ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA @@ -265,6 +267,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD ZFS_AC_CONFIG_ALWAYS_SYSTEM ZFS_AC_CONFIG_ALWAYS_ARCH + ZFS_AC_CONFIG_CHECK_ARCH_VAR ZFS_AC_CONFIG_ALWAYS_PYTHON ZFS_AC_CONFIG_ALWAYS_PYZFS ZFS_AC_CONFIG_ALWAYS_SED diff --git a/sys/contrib/openzfs/contrib/debian/control b/sys/contrib/openzfs/contrib/debian/control index 96a2bdd88665..c5358dedc0fd 100644 --- a/sys/contrib/openzfs/contrib/debian/control +++ b/sys/contrib/openzfs/contrib/debian/control @@ -100,8 +100,8 @@ Depends: ${misc:Depends}, ${shlibs:Depends} # The libcurl4 is loaded through dlopen("libcurl.so.4"). 
# https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=988521 Recommends: libcurl4 -Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux -Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux +Breaks: libzfs2, libzfs4, libzfs4linux, libzfs6linux, openzfs-libzfs4 +Replaces: libzfs2, libzfs4, libzfs4linux, libzfs6linux, openzfs-libzfs4 Conflicts: libzfs6linux Description: OpenZFS filesystem library for Linux - general support OpenZFS is a storage platform that encompasses the functionality of @@ -128,8 +128,8 @@ Package: openzfs-libzpool6 Section: contrib/libs Architecture: linux-any Depends: ${misc:Depends}, ${shlibs:Depends} -Breaks: libzpool2, libzpool5, libzpool5linux, libzpool6linux -Replaces: libzpool2, libzpool5, libzpool5linux, libzpool6linux +Breaks: libzpool2, libzpool5, libzpool6linux +Replaces: libzpool2, libzpool5, libzpool6linux Conflicts: libzpool6linux Description: OpenZFS pool library for Linux OpenZFS is a storage platform that encompasses the functionality of diff --git a/sys/contrib/openzfs/contrib/debian/copyright b/sys/contrib/openzfs/contrib/debian/copyright index 65c7d209d8eb..006f32fdf924 100644 --- a/sys/contrib/openzfs/contrib/debian/copyright +++ b/sys/contrib/openzfs/contrib/debian/copyright @@ -4,7 +4,7 @@ The detailed contributor information can be found in [2][3]. Files: contrib/debian/* Copyright: - 2013-2016, Aron Xu <aron@debian.org> + 2013-2025, Aron Xu <aron@debian.org> 2016, Petter Reinholdtsen <pere@hungry.com> 2013, Carlos Alberto Lopez Perez <clopez@igalia.com> 2013, Turbo Fredriksson <turbo@bayour.com> @@ -12,6 +12,8 @@ Copyright: 2011-2013, Darik Horn <dajhorn@vanadac.com> 2018-2019, Mo Zhou <cdluminate@gmail.com> 2018-2020, Mo Zhou <lumin@debian.org> + 2023-2024, Shengqi Chen <harry-chen@outlook.com> + 2024-2025, Shengqi Chen <harry@debian.org> License: GPL-2+ [1] https://tracker.debian.org/pkg/zfs-linux diff --git a/sys/contrib/openzfs/contrib/debian/not-installed b/sys/contrib/openzfs/contrib/debian/not-installed index 88557f76fcae..9c08da5a6a7b 100644 --- a/sys/contrib/openzfs/contrib/debian/not-installed +++ b/sys/contrib/openzfs/contrib/debian/not-installed @@ -1,4 +1,4 @@ -usr/bin/arc_summary.py +usr/bin/zarcsummary.py usr/share/zfs/zfs-helpers.sh etc/default/zfs etc/init.d @@ -9,4 +9,4 @@ etc/zfs/vdev_id.conf.sas_direct.example etc/zfs/vdev_id.conf.sas_switch.example etc/zfs/vdev_id.conf.scsi.example etc/zfs/zfs-functions -lib/systemd/system/zfs-import.service +usr/lib/systemd/system/zfs-import.service diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-libnvpair3.install.in b/sys/contrib/openzfs/contrib/debian/openzfs-libnvpair3.install.in index ed7b541e3607..fce542270dd8 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-libnvpair3.install.in +++ b/sys/contrib/openzfs/contrib/debian/openzfs-libnvpair3.install.in @@ -1 +1 @@ -lib/@DEB_HOST_MULTIARCH@/libnvpair.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libnvpair.so.* diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-libpam-zfs.install b/sys/contrib/openzfs/contrib/debian/openzfs-libpam-zfs.install index c33123f69a8d..bafdebe9bb91 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-libpam-zfs.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-libpam-zfs.install @@ -1,2 +1,2 @@ -lib/*/security/pam_zfs_key.so +usr/lib/*/security/pam_zfs_key.so usr/share/pam-configs/zfs_key diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-libpam-zfs.postinst b/sys/contrib/openzfs/contrib/debian/openzfs-libpam-zfs.postinst index 03893454eee9..db4db73d6d5a 100644 --- 
a/sys/contrib/openzfs/contrib/debian/openzfs-libpam-zfs.postinst +++ b/sys/contrib/openzfs/contrib/debian/openzfs-libpam-zfs.postinst @@ -1,7 +1,7 @@ #!/bin/sh set -e -if ! $(ldd "/lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)/security/pam_zfs_key.so" | grep -q "libasan") ; then +if ! $(ldd "/usr/lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)/security/pam_zfs_key.so" | grep -q "libasan") ; then pam-auth-update --package fi diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-libuutil3.install.in b/sys/contrib/openzfs/contrib/debian/openzfs-libuutil3.install.in index a197d030d743..bb33386791e1 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-libuutil3.install.in +++ b/sys/contrib/openzfs/contrib/debian/openzfs-libuutil3.install.in @@ -1 +1 @@ -lib/@DEB_HOST_MULTIARCH@/libuutil.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libuutil.so.* diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-libzfs-dev.install.in b/sys/contrib/openzfs/contrib/debian/openzfs-libzfs-dev.install.in index eaa8c3925e24..5673e2661c6a 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-libzfs-dev.install.in +++ b/sys/contrib/openzfs/contrib/debian/openzfs-libzfs-dev.install.in @@ -1,3 +1,5 @@ -lib/@DEB_HOST_MULTIARCH@/*.a usr/lib/@DEB_HOST_MULTIARCH@ +usr/lib/@DEB_HOST_MULTIARCH@/*.a +usr/lib/@DEB_HOST_MULTIARCH@/*.so +usr/lib/@DEB_HOST_MULTIARCH@/pkgconfig usr/include -usr/lib/@DEB_HOST_MULTIARCH@ + diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-libzfs6.install.in b/sys/contrib/openzfs/contrib/debian/openzfs-libzfs6.install.in index 6765aaee59cc..a9054c14cc73 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-libzfs6.install.in +++ b/sys/contrib/openzfs/contrib/debian/openzfs-libzfs6.install.in @@ -1,2 +1,2 @@ -lib/@DEB_HOST_MULTIARCH@/libzfs.so.* -lib/@DEB_HOST_MULTIARCH@/libzfs_core.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libzfs.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libzfs_core.so.* diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-libzfsbootenv1.install.in b/sys/contrib/openzfs/contrib/debian/openzfs-libzfsbootenv1.install.in index 49216742433f..b61b8ab63265 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-libzfsbootenv1.install.in +++ b/sys/contrib/openzfs/contrib/debian/openzfs-libzfsbootenv1.install.in @@ -1 +1 @@ -lib/@DEB_HOST_MULTIARCH@/libzfsbootenv.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libzfsbootenv.so.* diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-libzpool6.install.in b/sys/contrib/openzfs/contrib/debian/openzfs-libzpool6.install.in index b9e872df9ba8..0e087a2709b3 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-libzpool6.install.in +++ b/sys/contrib/openzfs/contrib/debian/openzfs-libzpool6.install.in @@ -1 +1 @@ -lib/@DEB_HOST_MULTIARCH@/libzpool.so.* +usr/lib/@DEB_HOST_MULTIARCH@/libzpool.so.* diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfs-test.install b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-test.install index b3afef50dbd4..496cab2ad5e4 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-zfs-test.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-test.install @@ -1,4 +1,4 @@ -sbin/ztest +usr/sbin/ztest usr/bin/raidz_test usr/share/man/man1/raidz_test.1 usr/share/man/man1/test-runner.1 diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.install b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.install index a348ba828ee5..30699a8a98da 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfs-zed.install @@ -1,5 +1,5 @@ etc/zfs/zed.d/* 
-lib/systemd/system/zfs-zed.service +usr/lib/systemd/system/zfs-zed.service usr/lib/zfs-linux/zed.d/* usr/sbin/zed usr/share/man/man8/zed.8 diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install index 4573cc77ea74..6810108f2c5d 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install @@ -1,47 +1,48 @@ etc/default/zfs etc/zfs/zfs-functions etc/zfs/zpool.d/ -lib/systemd/system-generators/ -lib/systemd/system-preset/ -lib/systemd/system/zfs-import-cache.service -lib/systemd/system/zfs-import-scan.service -lib/systemd/system/zfs-import.target -lib/systemd/system/zfs-load-key.service -lib/systemd/system/zfs-mount.service -lib/systemd/system/zfs-scrub-monthly@.timer -lib/systemd/system/zfs-scrub-weekly@.timer -lib/systemd/system/zfs-scrub@.service -lib/systemd/system/zfs-trim-monthly@.timer -lib/systemd/system/zfs-trim-weekly@.timer -lib/systemd/system/zfs-trim@.service -lib/systemd/system/zfs-share.service -lib/systemd/system/zfs-volume-wait.service -lib/systemd/system/zfs-volumes.target -lib/systemd/system/zfs.target -lib/udev/ -sbin/fsck.zfs -sbin/mount.zfs -sbin/zdb -sbin/zfs -sbin/zfs_ids_to_path -sbin/zgenhostid -sbin/zhack -sbin/zinject -sbin/zpool -sbin/zstream -sbin/zstreamdump +usr/lib/systemd/system-generators/ +usr/lib/systemd/system-preset/ +usr/lib/systemd/system/zfs-import-cache.service +usr/lib/systemd/system/zfs-import-scan.service +usr/lib/systemd/system/zfs-import.target +usr/lib/systemd/system/zfs-load-key.service +usr/lib/systemd/system/zfs-mount.service +usr/lib/systemd/system/zfs-mount@.service +usr/lib/systemd/system/zfs-scrub-monthly@.timer +usr/lib/systemd/system/zfs-scrub-weekly@.timer +usr/lib/systemd/system/zfs-scrub@.service +usr/lib/systemd/system/zfs-trim-monthly@.timer +usr/lib/systemd/system/zfs-trim-weekly@.timer +usr/lib/systemd/system/zfs-trim@.service +usr/lib/systemd/system/zfs-share.service +usr/lib/systemd/system/zfs-volume-wait.service +usr/lib/systemd/system/zfs-volumes.target +usr/lib/systemd/system/zfs.target +usr/lib/udev/ +usr/sbin/fsck.zfs +usr/sbin/mount.zfs +usr/sbin/zdb +usr/sbin/zfs +usr/sbin/zfs_ids_to_path +usr/sbin/zgenhostid +usr/sbin/zhack +usr/sbin/zinject +usr/sbin/zpool +usr/sbin/zstream +usr/sbin/zstreamdump usr/bin/zvol_wait -usr/lib/modules-load.d/ lib/ +usr/lib/modules-load.d/ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb usr/lib/zfs-linux/zfs_prepare_disk -usr/sbin/arc_summary -usr/sbin/arcstat -usr/sbin/dbufstat -usr/sbin/zilstat +usr/bin/zarcsummary +usr/bin/zarcstat +usr/bin/dbufstat usr/sbin +usr/bin/zilstat usr/share/zfs/compatibility.d/ usr/share/bash-completion/completions -usr/share/man/man1/arcstat.1 +usr/share/man/man1/zarcstat.1 usr/share/man/man1/zhack.1 usr/share/man/man1/zvol_wait.1 usr/share/man/man5/ diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.links b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.links new file mode 100644 index 000000000000..54099e6573b0 --- /dev/null +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.links @@ -0,0 +1,3 @@ +usr/sbin/zfs usr/bin/zfs +usr/sbin/zpool usr/bin/zpool +usr/lib/zfs-linux/zpool_influxdb usr/bin/zpool_influxdb diff --git a/sys/contrib/openzfs/contrib/debian/rules.in b/sys/contrib/openzfs/contrib/debian/rules.in index 3226d604546c..5087a7e18e16 100755 --- a/sys/contrib/openzfs/contrib/debian/rules.in +++ b/sys/contrib/openzfs/contrib/debian/rules.in @@ -37,18 +37,19 @@ 
override_dh_auto_configure: @# Build the userland, but don't build the kernel modules. dh_auto_configure -- @CFGOPTS@ \ --bindir=/usr/bin \ - --sbindir=/sbin \ - --libdir=/lib/"$(DEB_HOST_MULTIARCH)" \ - --with-udevdir=/lib/udev \ + --sbindir=/usr/sbin \ + --with-mounthelperdir=/usr/sbin \ + --libdir=/usr/lib/"$(DEB_HOST_MULTIARCH)" \ + --with-udevdir=/usr/lib/udev \ --with-zfsexecdir=/usr/lib/zfs-linux \ --enable-systemd \ --enable-pyzfs \ --with-python=python3 \ - --with-pammoduledir='/lib/$(DEB_HOST_MULTIARCH)/security' \ + --with-pammoduledir='/usr/lib/$(DEB_HOST_MULTIARCH)/security' \ --with-pkgconfigdir='/usr/lib/$(DEB_HOST_MULTIARCH)/pkgconfig' \ - --with-systemdunitdir=/lib/systemd/system \ - --with-systemdpresetdir=/lib/systemd/system-preset \ - --with-systemdgeneratordir=/lib/systemd/system-generators \ + --with-systemdunitdir=/usr/lib/systemd/system \ + --with-systemdpresetdir=/usr/lib/systemd/system-preset \ + --with-systemdgeneratordir=/usr/lib/systemd/system-generators \ --with-config=user for i in $(wildcard $(CURDIR)/debian/*.install.in) ; do \ @@ -77,23 +78,10 @@ override_dh_auto_install: @# Install the utilities. $(MAKE) install DESTDIR='$(CURDIR)/debian/tmp' - # Move from bin_dir to /usr/sbin - # Remove suffix (.py) as per policy 10.4 - Scripts - # https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts - mkdir -p '$(CURDIR)/debian/tmp/usr/sbin/' - mv '$(CURDIR)/debian/tmp/usr/bin/arc_summary' '$(CURDIR)/debian/tmp/usr/sbin/arc_summary' - mv '$(CURDIR)/debian/tmp/usr/bin/arcstat' '$(CURDIR)/debian/tmp/usr/sbin/arcstat' - mv '$(CURDIR)/debian/tmp/usr/bin/dbufstat' '$(CURDIR)/debian/tmp/usr/sbin/dbufstat' - mv '$(CURDIR)/debian/tmp/usr/bin/zilstat' '$(CURDIR)/debian/tmp/usr/sbin/zilstat' - - @# Zed has dependencies outside of the system root. - mv '$(CURDIR)/debian/tmp/sbin/zed' '$(CURDIR)/debian/tmp/usr/sbin/zed' - sed -i 's|ExecStart=/sbin/|ExecStart=/usr/sbin/|g' '$(CURDIR)/debian/tmp/lib/systemd/system/zfs-zed.service' - @# Install the DKMS source. @# We only want the files needed to build the modules install -D -t '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/scripts' \ - '$(CURDIR)/scripts/dkms.postbuild' + '$(CURDIR)/scripts/dkms.postbuild' '$(CURDIR)/scripts/objtool-wrapper.in' $(foreach file,$(DKMSFILES),mv '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/$(file)' '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)' || exit 1;) @# Only ever build Linux modules @@ -108,8 +96,8 @@ override_dh_auto_install: @# - zfs.release$ @# * Takes care of spaces and tabs @# * Remove reference to ZFS_AC_PACKAGE - awk '/^AC_CONFIG_FILES\(\[/,/^\]\)/ {\ - if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))/) \ + awk '/^AC_CONFIG_FILES\(\[/,/\]\)/ {\ + if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))|scripts\/objtool-wrapper.*\]\)$$/) \ {next} } {print}' \ '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac' | sed '/ZFS_AC_PACKAGE/d' > '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac' @# Set "SUBDIRS = module include" for CONFIG_KERNEL and remove SUBDIRS for all other configs. 
@@ -131,11 +119,6 @@ override_dh_auto_install: cd '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)'; ./autogen.sh rm -fr '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/autom4te.cache' - for i in `ls $(CURDIR)/debian/tmp/lib/$(DEB_HOST_MULTIARCH)/*.so`; do \ - ln -s '/lib/$(DEB_HOST_MULTIARCH)/'`readlink $${i}` '$(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/'`basename $${i}`; \ - rm $${i}; \ - done - chmod a-x '$(CURDIR)/debian/tmp/etc/zfs/zfs-functions' chmod a-x '$(CURDIR)/debian/tmp/etc/default/zfs' @@ -159,7 +142,7 @@ override_dh_auto_clean: @if test -e META.orig; then mv META.orig META; fi override_dh_install: - find debian/tmp/lib -name '*.la' -delete + find debian/tmp/usr/lib -name '*.la' -delete dh_install override_dh_missing: @@ -173,8 +156,8 @@ override_dh_installinit: dh_installinit -R --name zfs-zed override_dh_installsystemd: - mkdir -p debian/openzfs-zfsutils/lib/systemd/system - ln -sr /dev/null debian/openzfs-zfsutils/lib/systemd/system/zfs-import.service + mkdir -p debian/openzfs-zfsutils/usr/lib/systemd/system + ln -sr /dev/null debian/openzfs-zfsutils/usr/lib/systemd/system/zfs-import.service dh_installsystemd --no-stop-on-upgrade -X zfs-zed.service dh_installsystemd --name zfs-zed diff --git a/sys/contrib/openzfs/contrib/debian/tree/zfs-initramfs/usr/share/initramfs-tools/hooks/zdev b/sys/contrib/openzfs/contrib/debian/tree/zfs-initramfs/usr/share/initramfs-tools/hooks/zdev index 0cf21a4211a8..d4f968aed8f2 100755 --- a/sys/contrib/openzfs/contrib/debian/tree/zfs-initramfs/usr/share/initramfs-tools/hooks/zdev +++ b/sys/contrib/openzfs/contrib/debian/tree/zfs-initramfs/usr/share/initramfs-tools/hooks/zdev @@ -5,7 +5,7 @@ PREREQ="udev" PREREQ_UDEV_RULES="60-zvol.rules 69-vdev.rules" -COPY_EXEC_LIST="/lib/udev/zvol_id /lib/udev/vdev_id" +COPY_EXEC_LIST="/usr/lib/udev/zvol_id /usr/lib/udev/vdev_id" # Generic result code. RC=0 diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in index acad468edfd1..130d94c70707 100755 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in @@ -16,7 +16,8 @@ depends() { } installkernel() { - instmods -c zfs + hostonly='' instmods -c zfs + instmods mpt3sas virtio_blk } install() { diff --git a/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/LICENSE b/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/LICENSE new file mode 100644 index 000000000000..04c03a37e0cb --- /dev/null +++ b/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/LICENSE @@ -0,0 +1,253 @@ +BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL +licensing. Files that are completely new have a Google copyright and an ISC +license. This license is reproduced at the bottom of this file. + +Contributors to BoringSSL are required to follow the CLA rules for Chromium: +https://cla.developers.google.com/clas + +Files in third_party/ have their own licenses, as described therein. The MIT +license, for third_party/fiat, which, unlike other third_party directories, is +compiled into non-test libraries, is included below. + +The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the +OpenSSL License and the original SSLeay license apply to the toolkit. See below +for the actual license texts. Actually both licenses are BSD-style Open Source +licenses. In case of any license issues related to OpenSSL please contact +openssl-core@openssl.org. 
+ +The following are Google-internal bug numbers where explicit permission from +some authors is recorded for use of their work. (This is purely for our own +record keeping.) + 27287199 + 27287880 + 27287883 + 263291445 + + + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + + +ISC license used for completely new code in BoringSSL: + +/* Copyright 2015 The BoringSSL Authors + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ + + +The code in third_party/fiat carries the MIT license: + +Copyright (c) 2015-2016 the fiat-crypto authors (see +https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS). + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +Licenses for support code +------------------------- + +Parts of the TLS test suite are under the Go license. This code is not included +in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +distributing code linked against BoringSSL does not trigger this license: + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +BoringSSL uses the Chromium test infrastructure to run a continuous build, +trybots etc. The scripts which manage this, and the script for generating build +metadata, are under the Chromium license. Distributing code linked against +BoringSSL does not trigger this license. + +Copyright 2015 The Chromium Authors. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/README b/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/README new file mode 100644 index 000000000000..aa6fb6d477fa --- /dev/null +++ b/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/README @@ -0,0 +1,11 @@ +This directory contains the original BoringSSL [1] GCM x86-64 assembly +files [2]. + +The assembler files were then further modified to fit the ICP conventions. + +The main purpose of including these files (and the original ones) here is to +serve as a reference if upstream changes need to be applied to the files +included and modified in the ICP. + +[1] https://github.com/google/boringssl +[2] https://github.com/google/boringssl/blob/d5440dd2c2c500ac2d3bba4afec47a054b4d99ae/gen/bcm/aes-gcm-avx2-x86_64-linux.S diff --git a/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/aes-gcm-avx2-x86_64-linux.S b/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/aes-gcm-avx2-x86_64-linux.S new file mode 100644 index 000000000000..e7327c9de872 --- /dev/null +++ b/sys/contrib/openzfs/contrib/icp/gcm-simd/boringssl/aes-gcm-avx2-x86_64-linux.S @@ -0,0 +1,1328 @@ +// SPDX-License-Identifier: Apache-2.0 +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand.
+ +#include <openssl/asm_base.h> + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 16 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.align 32 + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.Linc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl gcm_init_vpclmulqdq_avx2 +.hidden gcm_init_vpclmulqdq_avx2 +.type gcm_init_vpclmulqdq_avx2,@function +.align 32 +gcm_init_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 .Lgfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x01,%xmm3,%xmm3,%xmm1 + vpclmulqdq $0x10,%xmm3,%xmm3,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + + vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_vpclmulqdq_avx2, . 
- gcm_init_vpclmulqdq_avx2 +.globl gcm_gmult_vpclmulqdq_avx2 +.hidden gcm_gmult_vpclmulqdq_avx2 +.type gcm_gmult_vpclmulqdq_avx2,@function +.align 32 +gcm_gmult_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu .Lbswap_mask(%rip),%xmm1 + vmovdqu 128-16(%rsi),%xmm2 + vmovdqu .Lgfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpxor %xmm4,%xmm5,%xmm5 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpxor %xmm5,%xmm0,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + ret + +.cfi_endproc +.size gcm_gmult_vpclmulqdq_avx2, . - gcm_gmult_vpclmulqdq_avx2 +.globl gcm_ghash_vpclmulqdq_avx2 +.hidden gcm_ghash_vpclmulqdq_avx2 +.type gcm_ghash_vpclmulqdq_avx2,@function +.align 32 +gcm_ghash_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + + + + vmovdqu .Lbswap_mask(%rip),%xmm6 + vmovdqu .Lgfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + cmpq $32,%rcx + jb .Lghash_lastblock + + + + vinserti128 $1,%xmm6,%ymm6,%ymm6 + vinserti128 $1,%xmm7,%ymm7,%ymm7 + + cmpq $127,%rcx + jbe .Lghash_loop_1x + + + vmovdqu 128(%rsi),%ymm8 + vmovdqu 128+32(%rsi),%ymm9 +.Lghash_loop_4x: + + vmovdqu 0(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 0(%rsi),%ymm2 + vpxor %ymm5,%ymm1,%ymm1 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4 + + vmovdqu 32(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 32(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu 64(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + + vmovdqu 96(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 96(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm5,%ymm4,%ymm4 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0 + vpshufd $0x4e,%ymm3,%ymm3 + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0 + vpshufd $0x4e,%ymm4,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm0,%ymm5,%ymm5 + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + + subq $-128,%rdx + addq $-128,%rcx + cmpq $127,%rcx + ja .Lghash_loop_4x + + + cmpq $32,%rcx + jb .Lghash_loop_1x_done +.Lghash_loop_1x: + vmovdqu (%rdx),%ymm0 + vpshufb %ymm6,%ymm0,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vmovdqu 128-32(%rsi),%ymm0 + vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3 + vpshufd $0x4e,%ymm1,%ymm1 
+ vpxor %ymm1,%ymm2,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1 + vpshufd $0x4e,%ymm2,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm1,%ymm5,%ymm5 + + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + addq $32,%rdx + subq $32,%rcx + cmpq $32,%rcx + jae .Lghash_loop_1x +.Lghash_loop_1x_done: + + +.Lghash_lastblock: + testq %rcx,%rcx + jz .Lghash_done + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +.Lghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_ghash_vpclmulqdq_avx2, . - gcm_ghash_vpclmulqdq_avx2 +.globl aes_gcm_enc_update_vaes_avx2 +.hidden aes_gcm_enc_update_vaes_avx2 +.type aes_gcm_enc_update_vaes_avx2,@function +.align 32 +aes_gcm_enc_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+8(%rip) +#endif + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe .Lghash_last_ciphertext_4x__func1 +.align 16 +.Lcrypt_loop_4x__func1: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor 
%ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func1 + je .Laes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func1: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 
+ vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq 
$0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func1 + je .Lxor_two_blocks__func1 + +.Lxor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func1: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func1: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_enc_update_vaes_avx2, . 
- aes_gcm_enc_update_vaes_avx2 +.globl aes_gcm_dec_update_vaes_avx2 +.hidden aes_gcm_dec_update_vaes_avx2 +.type aes_gcm_dec_update_vaes_avx2,@function +.align 32 +aes_gcm_dec_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.align 16 +.Lcrypt_loop_4x__func2: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func2: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc 
%ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + 
+.Llessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func2 + je .Lxor_two_blocks__func2 + +.Lxor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func2: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func2: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_dec_update_vaes_avx2, . - aes_gcm_dec_update_vaes_avx2 +#endif diff --git a/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in b/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in index 4776087d9a76..db9bf0e20274 100644 --- a/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in +++ b/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in @@ -8,3 +8,12 @@ fi . 
/usr/share/initramfs-tools/hook-functions copy_exec /usr/share/initramfs-tools/zfsunlock /usr/bin/zfsunlock + +if [ -f /etc/initramfs-tools/etc/motd ]; then + copy_file text /etc/initramfs-tools/etc/motd /etc/motd +else + tmpf=$(mktemp) + echo "If you use zfs encrypted root filesystems, you can use \`zfsunlock\` to manually unlock it" > "$tmpf" + copy_file text "$tmpf" /etc/motd + rm -f "$tmpf" +fi diff --git a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs index c569b2528368..67707e9d80f4 100644 --- a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs +++ b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs @@ -979,7 +979,8 @@ mountroot() touch /run/zfs_unlock_complete if [ -e /run/zfs_unlock_complete_notify ]; then - read -r < /run/zfs_unlock_complete_notify + # shellcheck disable=SC2034 + read -r zfs_unlock_complete_notify < /run/zfs_unlock_complete_notify fi # ------------ diff --git a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c index a0bc172c6f44..88698dedabbc 100644 --- a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c +++ b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c @@ -391,7 +391,11 @@ static int zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, int argc, const char **argv) { +#if defined(__FreeBSD__) + config->homes_prefix = strdup("zroot/home"); +#else config->homes_prefix = strdup("rpool/home"); +#endif if (config->homes_prefix == NULL) { pam_syslog(pamh, LOG_ERR, "strdup failure"); return (PAM_SERVICE_ERR); diff --git a/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/exceptions.py b/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/exceptions.py index b26a37f5de10..26d66a452726 100644 --- a/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/exceptions.py +++ b/sys/contrib/openzfs/contrib/pyzfs/libzfs_core/exceptions.py @@ -604,5 +604,4 @@ class RaidzExpansionRunning(ZFSError): errno = ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS message = "A raidz device is currently expanding" - # vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/sys/contrib/openzfs/etc/Makefile.am b/sys/contrib/openzfs/etc/Makefile.am index 7187762d3802..808c729cd968 100644 --- a/sys/contrib/openzfs/etc/Makefile.am +++ b/sys/contrib/openzfs/etc/Makefile.am @@ -56,6 +56,7 @@ systemdunit_DATA = \ %D%/systemd/system/zfs-import-scan.service \ %D%/systemd/system/zfs-import.target \ %D%/systemd/system/zfs-mount.service \ + %D%/systemd/system/zfs-mount@.service \ %D%/systemd/system/zfs-scrub-monthly@.timer \ %D%/systemd/system/zfs-scrub-weekly@.timer \ %D%/systemd/system/zfs-scrub@.service \ diff --git a/sys/contrib/openzfs/etc/systemd/system/zfs-mount@.service.in b/sys/contrib/openzfs/etc/systemd/system/zfs-mount@.service.in new file mode 100644 index 000000000000..0698fad12074 --- /dev/null +++ b/sys/contrib/openzfs/etc/systemd/system/zfs-mount@.service.in @@ -0,0 +1,26 @@ +[Unit] +Description=Mount ZFS filesystem %I +Documentation=man:zfs(8) +DefaultDependencies=no +After=systemd-udev-settle.service +After=zfs-import.target +After=zfs-mount.service +After=systemd-remount-fs.service +Before=local-fs.target +ConditionPathIsDirectory=/sys/module/zfs + +# This merely tells the service manager +# that unmounting everything undoes the +# effect of this service. No extra logic +# is ran as a result of these settings. 
+Conflicts=umount.target +Before=umount.target + +[Service] +Type=oneshot +RemainAfterExit=yes +EnvironmentFile=-@initconfdir@/zfs +ExecStart=@sbindir@/zfs mount -R %I + +[Install] +WantedBy=zfs.target diff --git a/sys/contrib/openzfs/include/Makefile.am b/sys/contrib/openzfs/include/Makefile.am index a9258deabfd7..7588cd0aedc9 100644 --- a/sys/contrib/openzfs/include/Makefile.am +++ b/sys/contrib/openzfs/include/Makefile.am @@ -10,6 +10,7 @@ COMMON_H = \ cityhash.h \ zfeature_common.h \ zfs_comutil.h \ + zfs_crrd.h \ zfs_deleg.h \ zfs_fletcher.h \ zfs_namecheck.h \ @@ -69,7 +70,6 @@ COMMON_H = \ sys/metaslab_impl.h \ sys/mmp.h \ sys/mntent.h \ - sys/mod.h \ sys/multilist.h \ sys/nvpair.h \ sys/nvpair_impl.h \ diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h index 485af793862c..3fcdc176a621 100644 --- a/sys/contrib/openzfs/include/libzfs.h +++ b/sys/contrib/openzfs/include/libzfs.h @@ -30,6 +30,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> + * Copyright (c) 2025 Hewlett Packard Enterprise Development LP. */ #ifndef _LIBZFS_H @@ -288,10 +289,22 @@ typedef struct trimflags { uint64_t rate; } trimflags_t; +typedef struct trim_cbdata { + trimflags_t trim_flags; + pool_trim_func_t cmd_type; +} trim_cbdata_t; + +typedef struct initialize_cbdata { + boolean_t wait; + pool_initialize_func_t cmd_type; +} initialize_cbdata_t; /* * Functions to manipulate pool and vdev state */ _LIBZFS_H int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); +_LIBZFS_H int zpool_scan_range(zpool_handle_t *, pool_scan_func_t, + pool_scrub_cmd_t, time_t, time_t); +_LIBZFS_H int zpool_initialize_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, nvlist_t *); _LIBZFS_H int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, @@ -304,7 +317,9 @@ _LIBZFS_H int zpool_reguid(zpool_handle_t *); _LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *); _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); +_LIBZFS_H void zpool_collect_leaves(zpool_handle_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); +_LIBZFS_H int zpool_trim_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t, uint64_t); diff --git a/sys/contrib/openzfs/include/os/freebsd/Makefile.am b/sys/contrib/openzfs/include/os/freebsd/Makefile.am index d975c4fe69fa..d6b6923d033f 100644 --- a/sys/contrib/openzfs/include/os/freebsd/Makefile.am +++ b/sys/contrib/openzfs/include/os/freebsd/Makefile.am @@ -33,7 +33,7 @@ noinst_HEADERS = \ %D%/spl/sys/list_impl.h \ %D%/spl/sys/lock.h \ %D%/spl/sys/misc.h \ - %D%/spl/sys/mod_os.h \ + %D%/spl/sys/mod.h \ %D%/spl/sys/mode.h \ %D%/spl/sys/mount.h \ %D%/spl/sys/mutex.h \ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h index 974704e92bbd..32bc02f3dc86 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/debug.h @@ -69,6 +69,10 @@ #define __maybe_unused __attribute__((unused)) #endif +#ifndef __must_check +#define __must_check __attribute__((__warn_unused_result__)) +#endif + /* * Without this, we see warnings from objtool during normal Linux builds when * the kernel is built with CONFIG_STACK_VALIDATION=y: diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h 
b/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h index 091ebe772810..acce8734b2c5 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h @@ -56,4 +56,9 @@ struct opensolaris_utsname { #define task_io_account_read(n) #define task_io_account_write(n) +/* + * Check if the current thread is a memory reclaim thread. + */ +extern int current_is_reclaim_thread(void); + #endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod_os.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod.h index 4214189c32df..4214189c32df 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod.h diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h index c6bc10d6babe..1cbd79ec893f 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/proc.h @@ -77,8 +77,8 @@ do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, /* * Be sure there are no surprises. */ - ASSERT(stk == NULL); - ASSERT(len == 0); + ASSERT0P(stk); + ASSERT0(len); ASSERT(state == TS_RUN); if (pp == &p0) diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/time.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/time.h index 2f5fe4619ef7..14b42f2e7087 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/time.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/time.h @@ -63,6 +63,17 @@ typedef longlong_t hrtime_t; #define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) static __inline hrtime_t +getlrtime(void) +{ + struct timespec ts; + hrtime_t nsec; + + getnanouptime(&ts); + nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec; + return (nsec); +} + +static __inline hrtime_t gethrtime(void) { struct timespec ts; diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vm.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vm.h index 454078f0fe79..d36bee881d0b 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vm.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vm.h @@ -35,6 +35,7 @@ extern const int zfs_vm_pagerret_bad; extern const int zfs_vm_pagerret_error; extern const int zfs_vm_pagerret_ok; +extern const int zfs_vm_pagerret_pend; extern const int zfs_vm_pagerput_sync; extern const int zfs_vm_pagerput_inval; diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode_impl.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode_impl.h index 0df3378c23e7..b18836aa563e 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode_impl.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/vnode_impl.h @@ -227,6 +227,7 @@ struct taskq; #define LOOKUP_XATTR 0x02 /* lookup up extended attr dir */ #define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ #define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */ +#define LOOKUP_NAMED_ATTR 0x10 /* Lookup a named attribute */ /* * Public vnode manipulation functions. 
diff --git a/sys/contrib/openzfs/include/os/linux/Makefile.am b/sys/contrib/openzfs/include/os/linux/Makefile.am index 4fe6705defe5..e156ca183dbd 100644 --- a/sys/contrib/openzfs/include/os/linux/Makefile.am +++ b/sys/contrib/openzfs/include/os/linux/Makefile.am @@ -75,7 +75,7 @@ kernel_spl_sys_HEADERS = \ %D%/spl/sys/kstat.h \ %D%/spl/sys/list.h \ %D%/spl/sys/misc.h \ - %D%/spl/sys/mod_os.h \ + %D%/spl/sys/mod.h \ %D%/spl/sys/mutex.h \ %D%/spl/sys/param.h \ %D%/spl/sys/proc.h \ diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h index 16e8a319a5f8..152e5a606f0e 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h @@ -61,32 +61,6 @@ #endif /* - * 2.6.30 API change, - * The const keyword was added to the 'struct dentry_operations' in - * the dentry structure. To handle this we define an appropriate - * dentry_operations_t typedef which can be used. - */ -typedef const struct dentry_operations dentry_operations_t; - -/* - * 2.6.38 API addition, - * Added d_clear_d_op() helper function which clears some flags and the - * registered dentry->d_op table. This is required because d_set_d_op() - * issues a warning when the dentry operations table is already set. - * For the .zfs control directory to work properly we must be able to - * override the default operations table and register custom .d_automount - * and .d_revalidate callbacks. - */ -static inline void -d_clear_d_op(struct dentry *dentry) -{ - dentry->d_op = NULL; - dentry->d_flags &= ~( - DCACHE_OP_HASH | DCACHE_OP_COMPARE | - DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); -} - -/* * Walk and invalidate all dentry aliases of an inode * unless it's a mountpoint */ diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h index cd245a5f0135..326f471d7c9b 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/simd_x86.h @@ -139,15 +139,6 @@ */ #if defined(HAVE_KERNEL_FPU_INTERNAL) -/* - * For kernels not exporting *kfpu_{begin,end} we have to use inline assembly - * with the XSAVE{,OPT,S} instructions, so we need the toolchain to support at - * least XSAVE. 
- */ -#if !defined(HAVE_XSAVE) -#error "Toolchain needs to support the XSAVE assembler instruction" -#endif - #ifndef XFEATURE_MASK_XTILE /* * For kernels where this doesn't exist yet, we still don't want to break @@ -335,9 +326,13 @@ kfpu_begin(void) return; } #endif +#if defined(HAVE_XSAVE) if (static_cpu_has(X86_FEATURE_XSAVE)) { kfpu_do_xsave("xsave", state, ~XFEATURE_MASK_XTILE); - } else if (static_cpu_has(X86_FEATURE_FXSR)) { + return; + } +#endif + if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_save_fxsr(state); } else { kfpu_save_fsave(state); @@ -390,9 +385,13 @@ kfpu_end(void) goto out; } #endif +#if defined(HAVE_XSAVE) if (static_cpu_has(X86_FEATURE_XSAVE)) { kfpu_do_xrstor("xrstor", state, ~XFEATURE_MASK_XTILE); - } else if (static_cpu_has(X86_FEATURE_FXSR)) { + goto out; + } +#endif + if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_restore_fxsr(state); } else { kfpu_restore_fsave(state); @@ -599,6 +598,32 @@ zfs_movbe_available(void) } /* + * Check if VAES instruction set is available + */ +static inline boolean_t +zfs_vaes_available(void) +{ +#if defined(X86_FEATURE_VAES) + return (!!boot_cpu_has(X86_FEATURE_VAES)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if VPCLMULQDQ instruction set is available + */ +static inline boolean_t +zfs_vpclmulqdq_available(void) +{ +#if defined(X86_FEATURE_VPCLMULQDQ) + return (!!boot_cpu_has(X86_FEATURE_VPCLMULQDQ)); +#else + return (B_FALSE); +#endif +} + +/* * Check if SHA_NI instruction set is available */ static inline boolean_t diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/atomic.h b/sys/contrib/openzfs/include/os/linux/spl/sys/atomic.h index b2a39d7d6cbf..f4bcd58bd281 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/atomic.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/atomic.h @@ -71,6 +71,22 @@ atomic_cas_ptr(volatile void *target, void *cmp, void *newval) return ((void *)atomic_cas_64((volatile uint64_t *)target, (uint64_t)cmp, (uint64_t)newval)); } +static __inline__ void * +atomic_swap_ptr(volatile void *target, void *newval) +{ + return ((void *)atomic_swap_64((volatile uint64_t *)target, + (uint64_t)newval)); +} +static __inline__ void * +atomic_load_ptr(volatile void *target) +{ + return ((void *)atomic_load_64((volatile uint64_t *)target)); +} +static __inline__ void +atomic_store_ptr(volatile void *target, void *newval) +{ + atomic_store_64((volatile uint64_t *)target, (uint64_t)newval); +} #else /* _LP64 */ static __inline__ void * atomic_cas_ptr(volatile void *target, void *cmp, void *newval) @@ -78,6 +94,22 @@ atomic_cas_ptr(volatile void *target, void *cmp, void *newval) return ((void *)atomic_cas_32((volatile uint32_t *)target, (uint32_t)cmp, (uint32_t)newval)); } +static __inline__ void * +atomic_swap_ptr(volatile void *target, void *newval) +{ + return ((void *)atomic_swap_32((volatile uint32_t *)target, + (uint32_t)newval)); +} +static __inline__ void * +atomic_load_ptr(volatile void *target) +{ + return ((void *)atomic_load_32((volatile uint32_t *)target)); +} +static __inline__ void +atomic_store_ptr(volatile void *target, void *newval) +{ + atomic_store_32((volatile uint32_t *)target, (uint32_t)newval); +} #endif /* _LP64 */ #endif /* _SPL_ATOMIC_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h index 1671ba4074da..85b96e1e23a7 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/debug.h @@ -69,6 +69,10 @@ #define __maybe_unused 
__attribute__((unused)) #endif +#ifndef __must_check +#define __must_check __attribute__((__warn_unused_result__)) +#endif + /* * Without this, we see warnings from objtool during normal Linux builds when * the kernel is built with CONFIG_STACK_VALIDATION=y: diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h index 995236117dd4..fe34de9c179e 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h @@ -61,7 +61,7 @@ void *spl_kvmalloc(size_t size, gfp_t flags); /* * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion * function is context aware which means that KM_SLEEP allocations can be - * safely used in syncing contexts which have set PF_FSTRANS. + * safely used in syncing contexts which have set SPL_FSTRANS. */ static inline gfp_t kmem_flags_convert(int flags) @@ -91,25 +91,11 @@ typedef struct { } fstrans_cookie_t; /* - * Introduced in Linux 3.9, however this cannot be solely relied on before - * Linux 3.18 as it doesn't turn off __GFP_FS as it should. + * SPL_FSTRANS is the set of flags that indicate that the task is in a + * filesystem or IO codepath, and so any allocation must not call back into + * those codepaths (eg to swap). */ -#ifdef PF_MEMALLOC_NOIO -#define __SPL_PF_MEMALLOC_NOIO (PF_MEMALLOC_NOIO) -#else -#define __SPL_PF_MEMALLOC_NOIO (0) -#endif - -/* - * PF_FSTRANS is removed from Linux 4.12 - */ -#ifdef PF_FSTRANS -#define __SPL_PF_FSTRANS (PF_FSTRANS) -#else -#define __SPL_PF_FSTRANS (0) -#endif - -#define SPL_FSTRANS (__SPL_PF_FSTRANS|__SPL_PF_MEMALLOC_NOIO) +#define SPL_FSTRANS (PF_MEMALLOC_NOIO) static inline fstrans_cookie_t spl_fstrans_mark(void) @@ -141,43 +127,8 @@ spl_fstrans_check(void) return (current->flags & SPL_FSTRANS); } -/* - * specifically used to check PF_FSTRANS flag, cannot be relied on for - * checking spl_fstrans_mark(). 
- */ -static inline int -__spl_pf_fstrans_check(void) -{ - return (current->flags & __SPL_PF_FSTRANS); -} - -/* - * Kernel compatibility for GFP flags - */ -/* < 4.13 */ -#ifndef __GFP_RETRY_MAYFAIL -#define __GFP_RETRY_MAYFAIL __GFP_REPEAT -#endif -/* < 4.4 */ -#ifndef __GFP_RECLAIM -#define __GFP_RECLAIM __GFP_WAIT -#endif - -#ifdef HAVE_ATOMIC64_T -#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) -#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) -#define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) -#define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) extern atomic64_t kmem_alloc_used; -extern unsigned long long kmem_alloc_max; -#else /* HAVE_ATOMIC64_T */ -#define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) -#define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) -#define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) -#define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) -extern atomic_t kmem_alloc_used; -extern unsigned long long kmem_alloc_max; -#endif /* HAVE_ATOMIC64_T */ +extern uint64_t kmem_alloc_max; extern unsigned int spl_kmem_alloc_warn; extern unsigned int spl_kmem_alloc_max; diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/misc.h b/sys/contrib/openzfs/include/os/linux/spl/sys/misc.h index 0b44786f8a6e..fbaaf229bd1a 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/misc.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/misc.h @@ -24,7 +24,13 @@ #define _OS_LINUX_SPL_MISC_H #include <linux/kobject.h> +#include <linux/swap.h> extern void spl_signal_kobj_evt(struct block_device *bdev); +/* + * Check if the current thread is a memory reclaim thread. + */ +extern int current_is_reclaim_thread(void); + #endif diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/mod_os.h b/sys/contrib/openzfs/include/os/linux/spl/sys/mod.h index eaeb9255039e..eaeb9255039e 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/mod_os.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/mod.h diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h index f000f53ab9b6..4eca2414fc5b 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/mutex.h @@ -111,7 +111,7 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ #undef mutex_destroy #define mutex_destroy(mp) \ { \ - VERIFY3P(mutex_owner(mp), ==, NULL); \ + VERIFY0P(mutex_owner(mp)); \ } #define mutex_tryenter(mp) \ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h b/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h index 563e0a19663d..c883836c2f83 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/rwlock.h @@ -130,7 +130,7 @@ RW_READ_HELD(krwlock_t *rwp) /* * The Linux rwsem implementation does not require a matching destroy. 
*/ -#define rw_destroy(rwp) ((void) 0) +#define rw_destroy(rwp) ASSERT(!(RW_LOCK_HELD(rwp))) /* * Upgrading a rwsem from a reader to a writer is not supported by the diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h b/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h index 087389b57b34..ad2815e46394 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h @@ -25,6 +25,6 @@ #ifndef _SPL_STAT_H #define _SPL_STAT_H -#include <linux/stat.h> +#include <sys/stat.h> #endif /* SPL_STAT_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/time.h b/sys/contrib/openzfs/include/os/linux/spl/sys/time.h index 33b273b53996..4edc42a8aef9 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/time.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/time.h @@ -80,6 +80,14 @@ gethrestime_sec(void) } static inline hrtime_t +getlrtime(void) +{ + inode_timespec_t ts; + ktime_get_coarse_ts64(&ts); + return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec); +} + +static inline hrtime_t gethrtime(void) { struct timespec64 ts; diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h index 8923657daf02..d88b4937ef08 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h @@ -59,8 +59,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(uint64_t, z_size) __field(uint64_t, z_pflags) __field(uint32_t, z_sync_cnt) - __field(uint32_t, z_sync_writes_cnt) - __field(uint32_t, z_async_writes_cnt) __field(mode_t, z_mode) __field(boolean_t, z_is_sa) __field(boolean_t, z_is_ctldir) @@ -92,8 +90,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_size = zn->z_size; __entry->z_pflags = zn->z_pflags; __entry->z_sync_cnt = zn->z_sync_cnt; - __entry->z_sync_writes_cnt = zn->z_sync_writes_cnt; - __entry->z_async_writes_cnt = zn->z_async_writes_cnt; __entry->z_mode = zn->z_mode; __entry->z_is_sa = zn->z_is_sa; __entry->z_is_ctldir = zn->z_is_ctldir; @@ -117,7 +113,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class, TP_printk("zn { id %llu unlinked %u atime_dirty %u " "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " - "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u " + "sync_cnt %u " "mode 0x%x is_sa %d is_ctldir %d " "inode { uid %u gid %u ino %lu nlink %u size %lli " "blkbits %u bytes %u mode 0x%x generation %x } } " @@ -126,7 +122,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_zn_prefetch, __entry->z_blksz, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, __entry->z_pflags, __entry->z_sync_cnt, - __entry->z_sync_writes_cnt, __entry->z_async_writes_cnt, __entry->z_mode, __entry->z_is_sa, __entry->z_is_ctldir, __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink, __entry->i_size, __entry->i_blkbits, diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h index 85cf8cc20b09..e1b6d61099b9 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_common.h @@ -45,7 +45,7 @@ __field(zio_flag_t, zio_orig_flags) \ __field(enum zio_stage, zio_orig_stage) \ __field(enum zio_stage, zio_orig_pipeline) \ - __field(uint8_t, zio_reexecute) \ + __field(uint8_t, zio_post) \ __field(uint64_t, zio_txg) \ __field(int, zio_error) \ __field(uint64_t, zio_ena) \ @@ -74,7 +74,7 @@ __entry->zio_orig_flags = zio->io_orig_flags; \ 
__entry->zio_orig_stage = zio->io_orig_stage; \ __entry->zio_orig_pipeline = zio->io_orig_pipeline; \ - __entry->zio_reexecute = zio->io_reexecute; \ + __entry->zio_post = zio->io_post; \ __entry->zio_txg = zio->io_txg; \ __entry->zio_error = zio->io_error; \ __entry->zio_ena = zio->io_ena; \ @@ -92,7 +92,7 @@ "zio { type %u prio %u size %llu orig_size %llu " \ "offset %llu timestamp %llu delta %llu delay %llu " \ "flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx " \ - "orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \ + "orig_stage 0x%x orig_pipeline 0x%x post %u " \ "txg %llu error %d ena %llu prop { checksum %u compress %u " \ "type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }" @@ -102,7 +102,7 @@ __entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \ __entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \ __entry->zio_orig_flags, __entry->zio_orig_stage, \ - __entry->zio_orig_pipeline, __entry->zio_reexecute, \ + __entry->zio_orig_pipeline, __entry->zio_post, \ __entry->zio_txg, __entry->zio_error, __entry->zio_ena, \ __entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \ __entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h index 955462c85d10..e34ea46b3fe8 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h @@ -139,18 +139,18 @@ #define ZCW_TP_STRUCT_ENTRY \ __field(lwb_t *, zcw_lwb) \ __field(boolean_t, zcw_done) \ - __field(int, zcw_zio_error) \ + __field(int, zcw_error) \ #define ZCW_TP_FAST_ASSIGN \ __entry->zcw_lwb = zcw->zcw_lwb; \ __entry->zcw_done = zcw->zcw_done; \ - __entry->zcw_zio_error = zcw->zcw_zio_error; + __entry->zcw_error = zcw->zcw_error; #define ZCW_TP_PRINTK_FMT \ "zcw { lwb %p done %u error %u }" #define ZCW_TP_PRINTK_ARGS \ - __entry->zcw_lwb, __entry->zcw_done, __entry->zcw_zio_error + __entry->zcw_lwb, __entry->zcw_done, __entry->zcw_error /* * Generic support for two argument tracepoints of the form: diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h index b38847b20462..6a77e40abe10 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -157,6 +157,7 @@ struct znode; extern int zfs_sync(struct super_block *, int, cred_t *); extern int zfs_inode_alloc(struct super_block *, struct inode **ip); +extern void zfs_inode_free(struct inode *); extern void zfs_inode_destroy(struct inode *); extern void zfs_mark_inode_dirty(struct inode *); extern boolean_t zfs_relatime_need_update(const struct inode *); diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h index f5a9105cd885..8994aab889fe 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h @@ -55,6 +55,7 @@ extern const struct file_operations zpl_dir_file_operations; extern void zpl_prune_sb(uint64_t nr_to_scan, void *arg); extern const struct super_operations zpl_super_operations; +extern const struct dentry_operations zpl_dentry_operations; extern const struct export_operations zpl_export_operations; extern struct file_system_type zpl_fs_type; diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h 
index 1b30389107c5..b55d5da3378c 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -954,7 +954,7 @@ typedef struct arc_sums { wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; - wmsum_t arcstat_dnode_size; + aggsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; diff --git a/sys/contrib/openzfs/include/sys/dbuf.h b/sys/contrib/openzfs/include/sys/dbuf.h index 756459b2fbb5..baf3b1508335 100644 --- a/sys/contrib/openzfs/include/sys/dbuf.h +++ b/sys/contrib/openzfs/include/sys/dbuf.h @@ -164,6 +164,7 @@ typedef struct dbuf_dirty_record { boolean_t dr_nopwrite; boolean_t dr_brtwrite; boolean_t dr_diowrite; + boolean_t dr_rewrite; boolean_t dr_has_raw_params; /* Override and raw params are mutually exclusive. */ diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h index 0b2e443a433a..aa5035862def 100644 --- a/sys/contrib/openzfs/include/sys/dmu.h +++ b/sys/contrib/openzfs/include/sys/dmu.h @@ -414,6 +414,9 @@ typedef struct dmu_buf { #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" #define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" +#define DMU_POOL_TXG_LOG_TIME_MINUTES "com.klarasystems:txg_log_time:minutes" +#define DMU_POOL_TXG_LOG_TIME_DAYS "com.klarasystems:txg_log_time:days" +#define DMU_POOL_TXG_LOG_TIME_MONTHS "com.klarasystems:txg_log_time:months" /* * Allocate an object from this objset. The range of object numbers @@ -739,8 +742,8 @@ dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp __maybe_unused) { - ASSERT(dbu->dbu_evict_func_sync == NULL); - ASSERT(dbu->dbu_evict_func_async == NULL); + ASSERT0P(dbu->dbu_evict_func_sync); + ASSERT0P(dbu->dbu_evict_func_async); /* must have at least one evict func */ IMPLY(evict_func_sync == NULL, evict_func_async != NULL); @@ -822,6 +825,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags); +void dmu_buf_will_rewrite(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); diff --git a/sys/contrib/openzfs/include/sys/dmu_impl.h b/sys/contrib/openzfs/include/sys/dmu_impl.h index 21a8b16a3ee6..bae872bd1907 100644 --- a/sys/contrib/openzfs/include/sys/dmu_impl.h +++ b/sys/contrib/openzfs/include/sys/dmu_impl.h @@ -168,12 +168,10 @@ extern "C" { * dn_allocated_txg * dn_free_txg * dn_assigned_txg - * dn_dirty_txg + * dn_dirtycnt * dd_assigned_tx * dn_notxholds * dn_nodnholds - * dn_dirtyctx - * dn_dirtyctx_firstset * (dn_phys copy fields?) * (dn_phys contents?) * held from: diff --git a/sys/contrib/openzfs/include/sys/dmu_objset.h b/sys/contrib/openzfs/include/sys/dmu_objset.h index 288ad30166df..492be29200e4 100644 --- a/sys/contrib/openzfs/include/sys/dmu_objset.h +++ b/sys/contrib/openzfs/include/sys/dmu_objset.h @@ -152,7 +152,7 @@ struct objset { * The largest zpl file block allowed in special class. * cached here instead of zfsvfs for easier access. 
*/ - int os_zpl_special_smallblock; + uint64_t os_zpl_special_smallblock; /* * Pointer is constant; the blkptr it points to is protected by diff --git a/sys/contrib/openzfs/include/sys/dmu_traverse.h b/sys/contrib/openzfs/include/sys/dmu_traverse.h index 3196b2addeee..70cafa4c74f1 100644 --- a/sys/contrib/openzfs/include/sys/dmu_traverse.h +++ b/sys/contrib/openzfs/include/sys/dmu_traverse.h @@ -59,6 +59,13 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ #define TRAVERSE_NO_DECRYPT (1<<5) +/* + * Always use logical birth time for birth time comparisons. This is useful + * for operations that care about user data changes rather than physical + * block rewrites (e.g., incremental replication). + */ +#define TRAVERSE_LOGICAL (1<<6) + /* Special traverse error return value to indicate skipping of children */ #define TRAVERSE_VISIT_NO_CHILDREN -1 diff --git a/sys/contrib/openzfs/include/sys/dnode.h b/sys/contrib/openzfs/include/sys/dnode.h index 76218c8b09ca..8bd1db5b7165 100644 --- a/sys/contrib/openzfs/include/sys/dnode.h +++ b/sys/contrib/openzfs/include/sys/dnode.h @@ -141,12 +141,6 @@ struct dmu_buf_impl; struct objset; struct zio; -enum dnode_dirtycontext { - DN_UNDIRTIED, - DN_DIRTY_OPEN, - DN_DIRTY_SYNC -}; - /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ #define DNODE_FLAG_USED_BYTES (1 << 0) #define DNODE_FLAG_USERUSED_ACCOUNTED (1 << 1) @@ -340,11 +334,9 @@ struct dnode { uint64_t dn_allocated_txg; uint64_t dn_free_txg; uint64_t dn_assigned_txg; - uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ + uint8_t dn_dirtycnt; kcondvar_t dn_notxholds; kcondvar_t dn_nodnholds; - enum dnode_dirtycontext dn_dirtyctx; - const void *dn_dirtyctx_firstset; /* dbg: contents meaningless */ /* protected by own devices */ zfs_refcount_t dn_tx_holds; @@ -440,7 +432,6 @@ void dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting); int dnode_try_claim(objset_t *os, uint64_t object, int slots); boolean_t dnode_is_dirty(dnode_t *dn); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); -void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); @@ -468,9 +459,6 @@ void dnode_free_interior_slots(dnode_t *dn); void dnode_set_storage_type(dnode_t *dn, dmu_object_type_t type); -#define DNODE_IS_DIRTY(_dn) \ - ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) - #define DNODE_LEVEL_IS_CACHEABLE(_dn, _level) \ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ (((_level) > 0 || DMU_OT_IS_METADATA((_dn)->dn_type)) && \ diff --git a/sys/contrib/openzfs/include/sys/dsl_deleg.h b/sys/contrib/openzfs/include/sys/dsl_deleg.h index ae729b9f32ff..36dd6211219d 100644 --- a/sys/contrib/openzfs/include/sys/dsl_deleg.h +++ b/sys/contrib/openzfs/include/sys/dsl_deleg.h @@ -46,6 +46,7 @@ extern "C" { #define ZFS_DELEG_PERM_MOUNT "mount" #define ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_SEND_RAW "send:raw" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_RECEIVE_APPEND "receive:append" #define ZFS_DELEG_PERM_ALLOW "allow" diff --git a/sys/contrib/openzfs/include/sys/fm/fs/zfs.h b/sys/contrib/openzfs/include/sys/fm/fs/zfs.h index 659c64bf15a6..a771b11420fd 100644 --- a/sys/contrib/openzfs/include/sys/fm/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fm/fs/zfs.h @@ -58,6 
+58,7 @@ extern "C" { #define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" #define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" #define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" +#define FM_EREPORT_ZFS_SITOUT "sitout" #define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" #define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" diff --git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h index c8deb5be419e..49ab9d3db795 100644 --- a/sys/contrib/openzfs/include/sys/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fs/zfs.h @@ -385,6 +385,8 @@ typedef enum { VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, + VDEV_PROP_SIT_OUT, + VDEV_PROP_AUTOSIT, VDEV_NUM_PROPS } vdev_prop_t; @@ -1627,6 +1629,9 @@ typedef struct zfs_rewrite_args { uint64_t arg; } zfs_rewrite_args_t; +/* zfs_rewrite_args flags */ +#define ZFS_REWRITE_PHYSICAL 0x1 /* Preserve logical birth time. */ + #define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t) /* @@ -1670,6 +1675,7 @@ typedef enum { ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, ZFS_ERR_STREAM_LARGE_MICROZAP, + ZFS_ERR_TOO_MANY_SITOUTS, } zfs_errno_t; /* diff --git a/sys/contrib/openzfs/include/sys/metaslab.h b/sys/contrib/openzfs/include/sys/metaslab.h index 4d57e52e8468..36cbe06bacce 100644 --- a/sys/contrib/openzfs/include/sys/metaslab.h +++ b/sys/contrib/openzfs/include/sys/metaslab.h @@ -110,9 +110,10 @@ void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); -boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, zio_t *, - boolean_t, boolean_t *); -boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, + uint64_t, boolean_t, boolean_t *); +boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, + uint64_t); void metaslab_class_evict_old(metaslab_class_t *, uint64_t); const char *metaslab_class_get_name(metaslab_class_t *); uint64_t metaslab_class_get_alloc(metaslab_class_t *); diff --git a/sys/contrib/openzfs/include/sys/metaslab_impl.h b/sys/contrib/openzfs/include/sys/metaslab_impl.h index 83fbe620fe37..6ce995d0a086 100644 --- a/sys/contrib/openzfs/include/sys/metaslab_impl.h +++ b/sys/contrib/openzfs/include/sys/metaslab_impl.h @@ -539,6 +539,8 @@ typedef struct metaslab_unflushed_phys { uint64_t msp_unflushed_txg; } metaslab_unflushed_phys_t; +char *metaslab_rt_name(metaslab_group_t *, metaslab_t *, const char *); + #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/sys/mod.h b/sys/contrib/openzfs/include/sys/mod.h deleted file mode 100644 index 4122889ab758..000000000000 --- a/sys/contrib/openzfs/include/sys/mod.h +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <behlendorf1@llnl.gov>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. 
- * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - */ -#ifndef _SYS_MOD_H -#define _SYS_MOD_H - -#ifdef _KERNEL -#include <sys/mod_os.h> -#else -/* - * Exported symbols - */ -#define EXPORT_SYMBOL(x) -#endif - -#endif /* SYS_MOD_H */ diff --git a/sys/contrib/openzfs/include/sys/range_tree.h b/sys/contrib/openzfs/include/sys/range_tree.h index 23e80f64284b..0f6884682459 100644 --- a/sys/contrib/openzfs/include/sys/range_tree.h +++ b/sys/contrib/openzfs/include/sys/range_tree.h @@ -49,6 +49,9 @@ typedef enum zfs_range_seg_type { ZFS_RANGE_SEG_NUM_TYPES, } zfs_range_seg_type_t; +#define ZFS_RT_NAME(rt) (((rt)->rt_name != NULL) ? (rt)->rt_name : "") +#define ZFS_RT_F_DYN_NAME (1ULL << 0) /* if rt_name must be freed */ + /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. @@ -68,6 +71,9 @@ typedef struct zfs_range_tree { void *rt_arg; uint64_t rt_gap; /* allowable inter-segment gap */ + uint64_t rt_flags; + const char *rt_name; /* details for debugging */ + /* * The rt_histogram maintains a histogram of ranges. Each bucket, * rt_histogram[i], contains the number of ranges whose size is: @@ -281,6 +287,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, uint64_t gap); zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift); +zfs_range_tree_t *zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + uint64_t flags, const char *name); void zfs_range_tree_destroy(zfs_range_tree_t *rt); boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size); diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index a3e36c1f59ae..66db16b33c51 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -140,7 +140,7 @@ typedef struct zio_cksum_salt { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | + * 7 |R| padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -175,6 +175,7 @@ typedef struct zio_cksum_salt { * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type + * R rewrite (reallocated/rewritten at phys birth TXG) * phys birth txg when dva[0] was written; zero if same as logical birth txg * note that typically all the dva's would be written in this * txg, but they could be different if they were moved by @@ -190,11 +191,11 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | 
vdev1 | pad | ASIZE | + * 0 | pad | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | pad | ASIZE | + * 2 | pad | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -204,7 +205,7 @@ typedef struct zio_cksum_salt { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | + * 7 |R| padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -373,7 +374,8 @@ typedef enum bp_embedded_type { typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_prop2; /* additional properties */ + uint64_t blk_pad; /* Extra space for the future */ uint64_t blk_birth_word[2]; uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ @@ -476,32 +478,51 @@ typedef struct blkptr { #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) +/* + * Block birth time macros for different use cases: + * - BP_GET_LOGICAL_BIRTH(): When the block was logically modified by user. + * To be used with a focus on user data, like incremental replication. + * - BP_GET_PHYSICAL_BIRTH(): When the block was physically written to disks. + * For regular writes is equal to logical birth. For dedup and block cloning + * can be smaller than logical birth. For remapped and rewritten blocks can + * be bigger. To be used with focus on physical disk content: ARC, DDT, scrub. + * - BP_GET_RAW_PHYSICAL_BIRTH(): Raw physical birth value. Zero if equal + * to logical birth. Should only be used for BP copying and debugging. + * - BP_GET_BIRTH(): When the block was allocated, which is a physical birth + * for rewritten blocks (rewrite flag set) or logical birth otherwise. + */ #define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1] #define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x)) -#define BP_GET_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] +#define BP_GET_RAW_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] #define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x)) -#define BP_GET_BIRTH(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \ +#define BP_GET_PHYSICAL_BIRTH(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + BP_GET_RAW_PHYSICAL_BIRTH(bp) ? BP_GET_RAW_PHYSICAL_BIRTH(bp) : \ BP_GET_LOGICAL_BIRTH(bp)) -#define BP_SET_BIRTH(bp, logical, physical) \ -{ \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BP_SET_LOGICAL_BIRTH(bp, logical); \ - BP_SET_PHYSICAL_BIRTH(bp, \ - ((logical) == (physical) ? 0 : (physical))); \ +#define BP_GET_BIRTH(bp) \ + ((BP_IS_EMBEDDED(bp) || !BP_GET_REWRITE(bp)) ? \ + BP_GET_LOGICAL_BIRTH(bp) : BP_GET_PHYSICAL_BIRTH(bp)) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BP_SET_LOGICAL_BIRTH(bp, logical); \ + BP_SET_PHYSICAL_BIRTH(bp, \ + ((logical) == (physical) ? 0 : (physical))); \ } #define BP_GET_FILL(bp) \ - ((BP_IS_ENCRYPTED(bp)) ? 
BF64_GET((bp)->blk_fill, 0, 32) : \ - ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) + (BP_IS_EMBEDDED(bp) ? 1 : \ + BP_IS_ENCRYPTED(bp) ? BF64_GET((bp)->blk_fill, 0, 32) : \ + (bp)->blk_fill) #define BP_SET_FILL(bp, fill) \ { \ - if (BP_IS_ENCRYPTED(bp)) \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + if (BP_IS_ENCRYPTED(bp)) \ BF64_SET((bp)->blk_fill, 0, 32, fill); \ else \ (bp)->blk_fill = fill; \ @@ -516,6 +537,15 @@ typedef struct blkptr { BF64_SET((bp)->blk_fill, 32, 32, iv2); \ } +#define BP_GET_REWRITE(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : BF64_GET((bp)->blk_prop2, 63, 1)) + +#define BP_SET_REWRITE(bp, x) \ +{ \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop2, 63, 1, x); \ +} + #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) @@ -545,7 +575,7 @@ typedef struct blkptr { (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ - (BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \ + (BP_GET_PHYSICAL_BIRTH(bp1) == BP_GET_PHYSICAL_BIRTH(bp2) && \ BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ @@ -588,8 +618,8 @@ typedef struct blkptr { { \ BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ - (bp)->blk_pad[0] = 0; \ - (bp)->blk_pad[1] = 0; \ + (bp)->blk_prop2 = 0; \ + (bp)->blk_pad = 0; \ (bp)->blk_birth_word[0] = 0; \ (bp)->blk_birth_word[1] = 0; \ (bp)->blk_fill = 0; \ @@ -696,7 +726,7 @@ typedef struct blkptr { (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \ - (u_longlong_t)BP_GET_BIRTH(bp), \ + (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ @@ -850,7 +880,6 @@ extern kcondvar_t spa_namespace_cv; #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); -extern void spa_config_load(void); extern int spa_all_configs(uint64_t *generation, nvlist_t **pools); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, @@ -1065,6 +1094,7 @@ extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); +extern metaslab_class_t *spa_special_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio); extern boolean_t spa_special_has_ddt(spa_t *spa); @@ -1213,7 +1243,6 @@ extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(spa_mode_t mode); extern void spa_fini(void); -extern void spa_boot_init(void *); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 8c52f751a819..07a959db3447 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -55,6 +55,8 @@ #include <sys/dsl_deadlist.h> #include <zfeature_common.h> +#include "zfs_crrd.h" + #ifdef __cplusplus extern "C" { #endif @@ -246,6 +248,7 @@ struct spa { metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ + 
metaslab_class_t *spa_special_embedded_log_class; /* log on special */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ @@ -343,6 +346,12 @@ struct spa { spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; + kmutex_t spa_txg_log_time_lock; /* for spa_txg_log_time */ + dbrrd_t spa_txg_log_time; + uint64_t spa_last_noted_txg; + uint64_t spa_last_noted_txg_time; + uint64_t spa_last_flush_txg_time; + space_map_t *spa_syncing_log_sm; /* current log space map */ avl_tree_t spa_sm_logs_by_txg; kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ diff --git a/sys/contrib/openzfs/include/sys/vdev.h b/sys/contrib/openzfs/include/sys/vdev.h index 7f457c3a0b76..510474d6c085 100644 --- a/sys/contrib/openzfs/include/sys/vdev.h +++ b/sys/contrib/openzfs/include/sys/vdev.h @@ -139,6 +139,7 @@ extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_get_min_alloc(vdev_t *vd); /* * Return the amount of space allocated for a gang block header. Note that @@ -148,7 +149,20 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); static inline uint64_t vdev_gang_header_asize(vdev_t *vd) { - return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0)); + return (vdev_psize_to_asize_txg(vd, SPA_OLD_GANGBLOCKSIZE, 0)); +} + +/* + * Return the amount of data that can be stored in a gang header. Because we + * need to ensure gang headers can always be allocated (as long as there is + * space available), this is the minimum allocatable size on the vdev. Note that + * since the physical birth txg is not provided, this must be constant for + * a given vdev. (e.g. raidz expansion can't change this) + */ +static inline uint64_t +vdev_gang_header_psize(vdev_t *vd) +{ + return (vdev_get_min_alloc(vd)); } extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h index 385d7224f2c5..5a8c2f846be2 100644 --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -279,10 +279,12 @@ struct vdev { uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ + boolean_t vdev_autosit; /* automatic sitout management */ boolean_t vdev_rz_expanding; /* raidz is being expanded? 
*/ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ + uint64_t vdev_last_latency_check; /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ @@ -431,6 +433,10 @@ struct vdev { hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ + /* used to calculate average read latency */ + uint64_t *vdev_prev_histo; + int64_t vdev_outlier_count; /* read outlier amongst peers */ + hrtime_t vdev_read_sit_out_expire; /* end of sit out period */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* @@ -621,7 +627,6 @@ extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); -extern uint64_t vdev_get_min_alloc(vdev_t *vd); extern uint64_t vdev_get_nparity(vdev_t *vd); extern uint64_t vdev_get_ndisks(vdev_t *vd); @@ -645,10 +650,11 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); -#if defined(__linux__) +#if defined(__linux__) && defined(_KERNEL) int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp); #endif int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS); +char *vdev_rt_name(vdev_t *vd, const char *name); /* * Vdev ashift optimization tunables diff --git a/sys/contrib/openzfs/include/sys/vdev_raidz.h b/sys/contrib/openzfs/include/sys/vdev_raidz.h index 3b02728cdbf3..df8c2aed4045 100644 --- a/sys/contrib/openzfs/include/sys/vdev_raidz.h +++ b/sys/contrib/openzfs/include/sys/vdev_raidz.h @@ -61,6 +61,9 @@ void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); struct raidz_row *vdev_raidz_row_alloc(int, zio_t *); void vdev_raidz_reflow_copy_scratch(spa_t *); void raidz_dtl_reassessed(vdev_t *); +boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t); +void vdev_raidz_sit_child(vdev_t *, uint64_t); +void vdev_raidz_unsit_child(vdev_t *); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; diff --git a/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h b/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h index debce6f09a22..8c8dcfb077f6 100644 --- a/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h @@ -119,6 +119,7 @@ typedef struct raidz_col { uint8_t rc_need_orig_restore:1; /* need to restore from orig_data? 
*/ uint8_t rc_force_repair:1; /* Write good data to this column */ uint8_t rc_allow_repair:1; /* Allow repair I/O to this column */ + uint8_t rc_latency_outlier:1; /* Latency outlier for this device */ int rc_shadow_devidx; /* for double write during expansion */ int rc_shadow_error; /* for double write during expansion */ uint64_t rc_shadow_offset; /* for double write during expansion */ @@ -133,6 +134,7 @@ typedef struct raidz_row { int rr_firstdatacol; /* First data column/parity count */ abd_t *rr_abd_empty; /* dRAID empty sector buffer */ int rr_nempty; /* empty sectors included in parity */ + int rr_outlier_cnt; /* Count of latency outlier devices */ #ifdef ZFS_DEBUG uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ diff --git a/sys/contrib/openzfs/include/sys/xvattr.h b/sys/contrib/openzfs/include/sys/xvattr.h index 447842d269b3..5dadbdb4c619 100644 --- a/sys/contrib/openzfs/include/sys/xvattr.h +++ b/sys/contrib/openzfs/include/sys/xvattr.h @@ -311,6 +311,7 @@ xva_getxoptattr(xvattr_t *xvap) */ #define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ #define V_APPEND 0x2 /* want to do append only check */ +#define V_NAMEDATTR 0x4 /* is a named attribute check */ /* * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h index 256c9c2cc2d3..7112d3ef5c99 100644 --- a/sys/contrib/openzfs/include/sys/zfs_context.h +++ b/sys/contrib/openzfs/include/sys/zfs_context.h @@ -205,18 +205,6 @@ extern void vpanic(const char *, va_list) #define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) /* - * Tunables. - */ -typedef struct zfs_kernel_param { - const char *name; /* unused stub */ -} zfs_kernel_param_t; - -#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) -#define ZFS_MODULE_PARAM_ARGS void -#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ - getfunc, perm, desc) - -/* * Threads. */ typedef pthread_t kthread_t; @@ -236,6 +224,11 @@ typedef pthread_t kthread_t; #define thread_join(t) pthread_join((pthread_t)(t), NULL) #define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) +/* + * Check if the current thread is a memory reclaim thread. + * Always returns false in userspace (no memory reclaim thread). 
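+ *
+ * Illustrative use only (not part of this change): callers that want to
+ * skip optional work when running from a reclaim context can bail out
+ * early, e.g.
+ *
+ *	if (current_is_reclaim_thread())
+ *		return;
+ *
+ * In userspace this branch is never taken, since the macro below is a
+ * constant zero.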
+ */ +#define current_is_reclaim_thread() (0) /* in libzpool, p0 exists only to have its address taken */ typedef struct proc { @@ -673,7 +666,7 @@ extern void random_fini(void); struct spa; extern void show_pool_stats(struct spa *); -extern int set_global_var(char const *arg); +extern int handle_tunable_option(const char *, boolean_t); typedef struct callb_cpr { kmutex_t *cc_lockp; @@ -778,7 +771,6 @@ typedef int fstrans_cookie_t; extern fstrans_cookie_t spl_fstrans_mark(void); extern void spl_fstrans_unmark(fstrans_cookie_t); -extern int __spl_pf_fstrans_check(void); extern int kmem_cache_reap_active(void); diff --git a/sys/contrib/openzfs/include/sys/zfs_file.h b/sys/contrib/openzfs/include/sys/zfs_file.h index a1f344c2bb79..67abe9988aaa 100644 --- a/sys/contrib/openzfs/include/sys/zfs_file.h +++ b/sys/contrib/openzfs/include/sys/zfs_file.h @@ -46,7 +46,7 @@ void zfs_file_close(zfs_file_t *fp); int zfs_file_write(zfs_file_t *fp, const void *buf, size_t len, ssize_t *resid); int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t len, loff_t off, - ssize_t *resid); + uint8_t ashift, ssize_t *resid); int zfs_file_read(zfs_file_t *fp, void *buf, size_t len, ssize_t *resid); int zfs_file_pread(zfs_file_t *fp, void *buf, size_t len, loff_t off, ssize_t *resid); diff --git a/sys/contrib/openzfs/include/sys/zfs_znode.h b/sys/contrib/openzfs/include/sys/zfs_znode.h index 2fedaff78534..79b845a672a8 100644 --- a/sys/contrib/openzfs/include/sys/zfs_znode.h +++ b/sys/contrib/openzfs/include/sys/zfs_znode.h @@ -73,7 +73,7 @@ extern "C" { pflags |= attr; \ else \ pflags &= ~attr; \ - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(ZTOZSB(zp)), \ + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(ZTOZSB(zp)), \ &pflags, sizeof (pflags), tx)); \ } @@ -202,8 +202,6 @@ typedef struct znode { uint64_t z_size; /* file size (cached) */ uint64_t z_pflags; /* pflags (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ - uint32_t z_sync_writes_cnt; /* synchronous write count */ - uint32_t z_async_writes_cnt; /* asynchronous write count */ mode_t z_mode; /* mode (cached) */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ diff --git a/sys/contrib/openzfs/include/sys/zil.h b/sys/contrib/openzfs/include/sys/zil.h index fa7945d8ab8b..da085998879b 100644 --- a/sys/contrib/openzfs/include/sys/zil.h +++ b/sys/contrib/openzfs/include/sys/zil.h @@ -456,7 +456,7 @@ typedef enum { WR_NUM_STATES /* number of states */ } itx_wr_state_t; -typedef void (*zil_callback_t)(void *data); +typedef void (*zil_callback_t)(void *data, int err); typedef struct itx { list_node_t itx_node; /* linkage on zl_itx_list */ @@ -498,10 +498,13 @@ typedef struct zil_stats { * (see zil_commit_writer_stall()) * - suspend: ZIL suspended * (see zil_commit(), zil_get_commit_list()) + * - crash: ZIL crashed + * (see zil_crash(), zil_commit(), ...) */ kstat_named_t zil_commit_error_count; kstat_named_t zil_commit_stall_count; kstat_named_t zil_commit_suspend_count; + kstat_named_t zil_commit_crash_count; /* * Number of transactions (reads, writes, renames, etc.) @@ -549,6 +552,7 @@ typedef struct zil_sums { wmsum_t zil_commit_error_count; wmsum_t zil_commit_stall_count; wmsum_t zil_commit_suspend_count; + wmsum_t zil_commit_crash_count; wmsum_t zil_itx_count; wmsum_t zil_itx_indirect_count; wmsum_t zil_itx_indirect_bytes; @@ -577,6 +581,25 @@ typedef struct zil_sums { #define ZIL_STAT_BUMP(zil, stat) \ ZIL_STAT_INCR(zil, stat, 1); +/* + * Flags for zil_commit_flags(). 
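+ * (Illustrative only, not taken from this change: a caller holding a
+ * zilog_t *zl and an object id oid could request an immediate commit and
+ * handle the error itself with
+ *	error = zil_commit_flags(zl, oid, ZIL_COMMIT_NOW);
+ * rather than deferring to the pool failmode behaviour.)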
zil_commit() is a shortcut for + * zil_commit_flags(ZIL_COMMIT_FAILMODE), which is the most common use. + */ +typedef enum { + /* + * Try to commit the ZIL. If it fails, fall back to txg_wait_synced(). + * If that fails, return EIO. + */ + ZIL_COMMIT_NOW = 0, + + /* + * Like ZIL_COMMIT_NOW, but if the ZIL commit fails because the pool + * suspended, act according to the pool's failmode= setting (wait for + * the pool to resume, or return EIO). + */ + ZIL_COMMIT_FAILMODE = (1 << 1), +} zil_commit_flag_t; + typedef int zil_parse_blk_func_t(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t txg); typedef int zil_parse_lr_func_t(zilog_t *zilog, const lr_t *lr, void *arg, @@ -606,14 +629,16 @@ extern boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); -extern void zil_itx_destroy(itx_t *itx); +extern void zil_itx_destroy(itx_t *itx, int err); extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); -extern void zil_commit(zilog_t *zilog, uint64_t oid); -extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); extern void zil_remove_async(zilog_t *zilog, uint64_t oid); +extern int zil_commit_flags(zilog_t *zilog, uint64_t oid, + zil_commit_flag_t flags); +extern int __must_check zil_commit(zilog_t *zilog, uint64_t oid); + extern int zil_reset(const char *osname, void *txarg); extern int zil_claim(struct dsl_pool *dp, struct dsl_dataset *ds, void *txarg); @@ -635,6 +660,8 @@ extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern uint64_t zil_max_copied_data(zilog_t *zilog); extern uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize); +extern itx_wr_state_t zil_write_state(zilog_t *zilog, uint64_t size, + uint32_t blocksize, boolean_t o_direct, boolean_t commit); extern void zil_sums_init(zil_sums_t *zs); extern void zil_sums_fini(zil_sums_t *zs); @@ -642,6 +669,8 @@ extern void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums); extern int zil_replay_disable; +extern uint_t zfs_immediate_write_sz; +extern int zil_special_is_slog; #ifdef __cplusplus } diff --git a/sys/contrib/openzfs/include/sys/zil_impl.h b/sys/contrib/openzfs/include/sys/zil_impl.h index 252264b9eae9..ea1364a7e35a 100644 --- a/sys/contrib/openzfs/include/sys/zil_impl.h +++ b/sys/contrib/openzfs/include/sys/zil_impl.h @@ -41,8 +41,8 @@ extern "C" { * * An lwb will start out in the "new" state, and transition to the "opened" * state via a call to zil_lwb_write_open() on first itx assignment. When - * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be - * held. + * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" and + * LWB's "lwb_lock" must be held. * * After the lwb is "opened", it can be assigned number of itxs and transition * into the "closed" state via zil_lwb_write_close() when full or on timeout. @@ -100,16 +100,22 @@ typedef enum { * holding the "zl_issuer_lock". After the lwb is issued, the zilog's * "zl_lock" is used to protect the lwb against concurrent access. 
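+ *
+ * The former lwb_slim/lwb_slog booleans are now carried as bits in the
+ * lwb_flags mask defined below; a sketch of a test (illustrative only,
+ * not verbatim from this change) would be:
+ *
+ *	if (lwb->lwb_flags & LWB_FLAG_SLOG)
+ *		... the log block was allocated on a SLOG device ...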
*/ +typedef enum { + LWB_FLAG_SLIM = (1<<0), /* log block has slim format */ + LWB_FLAG_SLOG = (1<<1), /* lwb_blk is on SLOG device */ + LWB_FLAG_CRASHED = (1<<2), /* lwb is on the crash list */ +} lwb_flag_t; + typedef struct lwb { zilog_t *lwb_zilog; /* back pointer to log struct */ blkptr_t lwb_blk; /* on disk address of this log blk */ - boolean_t lwb_slim; /* log block has slim format */ - boolean_t lwb_slog; /* lwb_blk is on SLOG device */ + lwb_flag_t lwb_flags; /* extra info about this lwb */ int lwb_error; /* log block allocation error */ int lwb_nmax; /* max bytes in the buffer */ int lwb_nused; /* # used bytes in buffer */ int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ + int lwb_min_sz; /* min size for range allocation */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ zio_t *lwb_child_zio; /* parent zio for children */ @@ -124,7 +130,7 @@ typedef struct lwb { list_t lwb_itxs; /* list of itx's */ list_t lwb_waiters; /* list of zil_commit_waiter's */ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ - kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ + kmutex_t lwb_lock; /* protects lwb_vdev_tree and size */ } lwb_t; /* @@ -149,7 +155,7 @@ typedef struct zil_commit_waiter { list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */ lwb_t *zcw_lwb; /* back pointer to lwb when linked */ boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */ - int zcw_zio_error; /* contains the zio io_error value */ + int zcw_error; /* result to return from zil_commit() */ } zil_commit_waiter_t; /* @@ -221,6 +227,7 @@ struct zilog { uint64_t zl_cur_left; /* current burst remaining size */ uint64_t zl_cur_max; /* biggest record in current burst */ list_t zl_lwb_list; /* in-flight log write list */ + list_t zl_lwb_crash_list; /* log writes in-flight at crash */ avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ @@ -245,6 +252,9 @@ struct zilog { */ uint64_t zl_max_block_size; + /* After crash, txg to restart zil */ + uint64_t zl_restart_txg; + /* Pointer for per dataset zil sums */ zil_sums_t *zl_sums; }; diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index d91a4eb1e998..a8acb83b4c2f 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -59,21 +59,37 @@ typedef struct zio_eck { /* * Gang block headers are self-checksumming and contain an array - * of block pointers. + * of block pointers. The old gang block size has enough room for 3 blkptrs, + * while new gang blocks can store more. + * + * Layout: + * +--------+--------+--------+-----+---------+-----------+ + * | | | | | | | + * | blkptr | blkptr | blkptr | ... 
| padding | zio_eck_t | + * | 1 | 2 | 3 | | | | + * +--------+--------+--------+-----+---------+-----------+ + * 128B 128B 128B 88B 40B */ -#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE -#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t)) / sizeof (blkptr_t)) -#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_eck_t) - \ - (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ - sizeof (uint64_t)) - -typedef struct zio_gbh { - blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; - uint64_t zg_filler[SPA_GBH_FILLER]; - zio_eck_t zg_tail; -} zio_gbh_phys_t; +#define SPA_OLD_GANGBLOCKSIZE SPA_MINBLOCKSIZE +typedef void zio_gbh_phys_t; + +static inline uint64_t +gbh_nblkptrs(uint64_t size) { + ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); + return ((size - sizeof (zio_eck_t)) / sizeof (blkptr_t)); +} + +static inline zio_eck_t * +gbh_eck(zio_gbh_phys_t *gbh, uint64_t size) { + ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); + return ((zio_eck_t *)((uintptr_t)gbh + (size_t)size - + sizeof (zio_eck_t))); +} + +static inline blkptr_t * +gbh_bp(zio_gbh_phys_t *gbh, int bp) { + return (&((blkptr_t *)gbh)[bp]); +} enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, @@ -196,7 +212,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_DONT_RETRY (1ULL << 10) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) -#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) +#define ZIO_FLAG_ALLOC_THROTTLED (1ULL << 14) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -226,8 +242,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NOPWRITE (1ULL << 29) #define ZIO_FLAG_REEXECUTED (1ULL << 30) #define ZIO_FLAG_DELEGATED (1ULL << 31) -#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32) -#define ZIO_FLAG_PREALLOCATED (1ULL << 33) +#define ZIO_FLAG_PREALLOCATED (1ULL << 32) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) @@ -346,25 +361,26 @@ struct zbookmark_err_phys { (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { - enum zio_checksum zp_checksum; - enum zio_compress zp_compress; + enum zio_checksum zp_checksum:8; + enum zio_compress zp_compress:8; uint8_t zp_complevel; uint8_t zp_level; uint8_t zp_copies; uint8_t zp_gang_copies; - dmu_object_type_t zp_type; - boolean_t zp_dedup; - boolean_t zp_dedup_verify; - boolean_t zp_nopwrite; - boolean_t zp_brtwrite; - boolean_t zp_encrypt; - boolean_t zp_byteorder; - boolean_t zp_direct_write; + dmu_object_type_t zp_type:8; + dmu_object_type_t zp_storage_type:8; + boolean_t zp_dedup:1; + boolean_t zp_dedup_verify:1; + boolean_t zp_nopwrite:1; + boolean_t zp_brtwrite:1; + boolean_t zp_encrypt:1; + boolean_t zp_byteorder:1; + boolean_t zp_direct_write:1; + boolean_t zp_rewrite:1; + uint32_t zp_zpl_smallblk; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; - uint32_t zp_zpl_smallblk; - dmu_object_type_t zp_storage_type; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; @@ -399,7 +415,9 @@ typedef struct zio_vsd_ops { typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; - struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; + uint64_t gn_gangblocksize; + uint64_t gn_allocsize; + struct zio_gang_node *gn_child[]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, @@ -418,14 +436,16 @@ typedef struct zio_transform { typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* - * The io_reexecute flags are distinct from io_flags because the child must - * be able 
to propagate them to the parent. The normal io_flags are local - * to the zio, not protected by any lock, and not modifiable by children; - * the reexecute flags are protected by io_lock, modifiable by children, - * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. + * The io_post flags describe additional actions that a parent IO should + * consider or perform on behalf of a child. They are distinct from io_flags + * because the child must be able to propagate them to the parent. The normal + * io_flags are local to the zio, not protected by any lock, and not modifiable + * by children; the reexecute flags are protected by io_lock, modifiable by + * children, and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ -#define ZIO_REEXECUTE_NOW 0x01 -#define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_POST_REEXECUTE (1 << 0) +#define ZIO_POST_SUSPEND (1 << 1) +#define ZIO_POST_DIO_CHKSUM_ERR (1 << 2) /* * The io_trim flags are used to specify the type of TRIM to perform. They @@ -461,7 +481,7 @@ struct zio { enum zio_child io_child_type; enum trim_flag io_trim_flags; zio_priority_t io_priority; - uint8_t io_reexecute; + uint8_t io_post; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; @@ -603,7 +623,8 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags); extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, - blkptr_t *new_bp, uint64_t size, boolean_t *slog); + blkptr_t *new_bp, uint64_t min_size, uint64_t max_size, boolean_t *slog, + boolean_t allow_larger); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); diff --git a/sys/contrib/openzfs/include/sys/zvol.h b/sys/contrib/openzfs/include/sys/zvol.h index 32e703650935..5791246e99e4 100644 --- a/sys/contrib/openzfs/include/sys/zvol.h +++ b/sys/contrib/openzfs/include/sys/zvol.h @@ -36,8 +36,7 @@ #define SPEC_MAXOFFSET_T ((1LL << ((NBBY * sizeof (daddr32_t)) + \ DEV_BSHIFT - 1)) - 1) -extern void zvol_create_minor(const char *); -extern void zvol_create_minors_recursive(const char *); +extern void zvol_create_minors(const char *); extern void zvol_remove_minors(spa_t *, const char *, boolean_t); extern void zvol_rename_minors(spa_t *, const char *, const char *, boolean_t); @@ -54,7 +53,7 @@ extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volthreading(const char *, boolean_t); extern int zvol_set_common(const char *, zfs_prop_t, zprop_source_t, uint64_t); extern int zvol_set_ro(const char *, boolean_t); -extern zvol_state_handle_t *zvol_suspend(const char *); +extern int zvol_suspend(const char *, zvol_state_handle_t **); extern int zvol_resume(zvol_state_handle_t *); extern void *zvol_tag(zvol_state_handle_t *); diff --git a/sys/contrib/openzfs/include/sys/zvol_impl.h b/sys/contrib/openzfs/include/sys/zvol_impl.h index 038d4cb48f97..5422e66832c0 100644 --- a/sys/contrib/openzfs/include/sys/zvol_impl.h +++ b/sys/contrib/openzfs/include/sys/zvol_impl.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. 
*/ #ifndef _SYS_ZVOL_IMPL_H @@ -56,6 +56,7 @@ typedef struct zvol_state { atomic_t zv_suspend_ref; /* refcount for suspend */ krwlock_t zv_suspend_lock; /* suspend lock */ kcondvar_t zv_removing_cv; /* ready to remove minor */ + list_node_t zv_remove_node; /* node on removal list */ struct zvol_state_os *zv_zso; /* private platform state */ boolean_t zv_threading; /* volthreading property */ } zvol_state_t; @@ -108,7 +109,6 @@ zvol_state_t *zvol_find_by_name_hash(const char *name, uint64_t hash, int mode); int zvol_first_open(zvol_state_t *zv, boolean_t readonly); uint64_t zvol_name_hash(const char *name); -void zvol_remove_minors_impl(const char *name); void zvol_last_close(zvol_state_t *zv); void zvol_insert(zvol_state_t *zv); void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, @@ -132,11 +132,11 @@ void zv_request_task_free(zv_request_task_t *task); * platform dependent functions exported to platform independent code */ void zvol_os_free(zvol_state_t *zv); -void zvol_os_rename_minor(zvol_state_t *zv, const char *newname); +int zvol_os_rename_minor(zvol_state_t *zv, const char *newname); int zvol_os_create_minor(const char *name); int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize); boolean_t zvol_os_is_zvol(const char *path); -void zvol_os_clear_private(zvol_state_t *zv); +void zvol_os_remove_minor(zvol_state_t *zv); void zvol_os_set_disk_ro(zvol_state_t *zv, int flags); void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity); diff --git a/sys/contrib/openzfs/include/zfeature_common.h b/sys/contrib/openzfs/include/zfeature_common.h index 85537c1ae96e..56382ca85b55 100644 --- a/sys/contrib/openzfs/include/zfeature_common.h +++ b/sys/contrib/openzfs/include/zfeature_common.h @@ -87,6 +87,9 @@ typedef enum spa_feature { SPA_FEATURE_FAST_DEDUP, SPA_FEATURE_LONGNAME, SPA_FEATURE_LARGE_MICROZAP, + SPA_FEATURE_DYNAMIC_GANG_HEADER, + SPA_FEATURE_BLOCK_CLONING_ENDIAN, + SPA_FEATURE_PHYSICAL_REWRITE, SPA_FEATURES } spa_feature_t; @@ -103,7 +106,15 @@ typedef enum zfeature_flags { /* Activate this feature at the same time it is enabled. */ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), /* Each dataset has a field set if it has ever used this feature. */ - ZFEATURE_FLAG_PER_DATASET = (1 << 3) + ZFEATURE_FLAG_PER_DATASET = (1 << 3), + /* + * This feature isn't enabled by zpool upgrade; it must be explicitly + * listed to be enabled. It will also be applied if listed in an + * explicitly provided compatibility list. This flag can be removed + * from a given feature once support is sufficiently widespread, or + * worries about backwards compatibility are no longer relevant. + */ + ZFEATURE_FLAG_NO_UPGRADE = (1 << 4) } zfeature_flags_t; typedef enum zfeature_type { diff --git a/sys/contrib/openzfs/include/zfs_crrd.h b/sys/contrib/openzfs/include/zfs_crrd.h new file mode 100644 index 000000000000..ba192a2062ea --- /dev/null +++ b/sys/contrib/openzfs/include/zfs_crrd.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024 Klara Inc. + * + * This software was developed by + * Mariusz Zaborski <mariusz.zaborski@klarasystems.com> + * Fred Weigel <fred.weigel@klarasystems.com> + * under sponsorship from Wasabi Technology, Inc. and Klara Inc. + */ + +#ifndef _CRRD_H_ +#define _CRRD_H_ + +#define RRD_MAX_ENTRIES 256 + +#define RRD_ENTRY_SIZE sizeof (uint64_t) +#define RRD_STRUCT_ELEM (sizeof (rrd_t) / RRD_ENTRY_SIZE) + +typedef enum { + DBRRD_FLOOR, + DBRRD_CEILING +} dbrrd_rounding_t; + +typedef struct { + uint64_t rrdd_time; + uint64_t rrdd_txg; +} rrd_data_t; + +typedef struct { + uint64_t rrd_head; /* head (beginning) */ + uint64_t rrd_tail; /* tail (end) */ + uint64_t rrd_length; + + rrd_data_t rrd_entries[RRD_MAX_ENTRIES]; +} rrd_t; + +typedef struct { + rrd_t dbr_minutes; + rrd_t dbr_days; + rrd_t dbr_months; +} dbrrd_t; + +size_t rrd_len(rrd_t *rrd); + +const rrd_data_t *rrd_entry(rrd_t *r, size_t i); +rrd_data_t *rrd_tail_entry(rrd_t *rrd); +uint64_t rrd_tail(rrd_t *rrd); +uint64_t rrd_get(rrd_t *rrd, size_t i); + +void rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg); + +void dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg); +uint64_t dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rouding); + +#endif diff --git a/sys/contrib/openzfs/include/zfs_deleg.h b/sys/contrib/openzfs/include/zfs_deleg.h index f80fe46d35f8..a7bbf1620ad5 100644 --- a/sys/contrib/openzfs/include/zfs_deleg.h +++ b/sys/contrib/openzfs/include/zfs_deleg.h @@ -55,6 +55,7 @@ typedef enum { ZFS_DELEG_NOTE_PROMOTE, ZFS_DELEG_NOTE_RENAME, ZFS_DELEG_NOTE_SEND, + ZFS_DELEG_NOTE_SEND_RAW, ZFS_DELEG_NOTE_RECEIVE, ZFS_DELEG_NOTE_ALLOW, ZFS_DELEG_NOTE_USERPROP, diff --git a/sys/contrib/openzfs/lib/libicp/Makefile.am b/sys/contrib/openzfs/lib/libicp/Makefile.am index ce24d13a760f..23adba10bc44 100644 --- a/sys/contrib/openzfs/lib/libicp/Makefile.am +++ b/sys/contrib/openzfs/lib/libicp/Makefile.am @@ -69,6 +69,7 @@ nodist_libicp_la_SOURCES += \ module/icp/asm-x86_64/aes/aes_aesni.S \ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \ + module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S \ module/icp/asm-x86_64/modes/ghash-x86_64.S \ module/icp/asm-x86_64/sha2/sha256-x86_64.S \ module/icp/asm-x86_64/sha2/sha512-x86_64.S \ diff --git a/sys/contrib/openzfs/lib/libspl/Makefile.am b/sys/contrib/openzfs/lib/libspl/Makefile.am index f8943572bf29..0fd907d3011e 100644 --- a/sys/contrib/openzfs/lib/libspl/Makefile.am +++ b/sys/contrib/openzfs/lib/libspl/Makefile.am @@ -2,6 +2,9 @@ include $(srcdir)/%D%/include/Makefile.am libspl_assert_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) $(LIBUNWIND_CFLAGS) libspl_la_CFLAGS = $(libspl_assert_la_CFLAGS) +if TARGET_CPU_I386 +libspl_la_CFLAGS += $(NO_ATOMIC_ALIGNMENT) +endif noinst_LTLIBRARIES += libspl_assert.la libspl.la CPPCHECKTARGETS += libspl_assert.la libspl.la @@ -20,6 +23,7 @@ libspl_la_SOURCES = \ %D%/strlcat.c \ %D%/strlcpy.c \ %D%/timestamp.c \ + %D%/tunables.c \ %D%/include/sys/list.h \ %D%/include/sys/list_impl.h diff --git a/sys/contrib/openzfs/lib/libspl/include/Makefile.am b/sys/contrib/openzfs/lib/libspl/include/Makefile.am index 
8c286142f298..21f0c70db9e7 100644 --- a/sys/contrib/openzfs/lib/libspl/include/Makefile.am +++ b/sys/contrib/openzfs/lib/libspl/include/Makefile.am @@ -45,6 +45,7 @@ libspl_sys_HEADERS = \ %D%/sys/list_impl.h \ %D%/sys/mhd.h \ %D%/sys/mkdev.h \ + %D%/sys/mod.h \ %D%/sys/policy.h \ %D%/sys/poll.h \ %D%/sys/priv.h \ @@ -58,6 +59,7 @@ libspl_sys_HEADERS = \ %D%/sys/time.h \ %D%/sys/trace_spl.h \ %D%/sys/trace_zfs.h \ + %D%/sys/tunables.h \ %D%/sys/types.h \ %D%/sys/types32.h \ %D%/sys/uio.h \ diff --git a/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/stat.h b/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/stat.h index 488554f4e844..13cc0b46ac93 100644 --- a/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/stat.h +++ b/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/stat.h @@ -31,6 +31,11 @@ #include <sys/mount.h> /* for BLKGETSIZE64 */ +#ifdef HAVE_STATX +#include <fcntl.h> +#include <sys/stat.h> +#endif + /* * Emulate Solaris' behavior of returning the block device size in fstat64(). */ diff --git a/sys/contrib/openzfs/lib/libspl/include/sys/debug.h b/sys/contrib/openzfs/lib/libspl/include/sys/debug.h index cced309bd1bb..02f33a68b75b 100644 --- a/sys/contrib/openzfs/lib/libspl/include/sys/debug.h +++ b/sys/contrib/openzfs/lib/libspl/include/sys/debug.h @@ -38,4 +38,8 @@ #define __maybe_unused __attribute__((unused)) #endif +#ifndef __must_check +#define __must_check __attribute__((warn_unused_result)) +#endif + #endif diff --git a/sys/contrib/openzfs/lib/libspl/include/sys/mod.h b/sys/contrib/openzfs/lib/libspl/include/sys/mod.h new file mode 100644 index 000000000000..ad19b6607a42 --- /dev/null +++ b/sys/contrib/openzfs/lib/libspl/include/sys/mod.h @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ + +#ifndef _SYS_MOD_H +#define _SYS_MOD_H + +#include <sys/tunables.h> + +#define ZFS_MODULE_PARAM(scope, prefix, name, type, perm, desc) \ + static const zfs_tunable_t _zfs_tunable_##prefix##name = { \ + .zt_name = #prefix#name, \ + .zt_varp = &prefix##name, \ + .zt_varsz = sizeof (prefix##name), \ + .zt_type = ZFS_TUNABLE_TYPE_##type, \ + .zt_perm = ZFS_TUNABLE_PERM_##perm, \ + .zt_desc = desc \ + }; \ + static const zfs_tunable_t * \ + __zfs_tunable_##prefix##name \ + __attribute__((__section__("zfs_tunables"))) \ + __attribute__((__used__)) \ + = &_zfs_tunable_##prefix##name; + +#define ZFS_MODULE_PARAM_ARGS void +#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ + getfunc, perm, desc) + +#define EXPORT_SYMBOL(x) + +#endif diff --git a/sys/contrib/openzfs/lib/libspl/include/sys/simd.h b/sys/contrib/openzfs/lib/libspl/include/sys/simd.h index 1ef24f5a7d39..4772a5416b2e 100644 --- a/sys/contrib/openzfs/lib/libspl/include/sys/simd.h +++ b/sys/contrib/openzfs/lib/libspl/include/sys/simd.h @@ -102,7 +102,9 @@ typedef enum cpuid_inst_sets { AES, PCLMULQDQ, MOVBE, - SHA_NI + SHA_NI, + VAES, + VPCLMULQDQ } cpuid_inst_sets_t; /* @@ -127,6 +129,8 @@ typedef struct cpuid_feature_desc { #define _AES_BIT (1U << 25) #define _PCLMULQDQ_BIT (1U << 1) #define _MOVBE_BIT (1U << 22) +#define _VAES_BIT (1U << 9) +#define _VPCLMULQDQ_BIT (1U << 10) #define _SHA_NI_BIT (1U << 29) /* @@ -157,6 +161,8 @@ static const cpuid_feature_desc_t cpuid_features[] = { [PCLMULQDQ] = {1U, 0U, _PCLMULQDQ_BIT, ECX }, [MOVBE] = {1U, 0U, _MOVBE_BIT, ECX }, [SHA_NI] = {7U, 0U, _SHA_NI_BIT, EBX }, + [VAES] = {7U, 0U, _VAES_BIT, ECX }, + [VPCLMULQDQ] = {7U, 0U, _VPCLMULQDQ_BIT, ECX }, }; /* @@ -231,6 +237,8 @@ CPUID_FEATURE_CHECK(aes, AES); CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); CPUID_FEATURE_CHECK(movbe, MOVBE); CPUID_FEATURE_CHECK(shani, SHA_NI); +CPUID_FEATURE_CHECK(vaes, VAES); +CPUID_FEATURE_CHECK(vpclmulqdq, VPCLMULQDQ); /* * Detect register set support @@ -382,6 +390,24 @@ zfs_shani_available(void) } /* + * Check if VAES instruction is available + */ +static inline boolean_t +zfs_vaes_available(void) +{ + return (__cpuid_has_vaes()); +} + +/* + * Check if VPCLMULQDQ instruction is available + */ +static inline boolean_t +zfs_vpclmulqdq_available(void) +{ + return (__cpuid_has_vpclmulqdq()); +} + +/* * AVX-512 family of instruction sets: * * AVX512F Foundation diff --git a/sys/contrib/openzfs/lib/libspl/include/sys/time.h b/sys/contrib/openzfs/lib/libspl/include/sys/time.h index da80a5852ae5..062c6ec979fc 100644 --- a/sys/contrib/openzfs/lib/libspl/include/sys/time.h +++ b/sys/contrib/openzfs/lib/libspl/include/sys/time.h @@ -98,6 +98,15 @@ gethrestime_sec(void) } static inline hrtime_t +getlrtime(void) +{ + struct timeval tv; + (void) gettimeofday(&tv, NULL); + return ((((uint64_t)tv.tv_sec) * NANOSEC) + + ((uint64_t)tv.tv_usec * NSEC_PER_USEC)); +} + +static inline hrtime_t gethrtime(void) { struct timespec ts; diff --git a/sys/contrib/openzfs/lib/libspl/include/sys/tunables.h b/sys/contrib/openzfs/lib/libspl/include/sys/tunables.h new file mode 100644 index 000000000000..5d9bb3d71a4a --- /dev/null +++ b/sys/contrib/openzfs/lib/libspl/include/sys/tunables.h @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ + +#ifndef _SYS_TUNABLES_H +#define _SYS_TUNABLES_H + +typedef enum { + ZFS_TUNABLE_TYPE_INT, + ZFS_TUNABLE_TYPE_UINT, + ZFS_TUNABLE_TYPE_ULONG, + ZFS_TUNABLE_TYPE_U64, + ZFS_TUNABLE_TYPE_STRING, +} zfs_tunable_type_t; + +typedef enum { + ZFS_TUNABLE_PERM_ZMOD_RW, + ZFS_TUNABLE_PERM_ZMOD_RD, +} zfs_tunable_perm_t; + +typedef struct zfs_tunable { + const char *zt_name; + void *zt_varp; + size_t zt_varsz; + zfs_tunable_type_t zt_type; + zfs_tunable_perm_t zt_perm; + const char *zt_desc; +} zfs_tunable_t; + +int zfs_tunable_set(const zfs_tunable_t *tunable, const char *val); +int zfs_tunable_get(const zfs_tunable_t *tunable, char *val, size_t valsz); + +const zfs_tunable_t *zfs_tunable_lookup(const char *name); + +typedef int (*zfs_tunable_iter_t)(const zfs_tunable_t *tunable, void *arg); +void zfs_tunable_iter(zfs_tunable_iter_t cb, void *arg); + +#endif diff --git a/sys/contrib/openzfs/lib/libspl/include/umem.h b/sys/contrib/openzfs/lib/libspl/include/umem.h index 6945aae9f3ce..3e44610e4e21 100644 --- a/sys/contrib/openzfs/lib/libspl/include/umem.h +++ b/sys/contrib/openzfs/lib/libspl/include/umem.h @@ -102,7 +102,7 @@ static inline void * umem_alloc_aligned(size_t size, size_t align, int flags) { void *ptr = NULL; - int rc = EINVAL; + int rc; do { rc = posix_memalign(&ptr, align, size); diff --git a/sys/contrib/openzfs/lib/libspl/os/linux/getmntany.c b/sys/contrib/openzfs/lib/libspl/os/linux/getmntany.c index dcdf7b3d6fc9..ee1cdf59b9e5 100644 --- a/sys/contrib/openzfs/lib/libspl/os/linux/getmntany.c +++ b/sys/contrib/openzfs/lib/libspl/os/linux/getmntany.c @@ -85,13 +85,21 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp) } static int -getextmntent_impl(FILE *fp, struct extmnttab *mp) +getextmntent_impl(FILE *fp, struct extmnttab *mp, uint64_t *mnt_id) { int ret; struct stat64 st; + *mnt_id = 0; ret = _sol_getmntent(fp, (struct mnttab *)mp); if (ret == 0) { +#ifdef HAVE_STATX_MNT_ID + struct statx stx; + if (statx(AT_FDCWD, mp->mnt_mountp, + AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW, + STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) + *mnt_id = stx.stx_mnt_id; +#endif if (stat64(mp->mnt_mountp, &st) != 0) { mp->mnt_major = 0; mp->mnt_minor = 0; @@ -110,6 +118,12 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) struct stat64 st; FILE *fp; int match; + boolean_t have_mnt_id = B_FALSE; + uint64_t target_mnt_id = 0; + uint64_t entry_mnt_id; +#ifdef HAVE_STATX_MNT_ID + struct statx stx; +#endif if (strlen(path) >= MAXPATHLEN) { (void) fprintf(stderr, "invalid object; pathname too long\n"); @@ -128,6 +142,13 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) return (-1); } +#ifdef HAVE_STATX_MNT_ID + if (statx(AT_FDCWD, path, AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW, + 
STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) { + have_mnt_id = B_TRUE; + target_mnt_id = stx.stx_mnt_id; + } +#endif if ((fp = fopen(MNTTAB, "re")) == NULL) { (void) fprintf(stderr, "cannot open %s\n", MNTTAB); @@ -139,12 +160,15 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) */ match = 0; - while (getextmntent_impl(fp, entry) == 0) { - if (makedev(entry->mnt_major, entry->mnt_minor) == - statbuf->st_dev) { - match = 1; - break; + while (getextmntent_impl(fp, entry, &entry_mnt_id) == 0) { + if (have_mnt_id) { + match = (entry_mnt_id == target_mnt_id); + } else { + match = makedev(entry->mnt_major, entry->mnt_minor) == + statbuf->st_dev; } + if (match) + break; } (void) fclose(fp); diff --git a/sys/contrib/openzfs/lib/libspl/tunables.c b/sys/contrib/openzfs/lib/libspl/tunables.c new file mode 100644 index 000000000000..67dc9710dee8 --- /dev/null +++ b/sys/contrib/openzfs/lib/libspl/tunables.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + */ + +#include <stddef.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <limits.h> +#include <inttypes.h> +#include <sys/tunables.h> + +/* + * Userspace tunables. + * + * Tunables are external pointers to global variables that are wired up to the + * host environment in some way that allows the operator to directly change + * their values "under the hood". + * + * In userspace, the "host environment" is the program using libzpool.so. So + * that it can manipulate tunables if it wants, we provide an API to access + * them. + * + * Tunables are declared through the ZFS_MODULE_PARAM* macros, which associate + * a global variable with some metadata we can use to describe and access the + * tunable. This is done by creating a uniquely-named zfs_tunable_t. + * + * At runtime, we need a way to discover these zfs_tunable_t items. Since they + * are declared globally, all over the codebase, there's no central place to + * record or list them. So, we take advantage of the compiler's "linker set" + * feature. + * + * In the ZFS_MODULE_PARAM macro, after we create the zfs_tunable_t, we also + * create a zfs_tunable_t* pointing to it. That pointer is forced into the + * "zfs_tunables" ELF section in compiled object. At link time, the linker will + * collect all these pointers into one single big "zfs_tunable" section, and + * will generate two new symbols in the final object: __start_zfs_tunable and + * __stop_zfs_tunable. 
These point to the first and last item in that section, + * which allows us to access the pointers in that section like an array, and + * through those pointers access the tunable metadata, and from there the + * actual C variable that the tunable describes. + */ + +extern const zfs_tunable_t *__start_zfs_tunables; +extern const zfs_tunable_t *__stop_zfs_tunables; + +/* + * Because there are no tunables in libspl itself, the above symbols will not + * be generated, which will stop libspl being linked at all. To work around + * that, we force a symbol into that section, and then when iterating, skip + * any NULL pointers. + */ +static void *__zfs_tunable__placeholder + __attribute__((__section__("zfs_tunables"))) + __attribute__((__used__)) = NULL; + +/* + * Find the name tunable by walking through the linker set and comparing names, + * as described above. This is not particularly efficient but it's a fairly + * rare task, so it shouldn't be a big deal. + */ +const zfs_tunable_t * +zfs_tunable_lookup(const char *name) +{ + for (const zfs_tunable_t **ztp = &__start_zfs_tunables; + ztp != &__stop_zfs_tunables; ztp++) { + const zfs_tunable_t *zt = *ztp; + if (zt == NULL) + continue; + if (strcmp(name, zt->zt_name) == 0) + return (zt); + } + + return (NULL); +} + +/* + * Like zfs_tunable_lookup, but call the provided callback for each tunable. + */ +void +zfs_tunable_iter(zfs_tunable_iter_t cb, void *arg) +{ + for (const zfs_tunable_t **ztp = &__start_zfs_tunables; + ztp != &__stop_zfs_tunables; ztp++) { + const zfs_tunable_t *zt = *ztp; + if (zt == NULL) + continue; + if (cb(zt, arg)) + return; + } +} + +/* + * Parse a string into an int or uint. It's easier to have a pair of "generic" + * functions that clamp to a given min and max rather than have multiple + * functions for each width of type. + */ +static int +zfs_tunable_parse_int(const char *val, intmax_t *np, + intmax_t min, intmax_t max) +{ + intmax_t n; + char *end; + errno = 0; + n = strtoimax(val, &end, 0); + if (errno != 0) + return (errno); + if (*end != '\0') + return (EINVAL); + if (n < min || n > max) + return (ERANGE); + *np = n; + return (0); +} + +static int +zfs_tunable_parse_uint(const char *val, uintmax_t *np, + uintmax_t min, uintmax_t max) +{ + uintmax_t n; + char *end; + errno = 0; + n = strtoumax(val, &end, 0); + if (errno != 0) + return (errno); + if (*end != '\0') + return (EINVAL); + if (strchr(val, '-')) + return (ERANGE); + if (n < min || n > max) + return (ERANGE); + *np = n; + return (0); +} + +/* + * Set helpers for each tunable type. Parses the string, and if produces a + * valid value for the tunable, sets it. No effort is made to make sure the + * tunable is of the right type; that's done in zfs_tunable_set() below. 
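+ *
+ * Because strtoimax()/strtoumax() are called with a base of 0, hexadecimal
+ * and octal spellings are accepted as well; for example (illustrative
+ * values only) parsing "0x200" for a UINT tunable stores 512.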
+ */ +static int +zfs_tunable_set_int(const zfs_tunable_t *zt, const char *val) +{ + intmax_t n; + int err = zfs_tunable_parse_int(val, &n, INT_MIN, INT_MAX); + if (err != 0) + return (err); + *(int *)zt->zt_varp = n; + return (0); +} + +static int +zfs_tunable_set_uint(const zfs_tunable_t *zt, const char *val) +{ + uintmax_t n; + int err = zfs_tunable_parse_uint(val, &n, 0, UINT_MAX); + if (err != 0) + return (err); + *(unsigned int *)zt->zt_varp = n; + return (0); +} + +static int +zfs_tunable_set_ulong(const zfs_tunable_t *zt, const char *val) +{ + uintmax_t n; + int err = zfs_tunable_parse_uint(val, &n, 0, ULONG_MAX); + if (err != 0) + return (err); + *(unsigned long *)zt->zt_varp = n; + return (0); +} + +static int +zfs_tunable_set_u64(const zfs_tunable_t *zt, const char *val) +{ + uintmax_t n; + int err = zfs_tunable_parse_uint(val, &n, 0, UINT64_MAX); + if (err != 0) + return (err); + *(uint64_t *)zt->zt_varp = n; + return (0); +} + +static int +zfs_tunable_set_string(const zfs_tunable_t *zt, const char *val) +{ + (void) zt, (void) val; + /* + * We can't currently handle strings. String tunables are pointers + * into read-only memory, so we can update the pointer, but not the + * contents. That would mean taking an allocation, but we don't have + * an obvious place to free it. + * + * For now, it's no big deal as there's only a couple of string + * tunables anyway. + */ + return (ENOTSUP); +} + +/* + * Get helpers for each tunable type. Converts the value to a string if + * necessary and writes it into the provided buffer. The type is assumed to + * be correct; zfs_tunable_get() below will call the correct function for the + * type. + */ +static int +zfs_tunable_get_int(const zfs_tunable_t *zt, char *val, size_t valsz) +{ + snprintf(val, valsz, "%d", *(int *)zt->zt_varp); + return (0); +} + +static int +zfs_tunable_get_uint(const zfs_tunable_t *zt, char *val, size_t valsz) +{ + snprintf(val, valsz, "%u", *(unsigned int *)zt->zt_varp); + return (0); +} + +static int +zfs_tunable_get_ulong(const zfs_tunable_t *zt, char *val, size_t valsz) +{ + snprintf(val, valsz, "%lu", *(unsigned long *)zt->zt_varp); + return (0); +} + +static int +zfs_tunable_get_u64(const zfs_tunable_t *zt, char *val, size_t valsz) +{ + snprintf(val, valsz, "%"PRIu64, *(uint64_t *)zt->zt_varp); + return (0); +} + +static int +zfs_tunable_get_string(const zfs_tunable_t *zt, char *val, size_t valsz) +{ + strlcpy(val, *(char **)zt->zt_varp, valsz); + return (0); +} + +/* The public set function. Delegates to the type-specific version. */ +int +zfs_tunable_set(const zfs_tunable_t *zt, const char *val) +{ + int err; + switch (zt->zt_type) { + case ZFS_TUNABLE_TYPE_INT: + err = zfs_tunable_set_int(zt, val); + break; + case ZFS_TUNABLE_TYPE_UINT: + err = zfs_tunable_set_uint(zt, val); + break; + case ZFS_TUNABLE_TYPE_ULONG: + err = zfs_tunable_set_ulong(zt, val); + break; + case ZFS_TUNABLE_TYPE_U64: + err = zfs_tunable_set_u64(zt, val); + break; + case ZFS_TUNABLE_TYPE_STRING: + err = zfs_tunable_set_string(zt, val); + break; + default: + err = EOPNOTSUPP; + break; + } + return (err); +} + +/* The public get function. Delegates to the type-specific version. 
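+ *
+ * A minimal usage sketch (illustrative only; the tunable name here is an
+ * assumption and may not exist in every build):
+ *
+ *	char buf[32];
+ *	const zfs_tunable_t *zt = zfs_tunable_lookup("zfs_flags");
+ *	if (zt != NULL && zfs_tunable_get(zt, buf, sizeof (buf)) == 0)
+ *		printf("%s = %s\n", zt->zt_name, buf);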
*/ +int +zfs_tunable_get(const zfs_tunable_t *zt, char *val, size_t valsz) +{ + int err; + switch (zt->zt_type) { + case ZFS_TUNABLE_TYPE_INT: + err = zfs_tunable_get_int(zt, val, valsz); + break; + case ZFS_TUNABLE_TYPE_UINT: + err = zfs_tunable_get_uint(zt, val, valsz); + break; + case ZFS_TUNABLE_TYPE_ULONG: + err = zfs_tunable_get_ulong(zt, val, valsz); + break; + case ZFS_TUNABLE_TYPE_U64: + err = zfs_tunable_get_u64(zt, val, valsz); + break; + case ZFS_TUNABLE_TYPE_STRING: + err = zfs_tunable_get_string(zt, val, valsz); + break; + default: + err = EOPNOTSUPP; + break; + } + return (err); +} diff --git a/sys/contrib/openzfs/lib/libuutil/libuutil.abi b/sys/contrib/openzfs/lib/libuutil/libuutil.abi index 0052f0d47a7f..6c736c61e4a5 100644 --- a/sys/contrib/openzfs/lib/libuutil/libuutil.abi +++ b/sys/contrib/openzfs/lib/libuutil/libuutil.abi @@ -244,6 +244,10 @@ <elf-symbol name='uu_strerror' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='uu_strndup' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='uu_zalloc' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_get' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_iter' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_lookup' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_set' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> </elf-function-symbols> <abi-instr address-size='64' path='lib/libspl/assert.c' language='LANG_C99'> <typedef-decl name='__pid_t' type-id='95e97e5e' id='3629bad8'/> @@ -612,7 +616,6 @@ <array-type-def dimensions='1' type-id='de572c22' size-in-bits='1472' id='6d3c2f42'> <subrange length='23' type-id='7359adad' id='fdd0f594'/> </array-type-def> - <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/> <array-type-def dimensions='1' type-id='3a47d82b' size-in-bits='256' id='a133ec23'> <subrange length='4' type-id='7359adad' id='16fe7105'/> </array-type-def> @@ -978,8 +981,6 @@ </abi-instr> <abi-instr address-size='64' path='lib/libspl/os/linux/gethostid.c' language='LANG_C99'> <type-decl name='long long unsigned int' size-in-bits='64' id='3a47d82b'/> - <pointer-type-def type-id='26a90f95' size-in-bits='64' id='9b23c9ad'/> - <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/> <function-decl name='fclose' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='822cd80b'/> <return type-id='95e97e5e'/> @@ -1019,6 +1020,13 @@ <array-type-def dimensions='1' type-id='03085adc' size-in-bits='192' id='083f8d58'> <subrange length='3' type-id='7359adad' id='56f209d2'/> </array-type-def> + <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'> + <subrange length='1' type-id='7359adad' id='52f813b4'/> + </array-type-def> + <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'> + <subrange length='12' type-id='7359adad' id='84827bdc'/> + </array-type-def> + <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/> <class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'> <data-member access='public' 
layout-offset-in-bits='0'> <var-decl name='mnt_special' type-id='26a90f95' visibility='default'/> @@ -1053,6 +1061,93 @@ <var-decl name='mnt_minor' type-id='3502e3ff' visibility='default'/> </data-member> </class-decl> + <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/> + <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/> + <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/> + <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/> + <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/> + <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'> + <data-member access='public' layout-offset-in-bits='0'> + <var-decl name='tv_sec' type-id='49659421' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='96'> + <var-decl name='__reserved' type-id='3158a266' visibility='default'/> + </data-member> + </class-decl> + <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'> + <data-member access='public' layout-offset-in-bits='0'> + <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='32'> + <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='128'> + <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='160'> + <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='192'> + <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='224'> + <var-decl name='stx_mode' type-id='d315442e' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='240'> + <var-decl name='__spare0' type-id='811205dc' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='256'> + <var-decl name='stx_ino' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='320'> + <var-decl name='stx_size' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='384'> + <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='448'> + <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='512'> + <var-decl name='stx_atime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='640'> + <var-decl name='stx_btime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='768'> + <var-decl name='stx_ctime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='896'> + <var-decl name='stx_mtime' type-id='94101016' visibility='default'/> + 
</data-member> + <data-member access='public' layout-offset-in-bits='1024'> + <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1056'> + <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1088'> + <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1120'> + <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1152'> + <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1216'> + <var-decl name='__spare2' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1280'> + <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/> + </data-member> + </class-decl> <class-decl name='mntent' size-in-bits='320' is-struct='yes' visibility='default' id='56fe4a37'> <data-member access='public' layout-offset-in-bits='0'> <var-decl name='mnt_fsname' type-id='26a90f95' visibility='default'/> @@ -1142,6 +1237,8 @@ <pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/> <pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/> <qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/> + <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/> + <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/> <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='e75a27e9'/> <parameter type-id='3cad23cd'/> @@ -1157,6 +1254,14 @@ <parameter type-id='95e97e5e'/> <return type-id='26a90f95'/> </function-decl> + <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='95e97e5e'/> + <parameter type-id='9d26089a'/> + <parameter type-id='95e97e5e'/> + <parameter type-id='f0981eeb'/> + <parameter type-id='31d265b7'/> + <return type-id='95e97e5e'/> + </function-decl> <function-decl name='__fprintf_chk' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='e75a27e9'/> <parameter type-id='95e97e5e'/> @@ -1307,6 +1412,91 @@ <return type-id='48b5725f'/> </function-decl> </abi-instr> + <abi-instr address-size='64' path='lib/libspl/tunables.c' language='LANG_C99'> + <enum-decl name='zfs_tunable_type_t' naming-typedef-id='f50b1525' id='56905369'> + <underlying-type type-id='9cac1fee'/> + <enumerator name='ZFS_TUNABLE_TYPE_INT' value='0'/> + <enumerator name='ZFS_TUNABLE_TYPE_UINT' value='1'/> + <enumerator name='ZFS_TUNABLE_TYPE_ULONG' value='2'/> + <enumerator name='ZFS_TUNABLE_TYPE_U64' value='3'/> + <enumerator name='ZFS_TUNABLE_TYPE_STRING' value='4'/> + </enum-decl> + <typedef-decl name='zfs_tunable_type_t' type-id='56905369' id='f50b1525'/> + <enum-decl name='zfs_tunable_perm_t' naming-typedef-id='ada7336b' id='e80e6ebf'> + <underlying-type type-id='9cac1fee'/> + <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RW' value='0'/> + <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RD' value='1'/> + </enum-decl> + <typedef-decl name='zfs_tunable_perm_t' type-id='e80e6ebf' id='ada7336b'/> + <class-decl name='zfs_tunable' size-in-bits='320' is-struct='yes' visibility='default' id='1a97ee0e'> + <data-member access='public' 
layout-offset-in-bits='0'> + <var-decl name='zt_name' type-id='80f4b756' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='zt_varp' type-id='eaa32e2f' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='128'> + <var-decl name='zt_varsz' type-id='b59d7dce' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='192'> + <var-decl name='zt_type' type-id='f50b1525' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='224'> + <var-decl name='zt_perm' type-id='ada7336b' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='256'> + <var-decl name='zt_desc' type-id='80f4b756' visibility='default'/> + </data-member> + </class-decl> + <typedef-decl name='zfs_tunable_t' type-id='1a97ee0e' id='12bf5c5e'/> + <typedef-decl name='zfs_tunable_iter_t' type-id='7ef33f92' id='d8d5f4ab'/> + <typedef-decl name='intmax_t' type-id='5b475db0' id='e104d842'/> + <typedef-decl name='uintmax_t' type-id='04d82f4b' id='f8b828c9'/> + <typedef-decl name='__intmax_t' type-id='bd54fe1a' id='5b475db0'/> + <typedef-decl name='__uintmax_t' type-id='7359adad' id='04d82f4b'/> + <pointer-type-def type-id='26a90f95' size-in-bits='64' id='9b23c9ad'/> + <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/> + <qualified-type-def type-id='12bf5c5e' const='yes' id='180e47ee'/> + <pointer-type-def type-id='180e47ee' size-in-bits='64' id='a27af98c'/> + <pointer-type-def type-id='92f86508' size-in-bits='64' id='7ef33f92'/> + <function-decl name='strtoimax' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='9d26089a'/> + <parameter type-id='8c85230f'/> + <parameter type-id='95e97e5e'/> + <return type-id='e104d842'/> + </function-decl> + <function-decl name='strtoumax' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='9d26089a'/> + <parameter type-id='8c85230f'/> + <parameter type-id='95e97e5e'/> + <return type-id='f8b828c9'/> + </function-decl> + <function-decl name='zfs_tunable_lookup' mangled-name='zfs_tunable_lookup' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_lookup'> + <parameter type-id='80f4b756' name='name'/> + <return type-id='a27af98c'/> + </function-decl> + <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'> + <parameter type-id='d8d5f4ab' name='cb'/> + <parameter type-id='eaa32e2f' name='arg'/> + <return type-id='48b5725f'/> + </function-decl> + <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'> + <parameter type-id='a27af98c' name='zt'/> + <parameter type-id='80f4b756' name='val'/> + <return type-id='95e97e5e'/> + </function-decl> + <function-decl name='zfs_tunable_get' mangled-name='zfs_tunable_get' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_get'> + <parameter type-id='a27af98c' name='zt'/> + <parameter type-id='26a90f95' name='val'/> + <parameter type-id='b59d7dce' name='valsz'/> + <return type-id='95e97e5e'/> + </function-decl> + <function-type size-in-bits='64' id='92f86508'> + <parameter type-id='a27af98c'/> + <parameter type-id='eaa32e2f'/> + <return type-id='95e97e5e'/> + </function-type> + </abi-instr> <abi-instr address-size='64' 
path='lib/libuutil/uu_alloc.c' language='LANG_C99'> <type-decl name='char' size-in-bits='8' id='a84c031d'/> <type-decl name='unsigned int' size-in-bits='32' id='f0981eeb'/> diff --git a/sys/contrib/openzfs/lib/libzdb/libzdb.c b/sys/contrib/openzfs/lib/libzdb/libzdb.c index 12144dc65e75..cca1327b1b03 100644 --- a/sys/contrib/openzfs/lib/libzdb/libzdb.c +++ b/sys/contrib/openzfs/lib/libzdb/libzdb.c @@ -93,9 +93,9 @@ livelist_compare(const void *larg, const void *rarg) * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, * it's possible the offsets are equal. In that case, sort by txg */ - if (BP_GET_LOGICAL_BIRTH(l) < BP_GET_LOGICAL_BIRTH(r)) { + if (BP_GET_BIRTH(l) < BP_GET_BIRTH(r)) { return (-1); - } else if (BP_GET_LOGICAL_BIRTH(l) > BP_GET_LOGICAL_BIRTH(r)) { + } else if (BP_GET_BIRTH(l) > BP_GET_BIRTH(r)) { return (+1); } return (0); diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi index 35ecdca767db..184ea4a55b43 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs.abi +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi @@ -451,6 +451,10 @@ <elf-symbol name='zfs_strip_partition' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_strip_path' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_truncate_shares' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_get' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_iter' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_lookup' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_set' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_type_to_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_unmount' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_unmountall' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -479,6 +483,7 @@ <elf-symbol name='zpool_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_clear_label' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_close' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zpool_collect_leaves' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_collect_unsup_feat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_create' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_ddt_prune' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -528,6 +533,7 @@ <elf-symbol name='zpool_import_status' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_in_use' type='func-type' 
binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_initialize' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zpool_initialize_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_initialize_wait' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_is_draid_spare' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_iter' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -568,6 +574,7 @@ <elf-symbol name='zpool_reguid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_reopen_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_scan' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zpool_scan_range' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_search_import' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_set_bootenv' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_set_guid' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -577,6 +584,7 @@ <elf-symbol name='zpool_state_to_name' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_sync_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_trim' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zpool_trim_one' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_upgrade' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_vdev_attach' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zpool_vdev_clear' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -631,7 +639,7 @@ <elf-symbol name='fletcher_4_superscalar_ops' size='128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> - <elf-symbol name='spa_feature_table' size='2464' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='spa_feature_table' size='2632' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_deleg_perm_tab' size='528' type='object-type' binding='global-binding' 
visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -1450,8 +1458,103 @@ </function-decl> </abi-instr> <abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'> + <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'> + <subrange length='1' type-id='7359adad' id='52f813b4'/> + </array-type-def> + <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'> + <subrange length='12' type-id='7359adad' id='84827bdc'/> + </array-type-def> + <typedef-decl name='__u16' type-id='8efea9e5' id='d315442e'/> + <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/> + <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/> + <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/> + <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/> + <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'> + <data-member access='public' layout-offset-in-bits='0'> + <var-decl name='tv_sec' type-id='49659421' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='96'> + <var-decl name='__reserved' type-id='3158a266' visibility='default'/> + </data-member> + </class-decl> + <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'> + <data-member access='public' layout-offset-in-bits='0'> + <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='32'> + <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='128'> + <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='160'> + <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='192'> + <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='224'> + <var-decl name='stx_mode' type-id='d315442e' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='240'> + <var-decl name='__spare0' type-id='811205dc' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='256'> + <var-decl name='stx_ino' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='320'> + <var-decl name='stx_size' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='384'> + <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='448'> + <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='512'> + <var-decl name='stx_atime' type-id='94101016' 
visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='640'> + <var-decl name='stx_btime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='768'> + <var-decl name='stx_ctime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='896'> + <var-decl name='stx_mtime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1024'> + <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1056'> + <var-decl name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1088'> + <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1120'> + <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1152'> + <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1216'> + <var-decl name='__spare2' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1280'> + <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/> + </data-member> + </class-decl> <pointer-type-def type-id='56fe4a37' size-in-bits='64' id='b6b61d2f'/> <qualified-type-def type-id='b6b61d2f' restrict='yes' id='3cad23cd'/> + <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/> + <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/> <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='e75a27e9'/> <parameter type-id='3cad23cd'/> @@ -1463,6 +1566,14 @@ <parameter type-id='822cd80b'/> <return type-id='95e97e5e'/> </function-decl> + <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='95e97e5e'/> + <parameter type-id='9d26089a'/> + <parameter type-id='95e97e5e'/> + <parameter type-id='f0981eeb'/> + <parameter type-id='31d265b7'/> + <return type-id='95e97e5e'/> + </function-decl> </abi-instr> <abi-instr address-size='64' path='lib/libspl/timestamp.c' language='LANG_C99'> <typedef-decl name='nl_item' type-id='95e97e5e' id='03b79a94'/> @@ -1487,6 +1598,89 @@ <return type-id='48b5725f'/> </function-decl> </abi-instr> + <abi-instr address-size='64' path='lib/libspl/tunables.c' language='LANG_C99'> + <enum-decl name='zfs_tunable_type_t' naming-typedef-id='f50b1525' id='56905369'> + <underlying-type type-id='9cac1fee'/> + <enumerator name='ZFS_TUNABLE_TYPE_INT' value='0'/> + <enumerator name='ZFS_TUNABLE_TYPE_UINT' value='1'/> + <enumerator name='ZFS_TUNABLE_TYPE_ULONG' value='2'/> + <enumerator name='ZFS_TUNABLE_TYPE_U64' value='3'/> + <enumerator name='ZFS_TUNABLE_TYPE_STRING' value='4'/> + </enum-decl> + <typedef-decl name='zfs_tunable_type_t' type-id='56905369' id='f50b1525'/> + <enum-decl name='zfs_tunable_perm_t' naming-typedef-id='ada7336b' id='e80e6ebf'> + <underlying-type type-id='9cac1fee'/> + <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RW' value='0'/> + <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RD' value='1'/> + </enum-decl> + <typedef-decl name='zfs_tunable_perm_t' type-id='e80e6ebf' id='ada7336b'/> + 
<class-decl name='zfs_tunable' size-in-bits='320' is-struct='yes' visibility='default' id='1a97ee0e'> + <data-member access='public' layout-offset-in-bits='0'> + <var-decl name='zt_name' type-id='80f4b756' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='zt_varp' type-id='eaa32e2f' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='128'> + <var-decl name='zt_varsz' type-id='b59d7dce' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='192'> + <var-decl name='zt_type' type-id='f50b1525' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='224'> + <var-decl name='zt_perm' type-id='ada7336b' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='256'> + <var-decl name='zt_desc' type-id='80f4b756' visibility='default'/> + </data-member> + </class-decl> + <typedef-decl name='zfs_tunable_t' type-id='1a97ee0e' id='12bf5c5e'/> + <typedef-decl name='zfs_tunable_iter_t' type-id='7ef33f92' id='d8d5f4ab'/> + <typedef-decl name='intmax_t' type-id='5b475db0' id='e104d842'/> + <typedef-decl name='uintmax_t' type-id='04d82f4b' id='f8b828c9'/> + <typedef-decl name='__intmax_t' type-id='bd54fe1a' id='5b475db0'/> + <typedef-decl name='__uintmax_t' type-id='7359adad' id='04d82f4b'/> + <qualified-type-def type-id='12bf5c5e' const='yes' id='180e47ee'/> + <pointer-type-def type-id='180e47ee' size-in-bits='64' id='a27af98c'/> + <pointer-type-def type-id='92f86508' size-in-bits='64' id='7ef33f92'/> + <function-decl name='strtoimax' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='9d26089a'/> + <parameter type-id='8c85230f'/> + <parameter type-id='95e97e5e'/> + <return type-id='e104d842'/> + </function-decl> + <function-decl name='strtoumax' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='9d26089a'/> + <parameter type-id='8c85230f'/> + <parameter type-id='95e97e5e'/> + <return type-id='f8b828c9'/> + </function-decl> + <function-decl name='zfs_tunable_lookup' mangled-name='zfs_tunable_lookup' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_lookup'> + <parameter type-id='80f4b756' name='name'/> + <return type-id='a27af98c'/> + </function-decl> + <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'> + <parameter type-id='d8d5f4ab' name='cb'/> + <parameter type-id='eaa32e2f' name='arg'/> + <return type-id='48b5725f'/> + </function-decl> + <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'> + <parameter type-id='a27af98c' name='zt'/> + <parameter type-id='80f4b756' name='val'/> + <return type-id='95e97e5e'/> + </function-decl> + <function-decl name='zfs_tunable_get' mangled-name='zfs_tunable_get' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_get'> + <parameter type-id='a27af98c' name='zt'/> + <parameter type-id='26a90f95' name='val'/> + <parameter type-id='b59d7dce' name='valsz'/> + <return type-id='95e97e5e'/> + </function-decl> + <function-type size-in-bits='64' id='92f86508'> + <parameter type-id='a27af98c'/> + <parameter type-id='eaa32e2f'/> + <return type-id='95e97e5e'/> + </function-type> + </abi-instr> <abi-instr address-size='64' 
path='lib/libtpool/thread_pool.c' language='LANG_C99'> <array-type-def dimensions='1' type-id='49ef3ffd' size-in-bits='1024' id='a14403f5'> <subrange length='16' type-id='7359adad' id='848d0938'/> @@ -4135,13 +4329,6 @@ <parameter type-id='58603c44'/> <return type-id='9c313c2d'/> </function-decl> - <function-decl name='zpool_prop_get_feature' mangled-name='zpool_prop_get_feature' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_prop_get_feature'> - <parameter type-id='4c81de99'/> - <parameter type-id='80f4b756'/> - <parameter type-id='26a90f95'/> - <parameter type-id='b59d7dce'/> - <return type-id='95e97e5e'/> - </function-decl> <function-decl name='zfs_iter_snapshots_v2' mangled-name='zfs_iter_snapshots_v2' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_iter_snapshots_v2'> <parameter type-id='9200a744'/> <parameter type-id='95e97e5e'/> @@ -5930,7 +6117,9 @@ <enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/> <enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/> <enumerator name='VDEV_PROP_SLOW_IOS' value='51'/> - <enumerator name='VDEV_NUM_PROPS' value='52'/> + <enumerator name='VDEV_PROP_SIT_OUT' value='52'/> + <enumerator name='VDEV_PROP_AUTOSIT' value='53'/> + <enumerator name='VDEV_NUM_PROPS' value='54'/> </enum-decl> <typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/> <class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'> @@ -6210,7 +6399,10 @@ <enumerator name='SPA_FEATURE_FAST_DEDUP' value='41'/> <enumerator name='SPA_FEATURE_LONGNAME' value='42'/> <enumerator name='SPA_FEATURE_LARGE_MICROZAP' value='43'/> - <enumerator name='SPA_FEATURES' value='44'/> + <enumerator name='SPA_FEATURE_DYNAMIC_GANG_HEADER' value='44'/> + <enumerator name='SPA_FEATURE_BLOCK_CLONING_ENDIAN' value='45'/> + <enumerator name='SPA_FEATURE_PHYSICAL_REWRITE' value='46'/> + <enumerator name='SPA_FEATURES' value='47'/> </enum-decl> <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/> <qualified-type-def type-id='80f4b756' const='yes' id='b99c00c9'/> @@ -6629,6 +6821,13 @@ <parameter type-id='e4378506' name='plp'/> <return type-id='95e97e5e'/> </function-decl> + <function-decl name='zpool_prop_get_feature' mangled-name='zpool_prop_get_feature' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_prop_get_feature'> + <parameter type-id='4c81de99' name='zhp'/> + <parameter type-id='80f4b756' name='propname'/> + <parameter type-id='26a90f95' name='buf'/> + <parameter type-id='b59d7dce' name='len'/> + <return type-id='95e97e5e'/> + </function-decl> <function-decl name='zpool_get_state' mangled-name='zpool_get_state' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_get_state'> <parameter type-id='4c81de99' name='zhp'/> <return type-id='95e97e5e'/> @@ -6710,6 +6909,11 @@ <parameter type-id='95e97e5e' name='flags'/> <return type-id='95e97e5e'/> </function-decl> + <function-decl name='zpool_initialize_one' mangled-name='zpool_initialize_one' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_initialize_one'> + <parameter type-id='4c81de99' name='zhp'/> + <parameter type-id='eaa32e2f' name='data'/> + <return type-id='95e97e5e'/> + </function-decl> <function-decl name='zpool_initialize' mangled-name='zpool_initialize' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_initialize'> <parameter type-id='4c81de99' name='zhp'/> <parameter type-id='7063e1ab' name='cmd_type'/> @@ 
-6722,6 +6926,17 @@ <parameter type-id='5ce45b60' name='vds'/> <return type-id='95e97e5e'/> </function-decl> + <function-decl name='zpool_collect_leaves' mangled-name='zpool_collect_leaves' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_collect_leaves'> + <parameter type-id='4c81de99' name='zhp'/> + <parameter type-id='5ce45b60' name='nvroot'/> + <parameter type-id='5ce45b60' name='res'/> + <return type-id='48b5725f'/> + </function-decl> + <function-decl name='zpool_trim_one' mangled-name='zpool_trim_one' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_trim_one'> + <parameter type-id='4c81de99' name='zhp'/> + <parameter type-id='eaa32e2f' name='data'/> + <return type-id='95e97e5e'/> + </function-decl> <function-decl name='zpool_trim' mangled-name='zpool_trim' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_trim'> <parameter type-id='4c81de99' name='zhp'/> <parameter type-id='b1146b8d' name='cmd_type'/> @@ -6735,6 +6950,14 @@ <parameter type-id='b51cf3c2' name='cmd'/> <return type-id='95e97e5e'/> </function-decl> + <function-decl name='zpool_scan_range' mangled-name='zpool_scan_range' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_scan_range'> + <parameter type-id='4c81de99' name='zhp'/> + <parameter type-id='7313fbe2' name='func'/> + <parameter type-id='b51cf3c2' name='cmd'/> + <parameter type-id='c9d12d66' name='date_start'/> + <parameter type-id='c9d12d66' name='date_end'/> + <return type-id='95e97e5e'/> + </function-decl> <function-decl name='zpool_find_vdev_by_physpath' mangled-name='zpool_find_vdev_by_physpath' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_find_vdev_by_physpath'> <parameter type-id='4c81de99' name='zhp'/> <parameter type-id='80f4b756' name='ppath'/> @@ -9394,8 +9617,8 @@ </function-decl> </abi-instr> <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'> - <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='19712' id='fd4573e5'> - <subrange length='44' type-id='7359adad' id='cf8ba455'/> + <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='21056' id='fd43354e'> + <subrange length='47' type-id='7359adad' id='8f8900fe'/> </array-type-def> <enum-decl name='zfeature_flags' id='6db816a4'> <underlying-type type-id='9cac1fee'/> @@ -9403,6 +9626,7 @@ <enumerator name='ZFEATURE_FLAG_MOS' value='2'/> <enumerator name='ZFEATURE_FLAG_ACTIVATE_ON_ENABLE' value='4'/> <enumerator name='ZFEATURE_FLAG_PER_DATASET' value='8'/> + <enumerator name='ZFEATURE_FLAG_NO_UPGRADE' value='16'/> </enum-decl> <typedef-decl name='zfeature_flags_t' type-id='6db816a4' id='fc329033'/> <enum-decl name='zfeature_type' id='c4fa2355'> @@ -9472,7 +9696,7 @@ <pointer-type-def type-id='611586a1' size-in-bits='64' id='2e243169'/> <qualified-type-def type-id='eaa32e2f' const='yes' id='83be723c'/> <pointer-type-def type-id='83be723c' size-in-bits='64' id='7acd98a2'/> - <var-decl name='spa_feature_table' type-id='fd4573e5' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/> + <var-decl name='spa_feature_table' type-id='fd43354e' mangled-name='spa_feature_table' visibility='default' elf-symbol-id='spa_feature_table'/> <var-decl name='zfeature_checks_disable' type-id='c19b74c3' mangled-name='zfeature_checks_disable' visibility='default' elf-symbol-id='zfeature_checks_disable'/> <function-decl name='opendir' visibility='default' binding='global' size-in-bits='64'> 
<parameter type-id='80f4b756'/> diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c b/sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c index 8907802ec259..b34a44c30eb4 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c @@ -584,7 +584,7 @@ get_key_material_https(libzfs_handle_t *hdl, const char *uri, goto end; } - int kfd = -1; + int kfd; #ifdef O_TMPFILE kfd = open(getenv("TMPDIR") ?: "/tmp", O_RDWR | O_TMPFILE | O_EXCL | O_CLOEXEC, 0600); diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c index 91560b40b022..e1b91fc47291 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c @@ -1039,7 +1039,6 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, nvlist_t *ret; int chosen_normal = -1; int chosen_utf = -1; - int set_maxbs = 0; if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); @@ -1258,46 +1257,20 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } - /* save the ZFS_PROP_RECORDSIZE during create op */ - if (zpool_hdl == NULL && prop == ZFS_PROP_RECORDSIZE) { - set_maxbs = intval; - } break; } case ZFS_PROP_SPECIAL_SMALL_BLOCKS: { - int maxbs = - set_maxbs == 0 ? SPA_OLD_MAXBLOCKSIZE : set_maxbs; + int maxbs = SPA_MAXBLOCKSIZE; char buf[64]; - if (zpool_hdl != NULL) { - char state[64] = ""; - - maxbs = zpool_get_prop_int(zpool_hdl, - ZPOOL_PROP_MAXBLOCKSIZE, NULL); - - /* - * Issue a warning but do not fail so that - * tests for settable properties succeed. - */ - if (zpool_prop_get_feature(zpool_hdl, - "feature@allocation_classes", state, - sizeof (state)) != 0 || - strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { - (void) fprintf(stderr, gettext( - "%s: property requires a special " - "device in the pool\n"), propname); - } - } - if (intval != 0 && - (intval < SPA_MINBLOCKSIZE || - intval > maxbs || !ISP2(intval))) { + if (intval > SPA_MAXBLOCKSIZE) { zfs_nicebytes(maxbs, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid '%s=%llu' property: must be zero " - "or a power of 2 from 512B to %s"), - propname, (unsigned long long)intval, buf); + "invalid '%s' property: must be between " + "zero and %s"), + propname, buf); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_diff.c b/sys/contrib/openzfs/lib/libzfs/libzfs_diff.c index 6aa0375f98d7..5f50bce531f7 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_diff.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_diff.c @@ -81,7 +81,7 @@ get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, /* we can get stats even if we failed to get a path */ (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t)); if (error == 0) { - ASSERT(di->zerr == 0); + ASSERT0(di->zerr); (void) strlcpy(pn, zc.zc_value, maxlen); return (0); } @@ -404,7 +404,7 @@ write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name)); zc.zc_obj = dr->ddr_first - 1; - ASSERT(di->zerr == 0); + ASSERT0(di->zerr); while (zc.zc_obj < dr->ddr_last) { int err; diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_import.c b/sys/contrib/openzfs/lib/libzfs/libzfs_import.c index 599e8e6f7819..7f276e9592c9 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_import.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_import.c @@ -122,7 +122,7 @@ 
const pool_config_ops_t libzfs_config_ops = { static uint64_t label_offset(uint64_t size, int l) { - ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); + ASSERT0(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t)); return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); } diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c b/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c index 2a81b658d342..5c9e2199eed4 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_mount.c @@ -516,7 +516,7 @@ zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags, } else if (rc == ENOTSUP) { int spa_version; - VERIFY(zfs_spa_version(zhp, &spa_version) == 0); + VERIFY0(zfs_spa_version(zhp, &spa_version)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Can't mount a version %llu " "file system on a version %d pool. Pool must be" diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c index 6f8fb994f814..ce154ae1a4cd 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c @@ -31,6 +31,7 @@ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> * Copyright (c) 2021, 2023, Klara Inc. + * Copyright (c) 2025 Hewlett Packard Enterprise Development LP. */ #include <errno.h> @@ -896,7 +897,7 @@ int zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) { zfs_cmd_t zc = {"\0"}; - int ret = -1; + int ret; char errbuf[ERRBUFLEN]; nvlist_t *nvl = NULL; nvlist_t *realprops; @@ -1422,30 +1423,6 @@ zpool_get_state(zpool_handle_t *zhp) } /* - * Check if vdev list contains a special vdev - */ -static boolean_t -zpool_has_special_vdev(nvlist_t *nvroot) -{ - nvlist_t **child; - uint_t children; - - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, - &children) == 0) { - for (uint_t c = 0; c < children; c++) { - const char *bias; - - if (nvlist_lookup_string(child[c], - ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 && - strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) { - return (B_TRUE); - } - } - } - return (B_FALSE); -} - -/* * Check if vdev list contains a dRAID vdev */ static boolean_t @@ -1548,16 +1525,6 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, goto create_failed; } - if (nvlist_exists(zc_fsprops, - zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) && - !zpool_has_special_vdev(nvroot)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "%s property requires a special vdev"), - zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto create_failed; - } - if (!zc_props && (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { goto create_failed; @@ -2470,6 +2437,30 @@ xlate_init_err(int err) return (err); } +int +zpool_initialize_one(zpool_handle_t *zhp, void *data) +{ + int error; + libzfs_handle_t *hdl = zpool_get_handle(zhp); + const char *pool_name = zpool_get_name(zhp); + if (zpool_open_silent(hdl, pool_name, &zhp) != 0) + return (-1); + initialize_cbdata_t *cb = data; + nvlist_t *vdevs = fnvlist_alloc(); + + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + zpool_collect_leaves(zhp, nvroot, vdevs); + if (cb->wait) + error = zpool_initialize_wait(zhp, cb->cmd_type, vdevs); + else + error = zpool_initialize(zhp, cb->cmd_type, vdevs); + fnvlist_free(vdevs); + + 
return (error); +} + /* * Begin, suspend, cancel, or uninit (clear) the initialization (initializing * of all free blocks) for the given vdevs in the given pool. @@ -2590,6 +2581,58 @@ xlate_trim_err(int err) return (err); } +void +zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) +{ + libzfs_handle_t *hdl = zhp->zpool_hdl; + uint_t children = 0; + nvlist_t **child; + uint_t i; + + (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (children == 0) { + char *path = zpool_vdev_name(hdl, zhp, nvroot, + VDEV_NAME_PATH); + + if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && + strcmp(path, VDEV_TYPE_HOLE) != 0) + fnvlist_add_boolean(res, path); + + free(path); + return; + } + + for (i = 0; i < children; i++) { + zpool_collect_leaves(zhp, child[i], res); + } +} + +int +zpool_trim_one(zpool_handle_t *zhp, void *data) +{ + int error; + libzfs_handle_t *hdl = zpool_get_handle(zhp); + const char *pool_name = zpool_get_name(zhp); + if (zpool_open_silent(hdl, pool_name, &zhp) != 0) + return (-1); + + trim_cbdata_t *cb = data; + nvlist_t *vdevs = fnvlist_alloc(); + + /* no individual leaf vdevs specified, so add them all */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + + zpool_collect_leaves(zhp, nvroot, vdevs); + error = zpool_trim(zhp, cb->cmd_type, vdevs, &cb->trim_flags); + fnvlist_free(vdevs); + + return (error); +} + static int zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids) { @@ -2730,7 +2773,13 @@ out: * Scan the pool. */ int -zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) +zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) { + return (zpool_scan_range(zhp, func, cmd, 0, 0)); +} + +int +zpool_scan_range(zpool_handle_t *zhp, pool_scan_func_t func, + pool_scrub_cmd_t cmd, time_t date_start, time_t date_end) { char errbuf[ERRBUFLEN]; int err; @@ -2739,6 +2788,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) nvlist_t *args = fnvlist_alloc(); fnvlist_add_uint64(args, "scan_type", (uint64_t)func); fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd); + fnvlist_add_uint64(args, "scan_date_start", (uint64_t)date_start); + fnvlist_add_uint64(args, "scan_date_end", (uint64_t)date_end); err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL); fnvlist_free(args); @@ -4344,7 +4395,7 @@ zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid) libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvl = NULL; zfs_cmd_t zc = {"\0"}; - int error = -1; + int error; if (guid != NULL) { if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) @@ -5127,9 +5178,10 @@ zpool_load_compat(const char *compat, boolean_t *features, char *report, /* special cases (unset), "" and "off" => enable all features */ if (compat == NULL || compat[0] == '\0' || strcmp(compat, ZPOOL_COMPAT_OFF) == 0) { - if (features != NULL) + if (features != NULL) { for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_TRUE; + } if (report != NULL) strlcpy(report, gettext("all features enabled"), rlen); return (ZPOOL_COMPATIBILITY_OK); @@ -5497,6 +5549,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, /* Only use if provided by the RAIDZ VDEV above */ if (prop == VDEV_PROP_RAIDZ_EXPANDING) return (ENOENT); + if (prop == VDEV_PROP_SIT_OUT) + return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) @@ -5666,8 +5720,16 @@ 
zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname, nvlist_free(nvl); nvlist_free(outnvl); - if (ret) - (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); + if (ret) { + if (errno == ENOTSUP) { + zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, + "property not supported for this vdev")); + (void) zfs_error(zhp->zpool_hdl, EZFS_PROPTYPE, errbuf); + } else { + (void) zpool_standard_error(zhp->zpool_hdl, errno, + errbuf); + } + } return (ret); } diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c index 1ad10ebb3c15..77134d197904 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_sendrecv.c @@ -2505,7 +2505,7 @@ zfs_send_cb_impl(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, err = ENOENT; if (sdd.cleanup_fd != -1) { - VERIFY(0 == close(sdd.cleanup_fd)); + VERIFY0(close(sdd.cleanup_fd)); sdd.cleanup_fd = -1; } @@ -2531,7 +2531,7 @@ err_out: fnvlist_free(sdd.snapholds); if (sdd.cleanup_fd != -1) - VERIFY(0 == close(sdd.cleanup_fd)); + VERIFY0(close(sdd.cleanup_fd)); return (err); } @@ -5108,7 +5108,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, nvlist_t *holds, *errors = NULL; int cleanup_fd = -1; - VERIFY(0 == nvlist_alloc(&holds, 0, KM_SLEEP)); + VERIFY0(nvlist_alloc(&holds, 0, KM_SLEEP)); for (pair = nvlist_next_nvpair(snapholds_nvlist, NULL); pair != NULL; pair = nvlist_next_nvpair(snapholds_nvlist, pair)) { @@ -5560,7 +5560,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) *cp = '\0'; sendfs = nonpackage_sendfs; - VERIFY(finalsnap == NULL); + VERIFY0P(finalsnap); } return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_status.c b/sys/contrib/openzfs/lib/libzfs/libzfs_status.c index 1ee703968237..bdddefb92165 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_status.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_status.c @@ -484,7 +484,8 @@ check_status(nvlist_t *config, boolean_t isimport, } for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; - if (!fi->fi_zfs_mod_supported) + if (!fi->fi_zfs_mod_supported || + (fi->fi_flags & ZFEATURE_FLAG_NO_UPGRADE)) continue; if (c_features[i] && !nvlist_exists(feat, fi->fi_guid)) return (ZPOOL_STATUS_FEAT_DISABLED); diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_util.c b/sys/contrib/openzfs/lib/libzfs/libzfs_util.c index 4edddc2a759b..26f5135dff62 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_util.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_util.c @@ -776,6 +776,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
case ZFS_ERR_ASHIFT_MISMATCH: zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); break; + case ZFS_ERR_TOO_MANY_SITOUTS: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "too many disks " + "already sitting out")); + zfs_verror(hdl, EZFS_BUSY, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi index 63904f447e85..7464b3adb254 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi +++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi @@ -222,6 +222,10 @@ <elf-symbol name='spl_pagesize' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='strlcat' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='strlcpy' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_get' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_iter' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_lookup' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='zfs_tunable_set' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> </elf-function-symbols> <abi-instr address-size='64' path='lib/libspl/assert.c' language='LANG_C99'> <class-decl name='__va_list_tag' size-in-bits='192' is-struct='yes' visibility='default' id='d5027220'> @@ -613,7 +617,6 @@ <array-type-def dimensions='1' type-id='de572c22' size-in-bits='1472' id='6d3c2f42'> <subrange length='23' type-id='7359adad' id='fdd0f594'/> </array-type-def> - <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/> <array-type-def dimensions='1' type-id='3a47d82b' size-in-bits='256' id='a133ec23'> <subrange length='4' type-id='7359adad' id='16fe7105'/> </array-type-def> @@ -974,8 +977,6 @@ </abi-instr> <abi-instr address-size='64' path='lib/libspl/os/linux/gethostid.c' language='LANG_C99'> <type-decl name='long long unsigned int' size-in-bits='64' id='3a47d82b'/> - <pointer-type-def type-id='26a90f95' size-in-bits='64' id='9b23c9ad'/> - <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/> <function-decl name='strtoull' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='9d26089a'/> <parameter type-id='8c85230f'/> @@ -987,6 +988,13 @@ </function-decl> </abi-instr> <abi-instr address-size='64' path='lib/libspl/os/linux/getmntany.c' language='LANG_C99'> + <array-type-def dimensions='1' type-id='d315442e' size-in-bits='16' id='811205dc'> + <subrange length='1' type-id='7359adad' id='52f813b4'/> + </array-type-def> + <array-type-def dimensions='1' type-id='d3130597' size-in-bits='768' id='f63f23b9'> + <subrange length='12' type-id='7359adad' id='84827bdc'/> + </array-type-def> + <type-decl name='long long int' size-in-bits='64' id='1eb56b1e'/> <class-decl name='mnttab' size-in-bits='256' is-struct='yes' visibility='default' id='1b055409'> <data-member access='public' layout-offset-in-bits='0'> <var-decl name='mnt_special' type-id='26a90f95' visibility='default'/> @@ -1021,6 +1029,93 @@ <var-decl name='mnt_minor' type-id='3502e3ff' visibility='default'/> </data-member> </class-decl> + <typedef-decl name='__u16' 
type-id='8efea9e5' id='d315442e'/> + <typedef-decl name='__s32' type-id='95e97e5e' id='3158a266'/> + <typedef-decl name='__u32' type-id='f0981eeb' id='3f1a6b60'/> + <typedef-decl name='__s64' type-id='1eb56b1e' id='49659421'/> + <typedef-decl name='__u64' type-id='3a47d82b' id='d3130597'/> + <class-decl name='statx_timestamp' size-in-bits='128' is-struct='yes' visibility='default' id='94101016'> + <data-member access='public' layout-offset-in-bits='0'> + <var-decl name='tv_sec' type-id='49659421' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='tv_nsec' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='96'> + <var-decl name='__reserved' type-id='3158a266' visibility='default'/> + </data-member> + </class-decl> + <class-decl name='statx' size-in-bits='2048' is-struct='yes' visibility='default' id='720b04c5'> + <data-member access='public' layout-offset-in-bits='0'> + <var-decl name='stx_mask' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='32'> + <var-decl name='stx_blksize' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='stx_attributes' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='128'> + <var-decl name='stx_nlink' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='160'> + <var-decl name='stx_uid' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='192'> + <var-decl name='stx_gid' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='224'> + <var-decl name='stx_mode' type-id='d315442e' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='240'> + <var-decl name='__spare0' type-id='811205dc' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='256'> + <var-decl name='stx_ino' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='320'> + <var-decl name='stx_size' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='384'> + <var-decl name='stx_blocks' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='448'> + <var-decl name='stx_attributes_mask' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='512'> + <var-decl name='stx_atime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='640'> + <var-decl name='stx_btime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='768'> + <var-decl name='stx_ctime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='896'> + <var-decl name='stx_mtime' type-id='94101016' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1024'> + <var-decl name='stx_rdev_major' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1056'> + <var-decl 
name='stx_rdev_minor' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1088'> + <var-decl name='stx_dev_major' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1120'> + <var-decl name='stx_dev_minor' type-id='3f1a6b60' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1152'> + <var-decl name='stx_mnt_id' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1216'> + <var-decl name='__spare2' type-id='d3130597' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='1280'> + <var-decl name='__spare3' type-id='f63f23b9' visibility='default'/> + </data-member> + </class-decl> <class-decl name='mntent' size-in-bits='320' is-struct='yes' visibility='default' id='56fe4a37'> <data-member access='public' layout-offset-in-bits='0'> <var-decl name='mnt_fsname' type-id='26a90f95' visibility='default'/> @@ -1096,6 +1191,8 @@ <pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/> <pointer-type-def type-id='0bbec9cd' size-in-bits='64' id='62f7a03d'/> <qualified-type-def type-id='62f7a03d' restrict='yes' id='f1cadedf'/> + <pointer-type-def type-id='720b04c5' size-in-bits='64' id='936b8e35'/> + <qualified-type-def type-id='936b8e35' restrict='yes' id='31d265b7'/> <function-decl name='getmntent_r' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='e75a27e9'/> <parameter type-id='3cad23cd'/> @@ -1107,15 +1204,18 @@ <parameter type-id='822cd80b'/> <return type-id='95e97e5e'/> </function-decl> - <function-decl name='strcmp' visibility='default' binding='global' size-in-bits='64'> - <parameter type-id='80f4b756'/> - <parameter type-id='80f4b756'/> - <return type-id='95e97e5e'/> - </function-decl> <function-decl name='strerror' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='95e97e5e'/> <return type-id='26a90f95'/> </function-decl> + <function-decl name='statx' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='95e97e5e'/> + <parameter type-id='9d26089a'/> + <parameter type-id='95e97e5e'/> + <parameter type-id='f0981eeb'/> + <parameter type-id='31d265b7'/> + <return type-id='95e97e5e'/> + </function-decl> <function-decl name='stat64' visibility='default' binding='global' size-in-bits='64'> <parameter type-id='9d26089a'/> <parameter type-id='f1cadedf'/> @@ -1258,6 +1358,96 @@ <return type-id='48b5725f'/> </function-decl> </abi-instr> + <abi-instr address-size='64' path='lib/libspl/tunables.c' language='LANG_C99'> + <enum-decl name='zfs_tunable_type_t' naming-typedef-id='f50b1525' id='56905369'> + <underlying-type type-id='9cac1fee'/> + <enumerator name='ZFS_TUNABLE_TYPE_INT' value='0'/> + <enumerator name='ZFS_TUNABLE_TYPE_UINT' value='1'/> + <enumerator name='ZFS_TUNABLE_TYPE_ULONG' value='2'/> + <enumerator name='ZFS_TUNABLE_TYPE_U64' value='3'/> + <enumerator name='ZFS_TUNABLE_TYPE_STRING' value='4'/> + </enum-decl> + <typedef-decl name='zfs_tunable_type_t' type-id='56905369' id='f50b1525'/> + <enum-decl name='zfs_tunable_perm_t' naming-typedef-id='ada7336b' id='e80e6ebf'> + <underlying-type type-id='9cac1fee'/> + <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RW' value='0'/> + <enumerator name='ZFS_TUNABLE_PERM_ZMOD_RD' value='1'/> + </enum-decl> + <typedef-decl name='zfs_tunable_perm_t' type-id='e80e6ebf' id='ada7336b'/> + <class-decl name='zfs_tunable' 
size-in-bits='320' is-struct='yes' visibility='default' id='1a97ee0e'> + <data-member access='public' layout-offset-in-bits='0'> + <var-decl name='zt_name' type-id='80f4b756' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='64'> + <var-decl name='zt_varp' type-id='eaa32e2f' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='128'> + <var-decl name='zt_varsz' type-id='b59d7dce' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='192'> + <var-decl name='zt_type' type-id='f50b1525' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='224'> + <var-decl name='zt_perm' type-id='ada7336b' visibility='default'/> + </data-member> + <data-member access='public' layout-offset-in-bits='256'> + <var-decl name='zt_desc' type-id='80f4b756' visibility='default'/> + </data-member> + </class-decl> + <typedef-decl name='zfs_tunable_t' type-id='1a97ee0e' id='12bf5c5e'/> + <typedef-decl name='zfs_tunable_iter_t' type-id='7ef33f92' id='d8d5f4ab'/> + <typedef-decl name='intmax_t' type-id='5b475db0' id='e104d842'/> + <typedef-decl name='uintmax_t' type-id='04d82f4b' id='f8b828c9'/> + <typedef-decl name='__intmax_t' type-id='bd54fe1a' id='5b475db0'/> + <typedef-decl name='__uintmax_t' type-id='7359adad' id='04d82f4b'/> + <pointer-type-def type-id='26a90f95' size-in-bits='64' id='9b23c9ad'/> + <qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/> + <qualified-type-def type-id='12bf5c5e' const='yes' id='180e47ee'/> + <pointer-type-def type-id='180e47ee' size-in-bits='64' id='a27af98c'/> + <pointer-type-def type-id='92f86508' size-in-bits='64' id='7ef33f92'/> + <function-decl name='strtoimax' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='9d26089a'/> + <parameter type-id='8c85230f'/> + <parameter type-id='95e97e5e'/> + <return type-id='e104d842'/> + </function-decl> + <function-decl name='strtoumax' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='9d26089a'/> + <parameter type-id='8c85230f'/> + <parameter type-id='95e97e5e'/> + <return type-id='f8b828c9'/> + </function-decl> + <function-decl name='strcmp' visibility='default' binding='global' size-in-bits='64'> + <parameter type-id='80f4b756'/> + <parameter type-id='80f4b756'/> + <return type-id='95e97e5e'/> + </function-decl> + <function-decl name='zfs_tunable_lookup' mangled-name='zfs_tunable_lookup' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_lookup'> + <parameter type-id='80f4b756' name='name'/> + <return type-id='a27af98c'/> + </function-decl> + <function-decl name='zfs_tunable_iter' mangled-name='zfs_tunable_iter' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_iter'> + <parameter type-id='d8d5f4ab' name='cb'/> + <parameter type-id='eaa32e2f' name='arg'/> + <return type-id='48b5725f'/> + </function-decl> + <function-decl name='zfs_tunable_set' mangled-name='zfs_tunable_set' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_set'> + <parameter type-id='a27af98c' name='zt'/> + <parameter type-id='80f4b756' name='val'/> + <return type-id='95e97e5e'/> + </function-decl> + <function-decl name='zfs_tunable_get' mangled-name='zfs_tunable_get' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zfs_tunable_get'> + <parameter type-id='a27af98c' name='zt'/> + <parameter type-id='26a90f95' name='val'/> + 
<parameter type-id='b59d7dce' name='valsz'/> + <return type-id='95e97e5e'/> + </function-decl> + <function-type size-in-bits='64' id='92f86508'> + <parameter type-id='a27af98c'/> + <parameter type-id='eaa32e2f'/> + <return type-id='95e97e5e'/> + </function-type> + </abi-instr> <abi-instr address-size='64' path='lib/libzfs_core/libzfs_core.c' language='LANG_C99'> <array-type-def dimensions='1' type-id='03085adc' size-in-bits='192' id='083f8d58'> <subrange length='3' type-id='7359adad' id='56f209d2'/> diff --git a/sys/contrib/openzfs/lib/libzpool/Makefile.am b/sys/contrib/openzfs/lib/libzpool/Makefile.am index 8553b377a760..aeacc595b363 100644 --- a/sys/contrib/openzfs/lib/libzpool/Makefile.am +++ b/sys/contrib/openzfs/lib/libzpool/Makefile.am @@ -177,6 +177,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/zfs_byteswap.c \ module/zfs/zfs_chksum.c \ module/zfs/zfs_debug_common.c \ + module/zfs/zfs_crrd.c \ module/zfs/zfs_fm.c \ module/zfs/zfs_fuid.c \ module/zfs/zfs_ratelimit.c \ @@ -199,7 +200,7 @@ libzpool_la_LIBADD = \ libzstd.la \ libzutil.la -libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm +libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -lm libzpool_la_LDFLAGS = -pthread diff --git a/sys/contrib/openzfs/lib/libzpool/abd_os.c b/sys/contrib/openzfs/lib/libzpool/abd_os.c index 0d5795de143a..8bd7a64ab24a 100644 --- a/sys/contrib/openzfs/lib/libzpool/abd_os.c +++ b/sys/contrib/openzfs/lib/libzpool/abd_os.c @@ -302,7 +302,7 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { - ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0P(aiter->iter_mapaddr); ASSERT0(aiter->iter_mapsize); if (abd_iter_at_end(aiter)) @@ -315,7 +315,7 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount) void abd_iter_map(struct abd_iter *aiter) { - ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0P(aiter->iter_mapaddr); ASSERT0(aiter->iter_mapsize); if (abd_iter_at_end(aiter)) diff --git a/sys/contrib/openzfs/lib/libzpool/kernel.c b/sys/contrib/openzfs/lib/libzpool/kernel.c index e397fc851cc1..8ed374627264 100644 --- a/sys/contrib/openzfs/lib/libzpool/kernel.c +++ b/sys/contrib/openzfs/lib/libzpool/kernel.c @@ -38,6 +38,7 @@ #include <sys/processor.h> #include <sys/rrwlock.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/stat.h> #include <sys/systeminfo.h> #include <sys/time.h> @@ -369,7 +370,7 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) if (delta <= 0) return (-1); - VERIFY(gettimeofday(&tv, NULL) == 0); + VERIFY0(gettimeofday(&tv, NULL)); ts.tv_sec = tv.tv_sec + delta / hz; ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz); @@ -811,6 +812,79 @@ umem_out_of_memory(void) return (0); } +static void +spa_config_load(void) +{ + void *buf = NULL; + nvlist_t *nvlist, *child; + nvpair_t *nvpair; + char *pathname; + zfs_file_t *fp; + zfs_file_attr_t zfa; + uint64_t fsize; + int err; + + /* + * Open the configuration file. + */ + pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); + + err = zfs_file_open(pathname, O_RDONLY, 0, &fp); + if (err) + err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); + + kmem_free(pathname, MAXPATHLEN); + + if (err) + return; + + if (zfs_file_getattr(fp, &zfa)) + goto out; + + fsize = zfa.zfa_size; + buf = kmem_alloc(fsize, KM_SLEEP); + + /* + * Read the nvlist from the file. + */ + if (zfs_file_read(fp, buf, fsize, NULL) < 0) + goto out; + + /* + * Unpack the nvlist. 
+ */ + if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) + goto out; + + /* + * Iterate over all elements in the nvlist, creating a new spa_t for + * each one with the specified configuration. + */ + mutex_enter(&spa_namespace_lock); + nvpair = NULL; + while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { + if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) + continue; + + child = fnvpair_value_nvlist(nvpair); + + if (spa_lookup(nvpair_name(nvpair)) != NULL) + continue; + (void) spa_add(nvpair_name(nvpair), child, NULL); + } + mutex_exit(&spa_namespace_lock); + + nvlist_free(nvlist); + +out: + if (buf != NULL) + kmem_free(buf, fsize); + + zfs_file_close(fp); +} + void kernel_init(int mode) { @@ -835,6 +909,7 @@ kernel_init(int mode) zstd_init(); spa_init((spa_mode_t)mode); + spa_config_load(); fletcher_4_init(); @@ -1025,25 +1100,13 @@ spl_fstrans_unmark(fstrans_cookie_t cookie) } int -__spl_pf_fstrans_check(void) -{ - return (0); -} - -int kmem_cache_reap_active(void) { return (0); } void -zvol_create_minor(const char *name) -{ - (void) name; -} - -void -zvol_create_minors_recursive(const char *name) +zvol_create_minors(const char *name) { (void) name; } @@ -1073,8 +1136,8 @@ zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { - int fd = -1; - int dump_fd = -1; + int fd; + int dump_fd; int err; int old_umask = 0; zfs_file_t *fp; @@ -1175,7 +1238,7 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, - size_t count, loff_t pos, ssize_t *resid) + size_t count, loff_t pos, uint8_t ashift, ssize_t *resid) { ssize_t rc, split, done; int sectors; @@ -1185,8 +1248,8 @@ zfs_file_pwrite(zfs_file_t *fp, const void *buf, * system calls so that the process can be killed in between. * This is used by ztest to simulate realistic failure modes. */ - sectors = count >> SPA_MINBLOCKSHIFT; - split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; + sectors = count >> ashift; + split = (sectors > 0 ? rand() % sectors : 0) << ashift; rc = pwrite64(fp->f_fd, buf, split, pos); if (rc != -1) { done = rc; diff --git a/sys/contrib/openzfs/lib/libzpool/util.c b/sys/contrib/openzfs/lib/libzpool/util.c index a297daedbd4d..66d6f43967d5 100644 --- a/sys/contrib/openzfs/lib/libzpool/util.c +++ b/sys/contrib/openzfs/lib/libzpool/util.c @@ -36,7 +36,7 @@ #include <sys/fs/zfs.h> #include <sys/zfs_refcount.h> #include <sys/zfs_ioctl.h> -#include <dlfcn.h> +#include <sys/tunables.h> #include <libzutil.h> /* @@ -137,12 +137,10 @@ show_pool_stats(spa_t *spa) nvlist_t *config, *nvroot; const char *name; - VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0); + VERIFY0(spa_get_stats(spa_name(spa), &config, NULL, 0)); - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); + VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)); + VERIFY0(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name)); show_vdev_stats(name, ZPOOL_CONFIG_CHILDREN, nvroot, 0); show_vdev_stats(NULL, ZPOOL_CONFIG_L2CACHE, nvroot, 0); @@ -151,97 +149,119 @@ show_pool_stats(spa_t *spa) nvlist_free(config); } -/* *k_out must be freed by the caller */ +/* + * Common helper for working with libzpool tunables from the command line. 
+ * + * Valid inputs: + * + * <name> show named tunable and value + * <name>=<value> set tunable value + * + * show show all tunables and values + * show=<name> show named tunable and value + * info show info about all tunables + * info=<name> show info about named tunable + */ + +typedef enum { SHOW, INFO, SET } tunable_mode_t; + static int -set_global_var_parse_kv(const char *arg, char **k_out, u_longlong_t *v_out) +list_tunables_cb(const zfs_tunable_t *tunable, void *arg) { - int err; - VERIFY(arg); - char *d = strdup(arg); - - char *save = NULL; - char *k = strtok_r(d, "=", &save); - char *v_str = strtok_r(NULL, "=", &save); - char *follow = strtok_r(NULL, "=", &save); - if (k == NULL || v_str == NULL || follow != NULL) { - err = EINVAL; - goto err_free; - } - - u_longlong_t val = strtoull(v_str, NULL, 0); - if (val > UINT32_MAX) { - fprintf(stderr, "Value for global variable '%s' must " - "be a 32-bit unsigned integer, got '%s'\n", k, v_str); - err = EOVERFLOW; - goto err_free; + const tunable_mode_t *mode = arg; + + static const char *type[] = { + "int", "uint", "ulong", "u64", "str", + }; + static const char *perm[] = { + "rw", "rd", + }; + + if (*mode == SHOW) { + char val[64]; + int err = zfs_tunable_get(tunable, val, sizeof (val)); + if (err == 0) + printf("%s: %s\n", tunable->zt_name, val); + else + printf("%s: [error getting tunable value: %s]\n", + tunable->zt_name, strerror(err)); + } else { + printf("%s [%s %s]: %s\n", tunable->zt_name, + type[tunable->zt_type], perm[tunable->zt_perm], + tunable->zt_desc); } - *k_out = strdup(k); - *v_out = val; - free(d); return (0); - -err_free: - free(d); - - return (err); } - -/* - * Sets given global variable in libzpool to given unsigned 32-bit value. - * arg: "<variable>=<value>" - */ int -set_global_var(char const *arg) +handle_tunable_option(const char *_arg, boolean_t quiet) { - void *zpoolhdl; - char *varname; - u_longlong_t val; - int ret; - -#ifndef _ZFS_LITTLE_ENDIAN - /* - * On big endian systems changing a 64-bit variable would set the high - * 32 bits instead of the low 32 bits, which could cause unexpected - * results. 
- */ - fprintf(stderr, "Setting global variables is only supported on " - "little-endian systems\n"); - ret = ENOTSUP; - goto out_ret; -#endif + int err = 0; + char *arg = strdup(_arg); + char *k, *v; + + v = arg; + k = strsep(&v, "="); + + tunable_mode_t mode; + + if (strcmp(k, "show") == 0) { + mode = SHOW; + k = v; + } else if (strcmp(k, "info") == 0) { + mode = INFO; + k = v; + } else if (v == NULL) { + mode = SHOW; + } else { + mode = SET; + } - if ((ret = set_global_var_parse_kv(arg, &varname, &val)) != 0) { - goto out_ret; + if (quiet && mode != SET) { + err = EINVAL; + goto out; } - zpoolhdl = dlopen("libzpool.so", RTLD_LAZY); - if (zpoolhdl != NULL) { - uint32_t *var; - var = dlsym(zpoolhdl, varname); - if (var == NULL) { - fprintf(stderr, "Global variable '%s' does not exist " - "in libzpool.so\n", varname); - ret = EINVAL; - goto out_dlclose; + if (mode == SET) { + const zfs_tunable_t *tunable = zfs_tunable_lookup(k); + if (tunable == NULL) { + err = ENOENT; + goto out; } - *var = (uint32_t)val; + char vold[256], vnew[256]; + if (zfs_tunable_get(tunable, vold, sizeof (vold)) != 0) + strcpy(vold, "???"); + err = zfs_tunable_set(tunable, v); + if (err != 0) + goto out; + if (zfs_tunable_get(tunable, vnew, sizeof (vnew)) != 0) + strcpy(vnew, "???"); + + if (!quiet) + printf("%s: %s -> %s\n", k, vold, vnew); + } else if (k != NULL) { + const zfs_tunable_t *tunable = zfs_tunable_lookup(k); + if (tunable == NULL) { + err = ENOENT; + goto out; + } + list_tunables_cb(tunable, &mode); } else { - fprintf(stderr, "Failed to open libzpool.so to set global " - "variable\n"); - ret = EIO; - goto out_free; + zfs_tunable_iter(list_tunables_cb, &mode); } - ret = 0; +out: + if (!quiet) { + if (err == ENOENT) + fprintf(stderr, "no such tunable: %s\n", k); + else if (err != 0) + fprintf(stderr, "couldn't set tunable '%s': %s\n", + k, strerror(err)); + } -out_dlclose: - dlclose(zpoolhdl); -out_free: - free(varname); -out_ret: - return (ret); + free(arg); + return (err); } static nvlist_t * diff --git a/sys/contrib/openzfs/lib/libzutil/zutil_import.c b/sys/contrib/openzfs/lib/libzutil/zutil_import.c index ccdc874076c3..08367f4c064d 100644 --- a/sys/contrib/openzfs/lib/libzutil/zutil_import.c +++ b/sys/contrib/openzfs/lib/libzutil/zutil_import.c @@ -917,7 +917,7 @@ error: static uint64_t label_offset(uint64_t size, int l) { - ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); + ASSERT0(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t)); return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 
0 : size - VDEV_LABELS * sizeof (vdev_label_t))); } @@ -1769,7 +1769,7 @@ zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg) fnvlist_add_nvlist(pools, nvpair_name(pair), fnvpair_value_nvlist(pair)); - VERIFY3P(nvlist_next_nvpair(nv, pair), ==, NULL); + VERIFY0P(nvlist_next_nvpair(nv, pair)); iarg->guid = saved_guid; iarg->poolname = saved_poolname; @@ -1903,30 +1903,43 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp, *sepp = '\0'; pools = zpool_search_import(hdl, args); + if (pools == NULL) { + zutil_error_aux(hdl, dgettext(TEXT_DOMAIN, "no pools found")); + (void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN, + "failed to find config for pool '%s'"), targetdup); + free(targetdup); + return (ENOENT); + } - if (pools != NULL) { - nvpair_t *elem = NULL; - while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { - VERIFY0(nvpair_value_nvlist(elem, &config)); - if (pool_match(config, targetdup)) { - count++; - if (match != NULL) { - /* multiple matches found */ - continue; - } else { - match = fnvlist_dup(config); - } + nvpair_t *elem = NULL; + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + VERIFY0(nvpair_value_nvlist(elem, &config)); + if (pool_match(config, targetdup)) { + count++; + if (match != NULL) { + /* multiple matches found */ + continue; + } else { + match = fnvlist_dup(config); } } - fnvlist_free(pools); } + fnvlist_free(pools); if (count == 0) { + zutil_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no matching pools")); + (void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN, + "failed to find config for pool '%s'"), targetdup); free(targetdup); return (ENOENT); } if (count > 1) { + zutil_error_aux(hdl, dgettext(TEXT_DOMAIN, + "more than one matching pool")); + (void) zutil_error_fmt(hdl, LPC_UNKNOWN, dgettext(TEXT_DOMAIN, + "failed to find config for pool '%s'"), targetdup); free(targetdup); fnvlist_free(match); return (EINVAL); diff --git a/sys/contrib/openzfs/man/Makefile.am b/sys/contrib/openzfs/man/Makefile.am index 6a7b2d3e46b7..7a63641c1c39 100644 --- a/sys/contrib/openzfs/man/Makefile.am +++ b/sys/contrib/openzfs/man/Makefile.am @@ -2,7 +2,7 @@ dist_noinst_man_MANS = \ %D%/man1/cstyle.1 dist_man_MANS = \ - %D%/man1/arcstat.1 \ + %D%/man1/zarcstat.1 \ %D%/man1/raidz_test.1 \ %D%/man1/test-runner.1 \ %D%/man1/zhack.1 \ @@ -124,10 +124,21 @@ dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS) SUBSTFILES += $(nodist_man_MANS) -CHECKS += mancheck -mancheck: - $(top_srcdir)/scripts/mancheck.sh $(srcdir)/%D% +MANFILES = $(dist_noinst_man_MANS) $(dist_man_MANS) $(nodist_man_MANS) + +PHONY += mancheck + +mancheck_verbose = $(mancheck_verbose_@AM_V@) +mancheck_verbose_ = $(mancheck_verbose_@AM_DEFAULT_V@) +mancheck_verbose_0 = @echo MANCHECK $(_MTGT); +_MTGT = $(subst ^,/,$(subst mancheck-,,$@)) +mancheck-%: + $(mancheck_verbose)scripts/mancheck.sh $(_MTGT) + +mancheck: $(foreach manfile, $(MANFILES), $(addprefix mancheck-,$(subst /,^,$(manfile)))) + +CHECKS += mancheck if BUILD_LINUX # The manual pager in most Linux distros defaults to "BSD" when .Os is blank, diff --git a/sys/contrib/openzfs/man/man1/cstyle.1 b/sys/contrib/openzfs/man/man1/cstyle.1 index 241c82edd5a8..8f29129ce175 100644 --- a/sys/contrib/openzfs/man/man1/cstyle.1 +++ b/sys/contrib/openzfs/man/man1/cstyle.1 @@ -21,7 +21,7 @@ .\" .\" CDDL HEADER END .\" -.Dd May 26, 2021 +.Dd April 4, 2022 .Dt CSTYLE 1 .Os . 
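For context, the tunable interface surfaced above (zfs_tunable_lookup, zfs_tunable_get, zfs_tunable_set and zfs_tunable_iter, pulled in via <sys/tunables.h> in util.c and visible in the ABI dump) can be driven along these lines. This is a minimal sketch and not part of the commit; the tunable name "zfs_flags", the header contents beyond the prototypes shown above, and the error handling are illustrative assumptions.

#include <stdio.h>
#include <sys/tunables.h>

/* Callback for zfs_tunable_iter(): print each tunable's current value. */
static int
print_tunable(const zfs_tunable_t *zt, void *arg)
{
	char val[64];

	(void) arg;
	if (zfs_tunable_get(zt, val, sizeof (val)) == 0)
		(void) printf("%s = %s  (%s)\n", zt->zt_name, val, zt->zt_desc);
	return (0);
}

static void
tunable_example(void)
{
	const zfs_tunable_t *zt;
	char val[64];

	/* Look up one tunable by name, set it, and read the value back. */
	zt = zfs_tunable_lookup("zfs_flags");	/* name is hypothetical */
	if (zt != NULL && zfs_tunable_set(zt, "512") == 0 &&
	    zfs_tunable_get(zt, val, sizeof (val)) == 0)
		(void) printf("%s is now %s\n", zt->zt_name, val);

	/* Walk every registered tunable. */
	zfs_tunable_iter(print_tunable, NULL);
}

The same string-based get/set pair is what handle_tunable_option() in util.c builds its show/info/set modes on, which is why zdb, zhack and ztest can now share one -o option syntax instead of poking 32-bit globals through dlsym().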
diff --git a/sys/contrib/openzfs/man/man1/arcstat.1 b/sys/contrib/openzfs/man/man1/zarcstat.1 index f2474fbb701f..3633c5d417fe 100644 --- a/sys/contrib/openzfs/man/man1/arcstat.1 +++ b/sys/contrib/openzfs/man/man1/zarcstat.1 @@ -13,12 +13,12 @@ .\" Copyright (c) 2015 by Delphix. All rights reserved. .\" Copyright (c) 2020 by AJ Jordan. All rights reserved. .\" -.Dd December 23, 2022 -.Dt ARCSTAT 1 +.Dd September 19, 2024 +.Dt ZARCSTAT 1 .Os . .Sh NAME -.Nm arcstat +.Nm zarcstat .Nd report ZFS ARC and L2ARC statistics .Sh SYNOPSIS .Nm diff --git a/sys/contrib/openzfs/man/man1/zhack.1 b/sys/contrib/openzfs/man/man1/zhack.1 index f58c0527649b..63658cf930e9 100644 --- a/sys/contrib/openzfs/man/man1/zhack.1 +++ b/sys/contrib/openzfs/man/man1/zhack.1 @@ -23,7 +23,7 @@ .\" .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS .\" -.Dd May 26, 2021 +.Dd May 3, 2023 .Dt ZHACK 1 .Os . @@ -122,6 +122,24 @@ Example: .Nm zhack Cm label repair Fl cu Ar device Fix checksums and undetach a device . +.It Xo +.Nm zhack +.Cm metaslab leak +.Op Fl f +.Ar pool +.Xc +Apply a fragmentation profile generated by +.Sy zdb +to the specified +.Ar pool Ns +\&. +.Pp +The +.Fl f +flag forces the profile to apply even if the vdevs in the +.Ar pool +don't have the same number of metaslabs as the fragmentation profile. +. .El . .Sh GLOBAL OPTIONS @@ -143,6 +161,8 @@ Search for members in .Ar dir . Can be specified more than once. +.It Fl o Ar var Ns = Ns Ar value +Set the given tunable to the provided value. .El . .Sh EXAMPLES diff --git a/sys/contrib/openzfs/man/man1/ztest.1 b/sys/contrib/openzfs/man/man1/ztest.1 index 0cbb58e40dd8..ae857bfea29c 100644 --- a/sys/contrib/openzfs/man/man1/ztest.1 +++ b/sys/contrib/openzfs/man/man1/ztest.1 @@ -24,7 +24,7 @@ .\" reserved. .\" Copyright (c) 2017, Intel Corporation. .\" -.Dd May 26, 2021 +.Dd July 12, 2025 .Dt ZTEST 1 .Os . @@ -188,12 +188,8 @@ i.e. given will be loaded. .It Fl C , -vdev-class-state Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy random No (default : Sy random ) The vdev allocation class state. -.It Fl o , -option Ns = Ns Ar variable Ns = Ns Ar value -Set global -.Ar variable -to an unsigned 32-bit integer -.Ar value -(little-endian only). +.It Fl o , -option Ns = Ns Ar var Ns = Ns Ar value Ns … +Set the given tunable to the provided value. .It Fl G , -dump-debug Dump zfs_dbgmsg buffer before exiting due to an error. .It Fl V , -verbose diff --git a/sys/contrib/openzfs/man/man4/spl.4 b/sys/contrib/openzfs/man/man4/spl.4 index 683f8e2b631f..61dfe42e463d 100644 --- a/sys/contrib/openzfs/man/man4/spl.4 +++ b/sys/contrib/openzfs/man/man4/spl.4 @@ -15,7 +15,7 @@ .\" .\" Copyright 2013 Turbo Fredriksson <turbo@bayour.com>. All rights reserved. .\" -.Dd August 24, 2020 +.Dd May 7, 2025 .Dt SPL 4 .Os . diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 67b2cef46e80..11bcbf430210 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -4,6 +4,7 @@ .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" Copyright (c) 2023, 2024, 2025, Klara, Inc. +.\" .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. 
You can obtain a copy of the license at @@ -17,7 +18,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd May 29, 2025 +.Dd September 15, 2025 .Dt ZFS 4 .Os . @@ -601,6 +602,42 @@ new format when enabling the feature. The default is to convert all log entries. . +.It Sy vdev_read_sit_out_secs Ns = Ns Sy 600 Ns s Po 10 min Pc Pq ulong +When a slow disk outlier is detected it is placed in a sit out state. +While sitting out the disk will not participate in normal reads, instead its +data will be reconstructed as needed from parity. +Scrub operations will always read from a disk, even if it's sitting out. +A number of disks in a RAID-Z or dRAID vdev may sit out at the same time, up +to the number of parity devices. +Writes will still be issued to a disk which is sitting out to maintain full +redundancy. +Defaults to 600 seconds and a value of zero disables disk sit-outs in general, +including slow disk outlier detection. +. +.It Sy vdev_raidz_outlier_check_interval_ms Ns = Ns Sy 1000 Ns ms Po 1 sec Pc Pq ulong +How often each RAID-Z and dRAID vdev will check for slow disk outliers. +Increasing this interval will reduce the sensitivity of detection (since all +I/Os since the last check are included in the statistics), but will slow the +response to a disk developing a problem. +Defaults to once per second; setting extremely small values may cause negative +performance effects. +. +.It Sy vdev_raidz_outlier_insensitivity Ns = Ns Sy 50 Pq uint +When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is +used to determine how far out an outlier must be before it counts as an event +worth consdering. +This is phrased as "insensitivity" because larger values result in fewer +detections. +Smaller values will result in more aggressive sitting out of disks that may have +problems, but may significantly increase the rate of spurious sit-outs. +.Pp +To provide a more technical definition of this parameter, this is the multiple +of the inter-quartile range (IQR) that is being used in a Tukey's Fence +detection algorithm. +This is much higher than a normal Tukey's Fence k-value, because the +distribution under consideration is probably an extreme-value distribution, +rather than a more typical Gaussian distribution. +. .It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. @@ -941,10 +978,6 @@ The target number of bytes the ARC should leave as free memory on the system. If zero, equivalent to the bigger of .Sy 512 KiB No and Sy all_system_memory/64 . . -.It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int -Disable pool import at module load by ignoring the cache file -.Pq Sy spa_config_path . -. .It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint Rate limit checksum events to this many per second. Note that this should not be set below the ZED thresholds @@ -1399,14 +1432,15 @@ If this setting is 0, then even if feature@block_cloning is enabled, using functions and system calls that attempt to clone blocks will act as though the feature is disabled. . -.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int -When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be -written to disk. 
-This allows the clone operation to reliably succeed when a file is +.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 1 Ns | Ns 0 Pq int +When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty +data to be written to disk before proceeding. +This ensures that the clone operation reliably succeeds, even if a file is modified and then immediately cloned. -For small files this may be slower than making a copy of the file. -Therefore, this setting defaults to 0 which causes a clone operation to -immediately fail when encountering a dirty block. +Note that for small files this may be slower than simply copying the file. +When set to 0 the clone operation will immediately fail if it encounters +any dirty blocks. +By default waiting is enabled. . .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. @@ -1713,10 +1747,19 @@ Similar to but for cleanup of old indirection records for removed vdevs. . .It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq s64 -Largest data block to write to the ZIL. -Larger blocks will be treated as if the dataset being written to had the -.Sy logbias Ns = Ns Sy throughput -property set. +Largest write size to store the data directly into the ZIL if +.Sy logbias Ns = Ns Sy latency . +Larger writes may be written indirectly similar to +.Sy logbias Ns = Ns Sy throughput . +In presence of SLOG this parameter is ignored, as if it was set to infinity, +storing all written data into ZIL to not depend on regular vdev latency. +. +.It Sy zil_special_is_slog Ns = Ns Sy 1 Ns | Ns 0 Pq int +When enabled, and written blocks go to normal vdevs, treat present special +vdevs as SLOGs. +Blocks that go to the special vdevs are still written indirectly, as with +.Sy logbias Ns = Ns Sy throughput . +This parameter is ignored if an SLOG is present. . .It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64 Pattern written to vdev free space by @@ -2237,6 +2280,21 @@ Defer frees starting in this pass. Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. . +.It Sy zfs_spa_note_txg_time Ns = Ns Sy 600 Pq uint +This parameter defines, in seconds, how often the TXG time database will record +a new TXG if it has changed. +After the specified time interval has passed, and if the TXG number has changed, +the new value is recorded in the database. +These timestamps can later be used for more granular operations, such as +scrubbing. +. +.It Sy zfs_spa_flush_txg_time Ns = Ns Sy 600 Pq uint +This parameter defines, in seconds, how often the ZFS will flush +the TXG time database to disk. +It ensures that the data is actually written to persistent storage, which helps +preserve the database in case of unexpected shutdown. +The database is also automatically flushed during the export sequence. +. .It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint Only allow small data blocks to be allocated on the special and dedup vdev types when the available free space percentage on these vdevs exceeds this @@ -2462,8 +2520,8 @@ code for this record type. The tunable has no effect if the feature is disabled. . .It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq uint -Usually, one metaslab from each normal-class vdev is dedicated for use by -the ZIL to log synchronous writes. +Usually, one metaslab from each normal and special class vdev is dedicated +for use by the ZIL to log synchronous writes. 
However, if there are fewer than .Sy zfs_embedded_slog_min_ms metaslabs in the vdev, this functionality is disabled. @@ -2525,6 +2583,49 @@ the xattr so as to not accumulate duplicates. .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int Prioritize requeued I/O. . +.It Sy zfs_delete_inode Ns = Ns Sy 0 Ns | Ns 1 Pq int +Sets whether the kernel should free an inode structure when the last reference +is released, or cache it in memory. +Intended for testing/debugging. +.Pp +A live inode structure "pins" versious internal OpenZFS structures in memory, +which can result in large amounts of "unusable" memory on systems with lots of +infrequently-accessed files, until the kernel's memory pressure mechanism +asks OpenZFS to release them. +.Pp +The default value of +.Sy 0 +always caches inodes that appear to still exist on disk. +Setting it to +.Sy 1 +will immediately release unused inodes and their associated memory back to the +dbuf cache or the ARC for reuse, but may reduce performance if inodes are +frequently evicted and reloaded. +.Pp +This parameter is only available on Linux. +. +.It Sy zfs_delete_dentry Ns = Ns Sy 0 Ns | Ns 1 Pq int +Sets whether the kernel should free a dentry structure when it is no longer +required, or hold it in the dentry cache. +Intended for testing/debugging. +. +Since a dentry structure holds an inode reference, a cached dentry can "pin" +an inode in memory indefinitely, along with associated OpenZFS structures (See +.Sy zfs_delete_inode ) . +.Pp +The default value of +.Sy 0 +instructs the kernel to cache entries and their associated inodes when they +are no longer directly referenced. +They will be reclaimed as part of the kernel's normal cache management +processes. +Setting it to +.Sy 1 +will instruct the kernel to release directory entries and their inodes as soon +as they are no longer referenced by the filesystem. +.Pp +This parameter is only available on Linux. +. .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. These workers are responsible for I/O work such as compression, encryption, diff --git a/sys/contrib/openzfs/man/man5/vdev_id.conf.5 b/sys/contrib/openzfs/man/man5/vdev_id.conf.5 index d2f817631c15..299a23720201 100644 --- a/sys/contrib/openzfs/man/man5/vdev_id.conf.5 +++ b/sys/contrib/openzfs/man/man5/vdev_id.conf.5 @@ -9,7 +9,7 @@ .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" -.Dd May 26, 2021 +.Dd October 8, 2024 .Dt VDEV_ID.CONF 5 .Os . diff --git a/sys/contrib/openzfs/man/man7/dracut.zfs.7 b/sys/contrib/openzfs/man/man7/dracut.zfs.7 index fb5da553af6e..3d051d4d3343 100644 --- a/sys/contrib/openzfs/man/man7/dracut.zfs.7 +++ b/sys/contrib/openzfs/man/man7/dracut.zfs.7 @@ -1,7 +1,7 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" SPDX-License-Identifier: 0BSD .\" -.Dd March 28, 2023 +.Dd July 13, 2024 .Dt DRACUT.ZFS 7 .Os . diff --git a/sys/contrib/openzfs/man/man7/vdevprops.7 b/sys/contrib/openzfs/man/man7/vdevprops.7 index acabe6b6613a..0fb28d7db13c 100644 --- a/sys/contrib/openzfs/man/man7/vdevprops.7 +++ b/sys/contrib/openzfs/man/man7/vdevprops.7 @@ -19,9 +19,9 @@ .\" .\" CDDL HEADER END .\" -.\" Copyright (c) 2021 Klara, Inc. +.\" Copyright (c) 2021, 2025, Klara, Inc. .\" -.Dd October 30, 2022 +.Dd July 23, 2024 .Dt VDEVPROPS 7 .Os . 
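The vdev_raidz_outlier_insensitivity text above phrases the slow-disk check as a Tukey's fence over per-disk latencies: a disk registers an outlier event when its latency exceeds the third quartile of its peers by more than the insensitivity multiple of the inter-quartile range. A minimal sketch of that test, assuming the quartiles have already been computed; the real in-kernel data model and bookkeeping are more involved than shown here.

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative only: latency counts as an outlier event when it exceeds
 * Q3 + k * (Q3 - Q1), where k is vdev_raidz_outlier_insensitivity.
 */
static bool
latency_is_outlier(uint64_t lat, uint64_t q1, uint64_t q3, uint64_t k)
{
	uint64_t iqr = q3 - q1;	/* inter-quartile range */

	return (lat > q3 + k * iqr);
}

A large default k (50) keeps the fence far out because, as the man page notes, I/O latencies look more like an extreme-value distribution than a Gaussian one, so a textbook k of 1.5 would sit out disks spuriously.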
@@ -106,11 +106,17 @@ The number of children belonging to this vdev .It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors The number of errors of each type encountered by this vdev .It Sy slow_ios -The number of slow I/Os encountered by this vdev, -These represent I/O operations that didn't complete in +This indicates the number of slow I/O operations encountered by this vdev. +A slow I/O is defined as an operation that did not complete within the .Sy zio_slow_io_ms -milliseconds +threshold in milliseconds .Pq Sy 30000 No by default . +For +.Sy RAIDZ +and +.Sy DRAID +configurations, this value also represents the number of times the vdev was +identified as an outlier and excluded from participating in read I/O operations. .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops The number of I/O operations of each type performed by this vdev .It Xo @@ -150,6 +156,31 @@ The amount of space to reserve for the EFI system partition .It Sy failfast If this device should propagate BIO errors back to ZFS, used to disable failfast. +.It Sy sit_out +Only valid for +.Sy RAIDZ +and +.Sy DRAID +vdevs. +True when a slow disk outlier was detected and the vdev is currently in a sit +out state. +This property can be manually set to cause vdevs to sit out. +It will also be automatically set by the +.Sy autosit +logic if that is enabled. +While sitting out, the vdev will not participate in normal reads, instead its +data will be reconstructed as needed from parity. +.It Sy autosit +Only valid for +.Sy RAIDZ +and +.Sy DRAID +vdevs. +If set, this enables the kernel-level slow disk detection logic. +This logic automatically causes any vdevs that are significant negative +performance outliers to sit out, as described in the +.Sy sit_out +property. .It Sy path The path to the device for this vdev .It Sy allocating diff --git a/sys/contrib/openzfs/man/man7/zfsconcepts.7 b/sys/contrib/openzfs/man/man7/zfsconcepts.7 index 5c736e53670d..bb2178d85bcd 100644 --- a/sys/contrib/openzfs/man/man7/zfsconcepts.7 +++ b/sys/contrib/openzfs/man/man7/zfsconcepts.7 @@ -31,7 +31,7 @@ .\" Copyright 2019 Joyent, Inc. .\" Copyright 2023 Klara, Inc. .\" -.Dd October 6, 2023 +.Dd October 2, 2024 .Dt ZFSCONCEPTS 7 .Os . diff --git a/sys/contrib/openzfs/man/man7/zfsprops.7 b/sys/contrib/openzfs/man/man7/zfsprops.7 index ac58203f00a1..77e994b912b6 100644 --- a/sys/contrib/openzfs/man/man7/zfsprops.7 +++ b/sys/contrib/openzfs/man/man7/zfsprops.7 @@ -39,7 +39,7 @@ .\" Copyright (c) 2019, Kjeld Schouten-Lebbing .\" Copyright (c) 2022 Hewlett Packard Enterprise Development LP. .\" -.Dd June 29, 2024 +.Dd September 13, 2025 .Dt ZFSPROPS 7 .Os . @@ -541,10 +541,16 @@ The .Sy blocksize cannot be changed once the volume has been written, so it should be set at volume creation time. -The default -.Sy blocksize -for volumes is 16 KiB. -Any power of 2 from 512 bytes to 128 KiB is valid. +The size specified must be a power of two greater than or equal to +.Ar 512 B +and less than or equal to +.Ar 128 KiB . +If the +.Sy large_blocks +feature is enabled on the pool, the size may be up to +.Ar 16 MiB . +The default size is +.Ar 16 KiB . .Pp This property can also be referred to by its shortened column name, .Sy volblock . @@ -1186,18 +1192,26 @@ keylocation can be with either .Nm zfs Cm set or .Nm zfs Cm change-key . 
+.Pp If .Sy prompt -is selected ZFS will ask for the key at the command prompt when it is required -to access the encrypted data (see +is selected, ZFS will expect the key to be provided when it is required to +access the encrypted data (see .Nm zfs Cm load-key for details). -This setting will also allow the key to be passed in via the standard input -stream, -but users should be careful not to place keys which should be kept secret on -the command line. -If a file URI is selected, the key will be loaded from the +If stdin is a TTY, then ZFS will ask for the key to be provided. +Otherwise, stdin is expected to be the key to use and will be processed as such. +Users should be careful not to place keys which should be kept secret on the +command line, as most operating systems may expose command line arguments to +other processes. +If the +.Dq raw +.Sy keyformat +was used, then the key must be provided via stdin. +.Pp +If a file URL is selected, the key will be loaded from the specified absolute file path. +.Pp If an HTTPS or HTTP URL is selected, it will be GETted using .Xr fetch 3 , libcurl, or nothing, depending on compile-time configuration and run-time @@ -1282,10 +1296,12 @@ This feature must be enabled to be used .It Sy special_small_blocks Ns = Ns Ar size This value represents the threshold block size for including small file or zvol blocks into the special allocation class. -Blocks smaller than or equal to this -value will be assigned to the special allocation class while greater blocks -will be assigned to the regular class. -Valid values are zero or a power of two from 512 up to 1048576 (1 MiB). +Blocks smaller than or equal to this value after compression and encryption +will be assigned to the special allocation class, while greater blocks will +be assigned to the regular class. +Valid values are from 0 to maximum block size ( +.Ar 16 MiB +). The default size is 0 which means no small file or zvol blocks will be allocated in the special class. .Pp @@ -1569,11 +1585,6 @@ See .Xr zpool-features 7 for details on ZFS feature flags. .Pp -However, blocks larger than -.Ar 1 MiB -can have an impact on i/o latency (e.g. tying up a spinning disk for -~300ms), and also potentially on the memory allocator. -.Pp Note that maximum size is still limited by default to .Ar 1 MiB on x86_32, see @@ -1864,7 +1875,8 @@ property is updated with , the property is set to desired value, but the operation to share, reshare or unshare the the dataset is not performed. .It Sy logbias Ns = Ns Sy latency Ns | Ns Sy throughput -Provide a hint to ZFS about handling of synchronous requests in this dataset. +Provide a hint to ZFS about handling of synchronous write requests in this +dataset. If .Sy logbias is set to @@ -1872,12 +1884,12 @@ is set to .Pq the default , ZFS will use pool log devices .Pq if configured -to handle the requests at low latency. +to handle the write requests at low latency. If .Sy logbias is set to .Sy throughput , -ZFS will not use configured pool log devices. +ZFS will not use configured pool log devices to store written data. ZFS will instead optimize synchronous operations for global pool throughput and efficient use of resources. 
.It Sy snapdev Ns = Ns Sy hidden Ns | Ns Sy visible diff --git a/sys/contrib/openzfs/man/man7/zpool-features.7 b/sys/contrib/openzfs/man/man7/zpool-features.7 index 8ae1b2b3b923..b4404a6eb58d 100644 --- a/sys/contrib/openzfs/man/man7/zpool-features.7 +++ b/sys/contrib/openzfs/man/man7/zpool-features.7 @@ -19,7 +19,7 @@ .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org> .\" -.Dd October 2, 2024 +.Dd July 23, 2025 .Dt ZPOOL-FEATURES 7 .Os . @@ -401,6 +401,17 @@ This feature becomes .Sy active when first block is cloned. When the last cloned block is freed, it goes back to the enabled state. +.feature com.truenas block_cloning_endian yes +This feature corrects ZAP entry endianness issues in the Block Reference +Table (BRT) used by block cloning. +During the original block cloning implementation, BRT ZAP entries were +mistakenly stored as arrays of 8 single-byte entries instead of single +8-byte entries, making pools non-endian-safe. +.Pp +This feature is activated when the first BRT ZAP is created (that way +ensuring compatibility with existing pools). +When active, new BRT entries are stored in the correct endian-safe format. +The feature becomes inactive when all BRT ZAPs are destroyed. .feature com.delphix bookmarks yes extensible_dataset This feature enables use of the .Nm zfs Cm bookmark @@ -493,6 +504,19 @@ vdev type, or when adding a new .Sy draid vdev to an existing pool. . +.feature com.klarasystems dynamic_gang_header no +This feature enables larger gang headers based on the sector size of the pool. +When enabled, gang headers will use the entire space allocated for them, instead +of always restricting themselves to 512 bytes. +This can reduce the need for nested gang trees in extreme fragmentation +scenarios. +.Pp +This feature becomes active when a gang header is written that is larger than +512 bytes. +This feature is not enabled by +.Xr zpool-upgrade 8 . +Instead, it must be manually enabled, or be part of a compatibility file. +. .feature org.illumos edonr no extensible_dataset This feature enables the use of the Edon-R hash algorithm for checksum, including for nopwrite @@ -829,6 +853,23 @@ when the command is used on a top-level vdev, and will never return to being .Sy enabled . . +.feature com.truenas physical_rewrite yes extensible_dataset +This feature enables physical block rewriting that preserves logical birth +times, avoiding unnecessary inclusion of rewritten blocks in incremental +.Nm zfs Cm send +streams. +When enabled, the +.Nm zfs Cm rewrite Fl P +command can be used. +.Pp +This feature becomes +.Sy active +the first time +.Nm zfs Cm rewrite Fl P +is used on any dataset, and will return to being +.Sy enabled +once all datasets that have ever used physical rewrite are destroyed. +. .feature org.zfsonlinux project_quota yes extensible_dataset This feature allows administrators to account the spaces and objects usage information against the project identifier diff --git a/sys/contrib/openzfs/man/man7/zpoolconcepts.7 b/sys/contrib/openzfs/man/man7/zpoolconcepts.7 index 60f16269a0ac..b9c8926d835d 100644 --- a/sys/contrib/openzfs/man/man7/zpoolconcepts.7 +++ b/sys/contrib/openzfs/man/man7/zpoolconcepts.7 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd April 7, 2023 +.Dd August 6, 2025 .Dt ZPOOLCONCEPTS 7 .Os . @@ -390,11 +390,6 @@ Multiple log devices can also be specified, and they can be mirrored. 
See the .Sx EXAMPLES section for an example of mirroring multiple log devices. -.Pp -Log devices can be added, replaced, attached, detached, and removed. -In addition, log devices are imported and exported as part of the pool -that contains them. -Mirrored devices can be removed by specifying the top-level mirror vdev. . .Ss Cache Devices Devices can be added to a storage pool as @@ -486,8 +481,8 @@ current state of the pool won't be scanned during a scrub. . .Ss Special Allocation Class Allocations in the special class are dedicated to specific block types. -By default, this includes all metadata, the indirect blocks of user data, and -any deduplication tables. +By default, this includes all metadata, the indirect blocks of user data, +intent log (in absence of separate log device), and deduplication tables. The class can also be provisioned to accept small file blocks or zvol blocks on a per dataset granularity. .Pp diff --git a/sys/contrib/openzfs/man/man7/zpoolprops.7 b/sys/contrib/openzfs/man/man7/zpoolprops.7 index 5d84753193ee..d3b4c2376943 100644 --- a/sys/contrib/openzfs/man/man7/zpoolprops.7 +++ b/sys/contrib/openzfs/man/man7/zpoolprops.7 @@ -29,7 +29,7 @@ .\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org> .\" Copyright (c) 2023, Klara Inc. .\" -.Dd November 18, 2024 +.Dd December 4, 2024 .Dt ZPOOLPROPS 7 .Os . diff --git a/sys/contrib/openzfs/man/man8/zdb.8 b/sys/contrib/openzfs/man/man8/zdb.8 index 8bfd0dcdc381..c3290ea14769 100644 --- a/sys/contrib/openzfs/man/man8/zdb.8 +++ b/sys/contrib/openzfs/man/man8/zdb.8 @@ -15,7 +15,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd October 27, 2024 +.Dd August 12, 2025 .Dt ZDB 8 .Os . @@ -69,6 +69,13 @@ .Op Fl U Ar cache .Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns … .Nm +.Fl -allocated-map +.Op Fl mAFLPXY +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … +.Op Fl t Ar txg +.Op Fl U Ar cache +.Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns … +.Nm .Fl O .Op Fl K Ar key .Ar dataset path @@ -128,6 +135,11 @@ that zdb may interpret inconsistent pool data and behave erratically. .Sh OPTIONS Display options: .Bl -tag -width Ds +.It Fl Sy -allocated-map +Prints out a list of all the allocated regions in the pool. +Primarily intended for use with the +.Nm zhack metaslab leak +subcommand. .It Fl b , -block-stats Display statistics regarding the number, size .Pq logical, physical and allocated @@ -474,10 +486,15 @@ as it runs. Exercise extreme caution when using this option in shared or uncontrolled environments. .It Fl o , -option Ns = Ns Ar var Ns = Ns Ar value Ns … -Set the given global libzpool variable to the provided value. -The value must be an unsigned 32-bit integer. -Currently only little-endian systems are supported to avoid accidentally setting -the high 32 bits of 64-bit variables. +Set the given tunable to the provided value. +.It Fl o , -option Ns = Ns Ar var Ns … +Show the value of the given tunable. +.It Fl o , -option Ns = Ns show +Show all tunables and their values. +.It Fl o , -option Ns = Ns info Ns = Ns Ar value Ns … +Show info about a tunable, including their name, type and description. +.It Fl o , -option Ns = Ns info +Show info about all tunables. .It Fl P , -parseable Print numbers in an unscaled form more amenable to parsing, e.g.\& .Sy 1000000 @@ -526,6 +543,18 @@ option, with more occurrences enabling more verbosity. If no options are specified, all information about the named pool will be displayed at default verbosity. . 
+.Sh EXIT STATUS +The +.Nm +utility exits +.Sy 0 +on success, +.Sy 1 +if a fatal error occurs, +.Sy 2 +if invalid command line options were specified, or +.Sy 3 +if on-disk corruption was detected, but was not fatal. .Sh EXAMPLES .Ss Example 1 : No Display the configuration of imported pool Ar rpool .Bd -literal diff --git a/sys/contrib/openzfs/man/man8/zed.8.in b/sys/contrib/openzfs/man/man8/zed.8.in index c90a1834403b..2d19f2d8496b 100644 --- a/sys/contrib/openzfs/man/man8/zed.8.in +++ b/sys/contrib/openzfs/man/man8/zed.8.in @@ -13,7 +13,7 @@ .\" .\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) .\" -.Dd May 26, 2021 +.Dd August 22, 2022 .Dt ZED 8 .Os . @@ -158,6 +158,8 @@ Multiple ZEDLETs may be invoked for a given zevent. ZEDLETs are executables invoked by the ZED in response to a given zevent. They should be written under the presumption they can be invoked concurrently, and they should use appropriate locking to access any shared resources. +The one exception to this are "synchronous zedlets", which are described later +in this page. Common variables used by ZEDLETs can be stored in the default rc file which is sourced by scripts; these variables should be prefixed with .Sy ZED_ . @@ -233,6 +235,36 @@ and .Sy ZPOOL . These variables may be overridden in the rc file. . +.Sh Synchronous ZEDLETS +ZED's normal behavior is to spawn off zedlets in parallel and ignore their +completion order. +This means that ZED can potentially +have zedlets for event ID number 2 starting before zedlets for event ID number +1 have finished. +Most of the time this is fine, and it actually helps when the system is getting +hammered with hundreds of events. +.Pp +However, there are times when you want your zedlets to be executed in sequence +with the event ID. +That is where synchronous zedlets come in. +.Pp +ZED will wait for all previously spawned zedlets to finish before running +a synchronous zedlet. +Synchronous zedlets are guaranteed to be the only +zedlet running. +No other zedlets may run in parallel with a synchronous zedlet. +Users should be careful to only use synchronous zedlets when needed, since +they decrease parallelism. +.Pp +To make a zedlet synchronous, simply add a "-sync-" immediately following the +event name in the zedlet's file name: +.Pp +.Sy EVENT_NAME-sync-ZEDLETNAME.sh +.Pp +For example, if you wanted a synchronous statechange script: +.Pp +.Sy statechange-sync-myzedlet.sh +. .Sh FILES .Bl -tag -width "-c" .It Pa @sysconfdir@/zfs/zed.d diff --git a/sys/contrib/openzfs/man/man8/zfs-allow.8 b/sys/contrib/openzfs/man/man8/zfs-allow.8 index 5a8e80bf6a43..e3b0e1ab3e12 100644 --- a/sys/contrib/openzfs/man/man8/zfs-allow.8 +++ b/sys/contrib/openzfs/man/man8/zfs-allow.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd September 8, 2025 .Dt ZFS-ALLOW 8 .Os . @@ -212,7 +212,8 @@ receive subcommand Must also have the \fBmount\fR and \fBcreate\fR ability, requ release subcommand Allows releasing a user hold which might destroy the snapshot rename subcommand Must also have the \fBmount\fR and \fBcreate\fR ability in the new parent rollback subcommand Must also have the \fBmount\fR ability -send subcommand +send subcommand Allows sending a replication stream of a dataset. +send:raw subcommand Only allows sending raw replication streams, preventing encrypted datasets being sent in decrypted form. 
share subcommand Allows sharing file systems over NFS or SMB protocols snapshot subcommand Must also have the \fBmount\fR ability diff --git a/sys/contrib/openzfs/man/man8/zfs-bookmark.8 b/sys/contrib/openzfs/man/man8/zfs-bookmark.8 index 083ff46d241b..5a0933820020 100644 --- a/sys/contrib/openzfs/man/man8/zfs-bookmark.8 +++ b/sys/contrib/openzfs/man/man8/zfs-bookmark.8 @@ -31,7 +31,7 @@ .\" Copyright 2019 Joyent, Inc. .\" Copyright (c) 2019, 2020 by Christian Schwarz. All Rights Reserved. .\" -.Dd May 12, 2022 +.Dd July 11, 2022 .Dt ZFS-BOOKMARK 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-clone.8 b/sys/contrib/openzfs/man/man8/zfs-clone.8 index cd412815f5fe..9609cf2ce36a 100644 --- a/sys/contrib/openzfs/man/man8/zfs-clone.8 +++ b/sys/contrib/openzfs/man/man8/zfs-clone.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-CLONE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-create.8 b/sys/contrib/openzfs/man/man8/zfs-create.8 index 91878056cc7d..58bde5799240 100644 --- a/sys/contrib/openzfs/man/man8/zfs-create.8 +++ b/sys/contrib/openzfs/man/man8/zfs-create.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd June 2, 2023 .Dt ZFS-CREATE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-destroy.8 b/sys/contrib/openzfs/man/man8/zfs-destroy.8 index 38359be02430..6a6791f7a44e 100644 --- a/sys/contrib/openzfs/man/man8/zfs-destroy.8 +++ b/sys/contrib/openzfs/man/man8/zfs-destroy.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd February 5, 2025 .Dt ZFS-DESTROY 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-diff.8 b/sys/contrib/openzfs/man/man8/zfs-diff.8 index d4c48f4109be..5b94ea524666 100644 --- a/sys/contrib/openzfs/man/man8/zfs-diff.8 +++ b/sys/contrib/openzfs/man/man8/zfs-diff.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-DIFF 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-hold.8 b/sys/contrib/openzfs/man/man8/zfs-hold.8 index 0c88937f0dc8..a877e428f88b 100644 --- a/sys/contrib/openzfs/man/man8/zfs-hold.8 +++ b/sys/contrib/openzfs/man/man8/zfs-hold.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd June 30, 2019 +.Dd November 8, 2022 .Dt ZFS-HOLD 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-jail.8 b/sys/contrib/openzfs/man/man8/zfs-jail.8 index 53499a279d05..569f5f57eab4 100644 --- a/sys/contrib/openzfs/man/man8/zfs-jail.8 +++ b/sys/contrib/openzfs/man/man8/zfs-jail.8 @@ -37,7 +37,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd May 27, 2021 +.Dd July 11, 2022 .Dt ZFS-JAIL 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-list.8 b/sys/contrib/openzfs/man/man8/zfs-list.8 index 677d8292e207..42eff94f9762 100644 --- a/sys/contrib/openzfs/man/man8/zfs-list.8 +++ b/sys/contrib/openzfs/man/man8/zfs-list.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd February 8, 2024 +.Dd August 25, 2025 .Dt ZFS-LIST 8 .Os . @@ -50,27 +50,25 @@ .Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Oc Ns … . .Sh DESCRIPTION -If specified, you can list property information by the absolute pathname or the -relative pathname. -By default, all file systems and volumes are displayed. 
+By default, all file systems and volumes are displayed, with the following +fields: +.Sy name , Sy used , Sy available , Sy referenced , Sy mountpoint . Snapshots are displayed if the .Sy listsnapshots pool property is .Sy on .Po the default is .Sy off -.Pc , +.Pc or if the .Fl t Sy snapshot or .Fl t Sy all options are specified. -The following fields are displayed: -.Sy name , Sy used , Sy available , Sy referenced , Sy mountpoint . .Bl -tag -width "-H" .It Fl H Used for scripting mode. -Do not print headers and separate fields by a single tab instead of arbitrary +Do not print headers, and separate fields by a single tab instead of arbitrary white space. .It Fl j , -json Op Ar --json-int Print the output in JSON format. @@ -87,7 +85,7 @@ of will display only the dataset and its direct children. .It Fl o Ar property A comma-separated list of properties to display. -The property must be: +Each property must be: .Bl -bullet -compact .It One of the properties described in the @@ -125,30 +123,41 @@ section of or the value .Sy name to sort by the dataset name. -Multiple properties can be specified at one time using multiple +Multiple properties can be specified to operate together using multiple .Fl s -property options. +or +.Fl S +options. Multiple .Fl s -options are evaluated from left to right in decreasing order of importance. -The following is a list of sorting criteria: +and +.Fl S +options are evaluated from left to right to supply sort keys in +decreasing order of priority. +Property types operate as follows: .Bl -bullet -compact .It Numeric types sort in numeric order. .It String types sort in alphabetical order. .It -Types inappropriate for a row sort that row to the literal bottom, regardless of -the specified ordering. +Types inappropriate for a row sort that row to the literal bottom, +regardless of the specified ordering. .El .Pp -If no sorting options are specified the existing behavior of -.Nm zfs Cm list -is preserved. +If no sort columns are specified, or if two lines of output would sort +equally across all specified columns, then datasets and bookmarks are +sorted by name, whereas snapshots are sorted first by the name of their +dataset and then by the time of their creation. +When no sort columns are specified but snapshots are listed, this +default behavior causes snapshots to be grouped under their datasets in +chronological order by creation time. .It Fl S Ar property Same as .Fl s , -but sorts by property in descending order. +but sorts by +.Ar property +in descending order. .It Fl t Ar type A comma-separated list of types to display, where .Ar type diff --git a/sys/contrib/openzfs/man/man8/zfs-load-key.8 b/sys/contrib/openzfs/man/man8/zfs-load-key.8 index 7838c46d9e77..3a11cea99fd6 100644 --- a/sys/contrib/openzfs/man/man8/zfs-load-key.8 +++ b/sys/contrib/openzfs/man/man8/zfs-load-key.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd January 13, 2020 +.Dd July 11, 2022 .Dt ZFS-LOAD-KEY 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-mount-generator.8.in b/sys/contrib/openzfs/man/man8/zfs-mount-generator.8.in index ea470247daac..9e44ea30c636 100644 --- a/sys/contrib/openzfs/man/man8/zfs-mount-generator.8.in +++ b/sys/contrib/openzfs/man/man8/zfs-mount-generator.8.in @@ -23,7 +23,7 @@ .\" OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION .\" WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. .\" -.Dd May 31, 2021 +.Dd November 30, 2021 .Dt ZFS-MOUNT-GENERATOR 8 .Os . 
diff --git a/sys/contrib/openzfs/man/man8/zfs-mount.8 b/sys/contrib/openzfs/man/man8/zfs-mount.8 index 9fca6fffd5bb..2689b6dc345b 100644 --- a/sys/contrib/openzfs/man/man8/zfs-mount.8 +++ b/sys/contrib/openzfs/man/man8/zfs-mount.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd February 16, 2019 +.Dd October 12, 2024 .Dt ZFS-MOUNT 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-project.8 b/sys/contrib/openzfs/man/man8/zfs-project.8 index 36547680f53e..4ebfdf6ffe4f 100644 --- a/sys/contrib/openzfs/man/man8/zfs-project.8 +++ b/sys/contrib/openzfs/man/man8/zfs-project.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd May 27, 2021 +.Dd July 11, 2022 .Dt ZFS-PROJECT 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-promote.8 b/sys/contrib/openzfs/man/man8/zfs-promote.8 index 767045812607..435a7a5d0144 100644 --- a/sys/contrib/openzfs/man/man8/zfs-promote.8 +++ b/sys/contrib/openzfs/man/man8/zfs-promote.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-PROMOTE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-rename.8 b/sys/contrib/openzfs/man/man8/zfs-rename.8 index 4cf192c0682b..8fedc67469e6 100644 --- a/sys/contrib/openzfs/man/man8/zfs-rename.8 +++ b/sys/contrib/openzfs/man/man8/zfs-rename.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-RENAME 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-rewrite.8 b/sys/contrib/openzfs/man/man8/zfs-rewrite.8 index 423d6d439e28..ca5340c7e5eb 100644 --- a/sys/contrib/openzfs/man/man8/zfs-rewrite.8 +++ b/sys/contrib/openzfs/man/man8/zfs-rewrite.8 @@ -21,7 +21,7 @@ .\" .\" Copyright (c) 2025 iXsystems, Inc. .\" -.Dd May 6, 2025 +.Dd July 23, 2025 .Dt ZFS-REWRITE 8 .Os . @@ -31,7 +31,7 @@ .Sh SYNOPSIS .Nm zfs .Cm rewrite -.Oo Fl rvx Ns Oc +.Oo Fl Prvx Ns Oc .Op Fl l Ar length .Op Fl o Ar offset .Ar file Ns | Ns Ar directory Ns … @@ -43,6 +43,15 @@ as is without modification at a new location and possibly with new properties, such as checksum, compression, dedup, copies, etc, as if they were atomically read and written back. .Bl -tag -width "-r" +.It Fl P +Perform physical rewrite, preserving logical birth time of blocks. +By default, rewrite updates logical birth times, making blocks appear +as modified in snapshots and incremental send streams. +Physical rewrite preserves logical birth times, avoiding unnecessary +inclusion in incremental streams. +Physical rewrite requires the +.Sy physical_rewrite +feature to be enabled on the pool. .It Fl l Ar length Rewrite at most this number of bytes. .It Fl o Ar offset @@ -60,17 +69,22 @@ same as some property changes may increase pool space usage. Holes that were never written or were previously zero-compressed are not rewritten and will remain holes even if compression is disabled. .Pp -Rewritten blocks will be seen as modified in next snapshot and as such -included into the incremental -.Nm zfs Cm send -stream. -.Pp If a .Fl l or .Fl o value request a rewrite to regions past the end of the file, then those regions are silently ignored, and no error is reported. +.Pp +By default, rewritten blocks update their logical birth time, +meaning they will be included in incremental +.Nm zfs Cm send +streams as modified data. 
+When the +.Fl P +flag is used, rewritten blocks preserve their logical birth time, since +there are no user data changes. . .Sh SEE ALSO -.Xr zfsprops 7 +.Xr zfsprops 7 , +.Xr zpool-features 7 diff --git a/sys/contrib/openzfs/man/man8/zfs-send.8 b/sys/contrib/openzfs/man/man8/zfs-send.8 index c920a5a48798..6c5f6b94afd5 100644 --- a/sys/contrib/openzfs/man/man8/zfs-send.8 +++ b/sys/contrib/openzfs/man/man8/zfs-send.8 @@ -31,7 +31,7 @@ .\" Copyright 2019 Joyent, Inc. .\" Copyright (c) 2024, Klara, Inc. .\" -.Dd October 2, 2024 +.Dd August 29, 2025 .Dt ZFS-SEND 8 .Os . @@ -173,8 +173,10 @@ The receiving system must have the feature enabled. If the .Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. +or +.Sy zstd_compress +features are active on the sending system, then the receiving system must have +the corresponding features enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data @@ -201,8 +203,10 @@ property for details .Pc . If the .Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. +or +.Sy zstd_compress +features are active on the sending system, then the receiving system must have +the corresponding features enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the @@ -357,8 +361,10 @@ property for details .Pc . If the .Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. +or +.Sy zstd_compress +features are active on the sending system, then the receiving system must have +the corresponding features enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the @@ -400,8 +406,10 @@ The receiving system must have the feature enabled. If the .Sy lz4_compress -feature is active on the sending system, then the receiving system must have -that feature enabled as well. +or +.Sy zstd_compress +features are active on the sending system, then the receiving system must have +the corresponding features enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the diff --git a/sys/contrib/openzfs/man/man8/zfs-set.8 b/sys/contrib/openzfs/man/man8/zfs-set.8 index 67f4d6eba171..08daf09d05f8 100644 --- a/sys/contrib/openzfs/man/man8/zfs-set.8 +++ b/sys/contrib/openzfs/man/man8/zfs-set.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd April 20, 2024 +.Dd October 12, 2024 .Dt ZFS-SET 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-share.8 b/sys/contrib/openzfs/man/man8/zfs-share.8 index f7a09a189182..e9c32a44b0c7 100644 --- a/sys/contrib/openzfs/man/man8/zfs-share.8 +++ b/sys/contrib/openzfs/man/man8/zfs-share.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd May 17, 2021 +.Dd July 11, 2022 .Dt ZFS-SHARE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-snapshot.8 b/sys/contrib/openzfs/man/man8/zfs-snapshot.8 index 3ddd1273c8e8..8f4b2c335f09 100644 --- a/sys/contrib/openzfs/man/man8/zfs-snapshot.8 +++ b/sys/contrib/openzfs/man/man8/zfs-snapshot.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-SNAPSHOT 8 .Os . 
diff --git a/sys/contrib/openzfs/man/man8/zfs-upgrade.8 b/sys/contrib/openzfs/man/man8/zfs-upgrade.8 index bac74e37aef9..a5ce2b760da4 100644 --- a/sys/contrib/openzfs/man/man8/zfs-upgrade.8 +++ b/sys/contrib/openzfs/man/man8/zfs-upgrade.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd June 30, 2019 +.Dd July 11, 2022 .Dt ZFS-UPGRADE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-userspace.8 b/sys/contrib/openzfs/man/man8/zfs-userspace.8 index d7a4d18e83b1..c255d911740d 100644 --- a/sys/contrib/openzfs/man/man8/zfs-userspace.8 +++ b/sys/contrib/openzfs/man/man8/zfs-userspace.8 @@ -30,7 +30,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd June 30, 2019 +.Dd July 11, 2022 .Dt ZFS-USERSPACE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-wait.8 b/sys/contrib/openzfs/man/man8/zfs-wait.8 index 554a67455c60..e5c60010d2f9 100644 --- a/sys/contrib/openzfs/man/man8/zfs-wait.8 +++ b/sys/contrib/openzfs/man/man8/zfs-wait.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 31, 2021 +.Dd July 11, 2022 .Dt ZFS-WAIT 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs-zone.8 b/sys/contrib/openzfs/man/man8/zfs-zone.8 index 7ad0ac89463c..a56a304e82b2 100644 --- a/sys/contrib/openzfs/man/man8/zfs-zone.8 +++ b/sys/contrib/openzfs/man/man8/zfs-zone.8 @@ -38,7 +38,7 @@ .\" Copyright 2019 Joyent, Inc. .\" Copyright 2021 Klara, Inc. .\" -.Dd June 3, 2022 +.Dd July 11, 2022 .Dt ZFS-ZONE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs.8 b/sys/contrib/openzfs/man/man8/zfs.8 index e16a3a82b672..b7566a727469 100644 --- a/sys/contrib/openzfs/man/man8/zfs.8 +++ b/sys/contrib/openzfs/man/man8/zfs.8 @@ -37,7 +37,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd April 18, 2025 +.Dd May 12, 2025 .Dt ZFS 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zfs_ids_to_path.8 b/sys/contrib/openzfs/man/man8/zfs_ids_to_path.8 index eef0ce68f17b..465e336d170c 100644 --- a/sys/contrib/openzfs/man/man8/zfs_ids_to_path.8 +++ b/sys/contrib/openzfs/man/man8/zfs_ids_to_path.8 @@ -21,7 +21,7 @@ .\" .\" Copyright (c) 2020 by Delphix. All rights reserved. .\" -.Dd April 17, 2020 +.Dd July 11, 2022 .Dt ZFS_IDS_TO_PATH 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zgenhostid.8 b/sys/contrib/openzfs/man/man8/zgenhostid.8 index 2b5b4fc18216..ff564880f97d 100644 --- a/sys/contrib/openzfs/man/man8/zgenhostid.8 +++ b/sys/contrib/openzfs/man/man8/zgenhostid.8 @@ -21,7 +21,7 @@ .\" .\" Copyright (c) 2017 by Lawrence Livermore National Security, LLC. .\" -.Dd May 26, 2021 +.Dd July 11, 2022 .Dt ZGENHOSTID 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-attach.8 b/sys/contrib/openzfs/man/man8/zpool-attach.8 index 51d876767666..04996ed4fa11 100644 --- a/sys/contrib/openzfs/man/man8/zpool-attach.8 +++ b/sys/contrib/openzfs/man/man8/zpool-attach.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd June 28, 2023 +.Dd November 8, 2023 .Dt ZPOOL-ATTACH 8 .Os . @@ -39,24 +39,24 @@ .Cm attach .Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc -.Ar pool device new_device +.Ar pool vdev new_device . .Sh DESCRIPTION Attaches .Ar new_device to the existing -.Ar device . +.Ar vdev . The behavior differs depending on if the existing -.Ar device +.Ar vdev is a RAID-Z device, or a mirror/plain device. 
.Pp -If the existing device is a mirror or plain device +If the existing vdev is a mirror or plain device .Pq e.g. specified as Qo Li sda Qc or Qq Li mirror-7 , -the new device will be mirrored with the existing device, a resilver will be +the new device will be mirrored with the existing vdev, a resilver will be initiated, and the new device will contribute to additional redundancy once the resilver completes. If -.Ar device +.Ar vdev is not currently part of a mirrored configuration, .Ar device automatically transforms into a two-way mirror of @@ -64,7 +64,7 @@ automatically transforms into a two-way mirror of and .Ar new_device . If -.Ar device +.Ar vdev is part of a two-way mirror, attaching .Ar new_device creates a three-way mirror, and so on. @@ -72,7 +72,7 @@ In either case, .Ar new_device begins to resilver immediately and any running scrub is canceled. .Pp -If the existing device is a RAID-Z device +If the existing vdev is a RAID-Z device .Pq e.g. specified as Qq Ar raidz2-0 , the new device will become part of that RAID-Z group. A "raidz expansion" will be initiated, and once the expansion completes, @@ -112,7 +112,7 @@ the checksums of all blocks which have been copied during the expansion. Forces use of .Ar new_device , even if it appears to be in use. -Not all devices can be overridden in this manner. +Not all vdevs can be overridden in this manner. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the @@ -121,7 +121,7 @@ manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . .It Fl s -When attaching to a mirror or plain device, the +When attaching to a mirror or plain vdev, the .Ar new_device is reconstructed sequentially to restore redundancy as quickly as possible. Checksums are not verified during sequential reconstruction so a scrub is diff --git a/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 b/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 index d97d10d5df6e..b654f669cfa2 100644 --- a/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 +++ b/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd July 11, 2022 .Dt ZPOOL-CHECKPOINT 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-clear.8 b/sys/contrib/openzfs/man/man8/zpool-clear.8 index 19cd4be36408..70cd8325bd0e 100644 --- a/sys/contrib/openzfs/man/man8/zpool-clear.8 +++ b/sys/contrib/openzfs/man/man8/zpool-clear.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd April 29, 2024 .Dt ZPOOL-CLEAR 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-create.8 b/sys/contrib/openzfs/man/man8/zpool-create.8 index 490c67629a20..a36ae260a158 100644 --- a/sys/contrib/openzfs/man/man8/zpool-create.8 +++ b/sys/contrib/openzfs/man/man8/zpool-create.8 @@ -28,7 +28,7 @@ .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org> .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-CREATE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-destroy.8 b/sys/contrib/openzfs/man/man8/zpool-destroy.8 index f49f29804ad7..82f3f3e203d6 100644 --- a/sys/contrib/openzfs/man/man8/zpool-destroy.8 +++ b/sys/contrib/openzfs/man/man8/zpool-destroy.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
.\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-DESTROY 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-detach.8 b/sys/contrib/openzfs/man/man8/zpool-detach.8 index ae02dbc2d5b8..79a44310110d 100644 --- a/sys/contrib/openzfs/man/man8/zpool-detach.8 +++ b/sys/contrib/openzfs/man/man8/zpool-detach.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd July 11, 2022 .Dt ZPOOL-DETACH 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-events.8 b/sys/contrib/openzfs/man/man8/zpool-events.8 index 7af1917da6d9..36a9864dc73b 100644 --- a/sys/contrib/openzfs/man/man8/zpool-events.8 +++ b/sys/contrib/openzfs/man/man8/zpool-events.8 @@ -28,7 +28,7 @@ .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2024, 2025, Klara, Inc. .\" -.Dd May 27, 2025 +.Dd July 3, 2025 .Dt ZPOOL-EVENTS 8 .Os . @@ -190,6 +190,16 @@ Issued when a scrub is resumed on a pool. .It Sy scrub.paused Issued when a scrub is paused on a pool. .It Sy bootfs.vdev.attach +.It Sy sitout +Issued when a +.Sy RAIDZ +or +.Sy DRAID +vdev triggers the +.Sy autosit +logic. +This logic detects when a disk in such a vdev is significantly slower than its +peers, and sits them out temporarily to preserve the performance of the pool. .El . .Sh PAYLOADS @@ -465,7 +475,7 @@ ZIO_FLAG_DONT_RETRY:0x00000400 ZIO_FLAG_NODATA:0x00001000 ZIO_FLAG_INDUCE_DAMAGE:0x00002000 -ZIO_FLAG_IO_ALLOCATING:0x00004000 +ZIO_FLAG_ALLOC_THROTTLED:0x00004000 ZIO_FLAG_IO_RETRY:0x00008000 ZIO_FLAG_PROBE:0x00010000 ZIO_FLAG_TRYHARD:0x00020000 diff --git a/sys/contrib/openzfs/man/man8/zpool-export.8 b/sys/contrib/openzfs/man/man8/zpool-export.8 index 171a7541c6d2..02495c088f94 100644 --- a/sys/contrib/openzfs/man/man8/zpool-export.8 +++ b/sys/contrib/openzfs/man/man8/zpool-export.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-EXPORT 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-get.8 b/sys/contrib/openzfs/man/man8/zpool-get.8 index 1d6d1f08afa6..bfe1bae7619f 100644 --- a/sys/contrib/openzfs/man/man8/zpool-get.8 +++ b/sys/contrib/openzfs/man/man8/zpool-get.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd October 12, 2024 .Dt ZPOOL-GET 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-history.8 b/sys/contrib/openzfs/man/man8/zpool-history.8 index f15086eabc47..f02168951ff2 100644 --- a/sys/contrib/openzfs/man/man8/zpool-history.8 +++ b/sys/contrib/openzfs/man/man8/zpool-history.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd July 11, 2022 .Dt ZPOOL-HISTORY 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-import.8 b/sys/contrib/openzfs/man/man8/zpool-import.8 index 9076f5c34929..c6d5f222b6b2 100644 --- a/sys/contrib/openzfs/man/man8/zpool-import.8 +++ b/sys/contrib/openzfs/man/man8/zpool-import.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-IMPORT 8 .Os . 
diff --git a/sys/contrib/openzfs/man/man8/zpool-initialize.8 b/sys/contrib/openzfs/man/man8/zpool-initialize.8 index d7c9d22aba97..5299a897cb97 100644 --- a/sys/contrib/openzfs/man/man8/zpool-initialize.8 +++ b/sys/contrib/openzfs/man/man8/zpool-initialize.8 @@ -26,8 +26,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP. .\" -.Dd May 27, 2021 +.Dd July 30, 2025 .Dt ZPOOL-INITIALIZE 8 .Os . @@ -39,7 +40,7 @@ .Cm initialize .Op Fl c Ns | Ns Fl s | Ns Fl u .Op Fl w -.Ar pool +.Fl a Ns | Ns Ar pool .Oo Ar device Oc Ns … . .Sh DESCRIPTION @@ -48,6 +49,10 @@ devices, or all eligible devices in the pool if no individual devices are specified. Only leaf data or log devices may be initialized. .Bl -tag -width Ds +.It Fl a , -all +Begin, cancel, suspend initializing on +all +pools. .It Fl c , -cancel Cancel initializing on the specified devices, or all eligible devices if none are specified. diff --git a/sys/contrib/openzfs/man/man8/zpool-iostat.8 b/sys/contrib/openzfs/man/man8/zpool-iostat.8 index d8c21d0cfc6c..5dd9c9d55e20 100644 --- a/sys/contrib/openzfs/man/man8/zpool-iostat.8 +++ b/sys/contrib/openzfs/man/man8/zpool-iostat.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd January 29, 2024 .Dt ZPOOL-IOSTAT 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-labelclear.8 b/sys/contrib/openzfs/man/man8/zpool-labelclear.8 index ba3d1509aa75..b807acaaede3 100644 --- a/sys/contrib/openzfs/man/man8/zpool-labelclear.8 +++ b/sys/contrib/openzfs/man/man8/zpool-labelclear.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 31, 2021 +.Dd July 11, 2022 .Dt ZPOOL-LABELCLEAR 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-list.8 b/sys/contrib/openzfs/man/man8/zpool-list.8 index b720e203c1c9..106399941f98 100644 --- a/sys/contrib/openzfs/man/man8/zpool-list.8 +++ b/sys/contrib/openzfs/man/man8/zpool-list.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd October 12, 2024 .Dt ZPOOL-LIST 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-offline.8 b/sys/contrib/openzfs/man/man8/zpool-offline.8 index 49b1f34ad5d5..388c7634acce 100644 --- a/sys/contrib/openzfs/man/man8/zpool-offline.8 +++ b/sys/contrib/openzfs/man/man8/zpool-offline.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd December 21, 2023 .Dt ZPOOL-OFFLINE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-reguid.8 b/sys/contrib/openzfs/man/man8/zpool-reguid.8 index 77101fc07326..b98c88e320de 100644 --- a/sys/contrib/openzfs/man/man8/zpool-reguid.8 +++ b/sys/contrib/openzfs/man/man8/zpool-reguid.8 @@ -29,7 +29,7 @@ .\" Copyright (c) 2024, Klara Inc. .\" Copyright (c) 2024, Mateusz Piotrowski .\" -.Dd June 21, 2023 +.Dd August 26, 2024 .Dt ZPOOL-REGUID 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-remove.8 b/sys/contrib/openzfs/man/man8/zpool-remove.8 index d10a92e49bbe..4d5fc431d332 100644 --- a/sys/contrib/openzfs/man/man8/zpool-remove.8 +++ b/sys/contrib/openzfs/man/man8/zpool-remove.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. 
All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd November 19, 2024 .Dt ZPOOL-REMOVE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-reopen.8 b/sys/contrib/openzfs/man/man8/zpool-reopen.8 index 594cff3d16d8..c4e10f0a546e 100644 --- a/sys/contrib/openzfs/man/man8/zpool-reopen.8 +++ b/sys/contrib/openzfs/man/man8/zpool-reopen.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd June 2, 2021 +.Dd July 11, 2022 .Dt ZPOOL-REOPEN 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-replace.8 b/sys/contrib/openzfs/man/man8/zpool-replace.8 index 9f3156eeb3ef..651af13b19b8 100644 --- a/sys/contrib/openzfs/man/man8/zpool-replace.8 +++ b/sys/contrib/openzfs/man/man8/zpool-replace.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 29, 2021 +.Dd July 11, 2022 .Dt ZPOOL-REPLACE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-resilver.8 b/sys/contrib/openzfs/man/man8/zpool-resilver.8 index 2161d77f62ed..59c4be5db209 100644 --- a/sys/contrib/openzfs/man/man8/zpool-resilver.8 +++ b/sys/contrib/openzfs/man/man8/zpool-resilver.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd July 11, 2022 .Dt ZPOOL-RESILVER 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-scrub.8 b/sys/contrib/openzfs/man/man8/zpool-scrub.8 index 21bd6735ede4..cf7ead5788bf 100644 --- a/sys/contrib/openzfs/man/man8/zpool-scrub.8 +++ b/sys/contrib/openzfs/man/man8/zpool-scrub.8 @@ -26,8 +26,9 @@ .\" Copyright (c) 2018, 2021 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP. .\" -.Dd November 18, 2024 +.Dd August 6, 2025 .Dt ZPOOL-SCRUB 8 .Os . @@ -39,7 +40,9 @@ .Cm scrub .Op Ns Fl e | Ns Fl p | Fl s Ns | Fl C Ns .Op Fl w -.Ar pool Ns … +.Op Fl S Ar date +.Op Fl E Ar date +.Fl a Ns | Ns Ar pool Ns … . .Sh DESCRIPTION Begins a scrub or resumes a paused scrub. @@ -89,6 +92,12 @@ During this period, no completion time estimate will be provided. . .Sh OPTIONS .Bl -tag -width "-s" +.It Fl a , -all +Begin, pause, stop scrub on +all +pools. +Initiating scrubs on multiple pools can put considerable load and memory +pressure on the system, so this operation should be performed with caution. .It Fl s Stop scrubbing. .It Fl p @@ -118,6 +127,44 @@ resilvering, nor can it be run when a regular scrub is paused. Continue scrub from last saved txg (see zpool .Sy last_scrubbed_txg property). +.It Fl S Ar date , Fl E Ar date +Allows specifying the date range for blocks created between these dates. +.Bl -bullet -compact -offset indent +.It +.Fl S +Defines a start date. +If not specified, scrubbing begins from the start of the pool's +existence. +.It +.Fl E +Defines an end date. +If not specified, scrubbing continues up to the most recent data. +.El +The provided date should be in the format: +.Dq YYYY-MM-DD HH:MM . +Where: +.Bl -bullet -compact -offset indent +.It +.Dq YYYY +is the year. +.It +.Dq MM +is the numeric representation of the month. +.It +.Dq DD +is the day of the month. +.It +.Dq HH +is the hour. +.It +.Dq MM +is the minutes. +.El +The hour and minutes parameters can be omitted. +The time should be provided in machine local time zone. 
+Specifying dates prior to enabling this feature will result in scrubbing +starting from the date the pool was created. +If the time was moved backward manually the data range may become inaccurate. .El .Sh EXAMPLES .Ss Example 1 diff --git a/sys/contrib/openzfs/man/man8/zpool-split.8 b/sys/contrib/openzfs/man/man8/zpool-split.8 index a67c865cf30c..ee4c6384cf23 100644 --- a/sys/contrib/openzfs/man/man8/zpool-split.8 +++ b/sys/contrib/openzfs/man/man8/zpool-split.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd June 2, 2021 +.Dd July 11, 2022 .Dt ZPOOL-SPLIT 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-status.8 b/sys/contrib/openzfs/man/man8/zpool-status.8 index a7f3e088043b..108a1067b384 100644 --- a/sys/contrib/openzfs/man/man8/zpool-status.8 +++ b/sys/contrib/openzfs/man/man8/zpool-status.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd February 14, 2024 +.Dd May 20, 2025 .Dt ZPOOL-STATUS 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-sync.8 b/sys/contrib/openzfs/man/man8/zpool-sync.8 index 8f438f363e83..d1dc05d0c202 100644 --- a/sys/contrib/openzfs/man/man8/zpool-sync.8 +++ b/sys/contrib/openzfs/man/man8/zpool-sync.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd July 11, 2022 .Dt ZPOOL-SYNC 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-trim.8 b/sys/contrib/openzfs/man/man8/zpool-trim.8 index 06cbd5abf7eb..c4e849019789 100644 --- a/sys/contrib/openzfs/man/man8/zpool-trim.8 +++ b/sys/contrib/openzfs/man/man8/zpool-trim.8 @@ -26,8 +26,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP. .\" -.Dd May 27, 2021 +.Dd July 30, 2025 .Dt ZPOOL-TRIM 8 .Os . @@ -40,7 +41,7 @@ .Op Fl dw .Op Fl r Ar rate .Op Fl c Ns | Ns Fl s -.Ar pool +.Fl a Ns | Ns Ar pool .Oo Ar device Ns Oc Ns … . .Sh DESCRIPTION @@ -57,6 +58,10 @@ See the documentation for the .Sy autotrim property above for the types of vdev devices which can be trimmed. .Bl -tag -width Ds +.It Fl a , -all +Perform TRIM operation on +all +pools. .It Fl d , -secure Causes a secure TRIM to be initiated. When performing a secure TRIM, the diff --git a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 b/sys/contrib/openzfs/man/man8/zpool-upgrade.8 index 20632ae4bba0..cf69060da5ce 100644 --- a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 +++ b/sys/contrib/openzfs/man/man8/zpool-upgrade.8 @@ -28,7 +28,7 @@ .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2021, Colm Buckley <colm@tuatha.org> .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-UPGRADE 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zpool-wait.8 b/sys/contrib/openzfs/man/man8/zpool-wait.8 index 0ffb4badfb7b..28a51d29a913 100644 --- a/sys/contrib/openzfs/man/man8/zpool-wait.8 +++ b/sys/contrib/openzfs/man/man8/zpool-wait.8 @@ -28,7 +28,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd January 29, 2024 .Dt ZPOOL-WAIT 8 .Os . 
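Several hunks above add an -a/--all flag (zpool initialize, zpool scrub, zpool trim) and a -S/-E date window for scrubs. A short usage sketch based only on the syntax documented in those hunks, assuming a build that includes these changes; the pool name tank and the dates are made up:

    # Scrub only blocks born inside a date window; dates are machine local
    # time in "YYYY-MM-DD HH:MM" form, and the hour/minute part may be omitted
    zpool scrub -S "2025-01-01" -E "2025-03-31 23:59" tank

    # Start (or pause/stop) a scrub on every pool at once; the man page warns
    # about the load and memory pressure this can create
    zpool scrub -a

    # The same all-pools form for TRIM and initialize
    zpool trim -a
    zpool initialize -a
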
diff --git a/sys/contrib/openzfs/man/man8/zpool.8 b/sys/contrib/openzfs/man/man8/zpool.8 index b96944050594..3bfef780b298 100644 --- a/sys/contrib/openzfs/man/man8/zpool.8 +++ b/sys/contrib/openzfs/man/man8/zpool.8 @@ -27,7 +27,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd February 14, 2024 +.Dd November 19, 2024 .Dt ZPOOL 8 .Os . diff --git a/sys/contrib/openzfs/man/man8/zstream.8 b/sys/contrib/openzfs/man/man8/zstream.8 index 03a8479c9e6a..5b3d063bc4a5 100644 --- a/sys/contrib/openzfs/man/man8/zstream.8 +++ b/sys/contrib/openzfs/man/man8/zstream.8 @@ -21,7 +21,7 @@ .\" .\" Copyright (c) 2020 by Delphix. All rights reserved. .\" -.Dd October 4, 2022 +.Dd November 10, 2022 .Dt ZSTREAM 8 .Os . diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 667f061c6e16..58a80dc4402c 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -4,7 +4,7 @@ ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes -ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ +ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @KERNEL_NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) zfs_include = @abs_top_srcdir@/include @@ -135,6 +135,7 @@ ICP_OBJS_X86_64 := \ asm-x86_64/sha2/sha256-x86_64.o \ asm-x86_64/sha2/sha512-x86_64.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ + asm-x86_64/modes/aesni-gcm-avx2-vaes.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o @@ -406,6 +407,7 @@ ZFS_OBJS := \ zfs_byteswap.o \ zfs_chksum.o \ zfs_debug_common.o \ + zfs_crrd.o \ zfs_fm.o \ zfs_fuid.o \ zfs_impl.o \ @@ -494,3 +496,34 @@ UBSAN_SANITIZE_zfs/sa.o := n ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif + +# The following recipes attempt to fix out of src-tree builds, where $(src) != $(obj), so that the +# subdir %.c/%.S -> %.o targets will work as expected. 
The in-kernel pattern targets do not seem to +# be working on subdirs since about ~6.10 +zobjdirs = $(dir $(zfs-objs)) $(dir $(spl-objs)) \ + $(dir $(zfs-$(CONFIG_X86))) $(dir $(zfs-$(CONFIG_UML_X86))) $(dir $(zfs-$(CONFIG_ARM64))) \ + $(dir $(zfs-$(CONFIG_PPC64))) $(dir $(zfs-$(CONFIG_PPC))) + +z_cdirs = $(sort $(filter-out lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs))) +z_sdirs = $(sort $(filter lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs))) + +define ZKMOD_C_O_MAKE_TARGET +$1%.o: $(src)/$1%.c FORCE + $$(call if_changed_rule,cc_o_c) + $$(call cmd,force_checksrc) +endef + +define ZKMOD_S_O_MAKE_TARGET +$1%.o: $(src)/$1%.S FORCE + $$(call if_changed_rule,as_o_S) + $$(call cmd,force_checksrc) +endef + +$(foreach target,$(z_cdirs), $(eval $(call ZKMOD_C_O_MAKE_TARGET,$(target)))) +$(foreach target,$(z_sdirs), $(eval $(call ZKMOD_S_O_MAKE_TARGET,$(target)))) diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd index 7e7c3db73a43..3ba38c43f25b 100644 --- a/sys/contrib/openzfs/module/Makefile.bsd +++ b/sys/contrib/openzfs/module/Makefile.bsd @@ -217,6 +217,7 @@ SRCS+= abd_os.c \ vdev_label_os.c \ zfs_acl.c \ zfs_ctldir.c \ + zfs_crrd.c \ zfs_debug.c \ zfs_dir.c \ zfs_file_os.c \ diff --git a/sys/contrib/openzfs/module/Makefile.in b/sys/contrib/openzfs/module/Makefile.in index e9a268121762..859ba8649dd7 100644 --- a/sys/contrib/openzfs/module/Makefile.in +++ b/sys/contrib/openzfs/module/Makefile.in @@ -57,7 +57,7 @@ modules-Linux: $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ $(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \ $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ - $(if @OBJTOOL_DISABLE_WERROR@,objtool=@top_builddir@/scripts/objtool-wrapper) \ + $(if @OBJTOOL_DISABLE_WERROR@,objtool=@abs_top_builddir@/scripts/objtool-wrapper) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: diff --git a/sys/contrib/openzfs/module/avl/avl.c b/sys/contrib/openzfs/module/avl/avl.c index b6c1c02bc3f2..67cbcd3adeec 100644 --- a/sys/contrib/openzfs/module/avl/avl.c +++ b/sys/contrib/openzfs/module/avl/avl.c @@ -225,7 +225,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) size_t off = tree->avl_offset; if (node == NULL) { - ASSERT(tree->avl_root == NULL); + ASSERT0P(tree->avl_root); return (NULL); } data = AVL_NODE2DATA(node, off); @@ -478,7 +478,7 @@ avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) size_t off = tree->avl_offset; #ifdef _LP64 - ASSERT(((uintptr_t)new_data & 0x7) == 0); + ASSERT0(((uintptr_t)new_data & 0x7)); #endif node = AVL_DATA2NODE(new_data, off); @@ -495,10 +495,10 @@ avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) AVL_SETBALANCE(node, 0); AVL_SETPARENT(node, parent); if (parent != NULL) { - ASSERT(parent->avl_child[which_child] == NULL); + ASSERT0P(parent->avl_child[which_child]); parent->avl_child[which_child] = node; } else { - ASSERT(tree->avl_root == NULL); + ASSERT0P(tree->avl_root); tree->avl_root = node; } /* @@ -608,7 +608,7 @@ avl_insert_here( ASSERT(diff > 0 ? 
child == 1 : child == 0); #endif } - ASSERT(node->avl_child[child] == NULL); + ASSERT0P(node->avl_child[child]); avl_insert(tree, new_data, AVL_MKINDEX(node, child)); } @@ -881,7 +881,7 @@ avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), ASSERT(size > 0); ASSERT(size >= offset + sizeof (avl_node_t)); #ifdef _LP64 - ASSERT((offset & 0x7) == 0); + ASSERT0((offset & 0x7)); #endif tree->avl_compar = compar; @@ -897,8 +897,8 @@ void avl_destroy(avl_tree_t *tree) { ASSERT(tree); - ASSERT(tree->avl_numnodes == 0); - ASSERT(tree->avl_root == NULL); + ASSERT0(tree->avl_numnodes); + ASSERT0P(tree->avl_root); } diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c index c2a982b5a376..3cfa5b8165ce 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c @@ -46,6 +46,9 @@ #define IMPL_CYCLE (UINT32_MAX-1) #ifdef CAN_USE_GCM_ASM #define IMPL_AVX (UINT32_MAX-2) +#if CAN_USE_GCM_ASM >= 2 +#define IMPL_AVX2 (UINT32_MAX-3) +#endif #endif #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t icp_gcm_impl = IMPL_FASTEST; @@ -56,17 +59,16 @@ static uint32_t user_sel_impl = IMPL_FASTEST; boolean_t gcm_avx_can_use_movbe = B_FALSE; /* * Whether to use the optimized openssl gcm and ghash implementations. - * Set to true if module parameter icp_gcm_impl == "avx". */ -static boolean_t gcm_use_avx = B_FALSE; -#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) +static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC; +#define GCM_IMPL_USED (*(volatile gcm_impl *)&gcm_impl_used) extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *); static inline boolean_t gcm_avx_will_work(void); -static inline void gcm_set_avx(boolean_t); -static inline boolean_t gcm_toggle_avx(void); -static inline size_t gcm_simd_get_htab_size(boolean_t); +static inline boolean_t gcm_avx2_will_work(void); +static inline void gcm_use_impl(gcm_impl impl); +static inline gcm_impl gcm_toggle_impl(void); static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, crypto_data_t *, size_t); @@ -89,7 +91,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (ctx->impl != GCM_IMPL_GENERIC) return (gcm_mode_encrypt_contiguous_blocks_avx( ctx, data, length, out, block_size)); #endif @@ -208,7 +210,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, { (void) copy_block; #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (ctx->impl != GCM_IMPL_GENERIC) return (gcm_encrypt_final_avx(ctx, out, block_size)); #endif @@ -374,7 +376,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM - if (ctx->gcm_use_avx == B_TRUE) + if (ctx->impl != GCM_IMPL_GENERIC) return (gcm_decrypt_final_avx(ctx, out, block_size)); #endif @@ -631,23 +633,23 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap; if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { - gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; + gcm_ctx->impl = GCM_IMPL_USED; } else { /* - * Handle the "cycle" implementation by creating avx and - * non-avx contexts alternately. + * Handle the "cycle" implementation by creating different + * contexts, one per implementation. 
*/ - gcm_ctx->gcm_use_avx = gcm_toggle_avx(); + gcm_ctx->impl = gcm_toggle_impl(); - /* The avx impl. doesn't handle byte swapped key schedules. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) { - gcm_ctx->gcm_use_avx = B_FALSE; + /* The AVX impl. doesn't handle byte swapped key schedules. */ + if (needs_bswap == B_TRUE) { + gcm_ctx->impl = GCM_IMPL_GENERIC; } /* - * If this is a GCM context, use the MOVBE and the BSWAP + * If this is an AVX context, use the MOVBE and the BSWAP * variants alternately. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && + if (gcm_ctx->impl == GCM_IMPL_AVX && zfs_movbe_available() == B_TRUE) { (void) atomic_toggle_boolean_nv( (volatile boolean_t *)&gcm_avx_can_use_movbe); @@ -658,12 +660,13 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, * still they could be created by the aes generic implementation. * Make sure not to use them since we'll corrupt data if we do. */ - if (gcm_ctx->gcm_use_avx == B_TRUE && needs_bswap == B_TRUE) { - gcm_ctx->gcm_use_avx = B_FALSE; + if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) { + gcm_ctx->impl = GCM_IMPL_GENERIC; cmn_err_once(CE_WARN, "ICP: Can't use the aes generic or cycle implementations " - "in combination with the gcm avx implementation!"); + "in combination with the gcm avx or avx2-vaes " + "implementation!"); cmn_err_once(CE_WARN, "ICP: Falling back to a compatible implementation, " "aes-gcm performance will likely be degraded."); @@ -672,36 +675,20 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, "restore performance."); } - /* Allocate Htab memory as needed. */ - if (gcm_ctx->gcm_use_avx == B_TRUE) { - size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); - - if (htab_len == 0) { - return (CRYPTO_MECHANISM_PARAM_INVALID); - } - gcm_ctx->gcm_htab_len = htab_len; - gcm_ctx->gcm_Htable = - kmem_alloc(htab_len, KM_SLEEP); - - if (gcm_ctx->gcm_Htable == NULL) { - return (CRYPTO_HOST_MEMORY); - } + /* + * AVX implementations use Htable with sizes depending on + * implementation. + */ + if (gcm_ctx->impl != GCM_IMPL_GENERIC) { + rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len, + block_size); } - /* Avx and non avx context initialization differs from here on. */ - if (gcm_ctx->gcm_use_avx == B_FALSE) { + else #endif /* ifdef CAN_USE_GCM_ASM */ - if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size, - encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - } -#ifdef CAN_USE_GCM_ASM - } else { - if (gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len, - block_size) != CRYPTO_SUCCESS) { - rv = CRYPTO_MECHANISM_PARAM_INVALID; - } + if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size, + encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; } -#endif /* ifdef CAN_USE_GCM_ASM */ return (rv); } @@ -767,6 +754,9 @@ gcm_impl_get_ops(void) break; #ifdef CAN_USE_GCM_ASM case IMPL_AVX: +#if CAN_USE_GCM_ASM >= 2 + case IMPL_AVX2: +#endif /* * Make sure that we return a valid implementation while * switching to the avx implementation since there still @@ -828,6 +818,13 @@ gcm_impl_init(void) * Use the avx implementation if it's available and the implementation * hasn't changed from its default value of fastest on module load. 
*/ +#if CAN_USE_GCM_ASM >= 2 + if (gcm_avx2_will_work()) { + if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { + gcm_use_impl(GCM_IMPL_AVX2); + } + } else +#endif if (gcm_avx_will_work()) { #ifdef HAVE_MOVBE if (zfs_movbe_available() == B_TRUE) { @@ -835,7 +832,7 @@ gcm_impl_init(void) } #endif if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { - gcm_set_avx(B_TRUE); + gcm_use_impl(GCM_IMPL_AVX); } } #endif @@ -852,6 +849,7 @@ static const struct { { "fastest", IMPL_FASTEST }, #ifdef CAN_USE_GCM_ASM { "avx", IMPL_AVX }, + { "avx2-vaes", IMPL_AVX2 }, #endif }; @@ -887,7 +885,13 @@ gcm_impl_set(const char *val) /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM +#if CAN_USE_GCM_ASM >= 2 /* Ignore avx implementation if it won't work. */ + if (gcm_impl_opts[i].sel == IMPL_AVX2 && + !gcm_avx2_will_work()) { + continue; + } +#endif if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } @@ -915,11 +919,17 @@ gcm_impl_set(const char *val) * Use the avx implementation if available and the requested one is * avx or fastest. */ +#if CAN_USE_GCM_ASM >= 2 + if (gcm_avx2_will_work() == B_TRUE && + (impl == IMPL_AVX2 || impl == IMPL_FASTEST)) { + gcm_use_impl(GCM_IMPL_AVX2); + } else +#endif if (gcm_avx_will_work() == B_TRUE && (impl == IMPL_AVX || impl == IMPL_FASTEST)) { - gcm_set_avx(B_TRUE); + gcm_use_impl(GCM_IMPL_AVX); } else { - gcm_set_avx(B_FALSE); + gcm_use_impl(GCM_IMPL_GENERIC); } #endif @@ -952,6 +962,12 @@ icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM /* Ignore avx implementation if it won't work. */ +#if CAN_USE_GCM_ASM >= 2 + if (gcm_impl_opts[i].sel == IMPL_AVX2 && + !gcm_avx2_will_work()) { + continue; + } +#endif if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } @@ -993,9 +1009,6 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); /* Clear the FPU registers since they hold sensitive internal state. */ #define clear_fpu_regs() clear_fpu_regs_avx() -#define GHASH_AVX(ctx, in, len) \ - gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ - in, len) #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) @@ -1010,20 +1023,77 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); static uint32_t gcm_avx_chunk_size = ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; +/* + * GCM definitions: uint128_t is copied from include/crypto/modes.h + * Avoiding u128 because it is already defined in kernel sources. 
+ */ +typedef struct { + uint64_t hi, lo; +} uint128_t; + extern void ASMABI clear_fpu_regs_avx(void); extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst); extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); +#if CAN_USE_GCM_ASM >= 2 +extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16], + const uint64_t H[2]); +#endif extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, const uint8_t *in, size_t len); +#if CAN_USE_GCM_ASM >= 2 +extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2], + const uint64_t *Htable, const uint8_t *in, size_t len); +#endif +static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len) +{ + switch (ctx->impl) { +#if CAN_USE_GCM_ASM >= 2 + case GCM_IMPL_AVX2: + gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash, + (const uint64_t *)ctx->gcm_Htable, in, len); + break; +#endif + + case GCM_IMPL_AVX: + gcm_ghash_avx(ctx->gcm_ghash, + (const uint64_t *)ctx->gcm_Htable, in, len); + break; + + default: + VERIFY(B_FALSE); + } +} +typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *, + size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *); extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); +#if CAN_USE_GCM_ASM >= 2 +extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in, + uint8_t *out, size_t len, const void *key, const uint8_t ivec[16], + const uint128_t Htable[16], uint8_t Xi[16]); +#endif +typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *, + size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *); extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); +#if CAN_USE_GCM_ASM >= 2 +extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in, + uint8_t *out, size_t len, const void *key, const uint8_t ivec[16], + const uint128_t Htable[16], uint8_t Xi[16]); +#endif + +static inline boolean_t +gcm_avx2_will_work(void) +{ + return (kfpu_allowed() && + zfs_avx2_available() && zfs_vaes_available() && + zfs_vpclmulqdq_available()); +} static inline boolean_t gcm_avx_will_work(void) @@ -1035,33 +1105,67 @@ gcm_avx_will_work(void) } static inline void -gcm_set_avx(boolean_t val) +gcm_use_impl(gcm_impl impl) { - if (gcm_avx_will_work() == B_TRUE) { - atomic_swap_32(&gcm_use_avx, val); + switch (impl) { +#if CAN_USE_GCM_ASM >= 2 + case GCM_IMPL_AVX2: + if (gcm_avx2_will_work() == B_TRUE) { + atomic_swap_32(&gcm_impl_used, impl); + return; + } + + zfs_fallthrough; +#endif + + case GCM_IMPL_AVX: + if (gcm_avx_will_work() == B_TRUE) { + atomic_swap_32(&gcm_impl_used, impl); + return; + } + + zfs_fallthrough; + + default: + atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC); } } static inline boolean_t -gcm_toggle_avx(void) +gcm_impl_will_work(gcm_impl impl) { - if (gcm_avx_will_work() == B_TRUE) { - return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); - } else { - return (B_FALSE); + switch (impl) { +#if CAN_USE_GCM_ASM >= 2 + case GCM_IMPL_AVX2: + return (gcm_avx2_will_work()); +#endif + + case GCM_IMPL_AVX: + return (gcm_avx_will_work()); + + default: + return (B_TRUE); } } -static inline size_t -gcm_simd_get_htab_size(boolean_t simd_mode) +static inline gcm_impl +gcm_toggle_impl(void) { - switch (simd_mode) { - case B_TRUE: - return (2 * 6 * 2 * sizeof (uint64_t)); + gcm_impl current_impl, 
new_impl; + do { /* handle races */ + current_impl = atomic_load_32(&gcm_impl_used); + new_impl = current_impl; + while (B_TRUE) { /* handle incompatble implementations */ + new_impl = (new_impl + 1) % GCM_IMPL_MAX; + if (gcm_impl_will_work(new_impl)) { + break; + } + } - default: - return (0); - } + } while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) != + current_impl); + + return (new_impl); } @@ -1077,6 +1181,50 @@ gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; } +static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out, + size_t len, const void *key, uint64_t *iv, const uint64_t *Htable, + uint64_t *Xip) +{ + (void) Htable; + return (aesni_gcm_encrypt(in, out, len, key, iv, Xip)); +} + +#if CAN_USE_GCM_ASM >= 2 +// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four +// bits of a |size_t|. +// This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc +static const size_t kSizeTWithoutLower4Bits = (size_t)-16; + +/* The following CRYPTO methods are from boringssl/crypto/internal.h */ +static inline uint32_t CRYPTO_bswap4(uint32_t x) { + return (__builtin_bswap32(x)); +} + +static inline uint32_t CRYPTO_load_u32_be(const void *in) { + uint32_t v; + memcpy(&v, in, sizeof (v)); + return (CRYPTO_bswap4(v)); +} + +static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { + v = CRYPTO_bswap4(v); + memcpy(out, &v, sizeof (v)); +} + +static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out, + size_t len, const void *key, uint64_t *iv, const uint64_t *Htable, + uint64_t *Xip) +{ + uint8_t *ivec = (uint8_t *)iv; + len &= kSizeTWithoutLower4Bits; + aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec, + (const uint128_t *)Htable, (uint8_t *)Xip); + CRYPTO_store_u32_be(&ivec[12], + CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return (len); +} +#endif /* if CAN_USE_GCM_ASM >= 2 */ + /* * Encrypt multiple blocks of data in GCM mode. * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines @@ -1091,8 +1239,15 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, size_t done = 0; uint8_t *datap = (uint8_t *)data; size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + aesni_gcm_encrypt_impl *encrypt_blocks = +#if CAN_USE_GCM_ASM >= 2 + ctx->impl == GCM_IMPL_AVX2 ? + aesni_gcm_encrypt_avx2 : +#endif + aesni_gcm_encrypt_avx; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint64_t *ghash = ctx->gcm_ghash; + uint64_t *htable = ctx->gcm_Htable; uint64_t *cb = ctx->gcm_cb; uint8_t *ct_buf = NULL; uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; @@ -1156,8 +1311,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, /* Do the bulk encryption in chunk_size blocks. */ for (; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); - done = aesni_gcm_encrypt( - datap, ct_buf, chunk_size, key, cb, ghash); + done = encrypt_blocks( + datap, ct_buf, chunk_size, key, cb, htable, ghash); clear_fpu_regs(); kfpu_end(); @@ -1180,7 +1335,8 @@ gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, /* Bulk encrypt the remaining data. 
*/ kfpu_begin(); if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { - done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); + done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable, + ghash); if (done == 0) { rv = CRYPTO_FAILED; goto out; @@ -1293,6 +1449,29 @@ gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) return (CRYPTO_SUCCESS); } +static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out, + size_t len, const void *key, uint64_t *iv, const uint64_t *Htable, + uint64_t *Xip) +{ + (void) Htable; + return (aesni_gcm_decrypt(in, out, len, key, iv, Xip)); +} + +#if CAN_USE_GCM_ASM >= 2 +static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out, + size_t len, const void *key, uint64_t *iv, const uint64_t *Htable, + uint64_t *Xip) +{ + uint8_t *ivec = (uint8_t *)iv; + len &= kSizeTWithoutLower4Bits; + aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec, + (const uint128_t *)Htable, (uint8_t *)Xip); + CRYPTO_store_u32_be(&ivec[12], + CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return (len); +} +#endif /* if CAN_USE_GCM_ASM >= 2 */ + /* * Finalize decryption: We just have accumulated crypto text, so now we * decrypt it here inplace. @@ -1306,10 +1485,17 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) B_FALSE); size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + aesni_gcm_decrypt_impl *decrypt_blocks = +#if CAN_USE_GCM_ASM >= 2 + ctx->impl == GCM_IMPL_AVX2 ? + aesni_gcm_decrypt_avx2 : +#endif + aesni_gcm_decrypt_avx; size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; uint8_t *datap = ctx->gcm_pt_buf; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint32_t *cb = (uint32_t *)ctx->gcm_cb; + uint64_t *htable = ctx->gcm_Htable; uint64_t *ghash = ctx->gcm_ghash; uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; int rv = CRYPTO_SUCCESS; @@ -1322,8 +1508,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) */ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); - done = aesni_gcm_decrypt(datap, datap, chunk_size, - (const void *)key, ctx->gcm_cb, ghash); + done = decrypt_blocks(datap, datap, chunk_size, + (const void *)key, ctx->gcm_cb, htable, ghash); clear_fpu_regs(); kfpu_end(); if (done != chunk_size) { @@ -1334,8 +1520,8 @@ gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) /* Decrypt remainder, which is less than chunk size, in one go. */ kfpu_begin(); if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { - done = aesni_gcm_decrypt(datap, datap, bleft, - (const void *)key, ctx->gcm_cb, ghash); + done = decrypt_blocks(datap, datap, bleft, + (const void *)key, ctx->gcm_cb, htable, ghash); if (done == 0) { clear_fpu_regs(); kfpu_end(); @@ -1424,13 +1610,42 @@ gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len, ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==, B_FALSE); + size_t htab_len = 0; +#if CAN_USE_GCM_ASM >= 2 + if (ctx->impl == GCM_IMPL_AVX2) { + /* + * BoringSSL's API specifies uint128_t[16] for htab; but only + * uint128_t[12] are used. 
+ * See https://github.com/google/boringssl/blob/ + * 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/ + * modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200 + */ + htab_len = (2 * 8 * sizeof (uint128_t)); + } else +#endif /* CAN_USE_GCM_ASM >= 2 */ + { + htab_len = (2 * 6 * sizeof (uint128_t)); + } + + ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP); + if (ctx->gcm_Htable == NULL) { + return (CRYPTO_HOST_MEMORY); + } + /* Init H (encrypt zero block) and create the initial counter block. */ memset(H, 0, sizeof (ctx->gcm_H)); kfpu_begin(); aes_encrypt_intel(keysched, aes_rounds, (const uint32_t *)H, (uint32_t *)H); - gcm_init_htab_avx(ctx->gcm_Htable, H); +#if CAN_USE_GCM_ASM >= 2 + if (ctx->impl == GCM_IMPL_AVX2) { + gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H); + } else +#endif /* if CAN_USE_GCM_ASM >= 2 */ + { + gcm_init_htab_avx(ctx->gcm_Htable, H); + } if (iv_len == 12) { memcpy(cb, iv, 12); diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c index 343591cd9691..ef3c1806e4b6 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/modes.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c @@ -171,7 +171,7 @@ gcm_clear_ctx(gcm_ctx_t *ctx) explicit_memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder)); explicit_memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); #if defined(CAN_USE_GCM_ASM) - if (ctx->gcm_use_avx == B_TRUE) { + if (ctx->impl != GCM_IMPL_GENERIC) { ASSERT3P(ctx->gcm_Htable, !=, NULL); explicit_memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len); kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c index 6d3bcca9f995..dcb0a391dda4 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c @@ -38,11 +38,14 @@ kfpu_begin(); E(s, d, b); kfpu_end(); \ } +#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__PPC64__) /* some implementation is always okay */ static inline boolean_t sha2_is_supported(void) { return (B_TRUE); } +#endif #if defined(__x86_64) diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c index 2efd9fcf4c99..a85a71a83df4 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c @@ -38,11 +38,14 @@ kfpu_begin(); E(s, d, b); kfpu_end(); \ } +#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__aarch64__) || defined(__arm__) || defined(__PPC64__) /* some implementation is always okay */ static inline boolean_t sha2_is_supported(void) { return (B_TRUE); } +#endif #if defined(__x86_64) diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl new file mode 100644 index 000000000000..04c03a37e0cb --- /dev/null +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl @@ -0,0 +1,253 @@ +BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL +licensing. Files that are completely new have a Google copyright and an ISC +license. This license is reproduced at the bottom of this file. 
+ +Contributors to BoringSSL are required to follow the CLA rules for Chromium: +https://cla.developers.google.com/clas + +Files in third_party/ have their own licenses, as described therein. The MIT +license, for third_party/fiat, which, unlike other third_party directories, is +compiled into non-test libraries, is included below. + +The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the +OpenSSL License and the original SSLeay license apply to the toolkit. See below +for the actual license texts. Actually both licenses are BSD-style Open Source +licenses. In case of any license issues related to OpenSSL please contact +openssl-core@openssl.org. + +The following are Google-internal bug numbers where explicit permission from +some authors is recorded for use of their work. (This is purely for our own +record keeping.) + 27287199 + 27287880 + 27287883 + 263291445 + + + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). 
This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] 
+ */ + + +ISC license used for completely new code in BoringSSL: + +/* Copyright 2015 The BoringSSL Authors + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + + +The code in third_party/fiat carries the MIT license: + +Copyright (c) 2015-2016 the fiat-crypto authors (see +https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS). + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +Licenses for support code +------------------------- + +Parts of the TLS test suite are under the Go license. This code is not included +in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +distributing code linked against BoringSSL does not trigger this license: + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +BoringSSL uses the Chromium test infrastructure to run a continuous build, +trybots etc. The scripts which manage this, and the script for generating build +metadata, are under the Chromium license. Distributing code linked against +BoringSSL does not trigger this license. + +Copyright 2015 The Chromium Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip new file mode 100644 index 000000000000..f63a67a4d2ae --- /dev/null +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.boringssl.descrip @@ -0,0 +1 @@ +PORTIONS OF AES GCM and GHASH FUNCTIONALITY diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S new file mode 100644 index 000000000000..3d1b045127e2 --- /dev/null +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-avx2-vaes.S @@ -0,0 +1,1323 @@ +// SPDX-License-Identifier: Apache-2.0 +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_VAES) && defined(HAVE_VPCLMULQDQ) + +#define _ASM +#include <sys/asm_linkage.h> + +/* Windows userland links with OpenSSL */ +#if !defined (_WIN32) || defined (_KERNEL) + +.section .rodata +.balign 16 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.balign 32 + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.Linc_2blocks: +.quad 2, 0 +.quad 2, 0 + +ENTRY_ALIGN(gcm_init_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + + + vmovdqu (%rsi),%xmm3 + // KCF/ICP stores H in network byte order with the hi qword first + // so we need to swap all bytes, not the 2 qwords. + vmovdqu .Lbswap_mask(%rip),%xmm4 + vpshufb %xmm4,%xmm3,%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 .Lgfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm1 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + + vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + RET + +.cfi_endproc +SET_SIZE(gcm_init_vpclmulqdq_avx2) +ENTRY_ALIGN(gcm_gmult_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu .Lbswap_mask(%rip),%xmm1 + vmovdqu 128-16(%rsi),%xmm2 + vmovdqu .Lgfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpxor %xmm4,%xmm5,%xmm5 + vpxor %xmm6,%xmm5,%xmm5 + 
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpxor %xmm5,%xmm0,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + + + RET + +.cfi_endproc +SET_SIZE(gcm_gmult_vpclmulqdq_avx2) +ENTRY_ALIGN(gcm_ghash_vpclmulqdq_avx2, 32) +.cfi_startproc + +ENDBR + + + + + + + vmovdqu .Lbswap_mask(%rip),%xmm6 + vmovdqu .Lgfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + cmpq $32,%rcx + jb .Lghash_lastblock + + + + vinserti128 $1,%xmm6,%ymm6,%ymm6 + vinserti128 $1,%xmm7,%ymm7,%ymm7 + + cmpq $127,%rcx + jbe .Lghash_loop_1x + + + vmovdqu 128(%rsi),%ymm8 + vmovdqu 128+32(%rsi),%ymm9 +.Lghash_loop_4x: + + vmovdqu 0(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 0(%rsi),%ymm2 + vpxor %ymm5,%ymm1,%ymm1 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4 + + vmovdqu 32(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 32(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu 64(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + + vmovdqu 96(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 96(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm5,%ymm4,%ymm4 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0 + vpshufd $0x4e,%ymm3,%ymm3 + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0 + vpshufd $0x4e,%ymm4,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm0,%ymm5,%ymm5 + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + + subq $-128,%rdx + addq $-128,%rcx + cmpq $127,%rcx + ja .Lghash_loop_4x + + + cmpq $32,%rcx + jb .Lghash_loop_1x_done +.Lghash_loop_1x: + vmovdqu (%rdx),%ymm0 + vpshufb %ymm6,%ymm0,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vmovdqu 128-32(%rsi),%ymm0 + vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm2,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1 + vpshufd $0x4e,%ymm2,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm1,%ymm5,%ymm5 + + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + addq $32,%rdx + subq $32,%rcx + cmpq $32,%rcx + jae .Lghash_loop_1x +.Lghash_loop_1x_done: + + +.Lghash_lastblock: + testq %rcx,%rcx + jz .Lghash_done + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + 
vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +.Lghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + RET + +.cfi_endproc +SET_SIZE(gcm_ghash_vpclmulqdq_avx2) +ENTRY_ALIGN(aes_gcm_enc_update_vaes_avx2, 32) +.cfi_startproc + +ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+6(%rip) +#endif + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 504(%rcx),%r10d // ICP has a larger offset for rounds. + leal -24(,%r10,4),%r10d // ICP uses 10,12,14 not 9,11,13 for rounds. + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe .Lghash_last_ciphertext_4x__func1 +.balign 16 +.Lcrypt_loop_4x__func1: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func1 + je .Laes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func1: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb 
%ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + 
vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq 
$16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func1 + je .Lxor_two_blocks__func1 + +.Lxor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func1: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func1: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + RET + +.cfi_endproc +SET_SIZE(aes_gcm_enc_update_vaes_avx2) +ENTRY_ALIGN(aes_gcm_dec_update_vaes_avx2, 32) +.cfi_startproc + +ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 504(%rcx),%r10d // ICP has a larger offset for rounds. + leal -24(,%r10,4),%r10d // ICP uses 10,12,14 not 9,11,13 for rounds. 
+ + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.balign 16 +.Lcrypt_loop_4x__func2: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func2: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq 
%ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func2 + je 
.Lxor_two_blocks__func2 + +.Lxor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func2: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func2: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + RET + +.cfi_endproc +SET_SIZE(aes_gcm_dec_update_vaes_avx2) + +#endif /* !_WIN32 || _KERNEL */ + +/* Mark the stack non-executable. */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/sys/contrib/openzfs/module/icp/core/kcf_sched.c b/sys/contrib/openzfs/module/icp/core/kcf_sched.c index 759f0d81d521..75e1052a4ed4 100644 --- a/sys/contrib/openzfs/module/icp/core/kcf_sched.c +++ b/sys/contrib/openzfs/module/icp/core/kcf_sched.c @@ -124,7 +124,7 @@ kcf_context_cache_destructor(void *buf, void *cdrarg) (void) cdrarg; kcf_context_t *kctx = (kcf_context_t *)buf; - ASSERT(kctx->kc_refcnt == 0); + ASSERT0(kctx->kc_refcnt); } void diff --git a/sys/contrib/openzfs/module/icp/include/modes/modes.h b/sys/contrib/openzfs/module/icp/include/modes/modes.h index ca734cf4f045..de11d9eafafb 100644 --- a/sys/contrib/openzfs/module/icp/include/modes/modes.h +++ b/sys/contrib/openzfs/module/icp/include/modes/modes.h @@ -42,7 +42,7 @@ extern "C" { */ #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) -#define CAN_USE_GCM_ASM +#define CAN_USE_GCM_ASM (HAVE_VAES && HAVE_VPCLMULQDQ ? 2 : 1) extern boolean_t gcm_avx_can_use_movbe; #endif @@ -129,6 +129,15 @@ typedef struct ccm_ctx { #define ccm_copy_to ccm_common.cc_copy_to #define ccm_flags ccm_common.cc_flags +#ifdef CAN_USE_GCM_ASM +typedef enum gcm_impl { + GCM_IMPL_GENERIC = 0, + GCM_IMPL_AVX, + GCM_IMPL_AVX2, + GCM_IMPL_MAX, +} gcm_impl; +#endif + /* * gcm_tag_len: Length of authentication tag. 
* @@ -174,7 +183,7 @@ typedef struct gcm_ctx { uint64_t gcm_len_a_len_c[2]; uint8_t *gcm_pt_buf; #ifdef CAN_USE_GCM_ASM - boolean_t gcm_use_avx; + enum gcm_impl impl; #endif } gcm_ctx_t; diff --git a/sys/contrib/openzfs/module/icp/io/aes.c b/sys/contrib/openzfs/module/icp/io/aes.c index ba703efa71fc..ca586eaf97ef 100644 --- a/sys/contrib/openzfs/module/icp/io/aes.c +++ b/sys/contrib/openzfs/module/icp/io/aes.c @@ -236,16 +236,16 @@ aes_encrypt_atomic(crypto_mechanism_t *mechanism, aes_xor_block); if (ret != CRYPTO_SUCCESS) goto out; - ASSERT(aes_ctx.ac_remainder_len == 0); + ASSERT0(aes_ctx.ac_remainder_len); } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE) { ret = gcm_encrypt_final((gcm_ctx_t *)&aes_ctx, ciphertext, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) goto out; - ASSERT(aes_ctx.ac_remainder_len == 0); + ASSERT0(aes_ctx.ac_remainder_len); } else { - ASSERT(aes_ctx.ac_remainder_len == 0); + ASSERT0(aes_ctx.ac_remainder_len); } if (plaintext != ciphertext) { @@ -337,7 +337,7 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, ret = ccm_decrypt_final((ccm_ctx_t *)&aes_ctx, plaintext, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); - ASSERT(aes_ctx.ac_remainder_len == 0); + ASSERT0(aes_ctx.ac_remainder_len); if ((ret == CRYPTO_SUCCESS) && (ciphertext != plaintext)) { plaintext->cd_length = @@ -349,7 +349,7 @@ aes_decrypt_atomic(crypto_mechanism_t *mechanism, ret = gcm_decrypt_final((gcm_ctx_t *)&aes_ctx, plaintext, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); - ASSERT(aes_ctx.ac_remainder_len == 0); + ASSERT0(aes_ctx.ac_remainder_len); if ((ret == CRYPTO_SUCCESS) && (ciphertext != plaintext)) { plaintext->cd_length = diff --git a/sys/contrib/openzfs/module/nvpair/nvpair.c b/sys/contrib/openzfs/module/nvpair/nvpair.c index 811cfc87d7a4..eb8c14b4a783 100644 --- a/sys/contrib/openzfs/module/nvpair/nvpair.c +++ b/sys/contrib/openzfs/module/nvpair/nvpair.c @@ -265,7 +265,7 @@ nv_priv_alloc_embedded(nvpriv_t *priv) static int nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets) { - ASSERT3P(priv->nvp_hashtable, ==, NULL); + ASSERT0P(priv->nvp_hashtable); ASSERT0(priv->nvp_nbuckets); ASSERT0(priv->nvp_nentries); @@ -334,7 +334,7 @@ nvt_lookup_name_type(const nvlist_t *nvl, const char *name, data_type_t type) i_nvp_t **tab = priv->nvp_hashtable; if (tab == NULL) { - ASSERT3P(priv->nvp_list, ==, NULL); + ASSERT0P(priv->nvp_list); ASSERT0(priv->nvp_nbuckets); ASSERT0(priv->nvp_nentries); return (NULL); @@ -540,7 +540,7 @@ nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) /* insert link at the beginning of the bucket */ i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); - ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); + ASSERT0P(new_entry->nvi_hashtable_next); new_entry->nvi_hashtable_next = bucket; // cppcheck-suppress nullPointerRedundantCheck tab[index] = new_entry; diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c index 6d198fad5203..ae6e36d988c2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c @@ -160,7 +160,7 @@ kmem_cache_create(const char *name, size_t bufsize, size_t align, { kmem_cache_t *cache; - ASSERT3P(vmp, ==, NULL); + ASSERT0P(vmp); cache = kmem_alloc(sizeof (*cache), KM_SLEEP); strlcpy(cache->kc_name, name, sizeof (cache->kc_name)); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c index 
f9125a067cd1..3f360d167b17 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c @@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...) va_end(ap); } +/* + * Check if the current thread is a memory reclaim thread. + * Returns true if curproc is pageproc (FreeBSD's page daemon). + */ +int +current_is_reclaim_thread(void) +{ + return (curproc == pageproc); +} SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, opensolaris_utsname_init, NULL); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c index 9da633c2b1be..3c2d39b20c09 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c @@ -256,7 +256,7 @@ sysevent_worker(void *arg __unused) * free `ze`, so just inline the free() here -- events have already * been drained. */ - VERIFY3P(ze->ze_zevent, ==, NULL); + VERIFY0P(ze->ze_zevent); kmem_free(ze, sizeof (zfs_zevent_t)); kthread_exit(); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c index 733c2bd07ebb..9d5f025423a1 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c @@ -43,6 +43,7 @@ const int zfs_vm_pagerret_bad = VM_PAGER_BAD; const int zfs_vm_pagerret_error = VM_PAGER_ERROR; const int zfs_vm_pagerret_ok = VM_PAGER_OK; +const int zfs_vm_pagerret_pend = VM_PAGER_PEND; const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC; const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c index fbf67f6a14a8..4bf487cdc469 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c @@ -507,7 +507,7 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { - ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0P(aiter->iter_mapaddr); ASSERT0(aiter->iter_mapsize); /* There's nothing left to advance to, so do nothing */ @@ -526,7 +526,7 @@ abd_iter_map(struct abd_iter *aiter) { void *paddr; - ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0P(aiter->iter_mapaddr); ASSERT0(aiter->iter_mapsize); /* There's nothing left to iterate over, so do nothing */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c index 364bbfc60abd..26cc7981bfcd 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c @@ -156,7 +156,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, if (dbp[0]->db_offset != 0 || numbufs > 1) { for (i = 0; i < numbufs; i++) { ASSERT(ISP2(dbp[i]->db_size)); - ASSERT3U((dbp[i]->db_offset % dbp[i]->db_size), ==, 0); + ASSERT0((dbp[i]->db_offset % dbp[i]->db_size)); ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); } } @@ -175,7 +175,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, vm_page_sunbusy(m); break; } - ASSERT3U(m->dirty, ==, 0); + ASSERT0(m->dirty); ASSERT(!pmap_page_is_write_mapped(m)); ASSERT3U(db->db_size, >, PAGE_SIZE); @@ -201,7 +201,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, if (m != bogus_page) { vm_page_assert_xbusied(m); ASSERT(vm_page_none_valid(m)); 
- ASSERT3U(m->dirty, ==, 0); + ASSERT0(m->dirty); ASSERT(!pmap_page_is_write_mapped(m)); va = zfs_map_page(m, &sf); } @@ -295,7 +295,7 @@ dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, vm_page_sunbusy(m); break; } - ASSERT3U(m->dirty, ==, 0); + ASSERT0(m->dirty); ASSERT(!pmap_page_is_write_mapped(m)); ASSERT3U(db->db_size, >, PAGE_SIZE); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c index c114db14a916..b218c0da8125 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c @@ -112,7 +112,6 @@ static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; -static eventhandler_tag zfs_mountroot_event_tag; #define ZFS_MIN_KSTACK_PAGES 4 @@ -311,9 +310,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused) zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); - zfs_mountroot_event_tag = EVENTHANDLER_REGISTER( - mountroot, spa_boot_init, NULL, - SI_ORDER_ANY); } return (err); case MOD_UNLOAD: @@ -322,9 +318,6 @@ zfs_modevent(module_t mod, int type, void *unused __unused) if (zfs_shutdown_event_tag != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, zfs_shutdown_event_tag); - if (zfs_mountroot_event_tag != NULL) - EVENTHANDLER_DEREGISTER(mountroot, - zfs_mountroot_event_tag); } return (err); case MOD_SHUTDOWN: diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index ace2360c032d..393bfaa65ff5 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -188,11 +188,6 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS) return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, - CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_max, "LU", - "Maximum ARC size in bytes (LEGACY)"); - int param_set_arc_min(SYSCTL_HANDLER_ARGS) { @@ -217,11 +212,6 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS) return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, - CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_min, "LU", - "Minimum ARC size in bytes (LEGACY)"); - extern uint_t zfs_arc_free_target; int @@ -245,16 +235,6 @@ param_set_arc_free_target(SYSCTL_HANDLER_ARGS) return (0); } -/* - * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on - * pagedaemon initialization. 
- */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, - CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_free_target, "IU", - "Desired number of free pages below which ARC triggers reclaim" - " (LEGACY)"); - int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { @@ -273,187 +253,6 @@ param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, - CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_no_grow_shift, "I", - "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); - -extern uint64_t l2arc_write_max; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, - CTLFLAG_RWTUN, &l2arc_write_max, 0, - "Max write bytes per interval (LEGACY)"); - -extern uint64_t l2arc_write_boost; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, - CTLFLAG_RWTUN, &l2arc_write_boost, 0, - "Extra write bytes during device warmup (LEGACY)"); - -extern uint64_t l2arc_headroom; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, - CTLFLAG_RWTUN, &l2arc_headroom, 0, - "Number of max device writes to precache (LEGACY)"); - -extern uint64_t l2arc_headroom_boost; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, - CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, - "Compressed l2arc_headroom multiplier (LEGACY)"); - -extern uint64_t l2arc_feed_secs; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, - CTLFLAG_RWTUN, &l2arc_feed_secs, 0, - "Seconds between L2ARC writing (LEGACY)"); - -extern uint64_t l2arc_feed_min_ms; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, - CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, - "Min feed interval in milliseconds (LEGACY)"); - -extern int l2arc_noprefetch; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, - CTLFLAG_RWTUN, &l2arc_noprefetch, 0, - "Skip caching prefetched buffers (LEGACY)"); - -extern int l2arc_feed_again; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, - CTLFLAG_RWTUN, &l2arc_feed_again, 0, - "Turbo L2ARC warmup (LEGACY)"); - -extern int l2arc_norw; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, - CTLFLAG_RWTUN, &l2arc_norw, 0, - "No reads during writes (LEGACY)"); - -static int -param_get_arc_state_size(SYSCTL_HANDLER_ARGS) -{ - arc_state_t *state = (arc_state_t *)arg1; - int64_t val; - - val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + - zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); - return (sysctl_handle_64(oidp, &val, 0, req)); -} - -extern arc_state_t ARC_anon; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_anon, 0, param_get_arc_state_size, "Q", - "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in anonymous state"); - -extern arc_state_t ARC_mru; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_mru, 0, param_get_arc_state_size, "Q", - "size of mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in mru state"); - -extern arc_state_t ARC_mru_ghost; - -SYSCTL_PROC(_vfs_zfs, 
OID_AUTO, mru_ghost_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", - "size of mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in mru ghost state"); - -extern arc_state_t ARC_mfu; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_mfu, 0, param_get_arc_state_size, "Q", - "size of mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in mfu state"); - -extern arc_state_t ARC_mfu_ghost; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", - "size of mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in mfu ghost state"); - -extern arc_state_t ARC_uncached; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_uncached, 0, param_get_arc_state_size, "Q", - "size of uncached state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, - &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in uncached state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, - &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in uncached state"); - -extern arc_state_t ARC_l2c_only; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_l2c_only, 0, param_get_arc_state_size, "Q", - "size of l2c_only state"); - -/* dbuf.c */ - -/* dmu.c */ - -/* dmu_zfetch.c */ - -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); - -extern uint32_t zfetch_max_distance; - -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, - CTLFLAG_RWTUN, &zfetch_max_distance, 0, - "Max bytes to prefetch per stream (LEGACY)"); - -extern uint32_t zfetch_max_idistance; - -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, - CTLFLAG_RWTUN, &zfetch_max_idistance, 0, - "Max bytes to prefetch indirects for per stream (LEGACY)"); - -/* dsl_pool.c */ - -/* dnode.c */ - -/* dsl_scan.c */ - /* metaslab.c */ int @@ -514,19 +313,6 @@ SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); -extern uint_t zfs_remove_max_segment; - -SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, - CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, - "Largest contiguous segment ZFS will attempt to allocate when removing" - " a device"); - -extern int zfs_removal_suspend_progress; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, - CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, - "Ensures certain actions can happen 
while in the middle of a removal"); - /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy @@ -749,12 +535,6 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), - param_set_min_auto_ashift, "IU", - "Min ashift used when creating new top-level vdev. (LEGACY)"); - int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { @@ -774,13 +554,6 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), - param_set_max_auto_ashift, "IU", - "Max ashift used when optimizing for logical -> physical sector size on" - " new top-level vdevs. (LEGACY)"); - /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. @@ -802,23 +575,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); -extern int vdev_validate_skip; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, - CTLFLAG_RDTUN, &vdev_validate_skip, 0, - "Enable to bypass vdev_validate()."); - -/* vdev_mirror.c */ - -/* vdev_queue.c */ - -extern uint_t zfs_vdev_max_active; - -SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, - CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, - "The maximum number of I/Os of all types active for each device." - " (LEGACY)"); - /* zio.c */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c index c8ab7cc7cf8e..bbd1dafc69be 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c @@ -1236,7 +1236,7 @@ vdev_geom_io_done(zio_t *zio) struct bio *bp = zio->io_bio; if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { - ASSERT3P(bp, ==, NULL); + ASSERT0P(bp); return; } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c index 334264f6da2f..cb5787269db2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c @@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) int count = 0; zfs_acl_phys_t acl_phys; - if (zp->z_zfsvfs->z_replay == B_FALSE) { + if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } @@ -1632,7 +1632,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); } else - ASSERT3P(dzp->z_vnode, ==, NULL); + ASSERT0P(dzp->z_vnode); memset(acl_ids, 0, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -2014,7 +2014,7 @@ top: error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(error); - ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT0P(zp->z_acl_cached); zp->z_acl_cached = aclp; if (fuid_dirtied) @@ -2357,10 +2357,42 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, * In FreeBSD, we don't care about permissions of individual ADS. 
* Note that not checking them is not just an optimization - without * this shortcut, EA operations may bogusly fail with EACCES. + * + * If this is a named attribute lookup, do the checks. */ +#if __FreeBSD_version >= 1500040 + if ((zp->z_pflags & ZFS_XATTR) && (flags & V_NAMEDATTR) == 0) +#else if (zp->z_pflags & ZFS_XATTR) +#endif return (0); + /* + * If a named attribute directory then validate against base file + */ + if (is_attr) { + if ((error = zfs_zget(ZTOZSB(zp), + zp->z_xattr_parent, &xzp)) != 0) { + return (error); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); /* diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c index 8d0ff9b25e30..4de48e013ec4 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c @@ -357,7 +357,7 @@ zfsctl_create(zfsvfs_t *zfsvfs) vnode_t *rvp; uint64_t crtime[2]; - ASSERT3P(zfsvfs->z_ctldir, ==, NULL); + ASSERT0P(zfsvfs->z_ctldir); snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR); @@ -494,7 +494,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) vap->va_uid = 0; vap->va_gid = 0; - vap->va_rdev = 0; + vap->va_rdev = NODEV; /* * We are a purely virtual object, so we have no * blocksize or allocated blocks. @@ -688,6 +688,8 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) * count to return is 0. 
*/ if (zfs_uio_offset(&uio) == 3 * sizeof (entry)) { + if (eofp != NULL) + *eofp = 1; return (0); } @@ -1367,7 +1369,7 @@ zfsctl_snapshot_unmount(const char *snapname, int flags __unused) int err = getzfsvfs(snapname, &zfsvfs); if (err != 0) { - ASSERT3P(zfsvfs, ==, NULL); + ASSERT0P(zfsvfs); return (0); } vfsp = zfsvfs->z_vfs; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c index 191df832d726..75ba2ea0cb9e 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c @@ -273,7 +273,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_unlinked); - ASSERT3U(zp->z_links, ==, 0); + ASSERT0(zp->z_links); VERIFY0(zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); @@ -437,7 +437,7 @@ zfs_rmnode(znode_t *zp) uint64_t count; int error; - ASSERT3U(zp->z_links, ==, 0); + ASSERT0(zp->z_links); if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c index 21e5f7938f9f..ca13569a1235 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c @@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; return (zfs_file_write_impl(fp, buf, count, &off, resid)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c index 493ac9f69ad4..79b784288911 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -455,8 +455,13 @@ zfs_sync(vfs_t *vfsp, int waitfor) return (0); } - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, 0); + if (zfsvfs->z_log != NULL) { + error = zil_commit(zfsvfs->z_log, 0); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } + } zfs_exit(zfsvfs, FTAG); } else { @@ -1091,7 +1096,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) if (mounting) { boolean_t readonly; - ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); + ASSERT0P(zfsvfs->z_kstat.dk_kstats); error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); if (error) return (error); @@ -1209,6 +1214,8 @@ zfs_set_fuid_feature(zfsvfs_t *zfsvfs) zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } +extern int zfs_xattr_compat; + static int zfs_domount(vfs_t *vfsp, char *osname) { @@ -1289,6 +1296,16 @@ zfs_domount(vfs_t *vfsp, char *osname) goto out; } +#if __FreeBSD_version >= 1500040 + /* + * Named attributes can only work if the xattr property is set to + * on/dir and not sa. Also, zfs_xattr_compat must be set. 
+ */ + if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa && + zfs_xattr_compat) + vfsp->mnt_flag |= MNT_NAMEDATTR; +#endif + vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) @@ -1812,6 +1829,14 @@ zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) err = vn_lock(*vpp, flags); if (err != 0) vrele(*vpp); +#if __FreeBSD_version >= 1500040 + else if ((zp->z_pflags & ZFS_XATTR) != 0) { + if ((*vpp)->v_type == VDIR) + vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR); + else + vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR); + } +#endif } if (err != 0) *vpp = NULL; @@ -1964,9 +1989,17 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) *vpp = ZTOV(zp); zfs_exit(zfsvfs, FTAG); err = vn_lock(*vpp, flags); - if (err == 0) + if (err == 0) { vnode_create_vobject(*vpp, zp->z_size, curthread); - else +#if __FreeBSD_version >= 1500040 + if ((zp->z_pflags & ZFS_XATTR) != 0) { + if ((*vpp)->v_type == VDIR) + vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR); + else + vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR); + } +#endif + } else *vpp = NULL; return (err); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 8a5006c488f3..411225786089 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -25,6 +25,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2025, Klara, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ @@ -60,6 +61,7 @@ #include <sys/fs/zfs.h> #include <sys/dmu.h> #include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/dbuf.h> @@ -114,6 +116,8 @@ typedef uint64_t cookie_t; typedef ulong_t cookie_t; #endif +static int zfs_check_attrname(const char *name); + /* * Programming rules. * @@ -385,7 +389,9 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, error = vn_lock(vp, LK_EXCLUSIVE); if (error) return (error); + vn_seqc_write_begin(vp); error = zfs_ioctl_setxattr(vp, fsx, cred); + vn_seqc_write_end(vp); VOP_UNLOCK(vp); return (error); } @@ -813,7 +819,12 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, /* * Do we have permission to get into attribute directory? 
*/ - error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL); + if (flags & LOOKUP_NAMED_ATTR) + error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR, + B_FALSE, cr, NULL); + else + error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, + NULL); if (error) { vrele(ZTOV(zp)); } @@ -1093,7 +1104,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, zfs_exit(zfsvfs, FTAG); return (error); } - ASSERT3P(zp, ==, NULL); + ASSERT0P(zp); /* * Create a new file object and update the directory @@ -1185,8 +1196,8 @@ out: *zpp = zp; } - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1315,9 +1326,8 @@ out: if (xzp) vrele(ZTOV(xzp)); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1474,7 +1484,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, zfs_exit(zfsvfs, FTAG); return (error); } - ASSERT3P(zp, ==, NULL); + ASSERT0P(zp); if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, mnt_ns))) { @@ -1548,8 +1558,8 @@ out: getnewvnode_drop_reserve(); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1629,8 +1639,8 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) if (zfsvfs->z_use_namecache) cache_vop_rmdir(dvp, vp); out: - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1728,7 +1738,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, /* * Quit if directory has been removed (posix) */ - if ((*eofp = zp->z_unlinked) != 0) { + if ((*eofp = (zp->z_unlinked != 0)) != 0) { zfs_exit(zfsvfs, FTAG); return (0); } @@ -2006,7 +2016,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); else - vap->va_rdev = 0; + vap->va_rdev = NODEV; vap->va_gen = zp->z_gen; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. 
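Several hunks above stop calling zil_commit(zilog, 0) fire-and-forget and instead capture its return value, and they only do so when the VOP itself already succeeded, so a commit failure can surface to the caller without masking an earlier error. A minimal userspace sketch of that propagation rule; do_op(), commit_log() and sync_always are stand-ins invented for the example (the VOP body, zil_commit() and os_sync == ZFS_SYNC_ALWAYS in the real code):

#include <stdio.h>

static int sync_always = 1;	/* models os_sync == ZFS_SYNC_ALWAYS */

static int
do_op(void)
{
	return (0);
}

static int
commit_log(void)
{
	return (0);
}

static int
vop_example(void)
{
	int error = do_op();

	/*
	 * Commit the log only when the operation itself succeeded and the
	 * dataset asks for synchronous semantics.  A commit failure then
	 * becomes the caller's error, but it can never mask an earlier
	 * failure from do_op().
	 */
	if (error == 0 && sync_always)
		error = commit_log();

	return (error);
}

int
main(void)
{
	printf("vop_example() -> %d\n", vop_example());
	return (0);
}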
*/ vap->va_filerev = zp->z_seq; @@ -2196,6 +2206,7 @@ zfs_setattr_dir(znode_t *dzp) if (err) break; + vn_seqc_write_begin(ZTOV(zp)); mutex_enter(&dzp->z_lock); if (zp->z_uid != dzp->z_uid) { @@ -2245,6 +2256,7 @@ sa_add_projid_err: dmu_tx_abort(tx); } tx = NULL; + vn_seqc_write_end(ZTOV(zp)); if (err != 0 && err != ENOENT) break; @@ -3001,8 +3013,8 @@ out: } out2: - if (os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS) + err = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (err); @@ -3531,7 +3543,7 @@ out_seq: out: if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -3723,7 +3735,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + error = zil_commit(zilog, 0); } zfs_exit(zfsvfs, FTAG); @@ -3913,8 +3925,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, vnevent_link(ZTOV(szp), ct); } - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -4299,6 +4311,43 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap) ap->a_rahead)); } +typedef struct { + uint_t pca_npages; + vm_page_t pca_pages[]; +} putpage_commit_arg_t; + +static void +zfs_putpage_commit_cb(void *arg, int err) +{ + putpage_commit_arg_t *pca = arg; + vm_object_t object = pca->pca_pages[0]->object; + + zfs_vmobject_wlock(object); + + for (uint_t i = 0; i < pca->pca_npages; i++) { + vm_page_t pp = pca->pca_pages[i]; + + if (err == 0) { + /* + * Writeback succeeded, so undirty the page. If it + * fails, we leave it in the same state it was. That's + * most likely dirty, so it will get tried again some + * other time. + */ + vm_page_undirty(pp); + } + + vm_page_sunbusy(pp); + } + + vm_object_pip_wakeupn(object, pca->pca_npages); + + zfs_vmobject_wunlock(object); + + kmem_free(pca, + offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages])); +} + static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) @@ -4400,10 +4449,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, } if (zp->z_blksz < PAGE_SIZE) { - for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { - tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; + vm_ooffset_t woff = off; + size_t wlen = len; + for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) { + tocopy = MIN(PAGE_SIZE, wlen); va = zfs_map_page(ma[i], &sf); - dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); + dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx); zfs_unmap_page(sf); } } else { @@ -4424,19 +4475,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); - /* - * XXX we should be passing a callback to undirty - * but that would make the locking messier - */ - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, - len, commit, B_FALSE, NULL, NULL); - zfs_vmobject_wlock(object); - for (i = 0; i < ncount; i++) { - rtvals[i] = zfs_vm_pagerret_ok; - vm_page_undirty(ma[i]); + if (commit) { + /* + * Caller requested that we commit immediately. We set + * a callback on the log entry, to be called once its + * on disk after the call to zil_commit() below. 
The + * pages will be undirtied and unbusied there. + */ + putpage_commit_arg_t *pca = kmem_alloc( + offsetof(putpage_commit_arg_t, pca_pages[ncount]), + KM_SLEEP); + pca->pca_npages = ncount; + memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount); + + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, + B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca); + + for (i = 0; i < ncount; i++) + rtvals[i] = zfs_vm_pagerret_pend; + } else { + /* + * Caller just wants the page written back somewhere, + * but doesn't need it committed yet. We've already + * written it back to the DMU, so we just need to put + * it on the async log, then undirty the page and + * return. + * + * We cannot use a callback here, because it would keep + * the page busy (locked) until it is eventually + * written down at txg sync. + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, + B_FALSE, B_FALSE, NULL, NULL); + + zfs_vmobject_wlock(object); + for (i = 0; i < ncount; i++) { + rtvals[i] = zfs_vm_pagerret_ok; + vm_page_undirty(ma[i]); + } + zfs_vmobject_wunlock(object); } - zfs_vmobject_wunlock(object); + VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); } @@ -4444,8 +4524,13 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, out: zfs_rangelock_exit(lr); - if (commit) - zil_commit(zfsvfs->z_log, zp->z_id); + if (commit) { + err = zil_commit(zfsvfs->z_log, zp->z_id); + if (err != 0) { + zfs_exit(zfsvfs, FTAG); + return (err); + } + } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len); @@ -4707,8 +4792,16 @@ zfs_freebsd_access(struct vop_access_args *ap) * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); - if (accmode != 0) - error = zfs_access(zp, accmode, 0, ap->a_cred); + if (accmode != 0) { +#if __FreeBSD_version >= 1500040 + /* For named attributes, do the checks. */ + if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0) + error = zfs_access(zp, accmode, V_NAMEDATTR, + ap->a_cred); + else +#endif + error = zfs_access(zp, accmode, 0, ap->a_cred); + } /* * VADMIN has to be handled by vaccess(). 
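The new putpage_commit_arg_t carries the busied pages into the ZIL completion callback as a flexible array member, sized and freed via offsetof() over the element count. A self-contained sketch of that allocate/copy/callback/free shape, using malloc()/free() and an int array in place of kmem_alloc()/kmem_free() and vm_page_t; commit_arg_t, commit_cb() and the item values are invented for the illustration:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
	unsigned int ca_count;
	int ca_items[];			/* flexible array member */
} commit_arg_t;

/* Completion callback: consumes the argument it was handed. */
static void
commit_cb(void *arg, int err)
{
	commit_arg_t *ca = arg;

	for (unsigned int i = 0; i < ca->ca_count; i++) {
		if (err == 0)
			printf("item %d completed\n", ca->ca_items[i]);
	}

	/* kmem_free(..., offsetof(...)) in the kernel version. */
	free(ca);
}

int
main(void)
{
	int items[] = { 1, 2, 3 };
	unsigned int n = 3;

	/*
	 * Size the allocation from the element count, mirroring the
	 * offsetof() idiom used by the patch (accepted by GCC/Clang).
	 */
	commit_arg_t *ca = malloc(offsetof(commit_arg_t, ca_items[n]));
	if (ca == NULL)
		return (1);
	ca->ca_count = n;
	memcpy(ca->ca_items, items, sizeof (int) * n);

	/* Normally invoked once the log record is on disk. */
	commit_cb(ca, 0);
	return (0);
}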
@@ -4741,6 +4834,190 @@ struct vop_lookup_args { }; #endif +#if __FreeBSD_version >= 1500040 +static int +zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp, + struct vnode **vpp) +{ + struct vnode *xvp; + int error, flags; + + *vpp = NULL; + flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR; + if ((cnp->cn_flags & CREATENAMED) != 0) + flags |= CREATE_XATTR_DIR; + error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags, + B_FALSE); + if (error == 0) { + if ((cnp->cn_flags & LOCKLEAF) != 0) + error = vn_lock(xvp, cnp->cn_lkflags); + if (error == 0) { + vn_irflag_set_cond(xvp, VIRF_NAMEDDIR); + *vpp = xvp; + } else { + vrele(xvp); + } + } + return (error); +} + +static ssize_t +zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp, + int *eofflagp, struct ucred *cred, struct thread *td) +{ + struct uio io; + struct iovec iv; + zfs_uio_t uio; + int error; + + io.uio_offset = *offp; + io.uio_segflg = UIO_SYSSPACE; + io.uio_rw = UIO_READ; + io.uio_td = td; + iv.iov_base = buf; + iv.iov_len = blen; + io.uio_iov = &iv; + io.uio_iovcnt = 1; + io.uio_resid = blen; + zfs_uio_init(&uio, &io); + error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL); + if (error != 0) + return (-1); + *offp = io.uio_offset; + return (blen - io.uio_resid); +} + +static bool +zfs_has_namedattr(struct vnode *vp, struct ucred *cred) +{ + struct componentname cn; + struct vnode *xvp; + struct dirent *dp; + off_t offs; + ssize_t rsize; + char *buf, *cp, *endcp; + int eofflag, error; + bool ret; + + MNT_ILOCK(vp->v_mount); + if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) { + MNT_IUNLOCK(vp->v_mount); + return (false); + } + MNT_IUNLOCK(vp->v_mount); + + /* Now see if a named attribute directory exists. */ + cn.cn_flags = LOCKLEAF; + cn.cn_lkflags = LK_SHARED; + cn.cn_cred = cred; + error = zfs_lookup_nameddir(vp, &cn, &xvp); + if (error != 0) + return (false); + + /* It exists, so see if there is any entry other than "." and "..". */ + buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK); + ret = false; + offs = 0; + do { + rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag, + cred, curthread); + if (rsize <= 0) + break; + cp = buf; + endcp = &buf[rsize]; + while (cp < endcp) { + dp = (struct dirent *)cp; + if (dp->d_fileno != 0 && (dp->d_type == DT_REG || + dp->d_type == DT_UNKNOWN) && + !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) && + ((dp->d_namlen == 1 && dp->d_name[0] != '.') || + (dp->d_namlen == 2 && (dp->d_name[0] != '.' || + dp->d_name[1] != '.')) || dp->d_namlen > 2)) { + ret = true; + break; + } + cp += dp->d_reclen; + } + } while (!ret && rsize > 0 && eofflag == 0); + vput(xvp); + free(buf, M_TEMP); + return (ret); +} + +static int +zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) +{ + struct componentname *cnp = ap->a_cnp; + char nm[NAME_MAX + 1]; + int error; + struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp; + bool is_nameddir, needs_nameddir, opennamed = false; + + /* + * These variables are used to handle the named attribute cases: + * opennamed - Is true when this is a call from open with O_NAMEDATTR + * specified and it is the last component. + * is_nameddir - Is true when the directory is a named attribute dir. + * needs_nameddir - Is set when the lookup needs to look for/create + * a named attribute directory. It is only set when is_nameddir + * is_nameddir is false and opennamed is true. + * xvp - Is the directory that the lookup needs to be done in. 
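zfs_has_namedattr() above reports true only when the attribute directory contains an entry that is neither "." nor ".." and does not carry a forbidden namespace prefix. A small sketch of that predicate over plain strings; is_forbidden_prefix() and the "freebsd:" prefix are hypothetical stand-ins for ZFS_XA_NS_PREFIX_FORBIDDEN(), which is not shown in this diff:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for ZFS_XA_NS_PREFIX_FORBIDDEN(). */
static bool
is_forbidden_prefix(const char *name)
{
	return (strncmp(name, "freebsd:", 8) == 0);
}

/* True when the entry is a real named attribute, not "." or "..". */
static bool
is_real_attr_entry(const char *name)
{
	size_t len = strlen(name);

	if (is_forbidden_prefix(name))
		return (false);
	if (len == 1 && name[0] == '.')
		return (false);
	if (len == 2 && name[0] == '.' && name[1] == '.')
		return (false);
	return (len > 0);
}

int
main(void)
{
	const char *names[] = { ".", "..", "user.comment", "freebsd:system" };

	for (int i = 0; i < 4; i++)
		printf("%-16s -> %d\n", names[i], is_real_attr_entry(names[i]));
	return (0);
}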
+ * Usually dvp, unless needs_nameddir is true where it is the + * result of the first non-named directory lookup. + * Note that name caching must be disabled for named attribute + * handling. + */ + needs_nameddir = false; + xvp = dvp; + opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) == + (OPENNAMED | ISLASTCN); + is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0; + if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0) + return (ENOATTR); + if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0) + return (ENOATTR); + if (opennamed || is_nameddir) + cnp->cn_flags &= ~MAKEENTRY; + if (opennamed && !is_nameddir) + needs_nameddir = true; + ASSERT3U(cnp->cn_namelen, <, sizeof (nm)); + error = 0; + *vpp = NULL; + if (needs_nameddir) { + if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + error = zfs_lookup_nameddir(dvp, cnp, &xvp); + if (error == 0) + is_nameddir = true; + } + if (error == 0) { + if (!needs_nameddir || cnp->cn_namelen != 1 || + *cnp->cn_nameptr != '.') { + strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, + sizeof (nm))); + error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop, + cnp->cn_cred, 0, cached); + if (is_nameddir && error == 0 && + (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') && + (cnp->cn_flags & ISDOTDOT) == 0) { + if ((*vpp)->v_type == VDIR) + vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR); + else + vn_irflag_set_cond(*vpp, + VIRF_NAMEDATTR); + } + if (needs_nameddir && xvp != *vpp) + vput(xvp); + } else { + /* + * Lookup of "." when a named attribute dir is needed. + */ + *vpp = xvp; + } + } + return (error); +} +#else static int zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) { @@ -4753,6 +5030,7 @@ zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, 0, cached)); } +#endif static int zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) @@ -4775,7 +5053,11 @@ zfs_cache_lookup(struct vop_lookup_args *ap) zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; +#if __FreeBSD_version >= 1500040 + if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0) +#else if (zfsvfs->z_use_namecache) +#endif return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap, B_FALSE)); @@ -4798,6 +5080,11 @@ zfs_freebsd_create(struct vop_create_args *ap) vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc, mode; + struct vnode *dvp = ap->a_dvp; +#if __FreeBSD_version >= 1500040 + struct vnode *xvp; + bool is_nameddir; +#endif #if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); @@ -4808,10 +5095,36 @@ zfs_freebsd_create(struct vop_create_args *ap) zfsvfs = ap->a_dvp->v_mount->mnt_data; *ap->a_vpp = NULL; - rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode, - &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL); + rc = 0; +#if __FreeBSD_version >= 1500040 + xvp = NULL; + is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0; + if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) { + /* Needs a named attribute directory. 
*/ + rc = zfs_lookup_nameddir(dvp, cnp, &xvp); + if (rc == 0) { + dvp = xvp; + is_nameddir = true; + } + } + if (is_nameddir && rc == 0) + rc = zfs_check_attrname(cnp->cn_nameptr); +#endif + if (rc == 0) + rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode, + &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL); +#if __FreeBSD_version >= 1500040 + if (xvp != NULL) + vput(xvp); +#endif + if (rc == 0) { *ap->a_vpp = ZTOV(zp); +#if __FreeBSD_version >= 1500040 + if (is_nameddir) + vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR); +#endif + } if (zfsvfs->z_use_namecache && rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); @@ -4830,13 +5143,21 @@ struct vop_remove_args { static int zfs_freebsd_remove(struct vop_remove_args *ap) { + int error = 0; #if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); #endif - return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, - ap->a_cnp->cn_cred)); +#if __FreeBSD_version >= 1500040 + if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0) + error = zfs_check_attrname(ap->a_cnp->cn_nameptr); +#endif + + if (error == 0) + error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, + ap->a_cnp->cn_cred); + return (error); } #ifndef _SYS_SYSPROTO_H_ @@ -4921,8 +5242,32 @@ struct vop_fsync_args { static int zfs_freebsd_fsync(struct vop_fsync_args *ap) { + vnode_t *vp = ap->a_vp; + int err = 0; + + /* + * Push any dirty mmap()'d data out to the DMU and ZIL, ready for + * zil_commit() to be called in zfs_fsync(). + */ + if (vm_object_mightbedirty(vp->v_object)) { + zfs_vmobject_wlock(vp->v_object); + if (!vm_object_page_clean(vp->v_object, 0, 0, 0)) + err = SET_ERROR(EIO); + zfs_vmobject_wunlock(vp->v_object); + if (err) { + /* + * Unclear what state things are in. zfs_putpages() + * will ensure the pages remain dirty if they haven't + * been written down to the DMU, but because there may + * be nothing logged, we can't assume that zfs_sync() + * -> zil_commit() will give us a useful error. It's + * safest if we just error out here. 
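With the zfs_freebsd_fsync() change above, dirty pages of a shared mapping are pushed to the DMU before the ZIL commit, so a plain fsync(2) after storing through mmap(2) should be enough for durability without an explicit msync(2). A minimal userspace sequence exercising that path, assuming the file lives on a ZFS dataset; the path name is arbitrary:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/tmp/mmap-fsync-demo";	/* arbitrary test file */
	int fd = open(path, O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, 4096) != 0)
		return (1);

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return (1);

	/* Dirty the page through the mapping, not through write(2). */
	memcpy(p, "hello via mmap\n", 15);

	/*
	 * fsync() reaches VOP_FSYNC, which now cleans the dirty VM pages
	 * into the DMU before committing the ZIL, so the mapped data is
	 * expected to be covered by the commit.
	 */
	if (fsync(fd) != 0)
		perror("fsync");

	munmap(p, 4096);
	close(fd);
	return (0);
}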
+ */ + return (err); + } + } - return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred)); + return (zfs_fsync(VTOZ(vp), 0, ap->a_td->td_ucred)); } #ifndef _SYS_SYSPROTO_H_ @@ -4994,6 +5339,11 @@ zfs_freebsd_getattr(struct vop_getattr_args *ap) #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; + +#if __FreeBSD_version >= 1500040 + if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0) + vap->va_bsdflags |= SFBSD_NAMEDATTR; +#endif return (0); } @@ -5136,15 +5486,24 @@ zfs_freebsd_rename(struct vop_rename_args *ap) vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; - int error; + int error = 0; #if __FreeBSD_version < 1400068 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); #endif - error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, - ap->a_tcnp, ap->a_fcnp->cn_cred); +#if __FreeBSD_version >= 1500040 + if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) { + error = zfs_check_attrname(ap->a_fcnp->cn_nameptr); + if (error == 0) + error = zfs_check_attrname(ap->a_tcnp->cn_nameptr); + } +#endif + + if (error == 0) + error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, + ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); @@ -5373,6 +5732,9 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap) { ulong_t val; int error; +#ifdef _PC_CLONE_BLKSIZE + zfsvfs_t *zfsvfs; +#endif error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); @@ -5398,12 +5760,48 @@ zfs_freebsd_pathconf(struct vop_pathconf_args *ap) return (0); } return (EINVAL); +#if __FreeBSD_version >= 1500040 + case _PC_NAMEDATTR_ENABLED: + MNT_ILOCK(ap->a_vp->v_mount); + if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0) + *ap->a_retval = 1; + else + *ap->a_retval = 0; + MNT_IUNLOCK(ap->a_vp->v_mount); + return (0); + case _PC_HAS_NAMEDATTR: + if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred)) + *ap->a_retval = 1; + else + *ap->a_retval = 0; + return (0); +#endif +#ifdef _PC_HAS_HIDDENSYSTEM + case _PC_HAS_HIDDENSYSTEM: + *ap->a_retval = 1; + return (0); +#endif +#ifdef _PC_CLONE_BLKSIZE + case _PC_CLONE_BLKSIZE: + zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data; + if (zfs_bclone_enabled && + spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), + SPA_FEATURE_BLOCK_CLONING)) + *ap->a_retval = dsl_dataset_feature_is_active( + zfsvfs->z_os->os_dsl_dataset, + SPA_FEATURE_LARGE_BLOCKS) ? 
+ SPA_MAXBLOCKSIZE : + SPA_OLD_MAXBLOCKSIZE; + else + *ap->a_retval = 0; + return (0); +#endif default: return (vop_stdpathconf(ap)); } } -static int zfs_xattr_compat = 1; +int zfs_xattr_compat = 1; static int zfs_check_attrname(const char *name) @@ -6436,9 +6834,11 @@ zfs_deallocate(struct vop_deallocate_args *ap) if (error == 0) { if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS || (ap->a_ioflag & IO_SYNC) != 0) - zil_commit(zilog, zp->z_id); - *ap->a_offset = off + len; - *ap->a_len = 0; + error = zil_commit(zilog, zp->z_id); + if (error == 0) { + *ap->a_offset = off + len; + *ap->a_len = 0; + } } zfs_exit(zfsvfs, FTAG); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c index 9bad1e13d7cc..649022ab5bcb 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c @@ -150,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_vnode = NULL; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; return (0); } @@ -163,18 +161,15 @@ zfs_znode_cache_destructor(void *buf, void *arg) znode_t *zp = buf; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - ASSERT3P(zp->z_vnode, ==, NULL); + ASSERT0P(zp->z_vnode); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); - ASSERT3P(zp->z_acl_cached, ==, NULL); - ASSERT3P(zp->z_xattr_cached, ==, NULL); - - ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); - ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); + ASSERT0P(zp->z_acl_cached); + ASSERT0P(zp->z_xattr_cached); } @@ -200,7 +195,7 @@ zfs_znode_init(void) /* * Initialize zcache */ - ASSERT3P(znode_uma_zone, ==, NULL); + ASSERT0P(znode_uma_zone); znode_uma_zone = uma_zcreate("zfs_znode_cache", sizeof (znode_t), zfs_znode_cache_constructor_smr, zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0); @@ -229,7 +224,7 @@ zfs_znode_init(void) /* * Initialize zcache */ - ASSERT3P(znode_cache, ==, NULL); + ASSERT0P(znode_cache); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE); @@ -358,8 +353,8 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); - ASSERT3P(zp->z_sa_hdl, ==, NULL); - ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT0P(zp->z_sa_hdl); + ASSERT0P(zp->z_acl_cached); if (sa_hdl == NULL) { VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); @@ -456,8 +451,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; atomic_store_ptr(&zp->z_cached_symlink, NULL); zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); @@ -824,6 +817,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; + vnode_t *vp = ZTOV(*zpp); + if (!(flag & IS_ROOT_NODE)) + vn_seqc_write_begin(vp); + if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); @@ -832,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { - 
vnode_t *vp = ZTOV(*zpp); + vn_seqc_write_end(vp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; @@ -1134,7 +1131,7 @@ zfs_rezget(znode_t *zp) } rw_exit(&zp->z_xattr_lock); - ASSERT3P(zp->z_sa_hdl, ==, NULL); + ASSERT0P(zp->z_sa_hdl); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); @@ -1305,7 +1302,7 @@ zfs_znode_free(znode_t *zp) zfsvfs_t *zfsvfs = zp->z_zfsvfs; char *symlink; - ASSERT3P(zp->z_sa_hdl, ==, NULL); + ASSERT0P(zp->z_sa_hdl); zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 212ef560db07..0dd2ecd7fd8d 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -31,7 +31,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */ @@ -196,7 +196,6 @@ DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); -static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); @@ -226,25 +225,14 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) } retry: - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - /* - * Obtain a copy of private under zvol_state_lock to make sure either - * the result of zvol free code setting private to NULL is observed, - * or the zv is protected from being freed because of the positive - * zv_open_count. - */ - zv = pp->private; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - err = SET_ERROR(ENXIO); - goto out_locked; - } + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) + return (SET_ERROR(ENXIO)); mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { - rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); - goto out_zv_locked; + goto out_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); @@ -257,8 +245,24 @@ retry: drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. + */ + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) + return (SET_ERROR(ENXIO)); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + if (zv->zv_zso->zso_dying || + zv->zv_flags & ZVOL_REMOVING) { + err = SET_ERROR(ENXIO); + goto out_locked; + } + /* Check to see if zv_suspend_lock is needed. 
*/ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -266,7 +270,6 @@ retry: } } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -294,7 +297,7 @@ retry: if (drop_namespace) mutex_exit(&spa_namespace_lock); if (err) - goto out_zv_locked; + goto out_locked; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; @@ -329,9 +332,8 @@ out_opened: zvol_last_close(zv); wakeup(zv); } -out_zv_locked: - mutex_exit(&zv->zv_state_lock); out_locked: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -345,12 +347,9 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) boolean_t drop_suspend = B_TRUE; int new_open_count; - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - zv = pp->private; - if (zv == NULL) { - rw_exit(&zvol_state_lock); + zv = atomic_load_ptr(&pp->private); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { @@ -377,6 +376,15 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_geom_open(), we don't check if + * removal started here, because we might be one of the + * openers that needs to be thrown out! If we're the + * last, we need to call zvol_last_close() below to + * finish cleanup. So, no special treatment for us. + */ + /* Check to see if zv_suspend_lock is needed. */ new_open_count = zv->zv_open_count - count; if (new_open_count != 0) { @@ -387,7 +395,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -408,20 +415,6 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) return (0); } -static void -zvol_geom_destroy(zvol_state_t *zv) -{ - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - g_topology_assert(); - - zsg->zsg_provider = NULL; - g_wither_geom(pp->geom, ENXIO); -} - void zvol_wait_close(zvol_state_t *zv) { @@ -454,7 +447,7 @@ zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", pp->name, acr, acw, ace)); - if (pp->private == NULL) { + if (atomic_load_ptr(&pp->private) == NULL) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); return (pp->error); @@ -727,9 +720,9 @@ unlock: break; } - if (commit) { + if (error == 0 && commit) { commit: - zil_commit(zv->zv_zilog, ZVOL_OBJ); + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); } resume: rw_exit(&zv->zv_suspend_lock); @@ -906,8 +899,8 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) zfs_rangelock_exit(lr); int64_t nwritten = start_resid - zfs_uio_resid(&uio); dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); - if (commit) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + if (error == 0 && commit) + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); return (error); @@ -921,25 +914,14 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) boolean_t drop_suspend = B_FALSE; retry: - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - /* - * Obtain a copy of si_drv2 under zvol_state_lock to make sure either - * the result of zvol free code setting si_drv2 to NULL is observed, - * or the zv is protected from being freed because of the 
positive - * zv_open_count. - */ - zv = dev->si_drv2; - if (zv == NULL) { - rw_exit(&zvol_state_lock); - err = SET_ERROR(ENXIO); - goto out_locked; - } + zv = atomic_load_ptr(&dev->si_drv2); + if (zv == NULL) + return (SET_ERROR(ENXIO)); mutex_enter(&zv->zv_state_lock); - if (zv->zv_zso->zso_dying) { - rw_exit(&zvol_state_lock); + if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { err = SET_ERROR(ENXIO); - goto out_zv_locked; + goto out_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); @@ -954,6 +936,13 @@ retry: mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + /* Removal started while locks were down. */ + err = SET_ERROR(ENXIO); + goto out_locked; + } + /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -961,7 +950,6 @@ retry: } } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -989,7 +977,7 @@ retry: if (drop_namespace) mutex_exit(&spa_namespace_lock); if (err) - goto out_zv_locked; + goto out_locked; } ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1016,9 +1004,8 @@ out_opened: zvol_last_close(zv); wakeup(zv); } -out_zv_locked: - mutex_exit(&zv->zv_state_lock); out_locked: + mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -1030,12 +1017,9 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; - rw_enter(&zvol_state_lock, ZVOL_RW_READER); - zv = dev->si_drv2; - if (zv == NULL) { - rw_exit(&zvol_state_lock); + zv = atomic_load_ptr(&dev->si_drv2); + if (zv == NULL) return (SET_ERROR(ENXIO)); - } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { @@ -1060,6 +1044,15 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_cdev_open(), we don't check if + * removal started here, because we might be one of the + * openers that needs to be thrown out! If we're the + * last, we need to call zvol_last_close() below to + * finish cleanup. So, no special treatment for us. + */ + /* Check to see if zv_suspend_lock is needed. 
*/ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); @@ -1069,7 +1062,6 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1101,7 +1093,8 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, int error; boolean_t sync; - zv = dev->si_drv2; + zv = atomic_load_ptr(&dev->si_drv2); + ASSERT3P(zv, !=, NULL); error = 0; KASSERT(zv->zv_open_count > 0, @@ -1117,7 +1110,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, case DIOCGFLUSH: rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); if (zv->zv_zilog != NULL) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGDELETE: @@ -1152,7 +1145,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, } zfs_rangelock_exit(lr); if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGSTRIPESIZE: @@ -1162,6 +1155,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, *(off_t *)data = 0; break; case DIOCGATTR: { + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); spa_t *spa = dmu_objset_spa(zv->zv_objset); struct diocgattr_arg *arg = (struct diocgattr_arg *)data; uint64_t refd, avail, usedobjs, availobjs; @@ -1186,6 +1180,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, arg->value.off = refd / DEV_BSIZE; } else error = SET_ERROR(ENOIOCTL); + rw_exit(&zv->zv_suspend_lock); break; } case FIOSEEKHOLE: @@ -1196,10 +1191,12 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, hole = (cmd == FIOSEEKHOLE); noff = *off; + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); zfs_rangelock_exit(lr); + rw_exit(&zv->zv_suspend_lock); *off = noff; break; } @@ -1248,9 +1245,11 @@ zvol_os_is_zvol(const char *device) return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); } -void +int zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { + int error = 0; + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1304,56 +1303,159 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; - if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) - == 0) { + error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname); + if (error == 0) { dev->si_iosize_max = maxphys; zsd->zsd_cdev = dev; } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); dataset_kstats_rename(&zv->zv_kstat, newname); + + return (error); } /* - * Remove minor node for the specified volume. + * Allocate memory for a new zvol_state_t and setup the required + * request queue and generic disk structures for the block device. 
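The reworked zvol open paths above replace the zvol_state_lock critical section with an atomic_load_ptr() of the published pointer, and must therefore revalidate ZVOL_REMOVING (or reload the pointer) every time the per-volume locks are retaken. A compact userspace sketch of that load/lock/revalidate shape with C11 atomics and a pthread mutex; dev_state_t, device_slot and dev_open() are invented names, and the final teardown that the real code delegates to destroy_dev()/g_wither_geom() is not modelled:

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct dev_state {
	pthread_mutex_t ds_lock;
	bool ds_removing;
	int ds_open_count;
} dev_state_t;

/* Published pointer; the removal path clears it before freeing the state. */
static _Atomic(dev_state_t *) device_slot;

static int
dev_open(void)
{
	dev_state_t *ds = atomic_load(&device_slot);

	if (ds == NULL)
		return (ENXIO);		/* device already gone */

	pthread_mutex_lock(&ds->ds_lock);

	/*
	 * Removal may have started between the lockless load and taking
	 * the per-device lock, so revalidate before touching anything,
	 * much like the ZVOL_REMOVING rechecks above.
	 */
	if (ds->ds_removing) {
		pthread_mutex_unlock(&ds->ds_lock);
		return (ENXIO);
	}

	ds->ds_open_count++;
	pthread_mutex_unlock(&ds->ds_lock);
	return (0);
}

int
main(void)
{
	static dev_state_t ds;

	pthread_mutex_init(&ds.ds_lock, NULL);
	atomic_store(&device_slot, &ds);
	printf("open -> %d\n", dev_open());

	/* Removal: mark dying under the lock, then unpublish the pointer. */
	pthread_mutex_lock(&ds.ds_lock);
	ds.ds_removing = true;
	pthread_mutex_unlock(&ds.ds_lock);
	atomic_store(&device_slot, NULL);

	printf("open -> %d\n", dev_open());
	return (0);
}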
*/ -void -zvol_os_free(zvol_state_t *zv) +static int +zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize, + zvol_state_t **zvp) { - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - ASSERT0(zv->zv_open_count); + zvol_state_t *zv; + uint64_t volmode; + int error; - ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), + &volmode, NULL); + if (error) + return (error); - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + if (volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + if (volmode == ZFS_VOLMODE_NONE) + return (0); + + zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); + zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_volmode = volmode; + zv->zv_volsize = volsize; + zv->zv_volblocksize = volblocksize; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp __maybe_unused = zsg->zsg_provider; + struct g_provider *pp; + struct g_geom *gp; + + g_topology_lock(); + gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); + gp->start = zvol_geom_bio_start; + gp->access = zvol_geom_access; + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = 0; + pp->private = zv; + + zsg->zsg_provider = pp; + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev; + struct make_dev_args args; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); + if (error) { + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + return (error); + } + + dev->si_iosize_max = maxphys; + zsd->zsd_cdev = dev; + knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock); + } + (void) strlcpy(zv->zv_name, name, MAXPATHLEN); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); + + *zvp = zv; + return (error); +} - ASSERT3P(pp->private, ==, NULL); +/* + * Remove minor node for the specified volume. 
+ */ +void +zvol_os_remove_minor(zvol_state_t *zv) +{ + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); + + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; + + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + atomic_store_ptr(&pp->private, NULL); + mutex_exit(&zv->zv_state_lock); g_topology_lock(); - zvol_geom_destroy(zv); + g_wither_geom(pp->geom, ENXIO); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct zvol_state_dev *zsd = &zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; + if (dev != NULL) + atomic_store_ptr(&dev->si_drv2, NULL); + mutex_exit(&zv->zv_state_lock); + if (dev != NULL) { - ASSERT3P(dev->si_drv2, ==, NULL); destroy_dev(dev); knlist_clear(&zsd->zsd_selinfo.si_note, 0); knlist_destroy(&zsd->zsd_selinfo.si_note); } } + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); + + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + mutex_destroy(&zv->zv_state_lock); cv_destroy(&zv->zv_removing_cv); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); zvol_minors--; } @@ -1364,11 +1466,11 @@ zvol_os_free(zvol_state_t *zv) int zvol_os_create_minor(const char *name) { - zvol_state_t *zv; + zvol_state_t *zv = NULL; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; - uint64_t volmode, hash, len; + uint64_t hash, len; int error; bool replayed_zil = B_FALSE; @@ -1400,74 +1502,22 @@ zvol_os_create_minor(const char *name) if (error) goto out_dmu_objset_disown; - error = dsl_prop_get_integer(name, - zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); - if (error || volmode == ZFS_VOLMODE_DEFAULT) - volmode = zvol_volmode; - error = 0; + error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv); + if (error || zv == NULL) + goto out_dmu_objset_disown; - /* - * zvol_alloc equivalent ... 
- */ - zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); zv->zv_hash = hash; - mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); - zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); - zv->zv_volmode = volmode; - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp; - struct g_geom *gp; - - g_topology_lock(); - gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); - gp->start = zvol_geom_bio_start; - gp->access = zvol_geom_access; - pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); - pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; - pp->sectorsize = DEV_BSIZE; - pp->mediasize = 0; - pp->private = zv; - - zsg->zsg_provider = pp; - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; - struct cdev *dev; - struct make_dev_args args; - - make_dev_args_init(&args); - args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; - args.mda_devsw = &zvol_cdevsw; - args.mda_cr = NULL; - args.mda_uid = UID_ROOT; - args.mda_gid = GID_OPERATOR; - args.mda_mode = 0640; - args.mda_si_drv2 = zv; - if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name) - == 0) { - dev->si_iosize_max = maxphys; - zsd->zsd_cdev = dev; - knlist_init_sx(&zsd->zsd_selinfo.si_note, - &zv->zv_state_lock); - } - } - (void) strlcpy(zv->zv_name, name, MAXPATHLEN); - rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); - zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; - zv->zv_volblocksize = doi->doi_data_block_size; - zv->zv_volsize = volsize; zv->zv_objset = os; - ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + ASSERT0P(zv->zv_kstat.dk_kstats); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; - ASSERT3P(zv->zv_zilog, ==, NULL); + ASSERT0P(zv->zv_zilog); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) @@ -1490,13 +1540,14 @@ zvol_os_create_minor(const char *name) out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); - if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { + if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) { g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0); + /* geom was locked inside zvol_alloc() function */ g_topology_unlock(); } out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); - if (error == 0) { + if (error == 0 && zv) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); zvol_minors++; @@ -1507,28 +1558,6 @@ out_doi: return (error); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - ASSERT(RW_LOCK_HELD(&zvol_state_lock)); - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - if (pp->private == NULL) /* already cleared */ - return; - - pp->private = NULL; - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; - struct cdev *dev = zsd->zsd_cdev; - - if (dev != NULL) - dev->si_drv2 = NULL; - } -} - int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c index ce9c9e39e60c..aac5f2ebbfd2 100644 --- 
a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c @@ -66,9 +66,9 @@ void __cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) { ASSERT(cvp); - ASSERT(name == NULL); + ASSERT0P(name); ASSERT(type == CV_DEFAULT); - ASSERT(arg == NULL); + ASSERT0P(arg); cvp->cv_magic = CV_MAGIC; init_waitqueue_head(&cvp->cv_event); @@ -83,7 +83,7 @@ static int cv_destroy_wakeup(kcondvar_t *cvp) { if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) { - ASSERT(cvp->cv_mutex == NULL); + ASSERT0P(cvp->cv_mutex); ASSERT(!waitqueue_active(&cvp->cv_event)); return (1); } @@ -104,7 +104,7 @@ __cv_destroy(kcondvar_t *cvp) while (cv_destroy_wakeup(cvp) == 0) wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1); - ASSERT3P(cvp->cv_mutex, ==, NULL); + ASSERT0P(cvp->cv_mutex); ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0); ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0); ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c index f37699b4347e..89ca4a648b2f 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c @@ -709,7 +709,7 @@ zone_get_hostid(void *zone) { uint32_t hostid; - ASSERT3P(zone, ==, NULL); + ASSERT0P(zone); if (spl_hostid != 0) return ((uint32_t)(spl_hostid & HW_HOSTID_MASK)); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c index fab80289b278..22e4ed169d03 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c @@ -296,7 +296,7 @@ spl_slab_free(spl_kmem_slab_t *sks, spl_kmem_cache_t *skc; ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_ref == 0); + ASSERT0(sks->sks_ref); skc = sks->sks_cache; ASSERT(skc->skc_magic == SKC_MAGIC); @@ -598,7 +598,7 @@ static void spl_magazine_free(spl_kmem_magazine_t *skm) { ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_avail == 0); + ASSERT0(skm->skm_avail); kfree(skm); } @@ -610,7 +610,7 @@ spl_magazine_create(spl_kmem_cache_t *skc) { int i = 0; - ASSERT((skc->skc_flags & KMC_SLAB) == 0); + ASSERT0((skc->skc_flags & KMC_SLAB)); skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); @@ -640,7 +640,7 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) spl_kmem_magazine_t *skm; int i = 0; - ASSERT((skc->skc_flags & KMC_SLAB) == 0); + ASSERT0((skc->skc_flags & KMC_SLAB)); for_each_possible_cpu(i) { skm = skc->skc_mag[i]; @@ -679,8 +679,8 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, /* * Unsupported flags */ - ASSERT(vmp == NULL); - ASSERT(reclaim == NULL); + ASSERT0P(vmp); + ASSERT0P(reclaim); might_sleep(); @@ -863,11 +863,11 @@ spl_kmem_cache_destroy(spl_kmem_cache_t *skc) * Validate there are no objects in use and free all the * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. 
*/ - ASSERT3U(skc->skc_slab_alloc, ==, 0); - ASSERT3U(skc->skc_obj_alloc, ==, 0); - ASSERT3U(skc->skc_slab_total, ==, 0); - ASSERT3U(skc->skc_obj_total, ==, 0); - ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT0(skc->skc_slab_alloc); + ASSERT0(skc->skc_obj_alloc); + ASSERT0(skc->skc_slab_total); + ASSERT0(skc->skc_obj_total); + ASSERT0(skc->skc_obj_emergency); ASSERT(list_empty(&skc->skc_complete_list)); ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0); @@ -986,7 +986,7 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT((skc->skc_flags & KMC_SLAB) == 0); + ASSERT0((skc->skc_flags & KMC_SLAB)); *obj = NULL; diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c index 337a4bcf76a0..9fe008cef868 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c @@ -302,13 +302,8 @@ spl_kmem_free_impl(const void *buf, size_t size) #ifdef DEBUG_KMEM /* Shim layer memory accounting */ -#ifdef HAVE_ATOMIC64_T atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long kmem_alloc_max = 0; -#else /* HAVE_ATOMIC64_T */ -atomic_t kmem_alloc_used = ATOMIC_INIT(0); -unsigned long long kmem_alloc_max = 0; -#endif /* HAVE_ATOMIC64_T */ +uint64_t kmem_alloc_max = 0; EXPORT_SYMBOL(kmem_alloc_used); EXPORT_SYMBOL(kmem_alloc_max); @@ -320,9 +315,9 @@ spl_kmem_alloc_debug(size_t size, int flags, int node) ptr = spl_kmem_alloc_impl(size, flags, node); if (ptr) { - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); + atomic64_add(size, &kmem_alloc_used); + if (unlikely(atomic64_read(&kmem_alloc_used) > kmem_alloc_max)) + kmem_alloc_max = atomic64_read(&kmem_alloc_used); } return (ptr); @@ -331,7 +326,7 @@ spl_kmem_alloc_debug(size_t size, int flags, int node) inline void spl_kmem_free_debug(const void *ptr, size_t size) { - kmem_alloc_used_sub(size); + atomic64_sub(size, &kmem_alloc_used); spl_kmem_free_impl(ptr, size); } @@ -595,7 +590,7 @@ spl_kmem_init(void) { #ifdef DEBUG_KMEM - kmem_alloc_used_set(0); + atomic64_set(&kmem_alloc_used, 0); @@ -617,9 +612,10 @@ spl_kmem_fini(void) * at that address to aid in debugging. Performance is not * a serious concern here since it is module unload time. 
*/ - if (kmem_alloc_used_read() != 0) + if (atomic64_read(&kmem_alloc_used) != 0) printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", - (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); + (unsigned long)atomic64_read(&kmem_alloc_used), + kmem_alloc_max); #ifdef DEBUG_KMEM_TRACKING spl_kmem_fini_tracking(&kmem_list, &kmem_lock); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c index 48f70b00c96b..02c5b42bc4a0 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c @@ -541,7 +541,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, kstat_t *ksp; ASSERT(ks_module); - ASSERT(ks_instance == 0); + ASSERT0(ks_instance); ASSERT(ks_name); if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c index 4ed0deedd5b9..8cdd5fc5cfe5 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c @@ -82,11 +82,7 @@ proc_domemused(CONST_CTL_TABLE *table, int write, if (write) { *ppos += *lenp; } else { -#ifdef HAVE_ATOMIC64_T val = atomic64_read((atomic64_t *)table->data); -#else - val = atomic_read((atomic_t *)table->data); -#endif /* HAVE_ATOMIC64_T */ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); } @@ -315,18 +311,14 @@ static struct ctl_table spl_kmem_table[] = { { .procname = "kmem_used", .data = &kmem_alloc_used, -#ifdef HAVE_ATOMIC64_T .maxlen = sizeof (atomic64_t), -#else - .maxlen = sizeof (atomic_t), -#endif /* HAVE_ATOMIC64_T */ .mode = 0444, .proc_handler = &proc_domemused, }, { .procname = "kmem_max", .data = &kmem_alloc_max, - .maxlen = sizeof (unsigned long), + .maxlen = sizeof (uint64_t), .extra1 = &table_min, .extra2 = &table_max, .mode = 0444, diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c index 1398483a3ac8..8f5c73b13df5 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c @@ -28,6 +28,7 @@ #include <sys/kmem.h> #include <sys/tsd.h> #include <sys/string.h> +#include <sys/misc.h> /* * Thread interfaces @@ -79,7 +80,7 @@ __thread_create(caddr_t stk, size_t stksize, thread_func_t func, /* Option pp is simply ignored */ /* Variable stack size unsupported */ - ASSERT(stk == NULL); + ASSERT0P(stk); tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE); if (tp == NULL) @@ -197,3 +198,14 @@ issig(void) } EXPORT_SYMBOL(issig); + +/* + * Check if the current thread is a memory reclaim thread. + * Returns true if current thread is kswapd. 
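With HAVE_ATOMIC64_T assumed, the DEBUG_KMEM bookkeeping above reduces to one atomic counter plus a best-effort high-water mark. The same bookkeeping in portable C11; xmalloc()/xfree() are hypothetical wrappers written for this sketch, not SPL functions:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_uint_fast64_t alloc_used;	/* mirrors kmem_alloc_used */
static uint64_t alloc_max;		/* mirrors kmem_alloc_max */

static void *
xmalloc(size_t size)
{
	void *p = malloc(size);

	if (p != NULL) {
		uint64_t used = atomic_fetch_add(&alloc_used, size) + size;
		/* Best-effort high-water mark, as in the kernel code. */
		if (used > alloc_max)
			alloc_max = used;
	}
	return (p);
}

static void
xfree(void *p, size_t size)
{
	atomic_fetch_sub(&alloc_used, size);
	free(p);
}

int
main(void)
{
	void *a = xmalloc(128);
	void *b = xmalloc(64);

	xfree(a, 128);
	xfree(b, 64);

	printf("leaked %llu bytes, peak %llu bytes\n",
	    (unsigned long long)atomic_load(&alloc_used),
	    (unsigned long long)alloc_max);
	return (0);
}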
+ */ +int +current_is_reclaim_thread(void) +{ + return (current_is_kswapd()); +} +EXPORT_SYMBOL(current_is_reclaim_thread); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c index 34a61bef7d4f..2e8cedf0dc87 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c @@ -161,7 +161,7 @@ tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value) ulong_t hash; int rc = 0; - ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL); + ASSERT0P(tsd_hash_search(table, key, pid)); /* New entry allocate structure, set value, and add to hash */ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index e1140b31a97a..8a8316f63c48 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -256,10 +256,6 @@ abd_unmark_zfs_page(struct page *page) #ifndef CONFIG_HIGHMEM -#ifndef __GFP_RECLAIM -#define __GFP_RECLAIM __GFP_WAIT -#endif - /* * The goal is to minimize fragmentation by preferentially populating ABDs * with higher order compound pages from a single zone. Allocation size is @@ -867,9 +863,9 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount) * Ensure that last chunk is not in use. abd_iterate_*() must clear * this state (directly or abd_iter_unmap()) before advancing. */ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0P(aiter->iter_mapaddr); ASSERT0(aiter->iter_mapsize); - ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0P(aiter->iter_page); ASSERT0(aiter->iter_page_doff); ASSERT0(aiter->iter_page_dsize); @@ -901,7 +897,7 @@ abd_iter_map(struct abd_iter *aiter) void *paddr; size_t offset = 0; - ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0P(aiter->iter_mapaddr); ASSERT0(aiter->iter_mapsize); /* There's nothing left to iterate over, so do nothing */ diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 154ca22d9513..1bd3500e9f66 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -471,13 +471,17 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; + rw_enter(&vd->vd_lock, RW_WRITER); + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); + v->vdev_tsd = NULL; + + rw_exit(&vd->vd_lock); rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); - v->vdev_tsd = NULL; } /* @@ -552,7 +556,7 @@ vdev_bio_associate_blkg(struct bio *bio) #endif ASSERT3P(q, !=, NULL); - ASSERT3P(bio->bi_blkg, ==, NULL); + ASSERT0P(bio->bi_blkg); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; @@ -574,7 +578,7 @@ vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) bio->bi_bdev = bdev; ASSERT3P(q, !=, NULL); - ASSERT3P(bio->bi_blkg, ==, NULL); + ASSERT0P(bio->bi_blkg); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; @@ -806,7 +810,7 @@ vbio_completion(struct bio *bio) * here; instead we stash vbio on the zio and take care of it in the * done callback. 
*/ - ASSERT3P(zio->io_bio, ==, NULL); + ASSERT0P(zio->io_bio); zio->io_bio = vbio; zio_delay_interrupt(zio); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index 1b169122f25b..daa4b5776837 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -1900,7 +1900,7 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, if (!(flag & IS_ROOT_NODE) && (dzp->z_pflags & ZFS_INHERIT_ACE) && !(dzp->z_pflags & ZFS_XATTR)) { - VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + VERIFY0(zfs_acl_node_read(dzp, B_TRUE, &paclp, B_FALSE)); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, vap->va_mode, paclp, acl_ids->z_mode, &need_chmod); @@ -2204,8 +2204,8 @@ top: } error = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT(error == 0); - ASSERT(zp->z_acl_cached == NULL); + ASSERT0(error); + ASSERT0P(zp->z_acl_cached); zp->z_acl_cached = aclp; if (fuid_dirtied) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index 84b25cb2c5ac..fb4de50480a3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -494,9 +494,9 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, if (!creation) now = current_time(ip); zp = ITOZ(ip); - ASSERT3P(zp->z_dirlocks, ==, NULL); - ASSERT3P(zp->z_acl_cached, ==, NULL); - ASSERT3P(zp->z_xattr_cached, ==, NULL); + ASSERT0P(zp->z_dirlocks); + ASSERT0P(zp->z_acl_cached); + ASSERT0P(zp->z_xattr_cached); zp->z_id = id; zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; @@ -511,8 +511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); @@ -592,7 +590,7 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, int zfsctl_create(zfsvfs_t *zfsvfs) { - ASSERT(zfsvfs->z_ctldir == NULL); + ASSERT0P(zfsvfs->z_ctldir); zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, &zpl_fops_root, &zpl_ops_root, 0); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c index 2f935bb3fc8c..e8de536606e2 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c @@ -463,7 +463,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) zfsvfs_t *zfsvfs = ZTOZSB(zp); ASSERT(zp->z_unlinked); - ASSERT(ZTOI(zp)->i_nlink == 0); + ASSERT0(ZTOI(zp)->i_nlink); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); @@ -662,8 +662,8 @@ zfs_rmnode(znode_t *zp) uint64_t links; int error; - ASSERT(ZTOI(zp)->i_nlink == 0); - ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); + ASSERT0(ZTOI(zp)->i_nlink); + ASSERT0(atomic_read(&ZTOI(zp)->i_count)); /* * If this is an attribute directory, purge its contents. 
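vdev_disk_close() now drops the block-device handle and clears vdev_tsd while holding vd_lock as writer, so readers of the handle cannot observe a half-destroyed vdev_disk_t. A small userspace sketch of that discipline with a pthread rwlock; disk_handle_t, disk_io() and disk_close() are invented for the example, and the lock destruction the real code performs once no users remain is kept outside the close path:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct disk_handle {
	pthread_rwlock_t dh_lock;
	void *dh_dev;			/* stands in for vd_bdh / vdev_tsd */
} disk_handle_t;

/* I/O path: shared lock, then check the handle before using it. */
static int
disk_io(disk_handle_t *dh)
{
	int error = 0;

	pthread_rwlock_rdlock(&dh->dh_lock);
	if (dh->dh_dev == NULL)
		error = -1;		/* closed underneath us */
	/* ...otherwise submit I/O through dh->dh_dev... */
	pthread_rwlock_unlock(&dh->dh_lock);
	return (error);
}

/*
 * Close path: release the device and clear the pointer while holding the
 * lock exclusively, so a concurrent reader never sees a freed handle.
 */
static void
disk_close(disk_handle_t *dh)
{
	pthread_rwlock_wrlock(&dh->dh_lock);
	free(dh->dh_dev);
	dh->dh_dev = NULL;
	pthread_rwlock_unlock(&dh->dh_lock);
}

int
main(void)
{
	disk_handle_t dh;

	pthread_rwlock_init(&dh.dh_lock, NULL);
	dh.dh_dev = malloc(64);

	printf("io before close: %d\n", disk_io(&dh));
	disk_close(&dh);
	printf("io after close:  %d\n", disk_io(&dh));

	pthread_rwlock_destroy(&dh.dh_lock);
	return (0);
}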
@@ -710,7 +710,7 @@ zfs_rmnode(znode_t *zp) &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); - ASSERT(error == 0); + ASSERT0(error); } acl_obj = zfs_external_acl(zp); @@ -744,12 +744,12 @@ zfs_rmnode(znode_t *zp) } if (xzp) { - ASSERT(error == 0); + ASSERT0(error); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ clear_nlink(ZTOI(xzp)); /* no more links to it */ links = 0; - VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx)); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); @@ -872,7 +872,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) ctime); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); mutex_exit(&zp->z_lock); @@ -894,7 +894,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); mutex_exit(&dzp->z_lock); return (0); @@ -986,7 +986,7 @@ zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT3U(error, ==, 0); + ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; @@ -1058,7 +1058,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, /* The only error is !zfs_dirempty() and we checked earlier. */ error = zfs_drop_nlink_locked(zp, tx, &unlinked); - ASSERT3U(error, ==, 0); + ASSERT0(error); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); @@ -1083,7 +1083,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); mutex_exit(&dzp->z_lock); if (unlinkedp != NULL) @@ -1167,7 +1167,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) ASSERT(error == 0 && parent == zp->z_id); #endif - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, sizeof (xzp->z_id), tx)); if (!zp->z_unlinked) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c index d193eb80dca2..3fdcdbac6f68 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c @@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) */ int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, - ssize_t *resid) + uint8_t ashift, ssize_t *resid) { + (void) ashift; ssize_t rc; rc = kernel_write(fp, buf, count, &off); @@ -260,24 +261,12 @@ zfs_file_fsync(zfs_file_t *filp, int flags) { int datasync = 0; int error; - int fstrans; if (flags & O_DSYNC) datasync = 1; - /* - * May enter XFS which generates a warning when PF_FSTRANS is set. - * To avoid this the flag is cleared over vfs_sync() and then reset. 
- */ - fstrans = __spl_pf_fstrans_check(); - if (fstrans) - current->flags &= ~(__SPL_PF_FSTRANS); - error = -vfs_fsync(filp, datasync); - if (fstrans) - current->flags |= __SPL_PF_FSTRANS; - return (error); } @@ -292,14 +281,6 @@ int zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len) { /* - * May enter XFS which generates a warning when PF_FSTRANS is set. - * To avoid this the flag is cleared over vfs_sync() and then reset. - */ - int fstrans = __spl_pf_fstrans_check(); - if (fstrans) - current->flags &= ~(__SPL_PF_FSTRANS); - - /* * When supported by the underlying file system preferentially * use the fallocate() callback to preallocate the space. */ @@ -308,9 +289,6 @@ zfs_file_deallocate(zfs_file_t *fp, loff_t offset, loff_t len) error = -fp->f_op->fallocate(fp, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len); - if (fstrans) - current->flags |= __SPL_PF_FSTRANS; - if (error) return (SET_ERROR(error)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c index 1c187d7b9cab..895d80b2d79e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c @@ -223,7 +223,7 @@ zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name) { /* zko_default_group.attrs must be NULL terminated */ ASSERT(zkobj->zko_default_group.attrs != NULL); - ASSERT(zkobj->zko_default_group.attrs[zkobj->zko_attr_count] == NULL); + ASSERT0P(zkobj->zko_default_group.attrs[zkobj->zko_attr_count]); kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type); return (kobject_add(&zkobj->zko_kobj, parent, name)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index a3837f784668..8a7d14ab6119 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -279,19 +279,14 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) return (err); /* - * If the pool is suspended, just return an error. This is to help - * with shutting down with pools suspended, as we don't want to block - * in that case. + * Sync any pending writes, but do not block if the pool is suspended. + * This is to help with shutting down with pools suspended, as we don't + * want to block in that case. */ - if (spa_suspended(zfsvfs->z_os->os_spa)) { - zfs_exit(zfsvfs, FTAG); - return (SET_ERROR(EIO)); - } - - zil_commit(zfsvfs->z_log, 0); + err = zil_commit_flags(zfsvfs->z_log, 0, ZIL_COMMIT_NOW); zfs_exit(zfsvfs, FTAG); - return (0); + return (err); } static void @@ -883,7 +878,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) * operations out since we closed the ZIL. */ if (mounting) { - ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); + ASSERT0P(zfsvfs->z_kstat.dk_kstats); error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); if (error) return (error); @@ -1217,6 +1212,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) } /* + * Dentry and inode caches referenced by a task in non-root memcg are + * not going to be scanned by the kernel-provided shrinker. So, if + * kernel prunes nothing, fall back to this manual walk to free dnodes. + * To avoid scanning the same znodes multiple times they are always rotated + * to the end of the z_all_znodes list. New znodes are inserted at the + * end of the list so we're always scanning the oldest znodes first. 
+ */ +static int +zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) +{ + znode_t **zp_array, *zp; + int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); + int objects = 0; + int i = 0, j = 0; + + zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + + mutex_enter(&zfsvfs->z_znodes_lock); + while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { + + if ((i++ > nr_to_scan) || (j >= max_array)) + break; + + ASSERT(list_link_active(&zp->z_link_node)); + list_remove(&zfsvfs->z_all_znodes, zp); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + + /* Skip active znodes and .zfs entries */ + if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) + continue; + + if (igrab(ZTOI(zp)) == NULL) + continue; + + zp_array[j] = zp; + j++; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + for (i = 0; i < j; i++) { + zp = zp_array[i]; + + ASSERT3P(zp, !=, NULL); + d_prune_aliases(ZTOI(zp)); + + if (atomic_read(&ZTOI(zp)->i_count) == 1) + objects++; + + zrele(zp); + } + + vmem_free(zp_array, max_array * sizeof (znode_t *)); + + return (objects); +} + +/* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. @@ -1267,6 +1319,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) *objects = (*shrinker->scan_objects)(shrinker, &sc); #endif + /* + * Fall back to zfs_prune_aliases if kernel's shrinker did nothing + * due to dentry and inode caches being referenced by a task running + * in non-root memcg. + */ + if (*objects == 0) + *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); + zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, @@ -1496,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; +#ifdef HAVE_SET_DEFAULT_D_OP + set_default_d_op(sb, &zpl_dentry_operations); +#else + sb->s_d_op = &zpl_dentry_operations; +#endif + /* Set features for file system. 
*/ zfs_set_fuid_feature(zfsvfs); @@ -1611,7 +1677,7 @@ zfs_umount(struct super_block *sb) if (zfsvfs->z_arc_prune != NULL) arc_remove_prune_callback(zfsvfs->z_arc_prune); - VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE)); os = zfsvfs->z_os; /* @@ -1737,8 +1803,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) ASSERT(*ipp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { - VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, - 0, kcred, NULL, NULL) == 0); + VERIFY0(zfsctl_root_lookup(*ipp, "snapshot", ipp, + 0, kcred, NULL, NULL)); } else { /* * Must have an existing ref, so igrab() @@ -1840,7 +1906,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; - VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE)); zfs_set_fuid_feature(zfsvfs); zfsvfs->z_rollback_time = jiffies; @@ -2013,7 +2079,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); - VERIFY(0 == sa_set_sa_object(os, sa_obj)); + VERIFY0(sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index ed9721dade76..6106726651a3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -25,6 +25,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2025, Klara, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ @@ -840,8 +841,8 @@ out: *zpp = zp; } - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1202,8 +1203,8 @@ out: zfs_zrele_async(xzp); } - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1391,14 +1392,15 @@ out: zfs_dirent_unlock(dl); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); + } zfs_exit(zfsvfs, FTAG); return (error); @@ -1527,8 +1529,8 @@ out: zfs_znode_update_vfs(zp); zrele(zp); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -2482,10 +2484,10 @@ top: new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); - ASSERT(err == 0); + ASSERT0(err); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); - ASSERT(err == 0); + ASSERT0(err); } } @@ -2599,7 +2601,7 @@ out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); - ASSERT(err2 == 0); + ASSERT0(err2); } if (aclp) @@ -2629,8 +2631,8 @@ out: } out2: - if (os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS) + err = zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); @@ -3156,7 +3158,7 @@ top: * zfs_link_create() to add back the same entry, 
but with a new * dnode (szp), should not fail. */ - ASSERT3P(tzp, ==, NULL); + ASSERT0P(tzp); goto commit_link_tzp; } @@ -3234,8 +3236,8 @@ out: zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -3435,7 +3437,7 @@ top: *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + error = zil_commit(zilog, 0); } else { zrele(zp); } @@ -3653,8 +3655,8 @@ top: * operation are sync safe. */ if (is_tmpfile) { - VERIFY(zap_remove_int(zfsvfs->z_os, - zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); + VERIFY0(zap_remove_int(zfsvfs->z_os, + zfsvfs->z_unlinkedobj, szp->z_id, tx)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; @@ -3669,18 +3671,20 @@ top: zfs_dirent_unlock(dl); - if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - txg_wait_flag_t wait_flags = - spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) == - ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0; - error = txg_wait_synced_flags(dmu_objset_pool(zfsvfs->z_os), - txg, wait_flags); - if (error != 0) { - ASSERT3U(error, ==, ESHUTDOWN); - error = SET_ERROR(EIO); + if (error == 0) { + if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); + + if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + txg_wait_flag_t wait_flags = + spa_get_failmode(dmu_objset_spa(zfsvfs->z_os)) == + ZIO_FAILURE_MODE_CONTINUE ? TXG_WAIT_SUSPEND : 0; + error = txg_wait_synced_flags( + dmu_objset_pool(zfsvfs->z_os), txg, wait_flags); + if (error != 0) { + ASSERT3U(error, ==, ESHUTDOWN); + error = SET_ERROR(EIO); + } } } @@ -3690,24 +3694,39 @@ top: return (error); } -static void -zfs_putpage_sync_commit_cb(void *arg) +/* Finish page writeback. */ +static inline void +zfs_page_writeback_done(struct page *pp, int err) { - struct page *pp = arg; + if (err != 0) { + /* + * Writeback failed. Re-dirty the page. It was undirtied before + * the IO was issued (in zfs_putpage() or write_cache_pages()). + * The kernel only considers writeback for dirty pages; if we + * don't do this, it is eligible for eviction without being + * written out, which we definitely don't want. + */ +#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO + filemap_dirty_folio(page_mapping(pp), page_folio(pp)); +#else + __set_page_dirty_nobuffers(pp); +#endif + } ClearPageError(pp); end_page_writeback(pp); } +/* + * ZIL callback for page writeback. Passes to zfs_log_write() in zfs_putpage() + * for syncing writes. Called when the ZIL itx has been written to the log or + * the whole txg syncs, or if the ZIL crashes or the pool suspends. Any failure + * is passed as `err`. + */ static void -zfs_putpage_async_commit_cb(void *arg) +zfs_putpage_commit_cb(void *arg, int err) { - struct page *pp = arg; - znode_t *zp = ITOZ(pp->mapping->host); - - ClearPageError(pp); - end_page_writeback(pp); - atomic_dec_32(&zp->z_async_writes_cnt); + zfs_page_writeback_done(arg, err); } /* @@ -3827,15 +3846,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Speed up any non-sync page writebacks since - * they may take several seconds to complete. - * Refer to the comment in zpl_fsync() for details. 
- */ - if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - zil_commit(zfsvfs->z_log, zp->z_id); - } - if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); @@ -3861,8 +3871,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; - if (!for_sync) - atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); @@ -3874,18 +3882,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, err = dmu_tx_assign(tx, DMU_TX_WAIT); if (err != 0) { dmu_tx_abort(tx); -#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO - filemap_dirty_folio(page_mapping(pp), page_folio(pp)); -#else - __set_page_dirty_nobuffers(pp); -#endif - ClearPageError(pp); - end_page_writeback(pp); - if (!for_sync) - atomic_dec_32(&zp->z_async_writes_cnt); + zfs_page_writeback_done(pp, err); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); - return (err); + + /* + * Don't return error for an async writeback; we've re-dirtied + * the page so it will be tried again some other time. + */ + return (for_sync ? err : 0); } va = kmap(pp); @@ -3908,36 +3913,70 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - boolean_t commit = B_FALSE; - if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Note that this is rarely called under writepages(), because - * writepages() normally handles the entire commit for - * performance reasons. - */ - commit = B_TRUE; - } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { + /* + * A note about for_sync vs wbc->sync_mode. + * + * for_sync indicates that this is a syncing writeback, that is, kernel + * caller expects the data to be durably stored before being notified. + * Often, but not always, the call was triggered by a userspace syncing + * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE + * means that that page should remain "locked" (in the writeback state) + * until it is definitely on disk (ie zil_commit() or spa_sync()). + * Otherwise, we can unlock and return as soon as it is on the + * in-memory ZIL. + * + * wbc->sync_mode has similar meaning. wbc is passed from the kernel to + * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE + * indicates this a regular async writeback (eg a cache eviction) and + * so does not need a durability guarantee, while WB_SYNC_ALL indicates + * a syncing op that must be waited on (by convention, we test for + * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over + * performance should there ever be a new mode that we have not yet + * added support for). + * + * So, why a separate for_sync field? This is because zpl_writepages() + * calls zfs_putpage() multiple times for a single "logical" operation. + * It wants all the individual pages to be for_sync==TRUE ie only + * unlocked once durably stored, but it only wants one call to + * zil_commit() at the very end, once all the pages are synced. So, + * it repurposes sync_mode slightly to indicate who issue and wait for + * the IO: for NONE, the caller to zfs_putpage() will do it, while for + * ALL, zfs_putpage should do it. 
+ * + * Summary: + * for_sync: 0=unlock immediately; 1=unlock once on disk + * sync_mode: NONE=caller will commit; ALL=we will commit + */ + boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE); + + /* + * We use for_sync as the "commit" arg to zfs_log_write() (arg 7) + * because it is a policy flag that indicates "someone will call + * zil_commit() soon". for_sync=TRUE means exactly that; the only + * question is whether it will be us, or zpl_writepages(). + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync, + B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp); + + if (!for_sync) { /* - * If the caller does not intend to wait synchronously - * for this page writeback to complete and there are active - * synchronous calls on this file, do a commit so that - * the latter don't accidentally end up waiting for - * our writeback to complete. Refer to the comment in - * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. + * Async writeback is logged and written to the DMU, so page + * can now be unlocked. */ - commit = B_TRUE; + zfs_page_writeback_done(pp, 0); } - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, - B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : - zfs_putpage_async_commit_cb, pp); - dmu_tx_commit(tx); zfs_rangelock_exit(lr); - if (commit) - zil_commit(zfsvfs->z_log, zp->z_id); + if (need_commit) { + err = zil_commit_flags(zfsvfs->z_log, zp->z_id, ZIL_COMMIT_NOW); + if (err != 0) { + zfs_exit(zfsvfs, FTAG); + return (err); + } + } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c index 54e60b4820f6..bcaabeb32b8a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode_os.c @@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; return (0); } @@ -146,12 +144,9 @@ zfs_znode_cache_destructor(void *buf, void *arg) rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); - ASSERT3P(zp->z_dirlocks, ==, NULL); - ASSERT3P(zp->z_acl_cached, ==, NULL); - ASSERT3P(zp->z_xattr_cached, ==, NULL); - - ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); - ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); + ASSERT0P(zp->z_dirlocks); + ASSERT0P(zp->z_acl_cached); + ASSERT0P(zp->z_xattr_cached); } static int @@ -183,13 +178,13 @@ zfs_znode_init(void) * backed by kmalloc() when on the Linux slab in order that any * wait_on_bit() operations on the related inode operate properly. 
*/ - ASSERT(znode_cache == NULL); + ASSERT0P(znode_cache); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB | KMC_RECLAIMABLE); - ASSERT(znode_hold_cache == NULL); + ASSERT0P(znode_hold_cache); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); @@ -332,10 +327,10 @@ zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, mutex_enter(&zp->z_lock); - ASSERT(zp->z_sa_hdl == NULL); - ASSERT(zp->z_acl_cached == NULL); + ASSERT0P(zp->z_sa_hdl); + ASSERT0P(zp->z_acl_cached); if (sa_hdl == NULL) { - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; @@ -371,6 +366,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip) return (0); } +void +zfs_inode_free(struct inode *ip) +{ + kmem_cache_free(znode_cache, ITOZ(ip)); +} + /* * Called in multiple places when an inode should be destroyed. */ @@ -395,8 +396,15 @@ zfs_inode_destroy(struct inode *ip) nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } - - kmem_cache_free(znode_cache, zp); +#ifndef HAVE_SOPS_FREE_INODE + /* + * inode needs to be freed in RCU callback. If we have + * super_operations->free_inode, Linux kernel will do call_rcu + * for us. But if we don't have it, since call_rcu is GPL-only + * symbol, we can only free synchronously and accept the risk. + */ + zfs_inode_free(ip); +#endif } static void @@ -522,9 +530,9 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, return (NULL); zp = ITOZ(ip); - ASSERT(zp->z_dirlocks == NULL); - ASSERT3P(zp->z_acl_cached, ==, NULL); - ASSERT3P(zp->z_xattr_cached, ==, NULL); + ASSERT0P(zp->z_dirlocks); + ASSERT0P(zp->z_acl_cached); + ASSERT0P(zp->z_xattr_cached); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_is_ctldir = B_FALSE; @@ -535,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); @@ -605,7 +611,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, * processing so do not hash unlinked znodes. 
*/ if (links > 0) - VERIFY3S(insert_inode_locked(ip), ==, 0); + VERIFY0(insert_inode_locked(ip)); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); @@ -805,7 +811,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } /* Now add in all of the "SA" attributes */ - VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* @@ -895,7 +901,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, acl_ids->z_fuid, acl_ids->z_fgid); } - VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx)); if (!(flag & IS_ROOT_NODE)) { /* @@ -1194,7 +1200,7 @@ zfs_rezget(znode_t *zp) } rw_exit(&zp->z_xattr_lock); - ASSERT(zp->z_sa_hdl == NULL); + ASSERT0P(zp->z_sa_hdl); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); @@ -1308,9 +1314,9 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) zh = zfs_znode_hold_enter(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); - VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + VERIFY0(dmu_object_free(os, acl_obj, tx)); } - VERIFY(0 == dmu_object_free(os, obj, tx)); + VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } @@ -1530,7 +1536,7 @@ zfs_extend(znode_t *zp, uint64_t end) zp->z_size = end; - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); zfs_rangelock_exit(lr); @@ -1720,7 +1726,7 @@ zfs_trunc(znode_t *zp, uint64_t end) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } - VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); zfs_rangelock_exit(lr); @@ -1787,7 +1793,7 @@ log: NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - ASSERT(error == 0); + ASSERT0(error); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); @@ -1834,7 +1840,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); - ASSERT(error == 0); + ASSERT0(error); /* * Set starting attributes. 
@@ -1847,7 +1853,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) const char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); - VERIFY(nvpair_value_uint64(elem, &val) == 0); + VERIFY0(nvpair_value_uint64(elem, &val)); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) @@ -1855,7 +1861,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } - ASSERT(error == 0); + ASSERT0(error); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) @@ -1863,7 +1869,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); - ASSERT(error == 0); + ASSERT0(error); /* * Create zap object used for SA attribute registration @@ -1873,7 +1879,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); - ASSERT(error == 0); + ASSERT0(error); } else { sa_obj = 0; } @@ -1883,7 +1889,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); - ASSERT(error == 0); + ASSERT0(error); /* * Create root znode. Create minimal znode/inode/zfsvfs/sb @@ -1916,7 +1922,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); - ASSERT(error == 0); + ASSERT0(error); /* * Fold case on file systems that are always or sometimes case @@ -1940,12 +1946,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } - VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, zfs_init_idmap)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); - ASSERT(error == 0); + ASSERT0(error); zfs_acl_ids_free(&acl_ids); atomic_set(&ZTOI(rootzp)->i_count, 0); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 48dae79a2373..81ac26cb0c93 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -202,7 +202,7 @@ zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) return (!!dentry->d_inode); } -static dentry_operations_t zpl_dops_snapdirs = { +static const struct dentry_operations zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() @@ -215,6 +215,51 @@ static dentry_operations_t zpl_dops_snapdirs = { .d_revalidate = zpl_snapdir_revalidate, }; +/* + * For the .zfs control directory to work properly we must be able to override + * the default operations table and register custom .d_automount and + * .d_revalidate callbacks. 
+ */ +static void +set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) { + static const unsigned int op_flags = + DCACHE_OP_HASH | DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE | + DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL; + +#ifdef HAVE_D_SET_D_OP + /* + * d_set_d_op() will set the DCACHE_OP_ flags according to what it + * finds in the passed dentry_operations, so we don't have to. + * + * We clear the flags and the old op table before calling d_set_d_op() + * because issues a warning when the dentry operations table is already + * set. + */ + dentry->d_op = NULL; + dentry->d_flags &= ~op_flags; + d_set_d_op(dentry, &zpl_dops_snapdirs); + dentry->d_flags |= extraflags; +#else + /* + * Since 6.17 there's no exported way to modify dentry ops, so we have + * to reach in and do it ourselves. This should be safe for our very + * narrow use case, which is to create or splice in an entry to give + * access to a snapshot. + * + * We need to set the op flags directly. We hardcode + * DCACHE_OP_REVALIDATE because that's the only operation we have; if + * we ever extend zpl_dops_snapdirs we will need to update the op flags + * to match. + */ + spin_lock(&dentry->d_lock); + dentry->d_op = &zpl_dops_snapdirs; + dentry->d_flags &= ~op_flags; + dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags; + spin_unlock(&dentry->d_lock); +#endif +} + static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) @@ -236,10 +281,7 @@ zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - + set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT); return (d_splice_alias(ip, dentry)); } @@ -373,8 +415,7 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); + set_snapdir_dentry_ops(dentry, 0); d_instantiate(dentry, ip); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 1a82c13e1523..d07317b0d910 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ @@ -106,60 +107,52 @@ zpl_iterate(struct file *filp, struct dir_context *ctx) return (error); } +static inline int +zpl_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data); + static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; znode_t *zp = ITOZ(inode); - zfsvfs_t *zfsvfs = ITOZSB(inode); cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; /* - * The variables z_sync_writes_cnt and z_async_writes_cnt work in - * tandem so that sync writes can detect if there are any non-sync - * writes going on and vice-versa. The "vice-versa" part to this logic - * is located in zfs_putpage() where non-sync writes check if there are - * any ongoing sync writes. 
If any sync and non-sync writes overlap, - * we do a commit to complete the non-sync writes since the latter can - * potentially take several seconds to complete and thus block sync - * writes in the upcoming call to filemap_write_and_wait_range(). - */ - atomic_inc_32(&zp->z_sync_writes_cnt); - /* - * If the following check does not detect an overlapping non-sync write - * (say because it's just about to start), then it is guaranteed that - * the non-sync write will detect this sync write. This is because we - * always increment z_sync_writes_cnt / z_async_writes_cnt before doing - * the check on z_async_writes_cnt / z_sync_writes_cnt here and in - * zfs_putpage() respectively. + * Force dirty pages in the range out to the DMU and the log, ready + * for zil_commit() to write down. + * + * We call write_cache_pages() directly to ensure that zpl_putpage() is + * called with the flags we need. We need WB_SYNC_NONE to avoid a call + * to zil_commit() (since we're doing this as a kind of pre-sync); but + * we do need for_sync so that the pages remain in writeback until + * they're on disk, and so that we get an error if the DMU write fails. */ - if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { - atomic_dec_32(&zp->z_sync_writes_cnt); + if (filemap_range_has_page(inode->i_mapping, start, end)) { + int for_sync = 1; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + error = + zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync); + if (error != 0) { + /* + * Unclear what state things are in. zfs_putpage() will + * ensure the pages remain dirty if they haven't been + * written down to the DMU, but because there may be + * nothing logged, we can't assume that zfs_sync() -> + * zil_commit() will give us a useful error. It's + * safest if we just error out here. + */ return (error); } - zil_commit(zfsvfs->z_log, zp->z_id); - zpl_exit(zfsvfs, FTAG); } - error = filemap_write_and_wait_range(inode->i_mapping, start, end); - - /* - * The sync write is not complete yet but we decrement - * z_sync_writes_cnt since zfs_fsync() increments and decrements - * it internally. If a non-sync write starts just after the decrement - * operation but before we call zfs_fsync(), it may not detect this - * overlapping sync write but it does not matter since we have already - * gone past filemap_write_and_wait_range() and we won't block due to - * the non-sync write. - */ - atomic_dec_32(&zp->z_sync_writes_cnt); - - if (error) - return (error); - crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(zp, datasync, cr); @@ -535,11 +528,30 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) if (sync_mode != wbc->sync_mode) { if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (result); - if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, zp->z_id); + + if (zfsvfs->z_log != NULL) { + /* + * We don't want to block here if the pool suspends, + * because this is not a syncing op by itself, but + * might be part of one that the caller will + * coordinate. + */ + result = -zil_commit_flags(zfsvfs->z_log, zp->z_id, + ZIL_COMMIT_NOW); + } + zpl_exit(zfsvfs, FTAG); /* + * If zil_commit_flags() failed, it's unclear what state things + * are currently in. putpage() has written back out what it can + * to the DMU, but it may not be on disk. We have little choice + * but to escape. 
+ */ + if (result != 0) + return (result); + + /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index f9f6406f8b47..f97662d052c7 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -247,7 +247,7 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, * and fifos, but we want to know if this behavior ever changes. */ if (S_ISSOCK(mode) || S_ISFIFO(mode)) - ASSERT(rdev == 0); + ASSERT0(rdev); crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index a682bfd33c38..444948d03cb3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ @@ -33,6 +34,20 @@ #include <linux/iversion.h> #include <linux/version.h> +/* + * What to do when the last reference to an inode is released. If 0, the kernel + * will cache it on the superblock. If 1, the inode will be freed immediately. + * See zpl_drop_inode(). + */ +int zfs_delete_inode = 0; + +/* + * What to do when the last reference to a dentry is released. If 0, the kernel + * will cache it until the entry (file) is destroyed. If 1, the dentry will be + * marked for cleanup, at which time its inode reference will be released. See + * zpl_dentry_delete(). + */ +int zfs_delete_dentry = 0; static struct inode * zpl_inode_alloc(struct super_block *sb) @@ -45,10 +60,19 @@ zpl_inode_alloc(struct super_block *sb) return (ip); } +#ifdef HAVE_SOPS_FREE_INODE +static void +zpl_inode_free(struct inode *ip) +{ + ASSERT0(atomic_read(&ip->i_count)); + zfs_inode_free(ip); +} +#endif + static void zpl_inode_destroy(struct inode *ip) { - ASSERT(atomic_read(&ip->i_count) == 0); + ASSERT0(atomic_read(&ip->i_count)); zfs_inode_destroy(ip); } @@ -68,11 +92,36 @@ zpl_dirty_inode(struct inode *ip, int flags) } /* - * When ->drop_inode() is called its return value indicates if the - * inode should be evicted from the inode cache. If the inode is - * unhashed and has no links the default policy is to evict it - * immediately. + * ->drop_inode() is called when the last reference to an inode is released. + * Its return value indicates if the inode should be destroyed immediately, or + * cached on the superblock structure. + * + * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns + * "destroy immediately" if the inode is unhashed and has no links (roughly: no + * longer exists on disk). On datasets with millions of rarely-accessed files, + * this can cause a large amount of memory to be "pinned" by cached inodes, + * which in turn pin their associated dnodes and dbufs, until the kernel starts + * reporting memory pressure and requests OpenZFS release some memory (see + * zfs_prune()). + * + * When set to 1, we call generic_delete_node(), which always returns "destroy + * immediately", resulting in inodes being destroyed immediately, releasing + * their associated dnodes and dbufs to the dbuf cached and the ARC to be + * evicted as normal. 
* + * Note that the "last reference" doesn't always mean the last _userspace_ + * reference; the dentry cache also holds a reference, so "busy" inodes will + * still be kept alive that way (subject to dcache tuning). + */ +static int +zpl_drop_inode(struct inode *ip) +{ + if (zfs_delete_inode) + return (generic_delete_inode(ip)); + return (generic_drop_inode(ip)); +} + +/* * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the @@ -455,9 +504,13 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg) const struct super_operations zpl_super_operations = { .alloc_inode = zpl_inode_alloc, +#ifdef HAVE_SOPS_FREE_INODE + .free_inode = zpl_inode_free, +#endif .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, + .drop_inode = zpl_drop_inode, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, @@ -468,6 +521,35 @@ const struct super_operations zpl_super_operations = { .show_stats = NULL, }; +/* + * ->d_delete() is called when the last reference to a dentry is released. Its + * return value indicates if the dentry should be destroyed immediately, or + * retained in the dentry cache. + * + * By default (zfs_delete_dentry=0) the kernel will always cache unused + * entries. Each dentry holds an inode reference, so cached dentries can hold + * the final inode reference indefinitely, leading to the inode and its related + * data being pinned (see zpl_drop_inode()). + * + * When set to 1, we signal that the dentry should be destroyed immediately and + * never cached. This reduces memory usage, at the cost of higher overheads to + * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be + * reloaded and reinflated. + * + * Note that userspace does not have direct control over dentry references and + * reclaim; rather, this is part of the kernel's caching and reclaim subsystems + * (eg vm.vfs_cache_pressure). + */ +static int +zpl_dentry_delete(const struct dentry *dentry) +{ + return (zfs_delete_dentry ? 
1 : 0); +} + +const struct dentry_operations zpl_dentry_operations = { + .d_delete = zpl_dentry_delete, +}; + struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, @@ -479,3 +561,10 @@ struct file_system_type zpl_fs_type = { .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; + +ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW, + "Delete inodes as soon as the last reference is released."); + +ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW, + "Delete dentries from dentry cache as soon as the last reference is " + "released."); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index a098197e7448..d93282db815a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c @@ -1494,7 +1494,7 @@ zpl_posix_acl_free(void *arg) acl_rel_head = NULL; if (cmpxchg(&acl_rel_tail, &a->next, &acl_rel_head) == &a->next) { - ASSERT3P(a->next, ==, NULL); + ASSERT0P(a->next); a->next = freelist; freelist = a; break; @@ -1544,7 +1544,7 @@ zpl_posix_acl_release_impl(struct posix_acl *acl) a->time = ddi_get_lbolt(); /* atomically points tail to us and get the previous tail */ prev = xchg(&acl_rel_tail, &a->next); - ASSERT3P(*prev, ==, NULL); + ASSERT0P(*prev); *prev = a; /* if it was empty before, schedule the free task */ if (prev == &acl_rel_head) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 57a9711e9027..bac166fcd89e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ #include <sys/dataset_kstats.h> @@ -84,8 +84,9 @@ static unsigned int zvol_blk_mq_blocks_per_thread = 8; static inline void zvol_end_io(struct bio *bio, struct request *rq, int error) { + ASSERT3U(error, >=, 0); if (bio) { - bio->bi_status = errno_to_bi_status(-error); + bio->bi_status = errno_to_bi_status(error); bio_endio(bio); } else { blk_mq_end_request(rq, errno_to_bi_status(error)); @@ -208,8 +209,14 @@ zvol_write(zv_request_t *zvr) disk = zv->zv_zso->zvo_disk; /* bio marked as FLUSH need to flush before write */ - if (io_is_flush(bio, rq)) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + if (io_is_flush(bio, rq)) { + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); + if (error != 0) { + rw_exit(&zv->zv_suspend_lock); + zvol_end_io(bio, rq, -error); + return; + } + } /* Some requests are just for flush and nothing else. 
*/ if (io_size(bio, rq) == 0) { @@ -273,8 +280,8 @@ zvol_write(zv_request_t *zvr) dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); - if (sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + if (error == 0 && sync) + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); @@ -282,7 +289,7 @@ zvol_write(zv_request_t *zvr) blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } - zvol_end_io(bio, rq, -error); + zvol_end_io(bio, rq, error); } static void @@ -361,7 +368,7 @@ zvol_discard(zv_request_t *zvr) zfs_rangelock_exit(lr); if (error == 0 && sync) - zil_commit(zv->zv_zilog, ZVOL_OBJ); + error = zil_commit(zv->zv_zilog, ZVOL_OBJ); unlock: rw_exit(&zv->zv_suspend_lock); @@ -371,7 +378,7 @@ unlock: start_time); } - zvol_end_io(bio, rq, -error); + zvol_end_io(bio, rq, error); } static void @@ -449,7 +456,7 @@ zvol_read(zv_request_t *zvr) blk_generic_end_io_acct(q, disk, READ, bio, start_time); } - zvol_end_io(bio, rq, -error); + zvol_end_io(bio, rq, error); } static void @@ -480,7 +487,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, int rw = io_data_dir(bio, rq); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { - zvol_end_io(bio, rq, -SET_ERROR(ENXIO)); + zvol_end_io(bio, rq, SET_ERROR(ENXIO)); goto out; } @@ -499,7 +506,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, (long long unsigned)offset, (long unsigned)size); - zvol_end_io(bio, rq, -SET_ERROR(EIO)); + zvol_end_io(bio, rq, SET_ERROR(EIO)); goto out; } @@ -512,8 +519,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, #ifdef HAVE_BLK_MQ_RQ_HCTX blk_mq_hw_queue = rq->mq_hctx->queue_num; #else - blk_mq_hw_queue = - rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; + blk_mq_hw_queue = rq->q->queue_hw_ctx[ + rq->q->mq_map[raw_smp_processor_id()]]->queue_num; #endif taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, blk_mq_hw_queue); @@ -521,7 +528,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { - zvol_end_io(bio, rq, -SET_ERROR(EROFS)); + zvol_end_io(bio, rq, SET_ERROR(EROFS)); goto out; } @@ -672,28 +679,19 @@ zvol_open(struct block_device *bdev, fmode_t flag) retry: #endif - rw_enter(&zvol_state_lock, RW_READER); - /* - * Obtain a copy of private_data under the zvol_state_lock to make - * sure that either the result of zvol free code path setting - * disk->private_data to NULL is observed, or zvol_os_free() - * is not called on this zv because of the positive zv_open_count. - */ + #ifdef HAVE_BLK_MODE_T - zv = disk->private_data; + zv = atomic_load_ptr(&disk->private_data); #else - zv = bdev->bd_disk->private_data; + zv = atomic_load_ptr(&bdev->bd_disk->private_data); #endif if (zv == NULL) { - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); - if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } @@ -705,8 +703,28 @@ retry: if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); + + /* + * Removal may happen while the locks are down, so + * we can't trust zv any longer; we have to start over. 
+ */ +#ifdef HAVE_BLK_MODE_T + zv = atomic_load_ptr(&disk->private_data); +#else + zv = atomic_load_ptr(&bdev->bd_disk->private_data); +#endif + if (zv == NULL) + return (-SET_ERROR(ENXIO)); + rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); + + if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + return (-SET_ERROR(ENXIO)); + } + /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); @@ -717,7 +735,6 @@ retry: drop_suspend = B_TRUE; } } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -814,11 +831,11 @@ zvol_release(struct gendisk *disk, fmode_t unused) #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) (void) unused; #endif - zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; - rw_enter(&zvol_state_lock, RW_READER); - zv = disk->private_data; + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); + if (zv == NULL) + return; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); @@ -832,6 +849,15 @@ zvol_release(struct gendisk *disk, fmode_t unused) mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); + + /* + * Unlike in zvol_open(), we don't check if removal + * started here, because we might be one of the openers + * that needs to be thrown out! If we're the last, we + * need to call zvol_last_close() below to finish + * cleanup. So, no special treatment for us. + */ + /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); @@ -841,7 +867,6 @@ zvol_release(struct gendisk *disk, fmode_t unused) } else { drop_suspend = B_FALSE; } - rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -861,9 +886,10 @@ static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; + zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { @@ -886,16 +912,18 @@ zvol_ioctl(struct block_device *bdev, fmode_t mode, case BLKZNAME: mutex_enter(&zv->zv_state_lock); - error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); + error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); + if (error) + error = SET_ERROR(error); break; default: - error = -ENOTTY; + error = SET_ERROR(ENOTTY); break; } - return (SET_ERROR(error)); + return (-error); } #ifdef CONFIG_COMPAT @@ -914,9 +942,8 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? 
DISK_EVENT_MEDIA_CHANGE : 0; @@ -924,17 +951,14 @@ zvol_check_events(struct gendisk *disk, unsigned int clearing) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { - rw_enter(&zvol_state_lock, RW_READER); + zvol_state_t *zv = atomic_load_ptr(&disk->private_data); - zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, @@ -942,8 +966,6 @@ zvol_revalidate_disk(struct gendisk *disk) mutex_exit(&zv->zv_state_lock); } - rw_exit(&zvol_state_lock); - return (0); } @@ -962,16 +984,6 @@ zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) return (0); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. - */ - zv->zv_zso->zvo_disk->private_data = NULL; -} - /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very @@ -981,9 +993,10 @@ zvol_os_clear_private(zvol_state_t *zv) static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; + zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); + ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); @@ -1302,27 +1315,30 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ -static zvol_state_t * -zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) +static int +zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize, + zvol_state_t **zvp) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; int ret; - if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) - return (NULL); + ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL); + if (ret) + return (ret); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) - return (NULL); + return (0); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; + zv->zv_volsize = volsize; zv->zv_volblocksize = volblocksize; list_link_init(&zv->zv_next); @@ -1396,61 +1412,79 @@ zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); - return (zv); + *zvp = zv; + return (ret); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); - return (NULL); + return (ret); } -/* - * Cleanup then free a zvol_state_t which was created by zvol_alloc(). - * At this time, the structure is not opened by anyone, is taken off - * the zvol_state_list, and has its private data set to NULL. - * The zvol_state_lock is dropped. - * - * This function may take many milliseconds to complete (e.g. we've seen - * it take over 256ms), due to the calls to "blk_cleanup_queue" and - * "del_gendisk". Thus, consumers need to be careful to account for this - * latency when calling this function. 
- */ void -zvol_os_free(zvol_state_t *zv) +zvol_os_remove_minor(zvol_state_t *zv) { - - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); - ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; + + /* Clearing private_data will make new callers return immediately. */ + atomic_store_ptr(&zso->zvo_disk->private_data, NULL); + + /* + * Drop the state lock before calling del_gendisk(). There may be + * callers waiting to acquire it, but del_gendisk() will block until + * they exit, which would deadlock. + */ + mutex_exit(&zv->zv_state_lock); - del_gendisk(zv->zv_zso->zvo_disk); + del_gendisk(zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) - blk_cleanup_disk(zv->zv_zso->zvo_disk); + blk_cleanup_disk(zso->zvo_disk); #else - put_disk(zv->zv_zso->zvo_disk); + put_disk(zso->zvo_disk); #endif #else - blk_cleanup_queue(zv->zv_zso->zvo_queue); - put_disk(zv->zv_zso->zvo_disk); + blk_cleanup_queue(zso->zvo_queue); + put_disk(zso->zvo_disk); #endif - if (zv->zv_zso->use_blk_mq) - blk_mq_free_tag_set(&zv->zv_zso->tag_set); + if (zso->use_blk_mq) + blk_mq_free_tag_set(&zso->tag_set); + + ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); + + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ - ida_simple_remove(&zvol_ida, - MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } @@ -1470,7 +1504,9 @@ __zvol_os_add_disk(struct gendisk *disk) { int error = 0; #ifdef HAVE_ADD_DISK_RET - error = add_disk(disk); + error = -add_disk(disk); + if (error) + error = SET_ERROR(error); #else add_disk(disk); #endif @@ -1562,7 +1598,7 @@ zvol_os_add_disk(struct gendisk *disk) int zvol_os_create_minor(const char *name) { - zvol_state_t *zv; + zvol_state_t *zv = NULL; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; @@ -1611,18 +1647,16 @@ zvol_os_create_minor(const char *name) if (error) goto out_dmu_objset_disown; - zv = zvol_alloc(MKDEV(zvol_major, minor), name, - doi->doi_data_block_size); - if (zv == NULL) { - error = SET_ERROR(EAGAIN); + error = zvol_alloc(MKDEV(zvol_major, minor), name, + volsize, doi->doi_data_block_size, &zv); + if (error || zv == NULL) goto out_dmu_objset_disown; - } + zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; - zv->zv_volsize = volsize; zv->zv_objset = os; /* Default */ @@ -1647,11 +1681,11 @@ zvol_os_create_minor(const char *name) blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif - ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + ASSERT0P(zv->zv_kstat.dk_kstats); error = 
dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; - ASSERT3P(zv->zv_zilog, ==, NULL); + ASSERT0P(zv->zv_zilog); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) @@ -1689,7 +1723,7 @@ out_doi: * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ - if (error == 0) { + if (error == 0 && zv) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); @@ -1701,7 +1735,7 @@ out_doi: return (error); } -void +int zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); @@ -1728,6 +1762,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) set_disk_ro(zv->zv_zso->zvo_disk, readonly); dataset_kstats_rename(&zv->zv_kstat, newname); + + return (0); } void @@ -1755,10 +1791,10 @@ zvol_init(void) return (error); } - error = register_blkdev(zvol_major, ZVOL_DRIVER); + error = -register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); - return (error); + return (SET_ERROR(error)); } if (zvol_blk_mq_queue_depth == 0) { diff --git a/sys/contrib/openzfs/module/zcommon/simd_stat.c b/sys/contrib/openzfs/module/zcommon/simd_stat.c index 11e2080ff9f2..007ae9e4fbbc 100644 --- a/sys/contrib/openzfs/module/zcommon/simd_stat.c +++ b/sys/contrib/openzfs/module/zcommon/simd_stat.c @@ -118,6 +118,10 @@ simd_stat_kstat_data(char *buf, size_t size, void *data) "pclmulqdq", zfs_pclmulqdq_available()); off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "movbe", zfs_movbe_available()); + off += SIMD_STAT_PRINT(simd_stat_kstat_payload, + "vaes", zfs_vaes_available()); + off += SIMD_STAT_PRINT(simd_stat_kstat_payload, + "vpclmulqdq", zfs_vpclmulqdq_available()); off += SIMD_STAT_PRINT(simd_stat_kstat_payload, "osxsave", boot_cpu_has(X86_FEATURE_OSXSAVE)); diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c index 0362d82efbc1..6ba9892eeb64 100644 --- a/sys/contrib/openzfs/module/zcommon/zfeature_common.c +++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c @@ -732,6 +732,12 @@ zpool_feature_init(void) ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_BLOCK_CLONING_ENDIAN, + "com.truenas:block_cloning_endian", "block_cloning_endian", + "Fixes BRT ZAP endianness on new pools.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfeature_register(SPA_FEATURE_AVZ_V2, "com.klarasystems:vdev_zaps_v2", "vdev_zaps_v2", "Support for root vdev ZAP.", @@ -786,6 +792,24 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, large_microzap_deps, sfeatures); } + zfeature_register(SPA_FEATURE_DYNAMIC_GANG_HEADER, + "com.klarasystems:dynamic_gang_header", "dynamic_gang_header", + "Support for dynamically sized gang headers", + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_NO_UPGRADE, + ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + + { + static const spa_feature_t physical_rewrite_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_PHYSICAL_REWRITE, + "com.truenas:physical_rewrite", "physical_rewrite", + "Support for preserving logical birth time during rewrite.", + ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, + ZFEATURE_TYPE_BOOLEAN, physical_rewrite_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git 
a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c index 49bb534ca26c..87596558c9a1 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c @@ -59,6 +59,7 @@ const zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_SNAPSHOT}, {ZFS_DELEG_PERM_SHARE}, {ZFS_DELEG_PERM_SEND}, + {ZFS_DELEG_PERM_SEND_RAW}, {ZFS_DELEG_PERM_USERPROP}, {ZFS_DELEG_PERM_USERQUOTA}, {ZFS_DELEG_PERM_GROUPQUOTA}, diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 8b4c4251703d..864e3898b365 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -640,7 +640,7 @@ zfs_prop_init(void) "<1.00x or higher if compressed>", "REFRATIO", B_FALSE, sfeatures); zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, - ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK", B_FALSE, + ZFS_TYPE_VOLUME, "512 to 16M, power of 2", "VOLBLOCK", B_FALSE, sfeatures); zprop_register_index(ZFS_PROP_VOLTHREADING, "volthreading", 1, PROP_DEFAULT, ZFS_TYPE_VOLUME, "on | off", "zvol threading", @@ -734,13 +734,12 @@ zfs_prop_init(void) /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE", B_FALSE, - sfeatures); + ZFS_TYPE_FILESYSTEM, "512 to 16M, power of 2", + "RECSIZE", B_FALSE, sfeatures); zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS, "special_small_blocks", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "zero or 512 to 1M, power of 2", "SPECIAL_SMALL_BLOCKS", B_FALSE, - sfeatures); + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "0 to 16M", + "SPECIAL_SMALL_BLOCKS", B_FALSE, sfeatures); /* hidden properties */ zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, diff --git a/sys/contrib/openzfs/module/zcommon/zfs_valstr.c b/sys/contrib/openzfs/module/zcommon/zfs_valstr.c index c39ac62f654f..0cb9f584acc6 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_valstr.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_valstr.c @@ -203,7 +203,7 @@ _VALSTR_BITFIELD_IMPL(zio_flag, { '?', "??", "[UNUSED 11]" }, { '.', "ND", "NODATA" }, { '.', "ID", "INDUCE_DAMAGE" }, - { '.', "AL", "IO_ALLOCATING" }, + { '.', "AT", "ALLOC_THROTTLED" }, { '.', "RE", "IO_RETRY" }, { '.', "PR", "PROBE" }, { '.', "TH", "TRYHARD" }, @@ -221,7 +221,6 @@ _VALSTR_BITFIELD_IMPL(zio_flag, { '.', "NP", "NOPWRITE" }, { '.', "EX", "REEXECUTED" }, { '.', "DG", "DELEGATED" }, - { '.', "DC", "DIO_CHKSUM_ERR" }, { '.', "PA", "PREALLOCATED" }, ) diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c index 04ae9f986d8f..07819ba2be8b 100644 --- a/sys/contrib/openzfs/module/zcommon/zpool_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c @@ -467,9 +467,15 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table, + sfeatures); zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_AUTOSIT, "autosit", 0, + PROP_DEFAULT, ZFS_TYPE_VDEV, 
"on | off", "AUTOSIT", boolean_table, + sfeatures); /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 826928e67350..bf9b13c30509 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -563,7 +563,7 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) left -= csize; off = 0; } - ASSERT3U(left, ==, 0); + ASSERT0(left); } else { abd = abd_get_offset_scatter(abd, sabd, off, size); } diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 04ca32356a6d..bd6dc8edd8ca 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -486,13 +486,13 @@ static taskq_t *arc_flush_taskq; static uint_t zfs_arc_evict_threads = 0; /* The 7 states: */ -arc_state_t ARC_anon; -arc_state_t ARC_mru; -arc_state_t ARC_mru_ghost; -arc_state_t ARC_mfu; -arc_state_t ARC_mfu_ghost; -arc_state_t ARC_l2c_only; -arc_state_t ARC_uncached; +static arc_state_t ARC_anon; +/* */ arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +/* */ arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; +static arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, @@ -832,15 +832,15 @@ typedef struct arc_async_flush { #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ -int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -int l2arc_feed_again = B_TRUE; /* turbo warmup */ -int l2arc_norw = B_FALSE; /* no reads during writes */ +static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +static int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +static int l2arc_feed_again = B_TRUE; /* turbo warmup */ +static int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* @@ -1052,7 +1052,7 @@ static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t birth = BP_GET_BIRTH(bp); + uint64_t birth = BP_GET_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; @@ -2239,8 +2239,8 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_buf); + ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); @@ -2278,8 +2278,8 @@ 
arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_buf); + ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); @@ -2319,7 +2319,7 @@ add_reference(arc_buf_hdr_t *hdr, const void *tag) if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_buf); } if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && @@ -2503,7 +2503,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); } else { @@ -2547,7 +2547,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); /* @@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - ARCSTAT_INCR(arcstat_dnode_size, space); + aggsum_add(&arc_sums.arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); @@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - ARCSTAT_INCR(arcstat_dnode_size, -space); + aggsum_add(&arc_sums.arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); @@ -2758,7 +2758,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); - ASSERT3P(*ret, ==, NULL); + ASSERT0P(*ret); IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -2982,7 +2982,7 @@ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); @@ -3201,14 +3201,14 @@ arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); - ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); + ASSERT0P(hdr->b_crypt_hdr.b_rabd); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); @@ -3290,7 +3290,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, ASSERT(HDR_EMPTY(hdr)); #ifdef ZFS_DEBUG - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_freeze_cksum); #endif HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); @@ -3351,12 +3351,12 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to 
NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(nhdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); } else { - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_buf); #ifdef ZFS_DEBUG - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_freeze_cksum); #endif /* @@ -3375,7 +3375,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); + VERIFY0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); @@ -3698,12 +3698,12 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) arc_hdr_free_abd(hdr, B_TRUE); } - ASSERT3P(hdr->b_hash_next, ==, NULL); + ASSERT0P(hdr->b_hash_next); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_acb); #ifdef ZFS_DEBUG - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_freeze_cksum); #endif kmem_cache_free(hdr_full_cache, hdr); } else { @@ -3771,7 +3771,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_buf); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); *real_evicted = 0; @@ -3796,7 +3796,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { - ASSERT(hdr->b_l1hdr.b_pabd == NULL); + ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; @@ -4490,7 +4490,7 @@ arc_evict(void) * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; - int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); + int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size); int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) @@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve) * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ - int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - + int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - zfs_max_recordsize; + int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) - + arc_dnode_limit; /* Always allow at least one block of overflow. */ - if (over < 0) + if (arc_over < 0 && dn_over <= 0) return (ARC_OVF_NONE); /* If we are under memory pressure, report severe overflow. */ @@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve) int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2; if (use_reserve) overflow *= 3; - return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); + return (arc_over < overflow ? 
ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * @@ -5552,7 +5554,7 @@ static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { - ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT0(HDR_GET_PSIZE(hdr)); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { @@ -5585,7 +5587,7 @@ arc_read_done(zio_t *zio) if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; - ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_birth, ==, BP_GET_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, @@ -5688,7 +5690,7 @@ arc_read_done(zio_t *zio) error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb, - BP_GET_LOGICAL_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); @@ -6107,7 +6109,7 @@ top: if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_GET_BIRTH(bp); + hdr->b_birth = BP_GET_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -6130,14 +6132,14 @@ top: } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_buf); #ifdef ZFS_DEBUG - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_freeze_cksum); #endif } else if (HDR_IO_IN_PROGRESS(hdr)) { /* @@ -6231,7 +6233,7 @@ top: acb->acb_nobuf = no_buf; acb->acb_zb = *zb; - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_acb); hdr->b_l1hdr.b_acb = acb; if (HDR_HAS_L2HDR(hdr) && @@ -6715,7 +6717,7 @@ arc_release(arc_buf_t *buf, const void *tag) nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); - ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0P(nhdr->b_l1hdr.b_buf); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); @@ -6802,7 +6804,7 @@ arc_write_ready(zio_t *zio) if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); @@ -6946,7 +6948,7 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_acb); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); @@ -6955,7 +6957,7 @@ arc_write_done(zio_t *zio) buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_GET_BIRTH(zio->io_bp); + hdr->b_birth = BP_GET_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); @@ -6971,7 +6973,7 @@ arc_write_done(zio_t *zio) arc_buf_hdr_t *exists; kmutex_t *hash_lock; - ASSERT3U(zio->io_error, ==, 0); + ASSERT0(zio->io_error); arc_cksum_verify(buf); @@ -6992,7 +6994,7 @@ arc_write_done(zio_t *zio) arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); + ASSERT0P(exists); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); @@ -7005,7 +7007,7 @@ 
arc_write_done(zio_t *zio) ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); - ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + ASSERT0(BP_GET_LEVEL(zio->io_bp)); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); @@ -7042,7 +7044,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_acb); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); if (uncached) arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); @@ -7111,7 +7113,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT0P(hdr->b_l1hdr.b_pabd); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), @@ -7326,7 +7328,7 @@ arc_kstat_update(kstat_t *ksp, int rw) #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + - wmsum_value(&arc_sums.arcstat_dnode_size) + + aggsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif @@ -7368,7 +7370,7 @@ arc_kstat_update(kstat_t *ksp, int rw) &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = - wmsum_value(&arc_sums.arcstat_dnode_size); + aggsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = @@ -7738,7 +7740,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); - wmsum_init(&arc_sums.arcstat_dnode_size, 0); + aggsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); @@ -7897,7 +7899,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); - wmsum_fini(&arc_sums.arcstat_dnode_size); + aggsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c index 8c19de93f12f..ea9fbd036c6e 100644 --- a/sys/contrib/openzfs/module/zfs/bpobj.c +++ b/sys/contrib/openzfs/module/zfs/bpobj.c @@ -160,8 +160,8 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) memset(bpo, 0, sizeof (*bpo)); mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); - ASSERT(bpo->bpo_dbuf == NULL); - ASSERT(bpo->bpo_phys == NULL); + ASSERT0P(bpo->bpo_dbuf); + ASSERT0P(bpo->bpo_phys); ASSERT(object != 0); ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); @@ -478,7 +478,7 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, * We have unprocessed subobjs. Process the next one. */ ASSERT(bpo->bpo_havecomp); - ASSERT3P(bpobj_size, ==, NULL); + ASSERT0P(bpobj_size); /* Add the last subobj to stack. 
*/ int64_t i = bpi->bpi_unprocessed_subobjs - 1; @@ -954,8 +954,8 @@ space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) (void) bp_freed, (void) tx; struct space_range_arg *sra = arg; - if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg && - BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) { + if (BP_GET_BIRTH(bp) > sra->mintxg && + BP_GET_BIRTH(bp) <= sra->maxtxg) { if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) sra->used += bp_get_dsize_sync(sra->spa, bp); else diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c index 27d9ed7ea2b0..40664354aa73 100644 --- a/sys/contrib/openzfs/module/zfs/brt.c +++ b/sys/contrib/openzfs/module/zfs/brt.c @@ -478,6 +478,18 @@ brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("Pool directory object created, object=%s", name); + /* + * Activate the endian-fixed feature if this is the first BRT ZAP + * (i.e., BLOCK_CLONING is not yet active) and the feature is enabled. + */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) && + !spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); + } else if (spa_feature_is_active(spa, + SPA_FEATURE_BLOCK_CLONING_ENDIAN)) { + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); + } + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } @@ -658,6 +670,8 @@ brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) rw_exit(&brtvd->bv_lock); spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); + if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN)) + spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx); } static void @@ -855,16 +869,29 @@ brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); } +static boolean_t +brt_has_endian_fixed(spa_t *spa) +{ + return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN)); +} + static int -brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre) +brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t off = BRE_OFFSET(bre); if (brtvd->bv_mos_entries == 0) return (SET_ERROR(ENOENT)); - return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, - &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count)); + if (brt_has_endian_fixed(spa)) { + return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1, + &bre->bre_count)); + } else { + return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count)); + } } /* @@ -1056,7 +1083,7 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) } rw_exit(&brtvd->bv_lock); - error = brt_entry_lookup(brtvd, &bre_search); + error = brt_entry_lookup(spa, brtvd, &bre_search); /* bre_search now contains correct bre_count */ if (error == ENOENT) { BRTSTAT_BUMP(brt_decref_no_entry); @@ -1118,7 +1145,7 @@ brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre == NULL) { rw_exit(&brtvd->bv_lock); - error = brt_entry_lookup(brtvd, &bre_search); + error = brt_entry_lookup(spa, brtvd, &bre_search); if (error == ENOENT) { refcnt = 0; } else { @@ -1270,10 +1297,18 @@ brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg) uint64_t off = BRE_OFFSET(bre); if (brtvd->bv_mos_entries != 0 && brt_vdev_lookup(spa, brtvd, off)) { - int error = 
zap_lookup_uint64_by_dnode( - brtvd->bv_mos_entries_dnode, &off, - BRT_KEY_WORDS, 1, sizeof (bre->bre_count), - &bre->bre_count); + int error; + if (brt_has_endian_fixed(spa)) { + error = zap_lookup_uint64_by_dnode( + brtvd->bv_mos_entries_dnode, &off, + BRT_KEY_WORDS, sizeof (bre->bre_count), 1, + &bre->bre_count); + } else { + error = zap_lookup_uint64_by_dnode( + brtvd->bv_mos_entries_dnode, &off, + BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count); + } if (error == 0) { BRTSTAT_BUMP(brt_addref_entry_on_disk); } else { @@ -1326,7 +1361,7 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } static void -brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) +brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { uint64_t off = BRE_OFFSET(bre); @@ -1337,9 +1372,15 @@ brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) BRT_KEY_WORDS, tx); VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(zap_update_uint64_by_dnode(dn, &off, - BRT_KEY_WORDS, 1, sizeof (bre->bre_count), - &bre->bre_count, tx)); + if (brt_has_endian_fixed(spa)) { + VERIFY0(zap_update_uint64_by_dnode(dn, &off, + BRT_KEY_WORDS, sizeof (bre->bre_count), 1, + &bre->bre_count, tx)); + } else { + VERIFY0(zap_update_uint64_by_dnode(dn, &off, + BRT_KEY_WORDS, 1, sizeof (bre->bre_count), + &bre->bre_count, tx)); + } } } @@ -1368,7 +1409,8 @@ brt_sync_table(spa_t *spa, dmu_tx_t *tx) void *c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx); + brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre, + tx); kmem_cache_free(brt_entry_cache, bre); } diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c index aa282f711bc3..725b96a3b2c7 100644 --- a/sys/contrib/openzfs/module/zfs/btree.c +++ b/sys/contrib/openzfs/module/zfs/btree.c @@ -1110,7 +1110,7 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value, if (where->bti_node == NULL) { ASSERT3U(tree->bt_num_elems, ==, 1); ASSERT3S(tree->bt_height, ==, -1); - ASSERT3P(tree->bt_root, ==, NULL); + ASSERT0P(tree->bt_root); ASSERT0(where->bti_offset); tree->bt_num_nodes++; @@ -1947,7 +1947,7 @@ void zfs_btree_destroy(zfs_btree_t *tree) { ASSERT0(tree->bt_num_elems); - ASSERT3P(tree->bt_root, ==, NULL); + ASSERT0P(tree->bt_root); } /* Verify that every child of this node has the correct parent pointer. 
*/ @@ -1969,10 +1969,10 @@ static void zfs_btree_verify_pointers(zfs_btree_t *tree) { if (tree->bt_height == -1) { - VERIFY3P(tree->bt_root, ==, NULL); + VERIFY0P(tree->bt_root); return; } - VERIFY3P(tree->bt_root->bth_parent, ==, NULL); + VERIFY0P(tree->bt_root->bth_parent); zfs_btree_verify_pointers_helper(tree, tree->bt_root); } diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c index d3baabd6169f..e5abcd2044cf 100644 --- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c +++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c @@ -44,6 +44,7 @@ static dataset_kstat_values_t empty_dataset_kstats = { { "zil_commit_error_count", KSTAT_DATA_UINT64 }, { "zil_commit_stall_count", KSTAT_DATA_UINT64 }, { "zil_commit_suspend_count", KSTAT_DATA_UINT64 }, + { "zil_commit_crash_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index f1b5a17f337e..fccc4c5b5b94 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -523,7 +523,7 @@ dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) return; /* Only data blocks support the attachment of user data. */ - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); @@ -866,8 +866,16 @@ dbuf_evict_notify(uint64_t size) * and grabbing the lock results in massive lock contention. */ if (size > dbuf_cache_target_bytes()) { - if (size > dbuf_cache_hiwater_bytes()) + /* + * Avoid calling dbuf_evict_one() from memory reclaim context + * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks. + * Memory reclaim threads can get stuck waiting for the dbuf + * hash lock. 
+ */ + if (size > dbuf_cache_hiwater_bytes() && + !current_is_reclaim_thread()) { dbuf_evict_one(); + } cv_signal(&dbuf_evict_cv); } } @@ -1120,8 +1128,8 @@ dbuf_verify(dmu_buf_impl_t *db) DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { - ASSERT(db->db_parent == NULL); - ASSERT(db->db_blkptr == NULL); + ASSERT0P(db->db_parent); + ASSERT0P(db->db_blkptr); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); @@ -1172,7 +1180,7 @@ dbuf_verify(dmu_buf_impl_t *db) /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) - ASSERT(db->db_parent == NULL); + ASSERT0P(db->db_parent); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) @@ -1211,7 +1219,7 @@ dbuf_verify(dmu_buf_impl_t *db) int i; for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); + ASSERT0(buf[i]); } } else { blkptr_t *bps = db->db.db_data; @@ -1235,11 +1243,9 @@ dbuf_verify(dmu_buf_impl_t *db) DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); - ASSERT0(bp->blk_pad[0]); - ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); - ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); + ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp)); } } } @@ -1253,7 +1259,7 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - ASSERT3P(db->db_buf, ==, NULL); + ASSERT0P(db->db_buf); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; @@ -1378,13 +1384,13 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(zfs_refcount_count(&db->db_holds) > 0); - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); if (buf == NULL) { /* i/o error */ ASSERT(zio == NULL || zio->io_error != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); + ASSERT0P(db->db_buf); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "i/o error"); } else if (db->db_level == 0 && db->db_freed_in_flight) { @@ -1578,7 +1584,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags, ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - ASSERT(db->db_buf == NULL); + ASSERT0P(db->db_buf); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); @@ -1615,7 +1621,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags, */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) { spa_log_error(db->db_objset->os_spa, &zb, - BP_GET_LOGICAL_BIRTH(bp)); + BP_GET_PHYSICAL_BIRTH(bp)); err = SET_ERROR(EIO); goto early_unlock; } @@ -1676,7 +1682,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || @@ -1895,8 +1901,8 @@ dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags) while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); dbuf_set_data(db, dbuf_alloc_arcbuf(db)); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "assigning 
filled buffer"); @@ -1923,7 +1929,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * comes from dbuf_dirty() callers who must also hold a range lock. */ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) @@ -1988,7 +1994,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_enter(&dn->dn_dbufs_mtx); db = avl_find(&dn->dn_dbufs, db_search, &where); - ASSERT3P(db, ==, NULL); + ASSERT0P(db); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); @@ -2011,7 +2017,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { - ASSERT(db->db.db_data == NULL); + ASSERT0P(db->db.db_data); mutex_exit(&db->db_mtx); continue; } @@ -2154,6 +2160,12 @@ dbuf_redirty(dbuf_dirty_record_t *dr) ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } + + /* + * Clear the rewrite flag since this is now a logical + * modification. + */ + dr->dt.dl.dr_rewrite = B_FALSE; } } @@ -2258,14 +2270,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* @@ -2277,12 +2281,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); - mutex_enter(&dn->dn_mtx); - dnode_set_dirtyctx(dn, tx, db); - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; @@ -2301,13 +2299,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr_next); } - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - ASSERT3U(dn->dn_nlevels, >, db->db_level); /* @@ -2545,12 +2536,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * Due to our use of dn_nlevels below, this can only be called - * in open context, unless we are operating on the MOS. - * From syncing context, dn_nlevels may be different from the - * dn_nlevels used when dbuf was dirtied. + * in open context, unless we are operating on the MOS or it's + * a special object. From syncing context, dn_nlevels may be + * different from the dn_nlevels used when dbuf was dirtied. 
*/ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || + DMU_OBJECT_IS_SPECIAL(db->db.db_object) || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); @@ -2701,6 +2693,38 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH); } +void +dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(tx->tx_txg != 0); + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + + /* + * If the dbuf is already dirty in this txg, it will be written + * anyway, so there's nothing to do. + */ + mutex_enter(&db->db_mtx); + if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + mutex_exit(&db->db_mtx); + return; + } + mutex_exit(&db->db_mtx); + + /* + * The dbuf is not dirty, so we need to make it dirty and + * mark it for rewrite (preserve logical birth time). + */ + dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH); + + mutex_enter(&db->db_mtx); + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); + if (dr != NULL && db->db_level == 0) + dr->dt.dl.dr_rewrite = B_TRUE; + mutex_exit(&db->db_mtx); +} + boolean_t dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -2852,8 +2876,8 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx) dbuf_clear_data(db); } - ASSERT3P(db->db_buf, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, @@ -2888,7 +2912,7 @@ dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail, ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || @@ -3100,7 +3124,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx, { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size); @@ -3165,7 +3189,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx, VERIFY(!dbuf_undirty(db, tx)); db->db_state = DB_UNCACHED; } - ASSERT(db->db_buf == NULL); + ASSERT0P(db->db_buf); dbuf_set_data(db, buf); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "filling assigned arcbuf"); @@ -3225,7 +3249,7 @@ dbuf_destroy(dmu_buf_impl_t *db) } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - ASSERT(db->db_data_pending == NULL); + ASSERT0P(db->db_data_pending); ASSERT(list_is_empty(&db->db_dirty_records)); db->db_state = DB_EVICTING; @@ -3277,11 +3301,11 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_parent = NULL; - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); + ASSERT0P(db->db_hash_next); + ASSERT0P(db->db_blkptr); + ASSERT0P(db->db_data_pending); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); @@ -3916,7 +3940,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, if (fail_uncached) return (SET_ERROR(ENOENT)); - ASSERT3P(parent, ==, NULL); + ASSERT0P(parent); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if 
(fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) @@ -4020,7 +4044,7 @@ dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - ASSERT(dn->dn_bonus == NULL); + ASSERT0P(dn->dn_bonus); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL, dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID)); dn->dn_bonus->db_pending_evict = FALSE; @@ -4372,7 +4396,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) * inappropriate to hook it in (i.e., nlevels mismatch). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); - ASSERT(db->db_parent == NULL); + ASSERT0P(db->db_parent); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); @@ -4433,7 +4457,7 @@ dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr) ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); - ASSERT3U(db->db_level, ==, 0); + ASSERT0(db->db_level); if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) { zbookmark_phys_t zb; @@ -4544,7 +4568,7 @@ dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) /* ensure that everything is zero after our data */ for (; datap_end < datap_max; datap_end++) - ASSERT(*datap_end == 0); + ASSERT0(*datap_end); #endif } @@ -4552,7 +4576,7 @@ static blkptr_t * dbuf_lightweight_bp(dbuf_dirty_record_t *dr) { /* This must be a lightweight dirty record. */ - ASSERT3P(dr->dr_dbuf, ==, NULL); + ASSERT0P(dr->dr_dbuf); dnode_t *dn = dr->dr_dnode; if (dn->dn_phys->dn_nlevels == 1) { @@ -4695,7 +4719,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ - ASSERT3P(db->db.db_data, ==, NULL); + ASSERT0P(db->db.db_data); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); @@ -4712,9 +4736,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ dbuf_dirty_record_t *dr_head = list_head(&db->db_dirty_records); - ASSERT3P(db->db_buf, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); - ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); + ASSERT0P(dr_head->dt.dl.dr_data); ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); @@ -4899,7 +4923,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (BP_GET_LOGICAL_BIRTH(bp) != 0) { + if (BP_GET_BIRTH(bp) != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && @@ -5186,7 +5210,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp); + drica.drica_blk_birth = BP_GET_BIRTH(bp); drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { @@ -5201,8 +5225,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - BP_GET_LOGICAL_BIRTH(bp) > - ds->ds_dir->dd_origin_txg) { + BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); 
ASSERT(spa_feature_is_enabled(spa, @@ -5320,7 +5343,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg); + ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? @@ -5334,6 +5357,24 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); /* + * Set rewrite properties for zfs_rewrite() operations. + */ + if (db->db_level == 0 && dr->dt.dl.dr_rewrite) { + zp.zp_rewrite = B_TRUE; + + /* + * Mark physical rewrite feature for activation. + * This will be activated automatically during dataset sync. + */ + dsl_dataset_t *ds = os->os_dsl_dataset; + if (!dsl_dataset_feature_is_active(ds, + SPA_FEATURE_PHYSICAL_REWRITE)) { + ds->ds_feature_activation[ + SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE; + } + } + + /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read @@ -5403,6 +5444,7 @@ EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); +EXPORT_SYMBOL(dmu_buf_will_rewrite); EXPORT_SYMBOL(dmu_buf_is_dirty); EXPORT_SYMBOL(dmu_buf_will_clone_or_dio); EXPORT_SYMBOL(dmu_buf_will_not_fill); diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index 60cbb7755a7e..0dc9adc7fd4f 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -397,7 +397,7 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_object_name(ddt, type, class, name); - ASSERT3U(*objectp, ==, 0); + ASSERT0(*objectp); VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); ASSERT3U(*objectp, !=, 0); @@ -724,10 +724,13 @@ ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) dvas[2] = bp->blk_dva[2]; if (ddt_phys_birth(ddp, v) == 0) { - if (v == DDT_PHYS_FLAT) - ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp); - else - ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp); + if (v == DDT_PHYS_FLAT) { + ddp->ddp_flat.ddp_phys_birth = + BP_GET_PHYSICAL_BIRTH(bp); + } else { + ddp->ddp_trad[v].ddp_phys_birth = + BP_GET_PHYSICAL_BIRTH(bp); + } } } @@ -891,14 +894,14 @@ ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) if (ddt->ddt_flags & DDT_FLAG_FLAT) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && - BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { + BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { return (DDT_PHYS_FLAT); } } else /* traditional phys */ { for (int p = 0; p < DDT_PHYS_MAX; p++) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_trad[p].ddp_dva[0]) && - BP_GET_BIRTH(bp) == + BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_trad[p].ddp_phys_birth) { return (p); } @@ -1008,7 +1011,7 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde) { if (dde->dde_io != NULL) { for (int p = 0; p < DDT_NPHYS(ddt); p++) - ASSERT3P(dde->dde_io->dde_lead_zio[p], ==, NULL); + ASSERT0P(dde->dde_io->dde_lead_zio[p]); if (dde->dde_io->dde_repair_abd != NULL) abd_free(dde->dde_io->dde_repair_abd); @@ -1418,7 +1421,7 @@ ddt_key_compare(const void *x1, const void *x2) static void ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) { - ASSERT3U(ddt->ddt_dir_object, ==, 0); + ASSERT0(ddt->ddt_dir_object); ASSERT3U(ddt->ddt_version, ==, 
DDT_VERSION_FDT); char name[DDT_NAMELEN]; @@ -1698,9 +1701,11 @@ ddt_load(spa_t *spa) } } - error = ddt_log_load(ddt); - if (error != 0 && error != ENOENT) - return (error); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + } DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); @@ -2392,7 +2397,7 @@ ddt_sync(spa_t *spa, uint64_t txg) * scan's root zio here so that we can wait for any scan IOs in * addition to the regular ddt IOs. */ - ASSERT3P(scn->scn_zio_root, ==, NULL); + ASSERT0P(scn->scn_zio_root); scn->scn_zio_root = rio; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c index dbd381aa9609..c7a2426f3a77 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_log.c +++ b/sys/contrib/openzfs/module/zfs/ddt_log.c @@ -116,7 +116,7 @@ static void ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) { ASSERT3U(ddt->ddt_dir_object, >, 0); - ASSERT3U(ddl->ddl_object, ==, 0); + ASSERT0(ddl->ddl_object); char name[DDT_NAMELEN]; ddt_log_name(ddt, name, n); @@ -176,11 +176,13 @@ ddt_log_update_stats(ddt_t *ddt) * that's reasonable to expect anyway. */ dmu_object_info_t doi; - uint64_t nblocks; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); - nblocks = doi.doi_physical_blocks_512; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); - nblocks += doi.doi_physical_blocks_512; + uint64_t nblocks = 0; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; ddt_object_t *ddo = &ddt->ddt_log_stats; ddo->ddo_count = @@ -194,7 +196,7 @@ void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) { ASSERT3U(nentries, >, 0); - ASSERT3P(dlu->dlu_dbp, ==, NULL); + ASSERT0P(dlu->dlu_dbp); if (ddt->ddt_log_active->ddl_object == 0) ddt_log_create(ddt, tx); @@ -243,6 +245,13 @@ ddt_log_alloc_entry(ddt_t *ddt) } static void +ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle) +{ + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); +} + +static void ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { /* Create the log tree entry from a live or stored entry */ @@ -347,8 +356,7 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -365,8 +373,7 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -527,8 +534,7 @@ ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); while ((ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
- ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); } ASSERT(avl_is_empty(&ddl->ddl_tree)); } @@ -727,7 +733,7 @@ ddt_log_load(ddt_t *ddt) ddle = fe; fe = AVL_NEXT(fl, fe); avl_remove(fl, ddle); - + ddt_log_free_entry(ddt, ddle); ddle = ae; ae = AVL_NEXT(al, ae); } @@ -748,8 +754,8 @@ ddt_log_load(ddt_t *ddt) void ddt_log_alloc(ddt_t *ddt) { - ASSERT3P(ddt->ddt_log_active, ==, NULL); - ASSERT3P(ddt->ddt_log_flushing, ==, NULL); + ASSERT0P(ddt->ddt_log_active); + ASSERT0P(ddt->ddt_log_flushing); avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare, sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 21c465328134..a7a5c89bdafb 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -759,6 +759,8 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + if (limit == 0) + end2 = start2; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; @@ -1343,7 +1345,7 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, if (size == 0) return; - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { @@ -1689,8 +1691,8 @@ dmu_object_cached_size(objset_t *os, uint64_t object, dmu_object_info_from_dnode(dn, &doi); - for (uint64_t off = 0; off < doi.doi_max_offset; - off += dmu_prefetch_max) { + for (uint64_t off = 0; off < doi.doi_max_offset && + dmu_prefetch_max > 0; off += dmu_prefetch_max) { /* dbuf_read doesn't prefetch L1 blocks. */ dmu_prefetch_by_dnode(dn, 1, off, dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ); @@ -1872,7 +1874,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { - ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT0(BP_GET_LEVEL(bp)); BP_SET_FILL(bp, 1); } } @@ -1966,7 +1968,7 @@ dmu_sync_late_arrival_done(zio_t *zio) blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); - ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg); + ASSERT(BP_GET_BIRTH(zio->io_bp) == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } @@ -2405,7 +2407,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } } } else if (wp & WP_NOFILL) { - ASSERT(level == 0); + ASSERT0(level); /* * If we're writing preallocated blocks, we aren't actually @@ -2508,6 +2510,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE; + zp->zp_rewrite = B_FALSE; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); @@ -2655,11 +2658,12 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, * operation into ZIL, or it may be impossible to replay, since * the block may appear not yet allocated at that point. 
*/ - if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { + if (BP_GET_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { error = SET_ERROR(EINVAL); goto out; } - if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { + if (BP_GET_PHYSICAL_BIRTH(bp) > + spa_last_synced_txg(os->os_spa)) { error = SET_ERROR(EAGAIN); goto out; } @@ -2731,7 +2735,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) { if (!BP_IS_EMBEDDED(bp)) { BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, - BP_GET_BIRTH(bp)); + BP_GET_PHYSICAL_BIRTH(bp)); + BP_SET_REWRITE(&dl->dr_overridden_by, 0); } else { BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); @@ -2862,7 +2867,7 @@ byteswap_uint64_array(void *vbuf, size_t size) size_t count = size >> 3; int i; - ASSERT((size & 7) == 0); + ASSERT0((size & 7)); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); @@ -2875,7 +2880,7 @@ byteswap_uint32_array(void *vbuf, size_t size) size_t count = size >> 2; int i; - ASSERT((size & 3) == 0); + ASSERT0((size & 3)); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); @@ -2888,7 +2893,7 @@ byteswap_uint16_array(void *vbuf, size_t size) size_t count = size >> 1; int i; - ASSERT((size & 1) == 0); + ASSERT0((size & 1)); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); diff --git a/sys/contrib/openzfs/module/zfs/dmu_diff.c b/sys/contrib/openzfs/module/zfs/dmu_diff.c index 86f751e886c9..fb13b2f87f57 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_diff.c +++ b/sys/contrib/openzfs/module/zfs/dmu_diff.c @@ -224,8 +224,8 @@ dmu_diff(const char *tosnap_name, const char *fromsnap_name, * call the ZFS_IOC_OBJ_TO_STATS ioctl. */ error = traverse_dataset(tosnap, fromtxg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, - diff_cb, &da); + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT | + TRAVERSE_LOGICAL, diff_cb, &da); if (error != 0) { da.da_err = error; diff --git a/sys/contrib/openzfs/module/zfs/dmu_direct.c b/sys/contrib/openzfs/module/zfs/dmu_direct.c index 12b0ffa2c99b..d44c686088fc 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_direct.c +++ b/sys/contrib/openzfs/module/zfs/dmu_direct.c @@ -95,16 +95,16 @@ dmu_write_direct_done(zio_t *zio) abd_free(zio->io_abd); mutex_enter(&db->db_mtx); - ASSERT3P(db->db_buf, ==, NULL); - ASSERT3P(dr->dt.dl.dr_data, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); + ASSERT0P(db->db_buf); + ASSERT0P(dr->dt.dl.dr_data); + ASSERT0P(db->db.db_data); db->db_state = DB_UNCACHED; mutex_exit(&db->db_mtx); dmu_sync_done(zio, NULL, zio->io_private); if (zio->io_error != 0) { - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) ASSERT3U(zio->io_error, ==, EIO); /* diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c index b4ff7d224cc9..207cc6d0e713 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_object.c +++ b/sys/contrib/openzfs/module/zfs/dmu_object.c @@ -90,7 +90,7 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, if (allocated_dnode != NULL) { ASSERT3P(tag, !=, NULL); } else { - ASSERT3P(tag, ==, NULL); + ASSERT0P(tag); tag = FTAG; } diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index b3f792e4ae6b..8e6b569c2100 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -345,12 +345,6 @@ smallblk_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; 
- /* - * Inheritance and range checking should have been done by now. - */ - ASSERT(newval <= SPA_MAXBLOCKSIZE); - ASSERT(ISP2(newval)); - os->os_zpl_special_smallblock = newval; } @@ -730,7 +724,7 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) if (err == 0) { mutex_enter(&ds->ds_lock); - ASSERT(ds->ds_objset == NULL); + ASSERT0P(ds->ds_objset); ds->ds_objset = os; mutex_exit(&ds->ds_lock); } @@ -1376,7 +1370,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, 6, ZFS_SPACE_CHECK_NORMAL); if (rv == 0) - zvol_create_minor(name); + zvol_create_minors(name); crfree(cr); @@ -2043,6 +2037,8 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); @@ -2076,6 +2072,10 @@ dnode_rele_task(void *arg) dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; + mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); dnode_rele(dn, &os->os_synced_dnodes); } @@ -2232,7 +2232,7 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) rf |= DB_RF_HAVESTRUCT; error = dmu_spill_hold_by_dnode(dn, rf, FTAG, (dmu_buf_t **)&db); - ASSERT(error == 0); + ASSERT0(error); mutex_enter(&db->db_mtx); data = (before) ? db->db.db_data : dmu_objset_userquota_find_data(db, tx); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index 3a4bd7a1cea9..45c7af2bdcd2 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -866,7 +866,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) */ if (dcp == NULL && drrb->drr_fromguid == 0 && drba->drba_origin == NULL) { - ASSERT3P(dcp, ==, NULL); + ASSERT0P(dcp); dcp = &dummy_dcp; if (featureflags & DMU_BACKUP_FEATURE_RAW) @@ -881,7 +881,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) if (drba->drba_cookie->drc_fromsnapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); - ASSERT3P(dcp, ==, NULL); + ASSERT0P(dcp); } if (drc->drc_heal) { /* When healing we want to use the provided snapshot */ @@ -905,7 +905,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); - ASSERT3P(dcp, ==, NULL); + ASSERT0P(dcp); } /* Create new dataset. 
*/ @@ -1403,7 +1403,7 @@ corrective_read_done(zio_t *zio) /* Corruption corrected; update error log if needed */ if (zio->io_error == 0) { spa_remove_error(data->spa, &data->zb, - BP_GET_LOGICAL_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); } kmem_free(data, sizeof (cr_cb_data_t)); abd_free(zio->io_abd); @@ -1530,7 +1530,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, } rrd->abd = abd; - io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp, + io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_BIRTH(bp), bp, abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb); @@ -2792,7 +2792,7 @@ receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf) drc->drc_rrd->bytes_read = drc->drc_bytes_read; } } else { - ASSERT3P(buf, ==, NULL); + ASSERT0P(buf); } drc->drc_prev_cksum = drc->drc_cksum; @@ -3450,7 +3450,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) break; } - ASSERT3P(drc->drc_rrd, ==, NULL); + ASSERT0P(drc->drc_rrd); drc->drc_rrd = drc->drc_next_rrd; drc->drc_next_rrd = NULL; /* Allocates and loads header into drc->drc_next_rrd */ @@ -3468,7 +3468,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) drc->drc_rrd = NULL; } - ASSERT3P(drc->drc_rrd, ==, NULL); + ASSERT0P(drc->drc_rrd); drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP); drc->drc_rrd->eos_marker = B_TRUE; bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1); @@ -3831,11 +3831,11 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) nvlist_free(drc->drc_keynvl); } else if (!drc->drc_heal) { if (drc->drc_newfs) { - zvol_create_minor(drc->drc_tofs); + zvol_create_minors(drc->drc_tofs); } char *snapname = kmem_asprintf("%s@%s", drc->drc_tofs, drc->drc_tosnap); - zvol_create_minor(snapname); + zvol_create_minors(snapname); kmem_strfree(snapname); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c index 65443d112f27..5a22ed71a5fe 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_redact.c +++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c @@ -370,8 +370,8 @@ redact_traverse_thread(void *arg) #endif err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg, - &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, - redact_cb, rt_arg); + &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_LOGICAL, redact_cb, rt_arg); if (err != EINTR) rt_arg->error_code = err; @@ -1067,7 +1067,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, } if (err != 0) goto out; - VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL); + VERIFY0P(nvlist_next_nvpair(redactnvl, pair)); boolean_t resuming = B_FALSE; zfs_bookmark_phys_t bookmark; diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c index 4f27f3df0e55..8ecb99d5f57c 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c @@ -962,7 +962,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) char *data = NULL; if (srdp->abd != NULL) { data = abd_to_buf(srdp->abd); - ASSERT3P(srdp->abuf, ==, NULL); + ASSERT0P(srdp->abuf); } else if (srdp->abuf != NULL) { data = srdp->abuf->b_data; } @@ -1084,7 +1084,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { - spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); + spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp)); return (SET_ERROR(EIO)); } @@ -1210,7 +1210,7 @@ 
send_traverse_thread(void *arg) err = traverse_dataset_resume(st_arg->os->os_dsl_dataset, st_arg->fromtxg, &st_arg->resume, - st_arg->flags, send_cb, st_arg); + st_arg->flags | TRAVERSE_LOGICAL, send_cb, st_arg); if (err != EINTR) st_arg->error_code = err; @@ -2514,7 +2514,7 @@ dmu_send_impl(struct dmu_send_params *dspp) * list in the stream. */ if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) { - ASSERT3P(from_rl, ==, NULL); + ASSERT0P(from_rl); fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps); if (dspp->numfromredactsnaps > 0) { @@ -2891,7 +2891,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, &fromds); if (err != 0) { - ASSERT3P(fromds, ==, NULL); + ASSERT0P(fromds); } else { /* * We need to make a deep copy of the redact diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c index f534a7dd64e3..dd1df1705040 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_traverse.c +++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c @@ -74,6 +74,15 @@ static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp, static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, uint64_t objset, uint64_t object); +static inline uint64_t +get_birth_time(traverse_data_t *td, const blkptr_t *bp) +{ + if (td->td_flags & TRAVERSE_LOGICAL) + return (BP_GET_LOGICAL_BIRTH(bp)); + else + return (BP_GET_BIRTH(bp)); +} + static int traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) @@ -85,7 +94,7 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, return (0); if (claim_txg == 0 && - BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa)) + get_birth_time(td, bp) >= spa_min_claim_txg(td->td_spa)) return (-1); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, @@ -110,7 +119,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) + if (claim_txg == 0 || get_birth_time(td, bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); @@ -194,7 +203,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp, */ if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE) return (B_FALSE); - if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) + if (BP_IS_HOLE(bp) || get_birth_time(td, bp) <= td->td_min_txg) return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return (B_FALSE); @@ -265,7 +274,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_object == DMU_META_DNODE_OBJECT) && td->td_hole_birth_enabled_txg <= td->td_min_txg) return (0); - } else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) { + } else if (get_birth_time(td, bp) <= td->td_min_txg) { return (0); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index d85d8b89423e..40c0b3402a05 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -126,7 +126,7 @@ dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, * problem, but there's no way for it to happen (for * now, at least). 
*/ - ASSERT(dn->dn_assigned_txg == 0); + ASSERT0(dn->dn_assigned_txg); dn->dn_assigned_txg = tx->tx_txg; (void) zfs_refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); @@ -443,7 +443,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dnode_t *dn = txh->txh_dnode; int err; - ASSERT(tx->tx_txg == 0); + ASSERT0(tx->tx_txg); if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; @@ -607,7 +607,7 @@ dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) dnode_t *dn = txh->txh_dnode; int err; - ASSERT(tx->tx_txg == 0); + ASSERT0(tx->tx_txg); dmu_tx_count_dnode(txh); @@ -681,7 +681,7 @@ dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; - ASSERT(tx->tx_txg == 0); + ASSERT0(tx->tx_txg); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, 0, 0); @@ -706,7 +706,7 @@ dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; - ASSERT(tx->tx_txg == 0); + ASSERT0(tx->tx_txg); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); @@ -1232,7 +1232,7 @@ dmu_tx_assign(dmu_tx_t *tx, dmu_tx_flag_t flags) { int err; - ASSERT(tx->tx_txg == 0); + ASSERT0(tx->tx_txg); ASSERT0(flags & ~(DMU_TX_WAIT | DMU_TX_NOTHROTTLE | DMU_TX_SUSPEND)); IMPLY(flags & DMU_TX_SUSPEND, flags & DMU_TX_WAIT); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); @@ -1328,7 +1328,7 @@ dmu_tx_wait(dmu_tx_t *tx) dsl_pool_t *dp = tx->tx_pool; hrtime_t before; - ASSERT(tx->tx_txg == 0); + ASSERT0(tx->tx_txg); ASSERT(!dsl_pool_config_held(tx->tx_pool)); /* @@ -1644,12 +1644,12 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); if (sa->sa_force_spill || may_grow || hdl->sa_spill) { - ASSERT(tx->tx_txg == 0); + ASSERT0(tx->tx_txg); dmu_tx_hold_spill(tx, object); } else { DB_DNODE_ENTER(db); if (DB_DNODE(db)->dn_have_spill) { - ASSERT(tx->tx_txg == 0); + ASSERT0(tx->tx_txg); dmu_tx_hold_spill(tx, object); } DB_DNODE_EXIT(db); diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 51165d0bf723..3d3a9c713568 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -57,19 +57,19 @@ static unsigned int zfetch_max_sec_reap = 2; /* min bytes to prefetch per stream (default 2MB) */ static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ -unsigned int zfetch_max_distance = 8 * 1024 * 1024; +static unsigned int zfetch_max_distance = 8 * 1024 * 1024; #else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ -unsigned int zfetch_max_distance = 64 * 1024 * 1024; +static unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 128MB) */ -unsigned int zfetch_max_idistance = 128 * 1024 * 1024; +static unsigned int zfetch_max_idistance = 128 * 1024 * 1024; /* max request reorder distance within a stream (default 16MB) */ -unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +static unsigned int zfetch_max_reorder = 16 * 1024 * 1024; /* Max log2 fraction of holes in a stream */ -unsigned int zfetch_hole_shift = 2; +static unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 904a039edf95..6c150d31c669 
100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -86,6 +86,19 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT; static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ +static char * +rt_name(dnode_t *dn, const char *name) +{ + struct objset *os = dn->dn_objset; + + return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}", + spa_name(os->os_spa), + (u_longlong_t)(os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET), + (u_longlong_t)dn->dn_object, + name)); +} + static int dbuf_compare(const void *x1, const void *x2) { @@ -160,9 +173,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; + dn->dn_dirtycnt = 0; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; @@ -201,7 +212,7 @@ dnode_dest(void *arg, void *unused) for (int i = 0; i < TXG_SIZE; i++) { ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); - ASSERT3P(dn->dn_free_ranges[i], ==, NULL); + ASSERT0P(dn->dn_free_ranges[i]); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); @@ -216,12 +227,10 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_dirtyctx); - ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); - ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT0(dn->dn_dirtycnt); + ASSERT0P(dn->dn_bonus); ASSERT(!dn->dn_have_spill); - ASSERT3P(dn->dn_zio, ==, NULL); + ASSERT0P(dn->dn_zio); ASSERT0(dn->dn_oldused); ASSERT0(dn->dn_oldflags); ASSERT0(dn->dn_olduid); @@ -305,7 +314,7 @@ dnode_kstats_update(kstat_t *ksp, int rw) void dnode_init(void) { - ASSERT(dnode_cache == NULL); + ASSERT0P(dnode_cache); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_RECLAIMABLE); kmem_cache_set_move(dnode_cache, dnode_move); @@ -496,7 +505,7 @@ dnode_buf_byteswap(void *vbuf, size_t size) int i = 0; ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT)); - ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0); + ASSERT0((size & (sizeof (dnode_phys_t)-1))); while (i < size) { dnode_phys_t *dnp = (void *)(((char *)vbuf) + i); @@ -660,7 +669,7 @@ dnode_destroy(dnode_t *dn) objset_t *os = dn->dn_objset; boolean_t complete_os_eviction = B_FALSE; - ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); + ASSERT0((dn->dn_id_flags & DN_ID_NEW_EXIST)); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); @@ -679,10 +688,8 @@ dnode_destroy(dnode_t *dn) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); @@ -767,7 +774,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_next_maxblkid[i]); ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); - ASSERT3P(dn->dn_free_ranges[i], ==, NULL); + ASSERT0P(dn->dn_free_ranges[i]); } dn->dn_type = ot; @@ -787,11 +794,9 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; - dn->dn_dirtyctx = 0; 
dn->dn_free_txg = 0; - dn->dn_dirtyctx_firstset = NULL; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -942,10 +947,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; - ndn->dn_dirty_txg = odn->dn_dirty_txg; - ndn->dn_dirtyctx = odn->dn_dirtyctx; - ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; - ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0); + ndn->dn_dirtycnt = odn->dn_dirtycnt; + ASSERT0(zfs_refcount_count(&odn->dn_tx_holds)); zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); @@ -1007,9 +1010,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; - odn->dn_dirty_txg = 0; - odn->dn_dirtyctx = 0; - odn->dn_dirtyctx_firstset = NULL; + odn->dn_dirtycnt = 0; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; @@ -1260,8 +1261,8 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) } else if (DN_SLOT_IS_PTR(dn)) { mutex_enter(&dn->dn_mtx); boolean_t can_free = (dn->dn_type == DMU_OT_NONE && - zfs_refcount_is_zero(&dn->dn_holds) && - !DNODE_IS_DIRTY(dn)); + dn->dn_dirtycnt == 0 && + zfs_refcount_is_zero(&dn->dn_holds)); mutex_exit(&dn->dn_mtx); if (!can_free) @@ -1744,17 +1745,23 @@ dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) * reference on the dnode. Returns FALSE if unable to add a * new reference. */ +static boolean_t +dnode_add_ref_locked(dnode_t *dn, const void *tag) +{ + ASSERT(MUTEX_HELD(&dn->dn_mtx)); + if (zfs_refcount_is_zero(&dn->dn_holds)) + return (FALSE); + VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + return (TRUE); +} + boolean_t dnode_add_ref(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); - if (zfs_refcount_is_zero(&dn->dn_holds)) { - mutex_exit(&dn->dn_mtx); - return (FALSE); - } - VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + boolean_t r = dnode_add_ref_locked(dn, tag); mutex_exit(&dn->dn_mtx); - return (TRUE); + return (r); } void @@ -1817,31 +1824,20 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode itself is dirty, or is carrying any uncommitted records. - * It is important to check both conditions, as some operations (eg appending - * to a file) can dirty both as a single logical unit, but they are not synced - * out atomically, so checking one and not the other can result in an object - * appearing to be clean mid-way through a commit. + * Test if the dnode is dirty, or carrying uncommitted records. * - * Do not change this lightly! If you get it wrong, dmu_offset_next() can - * detect a hole where there is really data, leading to silent corruption. + * dn_dirtycnt is the number of txgs this dnode is dirty on. It's incremented + * in dnode_setdirty() the first time the dnode is dirtied on a txg, and + * decremented in either dnode_rele_task() or userquota_updates_task() when the + * txg is synced out. 
*/ boolean_t dnode_is_dirty(dnode_t *dn) { mutex_enter(&dn->dn_mtx); - - for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i]) || - !list_is_empty(&dn->dn_dirty_records[i])) { - mutex_exit(&dn->dn_mtx); - return (B_TRUE); - } - } - + boolean_t dirty = (dn->dn_dirtycnt != 0); mutex_exit(&dn->dn_mtx); - - return (B_FALSE); + return (dirty); } void @@ -1903,7 +1899,11 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. */ - VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + mutex_enter(&dn->dn_mtx); + VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg)); + dn->dn_dirtycnt++; + ASSERT3U(dn->dn_dirtycnt, <=, 3); + mutex_exit(&dn->dn_mtx); (void) dbuf_dirty(dn->dn_dbuf, tx); @@ -2208,32 +2208,6 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&dn->dn_dbufs_mtx); } -void -dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) -{ - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. - */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - - if (ds != NULL) { - rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - if (dmu_tx_is_syncing(tx)) - dn->dn_dirtyctx = DN_DIRTY_SYNC; - else - dn->dn_dirtyctx = DN_DIRTY_OPEN; - dn->dn_dirtyctx_firstset = tag; - } - if (ds != NULL) { - rrw_exit(&ds->ds_bp_rwlock, tag); - } - } -} - static void dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, dmu_tx_t *tx) @@ -2291,7 +2265,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if ((off >> blkshift) > dn->dn_maxblkid) return; } else { - ASSERT(dn->dn_maxblkid == 0); + ASSERT0(dn->dn_maxblkid); if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. 
@@ -2436,8 +2410,10 @@ done: { int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + dn->dn_free_ranges[txgoff] = + zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges")); } zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); @@ -2509,7 +2485,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) } space += delta; if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { - ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); + ASSERT0((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES)); ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT)); dn->dn_phys->dn_used = space >> DEV_BSHIFT; } else { diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c index 4067f221f1bf..046ceddb3609 100644 --- a/sys/contrib/openzfs/module/zfs/dnode_sync.c +++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c @@ -209,8 +209,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; - ASSERT(err == 0); - ASSERT(child->db_level == 0); + ASSERT0(err); + ASSERT0(child->db_level); dr = dbuf_find_dirty_eq(child, txg); /* data_old better be zeroed */ @@ -868,7 +868,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - ASSERT3P(list_head(list), ==, NULL); + ASSERT0P(list_head(list)); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c index e301fe19f645..ee574c499f9f 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c +++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c @@ -243,7 +243,7 @@ dsl_bookmark_create_check_impl(dsl_pool_t *dp, /* error is retval of the following if-cascade */ if (strchr(source, '@') != NULL) { dsl_dataset_t *source_snap_ds; - ASSERT3S(snapshot_namecheck(source, NULL, NULL), ==, 0); + ASSERT0(snapshot_namecheck(source, NULL, NULL)); error = dsl_dataset_hold(dp, source, FTAG, &source_snap_ds); if (error == 0) { VERIFY(source_snap_ds->ds_is_snapshot); @@ -258,7 +258,7 @@ dsl_bookmark_create_check_impl(dsl_pool_t *dp, } } else if (strchr(source, '#') != NULL) { zfs_bookmark_phys_t source_phys; - ASSERT3S(bookmark_namecheck(source, NULL, NULL), ==, 0); + ASSERT0(bookmark_namecheck(source, NULL, NULL)); /* * Source must exists and be an earlier point in newbm_ds's * timeline (newbm_ds's origin may be a snap of source's ds) @@ -501,7 +501,7 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, sizeof (uint64_t) * num_redact_snaps); local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; if (bookmark_redacted) { - ASSERT3P(redaction_list, ==, NULL); + ASSERT0P(redaction_list); local_rl->rl_phys->rlp_last_blkid = UINT64_MAX; local_rl->rl_phys->rlp_last_object = UINT64_MAX; dsl_redaction_list_long_rele(local_rl, tag); @@ -1523,7 +1523,7 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * If the block was live (referenced) at the time of this * bookmark, add its space to the bookmark's FBN. 
*/ - if (BP_GET_LOGICAL_BIRTH(bp) <= + if (BP_GET_BIRTH(bp) <= dbn->dbn_phys.zbm_creation_txg && (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { mutex_enter(&dbn->dbn_lock); diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c index db568f42d24e..f519b937edc0 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c +++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c @@ -534,7 +534,7 @@ out: static void dsl_crypto_key_free(dsl_crypto_key_t *dck) { - ASSERT(zfs_refcount_count(&dck->dck_holds) == 0); + ASSERT0(zfs_refcount_count(&dck->dck_holds)); /* destroy the zio_crypt_key_t */ zio_crypt_key_destroy(&dck->dck_key); @@ -866,7 +866,7 @@ spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp, dsl_pool_rele(dp, FTAG); /* create any zvols under this ds */ - zvol_create_minors_recursive(dsname); + zvol_create_minors(dsname); return (0); @@ -1912,7 +1912,7 @@ dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, /* clones always use their origin's wrapping key */ if (dsl_dir_is_clone(dd)) { - ASSERT3P(dcp, ==, NULL); + ASSERT0P(dcp); /* * If this is an encrypted clone we just need to clone the diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index c0a7872c40ad..420687480a76 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -159,7 +159,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) return; } - ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, + ASSERT3U(BP_GET_BIRTH(bp), >, dsl_dataset_phys(ds)->ds_prev_snap_txg); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); @@ -194,7 +194,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * they do not need to be freed. */ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && + BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -263,7 +263,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, return (0); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg); + ASSERT(BP_GET_BIRTH(bp) <= tx->tx_txg); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); @@ -281,7 +281,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, * they do not need to be freed. 
*/ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && + BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -289,7 +289,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, bplist_append(&ds->ds_dir->dd_pending_frees, bp); } - if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + if (BP_GET_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; /* @@ -346,14 +346,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); /* if (logical birth > prev prev snap txg) prev unique += bs */ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) > + ds->ds_object && BP_GET_BIRTH(bp) > dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) { + if (BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -450,7 +450,7 @@ dsl_dataset_evict_sync(void *dbu) { dsl_dataset_t *ds = dbu; - ASSERT(ds->ds_owner == NULL); + ASSERT0P(ds->ds_owner); unique_remove(ds->ds_fsid_guid); } @@ -460,7 +460,7 @@ dsl_dataset_evict_async(void *dbu) { dsl_dataset_t *ds = dbu; - ASSERT(ds->ds_owner == NULL); + ASSERT0P(ds->ds_owner); ds->ds_dbuf = NULL; @@ -1187,7 +1187,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); + ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); @@ -2005,7 +2005,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) if (error == 0) { for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { - zvol_create_minor(nvpair_name(pair)); + zvol_create_minors(nvpair_name(pair)); } } @@ -2112,7 +2112,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *rio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); - ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); + ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj); /* * in case we had to change ds_fsid_guid when we opened it, @@ -2944,7 +2944,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) if (snap == NULL) return (B_FALSE); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds)); + birth = BP_GET_BIRTH(dsl_dataset_get_blkptr(ds)); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { objset_t *os, *os_snap; @@ -3413,7 +3413,7 @@ dsl_dataset_clone(const char *clone, const char *origin) 6, ZFS_SPACE_CHECK_NORMAL); if (rv == 0) - zvol_create_minor(clone); + zvol_create_minors(clone); crfree(cr); @@ -4180,7 +4180,7 @@ dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_pool_t *dp = dmu_tx_pool(tx); int64_t unused_refres_delta; - ASSERT(clone->ds_reserved == 0); + 
ASSERT0(clone->ds_reserved); /* * NOTE: On DEBUG kernels there could be a race between this and * the check function if spa_asize_inflation is adjusted... diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c index 3113d932fb68..41ac72bf1c16 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c +++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c @@ -484,7 +484,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); - dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp); + dle_tofind.dle_mintxg = BP_GET_BIRTH(bp); dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); @@ -493,7 +493,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, if (dle == NULL) { zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", - bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp)); + bp, (longlong_t)BP_GET_BIRTH(bp)); dle = avl_first(&dl->dl_tree); } @@ -1037,7 +1037,7 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, avl_tree_t *avl = lia->avl; bplist_t *to_free = lia->to_free; zthr_t *t = lia->t; - ASSERT(tx == NULL); + ASSERT0P(tx); if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) return (SET_ERROR(EINTR)); @@ -1049,7 +1049,8 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); ASSERT3U(BP_GET_CHECKSUM(bp), ==, BP_GET_CHECKSUM(&found->le_bp)); - ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp)); + ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==, + BP_GET_PHYSICAL_BIRTH(&found->le_bp)); } if (bp_freed) { if (found == NULL) { diff --git a/sys/contrib/openzfs/module/zfs/dsl_deleg.c b/sys/contrib/openzfs/module/zfs/dsl_deleg.c index c01a06e98340..fdd37b36e280 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_deleg.c +++ b/sys/contrib/openzfs/module/zfs/dsl_deleg.c @@ -102,7 +102,7 @@ dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) nvlist_t *perms; nvpair_t *permpair = NULL; - VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + VERIFY0(nvpair_value_nvlist(whopair, &perms)); while ((permpair = nvlist_next_nvpair(perms, permpair))) { const char *perm = nvpair_name(permpair); @@ -189,8 +189,7 @@ dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) const char *perm = nvpair_name(permpair); uint64_t n = 0; - VERIFY(zap_update(mos, jumpobj, - perm, 8, 1, &n, tx) == 0); + VERIFY0(zap_update(mos, jumpobj, perm, 8, 1, &n, tx)); spa_history_log_internal_dd(dd, "permission update", tx, "%s %s", whokey, perm); } @@ -225,7 +224,7 @@ dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == 0) { (void) zap_remove(mos, zapobj, whokey, tx); - VERIFY(0 == zap_destroy(mos, jumpobj, tx)); + VERIFY0(zap_destroy(mos, jumpobj, tx)); } spa_history_log_internal_dd(dd, "permission who remove", tx, "%s", whokey); @@ -243,7 +242,7 @@ dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { (void) zap_remove(mos, zapobj, whokey, tx); - VERIFY(0 == zap_destroy(mos, + VERIFY0(zap_destroy(mos, jumpobj, tx)); } spa_history_log_internal_dd(dd, "permission remove", tx, @@ -332,7 +331,7 @@ dsl_deleg_get(const char *ddname, nvlist_t **nvp) basezc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); baseza = zap_attribute_alloc(); source = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 
KM_SLEEP); - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); for (dd = startdd; dd != NULL; dd = dd->dd_parent) { nvlist_t *sp_nvp; @@ -706,7 +705,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, ZFS_DELEG_LOCAL, &uid); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); + VERIFY0(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx)); } za = zap_attribute_alloc(); @@ -716,8 +715,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, uint64_t zero = 0; ASSERT(za->za_integer_length == 8 && za->za_num_integers == 1); - VERIFY(zap_update(mos, jumpobj, za->za_name, - 8, 1, &zero, tx) == 0); + VERIFY0(zap_update(mos, jumpobj, za->za_name, 8, 1, &zero, tx)); } zap_cursor_fini(&zc); zap_attribute_free(za); @@ -761,10 +759,10 @@ dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { ASSERT(za->za_integer_length == 8 && za->za_num_integers == 1); - VERIFY(0 == zap_destroy(mos, za->za_first_integer, tx)); + VERIFY0(zap_destroy(mos, za->za_first_integer, tx)); } zap_cursor_fini(&zc); - VERIFY(0 == zap_destroy(mos, zapobj, tx)); + VERIFY0(zap_destroy(mos, zapobj, tx)); zap_attribute_free(za); return (0); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c index f5ec93b2dc5c..ea01ee586f8b 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c +++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c @@ -133,11 +133,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); - if (BP_GET_LOGICAL_BIRTH(bp) <= + if (BP_GET_BIRTH(bp) <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); if (poa->ds_prev && !poa->after_branch_point && - BP_GET_LOGICAL_BIRTH(bp) > + BP_GET_BIRTH(bp) > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); @@ -315,8 +315,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, - tx->tx_txg); + ASSERT3U(BP_GET_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(zfs_refcount_is_zero(&ds->ds_longholds)); @@ -351,7 +350,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_dataset_deactivate_feature(ds, f, tx); } if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - ASSERT3P(ds->ds_prev, ==, NULL); + ASSERT0P(ds->ds_prev); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev)); after_branch_point = @@ -466,7 +465,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) &used, &comp, &uncomp); dsl_dataset_phys(ds_next)->ds_unique_bytes += used; dsl_dataset_rele(ds_nextnext, FTAG); - ASSERT3P(ds_next->ds_prev, ==, NULL); + ASSERT0P(ds_next->ds_prev); /* Collapse range in this head. 
*/ dsl_dataset_t *hds; @@ -526,7 +525,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) /* remove from snapshot namespace */ dsl_dataset_t *ds_head; - ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0); + ASSERT0(dsl_dataset_phys(ds)->ds_snapnames_zapobj); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head)); VERIFY0(dsl_dataset_get_snapname(ds)); @@ -729,8 +728,8 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { - ASSERT(zilog == NULL); - ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, + ASSERT0P(zilog); + ASSERT3U(BP_GET_BIRTH(bp), >, dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } @@ -1020,8 +1019,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(ds->ds_prev == NULL || dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, - tx->tx_txg); + ASSERT3U(BP_GET_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c index f24cd2049533..6ce1890cfea1 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dir.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c @@ -151,8 +151,8 @@ dsl_dir_evict_async(void *dbu) for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); - ASSERT(dd->dd_tempreserved[t] == 0); - ASSERT(dd->dd_space_towrite[t] == 0); + ASSERT0(dd->dd_tempreserved[t]); + ASSERT0(dd->dd_space_towrite[t]); } if (dd->dd_parent) diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index f1088d87208b..f47822df8b53 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -522,8 +522,8 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)), /* create and open the free_bplist */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); - VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } @@ -1056,7 +1056,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) * will be wrong. 
*/ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp)); + ASSERT0(BP_GET_BIRTH(&dsl_dataset_phys(prev)->ds_bp)); rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ @@ -1077,7 +1077,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) dsl_dataset_phys(prev)->ds_num_children++; if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { - ASSERT(ds->ds_prev == NULL); + ASSERT0P(ds->ds_prev); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); @@ -1173,7 +1173,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) dsl_dataset_t *ds; ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dp->dp_origin_snap == NULL); + ASSERT0P(dp->dp_origin_snap); ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); /* create the origin dir, ds, & snap-ds */ @@ -1250,7 +1250,7 @@ dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; - ASSERT(dp->dp_tmp_userrefs_obj == 0); + ASSERT0(dp->dp_tmp_userrefs_obj); ASSERT(dmu_tx_is_syncing(tx)); dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, diff --git a/sys/contrib/openzfs/module/zfs/dsl_prop.c b/sys/contrib/openzfs/module/zfs/dsl_prop.c index b76f22df61e2..51f624da5689 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_prop.c +++ b/sys/contrib/openzfs/module/zfs/dsl_prop.c @@ -815,7 +815,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, */ err = zap_update(mos, zapobj, recvdstr, intsz, numints, value, tx); - ASSERT(err == 0); + ASSERT0(err); break; case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): /* @@ -1166,7 +1166,7 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, if (nvlist_exists(nv, propname)) continue; - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP)); if (za->za_integer_length == 1) { /* * String property @@ -1179,8 +1179,7 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, kmem_free(tmp, za->za_num_integers); break; } - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, - tmp) == 0); + VERIFY0(nvlist_add_string(propval, ZPROP_VALUE, tmp)); kmem_free(tmp, za->za_num_integers); } else { /* @@ -1191,8 +1190,8 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, za->za_first_integer); } - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + VERIFY0(nvlist_add_string(propval, ZPROP_SOURCE, source)); + VERIFY0(nvlist_add_nvlist(nv, propname, propval)); nvlist_free(propval); } zap_cursor_fini(&zc); @@ -1215,7 +1214,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, int err = 0; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); if (ds->ds_is_snapshot) flags |= DSL_PROP_GET_SNAPSHOT; @@ -1333,18 +1332,18 @@ dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) uint64_t default_value; if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + VERIFY0(nvlist_add_uint64(propval, ZPROP_VALUE, value)); return; } - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + VERIFY0(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP)); + VERIFY0(nvlist_add_uint64(propval, ZPROP_VALUE, value)); /* Indicate the default source if we can. 
*/ if (dodefault(prop, 8, 1, &default_value) == 0 && value == default_value) { - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); + VERIFY0(nvlist_add_string(propval, ZPROP_SOURCE, "")); } - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + VERIFY0(nvlist_add_nvlist(nv, propname, propval)); nvlist_free(propval); } @@ -1355,13 +1354,13 @@ dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) const char *propname = zfs_prop_to_name(prop); if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + VERIFY0(nvlist_add_string(propval, ZPROP_VALUE, value)); return; } - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + VERIFY0(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP)); + VERIFY0(nvlist_add_string(propval, ZPROP_VALUE, value)); + VERIFY0(nvlist_add_nvlist(nv, propname, propval)); nvlist_free(propval); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 1b2cd3e361d1..fcd50c459d07 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -454,7 +454,7 @@ static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { sio->sio_blk_prop = bp->blk_prop; - sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); + sio->sio_phys_birth = BP_GET_RAW_PHYSICAL_BIRTH(bp); sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp); sio->sio_cksum = bp->blk_cksum; sio->sio_nr_dvas = BP_GET_NDVAS(bp); @@ -1768,7 +1768,7 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || - BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) + BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1778,13 +1778,13 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, * scrub there's nothing to do to it). 
*/ if (claim_txg == 0 && - BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa)) + BP_GET_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + VERIFY0(scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } @@ -1804,7 +1804,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || - BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) + BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1812,7 +1812,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, * already txg sync'ed (but this log block contains * other records that are not synced) */ - if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) + if (claim_txg == 0 || BP_GET_BIRTH(bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); @@ -1820,7 +1820,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); - VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + VERIFY0(scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } @@ -1952,7 +1952,7 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) return; if (BP_IS_HOLE(bp) || - BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg || + BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; @@ -2223,7 +2223,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, if (dnp != NULL && dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { scn->scn_phys.scn_errors++; - spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); + spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } @@ -2319,7 +2319,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, * by arc_read() for the cases above. */ scn->scn_phys.scn_errors++; - spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); + spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } @@ -2396,7 +2396,12 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, if (f != SPA_FEATURE_NONE) ASSERT(dsl_dataset_feature_is_active(ds, f)); - if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) { + /* + * Recurse any blocks that were written either logically or physically + * at or after cur_min_txg. About logical birth we care for traversal, + * looking for any changes, while about physical for the actual scan. + */ + if (BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } @@ -2422,7 +2427,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, * Don't scan it now unless we need to because something * under it was modified. 
*/ - if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + if (BP_GET_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; return; } @@ -4806,7 +4811,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_GET_BIRTH(bp); + uint64_t phys_birth = BP_GET_PHYSICAL_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; @@ -5136,7 +5141,7 @@ dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) mutex_enter(&svd->vdev_scan_io_queue_lock); mutex_enter(&tvd->vdev_scan_io_queue_lock); - VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); + VERIFY0P(tvd->vdev_scan_io_queue); tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; svd->vdev_scan_io_queue = NULL; if (tvd->vdev_scan_io_queue != NULL) diff --git a/sys/contrib/openzfs/module/zfs/dsl_userhold.c b/sys/contrib/openzfs/module/zfs/dsl_userhold.c index 57c70e4ce3d2..f91b7a1eb69a 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_userhold.c +++ b/sys/contrib/openzfs/module/zfs/dsl_userhold.c @@ -335,7 +335,7 @@ dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) dduha.dduha_holds = holds; /* chkholds can have non-unique name */ - VERIFY(0 == nvlist_alloc(&dduha.dduha_chkholds, 0, KM_SLEEP)); + VERIFY0(nvlist_alloc(&dduha.dduha_chkholds, 0, KM_SLEEP)); dduha.dduha_errlist = errlist; dduha.dduha_minor = cleanup_minor; diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c index a092817efedd..ae788b2310d8 100644 --- a/sys/contrib/openzfs/module/zfs/fm.c +++ b/sys/contrib/openzfs/module/zfs/fm.c @@ -337,7 +337,7 @@ zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, } } - VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0); + VERIFY0(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE)); if (size > *event_size) { *event_size = size; error = ENOMEM; diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 43b94eba2d58..9f4399af56bd 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -375,13 +375,23 @@ static metaslab_stats_t metaslab_stats = { #define METASLABSTAT_BUMP(stat) \ atomic_inc_64(&metaslab_stats.stat.value.ui64); +char * +metaslab_rt_name(metaslab_group_t *mg, metaslab_t *ms, const char *name) +{ + return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}", + spa_name(mg->mg_vd->vdev_spa), + (u_longlong_t)mg->mg_vd->vdev_guid, + (u_longlong_t)ms->ms_id, + name)); +} + static kstat_t *metaslab_ksp; void metaslab_stat_init(void) { - ASSERT(metaslab_alloc_trace_cache == NULL); + ASSERT0P(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = kmem_cache_create( "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -446,16 +456,16 @@ metaslab_class_destroy(metaslab_class_t *mc) { spa_t *spa = mc->mc_spa; - ASSERT(mc->mc_alloc == 0); - ASSERT(mc->mc_deferred == 0); - ASSERT(mc->mc_space == 0); - ASSERT(mc->mc_dspace == 0); + ASSERT0(mc->mc_alloc); + ASSERT0(mc->mc_deferred); + ASSERT0(mc->mc_space); + ASSERT0(mc->mc_dspace); for (int i = 0; i < spa->spa_alloc_count; i++) { metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; avl_destroy(&mca->mca_tree); mutex_destroy(&mca->mca_lock); - ASSERT(mca->mca_rotor == NULL); + ASSERT0P(mca->mca_rotor); ASSERT0(mca->mca_reserved); } mutex_destroy(&mc->mc_lock); @@ -750,7 +760,8 @@ 
metaslab_class_histogram_verify(metaslab_class_t *mc) } IMPLY(mg == mg->mg_vd->vdev_log_mg, - mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || + mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; @@ -1076,8 +1087,8 @@ metaslab_group_destroy(metaslab_group_t *mg) { spa_t *spa = mg->mg_class->mc_spa; - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); + ASSERT0P(mg->mg_prev); + ASSERT0P(mg->mg_next); /* * We may have gone below zero with the activation count * either because we never activated in the first place or @@ -1107,8 +1118,8 @@ metaslab_group_activate(metaslab_group_t *mg) ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); + ASSERT0P(mg->mg_prev); + ASSERT0P(mg->mg_next); ASSERT(mg->mg_activation_count <= 0); if (++mg->mg_activation_count <= 0) @@ -1153,8 +1164,8 @@ metaslab_group_passivate(metaslab_group_t *mg) if (--mg->mg_activation_count != 0) { for (int i = 0; i < spa->spa_alloc_count; i++) ASSERT(mc->mc_allocator[i].mca_rotor != mg); - ASSERT(mg->mg_prev == NULL); - ASSERT(mg->mg_next == NULL); + ASSERT0P(mg->mg_prev); + ASSERT0P(mg->mg_next); ASSERT(mg->mg_activation_count < 0); return; } @@ -1183,14 +1194,16 @@ metaslab_group_passivate(metaslab_group_t *mg) if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); + metaslab_weight(msp, B_TRUE) & + ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); } msp = mga->mga_secondary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); + metaslab_weight(msp, B_TRUE) & + ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); } } @@ -1288,7 +1301,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { IMPLY(mg == mg->mg_vd->vdev_log_mg, - mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || + mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += @@ -1316,7 +1330,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); IMPLY(mg == mg->mg_vd->vdev_log_mg, - mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || + mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; @@ -1330,7 +1345,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { - ASSERT(msp->ms_group == NULL); + ASSERT0P(msp->ms_group); mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; @@ -2895,30 +2910,43 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); - ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start, - shift); + ms->ms_allocatable = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_allocatable")); for (int t = 0; t < TXG_SIZE; t++) { - ms->ms_allocating[t] = 
zfs_range_tree_create(NULL, type, - NULL, start, shift); - } - ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift); - ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift); + ms->ms_allocating[t] = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, + metaslab_rt_name(mg, ms, "ms_allocating")); + } + ms->ms_freeing = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freeing")); + ms->ms_freed = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freed")); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL, - start, shift); + ms->ms_defer[t] = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_defer")); } - ms->ms_checkpointing = - zfs_range_tree_create(NULL, type, NULL, start, shift); - ms->ms_unflushed_allocs = - zfs_range_tree_create(NULL, type, NULL, start, shift); + ms->ms_checkpointing = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_checkpointing")); + ms->ms_unflushed_allocs = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_allocs")); metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); mrap->mra_bt = &ms->ms_unflushed_frees_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; - ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops, - type, mrap, start, shift); + ms->ms_unflushed_frees = zfs_range_tree_create_flags( + &metaslab_rt_ops, type, mrap, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_frees")); - ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift); + ms->ms_trim = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_trim")); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); @@ -2989,7 +3017,7 @@ metaslab_fini(metaslab_t *msp) metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); - VERIFY(msp->ms_group == NULL); + VERIFY0P(msp->ms_group); /* * If this metaslab hasn't been through metaslab_sync_done() yet its @@ -3892,7 +3920,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, &start, &shift); - condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift); + condense_tree = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, + metaslab_rt_name(msp->ms_group, msp, "condense_tree")); for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_walk(msp->ms_defer[t], @@ -3949,8 +3980,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. 
*/ - zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL, - start, shift); + zfs_range_tree_t *tmp_tree = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, + metaslab_rt_name(msp->ms_group, msp, "tmp_tree")); zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); @@ -5199,29 +5232,16 @@ next: /* * We were unable to allocate from this metaslab so determine - * a new weight for this metaslab. Now that we have loaded - * the metaslab we can provide a better hint to the metaslab - * selector. - * - * For space-based metaslabs, we use the maximum block size. - * This information is only available when the metaslab - * is loaded and is more accurate than the generic free - * space weight that was calculated by metaslab_weight(). - * This information allows us to quickly compare the maximum - * available allocation in the metaslab to the allocation - * size being requested. - * - * For segment-based metaslabs, determine the new weight - * based on the highest bucket in the range tree. We - * explicitly use the loaded segment weight (i.e. the range - * tree histogram) since it contains the space that is - * currently available for allocation and is accurate - * even within a sync pass. + * a new weight for this metaslab. The weight was last + * recalculated either when we loaded it (if this is the first + * TXG it's been loaded in), or the last time a txg was synced + * out. */ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - weight = metaslab_largest_allocatable(msp); - WEIGHT_SET_SPACEBASED(weight); + metaslab_set_fragmentation(msp, B_TRUE); + weight = metaslab_space_weight(msp) & + ~METASLAB_ACTIVE_MASK; } else { weight = metaslab_weight_from_range_tree(msp); } @@ -5233,13 +5253,6 @@ next: * For the case where we use the metaslab that is * active for another allocator we want to make * sure that we retain the activation mask. - * - * Note that we could attempt to use something like - * metaslab_recalculate_weight_and_sort() that - * retains the activation mask here. That function - * uses metaslab_weight() to set the weight though - * which is not as accurate as the calculations - * above. */ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(mg, msp, weight); @@ -5590,7 +5603,21 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; uint64_t physical_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); - BP_SET_PHYSICAL_BIRTH(bp, physical_birth); + + /* + * For rewritten blocks, use the old physical birth as the new logical + * birth (representing when the space was allocated) and the removal + * time as the new physical birth (representing when it was actually + * written). 
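For illustration, a hedged worked example of the birth-field shuffle described in the comment above; all txg numbers are invented and the pre-remap state assumes a rewrite keeps the original logical birth, which is inferred rather than shown in this hunk.

/*
 * Worked example (invented txgs): a block allocated in txg 100 and later
 * rewritten in place in txg 350 carries (logical birth = 100,
 * physical birth = 350, REWRITE = 1).  If its top-level vdev is removed
 * while the pool is around txg 500, the indirect-births lookup returns
 * physical_birth ~= 500 and the remap below produces:
 *
 *   logical birth  = 350   (the old physical birth, i.e. when this
 *                           copy's space was allocated)
 *   physical birth = 500   (the removal time, when the data was
 *                           actually written to its new location)
 *   REWRITE        = 0
 *
 * which is BP_SET_BIRTH(bp, old_physical_birth, physical_birth) followed
 * by BP_SET_REWRITE(bp, 0), and satisfies the
 * ASSERT3U(old_physical_birth, <, physical_birth) check.
 */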
+ */ + if (BP_GET_REWRITE(bp)) { + uint64_t old_physical_birth = BP_GET_PHYSICAL_BIRTH(bp); + ASSERT3U(old_physical_birth, <, physical_birth); + BP_SET_BIRTH(bp, old_physical_birth, physical_birth); + BP_SET_REWRITE(bp, 0); + } else { + BP_SET_PHYSICAL_BIRTH(bp, physical_birth); + } DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); @@ -5712,7 +5739,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) ASSERT(!vd->vdev_removing); ASSERT(vdev_is_concrete(vd)); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); - ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); + ASSERT0P(vd->vdev_indirect_mapping); if (DVA_GET_GANG(dva)) size = vdev_gang_header_asize(vd); @@ -5757,21 +5784,21 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) } /* - * Reserve some allocation slots. The reservation system must be called - * before we call into the allocator. If there aren't any available slots - * then the I/O will be throttled until an I/O completes and its slots are - * freed up. The function returns true if it was successful in placing - * the reservation. + * Reserve some space for a future allocation. The reservation system must be + * called before we call into the allocator. If there aren't enough space + * available, the calling I/O will be throttled until another I/O completes and + * its reservation is released. The function returns true if it was successful + * in placing the reservation. */ boolean_t -metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, - boolean_t must, boolean_t *more) +metaslab_class_throttle_reserve(metaslab_class_t *mc, int allocator, + int copies, uint64_t io_size, boolean_t must, boolean_t *more) { - metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - if (mc->mc_alloc_io_size < zio->io_size) { - mc->mc_alloc_io_size = zio->io_size; + if (mc->mc_alloc_io_size < io_size) { + mc->mc_alloc_io_size = io_size; metaslab_class_balance(mc, B_FALSE); } if (must || mca->mca_reserved <= mc->mc_alloc_max) { @@ -5782,10 +5809,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, * worst that can happen is few more I/Os get to allocation * earlier, that is not a problem. 
*/ - int64_t delta = slots * zio->io_size; + int64_t delta = copies * io_size; *more = (atomic_add_64_nv(&mca->mca_reserved, delta) <= mc->mc_alloc_max); - zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } *more = B_FALSE; @@ -5793,13 +5819,13 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, } boolean_t -metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, - zio_t *zio) +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int allocator, + int copies, uint64_t io_size) { - metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - int64_t delta = slots * zio->io_size; + int64_t delta = copies * io_size; return (atomic_add_64_nv(&mca->mca_reserved, -delta) <= mc->mc_alloc_max); } @@ -5960,7 +5986,7 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, int error = 0; ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); - ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); + ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -5971,16 +5997,16 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); - ASSERT(BP_GET_NDVAS(bp) == 0); + ASSERT0(BP_GET_NDVAS(bp)); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); ASSERT3P(zal, !=, NULL); - uint64_t cur_psize = 0; - + uint64_t smallest_psize = UINT64_MAX; for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva_range(spa, mc, psize, max_psize, - dva, d, hintdva, txg, flags, zal, allocator, - actual_psize ? &cur_psize : NULL); + uint64_t cur_psize = 0; + error = metaslab_alloc_dva_range(spa, mc, psize, + MIN(smallest_psize, max_psize), dva, d, hintdva, txg, + flags, zal, allocator, actual_psize ? &cur_psize : NULL); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); @@ -6000,13 +6026,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize, DVA_GET_VDEV(&dva[d]), allocator, flags, psize, tag); if (actual_psize) - max_psize = MIN(cur_psize, max_psize); + smallest_psize = MIN(cur_psize, smallest_psize); } } - ASSERT(error == 0); + ASSERT0(error); ASSERT(BP_GET_NDVAS(bp) == ndvas); if (actual_psize) - *actual_psize = max_psize; + *actual_psize = smallest_psize; spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -6022,7 +6048,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); + ASSERT(!now || BP_GET_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that @@ -6040,7 +6066,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) * normally as they will be referenced by the checkpointed uberblock. 
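A minimal, hedged sketch of how a caller is expected to pair the reworked reserve/unreserve interface above, which now takes an allocator index, a copy (DVA) count, and an I/O size instead of a zio_t. Variable names are illustrative; the real callers live in the zio pipeline and are not part of this hunk.

	boolean_t more;

	/* Try to reserve room for `copies` DVAs of an io_size-byte write. */
	if (metaslab_class_throttle_reserve(mc, allocator, copies, io_size,
	    B_FALSE, &more)) {
		/* ... proceed with the allocation for this I/O ... */

		/* On completion, release exactly what was reserved. */
		(void) metaslab_class_throttle_unreserve(mc, allocator,
		    copies, io_size);
	} else {
		/*
		 * Not enough room: the I/O is expected to be throttled and
		 * retried once another I/O completes and releases its
		 * reservation.
		 */
	}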
*/ boolean_t checkpoint = B_FALSE; - if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && + if (BP_GET_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index f3665d29b8b4..7db72b9b04b0 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -260,7 +260,7 @@ mmp_thread_stop(spa_t *spa) zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu", spa_name(spa), gethrtime()); - ASSERT(mmp->mmp_thread == NULL); + ASSERT0P(mmp->mmp_thread); mmp->mmp_thread_exiting = 0; } diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c index 7b85d19e19ee..46fb79269310 100644 --- a/sys/contrib/openzfs/module/zfs/multilist.c +++ b/sys/contrib/openzfs/module/zfs/multilist.c @@ -81,7 +81,7 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset, ml->ml_num_sublists = num; ml->ml_index_func = index_func; - ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * + ml->ml_sublists = vmem_zalloc(sizeof (multilist_sublist_t) * ml->ml_num_sublists, KM_SLEEP); ASSERT3P(ml->ml_sublists, !=, NULL); @@ -134,7 +134,7 @@ multilist_destroy(multilist_t *ml) } ASSERT3P(ml->ml_sublists, !=, NULL); - kmem_free(ml->ml_sublists, + vmem_free(ml->ml_sublists, sizeof (multilist_sublist_t) * ml->ml_num_sublists); ml->ml_num_sublists = 0; diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index 373636c69254..ea2d2c7227c8 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -201,10 +201,10 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t, ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf, zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare) -zfs_range_tree_t * -zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, +static zfs_range_tree_t * +zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, - uint64_t gap) + uint64_t gap, uint64_t flags, const char *name) { zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP); @@ -236,6 +236,8 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, rt->rt_ops = ops; rt->rt_gap = gap; + rt->rt_flags = flags; + rt->rt_name = name; rt->rt_arg = arg; rt->rt_type = type; rt->rt_start = start; @@ -248,10 +250,29 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, } zfs_range_tree_t * +zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + uint64_t gap) +{ + return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap, + 0, NULL)); +} + +zfs_range_tree_t * zfs_range_tree_create(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift) { - return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0)); + return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0, + 0, NULL)); +} + +zfs_range_tree_t * +zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + uint64_t flags, const char *name) +{ + return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0, + flags, name)); } void @@ -262,6 +283,9 @@ 
zfs_range_tree_destroy(zfs_range_tree_t *rt) if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); + if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME)) + kmem_strfree((char *)(uintptr_t)rt->rt_name); + zfs_btree_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } @@ -271,15 +295,17 @@ zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs, int64_t delta) { if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) { - zfs_panic_recover("zfs: attempting to decrease fill to or " - "below 0; probable double remove in segment [%llx:%llx]", + zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to " + "or below 0; probable double remove in segment [%llx:%llx]", + ZFS_RT_NAME(rt), (longlong_t)zfs_rs_get_start(rs, rt), (longlong_t)zfs_rs_get_end(rs, rt)); } if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) { - zfs_panic_recover("zfs: attempting to increase fill beyond " - "max; probable double add in segment [%llx:%llx]", + zfs_panic_recover("zfs: rt=%s: attempting to increase fill " + "beyond max; probable double add in segment [%llx:%llx]", + ZFS_RT_NAME(rt), (longlong_t)zfs_rs_get_start(rs, rt), (longlong_t)zfs_rs_get_end(rs, rt)); } @@ -319,14 +345,17 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * the normal code paths. */ if (rs != NULL) { + uint64_t rstart = zfs_rs_get_start(rs, rt); + uint64_t rend = zfs_rs_get_end(rs, rt); if (gap == 0) { - zfs_panic_recover("zfs: adding existent segment to " - "range tree (offset=%llx size=%llx)", - (longlong_t)start, (longlong_t)size); + zfs_panic_recover("zfs: rt=%s: adding segment " + "(offset=%llx size=%llx) overlapping with existing " + "one (offset=%llx size=%llx)", + ZFS_RT_NAME(rt), + (longlong_t)start, (longlong_t)size, + (longlong_t)rstart, (longlong_t)(rend - rstart)); return; } - uint64_t rstart = zfs_rs_get_start(rs, rt); - uint64_t rend = zfs_rs_get_end(rs, rt); if (rstart <= start && rend >= end) { zfs_range_tree_adjust_fill(rt, rs, fill); return; @@ -348,7 +377,7 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) return; } - ASSERT3P(rs, ==, NULL); + ASSERT0P(rs); /* * Determine whether or not we will have to merge with our neighbors. @@ -451,6 +480,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, zfs_range_seg_t *rs; zfs_range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; + uint64_t rstart, rend; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); @@ -464,12 +494,15 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, /* Make sure we completely overlap with someone */ if (rs == NULL) { - zfs_panic_recover("zfs: removing nonexistent segment from " - "range tree (offset=%llx size=%llx)", - (longlong_t)start, (longlong_t)size); + zfs_panic_recover("zfs: rt=%s: removing nonexistent segment " + "from range tree (offset=%llx size=%llx)", + ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size); return; } + rstart = zfs_rs_get_start(rs, rt); + rend = zfs_rs_get_end(rs, rt); + /* * Range trees with gap support must only remove complete segments * from the tree. 
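A hedged sketch of the naming hook added above: zfs_range_tree_create_flags() threads a flags word and a name down to the shared constructor; with ZFS_RT_F_DYN_NAME the tree takes ownership of the string and zfs_range_tree_destroy() releases it with kmem_strfree(), and ZFS_RT_NAME(rt) lets the new rt=%s diagnostics identify the tree. The kmem_asprintf() name below is invented for illustration.

	/* Create a named 64-bit range tree (no ops, no gap support). */
	zfs_range_tree_t *rt = zfs_range_tree_create_flags(
	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
	    ZFS_RT_F_DYN_NAME, kmem_asprintf("{example %s}", "scratch"));

	zfs_range_tree_add(rt, 0, 1 << 20);	/* [0, 1M) */
	zfs_range_tree_remove(rt, 0, 1 << 20);

	/* With ZFS_RT_F_DYN_NAME set, this also frees the name string. */
	zfs_range_tree_destroy(rt);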
This allows us to maintain accurate fill accounting @@ -479,31 +512,36 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, if (rt->rt_gap != 0) { if (do_fill) { if (zfs_rs_get_fill(rs, rt) == size) { - start = zfs_rs_get_start(rs, rt); - end = zfs_rs_get_end(rs, rt); + start = rstart; + end = rend; size = end - start; } else { zfs_range_tree_adjust_fill(rt, rs, -size); return; } - } else if (zfs_rs_get_start(rs, rt) != start || - zfs_rs_get_end(rs, rt) != end) { - zfs_panic_recover("zfs: freeing partial segment of " - "gap tree (offset=%llx size=%llx) of " + } else if (rstart != start || rend != end) { + zfs_panic_recover("zfs: rt=%s: freeing partial segment " + "of gap tree (offset=%llx size=%llx) of " "(offset=%llx size=%llx)", + ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size, - (longlong_t)zfs_rs_get_start(rs, rt), - (longlong_t)zfs_rs_get_end(rs, rt) - - zfs_rs_get_start(rs, rt)); + (longlong_t)rstart, (longlong_t)(rend - rstart)); return; } } - VERIFY3U(zfs_rs_get_start(rs, rt), <=, start); - VERIFY3U(zfs_rs_get_end(rs, rt), >=, end); + if (!(rstart <= start && rend >= end)) { + panic("zfs: rt=%s: removing segment " + "(offset=%llx size=%llx) not completely overlapped by " + "existing one (offset=%llx size=%llx)", + ZFS_RT_NAME(rt), + (longlong_t)start, (longlong_t)size, + (longlong_t)rstart, (longlong_t)(rend - rstart)); + return; + } - left_over = (zfs_rs_get_start(rs, rt) != start); - right_over = (zfs_rs_get_end(rs, rt) != end); + left_over = (rstart != start); + right_over = (rend != end); zfs_range_tree_stat_decr(rt, rs); @@ -829,7 +867,7 @@ zfs_range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, next = zfs_btree_next(&removefrom->rt_root, &where, &where); } - VERIFY3P(curr, ==, NULL); + VERIFY0P(curr); if (start != end) { VERIFY3U(start, <, end); diff --git a/sys/contrib/openzfs/module/zfs/rrwlock.c b/sys/contrib/openzfs/module/zfs/rrwlock.c index 8ee784619839..d0df39b93560 100644 --- a/sys/contrib/openzfs/module/zfs/rrwlock.c +++ b/sys/contrib/openzfs/module/zfs/rrwlock.c @@ -108,7 +108,7 @@ rrn_add(rrwlock_t *rrl, const void *tag) rn->rn_rrl = rrl; rn->rn_next = tsd_get(rrw_tsd_key); rn->rn_tag = tag; - VERIFY(tsd_set(rrw_tsd_key, rn) == 0); + VERIFY0(tsd_set(rrw_tsd_key, rn)); } /* @@ -129,7 +129,7 @@ rrn_find_and_remove(rrwlock_t *rrl, const void *tag) if (prev) prev->rn_next = rn->rn_next; else - VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); + VERIFY0(tsd_set(rrw_tsd_key, rn->rn_next)); kmem_free(rn, sizeof (*rn)); return (B_TRUE); } @@ -155,7 +155,7 @@ rrw_destroy(rrwlock_t *rrl) { mutex_destroy(&rrl->rr_lock); cv_destroy(&rrl->rr_cv); - ASSERT(rrl->rr_writer == NULL); + ASSERT0P(rrl->rr_writer); zfs_refcount_destroy(&rrl->rr_anon_rcount); zfs_refcount_destroy(&rrl->rr_linked_rcount); } @@ -188,7 +188,7 @@ rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, const void *tag) } else { (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag); } - ASSERT(rrl->rr_writer == NULL); + ASSERT0P(rrl->rr_writer); mutex_exit(&rrl->rr_lock); } diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c index 5db470ce6242..7ad25d4d85ba 100644 --- a/sys/contrib/openzfs/module/zfs/sa.c +++ b/sys/contrib/openzfs/module/zfs/sa.c @@ -304,7 +304,7 @@ sa_get_spill(sa_handle_t *hdl) if (hdl->sa_spill == NULL) { if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill)) == 0) - VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + VERIFY0(sa_build_index(hdl, SA_SPILL)); } else { rc = 0; } @@ -432,7 +432,7 @@ 
sa_add_layout_entry(objset_t *os, const sa_attr_type_t *attrs, int attr_count, (void) snprintf(attr_name, sizeof (attr_name), "%d", (int)lot_num); - VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, + VERIFY0(zap_update(os, os->os_sa->sa_layout_attr_obj, attr_name, 2, attr_count, attrs, tx)); } @@ -505,7 +505,7 @@ sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) } error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); - ASSERT(error == 0); + ASSERT0(error); return (error); } @@ -717,7 +717,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > hdl->sa_spill->db_size) - VERIFY(0 == sa_resize_spill(hdl, + VERIFY0(sa_resize_spill(hdl, BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); } @@ -791,7 +791,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, hdl->sa_bonus_tab = NULL; } if (!sa->sa_force_spill) - VERIFY(0 == sa_build_index(hdl, SA_BONUS)); + VERIFY0(sa_build_index(hdl, SA_BONUS)); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); if (!spilling) { @@ -801,10 +801,10 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; - VERIFY(0 == dmu_rm_spill(hdl->sa_os, + VERIFY0(dmu_rm_spill(hdl->sa_os, sa_handle_object(hdl), tx)); } else { - VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + VERIFY0(sa_build_index(hdl, SA_SPILL)); } } @@ -1733,10 +1733,10 @@ sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) NULL, dxattr_obj, dxattr_size); } - VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); - VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0); + VERIFY0(dmu_set_bonustype(db, DMU_OT_SA, tx)); + VERIFY0(sa_replace_all_by_template_locked(hdl, attrs, count, tx)); if (znode_acl.z_acl_extern_obj) { - VERIFY(0 == dmu_object_free(zfsvfs->z_os, + VERIFY0(dmu_object_free(zfsvfs->z_os, znode_acl.z_acl_extern_obj, tx)); } @@ -1858,7 +1858,7 @@ sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) continue; ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, tb[i].sa_byteswap); - VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, + VERIFY0(zap_update(hdl->sa_os, sa->sa_reg_attr_obj, tb[i].sa_name, 8, 1, &attr_value, tx)); tb[i].sa_registered = B_TRUE; } @@ -2013,7 +2013,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, * Only a variable-sized attribute can be * replaced here, and its size must be changing. */ - ASSERT3U(reg_length, ==, 0); + ASSERT0(reg_length); ASSERT3U(length, !=, buflen); SA_ADD_BULK_ATTR(attr_desc, j, attr, locator, datastart, buflen); diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 6b52c6cb1f9e..b3bb46da263b 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -100,6 +100,7 @@ #include <sys/vmsystm.h> #endif /* _KERNEL */ +#include "zfs_crrd.h" #include "zfs_prop.h" #include "zfs_comutil.h" #include <cityhash.h> @@ -311,6 +312,41 @@ static int zfs_livelist_condense_zthr_cancel = 0; static int zfs_livelist_condense_new_alloc = 0; /* + * Time variable to decide how often the txg should be added into the + * database (in seconds). + * The smallest available resolution is in minutes, which means an update occurs + * each time we reach `spa_note_txg_time` and the txg has changed. We provide + * a 256-slot ring buffer for minute-level resolution. 
The number is limited by + * the size of the structure we use and the maximum amount of bytes we can write + * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately + * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of + * high-resolution data. + * + * The user can decrease `spa_note_txg_time` to increase resolution within + * a day, at the cost of retaining fewer days of data. Alternatively, increasing + * the interval allows storing data over a longer period, but with lower + * frequency. + * + * This parameter does not affect the daily or monthly databases, as those only + * store one record per day and per month, respectively. + */ +static uint_t spa_note_txg_time = 10 * 60; + +/* + * How often flush txg database to a disk (in seconds). + * We flush data every time we write to it, making it the most reliable option. + * Since this happens every 10 minutes, it shouldn't introduce any noticeable + * overhead for the system. In case of failure, we will always have an + * up-to-date version of the database. + * + * The user can adjust the flush interval to a lower value, but it probably + * doesn't make sense to flush more often than the database is updated. + * The user can also increase the interval if they're concerned about the + * performance of writing the entire database to disk. + */ +static uint_t spa_flush_txg_time = 10 * 60; + +/* * ========================================================================== * SPA properties routines * ========================================================================== @@ -390,10 +426,10 @@ spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, { nvlist_t *propval; - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); - VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + VERIFY0(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP)); + VERIFY0(nvlist_add_uint64(propval, ZPROP_SOURCE, src)); + VERIFY0(nvlist_add_string(propval, ZPROP_VALUE, strval)); + VERIFY0(nvlist_add_nvlist(nvl, propname, propval)); nvlist_free(propval); } @@ -417,11 +453,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv) alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); + alloc += metaslab_class_get_alloc( + spa_special_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); + size += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); @@ -925,7 +965,7 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) uint64_t ver = 0; if (prop == ZPOOL_PROP_VERSION) { - VERIFY(nvpair_value_uint64(elem, &ver) == 0); + VERIFY0(nvpair_value_uint64(elem, &ver)); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; @@ -1255,7 +1295,7 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { - ASSERT3U(tqs->stqs_count, ==, 0); + ASSERT0(tqs->stqs_count); return; } @@ -1679,6 +1719,8 @@ spa_activate(spa_t *spa, spa_mode_t mode) "embedded_log", msp, 
B_TRUE); spa->spa_special_class = metaslab_class_create(spa, "special", msp, B_FALSE); + spa->spa_special_embedded_log_class = metaslab_class_create(spa, + "special_embedded_log", msp, B_TRUE); spa->spa_dedup_class = metaslab_class_create(spa, "dedup", msp, B_FALSE); @@ -1794,9 +1836,9 @@ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); - ASSERT(spa->spa_dsl_pool == NULL); - ASSERT(spa->spa_root_vdev == NULL); - ASSERT(spa->spa_async_zio_root == NULL); + ASSERT0P(spa->spa_dsl_pool); + ASSERT0P(spa->spa_root_vdev); + ASSERT0P(spa->spa_async_zio_root); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); spa_evicting_os_wait(spa); @@ -1853,6 +1895,9 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; + metaslab_class_destroy(spa->spa_special_embedded_log_class); + spa->spa_special_embedded_log_class = NULL; + metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; @@ -1976,7 +2021,7 @@ spa_unload_log_sm_flush_all(spa_t *spa) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); - ASSERT3U(spa->spa_log_flushall_txg, ==, 0); + ASSERT0(spa->spa_log_flushall_txg); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); @@ -2031,6 +2076,111 @@ spa_destroy_aux_threads(spa_t *spa) } } +static void +spa_sync_time_logger(spa_t *spa, uint64_t txg) +{ + uint64_t curtime; + dmu_tx_t *tx; + + if (!spa_writeable(spa)) { + return; + } + curtime = gethrestime_sec(); + if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) { + return; + } + + if (txg > spa->spa_last_noted_txg) { + spa->spa_last_noted_txg_time = curtime; + spa->spa_last_noted_txg = txg; + + mutex_enter(&spa->spa_txg_log_time_lock); + dbrrd_add(&spa->spa_txg_log_time, curtime, txg); + mutex_exit(&spa->spa_txg_log_time_lock); + } + + if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) { + return; + } + spa->spa_last_flush_txg_time = curtime; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_minutes, tx)); + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_days, tx)); + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_months, tx)); + dmu_tx_commit(tx); +} + +static void +spa_unload_sync_time_logger(spa_t *spa) +{ + uint64_t txg; + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + + txg = dmu_tx_get_txg(tx); + spa->spa_last_noted_txg_time = 0; + spa->spa_last_flush_txg_time = 0; + spa_sync_time_logger(spa, txg); + + dmu_tx_commit(tx); +} + +static void +spa_load_txg_log_time(spa_t *spa) +{ + int error; + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_minutes); + if (error != 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "minute resolution [error=%d]", error); + } + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_days); + if (error 
!= 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "day resolution [error=%d]", error); + } + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_months); + if (error != 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "month resolution [error=%d]", error); + } +} + +static boolean_t +spa_should_sync_time_logger_on_unload(spa_t *spa) +{ + + if (!spa_writeable(spa)) + return (B_FALSE); + + if (!spa->spa_sync_on) + return (B_FALSE); + + if (spa_state(spa) != POOL_STATE_EXPORTED) + return (B_FALSE); + + if (spa->spa_last_noted_txg == 0) + return (B_FALSE); + + return (B_TRUE); +} + + /* * Opposite of spa_load(). */ @@ -2052,6 +2202,9 @@ spa_unload(spa_t *spa) * we delay the final TXGs beyond what spa_final_txg is set at. */ if (spa->spa_final_txg == UINT64_MAX) { + if (spa_should_sync_time_logger_on_unload(spa)) + spa_unload_sync_time_logger(spa); + /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some @@ -2127,7 +2280,7 @@ spa_unload(spa_t *spa) */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); - ASSERT(spa->spa_root_vdev == NULL); + ASSERT0P(spa->spa_root_vdev); /* * Close the dsl pool. @@ -2265,8 +2418,8 @@ spa_load_spares(spa_t *spa) spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { - VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, - VDEV_ALLOC_SPARE) == 0); + VERIFY0(spa_config_parse(spa, &vd, spares[i], NULL, 0, + VDEV_ALLOC_SPARE)); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; @@ -2393,8 +2546,8 @@ spa_load_l2cache(spa_t *spa) /* * Create new vdev */ - VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, - VDEV_ALLOC_L2CACHE) == 0); + VERIFY0(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, + VDEV_ALLOC_L2CACHE)); ASSERT(vd != NULL); newvdevs[i] = vd; @@ -2646,7 +2799,7 @@ spa_passivate_log(spa_t *spa) vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { - ASSERT3P(tvd->vdev_log_mg, ==, NULL); + ASSERT0P(tvd->vdev_log_mg); metaslab_group_passivate(tvd->vdev_mg); slog_found = B_TRUE; } @@ -2669,7 +2822,7 @@ spa_activate_log(spa_t *spa) vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog) { - ASSERT3P(tvd->vdev_log_mg, ==, NULL); + ASSERT0P(tvd->vdev_log_mg); metaslab_group_activate(tvd->vdev_mg); } } @@ -2709,8 +2862,8 @@ spa_claim_notify(zio_t *zio) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ - if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) - spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); + if (spa->spa_claim_max_txg < BP_GET_BIRTH(zio->io_bp)) + spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp); mutex_exit(&spa->spa_props_lock); } @@ -3106,7 +3259,7 @@ spa_livelist_delete_cb(void *arg, zthr_t *z) static void spa_start_livelist_destroy_thread(spa_t *spa) { - ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); + ASSERT0P(spa->spa_livelist_delete_zthr); spa->spa_livelist_delete_zthr = zthr_create("z_livelist_destroy", spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, @@ -3122,7 +3275,7 @@ static int livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { - ASSERT(tx == NULL); + ASSERT0P(tx); livelist_new_arg_t *lna = arg; if (bp_freed) { bplist_append(lna->frees, bp); @@ -3316,7 +3469,7 @@ spa_start_livelist_condensing_thread(spa_t 
*spa) spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; - ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); + ASSERT0P(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = zthr_create("z_livelist_condense", spa_livelist_condense_cb_check, @@ -3333,7 +3486,7 @@ spa_spawn_aux_threads(spa_t *spa) spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); - ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); + ASSERT0P(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = zthr_create("z_checkpoint_discard", spa_checkpoint_discard_thread_check, @@ -3768,20 +3921,17 @@ out: * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { - const char *hostname = "<unknown>"; - uint64_t hostid = 0; - if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { - hostname = fnvlist_lookup_string(mmp_label, - ZPOOL_CONFIG_HOSTNAME); + const char *hostname = fnvlist_lookup_string( + mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { - hostid = fnvlist_lookup_uint64(mmp_label, - ZPOOL_CONFIG_HOSTID); + uint64_t hostid = fnvlist_lookup_uint64( + mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } @@ -3941,11 +4091,11 @@ spa_ld_parse_config(spa_t *spa, spa_import_type_t type) nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); - ASSERT(spa->spa_comment == NULL); + ASSERT0P(spa->spa_comment); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); - ASSERT(spa->spa_compatibility == NULL); + ASSERT0P(spa->spa_compatibility); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, &compatibility) == 0) spa->spa_compatibility = spa_strdup(compatibility); @@ -4711,6 +4861,9 @@ spa_ld_get_props(spa_t *spa) if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* Load time log */ + spa_load_txg_log_time(spa); + /* * Load the persistent error log. If we have an older pool, this will * not be present. 
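To make the sizing comment for the new txg-time tunables concrete, a hedged back-of-the-envelope in C-comment form; the 256-slot figure is taken from the comment above rather than from a constant visible in this hunk.

/*
 * With the defaults (spa_note_txg_time = spa_flush_txg_time = 600 s):
 *
 *   minute-ring updates per day = 86400 / 600             = 144
 *   minute-ring coverage        = 256 slots / 144 per day ~= 1.8 days
 *
 * Lowering spa_note_txg_time buys finer resolution over a shorter
 * window; raising it stretches the window at coarser resolution.  The
 * day and month rings (DMU_POOL_TXG_LOG_TIME_DAYS / _MONTHS) keep one
 * record per day and per month regardless of this tunable.
 */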
@@ -5760,7 +5913,7 @@ spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, nvlist_free(config); if (state == SPA_LOAD_RECOVER) { - ASSERT3P(loadinfo, ==, NULL); + ASSERT0P(loadinfo); spa_import_progress_remove(spa_guid(spa)); return (rewind_error); } else { @@ -5899,7 +6052,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, } if (firstopen) - zvol_create_minors_recursive(spa_name(spa)); + zvol_create_minors(spa_name(spa)); *spapp = spa; @@ -6877,7 +7030,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) mutex_exit(&spa_namespace_lock); - zvol_create_minors_recursive(pool); + zvol_create_minors(pool); spa_import_os(spa); @@ -7134,6 +7287,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa_config_exit(spa, SCL_ALL, FTAG); } + if (spa_should_sync_time_logger_on_unload(spa)) + spa_unload_sync_time_logger(spa); + /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some @@ -8935,7 +9091,7 @@ spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); @@ -8946,7 +9102,7 @@ spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) int spa_scan_stop(spa_t *spa) { - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); @@ -8963,7 +9119,7 @@ int spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, uint64_t txgend) { - ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); @@ -9092,6 +9248,8 @@ spa_async_thread(void *arg) old_space += metaslab_class_get_space(spa_dedup_class(spa)); old_space += metaslab_class_get_space( spa_embedded_log_class(spa)); + old_space += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); @@ -9100,6 +9258,8 @@ spa_async_thread(void *arg) new_space += metaslab_class_get_space(spa_dedup_class(spa)); new_space += metaslab_class_get_space( spa_embedded_log_class(spa)); + new_space += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* @@ -9388,7 +9548,7 @@ spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); - VERIFY(zio_wait(zio) == 0); + VERIFY0(zio_wait(zio)); } /* @@ -9427,7 +9587,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) size_t nvsize = 0; dmu_buf_t *db; - VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); + VERIFY0(nvlist_size(nv, &nvsize, NV_ENCODE_XDR)); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration @@ -9437,15 +9597,15 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_SLEEP); - VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, - KM_SLEEP) == 0); + VERIFY0(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, + KM_SLEEP)); memset(packed + nvsize, 0, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, 
tx); vmem_free(packed, bufsize); - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); @@ -10180,6 +10340,8 @@ spa_sync(spa_t *spa, uint64_t txg) */ brt_pending_apply(spa, txg); + spa_sync_time_logger(spa, txg); + /* * Lock out configuration changes. */ @@ -10222,6 +10384,7 @@ spa_sync(spa_t *spa, uint64_t txg) dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + @@ -10309,7 +10472,7 @@ spa_sync(spa_t *spa, uint64_t txg) metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); - /* spa_embedded_log_class has only one metaslab per vdev. */ + /* Embedded log classes have only one metaslab per vdev. */ metaslab_class_evict_old(spa->spa_special_class, txg); metaslab_class_evict_old(spa->spa_dedup_class, txg); @@ -10378,7 +10541,7 @@ spa_sync_tq_create(spa_t *spa, const char *name) { kthread_t **kthreads; - ASSERT(spa->spa_sync_tq == NULL); + ASSERT0P(spa->spa_sync_tq); ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); /* @@ -11095,6 +11258,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); +ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW, + "How frequently TXG timestamps are stored internally (in seconds)"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW, + "How frequently the TXG timestamps database should be flushed " + "to disk (in seconds)"); + #ifdef _KERNEL ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index 7d4d06659146..cf28955b0c50 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -48,18 +48,17 @@ /* * Pool configuration repository. * - * Pool configuration is stored as a packed nvlist on the filesystem. By - * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot - * (when the ZFS module is loaded). Pools can also have the 'cachefile' - * property set that allows them to be stored in an alternate location until - * the control of external software. + * Pool configuration is stored as a packed nvlist on the filesystem. When + * pools are imported they are added to the /etc/zfs/zpool.cache file and + * removed from it when exported. For each cache file, we have a single nvlist + * which holds all the configuration information. Pools can also have the + * 'cachefile' property set which allows this config to be stored in an + * alternate location under the control of external software. * - * For each cache file, we have a single nvlist which holds all the - * configuration information. When the module loads, we read this information - * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is - * maintained independently in spa.c. Whenever the namespace is modified, or - * the configuration of a pool is changed, we call spa_write_cachefile(), which - * walks through all the active pools and writes the configuration to disk. 
+ * The kernel independantly maintains an AVL tree of imported pools. See the + * "SPA locking" comment in spa.c. Whenever a pool configuration is modified + * we call spa_write_cachefile() which walks through all the active pools and + * writes the updated configuration to to /etc/zfs/zpool.cache file. */ static uint64_t spa_config_generation = 1; @@ -69,94 +68,6 @@ static uint64_t spa_config_generation = 1; * userland pools when doing testing. */ char *spa_config_path = (char *)ZPOOL_CACHE; -#ifdef _KERNEL -static int zfs_autoimport_disable = B_TRUE; -#endif - -/* - * Called when the module is first loaded, this routine loads the configuration - * file into the SPA namespace. It does not actually open or load the pools; it - * only populates the namespace. - */ -void -spa_config_load(void) -{ - void *buf = NULL; - nvlist_t *nvlist, *child; - nvpair_t *nvpair; - char *pathname; - zfs_file_t *fp; - zfs_file_attr_t zfa; - uint64_t fsize; - int err; - -#ifdef _KERNEL - if (zfs_autoimport_disable) - return; -#endif - - /* - * Open the configuration file. - */ - pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); - - err = zfs_file_open(pathname, O_RDONLY, 0, &fp); - -#ifdef __FreeBSD__ - if (err) - err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); -#endif - kmem_free(pathname, MAXPATHLEN); - - if (err) - return; - - if (zfs_file_getattr(fp, &zfa)) - goto out; - - fsize = zfa.zfa_size; - buf = kmem_alloc(fsize, KM_SLEEP); - - /* - * Read the nvlist from the file. - */ - if (zfs_file_read(fp, buf, fsize, NULL) < 0) - goto out; - - /* - * Unpack the nvlist. - */ - if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) - goto out; - - /* - * Iterate over all elements in the nvlist, creating a new spa_t for - * each one with the specified configuration. 
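A hedged sketch of the cache file layout being described; the pool names are invented. The removed spa_config_load() loop that follows iterated over exactly this structure, one top-level nvpair per pool.

/*
 * /etc/zfs/zpool.cache: a single packed nvlist whose top-level pairs
 * map pool names to that pool's config nvlist, e.g.
 *
 *   "tank"   -> { version, name, pool_guid, state, vdev_tree, ... }
 *   "backup" -> { version, name, pool_guid, state, vdev_tree, ... }
 *
 * Each value is the pool's configuration, which the old loader handed
 * to spa_add() to populate the SPA namespace.
 */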
- */ - mutex_enter(&spa_namespace_lock); - nvpair = NULL; - while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { - if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) - continue; - - child = fnvpair_value_nvlist(nvpair); - - if (spa_lookup(nvpair_name(nvpair)) != NULL) - continue; - (void) spa_add(nvpair_name(nvpair), child, NULL); - } - mutex_exit(&spa_namespace_lock); - - nvlist_free(nvlist); - -out: - if (buf != NULL) - kmem_free(buf, fsize); - - zfs_file_close(fp); -} static int spa_config_remove(spa_config_dirent_t *dp) @@ -623,7 +534,6 @@ spa_config_update(spa_t *spa, int what) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } -EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); @@ -634,8 +544,3 @@ EXPORT_SYMBOL(spa_config_update); ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, "SPA config file (/etc/zfs/zpool.cache)"); #endif - -#ifdef _KERNEL -ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, - "Disable pool import at module load"); -#endif diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c index 3e08f261fda1..7252fd534bdf 100644 --- a/sys/contrib/openzfs/module/zfs/spa_errlog.c +++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c @@ -253,7 +253,7 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, if (error == 0 && BP_IS_HOLE(&bp)) error = SET_ERROR(ENOENT); - *birth_txg = BP_GET_LOGICAL_BIRTH(&bp); + *birth_txg = BP_GET_PHYSICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); @@ -885,7 +885,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, if (error == EACCES) error = 0; else if (!error) - zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp); + zep.zb_birth = BP_GET_PHYSICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index f054e4290bbf..6f7c060f97f8 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -251,11 +251,11 @@ spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* - * Everything except dprintf, set_error, spa, and indirect_remap is on - * by default in debug builds. + * Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct + * is on by default in debug builds. 
*/ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | - ZFS_DEBUG_INDIRECT_REMAP); + ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT); #else int zfs_flags = 0; #endif @@ -471,9 +471,9 @@ spa_config_lock_destroy(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); - ASSERT(scl->scl_writer == NULL); - ASSERT(scl->scl_write_wanted == 0); - ASSERT(scl->scl_count == 0); + ASSERT0P(scl->scl_writer); + ASSERT0(scl->scl_write_wanted); + ASSERT0(scl->scl_count); } } @@ -715,6 +715,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_txg_log_time_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -783,24 +784,23 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); - VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, - KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP)); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { - VERIFY(nvlist_dup(features, &spa->spa_label_features, - 0) == 0); + VERIFY0(nvlist_dup(features, + &spa->spa_label_features, 0)); } - VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + VERIFY0(nvlist_dup(config, &spa->spa_config, 0)); } if (spa->spa_label_features == NULL) { - VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, - KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, + KM_SLEEP)); } spa->spa_min_ashift = INT_MAX; @@ -903,6 +903,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); + mutex_destroy(&spa->spa_txg_log_time_lock); kmem_free(spa, sizeof (spa_t)); } @@ -1308,6 +1309,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, metaslab_class_validate(spa_log_class(spa)); metaslab_class_validate(spa_embedded_log_class(spa)); metaslab_class_validate(spa_special_class(spa)); + metaslab_class_validate(spa_special_embedded_log_class(spa)); metaslab_class_validate(spa_dedup_class(spa)); spa_config_exit(spa, SCL_ALL, spa); @@ -1896,6 +1898,8 @@ spa_get_slop_space(spa_t *spa) */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); + embedded_log += metaslab_class_get_dspace( + spa_special_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* @@ -2001,6 +2005,12 @@ spa_special_class(spa_t *spa) } metaslab_class_t * +spa_special_embedded_log_class(spa_t *spa) +{ + return (spa->spa_special_embedded_log_class); +} + +metaslab_class_t * spa_dedup_class(spa_t *spa) { return (spa->spa_dedup_class); @@ -2538,13 +2548,6 @@ spa_name_compare(const void *a1, const void *a2) } void -spa_boot_init(void *unused) -{ - (void) unused; - spa_config_load(); -} - -void spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); @@ -2597,7 +2600,6 @@ spa_init(spa_mode_t mode) chksum_init(); zpool_prop_init(); zpool_feature_init(); - spa_config_load(); vdev_prop_init(); l2arc_start(); scan_init(); diff --git 
a/sys/contrib/openzfs/module/zfs/spa_stats.c b/sys/contrib/openzfs/module/zfs/spa_stats.c index 6d7cabcf766d..2c87122a0aa9 100644 --- a/sys/contrib/openzfs/module/zfs/spa_stats.c +++ b/sys/contrib/openzfs/module/zfs/spa_stats.c @@ -718,7 +718,7 @@ spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; smh = list_prev(&shl->procfs_list.pl_list, smh)) { if (smh->mmp_node_id == mmp_node_id) { - ASSERT(smh->io_error == 0); + ASSERT0(smh->io_error); smh->io_error = io_error; smh->duration = duration; error = 0; diff --git a/sys/contrib/openzfs/module/zfs/space_map.c b/sys/contrib/openzfs/module/zfs/space_map.c index c429e0edd168..5f24963f2291 100644 --- a/sys/contrib/openzfs/module/zfs/space_map.c +++ b/sys/contrib/openzfs/module/zfs/space_map.c @@ -817,7 +817,7 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object, space_map_t *sm; int error; - ASSERT(*smp == NULL); + ASSERT0P(*smp); ASSERT(os != NULL); ASSERT(object != 0); diff --git a/sys/contrib/openzfs/module/zfs/space_reftree.c b/sys/contrib/openzfs/module/zfs/space_reftree.c index 9b2d5ed31dc9..889980e08c06 100644 --- a/sys/contrib/openzfs/module/zfs/space_reftree.c +++ b/sys/contrib/openzfs/module/zfs/space_reftree.c @@ -149,6 +149,6 @@ space_reftree_generate_map(avl_tree_t *t, zfs_range_tree_t *rt, int64_t minref) } } } - ASSERT(refcnt == 0); + ASSERT0(refcnt); ASSERT(start == -1ULL); } diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 01758b0c54c0..fc6d445f9785 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2025, Klara, Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ @@ -100,7 +100,7 @@ static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ static uint_t zfs_vdev_max_ms_shift = 34; -int vdev_validate_skip = B_FALSE; +static int vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of @@ -243,6 +243,25 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } +char * +vdev_rt_name(vdev_t *vd, const char *name) +{ + return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}", + spa_name(vd->vdev_spa), + (u_longlong_t)vd->vdev_guid, + name)); +} + +static char * +vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type) +{ + return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}", + spa_name(vd->vdev_spa), + (u_longlong_t)vd->vdev_guid, + name, + dtl_type)); +} + /* * Virtual device management. */ @@ -282,12 +301,15 @@ vdev_getops(const char *type) * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a - * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. + * separate log group. For embedded slogs, vdev_log_mg will be non-NULL and + * will point to a metaslab group of either embedded_log_class (for normal + * vdevs) or special_embedded_log_class (for special vdevs). 
*/ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { - if (mc == spa_embedded_log_class(vd->vdev_spa) && + if ((mc == spa_embedded_log_class(vd->vdev_spa) || + mc == spa_special_embedded_log_class(vd->vdev_spa)) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else @@ -532,7 +554,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - ASSERT(cvd->vdev_parent == NULL); + ASSERT0P(cvd->vdev_parent); cvd->vdev_parent = pvd; @@ -556,7 +578,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) pvd->vdev_nonrot &= cvd->vdev_nonrot; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); - ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); + ASSERT0P(cvd->vdev_top->vdev_parent->vdev_parent); /* * Walk up all ancestors to update guid sum. @@ -692,8 +714,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + vd->vdev_obsolete_segments = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments")); /* * Initialize rate limit structs for events. We rate limit ZIO delay @@ -747,8 +770,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + vd->vdev_dtl[t] = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t)); } txg_list_create(&vd->vdev_ms_list, spa, @@ -1062,6 +1086,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } } + if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) + vd->vdev_autosit = + vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); + /* * Add ourselves to the parent's list of children. */ @@ -1077,10 +1105,10 @@ vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - ASSERT3P(vd->vdev_trim_thread, ==, NULL); - ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); - ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + ASSERT0P(vd->vdev_initialize_thread); + ASSERT0P(vd->vdev_trim_thread); + ASSERT0P(vd->vdev_autotrim_thread); + ASSERT0P(vd->vdev_rebuild_thread); /* * Scan queues are normally destroyed at the end of a scan. 
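The dynamic range-tree names built by vdev_rt_name() and vdev_rt_name_dtl() above expand to strings like the following (pool name and guid invented), which then appear in the rt=%s diagnostics added to range_tree.c:

/*
 *   vdev_obsolete_segments tree : "{spa=tank vdev_guid=123456789 vdev_obsolete_segments}"
 *   first DTL tree (t == 0)     : "{spa=tank vdev_guid=123456789 vdev_dtl[0]}"
 */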
If the @@ -1109,7 +1137,7 @@ vdev_free(vdev_t *vd) for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); - ASSERT(vd->vdev_child == NULL); + ASSERT0P(vd->vdev_child); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) @@ -1138,7 +1166,7 @@ vdev_free(vdev_t *vd) */ vdev_remove_child(vd->vdev_parent, vd); - ASSERT(vd->vdev_parent == NULL); + ASSERT0P(vd->vdev_parent); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* @@ -1163,6 +1191,9 @@ vdev_free(vdev_t *vd) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_prev_histo) + kmem_free(vd->vdev_prev_histo, + sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -1285,9 +1316,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); - ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); - ASSERT3P(tvd->vdev_indirect_births, ==, NULL); - ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0P(tvd->vdev_indirect_mapping); + ASSERT0P(tvd->vdev_indirect_births); + ASSERT0P(tvd->vdev_obsolete_sm); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); @@ -1440,7 +1471,7 @@ vdev_remove_parent(vdev_t *cvd) if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); - ASSERT(mvd->vdev_children == 0); + ASSERT0(mvd->vdev_children); vdev_free(mvd); } @@ -1508,8 +1539,13 @@ vdev_metaslab_group_create(vdev_t *vd) vd->vdev_mg = metaslab_group_create(mc, vd); if (!vd->vdev_islog) { - vd->vdev_log_mg = metaslab_group_create( - spa_embedded_log_class(spa), vd); + if (mc == spa_special_class(spa)) { + vd->vdev_log_mg = metaslab_group_create( + spa_special_embedded_log_class(spa), vd); + } else { + vd->vdev_log_mg = metaslab_group_create( + spa_embedded_log_class(spa), vd); + } } /* @@ -1624,9 +1660,10 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab - * group. + * group. This works for normal and special vdevs. */ - if (vd->vdev_mg->mg_class == spa_normal_class(spa) && + if ((vd->vdev_mg->mg_class == spa_normal_class(spa) || + vd->vdev_mg->mg_class == spa_special_class(spa)) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; @@ -2104,14 +2141,14 @@ vdev_open(vdev_t *vd) * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { - ASSERT(vd->vdev_children == 0); + ASSERT0(vd->vdev_children); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { - ASSERT(vd->vdev_children == 0); + ASSERT0(vd->vdev_children); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } @@ -2167,7 +2204,7 @@ vdev_open(vdev_t *vd) * the vdev is accessible. If we're faulted, bail. 
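A hedged summary of the metaslab-group wiring that the special_embedded_log changes above establish; the dedicated-slog row restates the vdev_get_mg() comment, the others follow from vdev_metaslab_group_create().

/*
 *   top-level vdev kind   vd->vdev_mg class   vd->vdev_log_mg class
 *   -------------------   -----------------   -----------------------------
 *   normal                normal              embedded_log
 *   special               special             special_embedded_log
 *   dedup                 dedup               embedded_log
 *   dedicated slog        log                 (none; primary group only)
 */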
*/ if (vd->vdev_faulted) { - ASSERT(vd->vdev_children == 0); + ASSERT0(vd->vdev_children); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, @@ -2176,7 +2213,7 @@ vdev_open(vdev_t *vd) } if (vd->vdev_degraded) { - ASSERT(vd->vdev_children == 0); + ASSERT0(vd->vdev_children); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { @@ -3449,7 +3486,9 @@ vdev_dtl_load(vdev_t *vd) return (error); ASSERT(vd->vdev_dtl_sm != NULL); - rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + rt = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt")); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); @@ -3597,7 +3636,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_dtl_sm != NULL); } - rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync")); mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); @@ -3824,6 +3864,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t autosit; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit), + 1, &autosit); + if (error == 0) { + vd->vdev_autosit = autosit == 1; + } else if (error == ENOENT) { + vd->vdev_autosit = vdev_prop_default_numeric( + VDEV_PROP_AUTOSIT); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. */ @@ -3912,7 +3972,7 @@ vdev_load(vdev_t *vd) if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); - ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); + ASSERT0P(vd->vdev_checkpoint_sm); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, @@ -3960,7 +4020,7 @@ vdev_load(vdev_t *vd) if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); - ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); + ASSERT0P(vd->vdev_obsolete_sm); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { @@ -4488,7 +4548,7 @@ top: /* * Prevent any future allocations. 
*/ - ASSERT3P(tvd->vdev_log_mg, ==, NULL); + ASSERT0P(tvd->vdev_log_mg); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); @@ -4583,6 +4643,8 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; + atomic_store_64(&vd->vdev_outlier_count, 0); + vd->vdev_read_sit_out_expire = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -5161,7 +5223,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { - ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT0((space & (SPA_MINBLOCKSIZE-1))); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); @@ -5253,8 +5315,8 @@ vdev_config_dirty(vdev_t *vd) if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); + VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_SPARES, &aux, &naux)); } ASSERT(c < naux); @@ -5642,7 +5704,7 @@ vdev_expand(vdev_t *vd, uint64_t txg) (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); - VERIFY(vdev_metaslab_init(vd, txg) == 0); + VERIFY0(vdev_metaslab_init(vd, txg)); vdev_config_dirty(vd); } } @@ -6074,6 +6136,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_failfast = intval & 1; break; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (!vd->vdev_ops->vdev_op_leaf || + vd->vdev_top == NULL || + (vd->vdev_top->vdev_ops != &vdev_raidz_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops)) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (intval == 1) { + vdev_t *ancestor = vd; + while (ancestor->vdev_parent != vd->vdev_top) + ancestor = ancestor->vdev_parent; + vdev_t *pvd = vd->vdev_top; + uint_t sitouts = 0; + for (int i = 0; i < pvd->vdev_children; i++) { + if (pvd->vdev_child[i] == ancestor) + continue; + if (vdev_sit_out_reads( + pvd->vdev_child[i], 0)) { + sitouts++; + } + } + if (sitouts >= vdev_get_nparity(pvd)) { + error = ZFS_ERR_TOO_MANY_SITOUTS; + break; + } + if (error == 0) + vdev_raidz_sit_child(vd, + INT64_MAX - gethrestime_sec()); + } else { + vdev_raidz_unsit_child(vd); + } + break; + case VDEV_PROP_AUTOSIT: + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_autosit = intval == 1; + break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; @@ -6423,6 +6535,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_top != NULL && + (vd->vdev_top->vdev_ops == + &vdev_raidz_ops || + vd->vdev_top->vdev_ops == + &vdev_draid_ops)) { + vdev_prop_add_list(outnvl, propname, + NULL, vdev_sit_out_reads(vd, 0), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { @@ -6473,6 +6598,29 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, strval, intval, src); 
break; + case VDEV_PROP_AUTOSIT: + /* Only raidz vdevs cannot have this property */ + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + src = ZPROP_SRC_NONE; + intval = ZPROP_BOOLEAN_NA; + } else { + err = vdev_prop_get_int(vd, prop, + &intval); + if (err && err != ENOENT) + break; + + if (intval == + vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + } + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index feec5fd3ce17..8588cfee3f7d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -477,7 +478,7 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN); VERIFY3U(map->dm_seed, !=, 0); VERIFY3U(map->dm_nperms, !=, 0); - VERIFY3P(map->dm_perms, ==, NULL); + VERIFY0P(map->dm_perms); #ifdef _KERNEL /* @@ -590,7 +591,7 @@ vdev_draid_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift; ASSERT3U(asize, !=, 0); - ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0); + ASSERT0(asize % (vdc->vdc_groupwidth)); return (asize); } @@ -704,7 +705,7 @@ vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) uint64_t skip_off = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - ASSERT3P(rr->rr_abd_empty, ==, NULL); + ASSERT0P(rr->rr_abd_empty); if (rr->rr_nempty > 0) { rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, @@ -793,7 +794,7 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) uint64_t skip_off = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - ASSERT3P(rr->rr_abd_empty, ==, NULL); + ASSERT0P(rr->rr_abd_empty); if (rr->rr_nempty > 0) { rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, @@ -807,7 +808,7 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) /* empty data column (small read), add a skip sector */ ASSERT3U(skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); - ASSERT3P(rc->rc_abd, ==, NULL); + ASSERT0P(rc->rc_abd); rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, skip_off, skip_size); skip_off += skip_size; @@ -1623,7 +1624,7 @@ vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, SPA_MAXBLOCKSIZE); ASSERT3U(vdev_draid_get_astart(vd, start), ==, start); - ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0); + ASSERT0(asize % (vdc->vdc_groupwidth << ashift)); /* Chunks must evenly span all data columns in the group. 
*/ psize = (((psize >> ashift) / ndata) * ndata) << ashift; @@ -1634,7 +1635,7 @@ vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start; chunk_size = MIN(chunk_size, left); - ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0); + ASSERT0(chunk_size % (vdc->vdc_groupwidth << ashift)); ASSERT3U(vdev_draid_offset_to_group(vd, start), ==, vdev_draid_offset_to_group(vd, start + chunk_size - 1)); @@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_allow_repair = 1; } } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } } /* @@ -2272,7 +2300,7 @@ vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks); ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT); ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT); - ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0); + ASSERT0(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT); ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) % vdc->vdc_ndisks, ==, 0); diff --git a/sys/contrib/openzfs/module/zfs/vdev_file.c b/sys/contrib/openzfs/module/zfs/vdev_file.c index f457669bc809..20b4db65ec06 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/zfs/vdev_file.c @@ -228,7 +228,8 @@ vdev_file_io_strategy(void *arg) abd_return_buf_copy(zio->io_abd, buf, size); } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, + vd->vdev_ashift, &resid); abd_return_buf(zio->io_abd, buf, size); } zio->io_error = err; diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index fac2c3a5f154..7538f471e63c 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -792,7 +792,7 @@ spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (*scip) / sizeof (uint64_t), scip, tx)); - ASSERT3P(spa->spa_condensing_indirect, ==, NULL); + ASSERT0P(spa->spa_condensing_indirect); spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " @@ -882,7 +882,7 @@ spa_condense_fini(spa_t *spa) void spa_start_indirect_condensing_thread(spa_t *spa) { - ASSERT3P(spa->spa_condense_zthr, ==, NULL); + ASSERT0P(spa->spa_condense_zthr); spa->spa_condense_zthr = zthr_create("z_indirect_condense", spa_condense_indirect_thread_check, spa_condense_indirect_thread, spa, minclsyspri); @@ -1504,7 +1504,7 @@ vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) is != NULL; is = list_next(&iv->iv_splits, is)) { ASSERT3P(is->is_good_child->ic_data, !=, NULL); - 
ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); + ASSERT0P(is->is_good_child->ic_duplicate); abd_copy_off(zio->io_abd, is->is_good_child->ic_data, is->is_split_offset, 0, is->is_size); @@ -1842,7 +1842,7 @@ vdev_indirect_io_done(zio_t *zio) */ if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { zio->io_error = ret; - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); ret = 0; } diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index 4274728578ad..27188c46e561 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -541,8 +541,9 @@ vdev_initialize_thread(void *arg) abd_t *deadbeef = vdev_initialize_block_alloc(); - vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + vd->vdev_initialize_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_initialize_tree")); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { @@ -631,7 +632,7 @@ vdev_initialize(vdev_t *vd) ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT0P(vd->vdev_initialize_thread); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); @@ -652,7 +653,7 @@ vdev_uninitialize(vdev_t *vd) ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT0P(vd->vdev_initialize_thread); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); @@ -671,7 +672,7 @@ vdev_initialize_stop_wait_impl(vdev_t *vd) while (vd->vdev_initialize_thread != NULL) cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT0P(vd->vdev_initialize_thread); vd->vdev_initialize_exit_wanted = B_FALSE; } diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index 6baa6236aac2..c44f654b0261 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -163,7 +163,7 @@ uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset) { ASSERT(offset < sizeof (vdev_label_t)); - ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0); + ASSERT0(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t)); return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); @@ -768,12 +768,12 @@ vdev_top_config_generate(spa_t *spa, nvlist_t *config) } if (idx) { - VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, - array, idx) == 0); + VERIFY0(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_HOLE_ARRAY, array, idx)); } - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, - rvd->vdev_children) == 0); + VERIFY0(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + rvd->vdev_children)); kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } @@ -1189,8 +1189,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * vdev uses as described above, and automatically expires if we * fail. 
*/ - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, - crtxg) == 0); + VERIFY0(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + crtxg)); } buf = vp->vp_nvlist; diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c index a6aee9437066..18efdaac006f 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c +++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c @@ -532,7 +532,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int c, lowest_load; - ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg); + ASSERT(zio->io_bp == NULL || BP_GET_PHYSICAL_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; @@ -779,7 +779,7 @@ vdev_mirror_io_done(zio_t *zio) * being written out during self healing. */ if ((zio->io_flags & ZIO_FLAG_DIO_READ) && - (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { + (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) { zio_dio_chksum_verify_error_report(zio); zio->io_error = vdev_mirror_worst_error(mm); ASSERT3U(zio->io_error, ==, ECKSUM); diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index aa41f7066036..e69e5598939e 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -122,7 +122,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -uint_t zfs_vdev_max_active = 1000; +static uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the @@ -780,7 +780,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (dio->io_flags & ZIO_FLAG_NODATA) { /* allocate a buffer for a write gap */ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - ASSERT3P(dio->io_abd, ==, NULL); + ASSERT0P(dio->io_abd); abd_gang_add(aio->io_abd, abd_get_zeros(dio->io_size), B_TRUE); } else { diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 71c4bfbdaf00..56b8e3b60b22 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -24,6 +24,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -356,6 +357,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0; uint_t raidz_expand_pause_point = 0; /* + * This represents the duration for a slow drive read sit out. + */ +static unsigned long vdev_read_sit_out_secs = 600; + +/* + * How often each RAID-Z and dRAID vdev will check for slow disk outliers. + * Increasing this interval will reduce the sensitivity of detection (since all + * I/Os since the last check are included in the statistics), but will slow the + * response to a disk developing a problem. + * + * Defaults to once per second; setting extremely small values may cause + * negative performance effects. + */ +static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000; + +/* + * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is + * used to determine how far out an outlier must be before it counts as an event + * worth consdering. + * + * Smaller values will result in more aggressive sitting out of disks that may + * have problems, but may significantly increase the rate of spurious sit-outs. 
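 *
 * As a rough illustration, with example numbers only: if the healthy
 * children report average read latencies around Q1 = 2ms and Q3 = 4ms,
 * the interquartile range used by the fence is MAX(4ms - 2ms, 2ms / 4)
 * = 2ms, so the default insensitivity of 50 places the outlier fence at
 * 4ms + 50 * 2ms = 104ms. Only a child whose recent average latency
 * exceeds that fence is counted toward its outlier limit.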
+ */ +static uint32_t vdev_raidz_outlier_insensitivity = 50; + +/* * Maximum amount of copy io's outstanding at once. */ #ifdef _ILP32 @@ -412,7 +439,7 @@ vdev_raidz_map_free(raidz_map_t *rm) rm->rm_nphys_cols); } - ASSERT3P(rm->rm_lr, ==, NULL); + ASSERT0P(rm->rm_lr); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } @@ -2206,11 +2233,7 @@ vdev_raidz_close(vdev_t *vd) /* * Return the logical width to use, given the txg in which the allocation - * happened. Note that BP_GET_BIRTH() is usually the txg in which the - * BP was allocated. Remapped BP's (that were relocated due to device - * removal, see remap_blkptr_cb()), will have a more recent physical birth - * which reflects when the BP was relocated, but we can ignore these because - * they can't be on RAIDZ (device removal doesn't support RAIDZ). + * happened. */ static uint64_t vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) @@ -2249,10 +2272,9 @@ vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg) vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t psize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; - cols = vdev_raidz_get_logical_width(vdrz, txg); + uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg); ASSERT0(asize % (1 << ashift)); @@ -2285,10 +2307,9 @@ vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; - cols = vdev_raidz_get_logical_width(vdrz, txg); + uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg); asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -2317,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd) vd->vdev_children); } +/* + * return B_TRUE if a read should be skipped due to being too slow. + * + * In vdev_child_slow_outlier() it looks for outliers based on disk + * latency from the most recent child reads. Here we're checking if, + * over time, a disk has has been an outlier too many times and is + * now in a sit out period. 
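 *
 * For example, with the default vdev_read_sit_out_secs of 600, a leaf
 * marked by vdev_raidz_sit_child() at time T keeps returning B_TRUE here
 * for non-scrub reads until T + 600 seconds of wall-clock time have
 * elapsed, after which the expiry is cleared and reads are issued to it
 * normally again.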
+ */ +boolean_t +vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) +{ + if (vdev_read_sit_out_secs == 0) + return (B_FALSE); + + /* Avoid skipping a data column read when scrubbing */ + if (io_flags & ZIO_FLAG_SCRUB) + return (B_FALSE); + + if (!vd->vdev_ops->vdev_op_leaf) { + boolean_t sitting = B_FALSE; + for (int c = 0; c < vd->vdev_children; c++) { + sitting |= vdev_sit_out_reads(vd->vdev_child[c], + io_flags); + } + return (sitting); + } + + if (vd->vdev_read_sit_out_expire >= gethrestime_sec()) + return (B_TRUE); + + vd->vdev_read_sit_out_expire = 0; + + return (B_FALSE); +} + void vdev_raidz_child_done(zio_t *zio) { @@ -2345,7 +2401,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size, - BP_GET_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; @@ -2437,7 +2493,7 @@ raidz_start_skip_writes(zio_t *zio) vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (rc->rc_size != 0) continue; - ASSERT3P(rc->rc_abd, ==, NULL); + ASSERT0P(rc->rc_abd); ASSERT3U(rc->rc_offset, <, cvd->vdev_psize - VDEV_LABEL_END_SIZE); @@ -2481,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { @@ -2504,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ @@ -2568,7 +2664,7 @@ vdev_raidz_io_start(zio_t *zio) raidz_map_t *rm; uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, - BP_GET_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); if (logical_width != vdrz->vd_physical_width) { zfs_locked_range_t *lr = NULL; uint64_t synced_offset = UINT64_MAX; @@ -2691,7 +2787,7 @@ raidz_checksum_verify(zio_t *zio) */ if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { zio->io_error = ret; - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); zio_checksum_verified(zio); return (0); @@ -2780,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr) return (error); } +/* + * Find the median value from a set of n values + */ +static uint64_t 
+latency_median_value(const uint64_t *data, size_t n) +{ + uint64_t m; + + if (n % 2 == 0) + m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1; + else + m = data[((n + 1) >> 1) - 1]; + + return (m); +} + +/* + * Calculate the outlier fence from a set of n latency values + * + * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1) + */ +static uint64_t +latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr) +{ + uint64_t q1 = latency_median_value(&data[0], n >> 1); + uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1); + + /* + * To avoid detecting false positive outliers when N is small and + * and the latencies values are very close, make sure the IQR + * is at least 25% larger than Q1. + */ + *iqr = MAX(q3 - q1, q1 / 4); + + return (q3 + (*iqr * vdev_raidz_outlier_insensitivity)); +} +#define LAT_CHILDREN_MIN 5 +#define LAT_OUTLIER_LIMIT 20 + +static int +latency_compare(const void *arg1, const void *arg2) +{ + const uint64_t *l1 = (uint64_t *)arg1; + const uint64_t *l2 = (uint64_t *)arg2; + + return (TREE_CMP(*l1, *l2)); +} + +void +vdev_raidz_sit_child(vdev_t *svd, uint64_t secs) +{ + for (int c = 0; c < svd->vdev_children; c++) + vdev_raidz_sit_child(svd->vdev_child[c], secs); + + if (!svd->vdev_ops->vdev_op_leaf) + return; + + /* Begin a sit out period for this slow drive */ + svd->vdev_read_sit_out_expire = gethrestime_sec() + + secs; + + /* Count each slow io period */ + mutex_enter(&svd->vdev_stat_lock); + svd->vdev_stat.vs_slow_ios++; + mutex_exit(&svd->vdev_stat_lock); +} + +void +vdev_raidz_unsit_child(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_raidz_unsit_child(vd->vdev_child[c]); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + vd->vdev_read_sit_out_expire = 0; +} + +/* + * Check for any latency outlier from latest set of child reads. + * + * Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This + * rule defines extreme outliers as data points outside the fence of the + * third quartile plus fifty times the Interquartile Range (IQR). This range + * is the distance between the first and third quartile. + * + * Fifty is an extremely large value for Tukey's fence, but the outliers we're + * attempting to detect here are orders of magnitude times larger than the + * median. This large value should capture any truly fault disk quickly, + * without causing spurious sit-outs. + * + * To further avoid spurious sit-outs, vdevs must be detected multiple times + * as an outlier before they are sat, and outlier counts will gradually decay. + * Every nchildren times we have detected an outlier, we subtract 2 from the + * outlier count of all children. If detected outliers are close to uniformly + * distributed, this will result in the outlier count remaining close to 0 + * (in expectation; over long enough time-scales, spurious sit-outs are still + * possible). + */ +static void +vdev_child_slow_outlier(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 || + vd->vdev_children < LAT_CHILDREN_MIN) + return; + + hrtime_t now = getlrtime(); + uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); + + if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms)) + return; + + /* Allow a single winner when there are racing callers. 
*/ + if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) + return; + + int children = vd->vdev_children; + uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP); + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if (cvd->vdev_prev_histo == NULL) { + mutex_enter(&cvd->vdev_stat_lock); + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP); + memcpy(cvd->vdev_prev_histo, + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], + size); + mutex_exit(&cvd->vdev_stat_lock); + } + } + uint64_t max = 0; + vdev_t *svd = NULL; + uint_t sitouts = 0; + boolean_t skip = B_FALSE, svd_sitting = B_FALSE; + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + boolean_t sitting = vdev_sit_out_reads(cvd, 0) || + cvd->vdev_state != VDEV_STATE_HEALTHY; + + /* We can't sit out more disks than we have parity */ + if (sitting && ++sitouts >= vdev_get_nparity(vd)) + skip = B_TRUE; + + mutex_enter(&cvd->vdev_stat_lock); + + uint64_t *prev_histo = cvd->vdev_prev_histo; + uint64_t *histo = + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ]; + if (skip) { + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + continue; + } + uint64_t count = 0; + lat_data[c] = 0; + for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { + uint64_t this_count = histo[i] - prev_histo[i]; + lat_data[c] += (1ULL << i) * this_count; + count += this_count; + } + size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + lat_data[c] /= MAX(1, count); + + /* Wait until all disks have been read from */ + if (lat_data[c] == 0 && !sitting) { + skip = B_TRUE; + continue; + } + + /* Keep track of the vdev with largest value */ + if (lat_data[c] > max) { + max = lat_data[c]; + svd = cvd; + svd_sitting = sitting; + } + } + + if (skip) { + kmem_free(lat_data, sizeof (uint64_t) * children); + return; + } + + qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare); + + uint64_t iqr; + uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr); + + ASSERT3U(lat_data[children - 1], ==, max); + if (max > fence && !svd_sitting) { + ASSERT3U(iqr, >, 0); + uint64_t incr = MAX(1, MIN((max - fence) / iqr, + LAT_OUTLIER_LIMIT / 4)); + vd->vdev_outlier_count += incr; + if (vd->vdev_outlier_count >= children) { + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + cvd->vdev_outlier_count -= 2; + cvd->vdev_outlier_count = MAX(0, + cvd->vdev_outlier_count); + } + vd->vdev_outlier_count = 0; + } + /* + * Keep track of how many times this child has had + * an outlier read. A disk that persitently has a + * higher than peers outlier count will be considered + * a slow disk. 
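			 *
			 * As a rough worked example, assuming incr stays at 1
			 * and the vdev has 10 children: a child that is the
			 * outlier on every check gains 1 per detection and
			 * loses 2 for every 10 detections, so it crosses
			 * LAT_OUTLIER_LIMIT (20) after about 25 detections,
			 * roughly 25 seconds at the default 1000ms check
			 * interval. Children whose detections are spread
			 * evenly lose more than they gain and stay near 0.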
+ */ + svd->vdev_outlier_count += incr; + if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { + ASSERT0(svd->vdev_read_sit_out_expire); + vdev_raidz_sit_child(svd, vdev_read_sit_out_secs); + (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT, + zio->io_spa, svd, NULL, NULL, 0); + vdev_dbgmsg(svd, "begin read sit out for %d secs", + (int)vdev_read_sit_out_secs); + + for (int c = 0; c < vd->vdev_children; c++) + vd->vdev_child[c]->vdev_outlier_count = 0; + } + } + + kmem_free(lat_data, sizeof (uint64_t) * children); +} + static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { @@ -3048,7 +3377,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) /* Check for success */ if (raidz_checksum_verify(zio) == 0) { - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) return (0); /* Reconstruction succeeded - report errors */ @@ -3369,7 +3698,7 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, * also have been fewer parity errors than parity * columns or, again, we wouldn't be in this code path. */ - ASSERT(parity_untried == 0); + ASSERT0(parity_untried); ASSERT(parity_errors < rr->rr_firstdatacol); /* @@ -3514,13 +3843,16 @@ vdev_raidz_io_done(zio_t *zio) } if (raidz_checksum_verify(zio) == 0) { - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) goto done; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } + /* Periodically check for a read outlier */ + if (zio->io_type == ZIO_TYPE_READ) + vdev_child_slow_outlier(zio); zio_checksum_verified(zio); } else { /* @@ -4591,8 +4923,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( raidvd, msp, &start, &shift); - zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL, - start, shift); + zfs_range_tree_t *rt = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME, + metaslab_rt_name(msp->ms_group, msp, + "spa_raidz_expand_thread:rt")); zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, rt); @@ -4747,7 +5081,7 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) void spa_start_raidz_expansion_thread(spa_t *spa) { - ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); + ASSERT0P(spa->spa_raidz_expand_zthr); spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", spa_raidz_expand_thread_check, spa_raidz_expand_thread, spa, defclsyspri); @@ -5159,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW, + "Raidz/draid slow disk sit out time period in seconds"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64, + ZMOD_RW, "Interval to check for slow raidz/draid children"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT, + ZMOD_RW, "How insensitive the slow raidz/draid child check should be"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c index 0e296606d037..47b3b9921abe 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c +++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c @@ -256,7 +256,7 @@ 
vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) "vdev_id=%llu vdev_guid=%llu started", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); - ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + ASSERT0P(vd->vdev_rebuild_thread); vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); @@ -413,7 +413,7 @@ vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) mutex_enter(&vd->vdev_rebuild_lock); ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); - ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + ASSERT0P(vd->vdev_rebuild_thread); vrp->vrp_last_offset = 0; vrp->vrp_min_txg = 0; @@ -787,8 +787,9 @@ vdev_rebuild_thread(void *arg) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; - vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, - 0, 0); + vr->vr_scan_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vr_scan_tree")); mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index db79ded6dce4..2ce0121324ad 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -105,7 +105,7 @@ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * * See also the accessor function spa_remove_max_segment(). */ -uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +static uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device @@ -137,7 +137,7 @@ uint_t vdev_removal_max_span = 32 * 1024; * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. 
*/ -int zfs_removal_suspend_progress = 0; +static int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" @@ -344,10 +344,10 @@ spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev, for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; - VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); + VERIFY0(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP)); } - VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY0(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY)); fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev, count - 1); @@ -364,13 +364,15 @@ spa_vdev_removal_create(vdev_t *vd) spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + svr->svr_allocd_segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_allocd_segs")); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + svr->svr_frees[i] = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_frees")); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); @@ -421,7 +423,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); - ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); + ASSERT0P(vd->vdev_indirect_mapping); spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { @@ -527,7 +529,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) * but in any case only when there are outstanding free i/os, which * there are not). */ - ASSERT3P(spa->spa_vdev_removal, ==, NULL); + ASSERT0P(spa->spa_vdev_removal); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); @@ -1179,8 +1181,9 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs, * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). */ - zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + zfs_range_tree_t *obsolete_segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "obsolete_segs")); zfs_btree_index_t where; zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); @@ -1359,11 +1362,11 @@ vdev_remove_complete(spa_t *spa) txg_wait_synced(spa->spa_dsl_pool, 0); txg = spa_vdev_enter(spa); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - ASSERT3P(vd->vdev_initialize_thread, ==, NULL); - ASSERT3P(vd->vdev_trim_thread, ==, NULL); - ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + ASSERT0P(vd->vdev_initialize_thread); + ASSERT0P(vd->vdev_trim_thread); + ASSERT0P(vd->vdev_autotrim_thread); vdev_rebuild_stop_wait(vd); - ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + ASSERT0P(vd->vdev_rebuild_thread); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); @@ -1448,8 +1451,9 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, * allocated segments that we are copying. 
We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_copy_impl:segs")); for (;;) { zfs_range_tree_t *rt = svr->svr_allocd_segs; zfs_range_seg_t *rs = zfs_range_tree_first(rt); @@ -1610,8 +1614,9 @@ spa_vdev_remove_thread(void *arg) vca.vca_read_error_bytes = 0; vca.vca_write_error_bytes = 0; - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_thread:segs")); mutex_enter(&svr->svr_lock); @@ -1863,7 +1868,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; - ASSERT3P(svr->svr_thread, ==, NULL); + ASSERT0P(svr->svr_thread); spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); @@ -1895,8 +1900,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) vdev_indirect_mapping_max_offset(vim)); } - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "spa_vdev_remove_cancel_sync:segs")); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; @@ -2070,7 +2076,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); - ASSERT3P(vd->vdev_log_mg, ==, NULL); + ASSERT0P(vd->vdev_log_mg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* @@ -2106,7 +2112,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); - ASSERT3P(vd->vdev_log_mg, ==, NULL); + ASSERT0P(vd->vdev_log_mg); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index 842bb3e690d4..eee18b367909 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -902,7 +902,9 @@ vdev_trim_thread(void *arg) ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; @@ -1008,7 +1010,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); - ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT0P(vd->vdev_trim_thread); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); @@ -1030,7 +1032,7 @@ vdev_trim_stop_wait_impl(vdev_t *vd) while (vd->vdev_trim_thread != NULL) cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock); - ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT0P(vd->vdev_trim_thread); vd->vdev_trim_exit_wanted = B_FALSE; } @@ -1305,8 +1307,10 @@ vdev_autotrim_thread(void *arg) * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. 
*/ - trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "autotrim_tree")); zfs_range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(zfs_range_tree_is_empty(msp->ms_trim)); @@ -1360,8 +1364,10 @@ vdev_autotrim_thread(void *arg) if (!cvd->vdev_ops->vdev_op_leaf) continue; - ta->trim_tree = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + ta->trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "autotrim_tree")); zfs_range_tree_walk(trim_tree, vdev_trim_range_add, ta); } @@ -1533,7 +1539,7 @@ vdev_autotrim_stop_wait(vdev_t *tvd) cv_wait(&tvd->vdev_autotrim_cv, &tvd->vdev_autotrim_lock); - ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); + ASSERT0P(tvd->vdev_autotrim_thread); tvd->vdev_autotrim_exit_wanted = B_FALSE; } mutex_exit(&tvd->vdev_autotrim_lock); @@ -1600,7 +1606,9 @@ vdev_trim_l2arc_thread(void *arg) vd->vdev_trim_secure = 0; ta.trim_vdev = vd; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; @@ -1704,7 +1712,7 @@ vdev_trim_l2arc(spa_t *spa) mutex_enter(&vd->vdev_trim_lock); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); - ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT0P(vd->vdev_trim_thread); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); @@ -1735,7 +1743,9 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_SIMPLE; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c index 9711c91d7e4e..3e4e997798a3 100644 --- a/sys/contrib/openzfs/module/zfs/zap.c +++ b/sys/contrib/openzfs/module/zfs/zap.c @@ -921,7 +921,7 @@ fzap_add_cd(zap_name_t *zn, ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); - ASSERT(fzap_check(zn, integer_size, num_integers) == 0); + ASSERT0(fzap_check(zn, integer_size, num_integers)); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) @@ -1304,7 +1304,7 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) { - int err = ENOENT; + int err; zap_entry_handle_t zeh; zap_leaf_t *l; @@ -1386,7 +1386,7 @@ again: } err = zap_entry_read_name(zap, &zeh, za->za_name_len, za->za_name); - ASSERT(err == 0); + ASSERT0(err); za->za_normalization_conflict = zap_entry_normalization_conflict(&zeh, @@ -1546,7 +1546,7 @@ zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) boolean_t trunc = B_FALSE; int err = 0; - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); 
ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix); @@ -1564,7 +1564,7 @@ zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len); int slbit = prefix & 1; - ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0); + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_nentries); /* * Check if there is a sibling by reading ptrtbl ptrs. diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c index 411b1a9db5ab..ea4e3117a8b9 100644 --- a/sys/contrib/openzfs/module/zfs/zap_micro.c +++ b/sys/contrib/openzfs/module/zfs/zap_micro.c @@ -346,7 +346,7 @@ zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP); - ASSERT(zap->zap_normflags == 0); + ASSERT0(zap->zap_normflags); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = zn->zn_key_norm = key; @@ -1876,7 +1876,7 @@ zap_cursor_serialize(zap_cursor_t *zc) return (-1ULL); if (zc->zc_zap == NULL) return (zc->zc_serialized); - ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); + ASSERT0((zc->zc_hash & zap_maxcd(zc->zc_zap))); ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); /* @@ -1911,7 +1911,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) * we must add to the existing zc_cd, which may already * be 1 due to the zap_cursor_advance. */ - ASSERT(zc->zc_hash == 0); + ASSERT0(zc->zc_hash); hb = zap_hashbits(zc->zc_zap); zc->zc_hash = zc->zc_serialized << (64 - hb); zc->zc_cd += zc->zc_serialized >> hb; diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c index 6960ea360b15..c6684f453e95 100644 --- a/sys/contrib/openzfs/module/zfs/zcp.c +++ b/sys/contrib/openzfs/module/zfs/zcp.c @@ -765,7 +765,7 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) return (NULL); } (void) memcpy(luabuf, ptr, osize); - VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL); + VERIFY0P(zcp_lua_alloc(ud, ptr, osize, 0)); return (luabuf); } } @@ -1175,7 +1175,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync, for (nvpair_t *pair = nvlist_next_nvpair(runinfo.zri_new_zvols, NULL); pair != NULL; pair = nvlist_next_nvpair(runinfo.zri_new_zvols, pair)) { - zvol_create_minor(nvpair_name(pair)); + zvol_create_minors(nvpair_name(pair)); } fnvlist_free(runinfo.zri_new_zvols); diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 7dfe00d42a08..4cf9e0dbb405 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -210,8 +210,8 @@ spa_features_check(spa_t *spa, boolean_t for_write, za->za_name, 1, MAXPATHLEN, buf) == 0) desc = buf; - VERIFY(nvlist_add_string(unsup_feat, - za->za_name, desc) == 0); + VERIFY0(nvlist_add_string(unsup_feat, + za->za_name, desc)); } } } @@ -308,6 +308,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + ASSERT(MUTEX_HELD(&spa->spa_feat_stats_lock)); VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount, tx)); @@ -360,7 +361,9 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) feature->fi_guid, 1, strlen(feature->fi_desc) + 1, feature->fi_desc, tx)); + mutex_enter(&spa->spa_feat_stats_lock); feature_sync(spa, feature, initial_refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { uint64_t enabling_txg = dmu_tx_get_txg(tx); @@ -416,6 +419,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + mutex_enter(&spa->spa_feat_stats_lock); VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); switch (action) { @@ -433,6 +437,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, } feature_sync(spa, feature, refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); } void diff --git a/sys/contrib/openzfs/module/zfs/zfs_chksum.c b/sys/contrib/openzfs/module/zfs/zfs_chksum.c index 5c92be21c0c8..21852bf3d865 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_chksum.c +++ b/sys/contrib/openzfs/module/zfs/zfs_chksum.c @@ -32,9 +32,6 @@ #include <sys/blake3.h> #include <sys/sha2.h> -/* limit benchmarking to max 256KiB, when EdonR is slower then this: */ -#define LIMIT_PERF_MBS 300 - typedef struct { const char *name; const char *impl; @@ -52,9 +49,15 @@ typedef struct { zio_checksum_tmpl_free_t *(free); } chksum_stat_t; +#define AT_STARTUP 0 +#define AT_BENCHMARK 1 +#define AT_DONE 2 + static chksum_stat_t *chksum_stat_data = 0; -static int chksum_stat_cnt = 0; static kstat_t *chksum_kstat = NULL; +static int chksum_stat_limit = AT_STARTUP; +static int chksum_stat_cnt = 0; +static void chksum_benchmark(void); /* * Sample output on i3-1005G1 System: @@ -129,6 +132,9 @@ chksum_kstat_data(char *buf, size_t size, void *data) static void * chksum_kstat_addr(kstat_t *ksp, loff_t n) { + /* full benchmark */ + chksum_benchmark(); + if (n < chksum_stat_cnt) ksp->ks_private = (void *)(chksum_stat_data + n); else @@ -176,47 +182,36 @@ chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, kpreempt_enable(); run_bw = size * run_count * NANOSEC; - run_bw /= run_time_ns; /* B/s */ + run_bw /= run_time_ns; /* B/s */ *result = run_bw/1024/1024; /* MiB/s */ } -#define LIMIT_INIT 0 -#define LIMIT_NEEDED 1 -#define LIMIT_NOLIMIT 2 - static void chksum_benchit(chksum_stat_t *cs) { abd_t *abd; void *ctx = 0; void *salt = &cs->salt.zcs_bytes; - static int chksum_stat_limit = LIMIT_INIT; memset(salt, 0, sizeof (cs->salt.zcs_bytes)); if (cs->init) ctx = cs->init(&cs->salt); + /* benchmarks in startup mode */ + if (chksum_stat_limit == AT_STARTUP) { + abd = abd_alloc_linear(1<<18, B_FALSE); + chksum_run(cs, abd, ctx, 5, &cs->bs256k); + goto done; + } + /* allocate test memory via abd linear interface */ abd = abd_alloc_linear(1<<20, B_FALSE); + + /* benchmarks when requested */ chksum_run(cs, abd, ctx, 1, &cs->bs1k); chksum_run(cs, abd, ctx, 2, &cs->bs4k); chksum_run(cs, abd, ctx, 3, &cs->bs16k); chksum_run(cs, abd, ctx, 4, &cs->bs64k); - chksum_run(cs, abd, ctx, 5, &cs->bs256k); - - /* check if we ran on a slow cpu */ - if (chksum_stat_limit == LIMIT_INIT) { - if (cs->bs1k < LIMIT_PERF_MBS) { - chksum_stat_limit = LIMIT_NEEDED; - } else { - chksum_stat_limit = LIMIT_NOLIMIT; - } - } - - /* 
skip benchmarks >= 1MiB when the CPU is to slow */ - if (chksum_stat_limit == LIMIT_NEEDED) - goto abort; - chksum_run(cs, abd, ctx, 6, &cs->bs1m); abd_free(abd); @@ -225,7 +220,7 @@ chksum_benchit(chksum_stat_t *cs) chksum_run(cs, abd, ctx, 7, &cs->bs4m); chksum_run(cs, abd, ctx, 8, &cs->bs16m); -abort: +done: abd_free(abd); /* free up temp memory */ @@ -243,7 +238,6 @@ chksum_benchmark(void) /* we need the benchmark only for the kernel module */ return; #endif - chksum_stat_t *cs; uint64_t max; uint32_t id, cbid = 0, id_save; @@ -251,8 +245,14 @@ chksum_benchmark(void) const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); + /* benchmarks are done */ + if (chksum_stat_limit == AT_DONE) + return; + + /* count implementations */ - chksum_stat_cnt = 2; + chksum_stat_cnt = 1; /* edonr */ + chksum_stat_cnt += 1; /* skein */ chksum_stat_cnt += sha256->getcnt(); chksum_stat_cnt += sha512->getcnt(); chksum_stat_cnt += blake3->getcnt(); @@ -332,6 +332,17 @@ chksum_benchmark(void) } } blake3->setid(id_save); + + switch (chksum_stat_limit) { + case AT_STARTUP: + /* next time we want a full benchmark */ + chksum_stat_limit = AT_BENCHMARK; + break; + case AT_BENCHMARK: + /* no further benchmarks */ + chksum_stat_limit = AT_DONE; + break; + } } void @@ -341,7 +352,7 @@ chksum_init(void) blake3_per_cpu_ctx_init(); #endif - /* Benchmark supported implementations */ + /* 256KiB benchmark */ chksum_benchmark(); /* Install kstats for all implementations */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_crrd.c b/sys/contrib/openzfs/module/zfs/zfs_crrd.c new file mode 100644 index 000000000000..30d4c7c36897 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/zfs_crrd.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024 Klara Inc. + * + * This software was developed by + * Mariusz Zaborski <mariusz.zaborski@klarasystems.com> + * Fred Weigel <fred.weigel@klarasystems.com> + * under sponsorship from Wasabi Technology, Inc. and Klara Inc. + */ +/* + * This file implements a round-robin database that stores timestamps and txg + * numbers. Due to limited space, we use a round-robin approach, where + * the oldest records are overwritten when there is no longer enough room. + * This is a best-effort mechanism, and the database should be treated as + * an approximation. Consider this before consuming it. + * + * The database is linear, meaning we assume each new entry is newer than the + * ones already stored. Because of this, if time is manipulated, the database + * will only accept records that are newer than the existing ones. 
+ * (For example, jumping 10 years into the future and then back can lead to + * situation when for 10 years we wont write anything to database) + * + * All times stored in the database use UTC, which makes it easy to convert to + * and from local time. + * + * Each database holds 256 records (as defined in the `RRD_MAX_ENTRIES` macro). + * This limit comes from the maximum size of a ZAP object, where we store the + * binary blob. + * + * We've split the database into three smaller ones. + * The `minute database` provides high resolution (default: every 10 minutes), + * but only covers approximately 1.5 days. This gives a detailed view of recent + * activity, useful, for example, when performing a scrub of the last hour. + * The `daily database` records one txg per day. With 256 entries, it retains + * roughly 8 months of data. This allows users to scrub or analyze txgs across + * a range of days. + * The `monthly database` stores one record per month, giving approximately + * 21 years of history. + * All these calculations assume the worst-case scenario: the pool is always + * online and actively written to. + * + * A potential source of confusion is that the database does not store data + * while the pool is offline, leading to potential gaps in timeline. Also, + * the database contains no records from before this feature was enabled. + * Both, upon reflection, are expected. + */ +#include <sys/zfs_context.h> + +#include "zfs_crrd.h" + +rrd_data_t * +rrd_tail_entry(rrd_t *rrd) +{ + size_t n; + + if (rrd_len(rrd) == 0) + return (NULL); + + if (rrd->rrd_tail == 0) + n = RRD_MAX_ENTRIES - 1; + else + n = rrd->rrd_tail - 1; + + return (&rrd->rrd_entries[n]); +} + +uint64_t +rrd_tail(rrd_t *rrd) +{ + const rrd_data_t *tail; + + tail = rrd_tail_entry(rrd); + + return (tail == NULL ? 0 : tail->rrdd_time); +} + +/* + * Return length of data in the rrd. + * rrd_get works from 0..rrd_len()-1. + */ +size_t +rrd_len(rrd_t *rrd) +{ + + return (rrd->rrd_length); +} + +const rrd_data_t * +rrd_entry(rrd_t *rrd, size_t i) +{ + size_t n; + + if (i >= rrd_len(rrd)) { + return (0); + } + + n = (rrd->rrd_head + i) % RRD_MAX_ENTRIES; + return (&rrd->rrd_entries[n]); +} + +uint64_t +rrd_get(rrd_t *rrd, size_t i) +{ + const rrd_data_t *data = rrd_entry(rrd, i); + + return (data == NULL ? 0 : data->rrdd_txg); +} + +/* Add value to database. */ +void +rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg) +{ + rrd_data_t *tail; + + tail = rrd_tail_entry(rrd); + if (tail != NULL && tail->rrdd_time == time) { + if (tail->rrdd_txg < txg) { + tail->rrdd_txg = txg; + } else { + return; + } + } + + rrd->rrd_entries[rrd->rrd_tail].rrdd_time = time; + rrd->rrd_entries[rrd->rrd_tail].rrdd_txg = txg; + + rrd->rrd_tail = (rrd->rrd_tail + 1) % RRD_MAX_ENTRIES; + + if (rrd->rrd_length < RRD_MAX_ENTRIES) { + rrd->rrd_length++; + } else { + rrd->rrd_head = (rrd->rrd_head + 1) % RRD_MAX_ENTRIES; + } +} + +void +dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg) +{ + hrtime_t daydiff, monthdiff, minutedif; + + minutedif = time - rrd_tail(&db->dbr_minutes); + daydiff = time - rrd_tail(&db->dbr_days); + monthdiff = time - rrd_tail(&db->dbr_months); + + if (monthdiff >= 0 && monthdiff >= 30 * 24 * 60 * 60) + rrd_add(&db->dbr_months, time, txg); + else if (daydiff >= 0 && daydiff >= 24 * 60 * 60) + rrd_add(&db->dbr_days, time, txg); + else if (minutedif >= 0) + rrd_add(&db->dbr_minutes, time, txg); +} + +/* + * We could do a binary search here, but the routine isn't frequently + * called and the data is small so we stick to a simple loop. 
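For orientation, a minimal sketch of how the pieces above fit together (illustrative only, not part of the diff): it assumes timestamps are UTC seconds, matching the 24 * 60 * 60 comparisons in dbrrd_add(), and uses the stock gethrestime_sec() helper; the date-range scrub ioctl added later in this change is the natural consumer of dbrrd_query().

/*
 * Sketch only; presumes the same headers as this file (<sys/zfs_context.h>,
 * "zfs_crrd.h").  Rough retention math, assuming RRD_MAX_ENTRIES == 256:
 *   minute db:  256 entries * 10 min   ~= 42 hours
 *   daily db:   256 entries * 1 day    ~= 8.5 months
 *   monthly db: 256 entries * 1 month  ~= 21 years
 */

/* Producer side: record the currently syncing txg; dbrrd_add() picks the tier. */
static void
txg_log_time_sketch(dbrrd_t *db, uint64_t txg)
{
        dbrrd_add(db, gethrestime_sec(), txg);
}

/* Consumer side: turn a date range into a txg range (cf. zfs_ioc_pool_scrub). */
static void
scrub_last_week_sketch(dbrrd_t *db, uint64_t *txg_start, uint64_t *txg_end)
{
        hrtime_t now = gethrestime_sec();

        *txg_start = dbrrd_query(db, now - 7 * 24 * 60 * 60, DBRRD_FLOOR);
        *txg_end = dbrrd_query(db, now, DBRRD_CEILING);
}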
+ */ +static const rrd_data_t * +rrd_query(rrd_t *rrd, hrtime_t tv, dbrrd_rounding_t rounding) +{ + const rrd_data_t *data = NULL; + + for (size_t i = 0; i < rrd_len(rrd); i++) { + const rrd_data_t *cur = rrd_entry(rrd, i); + + if (rounding == DBRRD_FLOOR) { + if (tv < cur->rrdd_time) { + break; + } + data = cur; + } else { + /* DBRRD_CEILING */ + if (tv <= cur->rrdd_time) { + data = cur; + break; + } + } + } + + return (data); +} + +static const rrd_data_t * +dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2) +{ + + if (r1 == NULL) + return (r2); + if (r2 == NULL) + return (r1); + + return (ABS(tv - (hrtime_t)r1->rrdd_time) < + ABS(tv - (hrtime_t)r2->rrdd_time) ? r1 : r2); +} + +uint64_t +dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rounding) +{ + const rrd_data_t *data, *dm, *dd, *dy; + + data = NULL; + dm = rrd_query(&r->dbr_minutes, tv, rounding); + dd = rrd_query(&r->dbr_days, tv, rounding); + dy = rrd_query(&r->dbr_months, tv, rounding); + + data = dbrrd_closest(tv, dbrrd_closest(tv, dd, dm), dy); + + return (data == NULL ? 0 : data->rrdd_txg); +} diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c index 10a6d289fbf8..2af1efe82e62 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c @@ -112,8 +112,7 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, uint64_t fuid_size; ASSERT(fuid_obj != 0); - VERIFY(0 == dmu_bonus_hold(os, fuid_obj, - FTAG, &db)); + VERIFY0(dmu_bonus_hold(os, fuid_obj, FTAG, &db)); fuid_size = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); @@ -125,22 +124,21 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, int i; packed = kmem_alloc(fuid_size, KM_SLEEP); - VERIFY(dmu_read(os, fuid_obj, 0, - fuid_size, packed, DMU_READ_PREFETCH) == 0); - VERIFY(nvlist_unpack(packed, fuid_size, - &nvp, 0) == 0); - VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, - &fuidnvp, &count) == 0); + VERIFY0(dmu_read(os, fuid_obj, 0, + fuid_size, packed, DMU_READ_PREFETCH)); + VERIFY0(nvlist_unpack(packed, fuid_size, &nvp, 0)); + VERIFY0(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, + &fuidnvp, &count)); for (i = 0; i != count; i++) { fuid_domain_t *domnode; const char *domain; uint64_t idx; - VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, - &domain) == 0); - VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, - &idx) == 0); + VERIFY0(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, + &domain)); + VERIFY0(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, + &idx)); domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); @@ -246,35 +244,33 @@ zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) &zfsvfs->z_fuid_obj, tx) == 0); } - VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP)); numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { - VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, - domnode->f_idx) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); - VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, - domnode->f_ksid->kd_name) == 0); + VERIFY0(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP)); + VERIFY0(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx)); + VERIFY0(nvlist_add_uint64(fuids[i], 
FUID_OFFSET, 0)); + VERIFY0(nvlist_add_string(fuids[i], FUID_DOMAIN, + domnode->f_ksid->kd_name)); } fnvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, (const nvlist_t * const *)fuids, numnodes); for (i = 0; i != numnodes; i++) nvlist_free(fuids[i]); kmem_free(fuids, numnodes * sizeof (void *)); - VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + VERIFY0(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR)); packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(nvp, &packed, &nvsize, - NV_ENCODE_XDR, KM_SLEEP) == 0); + VERIFY0(nvlist_pack(nvp, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP)); nvlist_free(nvp); zfsvfs->z_fuid_size = nvsize; dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, zfsvfs->z_fuid_size, packed, tx); kmem_free(packed, zfsvfs->z_fuid_size); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, - FTAG, &db)); + VERIFY0(dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; dmu_buf_rele(db, FTAG); diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index ebb1cfd07125..5ca7c2320c4e 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -683,6 +683,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) dsl_dataset_t *ds; const char *cp; int error; + boolean_t rawok = (zc->zc_flags & 0x8); /* * Generate the current snapshot name from the given objsetid, then @@ -705,6 +706,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND_RAW, cr); + } dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); @@ -714,9 +719,17 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + boolean_t rawok = nvlist_exists(innvl, "rawok"); + int error; + (void) innvl; - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND_RAW, cr); + } + return (error); } static int @@ -1493,7 +1506,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); - VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP)); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) @@ -1704,6 +1717,8 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { {"scan_type", DATA_TYPE_UINT64, 0}, {"scan_command", DATA_TYPE_UINT64, 0}, + {"scan_date_start", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"scan_date_end", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int @@ -1712,6 +1727,7 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) spa_t *spa; int error; uint64_t scan_type, scan_cmd; + uint64_t date_start, date_end; if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) return (SET_ERROR(EINVAL)); @@ -1721,6 +1737,11 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if (scan_cmd >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, "scan_date_start", &date_start) != 0) + 
date_start = 0; + if (nvlist_lookup_uint64(innvl, "scan_date_end", &date_end) != 0) + date_end = 0; + if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); @@ -1732,7 +1753,24 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) error = spa_scan_range(spa, scan_type, spa_get_last_scrubbed_txg(spa), 0); } else { - error = spa_scan(spa, scan_type); + uint64_t txg_start, txg_end; + + txg_start = txg_end = 0; + if (date_start != 0 || date_end != 0) { + mutex_enter(&spa->spa_txg_log_time_lock); + if (date_start != 0) { + txg_start = dbrrd_query(&spa->spa_txg_log_time, + date_start, DBRRD_FLOOR); + } + + if (date_end != 0) { + txg_end = dbrrd_query(&spa->spa_txg_log_time, + date_end, DBRRD_CEILING); + } + mutex_exit(&spa->spa_txg_log_time_lock); + } + + error = spa_scan_range(spa, scan_type, txg_start, txg_end); } spa_close(spa, FTAG); @@ -2220,7 +2258,7 @@ nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); - VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); + VERIFY0(nvlist_add_uint64(props, zfs_prop_to_name(prop), value)); return (0); } @@ -2255,7 +2293,7 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP)); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && @@ -2458,7 +2496,7 @@ zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY0(nvpair_value_nvlist(pair, &attrs)); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); @@ -2513,9 +2551,8 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); + VERIFY0(nvpair_value_nvlist(pair, &attrs)); + VERIFY0(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair)); } /* all special properties are numeric except for keylocation */ @@ -2907,14 +2944,14 @@ props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; - VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP)); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; - VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); + VERIFY0(nvlist_add_nvpair(*newprops, pair)); } } @@ -3039,11 +3076,11 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) switch (type) { case PROP_TYPE_STRING: - VERIFY(0 == nvlist_add_string(dummy, propname, "")); + VERIFY0(nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: - VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); + VERIFY0(nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); @@ -3429,14 +3466,14 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, /* * Put the version in the zplprops */ - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_VERSION), zplver)); if (norm == 
ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm)); /* * If we're normalizing, names must always be valid UTF-8 strings. @@ -3446,55 +3483,55 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8)); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_CASE), sense)); if (duq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA, &duq)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_DEFAULTUSERQUOTA), duq) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_DEFAULTUSERQUOTA), duq)); if (dgq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA, &dgq)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPQUOTA), dgq) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPQUOTA), dgq)); if (dpq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA, &dpq)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTQUOTA), dpq) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTQUOTA), dpq)); if (duoq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA, &duoq)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_DEFAULTUSEROBJQUOTA), duoq) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_DEFAULTUSEROBJQUOTA), duoq)); if (dgoq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA, &dgoq)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPOBJQUOTA), dgoq) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPOBJQUOTA), dgoq)); if (dpoq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA, &dpoq)) != 0) return (error); - VERIFY(nvlist_add_uint64(zplprops, - zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTOBJQUOTA), dpoq) == 0); + VERIFY0(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTOBJQUOTA), dpoq)); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); @@ -3643,8 +3680,8 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) * file system creation, so go figure them out * now. */ - VERIFY(nvlist_alloc(&zct.zct_zplprops, - NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&zct.zct_zplprops, + NV_UNIQUE_NAME, KM_SLEEP)); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { @@ -4702,7 +4739,7 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = error ? 
error : resume_err; } zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(fsname)) != NULL) { + } else if (zvol_suspend(fsname, &zv) == 0) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); @@ -4891,9 +4928,8 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) * format. */ nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); + VERIFY0(nvpair_value_nvlist(pair, &attrs)); + VERIFY0(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair)); } /* @@ -5000,15 +5036,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } break; - case ZFS_PROP_SPECIAL_SMALL_BLOCKS: - /* - * This property could require the allocation classes - * feature to be active for setting, however we allow - * it so that tests of settable properties succeed. - * The CLI will issue a warning in this case. - */ - break; - case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); @@ -5087,7 +5114,7 @@ zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) if (props == NULL) return (0); - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP)); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); @@ -5099,9 +5126,8 @@ zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { - VERIFY(nvlist_remove_nvpair(props, pair) == 0); - VERIFY(nvlist_add_int32(errors, - zc->zc_value, err) == 0); + VERIFY0(nvlist_remove_nvpair(props, pair)); + VERIFY0(nvlist_add_int32(errors, zc->zc_value, err)); } pair = next_pair; } @@ -5111,7 +5137,7 @@ zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) nvlist_free(errors); errors = NULL; } else { - VERIFY(nvpair_value_int32(pair, &rv) == 0); + VERIFY0(nvpair_value_int32(pair, &rv)); } if (errlist == NULL) @@ -5128,16 +5154,14 @@ propval_equals(nvpair_t *p1, nvpair_t *p2) if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &p1) == 0); + VERIFY0(nvpair_value_nvlist(p1, &attrs)); + VERIFY0(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1)); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &p2) == 0); + VERIFY0(nvpair_value_nvlist(p2, &attrs)); + VERIFY0(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2)); } if (nvpair_type(p1) != nvpair_type(p2)) @@ -5146,14 +5170,14 @@ propval_equals(nvpair_t *p1, nvpair_t *p2) if (nvpair_type(p1) == DATA_TYPE_STRING) { const char *valstr1, *valstr2; - VERIFY(nvpair_value_string(p1, &valstr1) == 0); - VERIFY(nvpair_value_string(p2, &valstr2) == 0); + VERIFY0(nvpair_value_string(p1, &valstr1)); + VERIFY0(nvpair_value_string(p2, &valstr2)); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; - VERIFY(nvpair_value_uint64(p1, &intval1) == 0); - VERIFY(nvpair_value_uint64(p2, &intval2) == 0); + VERIFY0(nvpair_value_uint64(p1, &intval1)); + VERIFY0(nvpair_value_uint64(p2, &intval2)); return (intval1 == intval2); } } @@ -5221,7 +5245,7 @@ extract_delay_props(nvlist_t *props) }; int i; - 
VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY0(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP)); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { @@ -5237,8 +5261,8 @@ extract_delay_props(nvlist_t *props) } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); - VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); - VERIFY(nvlist_remove_nvpair(props, nvp) == 0); + VERIFY0(nvlist_add_nvpair(delayprops, nvp)); + VERIFY0(nvlist_remove_nvpair(props, nvp)); nvp = tmp; } } @@ -5437,7 +5461,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(tofs)) != NULL) { + } else if (zvol_suspend(tofs, &zv) == 0) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { @@ -5469,15 +5493,15 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { - ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); + ASSERT0(nvlist_merge(recvprops, recv_delayprops, 0)); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { - ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); + ASSERT0(nvlist_merge(localprops, local_delayprops, 0)); nvlist_free(local_delayprops); } if (inherited_delayprops != NULL) { - ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); + ASSERT0(nvlist_merge(localprops, inherited_delayprops, 0)); nvlist_free(inherited_delayprops); } *read_bytes = off - noff; @@ -7326,8 +7350,8 @@ zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); - ASSERT3P(vec->zvec_legacy_func, ==, NULL); - ASSERT3P(vec->zvec_func, ==, NULL); + ASSERT0P(vec->zvec_legacy_func); + ASSERT0P(vec->zvec_func); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; @@ -7350,8 +7374,8 @@ zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); - ASSERT3P(vec->zvec_legacy_func, ==, NULL); - ASSERT3P(vec->zvec_func, ==, NULL); + ASSERT0P(vec->zvec_legacy_func); + ASSERT0P(vec->zvec_func); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); @@ -7608,7 +7632,7 @@ zfs_ioctl_init(void) zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_NONE, B_TRUE, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, @@ -8132,7 +8156,7 @@ zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) spa_t *spa; nvlist_t *lognv = NULL; - ASSERT(vec->zvec_legacy_func == NULL); + ASSERT0P(vec->zvec_legacy_func); /* * Add the innvl to the lognv before calling the func, diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c index 2ce25b72b288..ea17e049279f 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_log.c +++ b/sys/contrib/openzfs/module/zfs/zfs_log.c @@ -607,8 +607,6 @@ zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, * called as soon as the write is on stable storage (be it via a DMU sync or a * ZIL commit). 
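A side effect visible in this hunk is that ZIL write callbacks now carry an error argument -- callback(callback_data, 0) below, and itx->itx_callback(itx->itx_callback_data, err) over in zil_itx_destroy() -- so a callback can learn that its record was dropped rather than committed. The typedef itself is outside this diff; the sketch below only assumes the (data, error) shape, and example_req_t is an invented caller structure.

/* Hypothetical consumer of the (data, error) callback convention. */
typedef struct example_req {
        kmutex_t        er_lock;
        kcondvar_t      er_cv;
        boolean_t       er_done;
        int             er_error;
} example_req_t;

static void
example_write_done_cb(void *arg, int err)
{
        example_req_t *req = arg;

        mutex_enter(&req->er_lock);
        req->er_error = err;    /* 0 means the record reached stable storage */
        req->er_done = B_TRUE;
        cv_broadcast(&req->er_cv);
        mutex_exit(&req->er_lock);
}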
*/ -static uint_t zfs_immediate_write_sz = 32768; - void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, @@ -622,19 +620,12 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, if (zil_replaying(zilog, tx) || zp->z_unlinked || zfs_xattr_owner_unlinked(zp)) { if (callback != NULL) - callback(callback_data); + callback(callback_data, 0); return; } - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - resid >= zfs_immediate_write_sz) - write_state = WR_INDIRECT; - else if (commit) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; + write_state = zil_write_state(zilog, resid, blocksize, o_direct, + commit); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen, sizeof (gen)); @@ -672,7 +663,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, DMU_KEEP_CACHING); DB_DNODE_EXIT(db); if (err != 0) { - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; @@ -938,6 +929,3 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, len -= partlen; } } - -ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW, - "Largest data block to write to zil"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c index b8fe512d4f09..2e91ccc27d6d 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_quota.c +++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c @@ -374,7 +374,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, if (*objp == 0) { *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, DMU_OT_NONE, 0, tx); - VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + VERIFY0(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); } mutex_exit(&zfsvfs->z_lock); @@ -386,7 +386,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, } else { err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); } - ASSERT(err == 0); + ASSERT0(err); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); dmu_tx_commit(tx); diff --git a/sys/contrib/openzfs/module/zfs/zfs_rlock.c b/sys/contrib/openzfs/module/zfs/zfs_rlock.c index 53eb3ef1b66e..4035baff77d6 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_rlock.c +++ b/sys/contrib/openzfs/module/zfs/zfs_rlock.c @@ -666,7 +666,7 @@ zfs_rangelock_reduce(zfs_locked_range_t *lr, uint64_t off, uint64_t len) /* Ensure there are no other locks */ ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); - ASSERT3U(lr->lr_offset, ==, 0); + ASSERT0(lr->lr_offset); ASSERT3U(lr->lr_type, ==, RL_WRITER); ASSERT(!lr->lr_proxy); ASSERT3U(lr->lr_length, ==, UINT64_MAX); diff --git a/sys/contrib/openzfs/module/zfs/zfs_sa.c b/sys/contrib/openzfs/module/zfs/zfs_sa.c index 59b6ae4e4203..8b4fc6fd7fbd 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_sa.c +++ b/sys/contrib/openzfs/module/zfs/zfs_sa.c @@ -169,7 +169,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) ASSERT(MUTEX_HELD(&zp->z_lock)); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), &xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp), tx)); else { @@ -181,12 +181,12 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) len = sizeof (xoap->xoa_av_scanstamp) + ZFS_OLD_ZNODE_PHYS_SIZE; if 
(len > doi.doi_bonus_size) - VERIFY(dmu_set_bonus(db, len, tx) == 0); + VERIFY0(dmu_set_bonus(db, len, tx)); (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); zp->z_pflags |= ZFS_BONUS_SCANSTAMP; - VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), &zp->z_pflags, sizeof (uint64_t), tx)); } } @@ -286,7 +286,7 @@ zfs_sa_set_xattr(znode_t *zp, const char *name, const void *value, size_t vsize) dmu_tx_commit(tx); if (logsaxattr && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + error = zil_commit(zilog, 0); } out_free: vmem_free(obj, size); @@ -427,11 +427,10 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; } - VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); - VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, - count, tx) == 0); + VERIFY0(dmu_set_bonustype(db, DMU_OT_SA, tx)); + VERIFY0(sa_replace_all_by_template_locked(hdl, sa_attrs, count, tx)); if (znode_acl.z_acl_extern_obj) - VERIFY(0 == dmu_object_free(zfsvfs->z_os, + VERIFY0(dmu_object_free(zfsvfs->z_os, znode_acl.z_acl_extern_obj, tx)); zp->z_is_sa = B_TRUE; diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 656ca4dc22ff..7bb9ba57c69e 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -27,6 +27,7 @@ * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> + * Copyright (c) 2025, Klara, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ @@ -49,6 +50,7 @@ #include <sys/dmu.h> #include <sys/dmu_objset.h> #include <sys/dsl_crypt.h> +#include <sys/dsl_dataset.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/dbuf.h> @@ -67,13 +69,14 @@ int zfs_bclone_enabled = 1; /* - * When set zfs_clone_range() waits for dirty data to be written to disk. - * This allows the clone operation to reliably succeed when a file is modified - * and then immediately cloned. For small files this may be slower than making - * a copy of the file and is therefore not the default. However, in certain - * scenarios this behavior may be desirable so a tunable is provided. + * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty + * data to be written to disk before proceeding. This ensures that the clone + * operation reliably succeeds, even if a file is modified and then immediately + * cloned. Note that for small files this may be slower than simply copying + * the file. When set to 0 the clone operation will immediately fail if it + * encounters any dirty blocks. By default waiting is enabled. */ -int zfs_bclone_wait_dirty = 0; +int zfs_bclone_wait_dirty = 1; /* * Enable Direct I/O. 
If this setting is 0, then all I/O requests will be @@ -114,9 +117,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - atomic_inc_32(&zp->z_sync_writes_cnt); - zil_commit(zfsvfs->z_log, zp->z_id); - atomic_dec_32(&zp->z_sync_writes_cnt); + error = zil_commit(zfsvfs->z_log, zp->z_id); zfs_exit(zfsvfs, FTAG); } return (error); @@ -375,8 +376,13 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) frsync = !!(ioflag & FRSYNC); #endif if (zfsvfs->z_log && - (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) - zil_commit(zfsvfs->z_log, zp->z_id); + (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) { + error = zil_commit(zfsvfs->z_log, zp->z_id); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } + } /* * Lock the range against changes. @@ -1074,8 +1080,13 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (error); } - if (commit) - zil_commit(zilog, zp->z_id); + if (commit) { + error = zil_commit(zilog, zp->z_id); + if (error != 0) { + zfs_exit(zfsvfs, FTAG); + return (error); + } + } int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); @@ -1102,13 +1113,21 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, { int error; - if (flags != 0 || arg != 0) + if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0) return (SET_ERROR(EINVAL)); zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); + /* Check if physical rewrite is allowed */ + spa_t *spa = zfsvfs->z_os->os_spa; + if ((flags & ZFS_REWRITE_PHYSICAL) && + !spa_feature_is_enabled(spa, SPA_FEATURE_PHYSICAL_REWRITE)) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); @@ -1196,7 +1215,10 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, if (dmu_buf_is_dirty(dbp[i], tx)) continue; nw += dbp[i]->db_size; - dmu_buf_will_dirty(dbp[i], tx); + if (flags & ZFS_REWRITE_PHYSICAL) + dmu_buf_will_rewrite(dbp[i], tx); + else + dmu_buf_will_dirty(dbp[i], tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); @@ -1249,8 +1271,8 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) zilog = zfsvfs->z_log; error = zfs_setacl(zp, vsecp, skipaclchk, cr); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); @@ -1935,7 +1957,7 @@ unlock: ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); if (outos->os_sync == ZFS_SYNC_ALWAYS) { - zil_commit(zilog, outzp->z_id); + error = zil_commit(zilog, outzp->z_id); } *inoffp += done; diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 00059b2c6de0..0307df55aa21 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -24,6 +24,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. + * Copyright (c) 2025, Klara, Inc. 
*/ /* Portions Copyright 2010 Robert Milkowski */ @@ -103,6 +104,7 @@ static zil_kstat_values_t zil_stats = { { "zil_commit_error_count", KSTAT_DATA_UINT64 }, { "zil_commit_stall_count", KSTAT_DATA_UINT64 }, { "zil_commit_suspend_count", KSTAT_DATA_UINT64 }, + { "zil_commit_crash_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, @@ -145,7 +147,7 @@ static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; -static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static int zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); static uint64_t zil_max_waste_space(zilog_t *zilog); @@ -367,6 +369,7 @@ zil_sums_init(zil_sums_t *zs) wmsum_init(&zs->zil_commit_error_count, 0); wmsum_init(&zs->zil_commit_stall_count, 0); wmsum_init(&zs->zil_commit_suspend_count, 0); + wmsum_init(&zs->zil_commit_crash_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); @@ -392,6 +395,7 @@ zil_sums_fini(zil_sums_t *zs) wmsum_fini(&zs->zil_commit_error_count); wmsum_fini(&zs->zil_commit_stall_count); wmsum_fini(&zs->zil_commit_suspend_count); + wmsum_fini(&zs->zil_commit_crash_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); @@ -422,6 +426,8 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) wmsum_value(&zil_sums->zil_commit_stall_count); zs->zil_commit_suspend_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_suspend_count); + zs->zil_commit_crash_count.value.ui64 = + wmsum_value(&zil_sums->zil_commit_crash_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = @@ -589,7 +595,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ - if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg) + if (BP_GET_BIRTH(bp) >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) @@ -615,7 +621,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ - if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg || + if (BP_IS_HOLE(bp) || BP_GET_BIRTH(bp) < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); @@ -640,7 +646,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ - if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) { + if (BP_GET_BIRTH(&lr->lr_blkptr) >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); @@ -687,7 +693,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, * just in case lets be safe and just stop here now instead of * corrupting the pool. */ - if (BP_GET_BIRTH(bp) >= first_txg) + if (BP_GET_PHYSICAL_BIRTH(bp) >= first_txg) return (SET_ERROR(ENOENT)); /* @@ -742,7 +748,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) /* * If we previously claimed it, we need to free it. 
*/ - if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg && + if (BP_GET_BIRTH(bp) >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } @@ -813,34 +819,37 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) * we choose them here and later make the block allocation match. */ static lwb_t * -zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, - uint64_t txg, lwb_state_t state) +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, int min_sz, int sz, + boolean_t slog, uint64_t txg) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_flags = 0; lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; - lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) + lwb->lwb_flags |= LWB_FLAG_SLIM; sz = BP_GET_LSIZE(bp); + lwb->lwb_min_sz = sz; } else { BP_ZERO(&lwb->lwb_blk); - lwb->lwb_slim = (spa_version(zilog->zl_spa) >= - SPA_VERSION_SLIM_ZIL); + if (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL) + lwb->lwb_flags |= LWB_FLAG_SLIM; + lwb->lwb_min_sz = min_sz; } - lwb->lwb_slog = slog; + if (slog) + lwb->lwb_flags |= LWB_FLAG_SLOG; lwb->lwb_error = 0; - if (lwb->lwb_slim) { - lwb->lwb_nmax = sz; - lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); - } else { - lwb->lwb_nmax = sz - sizeof (zil_chain_t); - lwb->lwb_nused = lwb->lwb_nfilled = 0; - } + /* + * Buffer allocation and capacity setup will be done in + * zil_lwb_write_open() when the LWB is opened for ITX assignment. + */ + lwb->lwb_nmax = lwb->lwb_nused = lwb->lwb_nfilled = 0; lwb->lwb_sz = sz; - lwb->lwb_state = state; - lwb->lwb_buf = zio_buf_alloc(sz); + lwb->lwb_buf = NULL; + lwb->lwb_state = LWB_STATE_NEW; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; @@ -851,8 +860,6 @@ zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); - if (state != LWB_STATE_NEW) - zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); @@ -864,15 +871,15 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); - ASSERT3P(lwb->lwb_child_zio, ==, NULL); - ASSERT3P(lwb->lwb_write_zio, ==, NULL); - ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT0P(lwb->lwb_child_zio); + ASSERT0P(lwb->lwb_write_zio); + ASSERT0P(lwb->lwb_root_zio); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); + ASSERT(!MUTEX_HELD(&lwb->lwb_lock)); /* * Clear the zilog's field to indicate this lwb is no longer @@ -991,8 +998,8 @@ zil_create(zilog_t *zilog) */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); - ASSERT(zh->zh_claim_txg == 0); - ASSERT(zh->zh_replay_seq == 0); + ASSERT0(zh->zh_claim_txg); + ASSERT0(zh->zh_replay_seq); blk = zh->zh_log; @@ -1013,7 +1020,7 @@ zil_create(zilog_t *zilog) } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, - ZIL_MIN_BLKSZ, &slog); + ZIL_MIN_BLKSZ, ZIL_MIN_BLKSZ, &slog, B_TRUE); if (error == 0) zil_init_log_chain(zilog, &blk); } @@ -1022,7 +1029,7 @@ zil_create(zilog_t *zilog) * Allocate a log write block (lwb) for the first log block. 
*/ if (error == 0) - lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); + lwb = zil_alloc_lwb(zilog, &blk, 0, 0, slog, txg); /* * If we just allocated the first log block, commit our transaction @@ -1104,7 +1111,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { - ASSERT(zh->zh_claim_txg == 0); + ASSERT0(zh->zh_claim_txg); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) @@ -1250,7 +1257,7 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) blkptr_t *bp; int error; - ASSERT(tx == NULL); + ASSERT0P(tx); error = dmu_objset_from_ds(ds, &os); if (error != 0) { @@ -1318,10 +1325,12 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) * zil_commit() is racing with spa_sync(). */ static void -zil_commit_waiter_skip(zil_commit_waiter_t *zcw) +zil_commit_waiter_done(zil_commit_waiter_t *zcw, int err) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); + zcw->zcw_lwb = NULL; + zcw->zcw_error = err; zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); @@ -1351,7 +1360,7 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(&lwb->lwb_waiters, zcw); - ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT0P(zcw->zcw_lwb); zcw->zcw_lwb = lwb; } @@ -1365,7 +1374,7 @@ zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(nolwb, zcw); - ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT0P(zcw->zcw_lwb); } void @@ -1383,7 +1392,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) if (zil_nocacheflush) return; - mutex_enter(&lwb->lwb_vdev_lock); + mutex_enter(&lwb->lwb_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { @@ -1392,7 +1401,7 @@ zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) avl_insert(t, zv, where); } } - mutex_exit(&lwb->lwb_vdev_lock); + mutex_exit(&lwb->lwb_lock); } static void @@ -1409,12 +1418,12 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does - * not need the protection of lwb_vdev_lock (it will only be modified + * not need the protection of lwb_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ - mutex_enter(&nlwb->lwb_vdev_lock); + mutex_enter(&nlwb->lwb_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. @@ -1428,7 +1437,7 @@ zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) kmem_free(zv, sizeof (*zv)); } } - mutex_exit(&nlwb->lwb_vdev_lock); + mutex_exit(&nlwb->lwb_lock); } void @@ -1482,13 +1491,9 @@ zil_lwb_flush_vdevs_done(zio_t *zio) } while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { - mutex_enter(&zcw->zcw_lock); - - ASSERT3P(zcw->zcw_lwb, ==, lwb); - zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in @@ -1503,14 +1508,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) * errors not being handled correctly here. 
See the * comment above the call to "zio_flush" for details. */ - - zcw->zcw_zio_error = zio->io_error; - - ASSERT3B(zcw->zcw_done, ==, B_FALSE); - zcw->zcw_done = B_TRUE; - cv_broadcast(&zcw->zcw_cv); - - mutex_exit(&zcw->zcw_lock); + zil_commit_waiter_done(zcw, zio->io_error); } uint64_t txg = lwb->lwb_issued_txg; @@ -1582,7 +1580,7 @@ zil_lwb_write_done(zio_t *zio) avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; - lwb_t *nlwb; + lwb_t *nlwb = NULL; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); @@ -1602,9 +1600,11 @@ zil_lwb_write_done(zio_t *zio) * its write ZIO a parent this ZIO. In such case we can not defer * our flushes or below may be a race between the done callbacks. */ - nlwb = list_next(&zilog->zl_lwb_list, lwb); - if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) - nlwb = NULL; + if (!(lwb->lwb_flags & LWB_FLAG_CRASHED)) { + nlwb = list_next(&zilog->zl_lwb_list, lwb); + if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) + nlwb = NULL; + } mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) @@ -1618,12 +1618,17 @@ zil_lwb_write_done(zio_t *zio) * written out. * * Additionally, we don't perform any further error handling at - * this point (e.g. setting "zcw_zio_error" appropriately), as - * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, - * we expect any error seen here, to have been propagated to - * that function). + * this point (e.g. setting "zcw_error" appropriately), as we + * expect that to occur in "zil_lwb_flush_vdevs_done" (thus, we + * expect any error seen here, to have been propagated to that + * function). + * + * Note that we treat a "crashed" LWB as though it was in error, + * even if it did appear to succeed, because we've already + * signaled error and cleaned up waiters and committers in + * zil_crash(); we just want to clean up and get out of here. */ - if (zio->io_error != 0) { + if (zio->io_error != 0 || (lwb->lwb_flags & LWB_FLAG_CRASHED)) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; @@ -1736,10 +1741,26 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) return; } + mutex_enter(&lwb->lwb_lock); mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); + mutex_exit(&lwb->lwb_lock); + + /* + * Allocate buffer and set up LWB capacities. + */ + ASSERT0P(lwb->lwb_buf); + ASSERT3U(lwb->lwb_sz, >, 0); + lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz); + if (lwb->lwb_flags & LWB_FLAG_SLIM) { + lwb->lwb_nmax = lwb->lwb_sz; + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + } else { + lwb->lwb_nmax = lwb->lwb_sz - sizeof (zil_chain_t); + lwb->lwb_nused = lwb->lwb_nfilled = 0; + } } /* @@ -1756,6 +1777,8 @@ static uint_t zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) { uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); if (size <= md) { /* @@ -1766,9 +1789,10 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) } else if (size > 8 * md) { /* * Big bursts use maximum blocks. The first block size - * is hard to predict, but it does not really matter. + * is hard to predict, but we need at least enough space + * to make reasonable progress. 
*/ - *minsize = 0; + *minsize = waste; return (md); } @@ -1781,57 +1805,52 @@ zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) uint_t s = size; uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); uint_t chunk = DIV_ROUND_UP(s, n); - uint_t waste = zil_max_waste_space(zilog); - waste = MAX(waste, zilog->zl_cur_max); if (chunk <= md - waste) { *minsize = MAX(s - (md - waste) * (n - 1), waste); return (chunk); } else { - *minsize = 0; + *minsize = waste; return (md); } } /* * Try to predict next block size based on previous history. Make prediction - * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is - * less then 50%, extra writes may cost more, but we don't want single spike - * to badly affect our predictions. + * sufficient for 7 of 8 previous bursts, but don't try to save if the saving + * is less then 50%. Extra writes may cost more, but we don't want single + * spike to badly affect our predictions. */ -static uint_t -zil_lwb_predict(zilog_t *zilog) +static void +zil_lwb_predict(zilog_t *zilog, uint64_t *min_predict, uint64_t *max_predict) { - uint_t m, o; + uint_t m1 = 0, m2 = 0, o; - /* If we are in the middle of a burst, take it into account also. */ - if (zilog->zl_cur_size > 0) { - o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); - } else { + /* If we are in the middle of a burst, take it as another data point. */ + if (zilog->zl_cur_size > 0) + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m1); + else o = UINT_MAX; - m = 0; - } - /* Find minimum optimal size. We don't need to go below that. */ - for (int i = 0; i < ZIL_BURSTS; i++) - o = MIN(o, zilog->zl_prev_opt[i]); - - /* Find two biggest minimal first block sizes above the optimal. */ - uint_t m1 = MAX(m, o), m2 = o; + /* Find two largest minimal first block sizes. */ for (int i = 0; i < ZIL_BURSTS; i++) { - m = zilog->zl_prev_min[i]; - if (m >= m1) { + uint_t cur = zilog->zl_prev_min[i]; + if (cur >= m1) { m2 = m1; - m1 = m; - } else if (m > m2) { - m2 = m; + m1 = cur; + } else if (cur > m2) { + m2 = cur; } } - /* - * If second minimum size gives 50% saving -- use it. It may cost us - * one additional write later, but the space saving is just too big. - */ - return ((m1 < m2 * 2) ? m1 : m2); + /* Minimum should guarantee progress in most cases. */ + *min_predict = (m1 < m2 * 2) ? m1 : m2; + + /* Maximum doesn't need to go below the minimum optimal size. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + m1 = MAX(m1, o); + m2 = MAX(m2, o); + *max_predict = (m1 < m2 * 2) ? m1 : m2; } /* @@ -1839,12 +1858,13 @@ zil_lwb_predict(zilog_t *zilog) * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) { - uint64_t blksz, plan, plan2; + uint64_t minbs, maxbs; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + membar_producer(); lwb->lwb_state = LWB_STATE_CLOSED; /* @@ -1869,33 +1889,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) * Try to predict what can it be and plan for the worst case. 
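A short worked example of the prediction rule in zil_lwb_predict() above (the byte values are invented):

/*
 * Suppose the last eight bursts recorded first-block minima of
 *   { 8K, 8K, 12K, 16K, 16K, 24K, 32K, 96K }:
 * then m1 = 96K and m2 = 32K.  Because 96K >= 2 * 32K, covering the outlier
 * would more than double the block, so *min_predict becomes 32K -- enough
 * for 7 of the last 8 bursts.  Had the two largest been 48K and 32K instead,
 * 48K < 2 * 32K, so the extra space is considered cheap and *min_predict
 * becomes 48K, covering all 8.  *max_predict is derived the same way after
 * both candidates are raised to at least the smallest optimal size
 * remembered in zl_prev_opt[].
 */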
*/ uint_t m; - plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + maxbs = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + minbs = m; if (zilog->zl_parallel) { - plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + - zil_lwb_predict(zilog), &m); - if (plan < plan2) - plan = plan2; + uint64_t minp, maxp; + zil_lwb_predict(zilog, &minp, &maxp); + maxp = zil_lwb_plan(zilog, zilog->zl_cur_left + maxp, + &m); + if (maxbs < maxp) + maxbs = maxp; } } else { /* * The previous burst is done and we can only predict what * will come next. */ - plan = zil_lwb_predict(zilog); + zil_lwb_predict(zilog, &minbs, &maxbs); } - blksz = plan + sizeof (zil_chain_t); - blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); - blksz = MIN(blksz, zilog->zl_max_block_size); - DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, - uint64_t, plan); - return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); + minbs += sizeof (zil_chain_t); + maxbs += sizeof (zil_chain_t); + minbs = P2ROUNDUP_TYPED(minbs, ZIL_MIN_BLKSZ, uint64_t); + maxbs = P2ROUNDUP_TYPED(maxbs, ZIL_MIN_BLKSZ, uint64_t); + maxbs = MIN(maxbs, zilog->zl_max_block_size); + minbs = MIN(minbs, maxbs); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, minbs, + uint64_t, maxbs); + + return (zil_alloc_lwb(zilog, NULL, minbs, maxbs, 0, 0)); } /* * Finalize previously closed block and issue the write zio. */ -static void +static int zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { spa_t *spa = zilog->zl_spa; @@ -1909,8 +1936,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) /* Actually fill the lwb with the data. */ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; - itx = list_next(&lwb->lwb_itxs, itx)) - zil_lwb_commit(zilog, lwb, itx); + itx = list_next(&lwb->lwb_itxs, itx)) { + error = zil_lwb_commit(zilog, lwb, itx); + if (error != 0) { + ASSERT3U(error, ==, ESHUTDOWN); + return (error); + } + } lwb->lwb_nused = lwb->lwb_nfilled; ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); @@ -1928,19 +1960,21 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) lwb->lwb_state = LWB_STATE_READY; if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { mutex_exit(&zilog->zl_lock); - return; + return (0); } mutex_exit(&zilog->zl_lock); next_lwb: - if (lwb->lwb_slim) + if (lwb->lwb_flags & LWB_FLAG_SLIM) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); - int wsz = lwb->lwb_sz; + uint64_t alloc_size = BP_GET_LSIZE(&lwb->lwb_blk); + int wsz = alloc_size; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); - if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) + if (!(lwb->lwb_flags & LWB_FLAG_SLOG) || + zilog->zl_cur_size <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; @@ -1948,16 +1982,17 @@ next_lwb: ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, - &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, + &lwb->lwb_blk, lwb_abd, alloc_size, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); - if (lwb->lwb_slim) { + if (lwb->lwb_flags & LWB_FLAG_SLIM) { /* For Slim ZIL only write what is used. 
*/ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); - ASSERT3S(wsz, <=, lwb->lwb_sz); - zio_shrink(lwb->lwb_write_zio, wsz); + ASSERT3S(wsz, <=, alloc_size); + if (wsz < alloc_size) + zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); @@ -1993,13 +2028,53 @@ next_lwb: BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { - error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, - &slog); + /* + * Allocation flexibility depends on LWB state: + * if NEW: allow range allocation and larger sizes; + * if OPENED: use fixed predetermined allocation size; + * if CLOSED + Slim: allocate precisely for actual usage. + */ + boolean_t flexible = (nlwb->lwb_state == LWB_STATE_NEW); + if (flexible) { + /* We need to prevent opening till we update lwb_sz. */ + mutex_enter(&nlwb->lwb_lock); + flexible = (nlwb->lwb_state == LWB_STATE_NEW); + if (!flexible) + mutex_exit(&nlwb->lwb_lock); /* We lost. */ + } + boolean_t closed_slim = (nlwb->lwb_state == LWB_STATE_CLOSED && + (lwb->lwb_flags & LWB_FLAG_SLIM)); + + uint64_t min_size, max_size; + if (closed_slim) { + /* This transition is racy, but only one way. */ + membar_consumer(); + min_size = max_size = P2ROUNDUP_TYPED(nlwb->lwb_nused, + ZIL_MIN_BLKSZ, uint64_t); + } else if (flexible) { + min_size = nlwb->lwb_min_sz; + max_size = nlwb->lwb_sz; + } else { + min_size = max_size = nlwb->lwb_sz; + } + + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, + min_size, max_size, &slog, flexible); + if (error == 0) { + if (closed_slim) + ASSERT3U(BP_GET_LSIZE(bp), ==, max_size); + else if (flexible) + nlwb->lwb_sz = BP_GET_LSIZE(bp); + else + ASSERT3U(BP_GET_LSIZE(bp), ==, nlwb->lwb_sz); + } + if (flexible) + mutex_exit(&nlwb->lwb_lock); } if (error == 0) { - ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg); - BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : - ZIO_CHECKSUM_ZILOG); + ASSERT3U(BP_GET_BIRTH(bp), ==, txg); + BP_SET_CHECKSUM(bp, (nlwb->lwb_flags & LWB_FLAG_SLIM) ? + ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } @@ -2028,14 +2103,15 @@ next_lwb: if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; - nlwb->lwb_slog = slog; + if (slog) + nlwb->lwb_flags |= LWB_FLAG_SLOG; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); - if (lwb->lwb_slog) { + if (lwb->lwb_flags & LWB_FLAG_SLOG) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); @@ -2065,6 +2141,8 @@ next_lwb: lwb = nlwb; if (lwb) goto next_lwb; + + return (0); } /* @@ -2095,6 +2173,19 @@ zil_max_waste_space(zilog_t *zilog) */ static uint_t zil_maxcopied = 7680; +/* + * Largest write size to store the data directly into ZIL. + */ +uint_t zfs_immediate_write_sz = 32768; + +/* + * When enabled and blocks go to normal vdev, treat special vdevs as SLOG, + * writing data to ZIL (WR_COPIED/WR_NEED_COPY). Disabling this forces the + * indirect writes (WR_INDIRECT) to preserve special vdev throughput and + * endurance, likely at the cost of normal vdev latency. + */ +int zil_special_is_slog = 1; + uint64_t zil_max_copied_data(zilog_t *zilog) { @@ -2102,6 +2193,46 @@ zil_max_copied_data(zilog_t *zilog) return (MIN(max_data, zil_maxcopied)); } +/* + * Determine the appropriate write state for ZIL transactions based on + * pool configuration, data placement, write size, and logbias settings. 
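A few concrete outcomes of the policy described above, assuming the defaults set just above (zfs_immediate_write_sz = 32K, zil_special_is_slog = 1):

/*
 * Example outcomes of zil_write_state():
 *
 *   64K synchronous write (commit pending), 128K block, no slog or
 *   special vdev:  64K >= 32K and 64K >= 128K/2        -> WR_INDIRECT
 *
 *   the same write on a pool with a dedicated slog:
 *   indirect is suppressed, commit pending             -> WR_COPIED
 *
 *   8K write with no commit pending, any layout:
 *   8K < 32K, nothing needs to be flushed yet          -> WR_NEED_COPY
 *
 *   logbias=throughput, or any O_DIRECT write          -> WR_INDIRECT
 */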
+ */ +itx_wr_state_t +zil_write_state(zilog_t *zilog, uint64_t size, uint32_t blocksize, + boolean_t o_direct, boolean_t commit) +{ + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) + return (WR_INDIRECT); + + /* + * Don't use indirect for too small writes to reduce overhead. + * Don't use indirect if written less than a half of a block if + * we are going to commit it immediately, since next write might + * rewrite the same block again, causing inflation. If commit + * is not planned, then next writes might coalesce, and so the + * indirect may be perfect. + */ + boolean_t indirect = (size >= zfs_immediate_write_sz && + (size >= blocksize / 2 || !commit)); + + if (spa_has_slogs(zilog->zl_spa)) { + /* Dedicated slogs: never use indirect */ + indirect = B_FALSE; + } else if (spa_has_special(zilog->zl_spa)) { + /* Special vdevs: only when beneficial */ + boolean_t on_special = (blocksize <= + zilog->zl_os->os_zpl_special_smallblock); + indirect &= (on_special || !zil_special_is_slog); + } + + if (indirect) + return (WR_INDIRECT); + else if (commit) + return (WR_COPIED); + else + return (WR_NEED_COPY); +} + static uint64_t zil_itx_record_size(itx_t *itx) { @@ -2154,7 +2285,6 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); - ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); @@ -2196,9 +2326,10 @@ cont: (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) return (NULL); + zil_lwb_write_open(zilog, lwb); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; } @@ -2255,11 +2386,13 @@ cont: return (lwb); } +static void zil_crash(zilog_t *zilog); + /* * Fill the actual transaction data into the lwb, following zil_lwb_assign(). * Does not require locking. */ -static void +static int zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) { lr_t *lr, *lrb; @@ -2271,7 +2404,7 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) lrw = (lr_write_t *)lr; if (lr->lrc_txtype == TX_COMMIT) - return; + return (0); reclen = lr->lrc_reclen; dlen = zil_itx_data_size(itx); @@ -2357,16 +2490,35 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) ". Falling back to txg_wait_synced().", error); zfs_fallthrough; - case EIO: - txg_wait_synced(zilog->zl_dmu_pool, - lr->lrc_txg); + case EIO: { + int error = txg_wait_synced_flags( + zilog->zl_dmu_pool, + lr->lrc_txg, TXG_WAIT_SUSPEND); + if (error != 0) { + ASSERT3U(error, ==, ESHUTDOWN); + /* + * zil_lwb_commit() is called from a + * loop over a list of itxs at the + * top of zil_lwb_write_issue(), which + * itself is called from a loop over a + * list of lwbs in various places. + * zil_crash() will free those itxs + * and sometimes the lwbs, so they + * are invalid when zil_crash() returns. + * Callers must pretty much abort + * immediately. 
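The "abort immediately" contract above is easy to get wrong, so here is a minimal sketch of the expected caller shape. It mirrors the fill loop added to zil_lwb_write_issue() earlier in this patch and is illustrative only, not additional source.

	for (itx_t *itx = list_head(&lwb->lwb_itxs); itx != NULL;
	    itx = list_next(&lwb->lwb_itxs, itx)) {
		int error = zil_lwb_commit(zilog, lwb, itx);
		if (error != 0) {
			/*
			 * zil_crash() already ran: the itxs (and possibly
			 * this lwb) are freed, so unwind without touching
			 * them again.
			 */
			ASSERT3U(error, ==, ESHUTDOWN);
			return (error);
		}
	}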
+ */ + zil_crash(zilog); + return (error); + } zfs_fallthrough; + } case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: - return; + return (0); } } } @@ -2374,6 +2526,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) lwb->lwb_nfilled += reclen + dlen; ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); + + return (0); } itx_t * @@ -2415,7 +2569,7 @@ zil_itx_clone(itx_t *oitx) } void -zil_itx_destroy(itx_t *itx) +zil_itx_destroy(itx_t *itx, int err) { ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); ASSERT3U(itx->itx_lr.lrc_reclen, ==, @@ -2424,7 +2578,7 @@ zil_itx_destroy(itx_t *itx) IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); if (itx->itx_callback != NULL) - itx->itx_callback(itx->itx_callback_data); + itx->itx_callback(itx->itx_callback_data, err); zio_data_buf_free(itx, itx->itx_size); } @@ -2465,9 +2619,9 @@ zil_itxg_clean(void *arg) * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) - zil_commit_waiter_skip(itx->itx_private); + zil_commit_waiter_done(itx->itx_private, 0); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } cookie = NULL; @@ -2477,7 +2631,7 @@ zil_itxg_clean(void *arg) while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); @@ -2539,7 +2693,7 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } list_destroy(&clean_list); } @@ -2624,6 +2778,68 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) } /* + * Post-crash cleanup. This is called from zil_clean() because it needs to + * do cleanup after every txg until the ZIL is restarted, and zilog_dirty() + * can arrange that easily, unlike zil_sync() which is more complicated to + * get a call to without actual dirty data. + */ +static void +zil_crash_clean(zilog_t *zilog, uint64_t synced_txg) +{ + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + ASSERT3U(zilog->zl_restart_txg, >, 0); + + /* Clean up anything on the crash list from earlier txgs */ + lwb_t *lwb; + while ((lwb = list_head(&zilog->zl_lwb_crash_list)) != NULL) { + if (lwb->lwb_alloc_txg >= synced_txg || + lwb->lwb_max_txg >= synced_txg) { + /* + * This lwb was allocated or updated on this txg, or + * in the future. We stop processing here, to avoid + * the strange situation of freeing a ZIL block on + * on the same or earlier txg than what it was + * allocated for. + * + * We'll take care of it on the next txg. + */ + break; + } + + /* This LWB is from the past, so we can clean it up now. */ + ASSERT(lwb->lwb_flags & LWB_FLAG_CRASHED); + list_remove(&zilog->zl_lwb_crash_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + if (!BP_IS_HOLE(&lwb->lwb_blk)) + /* + * Free on the next txg, since zil_clean() is called + * once synced_txg has already been completed. + */ + zio_free(zilog->zl_spa, synced_txg+1, &lwb->lwb_blk); + zil_free_lwb(zilog, lwb); + } + + if (zilog->zl_restart_txg > synced_txg) { + /* + * Not reached the restart txg yet, so mark the ZIL dirty for + * the next txg and we'll consider it all again then. 
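To make the re-dirty loop above concrete, here is roughly when the restart point lands, assuming the usual three concurrent txg states; the txg numbers are invented and the sketch is illustrative only.

	uint64_t crash_txg = spa_syncing_txg(zilog->zl_spa);	/* say, txg 1000 */
	uint64_t restart = crash_txg + TXG_CONCURRENT_STATES;	/* 1003 */
	/*
	 * zil_clean(1000..1002): zl_restart_txg is still ahead, so crash
	 * lwbs from older txgs are freed and the ZIL is re-dirtied for
	 * the next txg.
	 * zil_clean(1003): the restart txg is reached, zl_restart_txg is
	 * cleared and zil_commit() may start a fresh block chain.
	 */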
+ */ + zilog_dirty(zilog, synced_txg+1); + return; + } + + /* + * Reached the restart txg, so we can allow new calls to zil_commit(). + * All ZIL txgs have long past so there should be no IO waiting. + */ + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + ASSERT(list_is_empty(&zilog->zl_lwb_crash_list)); + + zilog->zl_restart_txg = 0; +} + +/* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been committed) so that @@ -2638,6 +2854,15 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) ASSERT3U(synced_txg, <, ZILTEST_TXG); + /* Do cleanup and restart after crash. */ + if (zilog->zl_restart_txg > 0) { + mutex_enter(&zilog->zl_lock); + /* Make sure we didn't lose a race. */ + if (zilog->zl_restart_txg > 0) + zil_crash_clean(zilog, synced_txg); + mutex_exit(&zilog->zl_lock); + } + mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); @@ -2822,7 +3047,7 @@ zil_prune_commit_list(zilog_t *zilog) * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. */ - zil_commit_waiter_skip(itx->itx_private); + zil_commit_waiter_done(itx->itx_private, 0); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } @@ -2830,13 +3055,13 @@ zil_prune_commit_list(zilog_t *zilog) mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } -static void +static int zil_commit_writer_stall(zilog_t *zilog) { /* @@ -2861,8 +3086,22 @@ zil_commit_writer_stall(zilog_t *zilog) */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ZIL_STAT_BUMP(zilog, zil_commit_stall_count); - txg_wait_synced(zilog->zl_dmu_pool, 0); + + int err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); + if (err != 0) { + ASSERT3U(err, ==, ESHUTDOWN); + zil_crash(zilog); + } + + /* + * Either zil_sync() has been called to wait for and clean up any + * in-flight LWBs, or zil_crash() has emptied out the list and arranged + * for them to be cleaned up later. + */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); + + return (err); } static void @@ -2902,19 +3141,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - /* - * Return if there's nothing to commit before we dirty the fs by - * calling zil_create(). - */ - if (list_is_empty(&zilog->zl_itx_commit_list)) - return; - - list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); - list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { + /* + * Return if there's nothing to commit before we dirty the fs. 
+ */ + if (list_is_empty(&zilog->zl_itx_commit_list)) + return; + lwb = zil_create(zilog); } else { /* @@ -2942,6 +3176,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } } + list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); + list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; @@ -3030,7 +3268,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); zilog->zl_cur_left -= zil_itx_full_size(itx); - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); } } @@ -3040,10 +3278,21 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. + * + * ESHUTDOWN has to be handled carefully here. If we get it, + * then the pool suspended and zil_crash() was called, so we + * need to stop trying and just get an error back to the + * callers. */ - while ((lwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); + int err = 0; + while ((lwb = list_remove_head(ilwbs)) != NULL) { + if (err == 0) + err = zil_lwb_write_issue(zilog, lwb); + } + if (err != ESHUTDOWN) + err = zil_commit_writer_stall(zilog); + if (err == ESHUTDOWN) + err = SET_ERROR(EIO); /* * Additionally, we have to signal and mark the "nolwb" @@ -3053,7 +3302,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) - zil_commit_waiter_skip(zcw); + zil_commit_waiter_done(zcw, err); /* * And finally, we have to destroy the itx's that @@ -3061,7 +3310,7 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) - zil_itx_destroy(itx); + zil_itx_destroy(itx, err); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); @@ -3111,14 +3360,21 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * possible, without significantly impacting the latency * of each individual itx. */ - if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + if (lwb->lwb_state == LWB_STATE_OPENED && + (!zilog->zl_parallel || zilog->zl_suspend > 0)) { zil_burst_done(zilog); list_insert_tail(ilwbs, lwb); - lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) { - while ((lwb = list_remove_head(ilwbs)) != NULL) - zil_lwb_write_issue(zilog, lwb); - zil_commit_writer_stall(zilog); + int err = 0; + while ((lwb = + list_remove_head(ilwbs)) != NULL) { + if (err == 0) + err = zil_lwb_write_issue( + zilog, lwb); + } + if (err != ESHUTDOWN) + (void) zil_commit_writer_stall(zilog); } } } @@ -3177,10 +3433,23 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); + /* + * If the ZIL failed somewhere inside zil_process_commit_list(), it's + * will be because a fallback to txg_wait_sync_flags() happened at some + * point (eg zil_commit_writer_stall()). 
All cases should issue and + * empty ilwbs, so there will be nothing to in the issue loop below. + * That's why we don't have to plumb the error value back from + * zil_process_commit_list(), and don't have to skip it. + */ + IMPLY(zilog->zl_restart_txg > 0, list_is_empty(&ilwbs)); + out: mutex_exit(&zilog->zl_issuer_lock); - while ((lwb = list_remove_head(&ilwbs)) != NULL) - zil_lwb_write_issue(zilog, lwb); + int err = 0; + while ((lwb = list_remove_head(&ilwbs)) != NULL) { + if (err == 0) + err = zil_lwb_write_issue(zilog, lwb); + } list_destroy(&ilwbs); return (wtxg); } @@ -3273,7 +3542,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * hasn't been issued. */ zil_burst_done(zilog); - lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); @@ -3349,7 +3618,7 @@ zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) * commit itxs. When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be - * marked done until zil_commit_waiter_skip() is called via + * marked done until zil_commit_waiter_done() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the @@ -3427,7 +3696,7 @@ zil_alloc_commit_waiter(void) list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; - zcw->zcw_zio_error = 0; + zcw->zcw_error = 0; return (zcw); } @@ -3436,7 +3705,7 @@ static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT0P(zcw->zcw_lwb); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); @@ -3473,6 +3742,99 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) } /* + * Crash the ZIL. This is something like suspending, but abandons the ZIL + * without further IO until the wanted txg completes. No effort is made to + * close the on-disk chain or do any other on-disk work, as the pool may + * have suspended. zil_sync() will handle cleanup as normal and restart the + * ZIL once enough txgs have passed. + */ +static void +zil_crash(zilog_t *zilog) +{ + mutex_enter(&zilog->zl_lock); + + uint64_t txg = spa_syncing_txg(zilog->zl_spa); + uint64_t restart_txg = + spa_syncing_txg(zilog->zl_spa) + TXG_CONCURRENT_STATES; + + if (zilog->zl_restart_txg > 0) { + /* + * If the ZIL is already crashed, it's almost certainly because + * we lost a race involving multiple callers from + * zil_commit_impl(). + */ + + /* + * This sanity check is to support my understanding that in the + * event of multiple callers to zil_crash(), only one of them + * can possibly be in the codepath to issue lwbs; the rest + * should be calling from zil_commit_impl() after their waiters + * have completed. As I understand it, a second thread trying + * to issue will eventually wait on zl_issuer_lock, and then + * have no work to do and leave. + * + * If more lwbs had been created an issued between zil_crash() + * calls, then we probably just need to take those too, add + * them to the crash list and clean them up, but it complicates + * this function and I don't think it can happend. + */ + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + + mutex_exit(&zilog->zl_lock); + return; + } + + zilog->zl_restart_txg = restart_txg; + + /* + * Capture any live LWBs. 
Depending on the state of the pool they may + * represent in-flight IO that won't return for some time, and we want + * to make sure they don't get in the way of normal ZIL operation. + */ + ASSERT(list_is_empty(&zilog->zl_lwb_crash_list)); + list_move_tail(&zilog->zl_lwb_crash_list, &zilog->zl_lwb_list); + + /* + * Run through the LWB list; erroring all itxes and signalling error + * to all waiters. + */ + for (lwb_t *lwb = list_head(&zilog->zl_lwb_crash_list); lwb != NULL; + lwb = list_next(&zilog->zl_lwb_crash_list, lwb)) { + ASSERT(!(lwb->lwb_flags & LWB_FLAG_CRASHED)); + lwb->lwb_flags |= LWB_FLAG_CRASHED; + + itx_t *itx; + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) + zil_itx_destroy(itx, EIO); + + zil_commit_waiter_t *zcw; + while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { + mutex_enter(&zcw->zcw_lock); + zcw->zcw_lwb = NULL; + zcw->zcw_error = EIO; + zcw->zcw_done = B_TRUE; + cv_broadcast(&zcw->zcw_cv); + mutex_exit(&zcw->zcw_lock); + } + } + + /* + * Zero the ZIL header bp after the ZIL restarts. We'll free it in + * zil_clean() when we clean up the lwbs. + */ + zil_header_t *zh = zil_header_in_syncing_context(zilog); + BP_ZERO(&zh->zh_log); + + /* + * Mark this ZIL dirty on the next txg, so that zil_clean() will be + * called for cleanup. + */ + zilog_dirty(zilog, txg+1); + + mutex_exit(&zilog->zl_lock); +} + +/* * Commit ZFS Intent Log transactions (itxs) to stable storage. * * When writing ZIL transactions to the on-disk representation of the @@ -3587,9 +3949,17 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) * but the order in which they complete will be the same order in * which they were created. */ -void +static int zil_commit_impl(zilog_t *zilog, uint64_t foid); + +int zil_commit(zilog_t *zilog, uint64_t foid) { + return (zil_commit_flags(zilog, foid, ZIL_COMMIT_FAILMODE)); +} + +int +zil_commit_flags(zilog_t *zilog, uint64_t foid, zil_commit_flag_t flags) +{ /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: @@ -3606,7 +3976,7 @@ zil_commit(zilog_t *zilog, uint64_t foid) ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) - return; + return (0); if (!spa_writeable(zilog->zl_spa)) { /* @@ -3617,10 +3987,23 @@ zil_commit(zilog_t *zilog, uint64_t foid) * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); - ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); + ASSERT0P(zilog->zl_last_lwb_opened); for (int i = 0; i < TXG_SIZE; i++) - ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); - return; + ASSERT0P(zilog->zl_itxg[i].itxg_itxs); + return (0); + } + + int err = 0; + + /* + * If the ZIL crashed, bypass it entirely, and rely on txg_wait_sync() + * to get the data out to disk. + */ + if (zilog->zl_restart_txg > 0) { + ZIL_STAT_BUMP(zilog, zil_commit_crash_count); + err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); + goto out; } /* @@ -3632,14 +4015,43 @@ zil_commit(zilog_t *zilog, uint64_t foid) */ if (zilog->zl_suspend > 0) { ZIL_STAT_BUMP(zilog, zil_commit_suspend_count); - txg_wait_synced(zilog->zl_dmu_pool, 0); - return; + err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); + if (err != 0) { + ASSERT3U(err, ==, ESHUTDOWN); + zil_crash(zilog); + } + goto out; } - zil_commit_impl(zilog, foid); + err = zil_commit_impl(zilog, foid); + +out: + if (err == 0) + return (0); + + /* + * The ZIL write failed and the pool is suspended. 
There's nothing else + * we can do except return or block. + */ + ASSERT3U(err, ==, ESHUTDOWN); + + /* + * Return error if failmode=continue or caller will handle directly. + */ + if (!(flags & ZIL_COMMIT_FAILMODE) || + spa_get_failmode(zilog->zl_spa) == ZIO_FAILURE_MODE_CONTINUE) + return (SET_ERROR(EIO)); + + /* + * Block until the pool returns. We assume that the data will make + * it out to disk in the end, and so return success. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + return (0); } -void +static int zil_commit_impl(zilog_t *zilog, uint64_t foid) { ZIL_STAT_BUMP(zilog, zil_commit_count); @@ -3676,7 +4088,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); - if (zcw->zcw_zio_error != 0) { + int err = 0; + if (zcw->zcw_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to @@ -3688,13 +4101,29 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) ZIL_STAT_BUMP(zilog, zil_commit_error_count); DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); - txg_wait_synced(zilog->zl_dmu_pool, 0); + err = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); } else if (wtxg != 0) { ZIL_STAT_BUMP(zilog, zil_commit_suspend_count); - txg_wait_synced(zilog->zl_dmu_pool, wtxg); + err = txg_wait_synced_flags(zilog->zl_dmu_pool, wtxg, + TXG_WAIT_SUSPEND); } zil_free_commit_waiter(zcw); + + if (err == 0) + return (0); + + /* + * ZIL write failed and pool failed in the fallback to + * txg_wait_synced_flags(). Right now we have no idea if the data is on + * disk and the pool is probably suspended so we have no idea when it's + * coming back. All we can do is shut down and return error to the + * caller. 
+ */ + ASSERT3U(err, ==, ESHUTDOWN); + zil_crash(zilog); + return (err); } /* @@ -3720,7 +4149,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) mutex_enter(&zilog->zl_lock); - ASSERT(zilog->zl_stop_sync == 0); + ASSERT0(zilog->zl_stop_sync); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); @@ -3795,7 +4224,7 @@ zil_lwb_cons(void *vbuf, void *unused, int kmflag) offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); - mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lwb->lwb_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } @@ -3804,7 +4233,7 @@ zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; - mutex_destroy(&lwb->lwb_vdev_lock); + mutex_destroy(&lwb->lwb_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); @@ -3890,6 +4319,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); + list_create(&zilog->zl_lwb_crash_list, sizeof (lwb_t), + offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); @@ -3914,9 +4345,12 @@ zil_free(zilog_t *zilog) ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); + ASSERT0(zilog->zl_restart_txg); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); + ASSERT(list_is_empty(&zilog->zl_lwb_crash_list)); + list_destroy(&zilog->zl_lwb_crash_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); @@ -3952,8 +4386,8 @@ zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); - ASSERT3P(zilog->zl_get_data, ==, NULL); - ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); + ASSERT0P(zilog->zl_get_data); + ASSERT0P(zilog->zl_last_lwb_opened); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; @@ -3972,7 +4406,8 @@ zil_close(zilog_t *zilog) uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { - zil_commit(zilog, 0); + if (zil_commit_flags(zilog, 0, ZIL_COMMIT_NOW) != 0) + txg_wait_synced(zilog->zl_dmu_pool, 0); } else { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); @@ -4021,7 +4456,7 @@ zil_close(zilog_t *zilog) if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + ASSERT0P(lwb->lwb_buf); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); @@ -4073,6 +4508,17 @@ zil_suspend(const char *osname, void **cookiep) return (SET_ERROR(EBUSY)); } + if (zilog->zl_restart_txg > 0) { + /* + * ZIL crashed. It effectively _is_ suspended, but callers + * are usually trying to make sure it's empty on-disk, which + * we can't guarantee right now. + */ + mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); + return (SET_ERROR(EBUSY)); + } + /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume @@ -4101,11 +4547,16 @@ zil_suspend(const char *osname, void **cookiep) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); - if (cookiep == NULL) + if (zilog->zl_restart_txg > 0) { + /* ZIL crashed while we were waiting. 
*/ + zil_resume(os); + error = SET_ERROR(EBUSY); + } else if (cookiep == NULL) zil_resume(os); else *cookiep = os; - return (0); + + return (error); } /* @@ -4146,17 +4597,34 @@ zil_suspend(const char *osname, void **cookiep) * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. + * + * However, zil_commit_impl() itself can return an error if any of the + * lwbs fail, or the pool suspends in the fallback + * txg_wait_sync_flushed(), which affects what we do next, so we + * capture that error. */ - zil_commit_impl(zilog, 0); + error = zil_commit_impl(zilog, 0); + if (error == ESHUTDOWN) + /* zil_commit_impl() has called zil_crash() already */ + error = SET_ERROR(EBUSY); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). */ - txg_wait_synced(zilog->zl_dmu_pool, 0); + if (error == 0) { + error = txg_wait_synced_flags(zilog->zl_dmu_pool, 0, + TXG_WAIT_SUSPEND); + if (error != 0) { + ASSERT3U(error, ==, ESHUTDOWN); + zil_crash(zilog); + error = SET_ERROR(EBUSY); + } + } - zil_destroy(zilog, B_FALSE); + if (error == 0) + zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; @@ -4170,7 +4638,8 @@ zil_suspend(const char *osname, void **cookiep) zil_resume(os); else *cookiep = os; - return (0); + + return (error); } void @@ -4333,7 +4802,7 @@ zil_replay(objset_t *os, void *arg, zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); - ASSERT(zilog->zl_replay_blks == 0); + ASSERT0(zilog->zl_replay_blks); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg, B_TRUE); vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); @@ -4418,3 +4887,9 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, "Limit in bytes WR_COPIED size"); + +ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW, + "Largest write size to store data into ZIL"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, special_is_slog, INT, ZMOD_RW, + "Treat special vdevs as SLOG"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 6d7bce8b0e10..4cf8912d4269 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -339,8 +339,8 @@ zio_fini(void) } for (size_t i = 0; i < n; i++) { - VERIFY3P(zio_buf_cache[i], ==, NULL); - VERIFY3P(zio_data_buf_cache[i], ==, NULL); + VERIFY0P(zio_buf_cache[i]); + VERIFY0P(zio_data_buf_cache[i]); } if (zio_ksp != NULL) { @@ -692,7 +692,7 @@ error: zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, - BP_GET_LOGICAL_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } @@ -771,7 +771,7 @@ zio_add_child_impl(zio_t *pio, zio_t *cio, boolean_t first) else mutex_enter(&cio->io_lock); - ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + ASSERT0(pio->io_state[ZIO_WAIT_DONE]); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) @@ -821,7 +821,7 @@ zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); - ASSERT(zio->io_stall == NULL); + ASSERT0P(zio->io_stall); for (int c = 0; c < 
ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; @@ -850,15 +850,9 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); - pio->io_reexecute |= zio->io_reexecute; + pio->io_post |= zio->io_post; ASSERT3U(*countp, >, 0); - /* - * Propogate the Direct I/O checksum verify failure to the parent. - */ - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) - pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; - (*countp)--; if (*countp == 0 && pio->io_stall == countp) { @@ -961,8 +955,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); - ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); - ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); + ASSERT0(P2PHASE(psize, SPA_MINBLOCKSIZE)); + ASSERT0(P2PHASE(offset, SPA_MINBLOCKSIZE)); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); @@ -1110,7 +1104,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " - "pad=%#llx,%#llx " + "prop2=%#llx " + "pad=%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " @@ -1123,9 +1118,9 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, - (long long)bp->blk_pad[0], - (long long)bp->blk_pad[1], - (long long)BP_GET_PHYSICAL_BIRTH(bp), + (long long)bp->blk_prop2, + (long long)bp->blk_pad, + (long long)BP_GET_RAW_PHYSICAL_BIRTH(bp), (long long)BP_GET_LOGICAL_BIRTH(bp), (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], @@ -1340,7 +1335,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, { zio_t *zio; - zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp, + zio = zio_create(pio, spa, BP_GET_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? @@ -1456,7 +1451,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { - VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); + VERIFY0P(zio_free_sync(NULL, spa, txg, bp, 0)); } } @@ -1564,7 +1559,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, { zio_t *zio; - ASSERT(vd->vdev_children == 0); + ASSERT0(vd->vdev_children); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); @@ -1585,7 +1580,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, { zio_t *zio; - ASSERT(vd->vdev_children == 0); + ASSERT0(vd->vdev_children); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); @@ -1649,7 +1644,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * through the mirror during self healing. See comment in * vdev_mirror_io_done() for more details. 
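As I read this patch, the move from io_flags/io_reexecute to io_post separates "how the zio was issued" from "what must happen once it completes"; the post bits are OR-ed into the parent in zio_notify_parent() above. A small sketch of the distinction, using only bit names that appear in the patch:

	/* io_post: accumulated while running, acted on at completion. */
	boolean_t dio_bad_data =
	    (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) != 0;
	boolean_t needs_followup =
	    (zio->io_post & (ZIO_POST_REEXECUTE | ZIO_POST_SUSPEND)) != 0;
	/* io_flags, by contrast, are fixed at issue time (how to run). */
	boolean_t speculative =
	    (zio->io_flags & ZIO_FLAG_SPECULATIVE) != 0;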
*/ - ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); + ASSERT0(pio->io_post & ZIO_POST_DIO_CHKSUM_ERR); } else if (type == ZIO_TYPE_WRITE && pio->io_prop.zp_direct_write == B_TRUE) { /* @@ -1685,7 +1680,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ - if (flags & ZIO_FLAG_IO_ALLOCATING && + if (flags & ZIO_FLAG_ALLOC_THROTTLED && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); @@ -1695,7 +1690,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); - flags &= ~ZIO_FLAG_IO_ALLOCATING; + flags &= ~ZIO_FLAG_ALLOC_THROTTLED; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, @@ -1752,7 +1747,7 @@ zio_flush(zio_t *pio, vdev_t *vd) void zio_shrink(zio_t *zio, uint64_t size) { - ASSERT3P(zio->io_executor, ==, NULL); + ASSERT0P(zio->io_executor); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); @@ -1860,7 +1855,7 @@ zio_write_bp_init(zio_t *zio) blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; - ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); + ASSERT(BP_GET_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -1946,9 +1941,9 @@ zio_write_compress(zio_t *zio) } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); - ASSERT(zio->io_bp_override == NULL); + ASSERT0P(zio->io_bp_override); - if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { + if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to @@ -2085,7 +2080,7 @@ zio_write_compress(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. 
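Several asserts in these hunks switch from BP_GET_LOGICAL_BIRTH() to BP_GET_BIRTH(). My understanding from context (an assumption, these macros are not defined in this diff) is that the logical birth is the txg the data was last logically written, while BP_GET_BIRTH() yields the txg this particular copy was physically written, falling back to the logical birth when no separate physical birth is recorded.

	uint64_t logical = BP_GET_LOGICAL_BIRTH(bp);	/* e.g. 1000 */
	uint64_t written = BP_GET_BIRTH(bp);		/* e.g. 5000 after a rewrite */
	/*
	 * A freshly written block with no separate physical birth reports
	 * the same txg from both; the asserts in this hunk compare
	 * `written`, not `logical`, against zio->io_txg.
	 */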
*/ - if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && + if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); @@ -2441,7 +2436,7 @@ __zio_execute(zio_t *zio) ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); - ASSERT(zio->io_stall == NULL); + ASSERT0P(zio->io_stall); do { stage <<= 1; @@ -2514,7 +2509,7 @@ zio_wait(zio_t *zio) int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); - ASSERT3P(zio->io_executor, ==, NULL); + ASSERT0P(zio->io_executor); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); @@ -2556,7 +2551,7 @@ zio_nowait(zio_t *zio) if (zio == NULL) return; - ASSERT3P(zio->io_executor, ==, NULL); + ASSERT0P(zio->io_executor); if (zio->io_child_type == ZIO_CHILD_LOGICAL && list_is_empty(&zio->io_parent_list)) { @@ -2595,14 +2590,14 @@ zio_reexecute(void *arg) ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); - ASSERT(pio->io_gang_leader == NULL); - ASSERT(pio->io_gang_tree == NULL); + ASSERT0P(pio->io_gang_leader); + ASSERT0P(pio->io_gang_tree); mutex_enter(&pio->io_lock); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; - pio->io_reexecute = 0; + pio->io_post = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; @@ -2694,7 +2689,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio_unique_parent(zio) == NULL); + ASSERT0P(zio_unique_parent(zio)); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } @@ -2749,11 +2744,14 @@ zio_resume_wait(spa_t *spa) * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * - * A gang block consists of a gang header (zio_gbh_phys_t) and up to - * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like - * an indirect block: it's an array of block pointers. It consumes - * only one sector and hence is allocatable regardless of fragmentation. - * The gang header's bps point to its gang members, which hold the data. + * A gang block consists of a a gang header and up to gbh_nblkptrs(size) + * gang members. The gang header is like an indirect block: it's an array + * of block pointers, though the header has a small tail (a zio_eck_t) + * that stores an embedded checksum. It is allocated using only a single + * sector as the requested size, and hence is allocatable regardless of + * fragmentation. Its size is determined by the smallest allocatable + * asize of the vdevs it was allocated on. The gang header's bps point + * to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> * as the verifier to ensure uniqueness of the SHA256 checksum. 
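The capacity implied by the rewritten description above can be made concrete. The arithmetic assumes the usual 128-byte blkptr_t and 40-byte zio_eck_t and a gbh_nblkptrs() of the form shown; this helper is a sketch, the real macro in the headers is authoritative.

	static inline uint64_t
	example_gbh_nblkptrs(uint64_t gangblocksize)
	{
		/* The blkptr array fills the header up to the zio_eck_t tail. */
		return ((gangblocksize - sizeof (zio_eck_t)) / sizeof (blkptr_t));
	}
	/* 512-byte classic header: (512 - 40) / 128  = 3 children (SPA_GBH_NBLKPTRS) */
	/* 4K header on ashift=12:  (4096 - 40) / 128 = 31 children                  */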
@@ -2832,10 +2830,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, if (gn != NULL) { abd_t *gbh_abd = - abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute @@ -2906,14 +2904,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * -zio_gang_node_alloc(zio_gang_node_t **gnpp) +zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize) { zio_gang_node_t *gn; - ASSERT(*gnpp == NULL); + ASSERT0P(*gnpp); - gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); - gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + gn = kmem_zalloc(sizeof (*gn) + + (gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP); + gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize; + gn->gn_gbh = zio_buf_alloc(gangblocksize); *gnpp = gn; return (gn); @@ -2924,11 +2924,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) - ASSERT(gn->gn_child[g] == NULL); + for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++) + ASSERT0P(gn->gn_child[g]); - zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); - kmem_free(gn, sizeof (*gn)); + zio_buf_free(gn->gn_gbh, gn->gn_allocsize); + kmem_free(gn, sizeof (*gn) + + (gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn))); *gnpp = NULL; } @@ -2940,7 +2941,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) if (gn == NULL) return; - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); @@ -2949,13 +2950,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { - zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + uint64_t gangblocksize = UINT64_MAX; + if (spa_feature_is_active(gio->io_spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)) { + spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER); + for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) { + vdev_t *vd = vdev_lookup_top(gio->io_spa, + DVA_GET_VDEV(&bp->blk_dva[dva])); + uint64_t psize = vdev_gang_header_psize(vd); + gangblocksize = MIN(gangblocksize, psize); + } + spa_config_exit(gio->io_spa, SCL_VDEV, FTAG); + } else { + gangblocksize = SPA_OLD_GANGBLOCKSIZE; + } + ASSERT3U(gangblocksize, !=, UINT64_MAX); + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } @@ -2978,13 +2994,17 @@ zio_gang_tree_assemble_done(zio_t *zio) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + /* + * If 
this was an old-style gangblock, the gangblocksize should have + * been updated in zio_checksum_error to reflect that. + */ + ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic, + ==, ZEC_MAGIC); abd_free(zio->io_abd); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { + blkptr_t *gbp = gbh_bp(gn->gn_gbh, g); if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); @@ -3009,10 +3029,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + ASSERT3U(gbh_eck(gn->gn_gbh, + gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { + blkptr_t *gbp = gbh_bp(gn->gn_gbh, g); if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, @@ -3119,6 +3140,13 @@ zio_write_gang_done(zio_t *zio) abd_free(zio->io_abd); } +static void +zio_update_feature(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx); +} + static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { @@ -3157,20 +3185,24 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_GANG_HEADER; - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; } - error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE; + uint64_t candidate = gangblocksize; + error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio->io_allocator, pio); + &pio->io_alloc_list, pio->io_allocator, pio, &candidate); if (error) { pio->io_error = error; return (pio); } + if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + gangblocksize = candidate; if (pio == gio) { gnpp = &gio->io_gang_tree; @@ -3179,23 +3211,24 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) ASSERT(pio->io_ready == zio_write_gang_member_ready); } - gn = zio_gang_node_alloc(gnpp); + gn = zio_gang_node_alloc(gnpp, gangblocksize); gbh = gn->gn_gbh; - memset(gbh, 0, SPA_GANGBLOCKSIZE); - gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); + memset(gbh, 0, gangblocksize); + gbh_abd = abd_get_from_buf(gbh, gangblocksize); /* * Create the gang header. */ - zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(pio, zio); - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) { boolean_t more; - VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies, - zio, B_TRUE, &more)); + VERIFY(metaslab_class_throttle_reserve(mc, zio->io_allocator, + gbh_copies, zio->io_size, B_TRUE, &more)); + zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED; } /* @@ -3203,7 +3236,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) * opportunistic allocations. 
If that fails to generate enough * space, we fall back to normal zio_write calls for nested gang. */ - for (int g = 0; resid != 0; g++) { + int g; + boolean_t any_failed = B_FALSE; + for (g = 0; resid != 0; g++) { flags &= METASLAB_ASYNC_ALLOC; flags |= METASLAB_GANG_CHILD; zp.zp_checksum = gio->io_prop.zp_checksum; @@ -3224,9 +3259,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); uint64_t min_size = zio_roundup_alloc_size(spa, - resid / (SPA_GBH_NBLKPTRS - g)); + resid / (gbh_nblkptrs(gangblocksize) - g)); min_size = MIN(min_size, resid); - bp = &gbh->zg_blkptr[g]; + bp = &((blkptr_t *)gbh)[g]; zio_alloc_list_t cio_list; metaslab_trace_init(&cio_list); @@ -3236,6 +3271,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) flags, &cio_list, zio->io_allocator, NULL, &allocated_size); boolean_t allocated = error == 0; + any_failed |= !allocated; uint64_t psize = allocated ? MIN(resid, allocated_size) : min_size; @@ -3268,6 +3304,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) } /* + * If we used more gang children than the old limit, we must already be + * using the new headers. No need to update anything, just move on. + * + * Otherwise, we might be in a case where we need to turn on the new + * feature, so we check that. We enable the new feature if we didn't + * manage to fit everything into 3 gang children and we could have + * written more than that. + */ + if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)); + } else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE && + spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) && + !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) { + dmu_tx_t *tx = + dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1); + dsl_sync_task_nowait(spa->spa_dsl_pool, + zio_update_feature, + (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx); + dmu_tx_commit(tx); + } + + /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -3303,11 +3362,11 @@ zio_nop_write(zio_t *zio) zio_prop_t *zp = &zio->io_prop; ASSERT(BP_IS_HOLE(bp)); - ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT0(BP_GET_LEVEL(bp)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); - ASSERT(zio->io_bp_override == NULL); + ASSERT0P(zio->io_bp_override); ASSERT(IO_IS_ALLOCATING(zio)); /* @@ -3436,7 +3495,7 @@ zio_ddt_read_start(zio_t *zio) ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; - ASSERT(zio->io_vsd == NULL); + ASSERT0P(zio->io_vsd); zio->io_vsd = dde; if (v_self == DDT_PHYS_NONE) @@ -3501,7 +3560,7 @@ zio_ddt_read_done(zio_t *zio) zio->io_vsd = NULL; } - ASSERT(zio->io_vsd == NULL); + ASSERT0P(zio->io_vsd); return (zio); } @@ -3836,7 +3895,7 @@ zio_ddt_write(zio_t *zio) * block and leave. */ if (have_dvas == 0) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); + ASSERT(BP_GET_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_extend(ddp, v, bp); ddt_phys_addref(ddp, v); @@ -3864,6 +3923,23 @@ zio_ddt_write(zio_t *zio) * then we can just use them as-is. */ if (have_dvas >= need_dvas) { + /* + * For rewrite operations, try preserving the original + * logical birth time. If the result matches the + * original BP, this becomes a NOP. 
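What "becomes a NOP" means in practice, as I read the surrounding zio code: the zio is flagged nopwrite, so no new DVAs are allocated and the block pointer handed back is the original one. A minimal sketch of the resulting invariant, illustrative only:

	if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
		/* Same DVAs, same birth txgs, same checksum as before. */
		ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
	}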
+ */ + if (zp->zp_rewrite) { + uint64_t orig_logical_birth = + BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig); + ddt_bp_fill(ddp, v, bp, orig_logical_birth); + if (BP_EQUAL(bp, &zio->io_bp_orig)) { + /* We can skip accounting. */ + zio->io_flags |= ZIO_FLAG_NOPWRITE; + ddt_exit(ddt); + return (zio); + } + } + ddt_bp_fill(ddp, v, bp, txg); ddt_phys_addref(ddp, v); ddt_exit(ddt); @@ -4078,9 +4154,11 @@ zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more) * reserve then we throttle. */ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, - zio->io_prop.zp_copies, zio, B_FALSE, more)) { + zio->io_allocator, zio->io_prop.zp_copies, zio->io_size, + B_FALSE, more)) { return (NULL); } + zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED; avl_remove(&mca->mca_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); @@ -4164,8 +4242,10 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG); memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva, 3 * sizeof (dva_t)); - BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig), - BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig)); + BP_SET_LOGICAL_BIRTH(zio->io_bp, + BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig)); + BP_SET_PHYSICAL_BIRTH(zio->io_bp, + BP_GET_RAW_PHYSICAL_BIRTH(&zio->io_bp_orig)); return (zio); } @@ -4236,13 +4316,14 @@ again: * If we are holding old class reservation, drop it. * Dispatch the next ZIO(s) there if some are waiting. */ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) { if (metaslab_class_throttle_unreserve(mc, - zio->io_prop.zp_copies, zio)) { + zio->io_allocator, zio->io_prop.zp_copies, + zio->io_size)) { zio_allocate_dispatch(zio->io_metaslab_class, zio->io_allocator); } - zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; + zio->io_flags &= ~ZIO_FLAG_ALLOC_THROTTLED; } if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { @@ -4291,6 +4372,15 @@ again: error); } zio->io_error = error; + } else if (zio->io_prop.zp_rewrite) { + /* + * For rewrite operations, preserve the logical birth time + * but set the physical birth time to the current txg. 
+ */ + uint64_t logical_birth = BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig); + ASSERT3U(logical_birth, <=, zio->io_txg); + BP_SET_BIRTH(zio->io_bp, logical_birth, zio->io_txg); + BP_SET_REWRITE(zio->io_bp, 1); } return (zio); @@ -4324,18 +4414,17 @@ zio_dva_claim(zio_t *zio) static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); - ASSERT(zio->io_bp_override == NULL); + ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT0P(zio->io_bp_override); if (!BP_IS_HOLE(bp)) { - metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), - B_TRUE); + metaslab_free(zio->io_spa, bp, BP_GET_BIRTH(bp), B_TRUE); } if (gn != NULL) { - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { zio_dva_unallocate(zio, gn->gn_child[g], - &gn->gn_gbh->zg_blkptr[g]); + gbh_bp(gn->gn_gbh, g)); } } } @@ -4345,12 +4434,15 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, - uint64_t size, boolean_t *slog) + uint64_t min_size, uint64_t max_size, boolean_t *slog, + boolean_t allow_larger) { - int error = 1; + int error; zio_alloc_list_t io_alloc_list; + uint64_t alloc_size = 0; ASSERT(txg > spa_syncing_txg(spa)); + ASSERT3U(min_size, <=, max_size); metaslab_trace_init(&io_alloc_list); @@ -4359,7 +4451,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); - BP_SET_PSIZE(new_bp, size); + BP_SET_PSIZE(new_bp, max_size); BP_SET_LEVEL(new_bp, 0); /* @@ -4372,25 +4464,53 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; ZIOSTAT_BUMP(ziostat_total_allocations); - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, NULL, flags, &io_alloc_list, allocator, NULL); + + /* Try log class (dedicated slog devices) first */ + error = metaslab_alloc_range(spa, spa_log_class(spa), min_size, + max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, + NULL, &alloc_size); *slog = (error == 0); + + /* Try special_embedded_log class (reserved on special vdevs) */ if (error != 0) { - error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, + error = metaslab_alloc_range(spa, + spa_special_embedded_log_class(spa), min_size, max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, - NULL); + NULL, &alloc_size); } + + /* Try special class (general special vdev allocation) */ + if (error != 0) { + error = metaslab_alloc_range(spa, spa_special_class(spa), + min_size, max_size, new_bp, 1, txg, NULL, flags, + &io_alloc_list, allocator, NULL, &alloc_size); + } + + /* Try embedded_log class (reserved on normal vdevs) */ + if (error != 0) { + error = metaslab_alloc_range(spa, spa_embedded_log_class(spa), + min_size, max_size, new_bp, 1, txg, NULL, flags, + &io_alloc_list, allocator, NULL, &alloc_size); + } + + /* Finally fall back to normal class */ if (error != 0) { ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); - error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, - NULL); + error = metaslab_alloc_range(spa, spa_normal_class(spa), + min_size, max_size, new_bp, 1, txg, NULL, flags, + &io_alloc_list, allocator, NULL, &alloc_size); } 
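The fallback chain added above reads more easily condensed into a table-driven loop. This is purely illustrative; the real code keeps the calls separate so the ziostat fallback counter is bumped only before the final normal-class attempt.

	metaslab_class_t *classes[] = {
		spa_log_class(spa),			/* dedicated slog */
		spa_special_embedded_log_class(spa),	/* log space reserved on special vdevs */
		spa_special_class(spa),			/* general special allocation */
		spa_embedded_log_class(spa),		/* log space reserved on normal vdevs */
		spa_normal_class(spa),			/* last resort */
	};
	int i;
	for (i = 0, error = ENOSPC; error != 0 && i < 5; i++) {
		error = metaslab_alloc_range(spa, classes[i], min_size,
		    max_size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
		    allocator, NULL, &alloc_size);
	}
	*slog = (error == 0 && i == 1);	/* only the dedicated log class counts */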
metaslab_trace_fini(&io_alloc_list); if (error == 0) { - BP_SET_LSIZE(new_bp, size); - BP_SET_PSIZE(new_bp, size); + if (!allow_larger) + alloc_size = MIN(alloc_size, max_size); + else if (max_size <= SPA_OLD_MAXBLOCKSIZE) + alloc_size = MIN(alloc_size, SPA_OLD_MAXBLOCKSIZE); + alloc_size = P2ALIGN_TYPED(alloc_size, ZIL_MIN_BLKSZ, uint64_t); + + BP_SET_LSIZE(new_bp, alloc_size); + BP_SET_PSIZE(new_bp, alloc_size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL @@ -4418,8 +4538,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, } } else { zfs_dbgmsg("%s: zil block allocation failure: " - "size %llu, error %d", spa_name(spa), (u_longlong_t)size, - error); + "min_size %llu, max_size %llu, error %d", spa_name(spa), + (u_longlong_t)min_size, (u_longlong_t)max_size, error); } return (error); @@ -4450,8 +4570,8 @@ zio_vdev_io_start(zio_t *zio) zio->io_delay = 0; - ASSERT(zio->io_error == 0); - ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + ASSERT0(zio->io_error); + ASSERT0(zio->io_child_error[ZIO_CHILD_VDEV]); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) @@ -4642,7 +4762,7 @@ zio_vdev_io_done(zio_t *zio) ops->vdev_op_io_done(zio); if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) - VERIFY(vdev_probe(vd, zio) == NULL); + VERIFY0P(vdev_probe(vd, zio)); return (zio); } @@ -4722,7 +4842,7 @@ zio_vdev_io_assess(zio_t *zio) * If a Direct I/O operation has a checksum verify error then this I/O * should not attempt to be issued again. */ - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { + if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) { if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); ASSERT3U(zio->io_error, ==, EIO); @@ -4794,7 +4914,7 @@ void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); - ASSERT(zio->io_error == 0); + ASSERT0(zio->io_error); zio->io_stage >>= 1; } @@ -4811,7 +4931,7 @@ void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); - ASSERT(zio->io_error == 0); + ASSERT0(zio->io_error); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; @@ -5031,7 +5151,7 @@ zio_checksum_verify(zio_t *zio) ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } - ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); + ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR); IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ, !(zio->io_flags & ZIO_FLAG_SPECULATIVE)); @@ -5040,7 +5160,7 @@ zio_checksum_verify(zio_t *zio) if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { if (zio->io_flags & ZIO_FLAG_DIO_READ) { - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; zio_t *pio = zio_unique_parent(zio); /* * Any Direct I/O read that has a checksum @@ -5090,7 +5210,7 @@ zio_dio_checksum_verify(zio_t *zio) if ((error = zio_checksum_error(zio, NULL)) != 0) { zio->io_error = error; if (error == ECKSUM) { - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); } } @@ -5115,7 +5235,7 @@ zio_checksum_verified(zio_t *zio) void zio_dio_chksum_verify_error_report(zio_t *zio) { - ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); + ASSERT(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR); if (zio->io_child_type == ZIO_CHILD_LOGICAL) return; @@ -5187,9 +5307,9 @@ zio_ready(zio_t *zio) if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == 
zio->io_txg || + ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); - ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); + ASSERT0(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY]); zio->io_ready(zio); } @@ -5202,7 +5322,7 @@ zio_ready(zio_t *zio) if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); @@ -5213,8 +5333,8 @@ zio_ready(zio_t *zio) * issue the next I/O to allocate. */ if (metaslab_class_throttle_unreserve( - zio->io_metaslab_class, zio->io_prop.zp_copies, - zio)) { + zio->io_metaslab_class, zio->io_allocator, + zio->io_prop.zp_copies, zio->io_size)) { zio_allocate_dispatch(zio->io_metaslab_class, zio->io_allocator); } @@ -5264,6 +5384,7 @@ zio_dva_throttle_done(zio_t *zio) vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; const void *tag = pio; + uint64_t size = pio->io_size; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); @@ -5273,16 +5394,19 @@ zio_dva_throttle_done(zio_t *zio) ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); - ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); + ASSERT(zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED); /* * Parents of gang children can have two flavors -- ones that allocated * the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that * allocated the constituent blocks. The first use their parent as tag. + * We set the size to match the original allocation call for that case. */ if (pio->io_child_type == ZIO_CHILD_GANG && - (pio->io_flags & ZIO_FLAG_IO_REWRITE)) + (pio->io_flags & ZIO_FLAG_IO_REWRITE)) { tag = zio_unique_parent(pio); + size = SPA_OLD_GANGBLOCKSIZE; + } ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG && (pio->io_flags & ZIO_FLAG_IO_REWRITE))); @@ -5295,9 +5419,10 @@ zio_dva_throttle_done(zio_t *zio) ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, - pio->io_allocator, flags, pio->io_size, tag); + pio->io_allocator, flags, size, tag); - if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio)) { + if (metaslab_class_throttle_unreserve(pio->io_metaslab_class, + pio->io_allocator, 1, pio->io_size)) { zio_allocate_dispatch(zio->io_metaslab_class, pio->io_allocator); } @@ -5328,17 +5453,15 @@ zio_done(zio_t *zio) * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. 
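For orientation, the reservation this comment refers to is taken and released in two different functions after this patch. A condensed sketch of the pairing; both fragments appear in full elsewhere in this diff.

	/* Dispatch side, zio_io_to_allocate(): reserve per logical write. */
	if (metaslab_class_throttle_reserve(zio->io_metaslab_class,
	    zio->io_allocator, zio->io_prop.zp_copies, zio->io_size,
	    B_FALSE, &more))
		zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED;

	/* Completion side, zio_dva_throttle_done(): release and kick the queue. */
	if (metaslab_class_throttle_unreserve(pio->io_metaslab_class,
	    pio->io_allocator, 1, pio->io_size))
		zio_allocate_dispatch(pio->io_metaslab_class, pio->io_allocator);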
*/ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && + if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED && zio->io_child_type == ZIO_CHILD_VDEV) zio_dva_throttle_done(zio); for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) - ASSERT(zio->io_children[c][w] == 0); + ASSERT0(zio->io_children[c][w]); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { - ASSERT(zio->io_bp->blk_pad[0] == 0); - ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); @@ -5431,7 +5554,7 @@ zio_done(zio_t *zio) */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd) && - !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { + !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { @@ -5446,14 +5569,14 @@ zio_done(zio_t *zio) if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && - !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) && + !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark, - BP_GET_LOGICAL_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } @@ -5467,7 +5590,7 @@ zio_done(zio_t *zio) */ if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) && zio->io_prop.zp_dedup) { - zio->io_reexecute |= ZIO_REEXECUTE_NOW; + zio->io_post |= ZIO_POST_REEXECUTE; zio->io_prop.zp_dedup = B_FALSE; } /* @@ -5479,11 +5602,11 @@ zio_done(zio_t *zio) if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL) && - !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { + !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) { if (zio->io_error != ENOSPC) - zio->io_reexecute |= ZIO_REEXECUTE_NOW; + zio->io_post |= ZIO_POST_REEXECUTE; else - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + zio->io_post |= ZIO_POST_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || @@ -5492,10 +5615,11 @@ zio_done(zio_t *zio) zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + zio->io_post |= ZIO_POST_SUSPEND; - if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && + !(zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND))) + zio->io_post |= ZIO_POST_SUSPEND; /* * Here is a possibly good place to attempt to do @@ -5514,7 +5638,8 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - if ((zio->io_error || zio->io_reexecute) && + if ((zio->io_error || + (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND))) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); @@ -5525,16 +5650,16 @@ zio_done(zio_t *zio) * Godfather I/Os should never suspend. 
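[Editor's note] Several hunks in this block and the previous one move the reexecute/suspend decision from the old io_reexecute field to ZIO_POST_REEXECUTE and ZIO_POST_SUSPEND bits in io_post. The conditions are spread across zio_done(); the standalone model below collapses them for a failed logical I/O. Only the flag names come from the patch; the struct, field names, and errno usage are simplifications.

    #include <errno.h>
    #include <stdint.h>

    #define ZIO_POST_REEXECUTE      (1U << 0)
    #define ZIO_POST_SUSPEND        (1U << 1)
    #define ZIO_POST_DIO_CHKSUM_ERR (1U << 2)

    struct miniz {
        int      error;          /* errno-style result of the I/O */
        int      can_fail;       /* ZIO_FLAG_CANFAIL analogue */
        int      is_allocating;  /* IO_IS_ALLOCATING analogue */
        int      dedup;          /* zp_dedup analogue */
        uint32_t post;           /* io_post analogue */
    };

    /*
     * Collapsed model of the error handling above: dedup allocations that
     * hit EAGAIN retry without dedup, must-succeed allocations retry (or
     * suspend on ENOSPC), and anything that may not fail and is not already
     * being retried is suspended.  The ENXIO/failmode case is omitted.
     */
    static void
    zio_done_post_flags(struct miniz *z)
    {
        if (z->error == 0)
            return;
        if (z->error == EAGAIN && z->is_allocating && z->dedup) {
            z->post |= ZIO_POST_REEXECUTE;
            z->dedup = 0;
        }
        if (z->is_allocating && !z->can_fail &&
            !(z->post & ZIO_POST_DIO_CHKSUM_ERR)) {
            if (z->error != ENOSPC)
                z->post |= ZIO_POST_REEXECUTE;
            else
                z->post |= ZIO_POST_SUSPEND;
        }
        if (!z->can_fail &&
            !(z->post & (ZIO_POST_REEXECUTE | ZIO_POST_SUSPEND)))
            z->post |= ZIO_POST_SUSPEND;
    }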
*/ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && - (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) - zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; + (zio->io_post & ZIO_POST_SUSPEND)) + zio->io_post &= ~ZIO_POST_SUSPEND; - if (zio->io_reexecute) { + if (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND)) { /* * A Direct I/O operation that has a checksum verify error * should not attempt to reexecute. Instead, the error should * just be propagated back. */ - ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)); + ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR); /* * This is a logical I/O that wants to reexecute. @@ -5571,7 +5696,7 @@ zio_done(zio_t *zio) pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && - (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { + (zio->io_post & ZIO_POST_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't @@ -5595,13 +5720,14 @@ zio_done(zio_t *zio) * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); - } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { + } else if (zio->io_post & ZIO_POST_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { + ASSERT(zio->io_post & ZIO_POST_REEXECUTE); /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. @@ -5614,7 +5740,8 @@ zio_done(zio_t *zio) } ASSERT(list_is_empty(&zio->io_child_list)); - ASSERT(zio->io_reexecute == 0); + ASSERT0(zio->io_post & ZIO_POST_REEXECUTE); + ASSERT0(zio->io_post & ZIO_POST_SUSPEND); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c index a91775b04af2..1d0646a61185 100644 --- a/sys/contrib/openzfs/module/zfs/zio_checksum.c +++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c @@ -215,7 +215,7 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum) { - VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); + VERIFY0((cksum & ~ZIO_CHECKSUM_MASK)); switch (cksum) { case ZIO_CHECKSUM_BLAKE3: @@ -279,7 +279,7 @@ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = BP_GET_BIRTH(bp); + uint64_t txg = BP_GET_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); @@ -545,14 +545,39 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size; uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; + if (bp && BP_IS_GANG(bp)) { + if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + size = zio->io_size; + else + size = SPA_OLD_GANGBLOCKSIZE; + } + error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); + if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) { + /* + * It's possible that this is an old gang block. Rerun + * the checksum with the old size; if that passes, then + * update the gangblocksize appropriately. 
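[Editor's note] In the zio_checksum.c hunk just above, the verifier for a gang header is derived from the block's identity, and the change swaps its birth component from the logical to the physical birth txg. The toy construction below shows the idea only; the struct layout, field names, and word assignment are stand-ins, not the real blkptr/DVA macros.

    #include <stdint.h>

    /* Stand-in for zio_cksum_t: four 64-bit words. */
    typedef struct {
        uint64_t zc_word[4];
    } cksum_t;

    /* Stand-in for the identity a gang header verifier is derived from. */
    struct blk_identity {
        uint64_t vdev;            /* leading DVA: vdev id */
        uint64_t offset;          /* leading DVA: offset */
        uint64_t physical_birth;  /* txg the block was physically written */
    };

    /*
     * Sketch of the idea behind zio_checksum_gang_verifier(): the verifier
     * ties the embedded checksum to where and when the header was written,
     * so a stale or relocated gang header cannot verify successfully.
     */
    static void
    gang_verifier(cksum_t *zcp, const struct blk_identity *id)
    {
        zcp->zc_word[0] = id->vdev;
        zcp->zc_word[1] = id->offset;
        zcp->zc_word[2] = id->physical_birth;
        zcp->zc_word[3] = 0;
    }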
+ */ + error = zio_checksum_error_impl(spa, bp, checksum, data, + SPA_OLD_GANGBLOCKSIZE, offset, info); + if (error == 0) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + zio_t *pio; + for (pio = zio_unique_parent(zio); + pio->io_child_type != ZIO_CHILD_GANG; + pio = zio_unique_parent(pio)) + ; + zio_gang_node_t *gn = pio->io_private; + gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE; + } + } if (zio_injection_enabled && error == 0 && zio->io_error == 0) { error = zio_handle_fault_injection(zio, ECKSUM); diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c index 9f0ac1b63146..89ceeb58ad91 100644 --- a/sys/contrib/openzfs/module/zfs/zio_compress.c +++ b/sys/contrib/openzfs/module/zfs/zio_compress.c @@ -38,12 +38,6 @@ #include <sys/zstd/zstd.h> /* - * If nonzero, every 1/X decompression attempts will fail, simulating - * an undetected memory error. - */ -static unsigned long zio_decompress_fail_fraction = 0; - -/* * Compression vectors. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { @@ -171,15 +165,6 @@ zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *dst, else err = ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); - /* - * Decompression shouldn't fail, because we've already verified - * the checksum. However, for extra protection (e.g. against bitflips - * in non-ECC RAM), we handle this error (and test it). - */ - if (zio_decompress_fail_fraction != 0 && - random_in_range(zio_decompress_fail_fraction) == 0) - err = SET_ERROR(EINVAL); - return (err); } diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c index df7b01ba879e..981a1be4847c 100644 --- a/sys/contrib/openzfs/module/zfs/zio_inject.c +++ b/sys/contrib/openzfs/module/zfs/zio_inject.c @@ -1119,7 +1119,7 @@ zio_clear_fault(int id) kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * handler->zi_record.zi_nlanes); } else { - ASSERT3P(handler->zi_lanes, ==, NULL); + ASSERT0P(handler->zi_lanes); } if (handler->zi_spa_name != NULL) diff --git a/sys/contrib/openzfs/module/zfs/zrlock.c b/sys/contrib/openzfs/module/zfs/zrlock.c index 3c0f1b7bbbc1..09c110945c97 100644 --- a/sys/contrib/openzfs/module/zfs/zrlock.c +++ b/sys/contrib/openzfs/module/zfs/zrlock.c @@ -129,7 +129,7 @@ zrl_tryenter(zrlock_t *zrl) (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); if (cas == 0) { #ifdef ZFS_DEBUG - ASSERT3P(zrl->zr_owner, ==, NULL); + ASSERT0P(zrl->zr_owner); zrl->zr_owner = curthread; #endif return (1); diff --git a/sys/contrib/openzfs/module/zfs/zthr.c b/sys/contrib/openzfs/module/zfs/zthr.c index 597a510528ea..d245ce4946e0 100644 --- a/sys/contrib/openzfs/module/zfs/zthr.c +++ b/sys/contrib/openzfs/module/zfs/zthr.c @@ -316,7 +316,7 @@ zthr_destroy(zthr_t *t) { ASSERT(!MUTEX_HELD(&t->zthr_state_lock)); ASSERT(!MUTEX_HELD(&t->zthr_request_lock)); - VERIFY3P(t->zthr_thread, ==, NULL); + VERIFY0P(t->zthr_thread); mutex_destroy(&t->zthr_request_lock); mutex_destroy(&t->zthr_state_lock); cv_destroy(&t->zthr_cv); diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 3568d4f43fcb..faced0db7e9e 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -38,25 +38,36 @@ * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. 
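[Editor's note] The zio_checksum_error() change at the top of this block verifies a gang header at the dynamic-header size first and, if that fails, retries at the legacy size and records which size actually passed. Below is a generic sketch of that try-new-then-fall-back-to-old pattern; the verify callback and the 512-byte legacy size are placeholders, not the ZFS definitions.

    #include <stddef.h>

    /* Legacy header size used by the fallback; illustrative value only. */
    #define OLD_HDR_SIZE 512U

    /* Placeholder verifier: 0 on success, nonzero (ECKSUM-style) on mismatch. */
    typedef int (*verify_fn_t)(const void *buf, size_t size);

    /*
     * Verify a header that may have been written with either the new
     * (larger) layout or the legacy layout.  On a legacy match, report the
     * size that actually verified so the caller can remember it, as the
     * hunk above does by storing SPA_OLD_GANGBLOCKSIZE in the gang node.
     */
    static int
    verify_either_size(verify_fn_t verify, const void *buf,
        size_t new_size, size_t *verified_size)
    {
        int error = verify(buf, new_size);

        if (error != 0 && new_size > OLD_HDR_SIZE) {
            error = verify(buf, OLD_HDR_SIZE);
            if (error == 0)
                new_size = OLD_HDR_SIZE;
        }
        if (error == 0)
            *verified_size = new_size;
        return (error);
    }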
*/ /* * Note on locking of zvol state structures. * - * These structures are used to maintain internal state used to emulate block - * devices on top of zvols. In particular, management of device minor number - * operations - create, remove, rename, and set_snapdev - involves access to - * these structures. The zvol_state_lock is primarily used to protect the - * zvol_state_list. The zv->zv_state_lock is used to protect the contents - * of the zvol_state_t structures, as well as to make sure that when the - * time comes to remove the structure from the list, it is not in use, and - * therefore, it can be taken off zvol_state_list and freed. + * zvol_state_t represents the connection between a single dataset + * (DMU_OST_ZVOL) and the device "minor" (some OS-specific representation of a + * "disk" or "device" or "volume", eg, a /dev/zdXX node, a GEOM object, etc). * - * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, - * e.g. for the duration of receive and rollback operations. This lock can be - * held for significant periods of time. Given that it is undesirable to hold - * mutexes for long periods of time, the following lock ordering applies: + * The global zvol_state_lock is used to protect access to zvol_state_list and + * zvol_htable, which are the primary way to obtain a zvol_state_t from a name. + * It should not be used for anything not name-relateds, and you should avoid + * sleeping or waiting while its held. See zvol_find_by_name(), zvol_insert(), + * zvol_remove(). + * + * The zv_state_lock is used to protect the contents of the associated + * zvol_state_t. Most of the zvol_state_t is dedicated to control and + * configuration; almost none of it is needed for data operations (that is, + * read, write, flush) so this lock is rarely taken during general IO. It + * should be released quickly; you should avoid sleeping or waiting while its + * held. + * + * zv_suspend_lock is used to suspend IO/data operations to a zvol. The read + * half should held for the duration of an IO operation. The write half should + * be taken when something to wait for IO to complete and the block further IO, + * eg for the duration of receive and rollback operations. This lock can be + * held for long periods of time. + * + * Thus, the following lock ordering appies. * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t @@ -67,9 +78,8 @@ * these operations are serialized per pool. Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is - * allocated and placed on zvol_state_list, and then other minor operations - * for this zvol are going to proceed in the order of issue. - * + * allocated and placed on zvol_state_list, and then other minor operations for + * this zvol are going to proceed in the order of issue. 
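[Editor's note] The rewritten locking comment above describes three locks and a strict order for taking them. The pthread model below restates that discipline outside the kernel; all names and the choice of pthread primitives are illustrative, and only the ordering is the point.

    #include <pthread.h>

    /* Global: protects the name -> zvol lookup structures only. */
    static pthread_rwlock_t zvol_state_lock = PTHREAD_RWLOCK_INITIALIZER;

    struct zvol {
        pthread_rwlock_t zv_suspend_lock;  /* read: data ops; write: suspend */
        pthread_mutex_t  zv_state_lock;    /* protects this struct's fields */
        int              zv_open_count;
    };

    /*
     * The documented order: global lookup lock first (and only while
     * searching), then the suspend lock, then the per-zvol state lock.
     */
    static void
    zvol_admin_op(struct zvol *zv)
    {
        pthread_rwlock_rdlock(&zvol_state_lock);      /* 1: find zv by name */
        pthread_rwlock_wrlock(&zv->zv_suspend_lock);  /* 2: block new data ops */
        pthread_mutex_lock(&zv->zv_state_lock);       /* 3: per-zvol state */
        pthread_rwlock_unlock(&zvol_state_lock);      /* lookup is done */

        /* ... reconfigure the volume ... */

        pthread_mutex_unlock(&zv->zv_state_lock);
        pthread_rwlock_unlock(&zv->zv_suspend_lock);
    }

Any thread that needs more than one of these locks has to take them in this order; two threads acquiring them in different orders can deadlock.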
*/ #include <sys/dataset_kstats.h> @@ -102,6 +112,7 @@ extern int zfs_bclone_wait_dirty; zv_taskq_t zvol_taskqs; typedef enum { + ZVOL_ASYNC_CREATE_MINORS, ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, @@ -110,10 +121,14 @@ typedef enum { } zvol_async_op_t; typedef struct { - zvol_async_op_t op; - char name1[MAXNAMELEN]; - char name2[MAXNAMELEN]; - uint64_t value; + zvol_async_op_t zt_op; + char zt_name1[MAXNAMELEN]; + char zt_name2[MAXNAMELEN]; + uint64_t zt_value; + uint32_t zt_total; + uint32_t zt_done; + int32_t zt_status; + int zt_error; } zvol_task_t; zv_request_task_t * @@ -210,8 +225,8 @@ zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) int error; uint64_t volblocksize, volsize; - VERIFY(nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); + VERIFY0(nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); @@ -220,21 +235,20 @@ zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) * These properties must be removed from the list so the generic * property setting step won't apply to them. */ - VERIFY(nvlist_remove_all(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); + VERIFY0(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE))); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); - ASSERT(error == 0); + ASSERT0(error); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); - ASSERT(error == 0); + ASSERT0(error); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); - ASSERT(error == 0); + ASSERT0(error); } /* @@ -249,7 +263,7 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) - return (SET_ERROR(error)); + return (error); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); @@ -262,7 +276,7 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) kmem_free(doi, sizeof (dmu_object_info_t)); - return (SET_ERROR(error)); + return (error); } /* @@ -300,7 +314,7 @@ zvol_update_volsize(uint64_t volsize, objset_t *os) error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); - return (SET_ERROR(error)); + return (error); } txg = dmu_tx_get_txg(tx); @@ -332,7 +346,7 @@ zvol_set_volsize(const char *name, uint64_t volsize) error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) - return (SET_ERROR(error)); + return (error); if (readonly) return (SET_ERROR(EROFS)); @@ -348,7 +362,7 @@ zvol_set_volsize(const char *name, uint64_t volsize) FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); - return (SET_ERROR(error)); + return (error); } owned = B_TRUE; if (zv != NULL) @@ -385,7 +399,7 @@ out: if (error == 0 && zv != NULL) zvol_os_update_volsize(zv, volsize); - return (SET_ERROR(error)); + return (error); } /* @@ -396,7 +410,7 @@ zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) - return (ENOENT); + return (SET_ERROR(ENOENT)); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); @@ -445,8 +459,10 @@ zvol_check_volblocksize(const char *name, uint64_t volblocksize) * We don't allow 
setting the property above 1MB, * unless the tunable has been changed. */ - if (volblocksize > zfs_max_recordsize) + if (volblocksize > zfs_max_recordsize) { + spa_close(spa, FTAG); return (SET_ERROR(EDOM)); + } spa_close(spa, FTAG); } @@ -613,7 +629,7 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, dmu_tx_t *tx; blkptr_t *bps; size_t maxblocks; - int error = EINVAL; + int error = 0; rw_enter(&zv_dst->zv_suspend_lock, RW_READER); if (zv_dst->zv_zilog == NULL) { @@ -639,23 +655,22 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, */ if (!spa_feature_is_enabled(dmu_objset_spa(outos), SPA_FEATURE_BLOCK_CLONING)) { - error = EOPNOTSUPP; + error = SET_ERROR(EOPNOTSUPP); goto out; } if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { - error = EXDEV; + error = SET_ERROR(EXDEV); goto out; } if (inos->os_encrypted != outos->os_encrypted) { - error = EXDEV; + error = SET_ERROR(EXDEV); goto out; } if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) { - error = EINVAL; + error = SET_ERROR(EINVAL); goto out; } if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) { - error = 0; goto out; } @@ -666,17 +681,15 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, len = zv_src->zv_volsize - inoff; if (len > zv_dst->zv_volsize - outoff) len = zv_dst->zv_volsize - outoff; - if (len == 0) { - error = 0; + if (len == 0) goto out; - } /* * No overlapping if we are cloning within the same file */ if (zv_src == zv_dst) { if (inoff < outoff + len && outoff < inoff + len) { - error = EINVAL; + error = SET_ERROR(EINVAL); goto out; } } @@ -686,7 +699,7 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, */ if ((inoff % zv_src->zv_volblocksize) != 0 || (outoff % zv_dst->zv_volblocksize) != 0) { - error = EINVAL; + error = SET_ERROR(EINVAL); goto out; } @@ -694,7 +707,7 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, * Length must be multiple of block size */ if ((len % zv_src->zv_volblocksize) != 0) { - error = EINVAL; + error = SET_ERROR(EINVAL); goto out; } @@ -766,13 +779,13 @@ zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, zfs_rangelock_exit(outlr); zfs_rangelock_exit(inlr); if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { - zil_commit(zilog_dst, ZVOL_OBJ); + error = zil_commit(zilog_dst, ZVOL_OBJ); } out: if (zv_src != zv_dst) rw_exit(&zv_src->zv_suspend_lock); rw_exit(&zv_dst->zv_suspend_lock); - return (SET_ERROR(error)); + return (error); } /* @@ -859,13 +872,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { }; /* - * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. - * - * We store data in the log buffers if it's small enough. - * Otherwise we will later flush the data out via dmu_sync(). + * zvol_log_write() handles TX_WRITE transactions. 
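[Editor's note] The zvol_clone_range() hunks above tighten the argument checks and route every failure through SET_ERROR(). The standalone version below condenses the size, overlap, and alignment checks; the pool, feature, and encryption checks are omitted, and the struct is a stand-in for the real zvol state.

    #include <errno.h>
    #include <stdint.h>

    struct vol {
        uint64_t volsize;
        uint64_t volblocksize;
    };

    /*
     * Condensed form of the argument checks above: offsets and length must
     * be block-aligned, must stay inside both volumes, and a clone within a
     * single volume must not overlap itself.  Returning 0 covers both
     * success and the "nothing to do" cases, matching the hunk.
     */
    static int
    clone_range_check(const struct vol *src, const struct vol *dst,
        int same_vol, uint64_t inoff, uint64_t outoff, uint64_t len)
    {
        if (src->volblocksize != dst->volblocksize)
            return (EINVAL);
        if (inoff >= src->volsize || outoff >= dst->volsize)
            return (0);
        if (len > src->volsize - inoff)
            len = src->volsize - inoff;
        if (len > dst->volsize - outoff)
            len = dst->volsize - outoff;
        if (len == 0)
            return (0);
        if (same_vol && inoff < outoff + len && outoff < inoff + len)
            return (EINVAL);    /* overlapping ranges */
        if ((inoff % src->volblocksize) != 0 ||
            (outoff % dst->volblocksize) != 0 ||
            (len % src->volblocksize) != 0)
            return (EINVAL);    /* not block-aligned */
        return (0);
    }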
*/ -static const ssize_t zvol_immediate_write_sz = 32768; - void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit) @@ -878,15 +886,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, if (zil_replaying(zilog, tx)) return; - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - size >= blocksize && blocksize > zvol_immediate_write_sz) - write_state = WR_INDIRECT; - else if (commit) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; + write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit); while (size) { itx_t *itx; @@ -905,7 +905,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1, DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) { - zil_itx_destroy(itx); + zil_itx_destroy(itx, 0); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; @@ -924,7 +924,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, itx->itx_private = zv; - (void) zil_itx_assign(zilog, itx, tx); + zil_itx_assign(zilog, itx, tx); offset += len; size -= len; @@ -1034,7 +1034,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, zvol_get_done(zgd, error); - return (SET_ERROR(error)); + return (error); } /* @@ -1079,15 +1079,15 @@ zvol_setup_zv(zvol_state_t *zv) error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) - return (SET_ERROR(error)); + return (error); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) - return (SET_ERROR(error)); + return (error); error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) - return (SET_ERROR(error)); + return (error); zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; @@ -1129,7 +1129,7 @@ zvol_shutdown_zv(zvol_state_t *zv) */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); - (void) dmu_objset_evict_dbufs(zv->zv_objset); + dmu_objset_evict_dbufs(zv->zv_objset); } /* @@ -1145,20 +1145,34 @@ zvol_tag(zvol_state_t *zv) /* * Suspend the zvol for recv and rollback. */ -zvol_state_t * -zvol_suspend(const char *name) +int +zvol_suspend(const char *name, zvol_state_t **zvp) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) - return (NULL); + return (SET_ERROR(ENOENT)); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + /* + * If it's being removed, unlock and return error. It doesn't make any + * sense to try to suspend a zvol being removed, but being here also + * means that zvol_remove_minors_impl() is about to call zvol_remove() + * and then destroy the zvol_state_t, so returning a pointer to it for + * the caller to mess with would be a disaster anyway. 
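[Editor's note] The zvol_log_write() hunk above deletes zvol's private policy for choosing between indirect, copied, and deferred-copy log records and calls the shared zil_write_state() helper instead. For reference, the removed lines implemented the policy modeled below; the constant and the helper name are local to this sketch.

    #include <stdint.h>

    enum wr_state { WR_INDIRECT, WR_COPIED, WR_NEED_COPY };

    /* The old zvol_immediate_write_sz cutoff removed by the hunk above. */
    #define IMMEDIATE_WRITE_SZ 32768U

    /*
     * Model of the removed policy: log indirectly when biased for
     * throughput, or when a large block-sized write has no separate log
     * device; copy into the itx when the caller will commit immediately;
     * otherwise defer the copy until the log record is actually needed.
     */
    static enum wr_state
    pick_write_state(int logbias_throughput, int has_slog,
        uint64_t size, uint64_t blocksize, int commit)
    {
        if (logbias_throughput)
            return (WR_INDIRECT);
        if (!has_slog && size >= blocksize && blocksize > IMMEDIATE_WRITE_SZ)
            return (WR_INDIRECT);
        if (commit)
            return (WR_COPIED);
        return (WR_NEED_COPY);
    }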
+ */ + if (zv->zv_flags & ZVOL_REMOVING) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + /* NB: Returning EIO here to match zfsvfs_teardown() */ + return (SET_ERROR(EIO)); + } + atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) @@ -1171,7 +1185,8 @@ zvol_suspend(const char *name) mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ - return (zv); + *zvp = zv; + return (0); } int @@ -1206,7 +1221,7 @@ zvol_resume(zvol_state_t *zv) if (zv->zv_flags & ZVOL_REMOVING) cv_broadcast(&zv->zv_removing_cv); - return (SET_ERROR(error)); + return (error); } int @@ -1222,7 +1237,7 @@ zvol_first_open(zvol_state_t *zv, boolean_t readonly) boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) - return (SET_ERROR(error)); + return (error); zv->zv_objset = os; @@ -1434,6 +1449,48 @@ zvol_create_minors_cb(const char *dsname, void *arg) return (0); } +static void +zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done, + int error) +{ + + task->zt_total += total; + task->zt_done += done; + if (task->zt_total != task->zt_done) { + task->zt_status = -1; + if (error) + task->zt_error = error; + } +} + +static void +zvol_task_report_status(zvol_task_t *task) +{ +#ifdef ZFS_DEBUG + static const char *const msg[] = { + "create", + "remove", + "rename", + "set snapdev", + "set volmode", + "unknown", + }; + + if (task->zt_status == 0) + return; + + zvol_async_op_t op = MIN(task->zt_op, ZVOL_ASYNC_MAX); + if (task->zt_error) { + dprintf("The %s minors zvol task was not ok, last error %d\n", + msg[op], task->zt_error); + } else { + dprintf("The %s minors zvol task was not ok\n", msg[op]); + } +#else + (void) task; +#endif +} + /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots @@ -1451,14 +1508,27 @@ zvol_create_minors_cb(const char *dsname, void *arg) * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ -void -zvol_create_minors_recursive(const char *name) +static void +zvol_create_minors_impl(zvol_task_t *task) { + const char *name = task->zt_name1; list_t minors_list; minors_job_t *job; + uint64_t snapdev; + int total = 0, done = 0, last_error, error; - if (zvol_inhibit_dev) + /* + * Note: the dsl_pool_config_lock must not be held. + * Minor node creation needs to obtain the zvol_state_lock. + * zvol_open() obtains the zvol_state_lock and then the dsl pool + * config lock. Therefore, we can't have the config lock now if + * we are going to wait for the zvol_state_lock, because it + * would be a lock order inversion which could lead to deadlock. + */ + + if (zvol_inhibit_dev) { return; + } /* * This is the list for prefetch jobs. 
Whenever we found a match @@ -1474,13 +1544,16 @@ zvol_create_minors_recursive(const char *name) if (strchr(name, '@') != NULL) { - uint64_t snapdev; - - int error = dsl_prop_get_integer(name, "snapdev", - &snapdev, NULL); - - if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - (void) zvol_os_create_minor(name); + error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); + if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) { + error = zvol_os_create_minor(name); + if (error == 0) { + done++; + } else { + last_error = error; + } + total++; + } } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, @@ -1495,229 +1568,195 @@ zvol_create_minors_recursive(const char *name) * sequentially. */ while ((job = list_remove_head(&minors_list)) != NULL) { - if (!job->error) - (void) zvol_os_create_minor(job->name); + if (!job->error) { + error = zvol_os_create_minor(job->name); + if (error == 0) { + done++; + } else { + last_error = error; + } + } else if (job->error == EINVAL) { + /* + * The objset, with the name requested by current job + * exist, but have the type different from zvol. + * Just ignore this sort of errors. + */ + done++; + } else { + last_error = job->error; + } + total++; kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); + zvol_task_update_status(task, total, done, last_error); } -void -zvol_create_minor(const char *name) -{ - /* - * Note: the dsl_pool_config_lock must not be held. - * Minor node creation needs to obtain the zvol_state_lock. - * zvol_open() obtains the zvol_state_lock and then the dsl pool - * config lock. Therefore, we can't have the config lock now if - * we are going to wait for the zvol_state_lock, because it - * would be a lock order inversion which could lead to deadlock. - */ - - if (zvol_inhibit_dev) - return; - - if (strchr(name, '@') != NULL) { - uint64_t snapdev; - - int error = dsl_prop_get_integer(name, - "snapdev", &snapdev, NULL); - - if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - (void) zvol_os_create_minor(name); - } else { - (void) zvol_os_create_minor(name); - } -} - -/* - * Remove minors for specified dataset including children and snapshots. - */ - /* - * Remove the minor for a given zvol. This will do it all: - * - flag the zvol for removal, so new requests are rejected - * - wait until outstanding requests are completed - * - remove it from lists - * - free it - * It's also usable as a taskq task, and smells nice too. + * Remove minors for specified dataset and, optionally, its children and + * snapshots. */ static void -zvol_remove_minor_task(void *arg) -{ - zvol_state_t *zv = (zvol_state_t *)arg; - - ASSERT(!RW_LOCK_HELD(&zvol_state_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - - mutex_enter(&zv->zv_state_lock); - while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { - zv->zv_flags |= ZVOL_REMOVING; - cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock); - } - mutex_exit(&zv->zv_state_lock); - - rw_enter(&zvol_state_lock, RW_WRITER); - mutex_enter(&zv->zv_state_lock); - - zvol_remove(zv); - zvol_os_clear_private(zv); - - mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); - - zvol_os_free(zv); -} - -static void -zvol_free_task(void *arg) -{ - zvol_os_free(arg); -} - -void -zvol_remove_minors_impl(const char *name) +zvol_remove_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; + const char *name = task ? task->zt_name1 : NULL; int namelen = ((name) ? 
strlen(name) : 0); - taskqid_t t; - list_t delay_list, free_list; + boolean_t children = task ? !!task->zt_value : B_TRUE; if (zvol_inhibit_dev) return; - list_create(&delay_list, sizeof (zvol_state_t), - offsetof(zvol_state_t, zv_next)); - list_create(&free_list, sizeof (zvol_state_t), - offsetof(zvol_state_t, zv_next)); + /* + * We collect up zvols that we want to remove on a separate list, so + * that we don't have to hold zvol_state_lock for the whole time. + * + * We can't remove them from the global lists until we're completely + * done with them, because that would make them appear to ZFS-side ops + * that they don't exist, and the name might be reused, which can't be + * good. + */ + list_t remove_list; + list_create(&remove_list, sizeof (zvol_state_t), + offsetof(zvol_state_t, zv_remove_node)); - rw_enter(&zvol_state_lock, RW_WRITER); + rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_REMOVING) { + /* Another thread is handling shutdown, skip it. */ + mutex_exit(&zv->zv_state_lock); + continue; + } + + /* + * This zvol should be removed if: + * - no name was offered (ie removing all at shutdown); or + * - name matches exactly; or + * - we were asked to remove children, and + * - the start of the name matches, and + * - there is a '/' immediately after the matched name; or + * - there is a '@' immediately after the matched name + */ if (name == NULL || strcmp(zv->zv_name, name) == 0 || - (strncmp(zv->zv_name, name, namelen) == 0 && + (children && strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { - /* - * By holding zv_state_lock here, we guarantee that no - * one is currently using this zv - */ /* - * If in use, try to throw everyone off and try again - * later. + * Matched, so mark it removal. We want to take the + * write half of the suspend lock to make sure that + * the zvol is not suspended, and give any data ops + * chance to finish. */ - if (zv->zv_open_count > 0 || - atomic_read(&zv->zv_suspend_ref)) { - zv->zv_flags |= ZVOL_REMOVING; - t = taskq_dispatch( - zv->zv_objset->os_spa->spa_zvol_taskq, - zvol_remove_minor_task, zv, TQ_SLEEP); - if (t == TASKQID_INVALID) { - /* - * Couldn't create the task, so we'll - * do it in place once the loop is - * finished. - */ - list_insert_head(&delay_list, zv); - } + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + mutex_enter(&zv->zv_state_lock); + + if (zv->zv_flags & ZVOL_REMOVING) { + /* Another thread has taken it, let them. */ mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); continue; } - zvol_remove(zv); - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. + * Mark it and unlock. New entries will see the flag + * and return ENXIO. */ - zvol_os_clear_private(zv); - - /* Drop zv_state_lock before zvol_free() */ + zv->zv_flags |= ZVOL_REMOVING; mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); - /* Try parallel zv_free, if failed do it in place */ - t = taskq_dispatch(system_taskq, zvol_free_task, zv, - TQ_SLEEP); - if (t == TASKQID_INVALID) - list_insert_head(&free_list, zv); - } else { + /* Put it on the list for the next stage. 
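[Editor's note] The selection loop above decides which zvols fall under a removal request using the rules spelled out in its comment. They are easier to see as a single predicate; the function below is a standalone restatement, not the kernel code.

    #include <string.h>

    /*
     * Restatement of the matching rules above: remove everything when no
     * name is given; an exact match always qualifies; descendants
     * ("pool/vol/child") and snapshots ("pool/vol@snap") qualify only when
     * the caller asked for children as well.
     */
    static int
    zvol_name_matches(const char *zv_name, const char *name, int children)
    {
        size_t namelen;

        if (name == NULL)
            return (1);
        if (strcmp(zv_name, name) == 0)
            return (1);
        namelen = strlen(name);
        if (children && strncmp(zv_name, name, namelen) == 0 &&
            (zv_name[namelen] == '/' || zv_name[namelen] == '@'))
            return (1);
        return (0);
    }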
*/ + list_insert_head(&remove_list, zv); + } else mutex_exit(&zv->zv_state_lock); - } } - rw_exit(&zvol_state_lock); - - /* Wait for zvols that we couldn't create a remove task for */ - while ((zv = list_remove_head(&delay_list)) != NULL) - zvol_remove_minor_task(zv); - /* Free any that we couldn't free in parallel earlier */ - while ((zv = list_remove_head(&free_list)) != NULL) - zvol_os_free(zv); -} - -/* Remove minor for this specific volume only */ -static void -zvol_remove_minor_impl(const char *name) -{ - zvol_state_t *zv = NULL, *zv_next; + rw_exit(&zvol_state_lock); - if (zvol_inhibit_dev) + /* Didn't match any, nothing to do! */ + if (list_is_empty(&remove_list)) { + if (task) + task->zt_error = SET_ERROR(ENOENT); return; + } - rw_enter(&zvol_state_lock, RW_WRITER); - - for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { - zv_next = list_next(&zvol_state_list, zv); + /* Actually shut them all down. */ + for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) { + zv_next = list_next(&remove_list, zv); mutex_enter(&zv->zv_state_lock); - if (strcmp(zv->zv_name, name) == 0) - /* Found, leave the the loop with zv_lock held */ - break; - mutex_exit(&zv->zv_state_lock); - } - if (zv == NULL) { - rw_exit(&zvol_state_lock); - return; - } - - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + /* + * Still open or suspended, just wait. This can happen if, for + * example, we managed to acquire zv_state_lock in the moments + * where zvol_open() or zvol_release() are trading locks to + * call zvol_first_open() or zvol_last_close(). + */ + while (zv->zv_open_count > 0 || + atomic_read(&zv->zv_suspend_ref)) + cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock); - if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { /* - * In use, so try to throw everyone off, then wait - * until finished. + * No users, shut down the OS side. This may not remove the + * minor from view immediately, depending on the kernel + * specifics, but it will ensure that it is unusable and that + * this zvol_state_t can never again be reached from an OS-side + * operation. */ - zv->zv_flags |= ZVOL_REMOVING; + zvol_os_remove_minor(zv); mutex_exit(&zv->zv_state_lock); + + /* Remove it from the name lookup lists */ + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_remove(zv); rw_exit(&zvol_state_lock); - zvol_remove_minor_task(zv); - return; } - zvol_remove(zv); - zvol_os_clear_private(zv); + /* + * Our own references on remove_list is the last one, free them and + * we're done. + */ + while ((zv = list_remove_head(&remove_list)) != NULL) + zvol_os_free(zv); - mutex_exit(&zv->zv_state_lock); - rw_exit(&zvol_state_lock); + list_destroy(&remove_list); +} + +/* Remove minor for this specific volume only */ +static int +zvol_remove_minor_impl(const char *name) +{ + if (zvol_inhibit_dev) + return (0); + + zvol_task_t task; + memset(&task, 0, sizeof (zvol_task_t)); + strlcpy(task.zt_name1, name, sizeof (task.zt_name1)); + task.zt_value = B_FALSE; + + zvol_remove_minors_impl(&task); - zvol_os_free(zv); + return (task.zt_error); } /* * Rename minors for specified dataset including children and snapshots. 
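[Editor's note] Once a zvol is flagged ZVOL_REMOVING, the teardown loop above simply waits on zv_removing_cv until the open count and suspend references drain, then unregisters and frees it. A minimal pthread analogue of that quiesce-then-free handshake follows; every name here is local to the sketch.

    #include <pthread.h>

    struct minor {
        pthread_mutex_t lock;
        pthread_cond_t  removing_cv;
        int             open_count;
        int             suspend_ref;
        int             removing;
    };

    /* Opener side: the last close wakes anyone waiting to remove the minor. */
    static void
    minor_close(struct minor *m)
    {
        pthread_mutex_lock(&m->lock);
        if (--m->open_count == 0 && m->removing)
            pthread_cond_broadcast(&m->removing_cv);
        pthread_mutex_unlock(&m->lock);
    }

    /* Remover side: flag it, then wait until there are no users left. */
    static void
    minor_remove(struct minor *m)
    {
        pthread_mutex_lock(&m->lock);
        m->removing = 1;
        while (m->open_count > 0 || m->suspend_ref > 0)
            pthread_cond_wait(&m->removing_cv, &m->lock);
        pthread_mutex_unlock(&m->lock);
        /* ... unregister from the OS and free, as the loop above does ... */
    }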
*/ static void -zvol_rename_minors_impl(const char *oldname, const char *newname) +zvol_rename_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; - int oldnamelen; + const char *oldname = task->zt_name1; + const char *newname = task->zt_name2; + int total = 0, done = 0, last_error, error, oldnamelen; if (zvol_inhibit_dev) return; @@ -1732,24 +1771,31 @@ zvol_rename_minors_impl(const char *oldname, const char *newname) mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { - zvol_os_rename_minor(zv, newname); + error = zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); - zvol_os_rename_minor(zv, name); + error = zvol_os_rename_minor(zv, name); kmem_strfree(name); } - + if (error) { + last_error = error; + } else { + done++; + } + total++; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); + zvol_task_update_status(task, total, done, last_error); } typedef struct zvol_snapdev_cb_arg { + zvol_task_t *task; uint64_t snapdev; } zvol_snapdev_cb_arg_t; @@ -1757,26 +1803,31 @@ static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; + int error = 0; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: - (void) zvol_os_create_minor(dsname); + error = zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: - (void) zvol_remove_minor_impl(dsname); + error = zvol_remove_minor_impl(dsname); break; } + zvol_task_update_status(arg->task, 1, error == 0, error); return (0); } static void -zvol_set_snapdev_impl(char *name, uint64_t snapdev) +zvol_set_snapdev_impl(zvol_task_t *task) { - zvol_snapdev_cb_arg_t arg = {snapdev}; + const char *name = task->zt_name1; + uint64_t snapdev = task->zt_value; + + zvol_snapdev_cb_arg_t arg = {task, snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately @@ -1787,11 +1838,14 @@ zvol_set_snapdev_impl(char *name, uint64_t snapdev) } static void -zvol_set_volmode_impl(char *name, uint64_t volmode) +zvol_set_volmode_impl(zvol_task_t *task) { + const char *name = task->zt_name1; + uint64_t volmode = task->zt_value; fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; + int error; if (strchr(name, '@') != NULL) return; @@ -1804,7 +1858,7 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) */ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) - return; + return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); @@ -1815,51 +1869,34 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: - (void) zvol_remove_minor_impl(name); + error = zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: - (void) zvol_remove_minor_impl(name); - (void) zvol_os_create_minor(name); + error = zvol_remove_minor_impl(name); + /* + * The remove minor function call above, might be not + * needed, if volmode was switched from 'none' value. + * Ignore error in this case. 
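[Editor's note] For child datasets and snapshots, the rename loop above splices the new prefix onto the old suffix while keeping the '/' or '@' separator. A small userspace equivalent using snprintf (the kernel code uses kmem_asprintf) might look like this:

    #include <stdio.h>
    #include <string.h>

    /*
     * Rewrite "oldprefix<sep>rest" as "newprefix<sep>rest", where <sep> is
     * '/' (child dataset) or '@' (snapshot).  Returns 0 on success, -1 if
     * the name does not start with the old prefix or the result truncates.
     */
    static int
    rename_child(char *out, size_t outlen, const char *name,
        const char *oldprefix, const char *newprefix)
    {
        size_t oldlen = strlen(oldprefix);
        int n;

        if (strncmp(name, oldprefix, oldlen) != 0 ||
            (name[oldlen] != '/' && name[oldlen] != '@'))
            return (-1);
        n = snprintf(out, outlen, "%s%c%s", newprefix,
            name[oldlen], name + oldlen + 1);
        if (n < 0 || (size_t)n >= outlen)
            return (-1);
        return (0);
    }

For example, rename_child(buf, sizeof (buf), "tank/vol@snap", "tank/vol", "tank/newvol") yields "tank/newvol@snap", matching what the kmem_asprintf() call in the hunk constructs.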
+ */ + if (error == ENOENT) + error = 0; + else if (error) + break; + error = zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: - (void) zvol_remove_minor_impl(name); + error = zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ - (void) zvol_os_create_minor(name); + error = zvol_os_create_minor(name); break; } + zvol_task_update_status(task, 1, error == 0, error); spl_fstrans_unmark(cookie); } -static zvol_task_t * -zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, - uint64_t value) -{ - zvol_task_t *task; - - /* Never allow tasks on hidden names. */ - if (name1[0] == '$') - return (NULL); - - task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); - task->op = op; - task->value = value; - - strlcpy(task->name1, name1, sizeof (task->name1)); - if (name2 != NULL) - strlcpy(task->name2, name2, sizeof (task->name2)); - - return (task); -} - -static void -zvol_task_free(zvol_task_t *task) -{ - kmem_free(task, sizeof (zvol_task_t)); -} - /* * The worker thread function performed asynchronously. */ @@ -1868,25 +1905,29 @@ zvol_task_cb(void *arg) { zvol_task_t *task = arg; - switch (task->op) { + switch (task->zt_op) { + case ZVOL_ASYNC_CREATE_MINORS: + zvol_create_minors_impl(task); + break; case ZVOL_ASYNC_REMOVE_MINORS: - zvol_remove_minors_impl(task->name1); + zvol_remove_minors_impl(task); break; case ZVOL_ASYNC_RENAME_MINORS: - zvol_rename_minors_impl(task->name1, task->name2); + zvol_rename_minors_impl(task); break; case ZVOL_ASYNC_SET_SNAPDEV: - zvol_set_snapdev_impl(task->name1, task->value); + zvol_set_snapdev_impl(task); break; case ZVOL_ASYNC_SET_VOLMODE: - zvol_set_volmode_impl(task->name1, task->value); + zvol_set_volmode_impl(task); break; default: VERIFY(0); break; } - zvol_task_free(task); + zvol_task_report_status(task); + kmem_free(task, sizeof (zvol_task_t)); } typedef struct zvol_set_prop_int_arg { @@ -1931,23 +1972,17 @@ zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0) return (0); - switch (zsda->zsda_prop) { - case ZFS_PROP_VOLMODE: - task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, - NULL, prop); - break; - case ZFS_PROP_SNAPDEV: - task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, - NULL, prop); - break; - default: - task = NULL; - break; - } - - if (task == NULL) + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + if (zsda->zsda_prop == ZFS_PROP_VOLMODE) { + task->zt_op = ZVOL_ASYNC_SET_VOLMODE; + } else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) { + task->zt_op = ZVOL_ASYNC_SET_SNAPDEV; + } else { + kmem_free(task, sizeof (zvol_task_t)); return (0); - + } + task->zt_value = prop; + strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1)); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); @@ -2001,15 +2036,35 @@ zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source, } void -zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +zvol_create_minors(const char *name) { + spa_t *spa; zvol_task_t *task; taskqid_t id; - task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); - if (task == NULL) + if (spa_open(name, &spa, FTAG) != 0) return; + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + task->zt_op = ZVOL_ASYNC_CREATE_MINORS; + strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if (id != TASKQID_INVALID) + 
taskq_wait_id(spa->spa_zvol_taskq, id); + + spa_close(spa, FTAG); +} + +void +zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + task->zt_op = ZVOL_ASYNC_REMOVE_MINORS; + strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); + task->zt_value = B_TRUE; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); @@ -2022,10 +2077,10 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, zvol_task_t *task; taskqid_t id; - task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); - if (task == NULL) - return; - + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + task->zt_op = ZVOL_ASYNC_RENAME_MINORS; + strlcpy(task->zt_name1, name1, sizeof (task->zt_name1)); + strlcpy(task->zt_name2, name2, sizeof (task->zt_name2)); id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); @@ -2131,20 +2186,12 @@ zvol_fini_impl(void) zvol_remove_minors_impl(NULL); - /* - * The call to "zvol_remove_minors_impl" may dispatch entries to - * the system_taskq, but it doesn't wait for those entries to - * complete before it returns. Thus, we must wait for all of the - * removals to finish, before we can continue. - */ - taskq_wait_outstanding(system_taskq, 0); - kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); if (ztqs->tqs_taskq == NULL) { - ASSERT3U(ztqs->tqs_cnt, ==, 0); + ASSERT0(ztqs->tqs_cnt); } else { for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c index 4f7ac56a6273..3db196953f74 100644 --- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c +++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c @@ -637,11 +637,10 @@ zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, size_t actual_abort_size = zstd_abort_size; if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && s_len >= actual_abort_size) { - int pass_len = 1; abd_t sabd, dabd; abd_get_from_buf_struct(&sabd, s_start, s_len); abd_get_from_buf_struct(&dabd, d_start, d_len); - pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0); + int pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0); abd_free(&dabd); abd_free(&sabd); if (pass_len < d_len) { @@ -877,9 +876,9 @@ static void __init zstd_mempool_init(void) { zstd_mempool_cctx = - kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); zstd_mempool_dctx = - kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); for (int i = 0; i < ZSTD_POOL_MAX; i++) { mutex_init(&zstd_mempool_cctx[i].barrier, NULL, @@ -925,8 +924,8 @@ zstd_mempool_deinit(void) release_pool(&zstd_mempool_dctx[i]); } - kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); - kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + vmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + vmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); zstd_mempool_dctx = NULL; zstd_mempool_cctx = NULL; } diff --git 
a/sys/contrib/openzfs/rpm/generic/zfs.spec.in b/sys/contrib/openzfs/rpm/generic/zfs.spec.in index 8cf13023f537..8986e29eb7fb 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs.spec.in @@ -87,7 +87,19 @@ %define __python %{__use_python} %define __python_pkg_version %{__use_python_pkg_version} %endif -%define __python_sitelib %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())" 2>/dev/null || %{__python} -Esc "import sysconfig; print(sysconfig.get_path('purelib'))") +%define __python_sitelib %(%{__python} -Esc " +import sysconfig; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + scheme = 'posix_prefix' +prefix = '%{_prefix}' +if prefix == 'NONE': + prefix = '%{ac_default_prefix}' +sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix}) +print(sitedir);" 2>/dev/null || %{__python} -Esc "from distutils import sysconfig; print(sysconfig.get_python_lib(0,0))") Name: @PACKAGE@ Version: @VERSION@ @@ -376,7 +388,7 @@ support for unlocking datasets on user login. %if 0%{?_systemd} %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit - %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target + %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-mount@.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target %else %define systemd --enable-sysvinit --disable-systemd %endif @@ -421,7 +433,7 @@ make install DESTDIR=%{?buildroot} find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; %if 0%{!?__brp_mangle_shebangs:1} find %{?buildroot}%{_bindir} \ - \( -name arc_summary -or -name arcstat -or -name dbufstat \ + \( -name zarcsummary -or -name zarcstat -or -name dbufstat \ -or -name zilstat \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; find %{?buildroot}%{_datadir} \ @@ -494,11 +506,10 @@ systemctl --system daemon-reload >/dev/null || true # Core utilities %{_sbindir}/* %{_bindir}/raidz_test -%{_sbindir}/zgenhostid %{_bindir}/zvol_wait # Optional Python 3 scripts -%{_bindir}/arc_summary -%{_bindir}/arcstat +%{_bindir}/zarcsummary +%{_bindir}/zarcstat %{_bindir}/dbufstat %{_bindir}/zilstat # Man pages diff --git a/sys/contrib/openzfs/scripts/mancheck.sh b/sys/contrib/openzfs/scripts/mancheck.sh index 364ad1b76286..33d7d3c7155f 100755 --- a/sys/contrib/openzfs/scripts/mancheck.sh +++ b/sys/contrib/openzfs/scripts/mancheck.sh @@ -11,12 +11,12 @@ # AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # -# shellcheck disable=SC2086 +# shellcheck disable=SC2068,SC2086 trap 'rm -f "$stdout_file" "$stderr_file" "$result_file"' EXIT if [ "$#" -eq 0 ]; then - echo "Usage: $0 manpage-directory..." + echo "Usage: $0 <manpage-directory|manpage-file>..." 
exit 1 fi @@ -27,7 +27,16 @@ fi IFS=" " -files="$(find "$@" -type f -name '*[1-9]*' -not -name '.*')" || exit 1 +files="$( + for path in $@ ; do + find -L $path -type f -name '*[1-9]*' -not -name '.*' + done | sort | uniq +)" + +if [ "$files" = "" ] ; then + echo no files to process! 1>&2 + exit 1 +fi add_excl="$(awk ' /^.\\" lint-ok:/ { @@ -48,6 +57,4 @@ grep -vhE -e 'mandoc: outdated mandoc.db' -e 'STYLE: referenced manual not found if [ -s "$result_file" ]; then cat "$result_file" exit 1 -else - echo "no errors found" fi diff --git a/sys/contrib/openzfs/scripts/spdxcheck.pl b/sys/contrib/openzfs/scripts/spdxcheck.pl index 47128402f7bc..4d4e14368beb 100755 --- a/sys/contrib/openzfs/scripts/spdxcheck.pl +++ b/sys/contrib/openzfs/scripts/spdxcheck.pl @@ -83,8 +83,8 @@ my $tagged_patterns = q( man/man?/*.?.in # Unsuffixed programs (or generated of same) - cmd/arcstat.in - cmd/arc_summary + cmd/zarcstat.in + cmd/zarcsummary cmd/dbufstat.in cmd/zilstat.in cmd/zpool/zpool.d/* @@ -190,6 +190,7 @@ my @path_license_tags = ( ['BSD-2-Clause OR GPL-2.0-only', 'CDDL-1.0'], 'module/icp' => ['Apache-2.0', 'CDDL-1.0'], + 'contrib/icp' => ['Apache-2.0', 'CDDL-1.0'], # Python bindings are always Apache-2.0 'contrib/pyzfs' => ['Apache-2.0'], @@ -253,7 +254,6 @@ my %override_file_license_tags = ( 'GPL-2.0-or-later' => [qw( include/os/freebsd/spl/sys/kstat.h include/os/freebsd/spl/sys/sunddi.h - include/sys/mod.h )], 'CDDL-1.0' => [qw( include/os/linux/spl/sys/errno.h diff --git a/sys/contrib/openzfs/scripts/zfs-helpers.sh b/sys/contrib/openzfs/scripts/zfs-helpers.sh index b45384a9aa52..2e97d40db1c1 100755 --- a/sys/contrib/openzfs/scripts/zfs-helpers.sh +++ b/sys/contrib/openzfs/scripts/zfs-helpers.sh @@ -122,6 +122,13 @@ install() { src=$1 dst=$2 + # We may have an old symlink pointing to different ZFS workspace. + # Remove the old symlink if it doesn't point to our workspace. 
+ if [ -h "$dst" ] && [ "$(readlink -f """$dst""")" != "$src" ] ; then + echo "Removing old symlink: $dst -> $(readlink """$dst""")" + rm "$dst" + fi + if [ -h "$dst" ]; then echo "Symlink exists: $dst" elif [ -e "$dst" ]; then diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index 7969945a4796..2b002830c82f 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -37,8 +37,7 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', - 'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos', - 'alloc_class_016_pos'] + 'alloc_class_013_pos', 'alloc_class_016_pos'] tags = ['functional', 'alloc_class'] [tests/functional/append] @@ -171,7 +170,8 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', - 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup'] + 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup', + 'zdb_tunables'] pre = post = tags = ['functional', 'cli_root', 'zdb'] @@ -308,7 +308,7 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rewrite] -tests = ['zfs_rewrite'] +tests = ['zfs_rewrite', 'zfs_rewrite_physical'] tags = ['functional', 'cli_root', 'zfs_rewrite'] [tests/functional/cli_root/zfs_rollback] @@ -323,6 +323,10 @@ tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing'] tags = ['functional', 'cli_root', 'zfs_send'] +[tests/functional/cli_root/zfs_send_delegation] +tests = ['zfs_send_test'] +tags = ['functional', 'cli_root', 'zfs_send_delegation'] + [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', @@ -379,7 +383,7 @@ tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zhack] tests = ['zhack_label_repair_001', 'zhack_label_repair_002', - 'zhack_label_repair_003', 'zhack_label_repair_004'] + 'zhack_label_repair_003', 'zhack_label_repair_004', 'zhack_metaslab_leak'] pre = post = tags = ['functional', 'cli_root', 'zhack'] @@ -497,6 +501,7 @@ tags = ['functional', 'cli_root', 'zpool_labelclear'] tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_fault_export_import_online', 'zpool_initialize_import_export', + 'zpool_initialize_multiple_pools', 'zpool_initialize_offline_export_import_online', 'zpool_initialize_online_offline', 'zpool_initialize_split', @@ -542,8 +547,10 @@ tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies', + 'zpool_scrub_multiple_pools', 'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos', - 'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos'] + 'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos', + 'zpool_scrub_date_range_001', 
'zpool_scrub_date_range_002'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] @@ -574,8 +581,8 @@ tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', - 'zpool_trim_fault_export_import_online', - 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', + 'zpool_trim_fault_export_import_online', 'zpool_trim_import_export', + 'zpool_trim_multiple', 'zpool_trim_multiple_pools', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', @@ -621,8 +628,8 @@ tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', - 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', - 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', + 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'zarcstat_001_pos', + 'zarcsummary_001_pos', 'zarcsummary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] @@ -634,6 +641,10 @@ tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', user = tags = ['functional', 'cli_user', 'zfs_list'] +[tests/functional/cli_user/zfs_send_delegation_user] +tests = ['zfs_send_usertest'] +tags = ['functional', 'cli_user', 'zfs_send_delegation_user'] + [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', @@ -722,7 +733,11 @@ tests = ['fadvise_willneed'] tags = ['functional', 'fadvise'] [tests/functional/failmode] -tests = ['failmode_dmu_tx_wait', 'failmode_dmu_tx_continue'] +tests = ['failmode_dmu_tx_wait', 'failmode_dmu_tx_continue', + 'failmode_fsync_wait', 'failmode_fsync_continue', + 'failmode_msync_wait', 'failmode_msync_continue', + 'failmode_osync_wait', 'failmode_osync_continue', + 'failmode_syncalways_wait', 'failmode_syncalways_continue'] tags = ['functional', 'failmode'] [tests/functional/fallocate] @@ -740,7 +755,8 @@ tags = ['functional', 'features', 'large_dnode'] [tests/functional/gang_blocks] tests = ['gang_blocks_001_pos', 'gang_blocks_redundant', - 'gang_blocks_ddt_copies'] + 'gang_blocks_ddt_copies', 'gang_blocks_dyn_header_pos', + 'gang_blocks_dyn_header_neg', 'gang_blocks_dyn_multi'] tags = ['functional', 'gang_blocks'] [tests/functional/grow] @@ -798,7 +814,7 @@ tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos', - 'mmap_sync_001_pos', 'mmap_write_001_pos'] + 'mmap_sync_001_pos', 'mmap_write_001_pos', 'mmap_ftruncate'] tags = ['functional', 'mmap'] [tests/functional/mount] @@ -932,10 +948,11 @@ tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['attach_import', 'attach_multiple', 'attach_rebuild', - 'attach_resilver', 'detach', 'rebuild_disabled_feature', - 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', - 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', - 'scrub_cancel'] + 'attach_resilver', 'attach_resilver_sit_out', 'detach', + 'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz', + 'replace_import', 'replace_rebuild', 
'replace_resilver', + 'replace_resilver_sit_out', 'resilver_restart_001', + 'resilver_restart_002', 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] @@ -1085,7 +1102,7 @@ tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/zvol/zvol_stress] -tests = ['zvol_stress'] +tests = ['zvol_stress', 'zvol_stress_destroy'] tags = ['functional', 'zvol', 'zvol_stress'] [tests/functional/zvol/zvol_swap] diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run index f3d56acffde0..339361cc2762 100644 --- a/sys/contrib/openzfs/tests/runfiles/linux.run +++ b/sys/contrib/openzfs/tests/runfiles/linux.run @@ -109,7 +109,9 @@ tags = ['functional', 'direct'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', - 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple'] + 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple', + 'zed_synchronous_zedlet', 'slow_vdev_sit_out', 'slow_vdev_sit_out_neg', + 'slow_vdev_degraded_sit_out'] tags = ['functional', 'events'] [tests/functional/fallocate:Linux] @@ -161,7 +163,7 @@ tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', tags = ['functional', 'mmp'] [tests/functional/mount:Linux] -tests = ['umount_unlinked_drain'] +tests = ['umount_unlinked_drain', 'mount_loopback'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] diff --git a/sys/contrib/openzfs/tests/runfiles/sanity.run b/sys/contrib/openzfs/tests/runfiles/sanity.run index 732f252b52d2..b56ffc3a4a2d 100644 --- a/sys/contrib/openzfs/tests/runfiles/sanity.run +++ b/sys/contrib/openzfs/tests/runfiles/sanity.run @@ -195,7 +195,7 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rewrite] -tests = ['zfs_rewrite'] +tests = ['zfs_rewrite', 'zfs_rewrite_physical'] tags = ['functional', 'cli_root', 'zfs_rewrite'] [tests/functional/cli_root/zfs_rollback] @@ -400,8 +400,8 @@ tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', - 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', - 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege', + 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'zarcstat_001_pos', + 'zarcsummary_001_pos', 'zarcsummary_002_neg', 'zpool_wait_privilege', 'zilstat_001_pos'] user = tags = ['functional', 'cli_user', 'misc'] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in b/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in index 5bf13f5c08af..2158208be6e5 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/test-runner.py.in @@ -15,6 +15,7 @@ # # Copyright (c) 2012, 2018 by Delphix. All rights reserved. # Copyright (c) 2019 Datto Inc. +# Copyright (c) 2025, Klara, Inc. # # This script must remain compatible with Python 3.6+. # @@ -372,6 +373,8 @@ User: %s stdout/stderr/merged in its own file. 
""" + timeprefix = datetime.now().strftime('[%FT%T.%f] ') + logname = getpwuid(os.getuid()).pw_name rer = '' if self.reran is True: @@ -383,7 +386,7 @@ User: %s msga = 'Test: %s%s ' % (self.pathname, user) msgb = '[%s] [%s]%s\n' % (self.result.runtime, self.result.result, rer) pad = ' ' * (80 - (len(msga) + len(msgb))) - result_line = msga + pad + msgb + result_line = timeprefix + msga + pad + msgb # The result line is always written to the log file. If -q was # specified only failures are written to the console, otherwise diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index 40f5083d1294..5bc65f993734 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -15,6 +15,7 @@ # # Copyright (c) 2017 by Delphix. All rights reserved. # Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2025, Klara, Inc. # # This script must remain compatible with Python 3.6+. # @@ -231,7 +232,7 @@ maybe = { 'cli_root/zpool_trim/zpool_trim_fault_export_import_online': ['FAIL', known_reason], 'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', 6141], - 'cli_user/misc/arc_summary_001_pos': ['FAIL', known_reason], + 'cli_user/misc/zarcsummary_001_pos': ['FAIL', known_reason], 'delegate/setup': ['SKIP', exec_reason], 'events/zed_cksum_config': ['FAIL', known_reason], 'fault/auto_replace_002_pos': ['FAIL', known_reason], @@ -381,7 +382,8 @@ def process_results(pathname): prefix = '/zfs-tests/tests/(?:functional|perf/regression)/' pattern = \ - r'^Test(?:\s+\(\S+\))?:' + \ + r'^(?:\[[0-9\-T:\.]+\]\s+)?' + \ + r'Test(?:\s+\(\S+\))?:' + \ rf'\s*\S*{prefix}(\S+)' + \ r'\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]' pattern_log = r'^\s*Log directory:\s*(\S*)' diff --git a/sys/contrib/openzfs/tests/test-runner/include/logapi.shlib b/sys/contrib/openzfs/tests/test-runner/include/logapi.shlib index 670ecfefb986..29e0c7f1c9ca 100644 --- a/sys/contrib/openzfs/tests/test-runner/include/logapi.shlib +++ b/sys/contrib/openzfs/tests/test-runner/include/logapi.shlib @@ -25,6 +25,7 @@ # Use is subject to license terms. # # Copyright (c) 2012, 2020 by Delphix. All rights reserved. +# Copyright (c) 2025, Klara, Inc. # STF_PASS=0 @@ -465,7 +466,11 @@ function _endlog function _printline { - echo "$@" + if [[ -n "$ZTS_LOG_SUPPRESS_TIMESTAMP" ]] ; then + printf '[%(%FT%T.%6N)T] %s\n' now "$*" + else + echo "$@" + fi } # Output an error message diff --git a/sys/contrib/openzfs/tests/zfs-tests/callbacks/zfs_dmesg.ksh b/sys/contrib/openzfs/tests/zfs-tests/callbacks/zfs_dmesg.ksh index 73c654125319..de31765a52e4 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/callbacks/zfs_dmesg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/callbacks/zfs_dmesg.ksh @@ -15,6 +15,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. # Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# Copyright (c) 2025, Klara, Inc. 
# # $1: number of lines to output (default: 200) @@ -25,7 +26,11 @@ echo " Tailing last $lines lines of dmesg log" echo "=================================================================" # report and reset afterwards -sudo dmesg -c | tail -n $lines +dmesg_args="-c" +if [[ $(uname) = "Linux" ]] ; then + dmesg_args="$dmesg_args --time-format=iso" +fi +sudo dmesg $dmesg_args | tail -n $lines echo "=================================================================" echo " End of dmesg log" diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore b/sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore index e9a6f8f0ac17..62f1684acfb4 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/.gitignore @@ -23,10 +23,12 @@ /mkfiles /mktree /mmap_exec +/mmap_ftruncate /mmap_libaio /mmap_seek /mmap_sync /mmapwrite +/mmap_write_sync /nvlist_to_lua /randfree_file /randwritecomp diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am index 909a72c43d80..85c3cf3c35a8 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/Makefile.am @@ -72,7 +72,9 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree %C%_mkfile_LDADD = $(LTLIBINTL) -scripts_zfs_tests_bin_PROGRAMS += %D%/mmap_exec %D%/mmap_seek %D%/mmap_sync %D%/mmapwrite %D%/readmmap +scripts_zfs_tests_bin_PROGRAMS += \ + %D%/mmap_exec %D%/mmap_ftruncate %D%/mmap_seek \ + %D%/mmap_sync %D%/mmapwrite %D%/readmmap %D%/mmap_write_sync %C%_mmapwrite_LDADD = -lpthread if WANT_MMAP_LIBAIO diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c index e7b80d01efaa..f8948a61833d 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c @@ -207,7 +207,7 @@ insert_find_remove(zfs_btree_t *bt, char *why) "Found removed value (%llu)\n", *p); return (1); } - ASSERT3S(zfs_btree_numnodes(bt), ==, 0); + ASSERT0(zfs_btree_numnodes(bt)); zfs_btree_verify(bt); return (0); @@ -279,7 +279,7 @@ drain_tree(zfs_btree_t *bt, char *why) node = avl_last(&avl); ASSERT3U(node->data, ==, *(uint64_t *)zfs_btree_last(bt, NULL)); } - ASSERT3S(zfs_btree_numnodes(bt), ==, 0); + ASSERT0(zfs_btree_numnodes(bt)); void *avl_cookie = NULL; while ((node = avl_destroy_nodes(&avl, &avl_cookie)) != NULL) diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/crypto_test.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/crypto_test.c index e08003f80464..c8d8622c7571 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/crypto_test.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/crypto_test.c @@ -529,6 +529,8 @@ static const char *aes_gcm_impl[][2] = { { "aesni", "pclmulqdq" }, { "x86_64", "avx" }, { "aesni", "avx" }, + { "x86_64", "avx2" }, + { "aesni", "avx2" }, }; /* signature of function to call after setting implementation params */ @@ -861,7 +863,8 @@ test_result(const crypto_test_t *test, int encrypt_rv, uint8_t *encrypt_buf, return (pass); /* print summary of test result */ - printf("%s[%lu]: encrypt=%s decrypt=%s\n", test->fileloc, test->id, + printf("%s[%ju]: encrypt=%s decrypt=%s\n", test->fileloc, + (uintmax_t)test->id, encrypt_pass ? "PASS" : "FAIL", decrypt_pass ? 
"PASS" : "FAIL"); diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/draid.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/draid.c index 8d0bdc450f80..2c1ab1b9bec3 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/draid.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/draid.c @@ -204,7 +204,7 @@ write_map(const char *filename, nvlist_t *allcfgs) error = errno; free(buf); free(tmpname); - return (errno); + return (error); } ssize_t rc, bytes = 0; diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/idmap_util.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/idmap_util.c index 416e80714f9b..f332677f520c 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/idmap_util.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/idmap_util.c @@ -301,7 +301,7 @@ static int write_idmap(pid_t pid, char *buf, size_t buf_size, idmap_type_t type) { char path[PATH_MAX]; - int fd = -EBADF; + int fd; int ret; (void) snprintf(path, sizeof (path), "/proc/%d/%cid_map", diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/mktree.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/mktree.c index 297cf6dea415..9a5253468bd2 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/mktree.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/mktree.c @@ -152,7 +152,7 @@ getfdname(char *pdir, char type, int level, int dir, int file) static void crtfile(char *pname) { - int fd = -1; + int fd; int i, size; const char *context = "0123456789ABCDF"; char *pbuf; diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_ftruncate.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_ftruncate.c new file mode 100644 index 000000000000..91cdfe3715e6 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_ftruncate.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Klara, Inc. + */ + +/* + * Tests async writeback behaviour. Creates a file, maps it into memory, and + * dirties every page within it. Then, calls ftruncate() to collapse the file + * back down to 0. This causes the kernel to begin writeback on the dirty + * pages so they can be freed, before it can complete the ftruncate() call. + * None of these are sync operations, so they should avoid the various "force + * flush" codepaths. 
+ */ + +#include <unistd.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <stdlib.h> +#include <stdio.h> + +#define _pdfail(f, l, s) \ + do { perror("[" f "#" #l "] " s); exit(2); } while (0) +#define pdfail(str) _pdfail(__FILE__, __LINE__, str) + +int +main(int argc, char **argv) { + if (argc != 3) { + printf("usage: mmap_ftruncate <file> <size>\n"); + exit(2); + } + + const char *file = argv[1]; + + char *end; + off_t sz = strtoull(argv[2], &end, 0); + if (end == argv[2] || *end != '\0' || sz == 0) { + fprintf(stderr, "E: invalid size"); + exit(2); + } + + int fd = open(file, O_CREAT|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR); + if (fd < 0) + pdfail("open"); + + if (ftruncate(fd, sz) < 0) + pdfail("ftruncate"); + + char *p = mmap(NULL, sz, PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + pdfail("mmap"); + + for (off_t off = 0; off < sz; off += 4096) + p[off] = 1; + + if (ftruncate(fd, 0) < 0) + pdfail("ftruncate"); + + if (munmap(p, sz) < 0) + pdfail("munmap"); + + close(fd); + return (0); +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_write_sync.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_write_sync.c new file mode 100644 index 000000000000..ad5e37f24960 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/mmap_write_sync.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Klara, Inc. 
+ */ + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/mman.h> + +#define PAGES (8) + +int +main(int argc, char **argv) +{ + if (argc != 2) { + fprintf(stderr, "usage: %s <filename>\n", argv[0]); + exit(1); + } + + long page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) { + perror("sysconf"); + exit(2); + } + size_t map_size = page_size * PAGES; + + int fd = open(argv[1], O_CREAT|O_RDWR, S_IRWXU|S_IRWXG|S_IRWXO); + if (fd < 0) { + perror("open"); + exit(2); + } + + if (ftruncate(fd, map_size) < 0) { + perror("ftruncate"); + close(fd); + exit(2); + } + + uint64_t *p = + mmap(NULL, map_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + perror("mmap"); + close(fd); + exit(2); + } + + for (int i = 0; i < (map_size / sizeof (uint64_t)); i++) + p[i] = 0x0123456789abcdef; + + if (msync(p, map_size, MS_SYNC) < 0) { + perror("msync"); + munmap(p, map_size); + close(fd); + exit(3); + } + + munmap(p, map_size); + close(fd); + exit(0); +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/mmapwrite.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/mmapwrite.c index 61fcdc35af13..31d61ffb07d0 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/mmapwrite.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/mmapwrite.c @@ -59,7 +59,7 @@ static void * normal_writer(void *filename) { char *file_path = filename; - int fd = -1; + int fd; ssize_t write_num = 0; int page_size = getpagesize(); @@ -93,7 +93,7 @@ normal_writer(void *filename) static void * map_writer(void *filename) { - int fd = -1; + int fd; int ret = 0; char *buf = NULL; int page_size = getpagesize(); diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg index 1c7e42a06e05..1c4d25e152a7 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg @@ -100,6 +100,7 @@ export SYSTEM_FILES_COMMON='awk uniq vmstat wc + which xargs xxh128sum' @@ -146,6 +147,7 @@ export SYSTEM_FILES_LINUX='attr lscpu lsmod lsscsi + mkfs.xfs mkswap modprobe mountpoint @@ -169,8 +171,8 @@ export ZFS_FILES='zdb zpool ztest raidz_test - arc_summary - arcstat + zarcsummary + zarcstat zilstat dbufstat mount.zfs @@ -205,10 +207,12 @@ export ZFSTEST_FILES='badsend mkfiles mktree mmap_exec + mmap_ftruncate mmap_libaio mmap_seek mmap_sync mmapwrite + mmap_write_sync nvlist_to_lua randfree_file randwritecomp diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib index 4b8db189310f..8b30b9b91641 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib @@ -1085,7 +1085,7 @@ function fill_fs # destdir dirnum filenum bytes num_writes data typeset -i filenum=${3:-50} typeset -i bytes=${4:-8192} typeset -i num_writes=${5:-10240} - typeset data=${6:-0} + typeset data=${6:-"R"} mkdir -p $destdir/{1..$dirnum} for f in $destdir/{1..$dirnum}/$TESTFILE{1..$filenum}; do @@ -1112,6 +1112,16 @@ function get_pool_prop # property pool zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } +# Get the specified vdev property in parsable format or fail +function get_vdev_prop +{ + typeset prop="$1" + typeset pool="$2" + typeset vdev="$3" + + zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev" +} + # Return 0 if a pool exists; $? 
otherwise # # $1 - pool name @@ -1971,6 +1981,28 @@ function wait_vdev_state # pool disk state timeout } # +# Wait for vdev 'sit_out' property to be cleared. +# +# $1 pool name +# $2 vdev name +# $3 timeout +# +function wait_sit_out #pool vdev timeout +{ + typeset pool=${1:-$TESTPOOL} + typeset vdev="$2" + typeset timeout=${3:-300} + for (( timer = 0; timer < $timeout; timer++ )); do + if [ "$(get_vdev_prop sit_out "$pool" "$vdev")" = "off" ]; then + return 0 + fi + sleep 1; + done + + return 1 +} + +# # Check the output of 'zpool status -v <pool>', # and to see if the content of <token> contain the <keyword> specified. # @@ -2881,10 +2913,34 @@ function user_run log_note "user: $user" log_note "cmd: $*" + if ! sudo -Eu $user test -x $PATH ; then + log_note "-------------------------------------------------" + log_note "Warning: $user doesn't have permissions on $PATH" + log_note "" + log_note "This usually happens when you're running ZTS locally" + log_note "from inside the ZFS source dir, and are attempting to" + log_note "run a test that calls user_run. The ephemeral user" + log_note "($user) that ZTS is creating does not have permission" + log_note "to traverse to $PATH, or the binaries in $PATH are" + log_note "not the right permissions." + log_note "" + log_note "To get around this, copy your ZFS source directory" + log_note "to a world-accessible location (like /tmp), and " + log_note "change the permissions on your ZFS source dir " + log_note "to allow access." + log_note "" + log_note "Also, verify that /dev/zfs is RW for others:" + log_note "" + log_note " sudo chmod o+rw /dev/zfs" + log_note "-------------------------------------------------" + fi + typeset out=$TEST_BASE_DIR/out typeset err=$TEST_BASE_DIR/err - sudo -Eu $user env PATH="$PATH" ksh <<<"$*" >$out 2>$err + sudo -Eu $user \ + env PATH="$PATH" ZTS_LOG_SUPPRESS_TIMESTAMP=1 \ + ksh <<<"$*" >$out 2>$err typeset res=$? 
log_note "out: $(<$out)" log_note "err: $(<$err)" diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index 98e14ad97718..54b50c9dba77 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -72,9 +72,12 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes +READ_SIT_OUT_SECS vdev.read_sit_out_secs vdev_read_sit_out_secs +SIT_OUT_CHECK_INTERVAL vdev.raidz_outlier_check_interval_ms vdev_raidz_outlier_check_interval_ms +SIT_OUT_INSENSITIVITY vdev.raidz_outlier_insensitivity vdev_raidz_outlier_insensitivity REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled -REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress -REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment +REMOVAL_SUSPEND_PROGRESS vdev.removal_suspend_progress zfs_removal_suspend_progress +REMOVE_MAX_SEGMENT vdev.remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms RESILVER_DEFER_PERCENT resilver_defer_percent zfs_resilver_defer_percent SCAN_LEGACY scan_legacy zfs_scan_legacy @@ -87,6 +90,8 @@ SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata +SPA_NOTE_TXG_TIME spa.note_txg_time spa_note_txg_time +SPA_FLUSH_TXG_TIME spa.flush_txg_time spa_flush_txg_time TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index d27660a42c5a..1517f90e99a5 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -276,6 +276,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/direct/dio.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ + functional/failmode/failmode.kshlib \ functional/fault/fault.cfg \ functional/gang_blocks/gang_blocks.kshlib \ functional/grow/grow.cfg \ @@ -429,8 +430,6 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ - functional/alloc_class/alloc_class_014_neg.ksh \ - functional/alloc_class/alloc_class_015_pos.ksh \ functional/alloc_class/alloc_class_016_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ @@ -647,6 +646,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zdb/zdb_objset_id.ksh \ functional/cli_root/zdb/zdb_recover_2.ksh \ functional/cli_root/zdb/zdb_recover.ksh \ + functional/cli_root/zdb/zdb_tunables.ksh \ functional/cli_root/zfs_bookmark/cleanup.ksh \ functional/cli_root/zfs_bookmark/setup.ksh \ functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \ @@ -870,6 +870,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ 
functional/cli_root/zfs_rewrite/cleanup.ksh \ functional/cli_root/zfs_rewrite/setup.ksh \ functional/cli_root/zfs_rewrite/zfs_rewrite.ksh \ + functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ @@ -891,6 +892,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_send/zfs_send_raw.ksh \ functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \ functional/cli_root/zfs_send/zfs_send_sparse.ksh \ + functional/cli_root/zfs_send_delegation/cleanup.ksh \ + functional/cli_root/zfs_send_delegation/setup.ksh \ + functional/cli_root/zfs_send_delegation/zfs_send_test.ksh \ functional/cli_root/zfs_set/cache_001_pos.ksh \ functional/cli_root/zfs_set/cache_002_neg.ksh \ functional/cli_root/zfs_set/canmount_001_pos.ksh \ @@ -1008,6 +1012,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zhack/zhack_label_repair_002.ksh \ functional/cli_root/zhack/zhack_label_repair_003.ksh \ functional/cli_root/zhack/zhack_label_repair_004.ksh \ + functional/cli_root/zhack/zhack_metaslab_leak.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ @@ -1177,6 +1182,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \ + functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \ @@ -1240,9 +1246,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ + functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \ + functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh \ + functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ @@ -1292,6 +1301,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \ functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \ + functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh \ functional/cli_root/zpool_trim/zpool_trim_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \ @@ -1346,9 +1356,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool/zpool_002_pos.ksh \ functional/cli_root/zpool/zpool_003_pos.ksh \ 
functional/cli_root/zpool/zpool_colors.ksh \ - functional/cli_user/misc/arcstat_001_pos.ksh \ - functional/cli_user/misc/arc_summary_001_pos.ksh \ - functional/cli_user/misc/arc_summary_002_neg.ksh \ + functional/cli_user/misc/zarcstat_001_pos.ksh \ + functional/cli_user/misc/zarcsummary_001_pos.ksh \ + functional/cli_user/misc/zarcsummary_002_neg.ksh \ functional/cli_user/misc/zilstat_001_pos.ksh \ functional/cli_user/misc/cleanup.ksh \ functional/cli_user/misc/setup.ksh \ @@ -1403,6 +1413,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_user/zfs_list/zfs_list_005_neg.ksh \ functional/cli_user/zfs_list/zfs_list_007_pos.ksh \ functional/cli_user/zfs_list/zfs_list_008_neg.ksh \ + functional/cli_user/zfs_send_delegation_user/cleanup.ksh \ + functional/cli_user/zfs_send_delegation_user/setup.ksh \ + functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh \ functional/cli_user/zpool_iostat/cleanup.ksh \ functional/cli_user/zpool_iostat/setup.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \ @@ -1444,6 +1457,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/crtime/setup.ksh \ functional/crypto/icp_aes_ccm.ksh \ functional/crypto/icp_aes_gcm.ksh \ + functional/ctime/cleanup.ksh \ + functional/ctime/ctime_001_pos.ksh \ + functional/ctime/setup.ksh \ functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_zio.ksh \ @@ -1516,6 +1532,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ + functional/events/slow_vdev_degraded_sit_out.ksh \ + functional/events/slow_vdev_sit_out.ksh \ + functional/events/slow_vdev_sit_out_neg.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ functional/events/zed_diagnose_multiple.ksh \ @@ -1524,6 +1543,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/zed_rc_filter.ksh \ functional/events/zed_slow_io.ksh \ functional/events/zed_slow_io_many_vdevs.ksh \ + functional/events/zed_synchronous_zedlet.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ @@ -1534,6 +1554,14 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/failmode/cleanup.ksh \ functional/failmode/failmode_dmu_tx_wait.ksh \ functional/failmode/failmode_dmu_tx_continue.ksh \ + functional/failmode/failmode_fsync_wait.ksh \ + functional/failmode/failmode_fsync_continue.ksh \ + functional/failmode/failmode_msync_wait.ksh \ + functional/failmode/failmode_msync_continue.ksh \ + functional/failmode/failmode_osync_wait.ksh \ + functional/failmode/failmode_osync_continue.ksh \ + functional/failmode/failmode_syncalways_wait.ksh \ + functional/failmode/failmode_syncalways_continue.ksh \ functional/failmode/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ @@ -1578,6 +1606,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/gang_blocks/gang_blocks_001_pos.ksh \ functional/gang_blocks/gang_blocks_ddt_copies.ksh \ functional/gang_blocks/gang_blocks_redundant.ksh \ + functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \ + functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \ + functional/gang_blocks/gang_blocks_dyn_multi.ksh \ functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ @@ -1667,6 +1698,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ 
functional/mmap/mmap_seek_001_pos.ksh \ functional/mmap/mmap_sync_001_pos.ksh \ functional/mmap/mmap_write_001_pos.ksh \ + functional/mmap/mmap_ftruncate.ksh \ functional/mmap/setup.ksh \ functional/mmp/cleanup.ksh \ functional/mmp/mmp_active_import.ksh \ @@ -1686,6 +1718,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/mmp/setup.ksh \ functional/mount/cleanup.ksh \ functional/mount/setup.ksh \ + functional/mount/mount_loopback.ksh \ functional/mount/umount_001.ksh \ functional/mount/umountall_001.ksh \ functional/mount/umount_unlinked_drain.ksh \ @@ -1915,6 +1948,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/replacement/attach_multiple.ksh \ functional/replacement/attach_rebuild.ksh \ functional/replacement/attach_resilver.ksh \ + functional/replacement/attach_resilver_sit_out.ksh \ functional/replacement/cleanup.ksh \ functional/replacement/detach.ksh \ functional/replacement/rebuild_disabled_feature.ksh \ @@ -1923,6 +1957,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/replacement/replace_import.ksh \ functional/replacement/replace_rebuild.ksh \ functional/replacement/replace_resilver.ksh \ + functional/replacement/replace_resilver_sit_out.ksh \ functional/replacement/resilver_restart_001.ksh \ functional/replacement/resilver_restart_002.ksh \ functional/replacement/scrub_cancel.ksh \ @@ -2224,6 +2259,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/zvol/zvol_stress/cleanup.ksh \ functional/zvol/zvol_stress/setup.ksh \ functional/zvol/zvol_stress/zvol_stress.ksh \ + functional/zvol/zvol_stress/zvol_stress_destroy.ksh \ functional/zvol/zvol_swap/cleanup.ksh \ functional/zvol/zvol_swap/setup.ksh \ functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh index 7d6924b2c9bf..79a431a13238 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh @@ -41,9 +41,4 @@ log_mustnot zpool create $TESTPOOL $ZPOOL_DISKS special mirror \ log_mustnot display_status $TESTPOOL log_mustnot zpool destroy -f $TESTPOOL -log_mustnot zpool create $TESTPOOL raidz $ZPOOL_DISKS special raidz \ - $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2 -log_mustnot display_status $TESTPOOL -log_mustnot zpool destroy -f $TESTPOOL - log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh index 42d5deda3842..961dcd46429a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh @@ -32,7 +32,7 @@ log_onexit cleanup log_must disk_setup -for type in "" "mirror" "raidz" +for type in "" "mirror" "raidz" "draid" do log_must zpool create $TESTPOOL $type $ZPOOL_DISKS @@ -47,6 +47,12 @@ do $CLASS_DISK0 $CLASS_DISK1 log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 + elif [ "$type" = "draid" ]; then + log_must zpool add $TESTPOOL special raidz \ + $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 + log_must zpool iostat -H $TESTPOOL 
$CLASS_DISK2 else log_must zpool add $TESTPOOL special $CLASS_DISK0 log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh index 684b6557e3f1..39ddaad1be5b 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh @@ -37,7 +37,7 @@ typeset ac_value typeset stype="" typeset sdisks="" -for type in "" "mirror" "raidz" +for type in "" "mirror" "raidz" "draid" do if [ "$type" = "mirror" ]; then stype="mirror" @@ -45,6 +45,9 @@ do elif [ "$type" = "raidz" ]; then stype="mirror" sdisks="${CLASS_DISK0} ${CLASS_DISK1}" + elif [ "$type" = "draid" ]; then + stype="raidz" + sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" else stype="" sdisks="${CLASS_DISK0}" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh index 2223bb1c491d..b7e93fc7350e 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh @@ -36,7 +36,7 @@ typeset stype="" typeset sdisks="" typeset props="" -for type in "" "mirror" "raidz" +for type in "" "mirror" "raidz" "draid" do if [ "$type" = "mirror" ]; then stype="mirror" @@ -45,6 +45,9 @@ do elif [ "$type" = "raidz" ]; then stype="mirror" sdisks="${CLASS_DISK0} ${CLASS_DISK1}" + elif [ "$type" = "draid" ]; then + stype="raidz" + sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" else stype="" sdisks="${CLASS_DISK0}" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh index 7f9d108ed184..f7dfd42b0f05 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh @@ -36,7 +36,7 @@ log_must disk_setup log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ $CLASS_DISK0 $CLASS_DISK1 -for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072 +for value in 0 200 512 1300 4096 12345 131072 1572864 16777216 do log_must zfs set special_small_blocks=$value $TESTPOOL ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh index a04e9ca43273..0f90117544a6 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh @@ -22,7 +22,7 @@ # # DESCRIPTION: # Setting the special_small_blocks property to invalid values fails. -# Powers of two from 512 to 1M are allowed. +# Only values between 0 and 16M including are allowed. 
# verify_runnable "global" @@ -36,7 +36,7 @@ log_must disk_setup log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ $CLASS_DISK0 $CLASS_DISK1 -for value in 256 1025 33554432 +for value in 16777217 33554432 4294967296 do log_mustnot zfs set special_small_blocks=$value $TESTPOOL done diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh deleted file mode 100755 index e16b64a964e4..000000000000 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/ksh -p -# SPDX-License-Identifier: CDDL-1.0 - -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib - -# -# DESCRIPTION: -# Setting the special_small_blocks property greater than recordsize fails. -# - -verify_runnable "global" - -claim="Setting the special_small_blocks property greater than recordsize fails" - -log_assert $claim -log_onexit cleanup -log_must disk_setup - -for size in 512 4096 32768 131072 524288 1048576 -do - let bigger=$size*2 - log_mustnot zpool create -O recordsize=$size \ - -O special_small_blocks=$bigger \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 -done - -log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh deleted file mode 100755 index 9d34375b74ca..000000000000 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/ksh -p -# SPDX-License-Identifier: CDDL-1.0 - -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib - -# -# DESCRIPTION: -# Can set special_small_blocks property less than or equal to recordsize. 
-# - -verify_runnable "global" - -claim="Can set special_small_blocks property less than or equal to recordsize" - -log_assert $claim -log_onexit cleanup -log_must disk_setup - -for size in 8192 32768 131072 524288 1048576 -do - let smaller=$size/2 - log_must zpool create -O recordsize=$size \ - -O special_small_blocks=$smaller \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - log_must zpool destroy -f "$TESTPOOL" - - log_must zpool create -O recordsize=$size \ - -O special_small_blocks=$size \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - log_must zpool destroy -f "$TESTPOOL" -done - -log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 54ffdc75669a..4cede26b913a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -41,16 +41,22 @@ function cleanup { datasetexists $TESTPOOL && destroy_pool $TESTPOOL set_tunable64 TXG_TIMEOUT $timeout + log_must restore_tunable BCLONE_WAIT_DIRTY } log_onexit cleanup +log_must save_tunable BCLONE_WAIT_DIRTY + log_must set_tunable64 TXG_TIMEOUT 5000 log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS log_must sync_pool $TESTPOOL true +# Verify fallback to copy when there are dirty blocks +log_must set_tunable32 BCLONE_WAIT_DIRTY 0 + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 @@ -61,5 +67,20 @@ log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) log_must [ "$blocks" = "" ] +log_must rm /$TESTPOOL/file /$TESTPOOL/clone + +# Verify blocks are cloned even when there are dirty blocks +log_must set_tunable32 BCLONE_WAIT_DIRTY 1 + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 + +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "0 1 2 3" ] + log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh new file mode 100755 index 000000000000..b89790d5d525 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_tunables.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Rob Norris <robn@despairlabs.com> +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +log_assert "zdb can work with libzpool tunables" + +# a tunable name by itself, or with the "show" command, produces name and value +log_must eval 'zdb -o zfs_recover | grep -qE "^zfs_recover: 0$"' +log_must eval 'zdb -o show=zfs_recover | grep -qE "^zfs_recover: 0$"' + +# info about a tunable shows a different format +log_must eval 'zdb -o info=zfs_recover | grep -qE "^zfs_recover \[[[:alnum:]_]+ r[dw]]: .+"' + +# "show" by itself shows all the tunables and their values +# this tests limits to 50 tunables, and then counts the number that match +# the format, which should be all of them +log_must test $(zdb -o show | head -50 | grep -cE "^[[:alnum:]_]+: .+") -eq 50 + +# "info" by itself shows info about all tunables +# like previous test, we limit and then count +log_must test $(zdb -o info | head -50 | grep -cE "^[[:alnum:]_]+ \[[[:alnum:]_]+ r[dw]]: .+") -eq 50 + +# can't lookup nonexistent tunables +log_mustnot_expect 'no such tunable: hello' zdb -o hello +log_mustnot_expect 'no such tunable: hello' zdb -o show=hello +log_mustnot_expect 'no such tunable: hello' zdb -o info=hello + +# setting a tunable shows the old and the new value +log_must eval 'zdb -o zfs_recover=1 | grep -qE "^zfs_recover: 0 -> 1$"' + +# replacing a value still sets it +log_must eval 'zdb -o zfs_recover=0 | grep -qE "^zfs_recover: 0 -> 0$"' + +# can't set the "magic" commands +log_mustnot_expect 'no such tunable: 0' zdb -o show=0 +log_mustnot_expect 'no such tunable: 1' zdb -o info=1 + +# can set multiple in same command +log_must eval 'zdb -o zfs_recover=1 -o zfs_flags=512 | xargs | grep -qE "^zfs_recover: 0 -> 1 zfs_flags: 4294932990 -> 512$"' + +# can set and show in same command +log_must eval 'zdb -o zfs_recover=1 -o zfs_recover -o zfs_recover=0 | xargs | grep -qE "^zfs_recover: 0 -> 1 zfs_recover: 1 zfs_recover: 1 -> 0$"' + +log_pass "zdb can work with libzpool tunables" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh new file mode 100755 index 000000000000..142e44f53515 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh @@ -0,0 +1,100 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, iXsystems, Inc. +# + +# DESCRIPTION: +# Verify zfs rewrite -P flag correctly preserves logical birth times. +# +# STRATEGY: +# 1. Create a test file and sync it. +# 2. Create a snapshot to capture the original birth time. +# 3. Test default rewrite behavior (updates logical birth time). +# 4. Test -P flag behavior (preserves logical birth time). +# 5. Verify incremental send behavior difference. + +. $STF_SUITE/include/libtest.shlib + +typeset tmp=$(mktemp) +typeset send_default=$(mktemp) +typeset send_physical=$(mktemp) + +function cleanup +{ + rm -rf $tmp $send_default $send_physical $TESTDIR/* + zfs destroy -R $TESTPOOL/$TESTFS@snap1 2>/dev/null || true + zfs destroy -R $TESTPOOL/$TESTFS@snap2 2>/dev/null || true + zfs destroy -R $TESTPOOL/$TESTFS@snap3 2>/dev/null || true +} + +log_assert "zfs rewrite -P flag correctly preserves logical birth times" + +log_onexit cleanup + +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +# Create test file and initial snapshot +log_must dd if=/dev/urandom of=$TESTDIR/testfile bs=128k count=4 +log_must sync_pool $TESTPOOL +typeset orig_hash=$(xxh128digest $TESTDIR/testfile) +log_must zfs snapshot $TESTPOOL/$TESTFS@snap1 + +# Test default rewrite behavior (updates logical birth time) +log_must zfs rewrite $TESTDIR/testfile +log_must sync_pool $TESTPOOL +typeset default_hash=$(xxh128digest $TESTDIR/testfile) +log_must [ "$orig_hash" = "$default_hash" ] +log_must zfs snapshot $TESTPOOL/$TESTFS@snap2 + +# Test incremental send size - should be large with updated birth time +log_must eval "zfs send -i @snap1 $TESTPOOL/$TESTFS@snap2 > $send_default" +typeset default_size=$(wc -c < $send_default) +log_note "Default rewrite incremental send size: $default_size bytes" + +# Reset the file to original state +log_must zfs rollback -r $TESTPOOL/$TESTFS@snap1 + +# Test -P flag behavior (preserves logical birth time) +log_must zfs rewrite -P $TESTDIR/testfile +log_must sync_pool $TESTPOOL +typeset physical_hash=$(xxh128digest $TESTDIR/testfile) +log_must [ "$orig_hash" = "$physical_hash" ] +log_must zfs snapshot $TESTPOOL/$TESTFS@snap3 + +# Test incremental send size - should be minimal with preserved birth time +log_must eval "zfs send -i @snap1 $TESTPOOL/$TESTFS@snap3 > $send_physical" +typeset physical_size=$(wc -c < $send_physical) +log_note "Physical rewrite incremental send size: $physical_size bytes" + +# Verify that -P flag produces smaller incremental send +if [[ $physical_size -lt $default_size ]]; then + log_note "SUCCESS: -P flag produces smaller incremental send" \ + "($physical_size < $default_size)" +else + log_fail "FAIL: -P flag should produce smaller incremental send" \ + "($physical_size >= $default_size)" +fi + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/cleanup.ksh new file mode 100755 index 000000000000..4a59e15cc693 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/cleanup.ksh @@ -0,0 +1,43 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License 
(the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib + + +poolexists $TESTPOOL1 && \ + destroy_pool $TESTPOOL1 + +del_user $STAFF1 +del_user $STAFF2 +del_group $STAFF_GROUP + +del_user $OTHER1 +del_user $OTHER2 +del_group $OTHER_GROUP + +default_cleanup diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/setup.ksh new file mode 100755 index 000000000000..0978193eddc4 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/setup.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib + +# Create staff group and add two user to it +log_must add_group $STAFF_GROUP +if ! id $STAFF1 > /dev/null 2>&1; then + log_must add_user $STAFF_GROUP $STAFF1 +fi +if ! id $STAFF2 > /dev/null 2>&1; then + log_must add_user $STAFF_GROUP $STAFF2 +fi + +# Create other group and add two user to it +log_must add_group $OTHER_GROUP +if ! id $OTHER1 > /dev/null 2>&1; then + log_must add_user $OTHER_GROUP $OTHER1 +fi +if ! 
id $OTHER2 > /dev/null 2>&1; then + log_must add_user $OTHER_GROUP $OTHER2 +fi +DISK=${DISKS%% *} + +default_raidz_setup $DISKS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/zfs_send_test.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/zfs_send_test.ksh new file mode 100755 index 000000000000..d018f313fae1 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_send_delegation/zfs_send_test.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +# STRATEGY: +# 1. Create a pool (this is done by the test framework) +# 2. Create an encrypted dataset +# 3. Write random data to the encrypted dataset +# 4. Snapshot the dataset +# 5. As root: attempt a send and raw send (both should succeed) +# 6. Create a delegation (zfs allow -u user send testpool/encrypted_dataset) +# 7. As root: attempt a send and raw send (both should succeed) +# 8. Create a delegation (zfs allow -u user send:raw testpool/encrypted_dataset) +# 9. As root: attempt a send and raw send (both should succeed) +# 10. Disable delegation (zfs unallow) +# 11. As root: attempt a send and raw send (both should succeed) +# 12. Clean up (handled by framework) +# +# Tested as a user under ../cli_user/zfs_send_delegation_user/ + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/zfs_create_common.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/properties.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib +. $STF_SUITE/tests/functional/delegate/delegate.cfg + +# create encrypted dataset + +log_must eval "echo $PASSPHRASE | zfs create -o encryption=on -o keyformat=passphrase $TESTPOOL/$TESTFS1" + +# create target dataset for receives +if ! zfs list | grep testfs2 >/dev/null 2>&1; then + dataset_created="TRUE" + log_must zfs create $TESTPOOL/$TESTFS2 +fi + +# create user and group +typeset perms="snapshot,reservation,compression,checksum,userprop,receive" + +log_note "Added permissions to the $OTHER1 user." 
+log_must zfs allow $OTHER1 $perms $TESTPOOL/$TESTFS1 +log_must zfs allow $OTHER1 $perms $TESTPOOL/$TESTFS2 + +# create random data +log_must fill_fs $TESTPOOL/$TESTFS1/child 1 2047 1024 1 R + +# snapshot +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap1 + + +# check baseline send abilities (should pass) +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv0_datastream.$$" +log_must eval "zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv0raw_datastream.$$" + + +# create delegation +log_must zfs allow $OTHER1 send $TESTPOOL/$TESTFS1 + +# attempt send with full allow + +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv1_datastream.$$" +log_must eval "zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv1raw_datastream.$$" + +# create raw delegation +log_must zfs allow $OTHER1 send:raw $TESTPOOL/$TESTFS1 +log_must zfs unallow $OTHER1 send $TESTPOOL/$TESTFS1 + +# test new send abilities (should pass) +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv2_datastream.$$" +log_must eval "zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv2raw_datastream.$$" + + +# disable raw delegation +zfs unallow $OTHER1 send:raw $TESTPOOL/$TESTFS1 +zfs allow $OTHER1 send $TESTPOOL/$TESTFS1 + +# verify original send abilities (should pass) +log_must eval "zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv3_datastream.$$" +log_must eval "zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive $TESTPOOL/$TESTFS2/zfsrecv3raw_datastream.$$" + + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -r \ + destroy_dataset $TESTPOOL/$TESTFS2 -r + +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib index 0f5f6198daf2..0d07b1fd1952 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib @@ -33,13 +33,16 @@ # Test one: # # 1. Create pool on a loopback device with some test data -# 2. Export the pool. -# 3. Corrupt all label checksums in the pool -# 4. Check that pool cannot be imported -# 5. Verify that it cannot be imported after using zhack label repair -u +# 2. Checksum repair should work with a valid TXG. Repeatedly write and +# sync the pool so there are enough transactions for every uberblock +# to have a TXG +# 3. Export the pool. +# 4. Corrupt all label checksums in the pool +# 5. Check that pool cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -u # to ensure that the -u option will quit on corrupted checksums. -# 6. Use zhack label repair -c on device -# 7. Check that pool can be imported and that data is intact +# 7. Use zhack label repair -c on device +# 8. 
Check that pool can be imported and that data is intact # # Test two: # @@ -97,7 +100,7 @@ VIRTUAL_MIRROR_DEVICE= function cleanup_lo { - L_DEVICE="$1" + typeset L_DEVICE="$1" if [[ -e $L_DEVICE ]]; then if is_linux; then @@ -133,9 +136,9 @@ function get_devsize function pick_logop { - L_SHOULD_SUCCEED="$1" + typeset L_SHOULD_SUCCEED="$1" - l_logop="log_mustnot" + typeset l_logop="log_mustnot" if [ "$L_SHOULD_SUCCEED" == true ]; then l_logop="log_must" fi @@ -145,7 +148,9 @@ function pick_logop function check_dataset { - L_SHOULD_SUCCEED="$1" + typeset L_SHOULD_SUCCEED="$1" + + typeset L_LOGOP= L_LOGOP="$(pick_logop "$L_SHOULD_SUCCEED")" "$L_LOGOP" mounted "$TESTPOOL"/"$TESTFS" @@ -170,9 +175,21 @@ function setup_dataset check_dataset true } +function force_transactions +{ + typeset L_TIMES="$1" + typeset i= + for ((i=0; i < L_TIMES; i++)) + do + touch "$TESTDIR"/"test" || return $? + zpool sync -f "$TESTPOOL" || return $? + done + return 0 +} + function get_practical_size { - L_SIZE="$1" + typeset L_SIZE="$1" if [ "$((L_SIZE % LABEL_SIZE))" -ne 0 ]; then echo "$(((L_SIZE / LABEL_SIZE) * LABEL_SIZE))" @@ -183,10 +200,11 @@ function get_practical_size function corrupt_sized_label_checksum { - L_SIZE="$1" - L_LABEL="$2" - L_DEVICE="$3" + typeset L_SIZE="$1" + typeset L_LABEL="$2" + typeset L_DEVICE="$3" + typeset L_PRACTICAL_SIZE= L_PRACTICAL_SIZE="$(get_practical_size "$L_SIZE")" typeset -a L_OFFSETS=("$LABEL_CKSUM_START" \ @@ -201,8 +219,8 @@ function corrupt_sized_label_checksum function corrupt_labels { - L_SIZE="$1" - L_DISK="$2" + typeset L_SIZE="$1" + typeset L_DISK="$2" corrupt_sized_label_checksum "$L_SIZE" 0 "$L_DISK" corrupt_sized_label_checksum "$L_SIZE" 1 "$L_DISK" @@ -212,11 +230,14 @@ function corrupt_labels function try_import_and_repair { - L_REPAIR_SHOULD_SUCCEED="$1" - L_IMPORT_SHOULD_SUCCEED="$2" - L_OP="$3" - L_POOLDISK="$4" + typeset L_REPAIR_SHOULD_SUCCEED="$1" + typeset L_IMPORT_SHOULD_SUCCEED="$2" + typeset L_OP="$3" + typeset L_POOLDISK="$4" + + typeset L_REPAIR_LOGOP= L_REPAIR_LOGOP="$(pick_logop "$L_REPAIR_SHOULD_SUCCEED")" + typeset L_IMPORT_LOGOP= L_IMPORT_LOGOP="$(pick_logop "$L_IMPORT_SHOULD_SUCCEED")" log_mustnot zpool import "$TESTPOOL" -d "$L_POOLDISK" @@ -230,10 +251,10 @@ function try_import_and_repair function prepare_vdev { - L_SIZE="$1" - L_BACKFILE="$2" + typeset L_SIZE="$1" + typeset L_BACKFILE="$2" - l_devname= + typeset l_devname= if truncate -s "$L_SIZE" "$L_BACKFILE"; then if is_linux; then l_devname="$(losetup -f "$L_BACKFILE" --show)" @@ -248,7 +269,7 @@ function prepare_vdev function run_test_one { - L_SIZE="$1" + typeset L_SIZE="$1" VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" log_must test -e "$VIRTUAL_DEVICE" @@ -257,6 +278,9 @@ function run_test_one setup_dataset + # Force 256 extra transactions to ensure all uberblocks are assigned a TXG + log_must force_transactions 256 + log_must zpool export "$TESTPOOL" corrupt_labels "$L_SIZE" "$VIRTUAL_DISK" @@ -272,7 +296,7 @@ function run_test_one function make_mirrored_pool { - L_SIZE="$1" + typeset L_SIZE="$1" VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" log_must test -e "$VIRTUAL_DEVICE" @@ -296,7 +320,7 @@ function export_and_cleanup_vdisk function run_test_two { - L_SIZE="$1" + typeset L_SIZE="$1" make_mirrored_pool "$L_SIZE" @@ -317,7 +341,7 @@ function run_test_two function run_test_three { - L_SIZE="$1" + typeset L_SIZE="$1" make_mirrored_pool "$L_SIZE" @@ -342,7 +366,7 @@ function run_test_three function run_test_four { - L_SIZE="$1" + typeset L_SIZE="$1" 
make_mirrored_pool "$L_SIZE" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh index ce159b555d20..b5b24322f882 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh @@ -18,13 +18,16 @@ # Strategy: # # 1. Create pool on a loopback device with some test data -# 2. Export the pool. -# 3. Corrupt all label checksums in the pool -# 4. Check that pool cannot be imported -# 5. Verify that it cannot be imported after using zhack label repair -u +# 2. Checksum repair should work with a valid TXG. Repeatedly write and +# sync the pool so there are enough transactions for every uberblock +# to have a TXG +# 3. Export the pool. +# 4. Corrupt all label checksums in the pool +# 5. Check that pool cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -u # to ensure that the -u option will quit on corrupted checksums. -# 6. Use zhack label repair -c on device -# 7. Check that pool can be imported and that data is intact +# 7. Use zhack label repair -c on device +# 8. Check that pool can be imported and that data is intact . "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh new file mode 100755 index 000000000000..0d2a39be6b5a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_metaslab_leak.ksh @@ -0,0 +1,70 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack metaslab leak functions correctly +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Gather pool capacity stats +# 3. Generate fragmentation data with zdb +# 4. Destroy the pool +# 5. Create a new pool with the same configuration +# 6. Export the pool +# 7. Apply the fragmentation information with zhack metaslab leak +# 8. Import the pool +# 9. Verify that pool capacity stats match + +. 
"$STF_SUITE"/include/libtest.shlib + +verify_runnable "global" + +function cleanup +{ + zpool destroy $TESTPOOL + rm $tmp +} + +log_onexit cleanup +log_assert "zhack metaslab leak leaks the right amount of space" + +typeset tmp=$(mktemp) + +log_must zpool create $TESTPOOL $DISKS +for i in `seq 1 16`; do + log_must dd if=/dev/urandom of=/$TESTPOOL/f$i bs=1M count=16 + log_must zpool sync $TESTPOOL +done +for i in `seq 2 2 16`; do + log_must rm /$TESTPOOL/f$i +done +for i in `seq 1 16`; do + log_must touch /$TESTPOOL/g$i + log_must zpool sync $TESTPOOL +done + +alloc=$(zpool get -Hpo value alloc $TESTPOOL) +log_must eval "zdb -m --allocated-map $TESTPOOL > $tmp" +log_must zpool destroy $TESTPOOL + +log_must zpool create $TESTPOOL $DISKS +log_must zpool export $TESTPOOL +log_must eval "zhack metaslab leak $TESTPOOL < $tmp" +log_must zpool import $TESTPOOL + +alloc2=$(zpool get -Hpo value alloc $TESTPOOL) + +within_percent $alloc $alloc2 98 || + log_fail "space usage changed too much: $alloc to $alloc2" + +log_pass "zhack metaslab leak behaved correctly" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh index f96d291ccb1c..94ccabeb80a8 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh @@ -50,7 +50,7 @@ function cleanup function check_features { - for state in $(zpool get all $TESTPOOL | \ + for state in $(zpool get all $TESTPOOL | grep -v "dynamic_gang_header" | \ awk '$2 ~ /feature@/ { print $3 }'); do if [[ "$state" != "enabled" && "$state" != "active" ]]; then log_fail "some features are not enabled on new pool" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh index 7366a46f9c81..676aca1a20a5 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh @@ -58,6 +58,9 @@ function check_features return 1; fi else + if [[ "feature@dynamic_gang_header" == "${2}" ]]; then + continue + fi # Failure other features must be enabled or active. 
if [[ "${3}" != "enabled" && "${3}" != "active" ]]; then return 2; diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index cf5e0961f9fd..bdf5fdf85cff 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -91,6 +91,8 @@ typeset -a properties=( "feature@device_rebuild" "feature@draid" "feature@redaction_list_spill" + "feature@dynamic_gang_header" + "feature@physical_rewrite" ) if is_linux || is_freebsd; then @@ -114,5 +116,6 @@ if is_linux || is_freebsd; then "feature@fast_dedup" "feature@longname" "feature@large_microzap" + "feature@block_cloning_endian" ) fi diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh new file mode 100755 index 000000000000..cc7bca5445d2 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh @@ -0,0 +1,131 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Hewlett Packard Enterprise Development LP. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Verify 'zpool initialize -a' works correctly with multiple pools +# +# STRATEGY: +# 1. Create multiple pools. +# 2. Start a initialize operation on all pools using 'zpool initialize -a'. +# 3. Verify that the initializing is active on all pools. +# 4. Wait for the initialize operation to complete. +# 5. Verify that the initialize operation is complete on all pools. +# 6. Start a initializing on all pools using 'zpool initialize -w -a'. +# 7. Verify that the initialize operation is complete on all pools. +# 8. Now test the -u, -c and -s options on multiple pools with -a. +# 9. Verify that the initialize status is correctly updated on all pools. +# + +verify_runnable "global" + +cleanup() { + for pool in {1..4}; do + zpool destroy $TESTPOOL${pool} + rm -rf $TESTDIR${pool} + done + rm -f $DISK1 $DISK2 $DISK3 $DISK4 +} + +log_onexit cleanup + +log_assert "Verify if 'zpool initialize -a' works correctly with multiple pools." 
+ +DEVSIZE='5G' +TESTDIR="$TEST_BASE_DIR/zpool_initialize_multiple_pools" +DISK1="$TEST_BASE_DIR/zpool_disk1.dat" +DISK2="$TEST_BASE_DIR/zpool_disk2.dat" +DISK3="$TEST_BASE_DIR/zpool_disk3.dat" +DISK4="$TEST_BASE_DIR/zpool_disk4.dat" + +truncate -s $DEVSIZE $DISK1 +truncate -s $DEVSIZE $DISK2 +truncate -s $DEVSIZE $DISK3 +truncate -s $DEVSIZE $DISK4 + +for pool in {1..4}; do + DISK[$pool]="$TEST_BASE_DIR/zpool_disk${pool}.dat" + truncate -s $DEVSIZE ${DISK[$pool]} + log_must zpool create $TESTPOOL${pool} ${DISK[$pool]} +done +sync_all_pools + +# Start an initialize operation on all pools using 'zpool initialize -a'. +log_must zpool initialize -a + +# Verify that the initializing is active on all pools. +for pool in {1..4}; do + if [[ -z "$(initialize_progress $TESTPOOL${pool} ${DISK[$pool]})" ]]; then + log_fail "Initializing did not start on pool $TESTPOOL${pool}" + fi +done + +# Wait for the initialize operation to complete on all pools. +for pool in {1..4}; do + log_must zpool wait -t initialize $TESTPOOL${pool} +done + +# Verify that the initialize operation is complete on all pools. +complete_count=$(zpool status -i | grep -c "completed") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have initialize status 'completed', but found ${complete_count}." +fi + +# Start an initialize operation on all pools using 'zpool initialize -w -a'. +log_must zpool initialize -w -a + +# Verify that the initialize operation is complete on all pools. +complete_count=$(zpool status -i | grep -c "completed") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have initialize status 'completed', but found ${complete_count}." +fi + +# Now test the -u, -c and -s options on multiple pools with -a. +log_must zpool initialize -u -a +complete_count=$(zpool status -i | grep -c "uninitialized") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have initialize status 'uninitialized', but found ${complete_count}." +fi + +log_must zpool initialize -a + +for pool in {1..4}; do + if [[ -z "$(initialize_progress $TESTPOOL${pool} ${DISK[$pool]})" ]]; then + log_fail "Initializing did not start on pool $TESTPOOL${pool}" + fi +done + +log_must zpool initialize -a -s +complete_count=$(zpool status -i | grep -c "suspended") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have initialize status 'suspended', but found ${complete_count}." +fi + +log_must zpool initialize -a -c +for pool in {1..4}; do + [[ -z "$(initialize_progress $TESTPOOL${pool} ${DISK[$pool]})" ]] || \ + log_fail "Initialize did not stop on pool $TESTPOOL${pool}" +done + +log_pass "Initialize '-a' works on multiple pools correctly." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh index 431568053472..5ffba803342f 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh @@ -28,6 +28,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2025 Hewlett Packard Enterprise Development LP. # . $STF_SUITE/include/libtest.shlib @@ -46,7 +47,7 @@ verify_runnable "global" set -A args "" "-?" "blah blah" "-%" "--?" 
"-*" "-=" \ - "-a" "-b" "-c" "-d" "-e" "-f" "-g" "-h" "-i" "-j" "-k" "-l" \ + "-b" "-c" "-d" "-e" "-f" "-g" "-h" "-i" "-j" "-k" "-l" \ "-m" "-n" "-o" "-p" "-q" "-r" "-s" "-t" "-u" "-v" "-w" "-x" "-y" "-z" \ "-A" "-B" "-C" "-D" "-E" "-F" "-G" "-H" "-I" "-J" "-K" "-L" \ "-M" "-N" "-O" "-P" "-Q" "-R" "-S" "-T" "-U" "-V" "-W" "-X" "-W" "-Z" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh new file mode 100755 index 000000000000..7f5f8052c8ec --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2025 Klara, Inc. +# Copyright 2025 Mariusz Zaborski <oshogbo@FreeBSD.org> +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify that the date range scrub only scrubs the files that were +# created/modified within a given time slot. +# +# STRATEGY: +# 1. Write a file. +# 2. Force a sync of everything via export/import. +# 3. Wait for one minute. +# 4. Repeat steps 1, 2, and 3 four two times. +# 5. Inject checksum errors into all 3 files. +# 6. Scrub the date range for the first file. +# 7. Verify that the first file is scrubbed. +# 8. Verify that newer files are not scrubbed. +# 9. Repeat steps 6–8 for each of the remaining 2 files. +# + +verify_runnable "global" + +function cleanup +{ + log_must zinject -c all + rm -f $TESTDIR/*_file + log_must restore_tunable SPA_NOTE_TXG_TIME +} + +log_onexit cleanup + +log_assert "Verifiy scrub, -E, and -S show expected status." 
+ +log_must save_tunable SPA_NOTE_TXG_TIME +log_must set_tunable64 SPA_NOTE_TXG_TIME 30 + +typeset -a date_list +for i in `seq 0 2`; do + log_must sleep 60 + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + date_list+=("$(date '+%Y-%m-%d %H:%M')") + + log_must file_write -o create -f"$TESTDIR/${i}_file" \ + -b 512 -c 2048 -dR + + log_must sleep 60 + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + date_list+=("$(date '+%Y-%m-%d %H:%M')") +done + +for i in `seq 0 2`; do + log_must zinject -t data -e checksum -f 100 $TESTDIR/${i}_file +done + +for i in `seq 0 2`; do + log_must zpool scrub -w -S "${date_list[$((i * 2))]}" -E "${date_list[$((i * 2 + 1))]}" $TESTPOOL + log_must eval "zpool status -v $TESTPOOL | grep '${i}_file'" + for j in `seq 0 2`; do + if [ $i == $j ]; then + continue + fi + log_mustnot eval "zpool status -v $TESTPOOL | grep '${j}_file'" + done +done + +log_pass "Verified scrub, -E, and -S show expected status." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh new file mode 100755 index 000000000000..9327df81a5c5 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_date_range_002.ksh @@ -0,0 +1,76 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2025 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify that the timestamp database updates all the tables as expected. +# +# STRATEGY: +# 1. Decrease the note and flush frequency of the txg database. +# 2. Force the pool to sync several txgs +# 3. Verify that there are entries in each of the "month", "day", and +# "minute" tables. +# + +verify_runnable "global" + +function cleanup +{ + log_must restore_tunable SPA_NOTE_TXG_TIME + log_must restore_tunable SPA_FLUSH_TXG_TIME + rm /$TESTPOOL/f1 +} + +log_onexit cleanup + +log_assert "Verifiy timestamp databases all update as expected." 
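# Sketch only (the function name is an assumption): the checks below read the
# MOS object 1 ZAP dump with zdb and pull out the entry count for each
# txg_log_time table; a small helper form of that extraction could be:
function txg_log_time_entries # <pool> <table: minutes|days|months>
{
        zdb -dddd "$1" 1 | awk -v tbl="txg_log_time:$2" '$0 ~ tbl { print $5 }'
}
# Hypothetical usage: [[ "$(txg_log_time_entries $TESTPOOL minutes)" -ne 0 ]]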
+ +log_must save_tunable SPA_NOTE_TXG_TIME +log_must set_tunable64 SPA_NOTE_TXG_TIME 1 +log_must save_tunable SPA_FLUSH_TXG_TIME +log_must set_tunable64 SPA_FLUSH_TXG_TIME 1 + +log_must touch /$TESTPOOL/f1 +log_must zpool sync $TESTPOOL +sleep 1 +log_must touch /$TESTPOOL/f1 +log_must zpool sync $TESTPOOL +sleep 1 +log_must touch /$TESTPOOL/f1 +log_must zpool sync $TESTPOOL + +mos_zap="$(zdb -dddd $TESTPOOL 1)" +minutes_entries=$(echo "$mos_zap" | grep "txg_log_time:minutes" | awk '{print $5}') +days_entries=$(echo "$mos_zap" | grep "txg_log_time:days" | awk '{print $5}') +months_entries=$(echo "$mos_zap" | grep "txg_log_time:months" | awk '{print $5}') + +[[ "$minutes_entries" -ne "0" ]] || log_fail "0 entries in the minutes table" +[[ "$days_entries" -ne "0" ]] || log_fail "0 entries in the days table" +[[ "$months_entries" -ne "0" ]] || log_fail "0 entries in the months table" + +log_pass "Verified all timestamp databases had entries as expected." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh new file mode 100755 index 000000000000..b8647e208644 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh @@ -0,0 +1,128 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Hewlett Packard Enterprise Development LP. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify 'zpool scrub -a' works correctly with multiple pools +# +# STRATEGY: +# 1. Create multiple pools. +# 2. Start a scrub on all pools using 'zpool scrub -a'. +# 3. Verify that the scrub is running on all pools. +# 4. Wait for the scrub to complete. +# 5. Verify that the scrub status is complete on all pools. +# 6. Start a scrub on all pools using 'zpool scrub -w -a'. +# 7. Verify that the scrub status is complete on all pools. +# 8. Now test the -p and -s options on multiple pools with -a. +# 9. Verify that the scrub status is correct for each option. +# + +verify_runnable "global" + +cleanup() { + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + for pool in {1..4}; do + zpool destroy $TESTPOOL${pool} + rm -rf $TESTDIR${pool} + done + rm -f $DISK1 $DISK2 $DISK3 $DISK4 + # Import the testpool + zpool import -a +} + +log_onexit cleanup + +log_assert "Verify if scrubbing multiple pools works correctly." + +# Export the testpool created by setup and Import them later. 
+log_must zpool export -a + +DEVSIZE='128m' +FILESIZE='50m' +TESTDIR="$TEST_BASE_DIR/zpool_scrub_multiple_pools" +DISK1="$TEST_BASE_DIR/zpool_disk1.dat" +DISK2="$TEST_BASE_DIR/zpool_disk2.dat" +DISK3="$TEST_BASE_DIR/zpool_disk3.dat" +DISK4="$TEST_BASE_DIR/zpool_disk4.dat" + +truncate -s $DEVSIZE $DISK1 +truncate -s $DEVSIZE $DISK2 +truncate -s $DEVSIZE $DISK3 +truncate -s $DEVSIZE $DISK4 + +for pool in {1..4}; do + DISK[$pool]="$TEST_BASE_DIR/zpool_disk${pool}.dat" + truncate -s $DEVSIZE ${DISK[$pool]} + log_must zpool create -O mountpoint=$TESTDIR${pool} $TESTPOOL${pool} ${DISK[$pool]} + log_must zfs create -o compression=off $TESTPOOL${pool}/testfs${pool} + typeset mntpnt=$(get_prop mountpoint $TESTPOOL${pool}/testfs${pool}) + # Fill some data into the filesystem. + log_must mkfile $FILESIZE $mntpnt/file${pool}.dat +done +sync_all_pools + +# Start a scrub on all pools using 'zpool scrub -a'. +log_must zpool scrub -a +# Wait for the scrub to complete on all pools. +for pool in {1..4}; do + log_must zpool wait -t scrub $TESTPOOL${pool} +done + +# Verify that the scrub status is complete on all pools. +complete_count=$(zpool status -v | grep -c "scrub repaired") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have scrub status 'scrub repaired', but found $complete_count." +fi + +# Start a error scrub on all pools using 'zpool scrub -w -a' +log_must zpool scrub -w -a + +# Verify that the scrub status is complete on all pools. +complete_count=$(zpool status -v | grep -c "scrub repaired") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have scrub status 'scrub repaired', but found $complete_count." +fi + +# Now test the -p and -s options on multiple pools with -a. +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool scrub -a +complete_count=$(zpool status -v | grep -c "scrub in progress since") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have scrub status 'scrub in progress since', but found $complete_count." +fi + +log_must zpool scrub -a -p +complete_count=$(zpool status -v | grep -c "scrub paused since") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have scrub status 'scrub paused since', but found $complete_count." +fi + +log_must zpool scrub -a -s +complete_count=$(zpool status -v | grep -c "scrub canceled") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have scrub status 'scrub canceled', but found $complete_count." +fi + +log_pass "Scrubbing multiple pools works correctly." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh new file mode 100755 index 000000000000..4348eecc698c --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh @@ -0,0 +1,123 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Hewlett Packard Enterprise Development LP. +# + +. 
$STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib + +# +# DESCRIPTION: +# Verify 'zpool trim -a' works correctly with multiple pools +# +# STRATEGY: +# 1. Create multiple pools. +# 2. Start a trim on all pools using 'zpool trim -a'. +# 3. Verify that the trim is started on all pools. +# 4. Wait for the trim to complete. +# 5. Verify that the trim is complete on all pools. +# 6. Start a trim on all pools using 'zpool trim -w -a'. +# 7. Verify that the trim is complete on all pools. +# 8. Now test the -c and -s options on multiple pools with -a. +# 9. Verify that the trim status is correct for each option. +# + +verify_runnable "global" + +cleanup() { + for pool in {1..4}; do + zpool destroy $TESTPOOL${pool} + rm -rf $TESTDIR${pool} + done + rm -f $DISK1 $DISK2 $DISK3 $DISK4 +} + +log_onexit cleanup + +log_assert "Verify if trim '-a' works on multiple pools correctly." + +DEVSIZE='5G' +TESTDIR="$TEST_BASE_DIR/zpool_trim_multiple_pools" +DISK1="$TEST_BASE_DIR/zpool_disk1.dat" +DISK2="$TEST_BASE_DIR/zpool_disk2.dat" +DISK3="$TEST_BASE_DIR/zpool_disk3.dat" +DISK4="$TEST_BASE_DIR/zpool_disk4.dat" + +truncate -s $DEVSIZE $DISK1 +truncate -s $DEVSIZE $DISK2 +truncate -s $DEVSIZE $DISK3 +truncate -s $DEVSIZE $DISK4 + +for pool in {1..4}; do + DISK[$pool]="$TEST_BASE_DIR/zpool_disk${pool}.dat" + truncate -s $DEVSIZE ${DISK[$pool]} + log_must zpool create $TESTPOOL${pool} ${DISK[$pool]} +done +sync_all_pools + +# Start a trim on all pools using 'zpool trim -a'. +log_must zpool trim -a + +# Verify that the trim is started on all pools. +for pool in {1..4}; do + [[ -z "$(trim_progress $TESTPOOL${pool} ${DISK[$pool]})" ]] && \ + log_fail "Trim did not start on pool $TESTPOOL${pool}" +done + +# Wait for the trim to complete on all pools. +for pool in {1..4}; do + log_must zpool wait -t trim $TESTPOOL${pool} +done + +# Verify that the trim status is complete on all pools. +complete_count=$(zpool status -t | grep -c "completed") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have trim status 'completed', but found ${complete_count}." +fi + +# Start a trim on all pools using 'zpool trim -w -a' +log_must zpool trim -w -a + +# Verify that the trim status is complete on all pools. +complete_count=$(zpool status -t | grep -c "completed") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have trim status 'completed', but found ${complete_count}." +fi + +# Now test the -s and -c options on multiple pools with -a. +log_must zpool trim -r 1 -a + +for pool in {1..4}; do + [[ -z "$(trim_progress $TESTPOOL${pool} ${DISK[$pool]})" ]] && \ + log_fail "Trim did not start" +done + +log_must zpool trim -a -s +complete_count=$(zpool status -t | grep -c "suspended") +if [[ $complete_count -ne 4 ]]; then + log_fail "Expected 4 pools to have trim status 'suspended', but found $complete_count." +fi + +log_must zpool trim -a -c +for pool in {1..4}; do + [[ -z "$(trim_progress $TESTPOOL${pool} ${DISK[$pool]})" ]] || \ + log_fail "TRIM did not stop on pool $TESTPOOL${pool}" +done + +log_pass "Trim '-a' works on multiple pools correctly." 
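# The three new multi-pool tests above (initialize, scrub and trim) share the
# same shape: run the subcommand with -a, then count a status keyword across
# every pool and fail unless all of them report it. A hedged, generic sketch
# of that shared assertion (the helper name is an assumption drawn from the
# tests above, not an existing library function):
function assert_all_pools_report # <status-flag: -i|-t|-v> <keyword> <expected-count>
{
        typeset -i found=$(zpool status "$1" | grep -c "$2")
        [[ $found -eq $3 ]] || \
            log_fail "Expected $3 pools reporting '$2', found $found"
}
# Hypothetical usage, mirroring the checks in the tests above:
#   assert_all_pools_report -t "completed" 4
#   assert_all_pools_report -v "scrub repaired" 4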
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/arcstat_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zarcstat_001_pos.ksh index 700bd9a6f529..d63b72f8039d 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/arcstat_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zarcstat_001_pos.ksh @@ -37,7 +37,7 @@ log_assert "arcstat generates output and doesn't return an error code" typeset -i i=0 while [[ $i -lt ${#args[*]} ]]; do - log_must eval "arcstat ${args[i]} > /dev/null" + log_must eval "zarcstat ${args[i]} > /dev/null" ((i = i + 1)) done log_pass "arcstat generates output and doesn't return an error code" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_001_pos.ksh index 0840878fdb0d..b7faac5243c9 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_001_pos.ksh @@ -30,16 +30,16 @@ is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing" -log_assert "arc_summary generates output and doesn't return an error code" +log_assert "zarcsummary generates output and doesn't return an error code" # Without this, the below checks aren't going to work the way we hope... set -o pipefail for arg in "" "-a" "-d" "-p 1" "-g" "-s arc" "-r"; do - log_must eval "arc_summary $arg > /dev/null" + log_must eval "zarcsummary $arg > /dev/null" done -log_must eval "arc_summary | head > /dev/null" -log_must eval "arc_summary | head -1 > /dev/null" +log_must eval "zarcsummary | head > /dev/null" +log_must eval "zarcsummary | head -1 > /dev/null" -log_pass "arc_summary generates output and doesn't return an error code" +log_pass "zarcsummary generates output and doesn't return an error code" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_002_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_002_neg.ksh index ec4abe35409d..227646777ba0 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_002_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zarcsummary_002_neg.ksh @@ -30,10 +30,10 @@ is_freebsd && ! 
python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing" -log_assert "arc_summary generates an error code with invalid options" +log_assert "zarcsummary generates an error code with invalid options" for arg in "-x" "-5" "-p 7" "--err" "-@"; do - log_mustnot eval "arc_summary $arg > /dev/null" + log_mustnot eval "zarcsummary $arg > /dev/null" done -log_pass "arc_summary generates an error code with invalid options" +log_pass "zarcsummary generates an error code with invalid options" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/cleanup.ksh new file mode 100755 index 000000000000..4a59e15cc693 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/cleanup.ksh @@ -0,0 +1,43 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/delegate/delegate_common.kshlib + + +poolexists $TESTPOOL1 && \ + destroy_pool $TESTPOOL1 + +del_user $STAFF1 +del_user $STAFF2 +del_group $STAFF_GROUP + +del_user $OTHER1 +del_user $OTHER2 +del_group $OTHER_GROUP + +default_cleanup diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/setup.ksh new file mode 100755 index 000000000000..0978193eddc4 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/setup.ksh @@ -0,0 +1,50 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/delegate/delegate_common.kshlib + +# Create staff group and add two user to it +log_must add_group $STAFF_GROUP +if ! id $STAFF1 > /dev/null 2>&1; then + log_must add_user $STAFF_GROUP $STAFF1 +fi +if ! id $STAFF2 > /dev/null 2>&1; then + log_must add_user $STAFF_GROUP $STAFF2 +fi + +# Create other group and add two user to it +log_must add_group $OTHER_GROUP +if ! id $OTHER1 > /dev/null 2>&1; then + log_must add_user $OTHER_GROUP $OTHER1 +fi +if ! id $OTHER2 > /dev/null 2>&1; then + log_must add_user $OTHER_GROUP $OTHER2 +fi +DISK=${DISKS%% *} + +default_raidz_setup $DISKS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh new file mode 100755 index 000000000000..f62f2b07929c --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/zfs_send_delegation_user/zfs_send_usertest.ksh @@ -0,0 +1,145 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara Inc. +# + +# STRATEGY: +# 1. Create a pool (this is done by the test framework) +# 2. Create a user +# 3. Create an encrypted dataset +# 4. Write random data to the encrypted dataset +# 5. Snapshot the dataset +# 6. As root: attempt a send and raw send (both should succeed) +# 7. As user: attempt a send and raw send (both should fail, no permission) +# 8. Create a delegation (zfs allow -u user send testpool/encrypted_dataset) +# 9. As root: attempt a send and raw send (both should succeed) +# 10. As user: attempt a send and raw send (both should succeed) +# 11. Create a delegation (zfs allow -u user sendraw testpool/encrypted_dataset) +# 12. As root: attempt a send and raw send (both should succeed) +# 13. As user: attempt a send and raw send (send should fail, raw send should succeed) +# 14. Disable delegation (zfs unallow) +# 15. As root: attempt a send and raw send (both should succeed) +# 16. As user: attempt a send and raw send (both should fail, no permission) +# 17. Clean up (handled by framework) +# root tests to verify this doesnt affect root user under ../cli_root/zfs_send_delegation/ +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/zfs_create_common.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_create/properties.kshlib +. $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib +. 
$STF_SUITE/tests/functional/delegate/delegate.cfg + +# create encrypted dataset + +log_must eval "echo $PASSPHRASE | zfs create -o encryption=on -o keyformat=passphrase $TESTPOOL/$TESTFS1" + +# create target dataset for receives +log_must zfs create $TESTPOOL/$TESTFS2 + +# set user perms +# need to run chown for fs permissions for $OTHER1 +typeset perms="snapshot,reservation,compression,checksum,userprop,receive,mount,create" + +log_must zfs allow $OTHER1 $perms $TESTPOOL/$TESTFS1 +log_must zfs allow $OTHER1 $perms $TESTPOOL/$TESTFS2 +log_must chown ${OTHER1}:${OTHER_GROUP} /$TESTPOOL/$TESTFS2 + +# create random data +log_must fill_fs $TESTPOOL/$TESTFS1/child 1 2047 1024 1 R + +# snapshot +log_must zfs snapshot $TESTPOOL/$TESTFS1@snap1 + +# note +# we need to use `sh -c` here becuase the quoting on <<<"$*" in the user_run wrapper is broken once pipes and redirects get involved + +# check baseline send abilities (should fail) +log_mustnot user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv0_user_datastream.$$'" +# verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv0_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv0_user_datastream !" +fi +log_mustnot user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv0raw_user_datastream.$$'" +# verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv0raw_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv0raw_user_datastream !" +fi + +# create delegation +log_must zfs allow $OTHER1 send $TESTPOOL/$TESTFS1 + +# attempt send with full allow (should pass) +log_must user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv1_user_datastream.$$'" +log_must user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv1raw_user_datastream.$$'" + + +# create raw delegation +log_must zfs allow $OTHER1 send:raw $TESTPOOL/$TESTFS1 +# We have to remove 'send' to confirm 'send raw' only allows what we want +log_must zfs unallow -u $OTHER1 send $TESTPOOL/$TESTFS1 + +# test new sendraw abilities (send should fail, sendraw should pass) +log_mustnot user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv2_user_datastream.$$'" + verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv2_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv2_user_datastream !" 
+fi +log_must user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv2raw_user_datastream.$$'" + +# disable raw delegation +log_must zfs unallow -u $OTHER1 send:raw $TESTPOOL/$TESTFS1 +log_must zfs allow $OTHER1 send $TESTPOOL/$TESTFS1 + +# test with raw taken away (should pass) +log_must user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv3_user_datastream.$$'" +log_must user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv3raw_user_datastream.$$'" + +# disable send abilities +log_must zfs unallow -u $OTHER1 send $TESTPOOL/$TESTFS1 + +# verify original send abilities (should fail) +log_mustnot user_run $OTHER1 sh -c "'zfs send $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv4_user_datastream.$$'" + verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv4_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv4_user_datastream !" +fi +log_mustnot user_run $OTHER1 sh -c "'zfs send -w $TESTPOOL/$TESTFS1@snap1 | zfs receive -u $TESTPOOL/$TESTFS2/zfsrecv4raw_user_datastream.$$'" + verify nothing went through +if [ -s $TESTPOOL/$TESTFS2/zfsrecv4raw_user_datastream.$$ ] +then + log_fail "A zfs recieve was completed in $TESTPOOL/$TESTFS2/zfsrecv4raw_user_datastream !" +fi + + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + destroy_dataset $TESTPOOL/$TESTFS1 -r \ + destroy_dataset $TESTPOOL/$TESTFS2 -r + +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh index 8f3e6d12e53d..449dedacb307 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -56,7 +56,7 @@ function cleanup { datasetexists $TESTPOOL/cp-reflink && \ destroy_dataset $$TESTPOOL/cp-reflink -f - log_must set_tunable32 BCLONE_WAIT_DIRTY 0 + log_must restore_tunable BCLONE_WAIT_DIRTY } function verify_copy @@ -81,6 +81,8 @@ SRC_SIZE=$((1024 + $RANDOM % 1024)) # A smaller recordsize is used merely to speed up the test. RECORDSIZE=4096 +log_must save_tunable BCLONE_WAIT_DIRTY + log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink) diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh new file mode 100755 index 000000000000..d5feb6936b4b --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_degraded_sit_out.ksh @@ -0,0 +1,106 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2025 by Klara, Inc. + +# DESCRIPTION: +# Verify that vdevs 'sit out' when they are slow +# +# STRATEGY: +# 1. Create various raidz/draid pools +# 2. Degrade/fault one of the disks. +# 3. Inject delays into one of the disks +# 4. Verify disk is set to 'sit out' for awhile. +# 5. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted. +# + +. $STF_SUITE/include/libtest.shlib + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev.$$.* +} + +log_assert "Verify sit_out works" + +log_onexit cleanup + +# shorten sit out period for testing +save_tunable READ_SIT_OUT_SECS +set_tunable32 READ_SIT_OUT_SECS 5 + +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9} + +for raidtype in raidz2 raidz3 draid2 draid3 ; do + log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9} + log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0" + log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400 + log_must zpool export $TESTPOOL2 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2 + + BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9 + SLOW_VDEV=$TEST_BASE_DIR/vdev.$$.8 + + # Initial state should not be sitting out + log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off" ]] + + # Delay our reads 200ms to trigger sit out + log_must zinject -d $SLOW_VDEV -D200:1 -T read $TESTPOOL2 + type=$((RANDOM % 2)) + [[ "$type" -eq "0" ]] && action="degrade" || action="fault" + log_must zinject -d $BAD_VDEV -A $action -T read $TESTPOOL2 + + # Do some reads and wait for us to sit out + for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "on" + + # Clear fault injection + log_must zinject -c all + + # Wait for us to exit our sit out period + log_must wait_sit_out $TESTPOOL2 $SLOW_VDEV 10 + + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $SLOW_VDEV)" == "off" + destroy_pool $TESTPOOL2 + log_must zpool labelclear -f $BAD_VDEV +done + +log_pass "sit_out works correctly" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh new file mode 100755 index 000000000000..37f616cf56ee --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. + +# DESCRIPTION: +# Verify that vdevs 'sit out' when they are slow +# +# STRATEGY: +# 1. Create various raidz/draid pools +# 2. Inject delays into one of the disks +# 3. Verify disk is set to 'sit out' for awhile. +# 4. Wait for READ_SIT_OUT_SECS and verify sit out state is lifted. +# + +. $STF_SUITE/include/libtest.shlib + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev.$$.* +} + +log_assert "Verify sit_out works" + +log_onexit cleanup + +# shorten sit out period for testing +save_tunable READ_SIT_OUT_SECS +set_tunable32 READ_SIT_OUT_SECS 5 + +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +log_must truncate -s200M $TEST_BASE_DIR/vdev.$$.{0..9} + +for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do + log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9} + log_must zpool set autosit=on $TESTPOOL2 "${raidtype}-0" + log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=600 + log_must zpool export $TESTPOOL2 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2 + + BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9 + + # Initial state should not be sitting out + log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" ]] + + # Delay our reads 200ms to trigger sit out + log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2 + + # Do some reads and wait for us to sit out + for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 200)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on" + + # Clear fault injection + log_must zinject -c all + + # Wait for us to exit our sit out period + log_must wait_sit_out $TESTPOOL2 $BAD_VDEV 10 + + # Verify sit_out was cleared during wait_sit_out + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" + + destroy_pool $TESTPOOL2 +done + +log_pass "sit_out works correctly" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh new file mode 100755 index 000000000000..457105a66453 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out_neg.ksh @@ -0,0 +1,116 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2025 by Klara, Inc. + +# DESCRIPTION: +# Verify that we don't sit out too many vdevs +# +# STRATEGY: +# 1. Create draid2 pool +# 2. Inject delays into three of the disks +# 3. Do reads to trigger sit-outs +# 4. Verify exactly 2 disks sit out +# + +. $STF_SUITE/include/libtest.shlib + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev.$$.* +} + +log_assert "Verify sit_out works" + +log_onexit cleanup + +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9} + +log_must zpool create $TESTPOOL2 draid2 $TEST_BASE_DIR/vdev.$$.{0..9} +log_must zpool set autosit=on $TESTPOOL2 draid2-0 +log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=400 +log_must zpool export $TESTPOOL2 +log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2 + +BAD_VDEV1=$TEST_BASE_DIR/vdev.$$.7 +BAD_VDEV2=$TEST_BASE_DIR/vdev.$$.8 +BAD_VDEV3=$TEST_BASE_DIR/vdev.$$.9 + +# Initial state should not be sitting out +log_must eval [[ "$(get_vdev_prop autosit $TESTPOOL2 draid2-0)" == "on" ]] +log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "off" ]] +log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "off" ]] +log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off" ]] + +# Delay our reads 200ms to trigger sit out +log_must zinject -d $BAD_VDEV1 -D200:1 -T read $TESTPOOL2 + +# Do some reads and wait for us to sit out +for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1) + if [[ "$sit_out" == "on" ]] ; then + break + fi +done +log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV1)" == "on" + +log_must zinject -d $BAD_VDEV2 -D200:1 -T read $TESTPOOL2 +# Do some reads and wait for us to sit out +for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2) + if [[ "$sit_out" == "on" ]] ; then + break + fi +done +log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV2)" == "on" + +log_must zinject -d $BAD_VDEV3 -D200:1 -T read $TESTPOOL2 +# Do some reads and wait for us to sit out +for i in {0..99} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=2M count=1 of=/dev/null & + dd if=/$TESTPOOL2/bigfile skip=$((i + 100)) bs=2M count=1 of=/dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3) + if [[ "$sit_out" == "on" ]] ; then + break + fi +done +log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV3)" == "off" + + 
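# Possible refactor sketch (illustrative only, not part of the committed
# test): the three near-identical "read until sit_out flips on" loops above
# could be expressed as one helper that issues reads and polls the vdev
# property between attempts.
function read_until_sit_out # <pool> <vdev> <max-iterations>
{
        typeset -i i
        for ((i = 0; i < $3; i++)); do
                dd if=/$1/bigfile skip=$i bs=2M count=1 of=/dev/null
                if [[ "$(get_vdev_prop sit_out $1 $2)" == "on" ]]; then
                        return 0
                fi
        done
        return 1
}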
+log_pass "sit_out works correctly" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh new file mode 100755 index 000000000000..6b732ea96d0c --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/zed_synchronous_zedlet.ksh @@ -0,0 +1,149 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. +# + +# DESCRIPTION: +# Verify ZED synchronous zedlets work as expected +# +# STRATEGY: +# 1. Create a scrub_start zedlet that runs quickly +# 2. Create a scrub_start zedlet that runs slowly (takes seconds) +# 3. Create a scrub_finish zedlet that is synchronous and runs slowly +# 4. Create a trim_start zedlet that runs quickly +# 4. Scrub the pool +# 5. Trim the pool +# 6. Verify the synchronous scrub_finish zedlet waited for the scrub_start +# zedlets to finish (including the slow one). If the scrub_finish zedlet +# was not synchronous, it would have completed before the slow scrub_start +# zedlet. +# 7. Verify the trim_start zedlet waited for the slow synchronous scrub_finish +# zedlet to complete. + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/events/events_common.kshlib + +verify_runnable "both" + +OUR_ZEDLETS="scrub_start-async.sh scrub_start-slow.sh scrub_finish-sync-slow.sh trim_start-async.sh" + +OUTFILE="$TEST_BASE_DIR/zed_synchronous_zedlet_lines" +TESTPOOL2=testpool2 + +function cleanup +{ + zed_stop + + for i in $OUR_ZEDLETS ; do + log_must rm -f $ZEDLET_DIR/$i + done + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev-file-sync-zedlet + log_must rm -f $OUTFILE +} + +log_assert "Verify ZED synchronous zedlets work as expected" + +log_onexit cleanup + +# Make a pool +log_must truncate -s 100M $TEST_BASE_DIR/vdev-file-sync-zedlet +log_must zpool create $TESTPOOL2 $TEST_BASE_DIR/vdev-file-sync-zedlet + +# Do an initial scrub +log_must zpool scrub -w $TESTPOOL2 + +log_must zpool events -c + +mkdir -p $ZEDLET_DIR + +# Create zedlets +cat << EOF > $ZEDLET_DIR/scrub_start-async.sh +#!/bin/ksh -p +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/scrub_start-slow.sh +#!/bin/ksh -p +sleep 3 +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/scrub_finish-sync-slow.sh +#!/bin/ksh -p +sleep 3 +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +cat << EOF > $ZEDLET_DIR/trim_start-async.sh +#!/bin/ksh -p +echo "\$(date) \$(basename \$0)" >> $OUTFILE +EOF + +for i in $OUR_ZEDLETS ; do + log_must chmod +x $ZEDLET_DIR/$i +done + +log_must zed_start + +# Do a scrub - it should be instantaneous. +log_must zpool scrub -w $TESTPOOL2 + +# Start off a trim immediately after scrubiung. The trim should be +# instantaneous and generate a trimp_start event. This will happen in parallel +# with the slow 'scrub_finish-sync-slow.sh' zedlet still running. +log_must zpool trim -w $TESTPOOL2 + +# Wait for scrub_finish event to happen for sanity. This is the *event*, not +# the completion of zedlets for the event. +log_must file_wait_event $ZED_DEBUG_LOG 'sysevent\.fs\.zfs\.trim_finish' 10 + +# At a minimum, scrub_start-slow.sh + scrub_finish-sync-slow.sh will take a +# total of 6 seconds to run, so wait 7 sec to be sure. +sleep 7 + +# If our zedlets were run in the right order, with sync correctly honored, you +# will see this ordering in $OUTFILE: +# +# Fri May 16 12:04:23 PDT 2025 scrub_start-async.sh +# Fri May 16 12:04:26 PDT 2025 scrub_start-slow.sh +# Fri May 16 12:04:31 PDT 2025 scrub_finish-sync-slow.sh +# Fri May 16 12:04:31 PDT 2025 trim_start-async.sh +# +# Check for this ordering + +# Get a list of just the script names in the order they were executed +# from OUTFILE +lines="$(echo $(grep -Eo '(scrub|trim)_.+\.sh$' $OUTFILE))" + +# Compare it to the ordering we expect +expected="\ +scrub_start-async.sh \ +scrub_start-slow.sh \ +scrub_finish-sync-slow.sh \ +trim_start-async.sh" +log_must test "$lines" == "$expected" + +log_pass "Verified synchronous zedlets" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode.kshlib new file mode 100644 index 000000000000..d0b7404557ab --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode.kshlib @@ -0,0 +1,149 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +typeset -A failmode_sync_helper_cmd=( + ["fsync"]='dd if=/dev/urandom of=DATAFILE bs=128k count=1 conv=fsync' + ["msync"]='mmap_write_sync DATAFILE' + ["osync"]='dd if=/dev/urandom of=DATAFILE bs=128k count=1 oflag=sync' + ["syncalways"]='dd if=/dev/urandom of=DATAFILE bs=128k count=1' +) + +typeset -A failmode_sync_helper_dsopts=( + ["syncalways"]="-o sync=always" +) + +function failmode_sync_cleanup +{ + zinject -c all || true + zpool clear $TESTPOOL || true + destroy_pool $TESTPOOL +} + +# +# failmode_sync_test <failmode> <helper> +# +# run a failmode sync test: +# - failmode: wait|continue +# - helper: fsync|msync|osync|syncalways +# +function failmode_sync_test +{ + typeset failmode=$1 + typeset helper=$2 + + # we'll need two disks, one for the main pool, one for the log + read -r DISK1 DISK2 _ <<<"$DISKS" + + # file to write to the pool + typeset datafile="/$TESTPOOL/$TESTFS/datafile" + + # create a single-disk pool with a separate log and the wanted failmode + log_must zpool create \ + -f -o failmode=$failmode $TESTPOOL $DISK1 log $DISK2 + + # create the test dataset. we bias the ZIL towards the log device to + # try to ensure that the sync write never involves the main device + log_must zfs create \ + -o recordsize=128k -o logbias=latency \ + ${failmode_sync_helper_dsopts[$helper]} \ + $TESTPOOL/$TESTFS + + # create the target file. the ZIL head structure is created on first + # use, and does a full txg wait to finish, which we want to avoid + log_must dd if=/dev/zero of=$datafile bs=128k count=1 conv=fsync + log_must zpool sync + + # inject errors. writes will fail, as will the followup probes + zinject -d $DISK1 -e io -T write $TESTPOOL + zinject -d $DISK1 -e nxio -T probe $TESTPOOL + zinject -d $DISK2 -e io -T write $TESTPOOL + zinject -d $DISK2 -e nxio -T probe $TESTPOOL + + # run the helper program in the background. the pool should immediately + # suspend, and the sync op block or fail based on the failmode + typeset helper_cmd=${failmode_sync_helper_cmd[$helper]/DATAFILE/$datafile} + log_note "running failmode sync helper: $helper_cmd" + $helper_cmd & + typeset -i pid=$! 
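+	# For example, with helper=fsync the substituted command launched above
+	# is: dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/datafile bs=128k count=1
+	# conv=fsync. That is, the DATAFILE placeholder in
+	# failmode_sync_helper_cmd is replaced with the real path before the
+	# helper is started in the background.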
+ + # should only take a moment, but give it a chance + log_note "waiting for pool to suspend" + typeset -i tries=10 + until [[ $(kstat_pool $TESTPOOL state) == "SUSPENDED" ]] ; do + if ((tries-- == 0)); then + log_fail "pool didn't suspend" + fi + sleep 1 + done + + # zil_commit() should have noticed the suspend by now + typeset -i zilerr=$(kstat zil.zil_commit_error_count) + + # see if the helper program blocked + typeset -i blocked + if kill -0 $pid ; then + blocked=1 + log_note "$helper: blocked in the kernel" + else + blocked=0 + log_note "$helper: exited while pool suspended" + fi + + # bring the pool back online + zinject -c all + zpool clear $TESTPOOL + + # program definitely exited now, get its return code + wait $pid + typeset -i rc=$? + + failmode_sync_cleanup + + log_note "$helper: zilerr=$zilerr blocked=$blocked rc=$rc" + + # confirm expected results for the failmode + if [[ $failmode = "wait" ]] ; then + # - the ZIL saw an error, and fell back to a txg sync + # - sync op blocked when the pool suspended + # - after resume, sync op succeeded, helper returned success + log_must test $zilerr -ne 0 + log_must test $blocked -eq 1 + log_must test $rc -eq 0 + elif [[ $failmode = "continue" ]] ; then + # confirm expected results: + # - the ZIL saw an error, and fell back to a txg sync + # - helper exited when the pool suspended + # - sync op returned an error, so helper returned failure + log_must test $zilerr -ne 0 + log_must test $blocked -eq 0 + log_must test $rc -ne 0 + else + log_fail "impossible failmode: $failmode" + fi +} diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_fsync_continue.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_fsync_continue.ksh new file mode 100755 index 000000000000..7b145d3a2b4c --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_fsync_continue.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/failmode/failmode.kshlib + +typeset desc="fsync() returns when pool suspends with failmode=continue" + +log_assert $desc +log_onexit failmode_sync_cleanup +log_must failmode_sync_test continue fsync +log_pass $desc diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_fsync_wait.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_fsync_wait.ksh new file mode 100755 index 000000000000..677d226b5481 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_fsync_wait.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/failmode/failmode.kshlib + +typeset desc="fsync() blocks when pool suspends with failmode=wait" + +log_assert $desc +log_onexit failmode_sync_cleanup +log_must failmode_sync_test wait fsync +log_pass $desc diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_msync_continue.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_msync_continue.ksh new file mode 100755 index 000000000000..0c79ee15a1ba --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_msync_continue.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/failmode/failmode.kshlib + +typeset desc="msync() returns when pool suspends with failmode=continue" + +log_assert $desc +log_onexit failmode_sync_cleanup +log_must failmode_sync_test continue msync +log_pass $desc diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_msync_wait.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_msync_wait.ksh new file mode 100755 index 000000000000..a59d8cc50d61 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_msync_wait.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/failmode/failmode.kshlib + +typeset desc="msync() blocks when pool suspends with failmode=wait" + +log_assert $desc +log_onexit failmode_sync_cleanup +log_must failmode_sync_test wait msync +log_pass $desc diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_osync_continue.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_osync_continue.ksh new file mode 100755 index 000000000000..c4fa0c8f042c --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_osync_continue.ksh @@ -0,0 +1,36 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/failmode/failmode.kshlib + +typeset desc="O_SYNC returns when pool suspends with failmode=continue" + +log_assert $desc +log_onexit failmode_sync_cleanup +log_must failmode_sync_test continue osync +log_pass $desc diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_osync_wait.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_osync_wait.ksh new file mode 100755 index 000000000000..5f65cf92ad33 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_osync_wait.ksh @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/failmode/failmode.kshlib + +typeset desc="O_SYNC blocks when pool suspends with failmode=wait" + +log_assert $desc +log_onexit failmode_sync_cleanup +log_must failmode_sync_test wait osync +log_pass $desc + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_syncalways_continue.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_syncalways_continue.ksh new file mode 100755 index 000000000000..b80d776224a0 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_syncalways_continue.ksh @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/failmode/failmode.kshlib + +typeset desc="write()+sync=always returns when pool suspends with failmode=continue" + +log_assert $desc +log_onexit failmode_sync_cleanup +log_must failmode_sync_test continue syncalways +log_pass $desc + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_syncalways_wait.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_syncalways_wait.ksh new file mode 100755 index 000000000000..4fcb167b5c77 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/failmode/failmode_syncalways_wait.ksh @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/failmode/failmode.kshlib + +typeset desc="write()+sync=always blocks when pool suspends with failmode=wait" + +log_assert $desc +log_onexit failmode_sync_cleanup +log_must failmode_sync_test wait syncalways +log_pass $desc + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index ef49a5d50f6c..45848bec1f5a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -86,7 +86,7 @@ log_must zpool set autoreplace=on $TESTPOOL # Add some data to the pool log_must zfs create $TESTPOOL/fs -log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 R log_must zpool export $TESTPOOL # Record the partition UUID for later comparison diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh index a77957f32255..878b4e450340 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_replace_002_pos.ksh @@ -119,7 +119,7 @@ log_must zpool set autoreplace=on $TESTPOOL # Add some data to the pool log_must zfs create $TESTPOOL/fs -log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z +log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 R log_must zpool export $TESTPOOL # Record the partition UUID for later comparison diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/fault_limits.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/fault_limits.ksh index 1b3310edb98b..45b041503e22 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/fault_limits.ksh +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/fault_limits.ksh @@ -67,7 +67,7 @@ log_must zpool create -f ${TESTPOOL} raidz${PARITY} ${disks[1..$((VDEV_CNT - 1)) # Add some data to the pool log_must zfs create $TESTPOOL/fs MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)" -log_must fill_fs $MNTPOINT $PARITY 200 32768 1000 Z +log_must fill_fs $MNTPOINT $PARITY 200 32768 100 R sync_pool $TESTPOOL # Replace the last child vdev to form a replacing vdev diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh index 340994bb60c5..b5df1c7e37f8 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/suspend_on_probe_errors.ksh @@ -101,7 +101,7 @@ sync_pool $TESTPOOL log_must zfs create $TESTPOOL/fs MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)" SECONDS=0 -log_must fill_fs $MNTPOINT 1 200 4096 10 Z +log_must fill_fs $MNTPOINT 1 200 4096 10 R log_note "fill_fs took $SECONDS seconds" sync_pool $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib index 553533377aa4..1fd9b321cd9e 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib @@ -72,6 +72,23 @@ function get_num_dvas sed 's/.*L[0-9] \(.*\) [a-f0-9]*L.*/\1/' | awk '{print NF}' } +function check_gang_bp +{ + typeset gang="$(echo -n $1 | grep -c 'gang')" + [[ "$gang" == "1" ]] || return 1 + return 0 +} + +function check_is_gang_bp +{ + check_gang_bp $1 || log_fail "Not a gang DVA: \"$1\"" +} + +function check_not_gang_bp +{ + check_gang_bp $1 && log_fail "Gang DVA: \"$1\"" +} + function check_gang_dva { typeset last_byte="$(echo -n $1 | tail -c 1)" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh new file mode 100755 index 000000000000..e9cb1d2a034a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_neg.ksh @@ -0,0 +1,53 @@ +#!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that we don't use larger gang headers on ashift=9 pools +# +# Strategy: +# 1. Create a pool with dynamic gang headers. +# 2. Set metaslab_force_ganging to force multi-level ganging. +# 3. Verify that a large file has multi-level ganging +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that we don't use large gang headers on small-ashift pools". 
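+# Background for this check (sizes are approximate): a gang header is stored
+# in a single sector, and a classic 512-byte sector only has room for three
+# block pointers after the checksum tail. On an ashift=9 pool there is
+# nothing larger to grow into, so the dynamic_gang_header feature should stay
+# "enabled" but never become "active", and forced ganging of a 1M record has
+# to fall back to a multi-level gang tree instead.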
+
+log_onexit cleanup
+preamble
+
+log_must zpool create -f -o ashift=9 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS
+log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+set_tunable64 METASLAB_FORCE_GANGING 200000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+path="${mountpoint}/file"
+log_must dd if=/dev/urandom of=$path bs=1M count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 200)
+gangs=$(echo "$leaves" | grep -c gang)
+[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed"
+
+log_must verify_pool $TESTPOOL
+status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+[[ "$status" == "enabled" ]] || log_fail "Dynamic gang headers active on an ashift-9 pool"
+log_pass "We don't use large gang headers on small-ashift pools."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
new file mode 100755
index 000000000000..2941325fdc33
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_header_pos.ksh
@@ -0,0 +1,81 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that we use larger gang headers on ashift=12 pools
+#
+# Strategy:
+# 1. Create a pool with dynamic gang headers.
+# 2. Set metaslab_force_ganging to force ganging.
+# 3. Verify that a large file's gang block has more than 3 children.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that we use larger gang headers on large-ashift pools."
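+# The loop below makes two passes per vdev layout. The test expects the first
+# file (written while dynamic_gang_header is merely "enabled") to use a
+# classic 3-entry gang header, which forces a nested gang child for a 1M
+# record and activates the feature; the second file should then fit under a
+# single wider header with more than 3 children, the first of which is a
+# plain (non-gang) block pointer.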
+
+log_onexit cleanup
+preamble
+
+for vdevtype in "" "mirror" "raidz" "raidz2" "draid"; do
+	log_must zpool create -f -o ashift=12 $TESTPOOL $vdevtype $DISKS
+	log_must zfs create -o recordsize=1M $TESTPOOL/$TESTFS
+	mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+	set_tunable64 METASLAB_FORCE_GANGING 200000
+	set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+	status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+	[[ "$status" == "enabled" ]] || \
+	    log_fail "Dynamic gang headers not enabled"
+	path="${mountpoint}/file"
+	log_must dd if=/dev/urandom of=$path bs=1M count=1
+	log_must zpool sync $TESTPOOL
+	first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+	leaves=$(read_gang_header $TESTPOOL $first_block 1000 | \
+	    grep -v HOLE | grep -v "^Found")
+	first_child=$(echo "$leaves" | head -n 1)
+	check_is_gang_bp $first_child
+
+	num_leaves=$(echo "$leaves" | wc -l)
+	[[ "$num_leaves" -gt 3 ]] && \
+	    log_fail "used a larger gang header too soon: \"$leaves\""
+	log_must verify_pool $TESTPOOL
+	status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+	[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+
+	path="${mountpoint}/file2"
+	log_must dd if=/dev/urandom of=$path bs=1M count=1
+	log_must zpool sync $TESTPOOL
+	first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file2)
+	leaves=$(read_gang_header $TESTPOOL $first_block 1000 | \
+	    grep -v HOLE | grep -v "^Found")
+	first_child=$(echo "$leaves" | head -n 1)
+	check_not_gang_bp $first_child
+
+	num_leaves=$(echo "$leaves" | wc -l)
+	[[ "$num_leaves" -gt 3 ]] || \
+	    log_fail "didn't use a larger gang header: \"$leaves\""
+
+	log_must verify_pool $TESTPOOL
+	status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL)
+	[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active"
+	log_must zpool destroy $TESTPOOL
+done
+log_pass "We use larger gang headers on large-ashift pools."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh
new file mode 100755
index 000000000000..2ffe24968f10
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_dyn_multi.ksh
@@ -0,0 +1,54 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that multi-level ganging still works with dynamic headers
+#
+# Strategy:
+# 1. Create a pool with dynamic gang headers and ashift=12.
+# 2. Set metaslab_force_ganging to force multi-level ganging.
+# 3. Verify that a large file has multi-level ganging.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Verify that we can still multi-level gang with large headers."
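+# Rough arithmetic for why a second gang level is still required (sizes are
+# approximate): a 4K gang header holds on the order of 30 block pointers
+# (128-byte BPs minus the checksum tail), while forcing allocations down to
+# roughly 50K fragments a 16M record into a few hundred pieces, far more than
+# a single header can reference directly.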
+ +log_onexit cleanup +preamble + +log_must zpool create -f -o ashift=12 -o feature@dynamic_gang_header=enabled $TESTPOOL $DISKS +log_must zfs create -o recordsize=16M $TESTPOOL/$TESTFS +mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS) +set_tunable64 METASLAB_FORCE_GANGING 50000 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 + +path="${mountpoint}/file" +log_must dd if=/dev/urandom of=$path bs=16M count=1 +log_must zpool sync $TESTPOOL +first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file) +leaves=$(read_gang_header $TESTPOOL $first_block 200) +gangs=$(echo "$leaves" | grep -c gang) +[[ "$gangs" -gt 0 ]] || log_fail "We didn't use a deep gang tree when needed" + +log_must verify_pool $TESTPOOL +status=$(get_pool_prop feature@dynamic_gang_header $TESTPOOL) +[[ "$status" == "active" ]] || log_fail "Dynamic gang headers not active" + +log_pass "We can still multi-level gang with large headers." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh new file mode 100755 index 000000000000..63ebf95de7f0 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# This verifies that async writeback of dirty mmap()'d pages completes quickly. +# ftruncate() is an operation that will trigger async writeback, but is not +# itself a syncing operation, making it a useful proxy for any way the kernel +# might trigger async writeback. +# +# The guts of this test is in the mmap_ftruncate program. This driver sets a +# larger zfs_txg_timeout. Test failure occurs ftruncate() blocks waiting for +# the writeback until the txg timeout is reached and the changes are forcibly +# written out. Success means the DMU has accepted the changes and cleared the +# page dirty flags. 
+# + +TIMEOUT=180 +TESTFILE=/$TESTPOOL/truncfile +TESTSIZE=$((2*1024*1024*1024)) # 2G + +verify_runnable "global" + +typeset claim="async writeback of dirty mmap()'d pages completes quickly" + +log_assert $claim + +log_must save_tunable TXG_TIMEOUT + +function cleanup +{ + log_must restore_tunable TXG_TIMEOUT + rm -f $TESTFILE +} +log_onexit cleanup + +log_must set_tunable32 TXG_TIMEOUT $TIMEOUT +log_must zpool sync -f + +# run mmap_ftruncate and record the run time +typeset -i start=$(date +%s) +log_must mmap_ftruncate $TESTFILE $TESTSIZE +typeset -i end=$(date +%s) +typeset -i delta=$((end - start)) + +# in practice, mmap_ftruncate needs a few seconds to dirty all the pages, and +# when this test passes, the ftruncate() call itself should be near-instant. +# when it fails, then its only the txg sync that allows ftruncate() to +# complete, in that case, the run time will be extremely close to the timeout, +# so to avoid any confusion at the edges, we require that it complets within +# half the transaction time. for any timeout higher than ~30s that should be a +# very bright line down the middle. +log_must test $delta -lt $((TIMEOUT / 2)) + +log_pass $claim diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mount/mount_loopback.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mount/mount_loopback.ksh new file mode 100755 index 000000000000..86adef7ea032 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mount/mount_loopback.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that we can make an xfs filesystem on a ZFS-backed loopback device. +# +# See: +# https://github.com/openzfs/zfs/pull/17298 +# https://github.com/openzfs/zfs/issues/17277 +# +# STRATEGY: +# 1. Make a pool +# 2. Make a file on the pool or create zvol +# 3. Mount the file/zvol behind a loopback device +# 4. Create & mount an xfs filesystem on the loopback device + +function cleanup +{ + if [ -d $TEST_BASE_DIR/mnt ] ; then + umount $TEST_BASE_DIR/mnt + log_must rmdir $TEST_BASE_DIR/mnt + fi + if [ -n "$DEV" ] ; then + log_must losetup -d $DEV + fi + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/file1 +} + +if [ ! -x "$(which mkfs.xfs)" ] ; then + log_unsupported "No mkfs.xfs binary" +fi + +if [ ! -d /lib/modules/$(uname -r)/kernel/fs/xfs ] && \ + ! 
grep -qE '\sxfs$' /proc/filesystems ; then + log_unsupported "No XFS kernel support" +fi + +log_assert "Make an xfs filesystem on a ZFS-backed loopback device" +log_onexit cleanup + +# fio options +export NUMJOBS=2 +export RUNTIME=3 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export FILE_SIZE=$(( 1024 * 1024 )) + +function do_test +{ + imgfile=$1 + log_note "Running test on $imgfile" + log_must losetup -f $imgfile + DEV=$(losetup --associated $imgfile | grep -Eo '^/dev/loop[0-9]+') + log_must mkfs.xfs $DEV + mkdir $TEST_BASE_DIR/mnt + log_must mount $DEV $TEST_BASE_DIR/mnt + export DIRECTORY=$TEST_BASE_DIR/mnt + + for d in 0 1 ; do + # fio options + export DIRECT=$d + log_must fio $FIO_SCRIPTS/mkfiles.fio + log_must fio $FIO_SCRIPTS/random_reads.fio + done + log_must umount $TEST_BASE_DIR/mnt + log_must rmdir $TEST_BASE_DIR/mnt + log_must losetup -d $DEV + DEV="" +} + +log_must truncate -s 1G $TEST_BASE_DIR/file1 +log_must zpool create $TESTPOOL2 $TEST_BASE_DIR/file1 +log_must truncate -s 512M /$TESTPOOL2/img +do_test /$TESTPOOL2/img +log_must rm /$TESTPOOL2/img +log_must zfs create -V 512M $TESTPOOL2/vol + +blkdev="$ZVOL_DEVDIR/$TESTPOOL2/vol" +block_device_wait $blkdev +do_test $blkdev + +log_pass "Verified xfs filesystem on a ZFS-backed loopback device" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh index cd4573b2e4d1..b364a5cb4bdc 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh @@ -63,6 +63,7 @@ log_must eval "zdb $TESTPOOL | grep -q \"Checkpointed uberblock found\"" log_mustnot eval "zdb -k $TESTPOOL | grep -q \"Checkpointed uberblock found\"" log_mustnot eval "zdb $TESTPOOL | grep \"Dataset $FS1\"" log_must eval "zdb -k $TESTPOOL | grep \"Dataset $CHECKPOINTED_FS1\"" +log_must eval "zdb -k $TESTPOOL/ | grep \"$TESTPOOL$BOGUS_SUFFIX\"" log_must zpool export $TESTPOOL @@ -70,6 +71,7 @@ log_must eval "zdb -e $TESTPOOL | grep \"Checkpointed uberblock found\"" log_mustnot eval "zdb -k -e $TESTPOOL | grep \"Checkpointed uberblock found\"" log_mustnot eval "zdb -e $TESTPOOL | grep \"Dataset $FS1\"" log_must eval "zdb -k -e $TESTPOOL | grep \"Dataset $CHECKPOINTED_FS1\"" +log_must eval "zdb -k -e $TESTPOOL/ | grep \"$TESTPOOL$BOGUS_SUFFIX\"" log_must zpool import $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh index 3249bd93d5ce..a1da4a8631e1 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh @@ -17,6 +17,7 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/refreserv/refreserv.cfg +. $STF_SUITE/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib # # DESCRIPTION: @@ -24,7 +25,7 @@ # # STRATEGY: # 1. Create a pool with a single raidz vdev -# 2. For each block size [512b, 1k, 128k] or [4k, 8k, 128k] +# 2. For each block size [4k, 8k, 128k] # - create a volume # - fully overwrite it # - verify that referenced is less than or equal to reservation @@ -38,6 +39,7 @@ # 1. 
This test will use up to 14 disks but can cover the key concepts with # 5 disks. # 2. If the disks are a mixture of 4Kn and 512n/512e, failures are likely. +# Therefore, when creating the pool we specify 4Kn sectors. # verify_runnable "global" @@ -60,29 +62,10 @@ log_onexit cleanup poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" -# Testing tiny block sizes on ashift=12 pools causes so much size inflation -# that small test disks may fill before creating small volumes. However, -# testing 512b and 1K blocks on ashift=9 pools is an ok approximation for -# testing the problems that arise from 4K and 8K blocks on ashift=12 pools. -if is_freebsd; then - bps=$(diskinfo -v ${alldisks[0]} | awk '/sectorsize/ { print $1 }') -elif is_linux; then - bps=$(lsblk -nrdo min-io /dev/${alldisks[0]}) -fi -log_must test "$bps" -eq 512 -o "$bps" -eq 4096 -case "$bps" in -512) - allshifts=(9 10 17) - maxpct=151 - ;; -4096) - allshifts=(12 13 17) - maxpct=110 - ;; -*) - log_fail "bytes/sector: $bps != (512|4096)" - ;; -esac +ashift=12 +allshifts=(12 13 17) +maxpct=110 + log_note "Testing in ashift=${allshifts[0]} mode" # This loop handles all iterations of steps 1 through 4 described in strategy @@ -99,18 +82,21 @@ for parity in 1 2 3; do continue fi - log_must zpool create -O compression=off "$TESTPOOL" "$raid" "${disks[@]}" + log_must zpool create -o ashift=$ashift "$TESTPOOL" "$raid" "${disks[@]}" for bits in "${allshifts[@]}"; do vbs=$((1 << bits)) log_note "Testing $raid-$ndisks volblocksize=$vbs" - vol=$TESTPOOL/$TESTVOL + vol=$TESTPOOL/$TESTVOL-$vbs + zdev=$ZVOL_DEVDIR/$vol log_must zfs create -V ${volsize}m \ -o volblocksize=$vbs "$vol" - block_device_wait "/dev/zvol/$vol" - log_must dd if=/dev/zero of=/dev/zvol/$vol \ - bs=1024k count=$volsize + block_device_wait $zdev + blockdev_exists $zdev + + log_must timeout 120 dd if=/dev/urandom of=$zdev \ + bs=1024k count=$volsize status=progress sync_pool $TESTPOOL ref=$(zfs get -Hpo value referenced "$vol") @@ -126,7 +112,7 @@ for parity in 1 2 3; do log_must test "$deltapct" -le $maxpct log_must_busy zfs destroy "$vol" - block_device_wait + blockdev_missing $zdev done log_must_busy zpool destroy "$TESTPOOL" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh new file mode 100755 index 000000000000..6820aba184b7 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_resilver_sit_out.ksh @@ -0,0 +1,189 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. All rights reserved. +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Attaching disks while a disk is sitting out reads should pass +# +# STRATEGY: +# 1. Create raidz pools +# 2. Make one disk slower and trigger a read sit out for that disk +# 3. Start some random I/O +# 4. Attach a disk to the pool. +# 5. Verify the integrity of the file system and the resilvering. + +verify_runnable "global" + +save_tunable READ_SIT_OUT_SECS +set_tunable32 READ_SIT_OUT_SECS 120 +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids; do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk during I/O with a sit out completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function attach_test +{ + typeset vdev=$1 + typeset disk=$2 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! 
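+		# file_trunc is the test suite's background I/O generator; each
+		# instance keeps writing to and truncating its file so the
+		# attach and resilver below have to compete with live I/O
+		# while the slow disk is sitting out.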
+ + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + # attach disk with a slow drive still present + SECONDS=0 + log_must zpool attach -w $TESTPOOL1 $vdev $disk + log_note took $SECONDS seconds to attach disk + + for wait_pid in $child_pids + do + kill $wait_pid + done + child_pids="" + + log_must zinject -c all + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +DEVSIZE="150M" +specials_list="" +i=0 +while [[ $i != 10 ]]; do + truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +slow_disk=$TESTDIR/$TESTFILE1.3 +log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE + +# Test file size in MB +count=200 + +for type in "raidz1" "raidz2" "raidz3" ; do + create_pool $TESTPOOL1 $type $specials_list + log_must zpool set autosit=on $TESTPOOL1 "${type}-0" + log_must zfs create -o primarycache=none -o recordsize=512K \ + $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count + + # Make one disk 100ms slower to trigger a sit out + log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1 + + # Do some reads and wait for sit out on slow disk + SECONDS=0 + typeset -i size=0 + for i in $(seq 1 $count) ; do + dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null + size=$i + + sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + + log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on" + log_note took $SECONDS seconds to reach sit out reading ${size}M + log_must zpool status -s $TESTPOOL1 + + typeset top=$(zpool status -j | jq -r ".pools.$TESTPOOL1.vdevs[].vdevs[].name") + attach_test $top $TESTDIR/$REPLACEFILE + + log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\"" + + destroy_pool $TESTPOOL1 + log_must rm -rf /$TESTPOOL1 +done + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh new file mode 100755 index 000000000000..4109dbaf45ac --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_resilver_sit_out.ksh @@ -0,0 +1,199 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013, 2016 by Delphix. 
All rights reserved. +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/replacement/replacement.cfg + +# +# DESCRIPTION: +# Replacing disks while a disk is sitting out reads should pass +# +# STRATEGY: +# 1. Create raidz and draid pools +# 2. Make one disk slower and trigger a read sit out for that disk +# 3. Start some random I/O +# 4. Replace a disk in the pool with another disk. +# 5. Verify the integrity of the file system and the resilvering. +# + +verify_runnable "global" + +save_tunable READ_SIT_OUT_SECS +set_tunable32 READ_SIT_OUT_SECS 120 +save_tunable SIT_OUT_CHECK_INTERVAL +set_tunable64 SIT_OUT_CHECK_INTERVAL 20 + +function cleanup +{ + restore_tunable READ_SIT_OUT_SECS + restore_tunable SIT_OUT_CHECK_INTERVAL + log_must zinject -c all + log_must zpool events -c + + if [[ -n "$child_pids" ]]; then + for wait_pid in $child_pids + do + kill $wait_pid + done + fi + + if poolexists $TESTPOOL1; then + destroy_pool $TESTPOOL1 + fi + + [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/* +} + +log_assert "Replacing a disk during I/O with a sit out completes." + +options="" +options_display="default options" + +log_onexit cleanup + +[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE " + +[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE " + +[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT " + +[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED " + +[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET " + +options="$options -r " + +[[ -n "$options" ]] && options_display=$options + +child_pids="" + +function replace_test +{ + typeset -i iters=2 + typeset disk1=$1 + typeset disk2=$2 + typeset repl_type=$3 + + typeset i=0 + while [[ $i -lt $iters ]]; do + log_note "Invoking file_trunc with: $options_display on $TESTFILE.$i" + file_trunc $options $TESTDIR/$TESTFILE.$i & + typeset pid=$! 
+ + sleep 1 + + child_pids="$child_pids $pid" + ((i = i + 1)) + done + + typeset repl_flag="-w" + if [[ "$repl_type" == "seq" ]]; then + repl_flag="-ws" + fi + # replace disk with a slow drive still present + SECONDS=0 + log_must zpool replace $repl_flag $TESTPOOL1 $disk1 $disk2 + log_note took $SECONDS seconds to replace disk + + for wait_pid in $child_pids + do + kill $wait_pid + done + child_pids="" + + log_must zinject -c all + log_must zpool export $TESTPOOL1 + log_must zpool import -d $TESTDIR $TESTPOOL1 + log_must zfs umount $TESTPOOL1/$TESTFS1 + log_must zdb -cdui $TESTPOOL1/$TESTFS1 + log_must zfs mount $TESTPOOL1/$TESTFS1 + verify_pool $TESTPOOL1 +} + +DEVSIZE="150M" +specials_list="" +i=0 +while [[ $i != 10 ]]; do + log_must truncate -s $DEVSIZE $TESTDIR/$TESTFILE1.$i + specials_list="$specials_list $TESTDIR/$TESTFILE1.$i" + + ((i = i + 1)) +done + +slow_disk=$TESTDIR/$TESTFILE1.3 +log_must truncate -s $DEVSIZE $TESTDIR/$REPLACEFILE + +# Test file size in MB +count=400 + +for type in "raidz2" "raidz3" "draid2"; do + create_pool $TESTPOOL1 $type $specials_list + log_must zpool set autosit=on $TESTPOOL1 "${type}-0" + log_must zfs create -o primarycache=none -o recordsize=512K \ + $TESTPOOL1/$TESTFS1 + log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 + + log_must dd if=/dev/urandom of=/$TESTDIR1/bigfile bs=1M count=$count + + # Make one disk 100ms slower to trigger a sit out + log_must zinject -d $slow_disk -D100:1 -T read $TESTPOOL1 + + # Do some reads and wait for sit out on slow disk + SECONDS=0 + typeset -i size=0 + for i in $(seq 1 $count) ; do + dd if=/$TESTDIR1/bigfile skip=$i bs=1M count=1 of=/dev/null + size=$i + + sit_out=$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + log_must test "$(get_vdev_prop sit_out $TESTPOOL1 $slow_disk)" == "on" + log_note took $SECONDS seconds to reach sit out reading ${size}M + log_must zpool status -s $TESTPOOL1 + + typeset repl_type="replace" + if [[ "$type" == "draid2" && $((RANDOM % 2)) -eq 0 ]]; then + repl_type="seq" + fi + replace_test $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE $repl_type + + log_must eval "zpool iostat -v $TESTPOOL1 | grep \"$REPLACEFILE\"" + + destroy_pool $TESTPOOL1 + log_must rm -rf /$TESTPOOL1 +done + +log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress_destroy.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress_destroy.ksh new file mode 100755 index 000000000000..669b59fac01f --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress_destroy.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +typeset -i nzvols=1000 +typeset -i parallel=$(( $(get_num_cpus) * 2 )) + +function cleanup { + for zvol in $(zfs list -Ho name -t vol) ; do + log_must_busy zfs destroy $zvol + done +} + +log_onexit cleanup + +log_assert "stress test concurrent zvol create/destroy" + +function destroy_zvols_until { + typeset cond=$1 + while true ; do + IFS='' zfs list -Ho name -t vol | read -r -d '' zvols + if [[ -n $zvols ]] ; then + echo $zvols | xargs -n 1 -P $parallel zfs destroy + fi + if ! $cond ; then + break + fi + done +} + +( seq $nzvols | \ + xargs -P $parallel -I % zfs create -s -V 1G $TESTPOOL/testvol% ) & +cpid=$! +sleep 1 + +destroy_zvols_until "kill -0 $cpid" +destroy_zvols_until "false" + +log_pass "stress test done" diff --git a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c index ab8981e25cb2..0150ce72f0a4 100644 --- a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c +++ b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c @@ -464,7 +464,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type, (type == PAGELIST_READ ? VM_PROT_WRITE : 0 ) | VM_PROT_READ, pages, num_pages); if (actual_pages != num_pages) { - vm_page_unhold_pages(pages, actual_pages); + if (actual_pages > 0) + vm_page_unhold_pages(pages, actual_pages); free(pagelist, M_VCPAGELIST); return (-ENOMEM); } |