aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKa Ho Ng <khng@FreeBSD.org>2021-08-05 15:20:42 +0000
committerKa Ho Ng <khng@FreeBSD.org>2021-08-05 15:20:42 +0000
commit0dc332bff200c940edc36c4715b629a2e1e9f9ae (patch)
tree0c2995d8ed48879914d30fbc7228f0e718316fad
parentabbb57d5a647f91847a860bd25b4f109c3abb390 (diff)
downloadsrc-0dc332bff200c940edc36c4715b629a2e1e9f9ae.tar.gz
src-0dc332bff200c940edc36c4715b629a2e1e9f9ae.zip
Add fspacectl(2), vn_deallocate(9) and VOP_DEALLOCATE(9).
fspacectl(2) is a system call to provide space management support to userspace applications. VOP_DEALLOCATE(9) is a VOP call to perform the deallocation. vn_deallocate(9) is a public KPI for kmods' use. The purpose of proposing a new system call, a KPI and a VOP call is to allow bhyve or other hypervisor monitors to emulate the behavior of SCSI UNMAP/NVMe DEALLOCATE on a plain file. fspacectl(2) takes cmd and flags parameters to specify the space management operation to be performed. Currently cmd has to be SPACECTL_DEALLOC, and flags has to be 0. fo_fspacectl is added to fileops. VOP_DEALLOCATE(9) is added as a new VOP call. A trivial implementation of VOP_DEALLOCATE(9) is provided. Sponsored by: The FreeBSD Foundation Reviewed by: kib Differential Revision: https://reviews.freebsd.org/D28347
-rw-r--r--lib/libc/sys/Makefile.inc1
-rw-r--r--lib/libc/sys/Symbol.map1
-rw-r--r--lib/libc/sys/fspacectl.2189
-rw-r--r--lib/libc/sys/pathconf.23
-rw-r--r--share/man/man9/Makefile2
-rw-r--r--share/man/man9/VOP_DEALLOCATE.9101
-rw-r--r--share/man/man9/vn_deallocate.9103
-rw-r--r--sys/bsm/audit_kevents.h1
-rw-r--r--sys/compat/freebsd32/freebsd32.h4
-rw-r--r--sys/compat/freebsd32/freebsd32_misc.c34
-rw-r--r--sys/compat/freebsd32/syscalls.master5
-rw-r--r--sys/kern/capabilities.conf5
-rw-r--r--sys/kern/sys_generic.c70
-rw-r--r--sys/kern/syscalls.master9
-rw-r--r--sys/kern/vfs_default.c122
-rw-r--r--sys/kern/vfs_vnops.c110
-rw-r--r--sys/kern/vnode_if.src11
-rw-r--r--sys/security/audit/audit_bsm.c12
-rw-r--r--sys/sys/fcntl.h20
-rw-r--r--sys/sys/file.h15
-rw-r--r--sys/sys/syscallsubr.h3
-rw-r--r--sys/sys/unistd.h1
-rw-r--r--sys/sys/vnode.h2
-rw-r--r--tests/sys/file/Makefile1
-rw-r--r--tests/sys/file/fspacectl_test.c338
25 files changed, 1163 insertions, 0 deletions
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index a1eb9567a380..29e914872a8d 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -190,6 +190,7 @@ MAN+= abort2.2 \
fhreadlink.2 \
flock.2 \
fork.2 \
+ fspacectl.2 \
fsync.2 \
getdirentries.2 \
getdtablesize.2 \
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 80bb2c236191..93fbc947a7e1 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -419,6 +419,7 @@ FBSD_1.6 {
FBSD_1.7 {
_Fork;
+ fspacectl;
};
FBSDprivate_1.0 {
diff --git a/lib/libc/sys/fspacectl.2 b/lib/libc/sys/fspacectl.2
new file mode 100644
index 000000000000..2f581d1c1fb8
--- /dev/null
+++ b/lib/libc/sys/fspacectl.2
@@ -0,0 +1,189 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd August 4, 2021
+.Dt FSPACECTL 2
+.Os
+.Sh NAME
+.Nm fspacectl
+.Nd space management in a file
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In fcntl.h
+.Ft int
+.Fo fspacectl
+.Fa "int fd"
+.Fa "int cmd"
+.Fa "const struct spacectl_range *rqsr"
+.Fa "int flags"
+.Fa "struct spacectl_range *rmsr"
+.Fc
+.Sh DESCRIPTION
+.Nm
+is a system call performing space management over a file.
+The
+.Fa fd
+argument specifies the file descriptor to be operated on by the
+.Fa cmd
+argument.
+The
+.Fa rqsr
+argument points to a
+.Fa spacectl_range
+structure that contains the requested operation range.
+The
+.Fa flags
+argument controls the behavior of the operation to take place.
+If the
+.Fa rmsr
+argument is non-NULL, the
+.Fa spacectl_range
+structure it points to is updated to contain the unprocessed operation range
+after the system call returns.
+Both
+.Fa rqsr
+and
+.Fa rmsr
+arguments can point to the same structure.
+.Pp
+The
+.Fa spacectl_range
+structure is defined as:
+.Bd -literal
+struct spacectl_range {
+ off_t r_offset;
+ off_t r_len;
+};
+.Ed
+.Pp
+The operation specified by the
+.Fa cmd
+argument may be one of:
+.Bl -tag -width SPACECTL_DEALLOC
+.It Dv SPACECTL_DEALLOC
+Zero a region in the file specified by the
+.Fa rqsr
+argument.
+The
+.Va "rqsr->r_offset"
+has to be a value greater than or equal to 0, and the
+.Va "rqsr->r_len"
+has to be a value greater than 0.
+.Pp
+If the file system supports hole-punching,
+file system space deallocation may be performed in the given region.
+.El
+.Pp
+The
+.Fa flags
+argument needs to be the value 0 currently.
+.Sh RETURN VALUES
+Upon successful completion, the value 0 is returned;
+otherwise the value -1 is returned and
+.Va errno
+is set to indicate the error.
+.Sh ERRORS
+Possible failure conditions:
+.Bl -tag -width Er
+.It Bq Er EBADF
+The
+.Fa fd
+argument is not a valid file descriptor.
+.It Bq Er EBADF
+The
+.Fa fd
+argument references a file that was opened without write permission.
+.It Bq Er EINTR
+A signal was caught during execution.
+.It Bq Er EINVAL
+The
+.Fa cmd
+argument is not valid.
+.It Bq Er EINVAL
+If the
+.Fa cmd
+argument is
+.Dv SPACECTL_DEALLOC ,
+either the
+.Fa "rqsr->r_offset"
+argument was less than zero, or the
+.Fa "rqsr->r_len"
+argument was less than or equal to zero.
+.It Bq Er EINVAL
+An invalid or unsupported flag is included in
+.Fa flags .
+.It Bq Er EINVAL
+A flag included in
+.Fa flags
+is not supported by the operation specified by the
+.Fa cmd
+argument.
+.It Bq Er EFAULT
+The
+.Fa rqsr
+or a non-NULL
+.Fa rmsr
+argument points outside the process' allocated address space.
+.It Bq Er EIO
+An I/O error occurred while reading from or writing to a file system.
+.It Bq Er EINTEGRITY
+Corrupted data was detected while reading from the file system.
+.It Bq Er ENODEV
+The
+.Fa fd
+argument does not refer to a file that supports
+.Nm .
+.It Bq Er ENOSPC
+There is insufficient free space remaining on the file system storage
+media.
+.It Bq Er ENOTCAPABLE
+The file descriptor
+.Fa fd
+has insufficient rights.
+.It Bq Er ESPIPE
+The
+.Fa fd
+argument is associated with a pipe or FIFO.
+.El
+.Sh SEE ALSO
+.Xr creat 2 ,
+.Xr ftruncate 2 ,
+.Xr open 2 ,
+.Xr unlink 2
+.Sh HISTORY
+The
+.Nm
+system call appeared in
+.Fx 14.0 .
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng@FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/lib/libc/sys/pathconf.2 b/lib/libc/sys/pathconf.2
index 62ec532705ef..c5a7ba1be3c5 100644
--- a/lib/libc/sys/pathconf.2
+++ b/lib/libc/sys/pathconf.2
@@ -166,6 +166,9 @@ specified file, otherwise 0.
.It Li _PC_MIN_HOLE_SIZE
If a file system supports the reporting of holes (see
.Xr lseek 2 ) ,
+.It Li _PC_DEALLOC_PRESENT
+If a file system supports hole-punching (see
+.Xr fspacectl 2 ) ,
.Fn pathconf
and
.Fn fpathconf
diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
index d0012301d889..b2f1451a79d7 100644
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -404,6 +404,7 @@ MAN= accept_filter.9 \
vm_page_wire.9 \
vm_set_page_size.9 \
vmem.9 \
+ vn_deallocate.9 \
vn_fullpath.9 \
vn_isdisk.9 \
vnet.9 \
@@ -420,6 +421,7 @@ MAN= accept_filter.9 \
VOP_BWRITE.9 \
VOP_COPY_FILE_RANGE.9 \
VOP_CREATE.9 \
+ VOP_DEALLOCATE.9 \
VOP_FSYNC.9 \
VOP_GETACL.9 \
VOP_GETEXTATTR.9 \
diff --git a/share/man/man9/VOP_DEALLOCATE.9 b/share/man/man9/VOP_DEALLOCATE.9
new file mode 100644
index 000000000000..1c7f80cfbc6c
--- /dev/null
+++ b/share/man/man9/VOP_DEALLOCATE.9
@@ -0,0 +1,101 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd May 11, 2021
+.Dt VOP_DEALLOCATE 9
+.Os
+.Sh NAME
+.Nm VOP_DEALLOCATE
+.Nd zero and/or deallocate storage from a file
+.Sh SYNOPSIS
+.In sys/param.h
+.In sys/vnode.h
+.Ft int
+.Fo VOP_DEALLOCATE
+.Fa "struct vnode *vp"
+.Fa "off_t *offset"
+.Fa "off_t *len"
+.Fa "int flags"
+.Fa "struct ucred *cred"
+.Fc
+.Sh DESCRIPTION
+This VOP call zeroes/deallocates storage for an offset range in a file.
+It is used to implement the
+.Xr fspacectl 2
+system call.
+.Pp
+Its arguments are:
+.Bl -tag -width offset
+.It Fa vp
+The vnode of the file.
+.It Fa offset
+The start of the range to deallocate storage in the file.
+.It Fa len
+The length of the range to deallocate storage in the file.
+.It Fa flags
+The flags of this call.
+This should be set to 0 for now.
+.It Fa cred
+The credentials of the caller.
+.El
+.Pp
+.Fa *offset
+and
+.Fa *len
+are updated to reflect the portion of the range that
+still needs to be zeroed/deallocated on return.
+A partial result is considered a successful operation.
+.Sh LOCKS
+The vnode should be locked on entry and will still be locked on exit.
+.Sh RETURN VALUES
+Zero is returned if the call is successful, otherwise an appropriate
+error code is returned.
+.Sh ERRORS
+.Bl -tag -width Er
+.It Bq Er EINVAL
+Invalid
+.Fa offset , len
+or
+.Fa flags
+parameters are passed into this VOP call.
+.It Bq Er ENODEV
+The vnode type is not supported by this VOP call.
+.It Bq Er ENOSPC
+The file system is full.
+.It Bq Er EPERM
+An append-only flag is set on the file, but the caller is attempting to
+zero before the current end of file.
+.El
+.Sh SEE ALSO
+.Xr vnode 9
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng@FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/share/man/man9/vn_deallocate.9 b/share/man/man9/vn_deallocate.9
new file mode 100644
index 000000000000..415a8941ca68
--- /dev/null
+++ b/share/man/man9/vn_deallocate.9
@@ -0,0 +1,103 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+.\"
+.\" Copyright (c) 2021 The FreeBSD Foundation
+.\"
+.\" This manual page was written by Ka Ho Ng under sponsorship from
+.\" the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd July 30, 2021
+.Dt VN_DEALLOCATE 9
+.Os
+.Sh NAME
+.Nm vn_deallocate
+.Nd zero and/or deallocate storage from a file
+.Sh SYNOPSIS
+.In sys/param.h
+.In sys/vnode.h
+.Ft int
+.Fo vn_deallocate
+.Fa "struct vnode *vp"
+.Fa "off_t *offset"
+.Fa "off_t *length"
+.Fa "int flags"
+.Fa "int ioflg"
+.Fa "struct ucred *active_cred"
+.Fa "struct ucred *file_cred"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn vn_deallocate
+function zeros and/or deallocates backing storage space from a file.
+This function only works on vnodes with
+.Dv VREG
+type.
+.Pp
+The arguments are:
+.Bl -tag -width active_cred
+.It Fa vp
+The vnode of the file.
+.It Fa offset
+The starting offset of the operation range.
+.It Fa length
+The length of the operation range.
+This must be greater than 0.
+.It Fa flags
+The control flags of the operation.
+This should be set to 0 for now.
+.It Fa ioflg
+The control flags of vnode locking.
+.It Fa active_cred
+The user credentials of the calling thread.
+.It Fa file_cred
+The credentials installed on the file description pointing to the vnode or NOCRED.
+.El
+.Pp
+The
+.Fa ioflg
+argument may be one or more of the following flags:
+.Bl -tag -width IO_RANGELOCKED
+.It Dv IO_NODELOCKED
+The vnode was locked before the call.
+.It Dv IO_RANGELOCKED
+Rangelock was owned around the call.
+.It Dv IO_NOMACCHECK
+Skip MAC checking in the call.
+.El
+.Pp
+.Fa *offset
+and
+.Fa *length
+are updated to reflect the unprocessed operation range of the call.
+.Sh RETURN VALUES
+Upon successful completion, the value 0 is returned; otherwise the
+appropriate error is returned.
+.Sh SEE ALSO
+.Xr vnode 9 ,
+.Xr VOP_DEALLOCATE 9
+.Sh AUTHORS
+.Nm
+and this manual page were written by
+.An Ka Ho Ng Aq Mt khng@FreeBSD.org
+under sponsorship from the FreeBSD Foundation.
diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index eeb928ecafdc..0da82de1fbcb 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -662,6 +662,7 @@
#define AUE_SPECIALFD 43266 /* FreeBSD-specific. */
#define AUE_AIO_WRITEV 43267 /* FreeBSD-specific. */
#define AUE_AIO_READV 43268 /* FreeBSD-specific. */
+#define AUE_FSPACECTL 43269 /* FreeBSD-specific. */
/*
* Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/compat/freebsd32/freebsd32.h b/sys/compat/freebsd32/freebsd32.h
index 2e4f5155cbf4..8a14a42db813 100644
--- a/sys/compat/freebsd32/freebsd32.h
+++ b/sys/compat/freebsd32/freebsd32.h
@@ -435,5 +435,9 @@ struct ptrace_coredump32 {
uint32_t pc_limit1, pc_limit2;
};
+struct spacectl_range32 {
+ uint32_t r_offset1, r_offset2; /* r_offset as two 32-bit words */
+ uint32_t r_len1, r_len2; /* r_len as two 32-bit words */
+};
#endif /* !_COMPAT_FREEBSD32_FREEBSD32_H_ */
diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c
index 736fd1123d53..c417a64d286a 100644
--- a/sys/compat/freebsd32/freebsd32_misc.c
+++ b/sys/compat/freebsd32/freebsd32_misc.c
@@ -3857,3 +3857,37 @@ freebsd32_ntp_adjtime(struct thread *td, struct freebsd32_ntp_adjtime_args *uap)
}
return (error);
}
+
+int
+freebsd32_fspacectl(struct thread *td, struct freebsd32_fspacectl_args *uap)
+{ /* 32-bit compat entry point: convert ranges around kern_fspacectl(). */
+ struct spacectl_range rqsr, rmsr;
+ struct spacectl_range32 rqsr32, rmsr32;
+ int error, cerror;
+
+ error = copyin(uap->rqsr, &rqsr32, sizeof(rqsr32));
+ if (error != 0)
+ return (error);
+ rqsr.r_offset = PAIR32TO64(off_t, rqsr32.r_offset);
+ rqsr.r_len = PAIR32TO64(off_t, rqsr32.r_len);
+
+ error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
+ &rmsr);
+ if (uap->rmsr != NULL) { /* copied out even if the call failed */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ rmsr32.r_offset1 = rmsr.r_offset; /* low 32 bits in word 1 */
+ rmsr32.r_offset2 = rmsr.r_offset >> 32;
+ rmsr32.r_len1 = rmsr.r_len;
+ rmsr32.r_len2 = rmsr.r_len >> 32;
+#else
+ rmsr32.r_offset1 = rmsr.r_offset >> 32; /* high 32 bits in word 1 */
+ rmsr32.r_offset2 = rmsr.r_offset;
+ rmsr32.r_len1 = rmsr.r_len >> 32;
+ rmsr32.r_len2 = rmsr.r_len;
+#endif
+ cerror = copyout(&rmsr32, uap->rmsr, sizeof(rmsr32));
+ if (error == 0) /* an earlier error takes precedence */
+ error = cerror;
+ }
+ return (error);
+}
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index aac788bf3956..3e53de2dc966 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -1176,5 +1176,10 @@
struct aiocb32 *aiocbp); }
579 AUE_AIO_READV STD { int freebsd32_aio_readv( \
struct aiocb32 *aiocbp); }
+580 AUE_FSPACECTL STD { int freebsd32_fspacectl(int fd, \
+ int cmd, \
+ const struct spacectl_range32 *rqsr, \
+ int flags, \
+ struct spacectl_range32 *rmsr); }
; vim: syntax=off
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
index 602ec7088fc6..f53530eb7fa7 100644
--- a/sys/kern/capabilities.conf
+++ b/sys/kern/capabilities.conf
@@ -229,6 +229,11 @@ freebsd6_pread
freebsd6_pwrite
##
+## Allow I/O-related file operations, subject to capability rights.
+##
+fspacectl
+
+##
## Allow querying file and file system state with fstat(2) and fstatfs(2),
## subject to capability rights.
##
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index f86d494400e2..e6b2cba27a04 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -862,6 +862,76 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
}
int
+sys_fspacectl(struct thread *td, struct fspacectl_args *uap)
+{ /* fspacectl(2) entry point: copyin, kern_fspacectl(), copyout. */
+ struct spacectl_range rqsr, rmsr;
+ int error, cerror;
+
+ error = copyin(uap->rqsr, &rqsr, sizeof(rqsr));
+ if (error != 0)
+ return (error);
+
+ error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags,
+ &rmsr);
+ if (uap->rmsr != NULL) { /* copied out even if the call failed */
+ cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr));
+ if (error == 0) /* an earlier error takes precedence */
+ error = cerror;
+ }
+ return (error);
+}
+
+int
+kern_fspacectl(struct thread *td, int fd, int cmd,
+ const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp)
+{ /* Validate the request, then hand it to the file's fo_fspacectl. */
+ struct file *fp;
+ struct spacectl_range rmsr;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_CMD(cmd);
+ AUDIT_ARG_FFLAGS(flags);
+
+ if (rqsr == NULL)
+ return (EINVAL);
+ rmsr = *rqsr; /* early failures report the whole range as unprocessed */
+ if (rmsrp != NULL)
+ *rmsrp = rmsr;
+
+ if (cmd != SPACECTL_DEALLOC ||
+ rqsr->r_offset < 0 || rqsr->r_len <= 0 ||
+ rqsr->r_offset > OFF_MAX - rqsr->r_len || /* offset + len overflows */
+ (flags & ~SPACECTL_F_SUPPORTED) != 0)
+ return (EINVAL);
+
+ error = fget_write(td, fd, &cap_pwrite_rights, &fp);
+ if (error != 0)
+ return (error);
+ AUDIT_ARG_FILE(td->td_proc, fp);
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
+ error = ESPIPE;
+ goto out;
+ }
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags,
+ td->td_ucred, td);
+ /* If part of the range was processed, report success, not a restart. */
+ if (rmsr.r_len != rqsr->r_len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ if (rmsrp != NULL)
+ *rmsrp = rmsr;
+out:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
kern_specialfd(struct thread *td, int type, void *arg)
{
struct file *fp;
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index af787908451a..11247aed8fd6 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3250,6 +3250,15 @@
_Inout_ struct aiocb *aiocbp
);
}
+580 AUE_FSPACECTL STD {
+ int fspacectl(
+ int fd,
+ int cmd,
+ _In_ const struct spacectl_range *rqsr,
+ int flags,
+ _Out_opt_ struct spacectl_range *rmsr,
+ );
+ }
; Please copy any additions and changes to the following compatability tables:
; sys/compat/freebsd32/syscalls.master
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index 63bca7810847..c42d5a795935 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -93,6 +93,7 @@ static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap);
static int vop_stdstat(struct vop_stat_args *ap);
static int vop_stdvput_pair(struct vop_vput_pair_args *ap);
+static int vop_stddeallocate(struct vop_deallocate_args *ap);
/*
* This vnode table stores what we want to do if the filesystem doesn't
@@ -117,6 +118,7 @@ struct vop_vector default_vnodeops = {
.vop_advlockasync = vop_stdadvlockasync,
.vop_advlockpurge = vop_stdadvlockpurge,
.vop_allocate = vop_stdallocate,
+ .vop_deallocate = vop_stddeallocate,
.vop_bmap = vop_stdbmap,
.vop_close = VOP_NULL,
.vop_fsync = VOP_NULL,
@@ -518,6 +520,7 @@ vop_stdpathconf(ap)
case _PC_ACL_EXTENDED:
case _PC_ACL_NFS4:
case _PC_CAP_PRESENT:
+ case _PC_DEALLOC_PRESENT:
case _PC_INF_PRESENT:
case _PC_MAC_PRESENT:
*ap->a_retval = 0;
@@ -1069,6 +1072,125 @@ vop_stdallocate(struct vop_allocate_args *ap)
return (error);
}
+static int
+vp_zerofill(struct vnode *vp, struct vattr *vap, off_t *offsetp, off_t *lenp,
+ struct ucred *cred)
+{ /* Write zeroes over [*offsetp, *offsetp + *lenp) using VOP_WRITE(). */
+ int iosize;
+ int error = 0;
+ struct iovec aiov;
+ struct uio auio;
+ struct thread *td;
+ off_t offset, len;
+
+ iosize = vap->va_blocksize;
+ td = curthread;
+ offset = *offsetp;
+ len = *lenp;
+
+ if (iosize == 0) /* no block size in the attributes; use a default */
+ iosize = BLKDEV_IOSIZE;
+ /* If va_blocksize is 512 bytes, iosize will be 4 kilobytes */
+ iosize = min(iosize * 8, ZERO_REGION_SIZE);
+
+ while (len > 0) {
+ int xfersize = iosize;
+ if (offset % iosize != 0) /* trim so the write ends on an iosize boundary */
+ xfersize -= offset % iosize;
+ if (xfersize > len)
+ xfersize = len;
+
+ aiov.iov_base = __DECONST(void *, zero_region);
+ aiov.iov_len = xfersize;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ auio.uio_resid = xfersize;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+
+ error = VOP_WRITE(vp, &auio, 0, cred);
+ if (error != 0) { /* account only for the bytes actually written */
+ len -= xfersize - auio.uio_resid;
+ offset += xfersize - auio.uio_resid;
+ break;
+ }
+
+ len -= xfersize;
+ offset += xfersize;
+ }
+
+ *offsetp = offset; /* hand the unprocessed remainder back to the caller */
+ *lenp = len;
+ return (error);
+}
+
+static int
+vop_stddeallocate(struct vop_deallocate_args *ap)
+{ /* Default VOP_DEALLOCATE: zero-fill data regions in the range. */
+ struct vnode *vp;
+ off_t offset, len;
+ struct ucred *cred;
+ int error;
+ struct vattr va;
+ off_t noff, xfersize, rem;
+
+ vp = ap->a_vp;
+ offset = *ap->a_offset;
+ len = *ap->a_len;
+ cred = ap->a_cred;
+
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error)
+ return (error);
+
+ len = omin(OFF_MAX - offset, *ap->a_len); /* keep offset + len <= OFF_MAX */
+ while (len > 0) {
+ noff = offset;
+ error = vn_bmap_seekhole_locked(vp, FIOSEEKDATA, &noff, cred);
+ if (error) {
+ if (error != ENXIO)
+ /* XXX: Is it okay to fallback further? */
+ goto out;
+
+ /*
+ * No more data region to be filled
+ */
+ len = 0;
+ error = 0;
+ break;
+ }
+ KASSERT(noff >= offset, ("FIOSEEKDATA going backward"));
+ if (noff != offset) { /* skip the gap before the next data region */
+ xfersize = omin(noff - offset, len);
+ len -= xfersize;
+ offset += xfersize;
+ if (len == 0)
+ break;
+ }
+ error = vn_bmap_seekhole_locked(vp, FIOSEEKHOLE, &noff, cred);
+ if (error)
+ goto out;
+
+ /* Fill zeroes */
+ xfersize = rem = omin(noff - offset, len);
+ error = vp_zerofill(vp, &va, &offset, &rem, cred);
+ if (error) {
+ len -= xfersize - rem; /* subtract only what was zeroed */
+ goto out;
+ }
+
+ len -= xfersize;
+ if (should_yield()) /* stop early with a partial result */
+ break;
+ }
+out:
+ *ap->a_offset = offset; /* report the unprocessed remainder */
+ *ap->a_len = len;
+ return (error);
+}
+
int
vop_stdadvise(struct vop_advise_args *ap)
{
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index ccc468d71737..c54f55a99036 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -106,6 +106,7 @@ static fo_kqfilter_t vn_kqfilter;
static fo_close_t vn_closefile;
static fo_mmap_t vn_mmap;
static fo_fallocate_t vn_fallocate;
+static fo_fspacectl_t vn_fspacectl;
struct fileops vnops = {
.fo_read = vn_io_fault,
@@ -123,6 +124,7 @@ struct fileops vnops = {
.fo_fill_kinfo = vn_fill_kinfo,
.fo_mmap = vn_mmap,
.fo_fallocate = vn_fallocate,
+ .fo_fspacectl = vn_fspacectl,
.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
@@ -3439,6 +3441,114 @@ vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
return (error);
}
+static int
+vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags,
+ int ioflg, struct ucred *active_cred, struct ucred *file_cred)
+{ /* Run VOP_DEALLOCATE() in passes until done or an error occurs. */
+ struct mount *mp;
+ void *rl_cookie;
+ off_t off, len;
+ int error;
+#ifdef AUDIT
+ bool audited_vnode1 = false;
+#endif
+
+ rl_cookie = NULL;
+ error = 0;
+ mp = NULL;
+ off = *offset;
+ len = *length;
+
+ if ((ioflg & (IO_NODELOCKED|IO_RANGELOCKED)) == 0)
+ rl_cookie = vn_rangelock_wlock(vp, off, off + len);
+ while (len > 0 && error == 0) {
+ /*
+ * Try to deallocate the longest range in one pass.
+ * In case a pass takes too long to be executed, it returns
+ * partial result. The residue will be processed in the next
+ * pass.
+ */
+
+ if ((ioflg & IO_NODELOCKED) == 0) { /* caller did not lock the vnode */
+ bwillwrite();
+ if ((error = vn_start_write(vp, &mp,
+ V_WAIT | PCATCH)) != 0)
+ goto out;
+ vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
+ }
+#ifdef AUDIT
+ if (!audited_vnode1) { /* audit the vnode on the first pass only */
+ AUDIT_ARG_VNODE1(vp);
+ audited_vnode1 = true;
+ }
+#endif
+
+#ifdef MAC
+ if ((ioflg & IO_NOMACCHECK) == 0)
+ error = mac_vnode_check_write(active_cred, file_cred,
+ vp);
+#endif
+ if (error == 0)
+ error = VOP_DEALLOCATE(vp, &off, &len, flags,
+ active_cred);
+
+ if ((ioflg & IO_NODELOCKED) == 0) { /* undo this pass's locking */
+ VOP_UNLOCK(vp);
+ if (mp != NULL) {
+ vn_finished_write(mp);
+ mp = NULL;
+ }
+ }
+ }
+out:
+ if (rl_cookie != NULL)
+ vn_rangelock_unlock(vp, rl_cookie);
+ *offset = off; /* report the unprocessed remainder */
+ *length = len;
+ return (error);
+}
+
+int
+vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
+ int ioflg, struct ucred *active_cred, struct ucred *file_cred)
+{ /* Validate the arguments, then call vn_deallocate_impl(). */
+ if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset ||
+ flags != 0)
+ return (EINVAL);
+ if (vp->v_type != VREG) /* only VREG vnodes are accepted */
+ return (ENODEV);
+
+ return (vn_deallocate_impl(vp, offset, length, flags, ioflg,
+ active_cred, file_cred));
+}
+
+static int
+vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags,
+ struct ucred *active_cred, struct thread *td)
+{ /* fo_fspacectl method for files backed by a vnode. */
+ int error;
+ struct vnode *vp;
+
+ vp = fp->f_vnode;
+
+ if (cmd != SPACECTL_DEALLOC || *offset < 0 || *length <= 0 ||
+ *length > OFF_MAX - *offset || flags != 0)
+ return (EINVAL);
+ if (vp->v_type != VREG) /* only VREG vnodes are accepted */
+ return (ENODEV);
+
+ switch (cmd) {
+ case SPACECTL_DEALLOC:
+ error = vn_deallocate_impl(vp, offset, length, flags, 0,
+ active_cred, fp->f_cred); /* ioflg 0: impl does the locking */
+ break;
+ default: /* not reached: cmd was checked above */
+ panic("vn_fspacectl: unknown cmd %d", cmd);
+ }
+
+ return (error);
+}
+
static u_long vn_lock_pair_pause_cnt;
SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
&vn_lock_pair_pause_cnt, 0,
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index b506237f385d..97ac1cff6705 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -801,6 +801,17 @@ vop_vput_pair {
};
+%% deallocate vp L L L
+
+vop_deallocate {
+ IN struct vnode *vp;
+ INOUT off_t *offset;
+ INOUT off_t *len;
+ IN int flags;
+ IN struct ucred *cred;
+};
+
+
# The VOPs below are spares at the end of the table to allow new VOPs to be
# added in stable branches without breaking the KBI. New VOPs in HEAD should
# be added above these spares. When merging a new VOP to a stable branch,
diff --git a/sys/security/audit/audit_bsm.c b/sys/security/audit/audit_bsm.c
index d350ef3cf3c2..1f9918a42159 100644
--- a/sys/security/audit/audit_bsm.c
+++ b/sys/security/audit/audit_bsm.c
@@ -1091,6 +1091,18 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau)
FD_VNODE1_TOKENS;
break;
+ case AUE_FSPACECTL:
+ if (ARG_IS_VALID(kar, ARG_CMD)) {
+ tok = au_to_arg32(2, "operation", ar->ar_arg_cmd);
+ kau_write(rec, tok);
+ }
+ if (ARG_IS_VALID(kar, ARG_FFLAGS)) {
+ tok = au_to_arg32(4, "flags", ar->ar_arg_fflags);
+ kau_write(rec, tok);
+ }
+ FD_VNODE1_TOKENS;
+ break;
+
case AUE_RFORK:
if (ARG_IS_VALID(kar, ARG_FFLAGS)) {
tok = au_to_arg32(1, "flags", ar->ar_arg_fflags);
diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h
index 8fa52aeacafd..934c648aecc0 100644
--- a/sys/sys/fcntl.h
+++ b/sys/sys/fcntl.h
@@ -323,6 +323,14 @@ struct __oflock {
short l_type; /* lock type: read/write, etc. */
short l_whence; /* type of l_start */
};
+
+/*
+ * Space control offset/length description
+ */
+struct spacectl_range {
+ off_t r_offset; /* starting offset */
+ off_t r_len; /* length */
+};
#endif
#if __BSD_VISIBLE
@@ -352,6 +360,16 @@ struct __oflock {
* similar syscalls.
*/
#define FD_NONE -200
+
+/*
+ * Commands for fspacectl(2)
+ */
+#define SPACECTL_DEALLOC 1 /* deallocate space */
+
+/*
+ * fspacectl(2) flags
+ */
+#define SPACECTL_F_SUPPORTED 0
#endif
#ifndef _KERNEL
@@ -361,6 +379,8 @@ int creat(const char *, mode_t);
int fcntl(int, int, ...);
#if __BSD_VISIBLE
int flock(int, int);
+int fspacectl(int, int, const struct spacectl_range *, int,
+ struct spacectl_range *);
#endif
#if __POSIX_VISIBLE >= 200809
int openat(int, const char *, int, ...);
diff --git a/sys/sys/file.h b/sys/sys/file.h
index b16e23bdfbcf..8a790a25fc6b 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -129,6 +129,9 @@ typedef int fo_add_seals_t(struct file *fp, int flags);
typedef int fo_get_seals_t(struct file *fp, int *flags);
typedef int fo_fallocate_t(struct file *fp, off_t offset, off_t len,
struct thread *td);
+typedef int fo_fspacectl_t(struct file *fp, int cmd,
+ off_t *offset, off_t *length, int flags,
+ struct ucred *active_cred, struct thread *td);
typedef int fo_flags_t;
struct fileops {
@@ -150,6 +153,7 @@ struct fileops {
fo_add_seals_t *fo_add_seals;
fo_get_seals_t *fo_get_seals;
fo_fallocate_t *fo_fallocate;
+ fo_fspacectl_t *fo_fspacectl;
fo_flags_t fo_flags; /* DFLAG_* below */
};
@@ -472,6 +476,17 @@ fo_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
return ((*fp->f_ops->fo_fallocate)(fp, offset, len, td));
}
+static __inline int fo_fspacectl(struct file *fp, int cmd, off_t *offset,
+ off_t *length, int flags, struct ucred *active_cred, struct thread *td)
+{ /* Dispatch to the fo_fspacectl method in fp's fileops. */
+
+ if (fp->f_ops->fo_fspacectl == NULL) /* no method in this fileops */
+ return (ENODEV);
+ return ((*fp->f_ops->fo_fspacectl)(fp, cmd, offset, length, flags,
+ active_cred, td));
+}
+
+
#endif /* _KERNEL */
#endif /* !SYS_FILE_H */
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index eb7b82946988..dc1b3d6a83de 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -59,6 +59,7 @@ struct rusage;
struct sched_param;
union semun;
struct sockaddr;
+struct spacectl_range;
struct stat;
struct thr_param;
struct timex;
@@ -233,6 +234,8 @@ int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
int advice);
int kern_posix_fallocate(struct thread *td, int fd, off_t offset,
off_t len);
+int kern_fspacectl(struct thread *td, int fd, int cmd,
+ const struct spacectl_range *, int flags, struct spacectl_range *);
int kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com,
void *data);
int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte,
diff --git a/sys/sys/unistd.h b/sys/sys/unistd.h
index 3b3de3aa33bc..ef8835a812d2 100644
--- a/sys/sys/unistd.h
+++ b/sys/sys/unistd.h
@@ -156,6 +156,7 @@
#define _PC_INF_PRESENT 62
#define _PC_MAC_PRESENT 63
#define _PC_ACL_NFS4 64
+#define _PC_DEALLOC_PRESENT 65
#endif
/* From OpenSolaris, used by SEEK_DATA/SEEK_HOLE. */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 702fd6623e6a..56591a8d8a8d 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -741,6 +741,8 @@ int vn_copy_file_range(struct vnode *invp, off_t *inoffp,
struct vnode *outvp, off_t *outoffp, size_t *lenp,
unsigned int flags, struct ucred *incred, struct ucred *outcred,
struct thread *fsize_td);
+int vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
+ int ioflg, struct ucred *active_cred, struct ucred *file_cred);
void vn_finished_write(struct mount *mp);
void vn_finished_secondary_write(struct mount *mp);
int vn_fsync_buf(struct vnode *vp, int waitfor);
diff --git a/tests/sys/file/Makefile b/tests/sys/file/Makefile
index 46a6a9544c62..8191467ddc01 100644
--- a/tests/sys/file/Makefile
+++ b/tests/sys/file/Makefile
@@ -11,6 +11,7 @@ TAP_TESTS_C+= fcntlflags_test
TAP_TESTS_SH+= flock_test
PLAIN_TESTS_C+= ftruncate_test
PLAIN_TESTS_C+= newfileops_on_fork_test
+ATF_TESTS_C+= fspacectl_test
PROGS+= flock_helper
diff --git a/tests/sys/file/fspacectl_test.c b/tests/sys/file/fspacectl_test.c
new file mode 100644
index 000000000000..2831a333bc35
--- /dev/null
+++ b/tests/sys/file/fspacectl_test.c
@@ -0,0 +1,338 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Ka Ho Ng under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+
+#include <atf-c.h>
+#include <fcntl.h>
+#include <malloc.h>
+
+static off_t file_max_blocks = 32;
+static const char byte_to_fill = 0x5f;
+
+/*
+ * Write len bytes of byte_to_fill to fd starting at offset, in chunks of
+ * at most the st_blksize that fstat(2) reports for fd.
+ *
+ * Returns 0 on success.  Returns 1 if fstat(2) or malloc(3) fails, if the
+ * reported block size is not positive, or if any pwrite(2) fails or
+ * writes fewer bytes than requested.
+ */
+static int
+fill(int fd, off_t offset, off_t len)
+{
+	int error;
+	size_t blen;
+	char *buf;
+	struct stat statbuf;
+	blksize_t blocksize;
+
+	if (fstat(fd, &statbuf) == -1)
+		return (1);
+	blocksize = statbuf.st_blksize;
+	/* A zero chunk size would never advance the loop below. */
+	if (blocksize <= 0)
+		return (1);
+	error = 0;
+	buf = malloc(blocksize);
+	if (buf == NULL)
+		return (1);
+	/* The pattern never changes, so prepare the buffer only once. */
+	memset(buf, byte_to_fill, blocksize);
+
+	while (len > 0) {
+		/* The final chunk may be shorter than a full block. */
+		blen = len < (off_t)blocksize ? len : blocksize;
+		if (pwrite(fd, buf, blen, offset) != (ssize_t)blen) {
+			error = 1;
+			break;
+		}
+		len -= blen;
+		offset += blen;
+	}
+
+	free(buf);
+	return (error);
+}
+
+/*
+ * Report the f_iosize that statfs(2) returns for the current working
+ * directory.  Despite the name, no file descriptor is consulted.
+ * Returns -1 if statfs(2) fails.
+ */
+static blksize_t
+fd_get_blksize(void)
+{
+	struct statfs sfs;
+	blksize_t iosize;
+
+	iosize = -1;
+	if (statfs(".", &sfs) != -1)
+		iosize = sfs.f_iosize;
+	return (iosize);
+}
+
+/*
+ * Verify that the len bytes of fd starting at offset match the reference
+ * block "expected", compared chunk by chunk.  "expected" and the scratch
+ * buffer "buf" must each hold at least blocksize bytes.
+ *
+ * Returns 0 when the whole region matches, or 1 on a failed or short
+ * pread(2) or on the first content mismatch.
+ */
+static int
+check_region(int fd, char *buf, const char *expected, blksize_t blocksize,
+    off_t offset, off_t len)
+{
+	size_t blen;
+
+	while (len > 0) {
+		/* The final chunk may be shorter than a full block. */
+		blen = len < (off_t)blocksize ? len : blocksize;
+		if (pread(fd, buf, blen, offset) != (ssize_t)blen)
+			return (1);
+		if (memcmp(buf, expected, blen) != 0)
+			return (1);
+		len -= blen;
+		offset += blen;
+	}
+	return (0);
+}
+
+/*
+ * Verify the contents of fd after a deallocation of
+ * [hole_start, hole_start + hole_len) from a file of file_sz bytes that
+ * was previously filled with byte_to_fill:
+ *   - the hole, clamped to file_sz, reads back as zeroes;
+ *   - the data before and after the hole still holds byte_to_fill;
+ *   - the file size is still file_sz.
+ *
+ * Returns 0 if everything matches, 1 on a setup, read or content failure,
+ * and -1 if fstat(2) fails or the file size differs (the size check takes
+ * precedence over a content failure).
+ */
+static int
+check_content_dealloc(int fd, off_t hole_start, off_t hole_len, off_t file_sz)
+{
+	int error;
+	off_t hole_end;
+	struct stat statbuf;
+	char *buf, *sblk;
+	blksize_t blocksize;
+
+	blocksize = fd_get_blksize();
+	if (blocksize == -1)
+		return (1);
+	/* First half is the read buffer, second half the reference block. */
+	buf = malloc(blocksize * 2);
+	if (buf == NULL)
+		return (1);
+	sblk = buf + blocksize;
+
+	/*
+	 * Clamp the hole to the end of the file.  The sum is done unsigned
+	 * so that a range reaching up to OFF_MAX cannot overflow off_t.
+	 */
+	if ((uint64_t)hole_start + hole_len > (uint64_t)file_sz)
+		hole_len = file_sz - hole_start;
+	hole_end = hole_start + hole_len;
+
+	error = 0;
+
+	/*
+	 * Check hole is zeroed.
+	 */
+	memset(sblk, 0, blocksize);
+	if (check_region(fd, buf, sblk, blocksize, hole_start, hole_len) != 0)
+		error = 1;
+
+	/*
+	 * Check the file regions before and after the hole are untouched,
+	 * i.e. still hold the fill pattern.
+	 */
+	memset(sblk, byte_to_fill, blocksize);
+	if (check_region(fd, buf, sblk, blocksize, 0, hole_start) != 0)
+		error = 1;
+	if (check_region(fd, buf, sblk, blocksize, hole_end,
+	    file_sz - hole_end) != 0)
+		error = 1;
+
+	/*
+	 * Check file size matches with expected file size.  statbuf is only
+	 * inspected when fstat(2) succeeded.
+	 */
+	if (fstat(fd, &statbuf) == -1 || statbuf.st_size != file_sz)
+		error = -1;
+
+	free(buf);
+	return (error);
+}
+
+/*
+ * Block-aligned hole inside the file: deallocate from the start of the
+ * second block up to the start of the last block, then verify contents.
+ */
+ATF_TC_WITHOUT_HEAD(aligned_dealloc);
+ATF_TC_BODY(aligned_dealloc, tc)
+{
+	int fd;
+	blksize_t blocksize;
+	off_t length, offset;
+	struct spacectl_range range;
+
+	ATF_REQUIRE((blocksize = fd_get_blksize()) != -1);
+
+	/* Hole is [blocksize, (file_max_blocks - 1) * blocksize). */
+	offset = blocksize;
+	length = (file_max_blocks - 1) * blocksize - offset;
+	range.r_offset = offset;
+	range.r_len = length;
+
+	ATF_REQUIRE((fd = open("sys_fspacectl_testfile",
+	    O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1);
+	ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0);
+	/* The same structure serves as request and as result range. */
+	ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0);
+	ATF_CHECK(check_content_dealloc(fd, offset, length,
+	    file_max_blocks * blocksize) == 0);
+	ATF_REQUIRE(close(fd) == 0);
+}
+
+/*
+ * Unaligned hole inside the file: deallocate from the middle of the
+ * first block up to the middle of the last block, then verify contents.
+ */
+ATF_TC_WITHOUT_HEAD(unaligned_dealloc);
+ATF_TC_BODY(unaligned_dealloc, tc)
+{
+	int fd;
+	blksize_t blocksize;
+	off_t length, offset;
+	struct spacectl_range range;
+
+	ATF_REQUIRE((blocksize = fd_get_blksize()) != -1);
+
+	/* Both ends of the hole sit half a block off a block boundary. */
+	offset = blocksize / 2;
+	length = (file_max_blocks - 1) * blocksize + blocksize / 2 - offset;
+	range.r_offset = offset;
+	range.r_len = length;
+
+	ATF_REQUIRE((fd = open("sys_fspacectl_testfile",
+	    O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1);
+	ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0);
+	/* The same structure serves as request and as result range. */
+	ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0);
+	ATF_CHECK(check_content_dealloc(fd, offset, length,
+	    file_max_blocks * blocksize) == 0);
+	ATF_REQUIRE(close(fd) == 0);
+}
+
+/*
+ * Block-aligned start with a length that extends the range to OFF_MAX:
+ * deallocate from the second block onwards, then verify contents.
+ */
+ATF_TC_WITHOUT_HEAD(aligned_dealloc_offmax);
+ATF_TC_BODY(aligned_dealloc_offmax, tc)
+{
+	int fd;
+	blksize_t blocksize;
+	off_t length, offset;
+	struct spacectl_range range;
+
+	ATF_REQUIRE((blocksize = fd_get_blksize()) != -1);
+
+	/* offset + length equals OFF_MAX exactly. */
+	offset = blocksize;
+	length = OFF_MAX - offset;
+	range.r_offset = offset;
+	range.r_len = length;
+
+	ATF_REQUIRE((fd = open("sys_fspacectl_testfile",
+	    O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1);
+	ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0);
+	/* The same structure serves as request and as result range. */
+	ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0);
+	ATF_CHECK(check_content_dealloc(fd, offset, length,
+	    file_max_blocks * blocksize) == 0);
+	ATF_REQUIRE(close(fd) == 0);
+}
+
+/*
+ * Unaligned start with a length that extends the range to OFF_MAX:
+ * deallocate from the middle of the first block onwards, then verify
+ * contents.
+ */
+ATF_TC_WITHOUT_HEAD(unaligned_dealloc_offmax);
+ATF_TC_BODY(unaligned_dealloc_offmax, tc)
+{
+	int fd;
+	blksize_t blocksize;
+	off_t length, offset;
+	struct spacectl_range range;
+
+	ATF_REQUIRE((blocksize = fd_get_blksize()) != -1);
+
+	/* offset + length equals OFF_MAX exactly. */
+	offset = blocksize / 2;
+	length = OFF_MAX - offset;
+	range.r_offset = offset;
+	range.r_len = length;
+
+	ATF_REQUIRE((fd = open("sys_fspacectl_testfile",
+	    O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1);
+	ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0);
+	/* The same structure serves as request and as result range. */
+	ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0);
+	ATF_CHECK(check_content_dealloc(fd, offset, length,
+	    file_max_blocks * blocksize) == 0);
+	ATF_REQUIRE(close(fd) == 0);
+}
+
+/*
+ * Block-aligned range that crosses EOF: deallocate from the second block
+ * to one full block past the end of the filled data, then verify
+ * contents and that the file size is unchanged.
+ */
+ATF_TC_WITHOUT_HEAD(aligned_dealloc_eof);
+ATF_TC_BODY(aligned_dealloc_eof, tc)
+{
+	int fd;
+	blksize_t blocksize;
+	off_t length, offset;
+	struct spacectl_range range;
+
+	ATF_REQUIRE((blocksize = fd_get_blksize()) != -1);
+
+	/* Range ends at (file_max_blocks + 1) * blocksize, beyond EOF. */
+	offset = blocksize;
+	length = (file_max_blocks + 1) * blocksize - offset;
+	range.r_offset = offset;
+	range.r_len = length;
+
+	ATF_REQUIRE((fd = open("sys_fspacectl_testfile",
+	    O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1);
+	ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0);
+	/* The same structure serves as request and as result range. */
+	ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0);
+	ATF_CHECK(check_content_dealloc(fd, offset, length,
+	    file_max_blocks * blocksize) == 0);
+	ATF_REQUIRE(close(fd) == 0);
+}
+
+/*
+ * Unaligned range that crosses EOF: deallocate from the middle of the
+ * first block to half a block past the end of the filled data, then
+ * verify contents and that the file size is unchanged.
+ */
+ATF_TC_WITHOUT_HEAD(unaligned_dealloc_eof);
+ATF_TC_BODY(unaligned_dealloc_eof, tc)
+{
+	int fd;
+	blksize_t blocksize;
+	off_t length, offset;
+	struct spacectl_range range;
+
+	ATF_REQUIRE((blocksize = fd_get_blksize()) != -1);
+
+	/* Range ends half a block beyond EOF. */
+	offset = blocksize / 2;
+	length = file_max_blocks * blocksize + blocksize / 2 - offset;
+	range.r_offset = offset;
+	range.r_len = length;
+
+	ATF_REQUIRE((fd = open("sys_fspacectl_testfile",
+	    O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1);
+	ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0);
+	/* The same structure serves as request and as result range. */
+	ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0);
+	ATF_CHECK(check_content_dealloc(fd, offset, length,
+	    file_max_blocks * blocksize) == 0);
+	ATF_REQUIRE(close(fd) == 0);
+}
+
+/*
+ * Test program entry point: register the deallocation test cases in
+ * three pairs (inside the file, across EOF, up to OFF_MAX), each pair
+ * being the aligned case followed by the unaligned case.
+ */
+ATF_TP_ADD_TCS(tp)
+{
+	/* Hole entirely inside the file. */
+	ATF_TP_ADD_TC(tp, aligned_dealloc);
+	ATF_TP_ADD_TC(tp, unaligned_dealloc);
+	/* Range crossing the end of the file. */
+	ATF_TP_ADD_TC(tp, aligned_dealloc_eof);
+	ATF_TP_ADD_TC(tp, unaligned_dealloc_eof);
+	/* Range extending to OFF_MAX. */
+	ATF_TP_ADD_TC(tp, aligned_dealloc_offmax);
+	ATF_TP_ADD_TC(tp, unaligned_dealloc_offmax);
+
+	return (atf_no_error());
+}