aboutsummaryrefslogtreecommitdiff
path: root/share/man/man9/socket.9
blob: fb0ead0e20e120b073b4e4f4b863c3079c5fd63c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
.\"-
.\" Copyright (c) 2006 Robert N. M. Watson
.\" Copyright (c) 2014 Benjamin J. Kaduk
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\"    notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\"    notice, this list of conditions and the following disclaimer in the
.\"    documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.Dd September 6, 2022
.Dt SOCKET 9
.Os
.Sh NAME
.Nm socket
.Nd "kernel socket interface"
.Sh SYNOPSIS
.In sys/socket.h
.In sys/socketvar.h
.Ft void
.Fn soabort "struct socket *so"
.Ft int
.Fn soaccept "struct socket *so" "struct sockaddr *nam"
.Ft int
.Fn socheckuid "struct socket *so" "uid_t uid"
.Ft int
.Fn sobind "struct socket *so" "struct sockaddr *nam" "struct thread *td"
.Ft void
.Fn soclose "struct socket *so"
.Ft int
.Fn soconnect "struct socket *so" "struct sockaddr *nam" "struct thread *td"
.Ft int
.Fo socreate
.Fa "int dom" "struct socket **aso" "int type" "int proto"
.Fa "struct ucred *cred" "struct thread *td"
.Fc
.Ft int
.Fn sodisconnect "struct socket *so"
.Ft void
.Fo sodtor_set
.Fa "struct socket *so"
.Fa "void (*func)(struct socket *)"
.Fc
.Ft struct  sockaddr *
.Fn sodupsockaddr "const struct sockaddr *sa" "int mflags"
.Ft void
.Fn sofree "struct socket *so"
.Ft void
.Fn sohasoutofband "struct socket *so"
.Ft int
.Fn solisten "struct socket *so" "int backlog" "struct thread *td"
.Ft void
.Fn solisten_proto "struct socket *so" "int backlog"
.Ft int
.Fn solisten_proto_check "struct socket *so"
.Ft struct socket *
.Fn sonewconn "struct socket *head" "int connstatus"
.Ft int
.Fo sopoll
.Fa "struct socket *so" "int events" "struct ucred *active_cred"
.Fa "struct thread *td"
.Fc
.Ft int
.Fo sopoll_generic
.Fa "struct socket *so" "int events" "struct ucred *active_cred"
.Fa "struct thread *td"
.Fc
.Ft int
.Fo soreceive
.Fa "struct socket *so" "struct sockaddr **psa" "struct uio *uio"
.Fa "struct mbuf **mp0" "struct mbuf **controlp" "int *flagsp"
.Fc
.Ft int
.Fo soreceive_stream
.Fa "struct socket *so" "struct sockaddr **paddr"
.Fa "struct uio *uio" "struct mbuf **mp0" "struct mbuf **controlp"
.Fa "int *flagsp"
.Fc
.Ft int
.Fo soreceive_dgram
.Fa "struct socket *so" "struct sockaddr **paddr"
.Fa "struct uio *uio" "struct mbuf **mp0" "struct mbuf **controlp"
.Fa "int *flagsp"
.Fc
.Ft int
.Fo soreceive_generic
.Fa "struct socket *so" "struct sockaddr **paddr"
.Fa "struct uio *uio" "struct mbuf **mp0" "struct mbuf **controlp"
.Fa "int *flagsp"
.Fc
.Ft int
.Fn soreserve "struct socket *so" "u_long sndcc" "u_long rcvcc"
.Ft void
.Fn sorflush "struct socket *so"
.Ft int
.Fo sosend
.Fa "struct socket *so" "struct sockaddr *addr" "struct uio *uio"
.Fa "struct mbuf *top" "struct mbuf *control" "int flags" "struct thread *td"
.Fc
.Ft int
.Fo sosend_dgram
.Fa "struct socket *so" "struct sockaddr *addr"
.Fa "struct uio *uio" "struct mbuf *top" "struct mbuf *control"
.Fa "int flags" "struct thread *td"
.Fc
.Ft int
.Fo sosend_generic
.Fa "struct socket *so" "struct sockaddr *addr"
.Fa "struct uio *uio" "struct mbuf *top" "struct mbuf *control"
.Fa "int flags" "struct thread *td"
.Fc
.Ft int
.Fn soshutdown "struct socket *so" "int how"
.Ft void
.Fn sotoxsocket "struct socket *so" "struct xsocket *xso"
.Ft void
.Fn soupcall_clear "struct socket *so" "int which"
.Ft void
.Fo soupcall_set
.Fa "struct socket *so" "int which"
.Fa "int (*func)(struct socket *, void *, int)" "void *arg"
.Fc
.Ft void
.Fn sowakeup "struct socket *so" "struct sockbuf *sb"
.In sys/sockopt.h
.Ft int
.Fn sosetopt "struct socket *so" "struct sockopt *sopt"
.Ft int
.Fn sogetopt "struct socket *so" "struct sockopt *sopt"
.Ft int
.Fn sooptcopyin "struct sockopt *sopt" "void *buf" "size_t len" "size_t minlen"
.Ft int
.Fn sooptcopyout "struct sockopt *sopt" "const void *buf" "size_t len"
.Sh DESCRIPTION
The kernel
.Nm
programming interface permits in-kernel consumers to interact with
local and network socket objects in a manner similar to that permitted using
the
.Xr socket 2
user API.
These interfaces are appropriate for use by distributed file systems and
other network-aware kernel services.
While the user API operates on file descriptors, the kernel interfaces
operate directly on
.Vt "struct socket"
pointers.
Some portions of the kernel API exist only to implement the user API,
and are not expected to be used by kernel code.
The portions of the socket API used by socket consumers and
implementations of network protocols will differ; some routines
are only useful for protocol implementors.
.Pp
Except where otherwise indicated,
.Nm
functions may sleep, and are not appropriate for use in an interrupt thread
context or while holding non-sleepable kernel locks.
.Ss Creating and Destroying Sockets
A new socket may be created using
.Fn socreate .
As with
.Xr socket 2 ,
arguments specify the requested domain, type, and protocol via
.Fa dom , type ,
and
.Fa proto .
The socket is returned via
.Fa aso
on success.
In addition, the credential used to authorize operations associated with the
socket will be passed via
.Fa cred
(and will be cached for the lifetime of the socket), and the thread
performing the operation via
.Fa td .
.Em Warning :
authorization of the socket creation operation will be performed
using the thread credential for some protocols (such as raw sockets).
.Pp
Sockets may be closed and freed using
.Fn soclose ,
which has similar semantics to
.Xr close 2 .
.Pp
In certain circumstances, it is appropriate to destroy a socket without
waiting for it to disconnect, for which
.Fn soabort
is used.
This is only appropriate for incoming connections which are in a
partially connected state.
It must be called on an unreferenced socket, by the thread which
removed the socket from its listen queue, to prevent races.
It will call into protocol code, so no socket locks may be held
over the call.
The caller of
.Fn soabort
is responsible for setting the VNET context.
The normal path to freeing a socket is
.Fn sofree ,
which handles reference counting on the socket.
It should be called whenever a reference is released, and also whenever
reference flags are cleared in socket or protocol code.
Calls to
.Fn sofree
should not be made from outside the socket layer; outside callers
should use
.Fn soclose
instead.
.Ss Connections and Addresses
The
.Fn sobind
function is equivalent to the
.Xr bind 2
system call, and binds the socket
.Fa so
to the address
.Fa nam .
The operation would be authorized using the credential on thread
.Fa td .
.Pp
The
.Fn soconnect
function is equivalent to the
.Xr connect 2
system call, and initiates a connection on the socket
.Fa so
to the address
.Fa nam .
The operation will be authorized using the credential on thread
.Fa td .
Unlike the user system call,
.Fn soconnect
returns immediately; the caller may
.Xr msleep 9
on
.Fa so->so_timeo
while holding the socket mutex and waiting for the
.Dv SS_ISCONNECTING
flag to clear or
.Fa so->so_error
to become non-zero.
If
.Fn soconnect
fails, the caller must manually clear the
.Dv SS_ISCONNECTING
flag.
.Pp
A call to
.Fn sodisconnect
disconnects the socket without closing it.
.Pp
The
.Fn soshutdown
function is equivalent to the
.Xr shutdown 2
system call, and causes part or all of a connection on a socket to be closed
down.
.Pp
Sockets are transitioned from non-listening status to listening with
.Fn solisten .
.Ss Socket Options
The
.Fn sogetopt
function is equivalent to the
.Xr getsockopt 2
system call, and retrieves a socket option on socket
.Fa so .
The
.Fn sosetopt
function is equivalent to the
.Xr setsockopt 2
system call, and sets a socket option on socket
.Fa so .
.Pp
The second argument in both
.Fn sogetopt
and
.Fn sosetopt
is the
.Fa sopt
pointer to a
.Vt "struct sopt"
describing the socket option operation.
The caller-allocated structure must be zeroed, and then have its fields
initialized to specify socket option operation arguments:
.Bl -tag -width ".Va sopt_valsize"
.It Va sopt_dir
Set to
.Dv SOPT_SET
or
.Dv SOPT_GET
depending on whether this is a get or set operation.
.It Va sopt_level
Specify the level in the network stack the operation is targeted at; for
example,
.Dv SOL_SOCKET .
.It Va sopt_name
Specify the name of the socket option to set.
.It Va sopt_val
Kernel space pointer to the argument value for the socket option.
.It Va sopt_valsize
Size of the argument value in bytes.
.El
.Ss Socket Upcalls
In order for the owner of a socket to be notified when the socket
is ready to send or receive data, an upcall may be registered on
the socket.
The upcall is a function that will be called by the socket framework
when a socket buffer associated with the given socket is ready for
reading or writing.
.Fn soupcall_set
is used to register a socket upcall.
The function
.Va func
is registered, and the pointer
.Va arg
will be passed as its second argument when it is called by the framework.
The possible values for
.Va which
are
.Dv SO_RCV
and
.Dv SO_SND ,
which register upcalls for receive and send events, respectively.
The upcall function
.Fn func
must return either
.Dv SU_OK
or
.Dv SU_ISCONNECTED ,
depending on whether or not a call to
.Xr soisconnected
should be made by the socket framework after the upcall returns.
The upcall
.Va func
cannot call
.Xr soisconnected
itself due to lock ordering with the socket buffer lock.
Only
.Dv SO_RCV
upcalls should return
.Dv SU_ISCONNECTED .
When a
.Dv SO_RCV
upcall returns
.Dv SU_ISCONNECTED ,
the upcall will be removed from the socket.
.Pp
Upcalls are removed from their socket by
.Fn soupcall_clear .
The
.Va which
argument again specifies whether the sending or receiving upcall is to
be cleared, with
.Dv SO_RCV
or
.Dv SO_SND .
.Ss Socket Destructor Callback
A kernel system can use the
.Fn sodtor_set
function to set a destructor for a socket.
The destructor is called when the socket is about to be freed.
The destructor is called before the protocol detach routine.
The destructor can serve as a callback to initiate additional cleanup actions.
.Ss Socket I/O
The
.Fn soreceive
function is equivalent to the
.Xr recvmsg 2
system call, and attempts to receive bytes of data from the socket
.Fa so ,
optionally blocking awaiting for data if none is ready to read.
Data may be retrieved directly to kernel or user memory via the
.Fa uio
argument, or as an mbuf chain returned to the caller via
.Fa mp0 ,
avoiding a data copy.
The
.Fa uio
must always be
.Pf non- Dv NULL .
If
.Fa mp0
is
.Pf non- Dv NULL ,
only the
.Fa uio_resid
of
.Fa uio
is used.
The caller may optionally retrieve a socket address on a protocol with the
.Dv PR_ADDR
capability by providing storage via
.Pf non- Dv NULL
.Fa psa
argument.
The caller may optionally retrieve control data mbufs via a
.Pf non- Dv NULL
.Fa controlp
argument.
Optional flags may be passed to
.Fn soreceive
via a
.Pf non- Dv NULL
.Fa flagsp
argument, and use the same flag name space as the
.Xr recvmsg 2
system call.
.Pp
The
.Fn sosend
function is equivalent to the
.Xr sendmsg 2
system call, and attempts to send bytes of data via the socket
.Fa so ,
optionally blocking if data cannot be immediately sent.
Data may be sent directly from kernel or user memory via the
.Fa uio
argument, or as an mbuf chain via
.Fa top ,
avoiding a data copy.
Only one of the
.Fa uio
or
.Fa top
pointers may be
.Pf non- Dv NULL .
An optional destination address may be specified via a
.Pf non- Dv NULL
.Fa addr
argument, which may result in an implicit connect if supported by the
protocol.
The caller may optionally send control data mbufs via a
.Pf non- Dv NULL
.Fa control
argument.
Flags may be passed to
.Fn sosend
using the
.Fa flags
argument, and use the same flag name space as the
.Xr sendmsg 2
system call.
.Pp
Kernel callers running in an interrupt thread context, or with a mutex held,
will wish to use non-blocking sockets and pass the
.Dv MSG_DONTWAIT
flag in order to prevent these functions from sleeping.
.Pp
A socket can be queried for readability, writability, out-of-band data,
or end-of-file using
.Fn sopoll .
The possible values for
.Va events
are as for
.Xr poll 2 ,
with symbolic values
.Dv POLLIN ,
.Dv POLLPRI ,
.Dv POLLOUT ,
.Dv POLLRDNORM ,
.Dv POLLWRNORM ,
.Dv POLLRDBAND ,
and
.Dv POLLINGEOF
taken from
.In sys/poll.h .
.Pp
Calls to
.Fn soaccept
pass through to the protocol's accept routine to accept an incoming connection.
.Ss Socket Utility Functions
The uid of a socket's credential may be compared against a
.Va uid
with
.Fn socheckuid .
.Pp
A copy of an existing
.Vt struct sockaddr
may be made using
.Fn sodupsockaddr .
.Pp
Protocol implementations notify the socket layer of the arrival of
out-of-band data using
.Fn sohasoutofband ,
so that the socket layer can notify socket consumers of the available data.
.Pp
An
.Dq external-format
version of a
.Vt struct socket
can be created using
.Fn sotoxsocket ,
suitable for isolating user code from changes in the kernel structure.
.Ss Protocol Implementations
Protocols must supply an implementation for
.Fn solisten ;
such protocol implementations can call back into the socket layer using
.Fn solisten_proto_check
and
.Fn solisten_proto
to check and set the socket-layer listen state.
These callbacks are provided so that the protocol implementation
can order the socket layer and protocol locks as necessary.
Protocols must supply an implementation of
.Fn soreceive ;
the functions
.Fn soreceive_stream ,
.Fn soreceive_dgram ,
and
.Fn soreceive_generic
are supplied for use by such implementations.
.Pp
Protocol implementations can use
.Fn sonewconn
to create a socket and attach protocol state to that socket.
This can be used to create new sockets available for
.Fn soaccept
on a listen socket.
The returned socket has a reference count of zero.
.Pp
Protocols must supply an implementation for
.Fn sopoll ;
.Fn sopoll_generic
is provided for the use by protocol implementations.
.Pp
The functions
.Fn sosend_dgram
and
.Fn sosend_generic
are supplied to assist in protocol implementations of
.Fn sosend .
.Pp
When a protocol creates a new socket structure, it is necessary to
reserve socket buffer space for that socket, by calling
.Fn soreserve .
The rough inverse of this reservation is performed by
.Fn sorflush ,
which is called automatically by the socket framework.
.Pp
When a protocol needs to wake up threads waiting for the socket to
become ready to read or write, variants of
.Fn sowakeup
are used.
The
.Fn sowakeup
function should not be called directly by protocol code, instead use the
wrappers
.Fn sorwakeup ,
.Fn sorwakeup_locked ,
.Fn sowwakeup ,
and
.Fn sowwakeup_locked
for readers and writers, with the corresponding socket buffer lock
not already locked, or already held, respectively.
.Pp
The functions
.Fn sooptcopyin
and
.Fn sooptcopyout
are useful for transferring
.Vt struct sockopt
data between user and kernel code.
.Sh SEE ALSO
.Xr bind 2 ,
.Xr close 2 ,
.Xr connect 2 ,
.Xr getsockopt 2 ,
.Xr recv 2 ,
.Xr send 2 ,
.Xr setsockopt 2 ,
.Xr shutdown 2 ,
.Xr socket 2 ,
.Xr ng_ksocket 4 ,
.Xr intr_event 9 ,
.Xr msleep 9 ,
.Xr ucred 9
.Sh HISTORY
The
.Xr socket 2
system call appeared in
.Bx 4.2 .
This manual page was introduced in
.Fx 7.0 .
.Sh AUTHORS
This manual page was written by
.An Robert Watson
and
.An Benjamin Kaduk .
.Sh BUGS
The use of explicitly passed credentials, credentials hung from explicitly
passed threads, the credential on
.Dv curthread ,
and the cached credential from
socket creation time is inconsistent, and may lead to unexpected behaviour.
It is possible that several of the
.Fa td
arguments should be
.Fa cred
arguments, or simply not be present at all.
.Pp
The caller may need to manually clear
.Dv SS_ISCONNECTING
if
.Fn soconnect
returns an error.
.Pp
The
.Dv MSG_DONTWAIT
flag is not implemented for
.Fn sosend ,
and may not always work with
.Fn soreceive
when zero copy sockets are enabled.
.Pp
This manual page does not describe how to register socket upcalls or monitor
a socket for readability/writability without using blocking I/O.
.Pp
The
.Fn soref
and
.Fn sorele
functions are not described, and in most cases should not be used, due to
confusing and potentially incorrect interactions when
.Fn sorele
is last called after
.Fn soclose .