<?xml version="1.0" encoding="ISO8859-1" standalone="no"?>
<!--
     The FreeBSD Documentation Project
     The FreeBSD SMP Next Generation Project

     $FreeBSD$
-->
<chapter id="smp">
  <chapterinfo>
    <authorgroup>
      <author>
	<firstname>John</firstname>
	<surname>Baldwin</surname>
	<contrib>Written by </contrib>
      </author>
      <author>
	<firstname>Robert</firstname>
	<surname>Watson</surname>
      </author>
    </authorgroup>

    <copyright>
      <year>2002</year>
      <year>2004</year>
      <year>2005</year>
      <holder>John Baldwin</holder>
      <holder>Robert Watson</holder>
    </copyright>
  </chapterinfo>

  <title>SMPng Design Document</title>

  <sect1 id="smp-intro">
    <title>Introduction</title>

    <indexterm><primary>SMP Next Generation</primary></indexterm>
    <indexterm><primary>kernel synchronization</primary></indexterm>

    <para>This document presents the current design and implementation
      of the SMPng Architecture.  First, the basic primitives and
      tools are introduced.  Next, a general architecture for the
      FreeBSD kernel's synchronization and execution model is laid
      out.  Then, locking strategies for specific subsystems are
      discussed, documenting the approaches taken to introduce
      fine-grained synchronization and parallelism for each subsystem.
      Finally, detailed implementation notes are provided to motivate
      design choices, and make the reader aware of important
      implications involving the use of specific primitives. </para>

    <para>This document is a work-in-progress, and will be updated to
      reflect on-going design and implementation activities associated
      with the SMPng Project.  Many sections currently exist only in
      outline form, but will be fleshed out as work proceeds.  Updates or
      suggestions regarding the document may be directed to the document
      editors.</para>

    <indexterm><primary>concurrency</primary></indexterm>
    <para>The goal of SMPng is to allow concurrency in the kernel.
      The kernel is basically one rather large and complex program. To
      make the kernel multi-threaded we use some of the same tools used
      to make other programs multi-threaded.  These include mutexes,
      shared/exclusive locks, semaphores, and condition variables.  For
      the definitions of these and other SMP-related terms, please see
      the <xref linkend="smp-glossary"/> section of this article.</para>
  </sect1>

  <sect1 id="smp-lock-fundamentals">
    <title>Basic Tools and Locking Fundamentals</title>

    <sect2>
      <title>Atomic Instructions and Memory Barriers</title>

    <indexterm><primary>atomic instructions</primary></indexterm>
    <indexterm><primary>memory barriers</primary></indexterm>

      <para>There are several existing treatments of memory barriers
	and atomic instructions, so this section will not include a
	lot of detail.  To put it simply, one can not go around reading
	variables without a lock if a lock is used to protect writes
	to that variable.  This becomes obvious when you consider that
	memory barriers simply determine relative order of memory
	operations; they do not make any guarantee about timing of
	memory operations.  That is, a memory barrier does not force
	the contents of a CPU's local cache or store buffer to flush.
	Instead, the memory barrier at lock release simply ensures
	that all writes to the protected data will be visible to other
	CPUs or devices if the write to release the lock is visible.
	The CPU is free to keep that data in its cache or store buffer
	as long as it wants. However, if another CPU performs an
	atomic instruction on the same datum, the first CPU must
	guarantee that the updated value is made visible to the second
	CPU along with any other operations that memory barriers may
	require.</para>

      <para>For example, assuming a simple model where data is
	considered visible when it is in main memory (or a global
	cache), when an atomic instruction is triggered on one CPU,
	other CPUs' store buffers and caches must flush any writes to
	that same cache line along with any pending operations behind
	a memory barrier.</para>

      <para>This requires one to take special care when using an item
	protected by atomic instructions.  For example, in the sleep
	mutex implementation, we have to use an
	<function>atomic_cmpset</function> rather than an
	<function>atomic_set</function> to turn on the
	<constant>MTX_CONTESTED</constant> bit.  The reason is that we
	read the value of <structfield>mtx_lock</structfield> into a
	variable and then make a decision based on that read.
	However, the value we read may be stale, or it may change
	while we are making our decision.  Thus, when the
	<function>atomic_set</function> is executed, it may end up
	setting the bit on another value than the one we made the
	decision on. Thus, we have to use an
	<function>atomic_cmpset</function> to set the value only if
	the value we made the decision on is up-to-date and
	valid.</para>

      <para>Finally, atomic instructions only allow one item to be
	updated or read.  If one needs to atomically update several
	items, then a lock must be used instead.  For example, if two
	counters must be read and have values that are consistent
	relative to each other, then those counters must be protected
	by a lock rather than by separate atomic instructions.</para>
    </sect2>

    <sect2>
      <title>Read Locks versus Write Locks</title>

    <indexterm><primary>read locks</primary></indexterm>
    <indexterm><primary>write locks</primary></indexterm>
      <para>Read locks do not need to be as strong as write locks.
	Both types of locks need to ensure that the data they are
	accessing is not stale.  However, only write access requires
	exclusive access.  Multiple threads can safely read a value.
	Using different types of locks for reads and writes can be
	implemented in a number of ways.</para>

      <para>First, sx locks can be used in this manner by using an
	exclusive lock when writing and a shared lock when reading.
	This method is quite straightforward.</para>

      <para>A second method is a bit more obscure.  You can protect a
	datum with multiple locks.  Then for reading that data you
	simply need to have a read lock of one of the locks.  However,
	to write to the data, you need to have a write lock of all of
	the locks.  This can make writing rather expensive but can be
	useful when data is accessed in various ways.  For example,
	the parent process pointer is protected by both the
	<varname>proctree_lock</varname> sx lock and the per-process
	mutex.  Sometimes the proc lock is easier as we are just
	checking to see who a parent of a process is that we already
	have locked.  However, other places such as
	<function>inferior</function> need to walk the tree of
	processes via parent pointers and locking each process would
	be prohibitive as well as a pain to guarantee that the
	condition you are checking remains valid for both the check
	and the actions taken as a result of the check.</para>
    </sect2>

    <sect2>
      <title>Locking Conditions and Results</title>

      <para>If you need a lock to check the state of a variable so
	that you can take an action based on the state you read, you
	can not just hold the lock while reading the variable and then
	drop the lock before you act on the value you read.  Once you
	drop the lock, the variable can change rendering your decision
	invalid. Thus, you must hold the lock both while reading the
	variable and while performing the action as a result of the
	test.</para>
    </sect2>
  </sect1>

  <sect1 id="smp-design">
    <title>General Architecture and Design</title>

    <sect2>
      <title>Interrupt Handling</title>

    <indexterm><primary>interrupt handling</primary></indexterm>

      <para>Following the pattern of several other multi-threaded &unix;
	kernels, FreeBSD deals with interrupt handlers by giving them
	their own thread context.  Providing a context for interrupt
	handlers allows them to block on locks.  To help avoid
	latency, however, interrupt threads run at real-time kernel
	priority. Thus, interrupt handlers should not execute for very
	long to avoid starving other kernel threads.  In addition,
	since multiple handlers may share an interrupt thread,
	interrupt handlers should not sleep or use a sleepable lock to
	avoid starving another interrupt handler.</para>

    <indexterm><primary>interrupt threads</primary></indexterm>

      <para>The interrupt threads currently in FreeBSD are referred to
	as heavyweight interrupt threads.  They are called this
	because switching to an interrupt thread involves a full
	context switch. In the initial implementation, the kernel was
	not preemptive and thus interrupts that interrupted a kernel
	thread would have to wait until the kernel thread blocked or
	returned to userland before they would have an opportunity to
	run.</para>

    <indexterm><primary>latency</primary></indexterm>
    <indexterm><primary>preemption</primary></indexterm>

      <para>To deal with the latency problems, the kernel in FreeBSD
	has been made preemptive.  Currently, we only preempt a kernel
	thread when we release a sleep mutex or when an interrupt
	comes in.  However, the plan is to make the FreeBSD kernel
	fully preemptive as described below.</para>

      <para>Not all interrupt handlers execute in a thread context.
	Instead, some handlers execute directly in primary interrupt
	context.  These interrupt handlers are currently misnamed
	<quote>fast</quote> interrupt handlers since the
	<constant>INTR_FAST</constant> flag used in earlier versions
	of the kernel is used to mark these handlers.  The only
	interrupts which currently use these types of interrupt
	handlers are clock interrupts and serial I/O device
	interrupts.  Since these handlers do not have their own
	context, they may not acquire blocking locks and thus may only
	use spin mutexes.</para>

    <indexterm><primary>context switches</primary></indexterm>

      <para>Finally, there is one optional optimization that can be
	added in MD code called lightweight context switches.  Since
	an interrupt thread executes in a kernel context, it can
	borrow the vmspace of any process.  Thus, in a lightweight
	context switch, the switch to the interrupt thread does not
	switch vmspaces but borrows the vmspace of the interrupted
	thread.  In order to ensure that the vmspace of the
	interrupted thread does not disappear out from under us, the
	interrupted thread is not allowed to execute until the
	interrupt thread is no longer borrowing its vmspace.  This can
	happen when the interrupt thread either blocks or finishes.
	If an interrupt thread blocks, then it will use its own
	context when it is made runnable again.  Thus, it can release
	the interrupted thread.</para>

      <para>The cons of this optimization are that it is very
	machine specific and complex and thus only worth the effort if
	there is a large performance improvement.  At this point it is
	probably too early to tell, and in fact, will probably hurt
	performance as almost all interrupt handlers will immediately
	block on Giant and require a thread fix-up when they block.
	Also, an alternative method of interrupt handling has been
	proposed by Mike Smith that works like so:</para>

      <orderedlist>
	<listitem>
	  <para>Each interrupt handler has two parts: a predicate
	    which runs in primary interrupt context and a handler
	    which runs in its own thread context.</para>
	</listitem>

	<listitem>
	  <para>If an interrupt handler has a predicate, then when an
	    interrupt is triggered, the predicate is run.  If the
	    predicate returns true then the interrupt is assumed to be
	    fully handled and the kernel returns from the interrupt.
	    If the predicate returns false or there is no predicate,
	    then the threaded handler is scheduled to run.</para>
	</listitem>
      </orderedlist>

      <para>Fitting lightweight context switches into this scheme
	might prove rather complicated.  Since we may want to change
	to this scheme at some point in the future, it is probably
	best to defer work on lightweight context switches until we
	have settled on the final interrupt handling architecture and
	determined how lightweight context switches might or might
	not fit into it.</para>
    </sect2>

    <sect2>
      <title>Kernel Preemption and Critical Sections</title>

      <sect3>
	<title>Kernel Preemption in a Nutshell</title>

	<para>Kernel preemption is fairly simple.  The basic idea is
	  that a CPU should always be doing the highest priority work
	  available.  Well, that is the ideal at least.  There are a
	  couple of cases where the expense of achieving the ideal is
	  not worth being perfect.</para>

	<para>Implementing full kernel preemption is very
	  straightforward: when you schedule a thread to be executed
	  by putting it on a run queue, you check to see if its
	  priority is higher than the currently executing thread.  If
	  so, you initiate a context switch to that thread.</para>

	<para>While locks can protect most data in the case of a
	  preemption, not all of the kernel is preemption safe.  For
	  example, if a thread holding a spin mutex is preempted and the
	  new thread attempts to grab the same spin mutex, the new
	  thread may spin forever as the interrupted thread may never
	  get a chance to execute.  Also, some code such as the code
	  to assign an address space number for a process during
	  <function>exec</function> on the Alpha needs to not be
	  preempted as it supports the actual context switch code.
	  Preemption is disabled for these code sections by using a
	  critical section.</para>
      </sect3>

      <sect3>
	<title>Critical Sections</title>

        <indexterm><primary>critical sections</primary></indexterm>

	<para>The responsibility of the critical section API is to
	  prevent context switches inside of a critical section.  With
	  a fully preemptive kernel, every
	  <function>setrunqueue</function> of a thread other than the
	  current thread is a preemption point.  One implementation is
	  for <function>critical_enter</function> to set a per-thread
	  flag that is cleared by its counterpart.  If
	  <function>setrunqueue</function> is called with this flag
	  set, it does not preempt regardless of the priority of the new
	  thread relative to the current thread.  However, since
	  critical sections are used in spin mutexes to prevent
	  context switches and multiple spin mutexes can be acquired,
	  the critical section API must support nesting.  For this
	  reason the current implementation uses a nesting count
	  instead of a single per-thread flag.</para>

	<para>In order to minimize latency, preemptions inside of a
	  critical section are deferred rather than dropped.  If a
	  thread that would normally preempt the current thread is made
	  runnable while the current thread is in a critical section,
	  then a per-thread flag is set
	  to indicate that there is a pending preemption.  When the
	  outermost critical section is exited, the flag is checked.
	  If the flag is set, then the current thread is preempted to
	  allow the higher priority thread to run.</para>

        <indexterm><primary>spin mutexes</primary></indexterm>
        <indexterm><primary>mutexes</primary><secondary>spin</secondary></indexterm>
	<para>Interrupts pose a problem with regards to spin mutexes.
	  If a low-level interrupt handler needs a lock, it needs to
	  not interrupt any code needing that lock to avoid possible
	  data structure corruption.  Currently, providing this
	  mechanism is piggybacked onto the critical section API by means
	  of the <function>cpu_critical_enter</function> and
	  <function>cpu_critical_exit</function> functions.  Currently
	  this API disables and re-enables interrupts on all of
	  FreeBSD's current platforms.  This approach may not be
	  purely optimal, but it is simple to understand and simple to
	  get right.  Theoretically, this second API need only be used
	  for spin mutexes that are used in primary interrupt context.
	  However, to make the code simpler, it is used for all spin
	  mutexes and even all critical sections.  It may be desirable
	  to split out the MD API from the MI API and only use it in
	  conjunction with the MI API in the spin mutex
	  implementation.  If this approach is taken, then the MD API
	  likely would need a rename to show that it is a separate
	  API.</para>
      </sect3>

      <sect3>
	<title>Design Tradeoffs</title>

	<para>As mentioned earlier, a couple of trade-offs have been
	  made to sacrifice cases where perfect preemption may not
	  always provide the best performance.</para>

	<para>The first trade-off is that the preemption code does not
	  take other CPUs into account.  Suppose we have two CPUs, A
	  and B, with the priority of A's thread as 4 and the priority
	  of B's thread as 2.  If CPU B makes a thread with priority 1
	  runnable, then in theory, we want CPU A to switch to the new
	  thread so that we will be running the two highest priority
	  runnable threads.  However, the cost of determining which
	  CPU to enforce a preemption on as well as actually signaling
	  that CPU via an IPI along with the synchronization that
	  would be required would be enormous.  Thus, the current code
	  would instead force CPU B to switch to the higher priority
	  thread. Note that this still puts the system in a better
	  position as CPU B is executing a thread of priority 1 rather
	  than a thread of priority 2.</para>

	<para>The second trade-off limits immediate kernel preemption
	  to real-time priority kernel threads.  In the simple case of
	  preemption defined above, a thread is always preempted
	  immediately (or as soon as a critical section is exited) if
	  a higher priority thread is made runnable.  However, many
	  threads executing in the kernel only execute in a kernel
	  context for a short time before either blocking or returning
	  to userland.  Thus, if the kernel preempts these threads to
	  run another non-realtime kernel thread, the kernel may
	  switch out the executing thread just before it is about to
	  sleep or execute.  The cache on the CPU must then adjust to
	  the new thread.  When the kernel returns to the preempted
	  thread, it must refill all the cache information that was lost.
	  In addition, two extra context switches are performed that
	  could be avoided if the kernel deferred the preemption until
	  the first thread blocked or returned to userland.  Thus, by
	  default, the preemption code will only preempt immediately
	  if the higher priority thread is a real-time priority
	  thread.</para>

	<para>Turning on full kernel preemption for all kernel threads
	  has value as a debugging aid since it exposes more race
	  conditions.  It is especially useful on UP systems where many
	  races are hard to simulate otherwise.  Thus, there is a
	  kernel option <literal>FULL_PREEMPTION</literal> to enable
	  preemption for all kernel threads that can be used for
	  debugging purposes.</para>
      </sect3>
    </sect2>

    <sect2>
      <title>Thread Migration</title>

      <indexterm><primary>thread migration</primary></indexterm>

      <para>Simply put, a thread migrates when it moves from one CPU
	to another.  In a non-preemptive kernel this can only happen
	at well-defined points such as when calling
	<function>msleep</function> or returning to userland.
	However, in the preemptive kernel, an interrupt can force a
	preemption and possible migration at any time.  This can have
	negative effects on per-CPU data since with the exception of
	<varname>curthread</varname> and <varname>curpcb</varname> the
	data can change whenever you migrate.  Since you can
	potentially migrate at any time this renders unprotected
	per-CPU data access rather useless. Thus it is desirable to be
	able to disable migration for sections of code that need
	per-CPU data to be stable.</para>

      <indexterm><primary>critical sections</primary></indexterm>

      <para>Critical sections currently prevent migration since they
	do not allow context switches.  However, this may be too
	strong of a requirement to enforce in some cases since a
	critical section also effectively blocks interrupt threads on
	the current processor.  As a result, another API has been
	provided to allow the current thread to indicate that if it
	is preempted it should not migrate to another CPU.</para>

      <para>This API is known as thread pinning and is provided by the
	scheduler.  The API consists of two functions:
	<function>sched_pin</function> and
	<function>sched_unpin</function>.  These functions manage a
	per-thread nesting count <varname>td_pinned</varname>.  A
	thread is pinned when its nesting count is greater than zero
	and a thread starts off unpinned with a nesting count of zero.
	Each scheduler implementation is required to ensure that
	pinned threads are only executed on the CPU that they were
	executing on when the <function>sched_pin</function> was first
	called.  Since the nesting count is only written to by the
	thread itself and is only read by other threads when the
	pinned thread is not executing but while
	<varname>sched_lock</varname> is held, then
	<varname>td_pinned</varname> does not need any locking.  The
	<function>sched_pin</function> function increments the nesting
	count and <function>sched_unpin</function> decrements the
	nesting count.  Note that these functions only operate on the
	current thread and bind the current thread to the CPU it is
	executing on at the time.  To bind an arbitrary thread to a
	specific CPU, the <function>sched_bind</function> and
	<function>sched_unbind</function> functions should be used
	instead.</para>
    </sect2>

    <sect2>
      <title>Callouts</title>

      <para>The <function>timeout</function> kernel facility permits
	kernel services to register functions for execution as part
	of the <function>softclock</function> software interrupt.
	Events are scheduled based on a desired number of clock
	ticks, and callbacks to the consumer-provided function
	will occur at approximately the right time.</para>

      <para>The global list of pending timeout events is protected
	by a global spin mutex, <varname>callout_lock</varname>;
	all access to the timeout list must be performed with this
	mutex held.  When <function>softclock</function> is
	woken up, it scans the list of pending timeouts for those
	that should fire.  In order to avoid lock order reversal,
	the <function>softclock</function> thread will release the
	<varname>callout_lock</varname> mutex when invoking the
	provided <function>timeout</function> callback function.
	If the <constant>CALLOUT_MPSAFE</constant> flag was not set
	during registration, then Giant will be grabbed before
	invoking the callout, and then released afterwards.  The
	<varname>callout_lock</varname> mutex will be re-grabbed
	before proceeding.  The <function>softclock</function>
	code is careful to leave the list in a consistent state
	while releasing the mutex.  If <constant>DIAGNOSTIC</constant>
	is enabled, then the time taken to execute each function is
	measured, and a warning is generated if it exceeds a
	threshold.</para>
    </sect2>
  </sect1>

  <sect1 id="smp-lock-strategies">
    <title>Specific Locking Strategies</title>

    <sect2>
      <title>Credentials</title>

      <indexterm><primary>credentials</primary></indexterm>

      <para><structname>struct ucred</structname> is the kernel's
	internal credential structure, and is generally used as the
	basis for process-driven access control within the kernel.
	BSD-derived systems use a <quote>copy-on-write</quote> model
	for credential data: multiple references may exist for a
	credential structure, and when a change needs to be made, the
	structure is duplicated, modified, and then the reference
	replaced.  Due to wide-spread caching of the credential to
	implement access control on open, this results in substantial
	memory savings.  With a move to fine-grained SMP, this model
	also saves substantially on locking operations by requiring
	that modification only occur on an unshared credential,
	avoiding the need for explicit synchronization when consuming
	a known-shared credential.</para>

      <para>Credential structures with a single reference are
	considered mutable; shared credential structures must not be
	modified or a race condition is risked.  A mutex,
	<structfield>cr_mtxp</structfield>, protects the reference
	count of <structname>struct ucred</structname> so as to
	maintain consistency.  Any use of the structure requires a
	valid reference for the duration of the use, or the structure
	may be released out from under the illegitimate
	consumer.</para>

      <para>The <structname>struct ucred</structname> mutex is a leaf
	mutex and is implemented via a mutex pool for performance
	reasons.</para>

      <para>Usually, credentials are used in a read-only manner for access
	control decisions, and in this case
	<structfield>td_ucred</structfield> is generally preferred
	because it requires no locking.  When a process' credential is
	updated the <literal>proc</literal> lock must be held across
	the check and update operations, thus avoiding races.  The process
	credential <structfield>p_ucred</structfield> must be used for
	check and update operations to prevent time-of-check,
	time-of-use races.</para>

      <para>If system call invocations will perform access control after
	an update to the process credential, the value of
	<structfield>td_ucred</structfield> must also be refreshed to
	the current process value.  This will prevent use of a stale
	credential following a change.  The kernel automatically
	refreshes the <structfield>td_ucred</structfield> pointer in
	the thread structure from the process
	<structfield>p_ucred</structfield> whenever a process enters
	the kernel, permitting use of a fresh credential for kernel
	access control.</para>
    </sect2>

    <sect2>
      <title>File Descriptors and File Descriptor Tables</title>

      <para>Details to follow.</para>
    </sect2>

    <sect2>
      <title>Jail Structures</title>

      <indexterm><primary>Jail</primary></indexterm>

      <para><structname>struct prison</structname> stores
	administrative details pertinent to the maintenance of jails
	created using the &man.jail.2; API.  This includes the
	per-jail hostname, IP address, and related settings.  This
	structure is reference-counted since pointers to instances of
	the structure are shared by many credential structures.  A
	single mutex, <structfield>pr_mtx</structfield>, protects read
	and write access to the reference count and all mutable
	variables inside the <structname>struct prison</structname>.
	Some variables are set only
	when the jail is created, and a valid reference to the
	<structname>struct prison</structname> is sufficient to read
	these values.  The precise locking of each entry is documented
	via comments in <filename>sys/jail.h</filename>.</para>
    </sect2>

    <sect2>
      <title>MAC Framework</title>

      <indexterm><primary>MAC</primary></indexterm>

      <para>The TrustedBSD MAC Framework maintains data in a variety
	of kernel objects, in the form of <structname>struct
	label</structname>.  In general, labels in kernel objects
	are protected by the same lock as the remainder of the kernel
	object.  For example, the <structfield>v_label</structfield>
	label in <structname>struct vnode</structname> is protected
	by the vnode lock on the vnode.</para>

      <para>In addition to labels maintained in standard kernel objects,
	the MAC Framework also maintains a list of registered and
	active policies.  The policy list is protected by a global
	mutex (<varname>mac_policy_list_lock</varname>) and a busy
	count (also protected by the mutex).  Since many access
	control checks may occur in parallel, entry to the framework
	for a read-only access to the policy list requires holding the
	mutex while incrementing (and later decrementing) the busy
	count.  The mutex need not be held for the duration of the
	MAC entry operation--some operations, such as label operations
	on file system objects, are long-lived.  To modify the policy
	list, such as during policy registration and de-registration,
	the mutex must be held and the busy count must be zero,
	to prevent modification of the list while it is in use.</para>

      <para>A condition variable,
	<varname>mac_policy_list_not_busy</varname>, is available to
	threads that need to wait for the list to become unbusy, but
	this condition variable must only be waited on if the caller is
	holding no other locks, or a lock order violation may be
	possible.  The busy count, in effect, acts as a form of
	shared/exclusive lock over access to the framework: the difference
	is that, unlike with an sx lock, consumers waiting for the list
	to become unbusy may be starved, rather than permitting lock
	order problems with regard to the busy count and other locks
	that may be held on entry to (or inside) the MAC Framework.</para>
    </sect2>

    <sect2>
      <title>Modules</title>

      <indexterm><primary>kernel modules</primary></indexterm>

      <para>For the module subsystem there exists a single lock that is
	used to protect the shared data.  This lock is a shared/exclusive
	(SX) lock and has a good chance of needing to be acquired (shared
	or exclusive); therefore, a few macros have been
	added to make access to the lock easier.  These macros can be
	located in <filename>sys/module.h</filename> and are quite basic
	in terms of usage.  The main structures protected under this lock
	are the <structname>module_t</structname> structures (when shared)
	and the global <structname>modulelist_t</structname> structure,
	modules.  One should review the related source code in
	<filename>kern/kern_module.c</filename> to further understand the
	locking strategy.</para>
    </sect2>

    <sect2>
      <title>Newbus Device Tree</title>

      <indexterm><primary>Newbus</primary></indexterm>

      <para>The newbus system will have one sx lock.  Readers will
	hold a shared (read) lock (&man.sx.slock.9;) and writers will hold
	an exclusive (write) lock (&man.sx.xlock.9;).  Internal functions
	will not do locking at all.  Externally visible ones will lock as
	needed.
	Those items that do not matter if the race is won or lost will
	not be locked, since they tend to be read all over the place
	(e.g. &man.device.get.softc.9;).  There will be relatively few
	changes to the newbus data structures, so a single lock should
	be sufficient and not impose a performance penalty.</para>
    </sect2>

    <sect2>
      <title>Pipes</title>

      <para>...</para>
    </sect2>

    <sect2>
      <title>Processes and Threads</title>

      <para>- process hierarchy</para>
      <para>- proc locks, references</para>
      <para>- thread-specific copies of proc entries to freeze during system
	calls, including td_ucred</para>
      <para>- inter-process operations</para>
      <para>- process groups and sessions</para>
    </sect2>

    <sect2>
      <title>Scheduler</title>

      <indexterm><primary>scheduler</primary></indexterm>

      <para>Lots of references to <varname>sched_lock</varname> and notes
	pointing at specific primitives and related magic elsewhere in the
	document.</para>
    </sect2>

    <sect2>
      <title>Select and Poll</title>

      <para>The <function>select</function> and
	<function>poll</function> functions permit threads to block
	waiting on events on file descriptors--most frequently,
	whether or not the file descriptors are readable or
	writable.</para>

      <para>...</para>
    </sect2>

    <sect2>
      <title>SIGIO</title>

      <para>The SIGIO service permits processes to request the delivery
	of a SIGIO signal to their process group when the read/write
	status of specified file descriptors changes.  At most one
	process or process group is permitted to register for SIGIO
	from any given kernel object, and that process or group is
	referred to as the owner.  Each object supporting SIGIO
	registration contains a pointer field that is
	<constant>NULL</constant> if the object is not registered, or
	points to a <structname>struct sigio</structname> describing
	the registration.  This field is protected by a global mutex,
	<varname>sigio_lock</varname>.  Callers to SIGIO maintenance
	functions must pass in this field <quote>by reference</quote>
	so that local register copies of the field are not made when
	unprotected by the lock.</para>

      <para>One <structname>struct sigio</structname> is allocated for
	each registered object associated with any process or process
	group, and contains back-pointers to the object, owner, signal
	information, a credential, and the general disposition of the
	registration.  Each process or process group contains a list of
	registered <structname>struct sigio</structname> structures,
	<structfield>p_sigiolst</structfield> for processes, and
	<structfield>pg_sigiolst</structfield> for process groups.
	These lists are protected by the process or process group
	locks respectively.  Most fields in each <structname>struct
	sigio</structname> are constant for the duration of the
	registration, with the exception of the
	<structfield>sio_pgsigio</structfield> field which links the
	<structname>struct sigio</structname> into the process or
	process group list.  Developers implementing new kernel
	objects supporting SIGIO will, in general, want to avoid
	holding structure locks while invoking SIGIO supporting
	functions, such as <function>fsetown</function>
	or <function>funsetown</function> to avoid
	defining a lock order between structure locks and the global
	SIGIO lock.  This is generally possible through use of an
	elevated reference count on the structure, such as reliance
	on a file descriptor reference to a pipe during a pipe
	operation.</para>
    </sect2>

    <sect2>
      <title>Sysctl</title>

      <para>The <function>sysctl</function> MIB service is invoked
	from both within the kernel and from userland applications
	using a system call.  At least two issues are raised in
	locking: first, the protection of the structures maintaining
	the namespace, and second, interactions with kernel variables
	and functions that are accessed by the sysctl interface.
	Since sysctl permits the direct export (and modification) of
	kernel statistics and configuration parameters, the sysctl
	mechanism must become aware of appropriate locking semantics
	for those variables.  Currently, sysctl makes use of a single
	global sx lock to serialize use of
	<function>sysctl</function>; however, it is assumed to operate
	under Giant and other protections are not provided.  The
	remainder of this section speculates on locking and semantic
	changes to sysctl.</para>

      <para>- Need to change the order of operations for sysctl's that
	update values from read old, copyin and copyout, write new to
	copyin, lock, read old and write new, unlock, copyout.  Normal
	sysctl's that just copyout the old value and set a new value
	that they copyin may still be able to follow the old model.
	However, it may be cleaner to use the second model for all of
	the sysctl handlers to avoid lock operations.</para>

      <para>- To allow for the common case, a sysctl could embed a
	pointer to a mutex in the SYSCTL_FOO macros and in the struct.
	This would work for most sysctl's.  For values protected by sx
	locks, spin mutexes, or other locking strategies besides a
	single sleep mutex, SYSCTL_PROC nodes could be used to get the
	locking right.</para>
    </sect2>

    <sect2>
      <title>Taskqueue</title>

      <para>The taskqueue's interface has two basic locks associated
	with it in order to protect the related shared data.  The
	<varname>taskqueue_queues_mutex</varname> is meant to serve as a
	lock to protect the <varname>taskqueue_queues</varname> TAILQ.
	The other mutex lock associated with this system is the one in the
	<structname>struct taskqueue</structname> data structure.  The
	use of the synchronization primitive here is to protect the
	integrity of the data in the <structname>struct
	taskqueue</structname>.  It should be noted that there are no
	separate macros to assist the user in locking down his/her own work
	since these locks are most likely not going to be used outside of
	<filename>kern/subr_taskqueue.c</filename>.</para>
    </sect2>
  </sect1>

  <sect1 id="smp-implementation-notes">
    <title>Implementation Notes</title>

    <sect2>
      <title>Sleep Queues</title>

      <para>A sleep queue is a structure that holds the list of
	threads asleep on a wait channel.  Each thread that is not
	asleep on a wait channel carries a sleep queue structure
	around with it.  When a thread blocks on a wait channel, it
	donates its sleep queue structure to that wait channel.  Sleep
	queues associated with a wait channel are stored in a hash
	table.</para>

      <para>The sleep queue hash table holds sleep queues for wait
	channels that have at least one blocked thread.  Each entry in
	the hash table is called a sleepqueue chain.  The chain
	contains a linked list of sleep queues and a spin mutex.  The
	spin mutex protects the list of sleep queues as well as the
	contents of the sleep queue structures on the list.  Only one
	sleep queue is associated with a given wait channel.  If
	multiple threads block on a wait channel, then the sleep queues
	associated with all but the first thread are stored on a list
	of free sleep queues in the master sleep queue.  When a thread
	is removed from the sleep queue it is given one of the sleep
	queue structures from the master queue's free list if it is
	not the only thread asleep on the queue.  The last thread is
	given the master sleep queue when it is resumed.  Since
	threads may be removed from the sleep queue in a different
	order than they are added, a thread may depart from a sleep
	queue with a different sleep queue structure than the one it
	arrived with.</para>

      <para>The <function>sleepq_lock</function> function locks the
	spin mutex of the sleep queue chain that maps to a specific
	wait channel.  The <function>sleepq_lookup</function> function
	looks in the hash table for the master sleep queue associated
	with a given wait channel.  If no master sleep queue is found,
	it returns <constant>NULL</constant>.  The
	<function>sleepq_release</function> function unlocks the spin
	mutex associated with a given wait channel.</para>

      <para>A thread is added to a sleep queue via the
	<function>sleepq_add</function> function.  This function accepts the
	wait channel, a pointer to the mutex that protects the wait
	channel, a wait message description string, and a mask of
	flags.  The sleep queue chain should be locked via
	<function>sleepq_lock</function> before this function is
	called.  If no mutex protects the wait channel (or it is
	protected by Giant), then the mutex pointer argument should be
	<constant>NULL</constant>.  The flags argument contains a type
	field that indicates the kind of sleep queue that the thread
	is being added to and a flag to indicate if the sleep is
	interruptible (<constant>SLEEPQ_INTERRUPTIBLE</constant>).
	Currently there are only two types of sleep queues:
	traditional sleep queues managed via the
	<function>msleep</function> and <function>wakeup</function>
	functions (<constant>SLEEPQ_MSLEEP</constant>) and condition
	variable sleep queues (<constant>SLEEPQ_CONDVAR</constant>).
	The sleep queue type and lock pointer argument are used solely
	for internal assertion checking.  Code that calls
	<function>sleepq_add</function> should explicitly unlock any
	interlock protecting the wait channel after the associated
	sleepqueue chain has been locked via
	<function>sleepq_lock</function> and before blocking on the
	sleep queue via one of the waiting functions.</para>

      <para>A timeout for a sleep is set by invoking
	<function>sleepq_set_timeout</function>.  The function accepts
	the wait channel and the timeout time as a relative tick count
	as its arguments.  If a sleep should be interrupted by
	arriving signals, the
	<function>sleepq_catch_signals</function> function should be
	called as well.  This function accepts the wait channel as its
	only parameter.  If there is already a signal pending for this
	thread, then <function>sleepq_catch_signals</function> will
	return a signal number; otherwise, it will return 0.</para>

      <para>Once a thread has been added to a sleep queue, it blocks
	using one of the <function>sleepq_wait</function> functions.
	There are four wait functions depending on whether or not the
	caller wishes to use a timeout or have the sleep aborted by
	caught signals or an interrupt from the userland thread
	scheduler.  The <function>sleepq_wait</function> function
	simply waits until the current thread is explicitly resumed by
	one of the wakeup functions.  The
	<function>sleepq_timedwait</function> function waits until
	either the thread is explicitly resumed or the timeout set by
	an earlier call to <function>sleepq_set_timeout</function>
	expires.  The <function>sleepq_wait_sig</function> function
	waits until either the thread is explicitly resumed or its
	sleep is aborted.  The
	<function>sleepq_timedwait_sig</function> function waits until
	either the thread is explicitly resumed, the timeout set by an
	earlier call to <function>sleepq_set_timeout</function>
	expires, or the thread's sleep is aborted.  All of the wait
	functions accept the wait channel as their first parameter.
	In addition, the <function>sleepq_timedwait_sig</function>
	function accepts a second boolean parameter to indicate if the
	earlier call to <function>sleepq_catch_signals</function>
	found a pending signal.</para>

      <para>If the thread is explicitly resumed or is aborted by a
	signal, then a value of zero is returned by the wait function
	to indicate a successful sleep.  If the thread is resumed by
	either a timeout or an interrupt from the userland thread
	scheduler then an appropriate errno value is returned instead.
	Note that since <function>sleepq_wait</function> can only
	return 0 it does not return anything and the caller should
	assume a successful sleep.  Also, if a thread's sleep times
	out and is aborted simultaneously then
	<function>sleepq_timedwait_sig</function> will return an error
	indicating that a timeout occurred.  If an error value of
	0 is returned and either <function>sleepq_wait_sig</function>
	or <function>sleepq_timedwait_sig</function> was used to
	block, then the function
	<function>sleepq_calc_signal_retval</function> should be
	called to check for any pending signals and calculate an
	appropriate return value if any are found.  The signal number
	returned by the earlier call to
	<function>sleepq_catch_signals</function> should be passed as
	the sole argument to
	<function>sleepq_calc_signal_retval</function>.</para>

      <para>Threads asleep on a wait channel are explicitly resumed by
	the <function>sleepq_broadcast</function> and
	<function>sleepq_signal</function> functions.  Both functions
	accept the wait channel from which to resume threads, a
	priority to raise resumed threads to, and a flags argument to
	indicate which type of sleep queue is being resumed.  The
	priority argument is treated as a minimum priority.  If a
	thread being resumed already has a higher priority
	(numerically lower) than the priority argument then its
	priority is not adjusted.  The flags argument is used for
	internal assertions to ensure that sleep queues are not being
	treated as the wrong type.  For example, the condition
	variable functions should not resume threads on a traditional
	sleep queue.  The <function>sleepq_broadcast</function>
	function resumes all threads that are blocked on the specified
	wait channel while <function>sleepq_signal</function> only
	resumes the highest priority thread blocked on the wait
	channel.  The sleep queue chain should first be locked via the
	<function>sleepq_lock</function> function before calling these
	functions.</para>

      <para>A sleeping thread may have its sleep interrupted by
	calling the <function>sleepq_abort</function> function.  This
	function must be called with <varname>sched_lock</varname>
	held and the thread must be queued on a sleep queue.  A thread
	may also be removed from a specific sleep queue via the
	<function>sleepq_remove</function> function.  This function
	accepts both a thread and a wait channel as arguments and
	only awakens the thread if it is on the sleep queue for the
	specified wait channel.  If the thread is not on a sleep queue
	or it is on a sleep queue for a different wait channel, then
	this function does nothing.</para>
    </sect2>

    <sect2>
      <title>Turnstiles</title>

      <indexterm><primary>turnstiles</primary></indexterm>

      <para>- Compare/contrast with sleep queues.</para>

      <para>- Lookup/wait/release.
        - Describe TDF_TSNOBLOCK race.</para>

      <para>- Priority propagation.</para>
    </sect2>

    <sect2>
      <title>Details of the Mutex Implementation</title>

      <para>- Should we require mutexes to be owned for mtx_destroy()
	since we can not safely assert that they are unowned by anyone
	else otherwise?</para>

      <sect3>
	<title>Spin Mutexes</title>

        <indexterm><primary>mutex</primary><secondary>spin</secondary></indexterm>

	<para>- Use a critical section...</para>
      </sect3>

      <sect3>
	<title>Sleep Mutexes</title>

        <indexterm><primary>mutex</primary><secondary>sleep</secondary></indexterm>
	<para>- Describe the races with contested mutexes</para>

	<para>- Why it is safe to read mtx_lock of a contested mutex
	  when holding the turnstile chain lock.</para>
      </sect3>
    </sect2>

    <sect2>
      <title>Witness</title>

      <indexterm><primary>witness</primary></indexterm>

      <para>- What does it do</para>

      <para>- How does it work</para>
    </sect2>
  </sect1>

  <sect1 id="smp-misc">
    <title>Miscellaneous Topics</title>

    <sect2>
      <title>Interrupt Source and ICU Abstractions</title>

      <para>- struct isrc</para>

      <para>- pic drivers</para>
    </sect2>

    <sect2>
      <title>Other Random Questions/Topics</title>

      <para>- Should we pass an interlock into
	<function>sema_wait</function>?</para>

      <para>- Should we have non-sleepable sx locks?</para>

      <para>- Add some info about proper use of reference counts.</para>
    </sect2>
  </sect1>

  <glossary id="smp-glossary">
    <title>Glossary</title>

    <glossentry id="smp-glossary-atomic">
      <glossterm>atomic</glossterm>
      <glossdef>
	<para>An operation is atomic if all of its effects are visible
	  to other CPUs together when the proper access protocol is
	  followed.  In the degenerate case, these are the atomic
	  instructions provided directly by machine architectures.  At a
	  higher level, if several members of a structure are protected by a
	  lock, then a set of operations is atomic if they are all
	  performed while holding the lock without releasing the lock
	  in between any of the operations.</para>

	<glossseealso>operation</glossseealso>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-block">
      <glossterm>block</glossterm>
      <glossdef>
	<para>A thread is blocked when it is waiting on a lock,
	  resource, or condition.  Unfortunately this term is a bit
	  overloaded as a result.</para>

	<glossseealso>sleep</glossseealso>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-critical-section">
      <glossterm>critical section</glossterm>
      <glossdef>
	<para>A section of code that is not allowed to be preempted.
	  A critical section is entered and exited using the
	  &man.critical.enter.9; API.</para>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-MD">
      <glossterm>MD</glossterm>
      <glossdef>
	<para>Machine dependent.</para>

	<glossseealso>MI</glossseealso>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-memory-operation">
      <glossterm>memory operation</glossterm>
      <glossdef>
	<para>A memory operation reads and/or writes to a memory
	  location.</para>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-MI">
      <glossterm>MI</glossterm>
      <glossdef>
	<para>Machine independent.</para>

	<glossseealso>MD</glossseealso>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-operation">
      <glossterm>operation</glossterm>
      <glosssee>memory operation</glosssee>
    </glossentry>

    <glossentry id="smp-glossary-primary-interrupt-context">
      <glossterm>primary interrupt context</glossterm>
      <glossdef>
	<para>Primary interrupt context refers to the code that runs
	  when an interrupt occurs.  This code can either run an
	  interrupt handler directly or schedule an asynchronous
	  interrupt thread to execute the interrupt handlers for a
	  given interrupt source.</para>
      </glossdef>
    </glossentry>

    <glossentry>
      <glossterm>realtime kernel thread</glossterm>
      <glossdef>
	<para>A high priority kernel thread.  Currently, the only
	  realtime priority kernel threads are interrupt threads.</para>

	<glossseealso>thread</glossseealso>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-sleep">
      <glossterm>sleep</glossterm>
      <glossdef>
	<para>A thread is asleep when it is blocked on a condition
	  variable or a sleep queue via <function>msleep</function> or
	  <function>tsleep</function>.</para>

	<glossseealso>block</glossseealso>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-sleepable-lock">
      <glossterm>sleepable lock</glossterm>
      <glossdef>
	<para>A sleepable lock is a lock that can be held by a thread
	  which is asleep.  Lockmgr locks and sx locks are currently
	  the only sleepable locks in FreeBSD.  Eventually, some sx
	  locks such as the allproc and proctree locks may become
	  non-sleepable locks.</para>

	<glossseealso>sleep</glossseealso>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-thread">
      <glossterm>thread</glossterm>
      <glossdef>
	<para>A kernel thread represented by a struct thread.  Threads own
	  locks and hold a single execution context.</para>
      </glossdef>
    </glossentry>

    <glossentry id="smp-glossary-wait-channel">
      <glossterm>wait channel</glossterm>
      <glossdef>
	<para>A kernel virtual address that threads may sleep on.</para>
      </glossdef>
    </glossentry>
  </glossary>
</chapter>