€•ù–      Œsphinx.addnodes”Œdocument”“”)”}”(Œ	rawsource”Œ ”Œchildren”]”(Œtranslations”ŒLanguagesNode”“”)”}”(hhh]”(h Œpending_xref”“”)”}”(hhh]”Œdocutils.nodes”ŒText”“”ŒChinese (Simplified)”…””}”Œparent”hsbaŒ
attributes”}”(Œids”]”Œclasses”]”Œnames”]”Œdupnames”]”Œbackrefs”]”Œ	refdomain”Œstd”Œreftype”Œdoc”Œ	reftarget”Œ7/translations/zh_CN/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ	classname”NŒrefexplicit”ˆuŒtagname”hhhubh)”}”(hhh]”hŒChinese (Traditional)”…””}”hh2sbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ7/translations/zh_TW/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒItalian”…””}”hhFsbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ7/translations/it_IT/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒJapanese”…””}”hhZsbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ7/translations/ja_JP/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒKorean”…””}”hhnsbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ7/translations/ko_KR/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒSpanish”…””}”hh‚sbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ7/translations/sp_SP/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubeh}”(h]”h ]”h"]”h$]”h&]”Œcurrent_language”ŒEnglish”uh1h
hhŒ	_document”hŒsource”NŒline”NubhŒcomment”“”)”}”(hŒ SPDX-License-Identifier: GPL-2.0”h]”hŒ SPDX-License-Identifier: GPL-2.0”…””}”hh£sbah}”(h]”h ]”h"]”h$]”h&]”Œ	xml:space”Œpreserve”uh1h¡hhhžhhŸŒQ/var/lib/git/docbuild/linux/Documentation/admin-guide/hw-vuln/core-scheduling.rst”h KubhŒsection”“”)”}”(hhh]”(hŒtitle”“”)”}”(hŒCore Scheduling”h]”hŒCore Scheduling”…””}”(hh»hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hh¶hžhhŸh³h KubhŒ	paragraph”“”)”}”(hXÓ  Core scheduling support allows userspace to define groups of tasks that can
share a core. These groups can be specified either for security usecases (one
group of tasks don't trust another), or for performance usecases (some
workloads may benefit from running on the same core as they don't need the same
hardware resources of the shared core, or may prefer different cores if they
do share hardware resource needs). This document only describes the security
usecase.”h]”hX×  Core scheduling support allows userspace to define groups of tasks that can
share a core. These groups can be specified either for security usecases (one
group of tasks donâ€™t trust another), or for performance usecases (some
workloads may benefit from running on the same core as they donâ€™t need the same
hardware resources of the shared core, or may prefer different cores if they
do share hardware resource needs). This document only describes the security
usecase.”…””}”(hhËhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h Khh¶hžhubhµ)”}”(hhh]”(hº)”}”(hŒSecurity usecase”h]”hŒSecurity usecase”…””}”(hhÜhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hhÙhžhhŸh³h KubhÊ)”}”(hX  A cross-HT attack involves the attacker and victim running on different Hyper
Threads of the same core. MDS and L1TF are examples of such attacks.  The only
full mitigation of cross-HT attacks is to disable Hyper Threading (HT). Core
scheduling is a scheduler feature that can mitigate some (not all) cross-HT
attacks. It allows HT to be turned on safely by ensuring that only tasks in a
user-designated trusted group can share a core. This increase in core sharing
can also improve performance, however it is not guaranteed that performance
will always improve, though that is seen to be the case with a number of real
world workloads. In theory, core scheduling aims to perform at least as good as
when Hyper Threading is disabled. In practice, this is mostly the case though
not always: as synchronizing scheduling decisions across 2 or more CPUs in a
core involves additional overhead - especially when the system is lightly
loaded. When ``total_threads <= N_CPUS/2``, the extra overhead may cause core
scheduling to perform more poorly compared to SMT-disabled, where N_CPUS is the
total number of CPUs. Please measure the performance of your workloads always.”h]”(hX®  A cross-HT attack involves the attacker and victim running on different Hyper
Threads of the same core. MDS and L1TF are examples of such attacks.  The only
full mitigation of cross-HT attacks is to disable Hyper Threading (HT). Core
scheduling is a scheduler feature that can mitigate some (not all) cross-HT
attacks. It allows HT to be turned on safely by ensuring that only tasks in a
user-designated trusted group can share a core. This increase in core sharing
can also improve performance, however it is not guaranteed that performance
will always improve, though that is seen to be the case with a number of real
world workloads. In theory, core scheduling aims to perform at least as good as
when Hyper Threading is disabled. In practice, this is mostly the case though
not always: as synchronizing scheduling decisions across 2 or more CPUs in a
core involves additional overhead - especially when the system is lightly
loaded. When ”…””}”(hhêhžhhŸNh NubhŒliteral”“”)”}”(hŒ``total_threads <= N_CPUS/2``”h]”hŒtotal_threads <= N_CPUS/2”…””}”(hhôhžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhhêubhŒÂ, the extra overhead may cause core
scheduling to perform more poorly compared to SMT-disabled, where N_CPUS is the
total number of CPUs. Please measure the performance of your workloads always.”…””}”(hhêhžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KhhÙhžhubeh}”(h]”Œsecurity-usecase”ah ]”h"]”Œsecurity usecase”ah$]”h&]”uh1h´hh¶hžhhŸh³h Kubhµ)”}”(hhh]”(hº)”}”(hŒUsage”h]”hŒUsage”…””}”(hj  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hj  hžhhŸh³h K!ubhÊ)”}”(hXz  Core scheduling support is enabled via the ``CONFIG_SCHED_CORE`` config option.
Using this feature, userspace defines groups of tasks that can be co-scheduled
on the same core. The core scheduler uses this information to make sure that
tasks that are not in the same group never run simultaneously on a core, while
doing its best to satisfy the system's scheduling requirements.”h]”(hŒ+Core scheduling support is enabled via the ”…””}”(hj%  hžhhŸNh Nubhó)”}”(hŒ``CONFIG_SCHED_CORE``”h]”hŒCONFIG_SCHED_CORE”…””}”(hj-  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj%  ubhX<   config option.
Using this feature, userspace defines groups of tasks that can be co-scheduled
on the same core. The core scheduler uses this information to make sure that
tasks that are not in the same group never run simultaneously on a core, while
doing its best to satisfy the systemâ€™s scheduling requirements.”…””}”(hj%  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K"hj  hžhubhÊ)”}”(hŒÕCore scheduling can be enabled via the ``PR_SCHED_CORE`` prctl interface.
This interface provides support for the creation of core scheduling groups, as
well as admission and removal of tasks from created groups::”h]”(hŒ'Core scheduling can be enabled via the ”…””}”(hjE  hžhhŸNh Nubhó)”}”(hŒ``PR_SCHED_CORE``”h]”hŒPR_SCHED_CORE”…””}”(hjM  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhjE  ubhŒœ prctl interface.
This interface provides support for the creation of core scheduling groups, as
well as admission and removal of tasks from created groups:”…””}”(hjE  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K(hj  hžhubhŒliteral_block”“”)”}”(hŒ†#include <sys/prctl.h>

int prctl(int option, unsigned long arg2, unsigned long arg3,
        unsigned long arg4, unsigned long arg5);”h]”hŒ†#include <sys/prctl.h>

int prctl(int option, unsigned long arg2, unsigned long arg3,
        unsigned long arg4, unsigned long arg5);”…””}”hjg  sbah}”(h]”h ]”h"]”h$]”h&]”h±h²uh1je  hŸh³h K,hj  hžhubhŒdefinition_list”“”)”}”(hhh]”(hŒdefinition_list_item”“”)”}”(hŒoption:
``PR_SCHED_CORE``
”h]”(hŒterm”“”)”}”(hŒoption:”h]”hŒoption:”…””}”(hj‚  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j€  hŸh³h K2hj|  ubhŒ
definition”“”)”}”(hhh]”hÊ)”}”(hŒ``PR_SCHED_CORE``”h]”hó)”}”(hj—  h]”hŒPR_SCHED_CORE”…””}”(hj™  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj•  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K2hj’  ubah}”(h]”h ]”h"]”h$]”h&]”uh1j  hj|  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1jz  hŸh³h K2hjw  ubj{  )”}”(hX<  arg2:
Command for operation, must be one off:

- ``PR_SCHED_CORE_GET`` -- get core_sched cookie of ``pid``.
- ``PR_SCHED_CORE_CREATE`` -- create a new unique cookie for ``pid``.
- ``PR_SCHED_CORE_SHARE_TO`` -- push core_sched cookie to ``pid``.
- ``PR_SCHED_CORE_SHARE_FROM`` -- pull core_sched cookie from ``pid``.
”h]”(j  )”}”(hŒarg2:”h]”hŒarg2:”…””}”(hj¼  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j€  hŸh³h K:hj¸  ubj‘  )”}”(hhh]”(hÊ)”}”(hŒ'Command for operation, must be one off:”h]”hŒ'Command for operation, must be one off:”…””}”(hjÍ  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K5hjÊ  ubhŒbullet_list”“”)”}”(hhh]”(hŒ	list_item”“”)”}”(hŒ:``PR_SCHED_CORE_GET`` -- get core_sched cookie of ``pid``.”h]”hÊ)”}”(hjä  h]”(hó)”}”(hŒ``PR_SCHED_CORE_GET``”h]”hŒPR_SCHED_CORE_GET”…””}”(hjé  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhjæ  ubhŒ -- get core_sched cookie of ”…””}”(hjæ  hžhhŸNh Nubhó)”}”(hŒ``pid``”h]”hŒpid”…””}”(hjû  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhjæ  ubhŒ.”…””}”(hjæ  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K7hjâ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jà  hjÝ  ubjá  )”}”(hŒC``PR_SCHED_CORE_CREATE`` -- create a new unique cookie for ``pid``.”h]”hÊ)”}”(hj  h]”(hó)”}”(hŒ``PR_SCHED_CORE_CREATE``”h]”hŒPR_SCHED_CORE_CREATE”…””}”(hj   hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj  ubhŒ# -- create a new unique cookie for ”…””}”(hj  hžhhŸNh Nubhó)”}”(hŒ``pid``”h]”hŒpid”…””}”(hj2  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj  ubhŒ.”…””}”(hj  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K8hj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jà  hjÝ  ubjá  )”}”(hŒ@``PR_SCHED_CORE_SHARE_TO`` -- push core_sched cookie to ``pid``.”h]”hÊ)”}”(hjR  h]”(hó)”}”(hŒ``PR_SCHED_CORE_SHARE_TO``”h]”hŒPR_SCHED_CORE_SHARE_TO”…””}”(hjW  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhjT  ubhŒ -- push core_sched cookie to ”…””}”(hjT  hžhhŸNh Nubhó)”}”(hŒ``pid``”h]”hŒpid”…””}”(hji  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhjT  ubhŒ.”…””}”(hjT  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K9hjP  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jà  hjÝ  ubjá  )”}”(hŒE``PR_SCHED_CORE_SHARE_FROM`` -- pull core_sched cookie from ``pid``.
”h]”hÊ)”}”(hŒD``PR_SCHED_CORE_SHARE_FROM`` -- pull core_sched cookie from ``pid``.”h]”(hó)”}”(hŒ``PR_SCHED_CORE_SHARE_FROM``”h]”hŒPR_SCHED_CORE_SHARE_FROM”…””}”(hj  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj‹  ubhŒ  -- pull core_sched cookie from ”…””}”(hj‹  hžhhŸNh Nubhó)”}”(hŒ``pid``”h]”hŒpid”…””}”(hj¡  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj‹  ubhŒ.”…””}”(hj‹  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K:hj‡  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jà  hjÝ  ubeh}”(h]”h ]”h"]”h$]”h&]”Œbullet”Œ-”uh1jÛ  hŸh³h K7hjÊ  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1j  hj¸  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1jz  hŸh³h K:hjw  hžhubj{  )”}”(hŒ;arg3:
``pid`` of the task for which the operation applies.
”h]”(j  )”}”(hŒarg3:”h]”hŒarg3:”…””}”(hj×  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j€  hŸh³h K=hjÓ  ubj‘  )”}”(hhh]”hÊ)”}”(hŒ4``pid`` of the task for which the operation applies.”h]”(hó)”}”(hŒ``pid``”h]”hŒpid”…””}”(hjì  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhjè  ubhŒ- of the task for which the operation applies.”…””}”(hjè  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K=hjå  ubah}”(h]”h ]”h"]”h$]”h&]”uh1j  hjÓ  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1jz  hŸh³h K=hjw  hžhubj{  )”}”(hX  arg4:
``pid_type`` for which the operation applies. It is one of
``PR_SCHED_CORE_SCOPE_``-prefixed macro constants.  For example, if arg4
is ``PR_SCHED_CORE_SCOPE_THREAD_GROUP``, then the operation of this command
will be performed for all tasks in the task group of ``pid``.
”h]”(j  )”}”(hŒarg4:”h]”hŒarg4:”…””}”(hj  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j€  hŸh³h KChj  ubj‘  )”}”(hhh]”hÊ)”}”(hX  ``pid_type`` for which the operation applies. It is one of
``PR_SCHED_CORE_SCOPE_``-prefixed macro constants.  For example, if arg4
is ``PR_SCHED_CORE_SCOPE_THREAD_GROUP``, then the operation of this command
will be performed for all tasks in the task group of ``pid``.”h]”(hó)”}”(hŒ``pid_type``”h]”hŒpid_type”…””}”(hj)  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj%  ubhŒ/ for which the operation applies. It is one of
”…””}”(hj%  hžhhŸNh Nubhó)”}”(hŒ``PR_SCHED_CORE_SCOPE_``”h]”hŒPR_SCHED_CORE_SCOPE_”…””}”(hj;  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj%  ubhŒ4-prefixed macro constants.  For example, if arg4
is ”…””}”(hj%  hžhhŸNh Nubhó)”}”(hŒ$``PR_SCHED_CORE_SCOPE_THREAD_GROUP``”h]”hŒ PR_SCHED_CORE_SCOPE_THREAD_GROUP”…””}”(hjM  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj%  ubhŒZ, then the operation of this command
will be performed for all tasks in the task group of ”…””}”(hj%  hžhhŸNh Nubhó)”}”(hŒ``pid``”h]”hŒpid”…””}”(hj_  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj%  ubhŒ.”…””}”(hj%  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K@hj"  ubah}”(h]”h ]”h"]”h$]”h&]”uh1j  hj  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1jz  hŸh³h KChjw  hžhubj{  )”}”(hŒ—arg5:
userspace pointer to an unsigned long long for storing the cookie returned
by ``PR_SCHED_CORE_GET`` command. Should be 0 for all other commands.
”h]”(j  )”}”(hŒarg5:”h]”hŒarg5:”…””}”(hj‡  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j€  hŸh³h KGhjƒ  ubj‘  )”}”(hhh]”hÊ)”}”(hŒuserspace pointer to an unsigned long long for storing the cookie returned
by ``PR_SCHED_CORE_GET`` command. Should be 0 for all other commands.”h]”(hŒNuserspace pointer to an unsigned long long for storing the cookie returned
by ”…””}”(hj˜  hžhhŸNh Nubhó)”}”(hŒ``PR_SCHED_CORE_GET``”h]”hŒPR_SCHED_CORE_GET”…””}”(hj   hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hòhj˜  ubhŒ- command. Should be 0 for all other commands.”…””}”(hj˜  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KFhj•  ubah}”(h]”h ]”h"]”h$]”h&]”uh1j  hjƒ  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1jz  hŸh³h KGhjw  hžhubeh}”(h]”h ]”h"]”h$]”h&]”uh1ju  hj  hžhhŸh³h NubhÊ)”}”(hŒ¨In order for a process to push a cookie to, or pull a cookie from a process, it
is required to have the ptrace access mode: `PTRACE_MODE_READ_REALCREDS` to the
process.”h]”(hŒ|In order for a process to push a cookie to, or pull a cookie from a process, it
is required to have the ptrace access mode: ”…””}”(hjÊ  hžhhŸNh NubhŒtitle_reference”“”)”}”(hŒ`PTRACE_MODE_READ_REALCREDS`”h]”hŒPTRACE_MODE_READ_REALCREDS”…””}”(hjÔ  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  hjÊ  ubhŒ to the
process.”…””}”(hjÊ  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KIhj  hžhubhµ)”}”(hhh]”(hº)”}”(hŒBuilding hierarchies of tasks”h]”hŒBuilding hierarchies of tasks”…””}”(hjï  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hjì  hžhhŸh³h KNubhÊ)”}”(hX;  The simplest way to build hierarchies of threads/processes which share a
cookie and thus a core is to rely on the fact that the core-sched cookie is
inherited across forks/clones and execs, thus setting a cookie for the
'initial' script/executable/daemon will place every spawned child in the
same core-sched group.”h]”hX?  The simplest way to build hierarchies of threads/processes which share a
cookie and thus a core is to rely on the fact that the core-sched cookie is
inherited across forks/clones and execs, thus setting a cookie for the
â€˜initialâ€™ script/executable/daemon will place every spawned child in the
same core-sched group.”…””}”(hjý  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KOhjì  hžhubeh}”(h]”Œbuilding-hierarchies-of-tasks”ah ]”h"]”Œbuilding hierarchies of tasks”ah$]”h&]”uh1h´hj  hžhhŸh³h KNubhµ)”}”(hhh]”(hº)”}”(hŒCookie Transferral”h]”hŒCookie Transferral”…””}”(hj  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hj  hžhhŸh³h KVubhÊ)”}”(hXh  Transferring a cookie between the current and other tasks is possible using
PR_SCHED_CORE_SHARE_FROM and PR_SCHED_CORE_SHARE_TO to inherit a cookie from a
specified task or a share a cookie with a task. In combination this allows a
simple helper program to pull a cookie from a task in an existing core
scheduling group and share it with already running tasks.”h]”hXh  Transferring a cookie between the current and other tasks is possible using
PR_SCHED_CORE_SHARE_FROM and PR_SCHED_CORE_SHARE_TO to inherit a cookie from a
specified task or a share a cookie with a task. In combination this allows a
simple helper program to pull a cookie from a task in an existing core
scheduling group and share it with already running tasks.”…””}”(hj$  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KWhj  hžhubeh}”(h]”Œcookie-transferral”ah ]”h"]”Œcookie transferral”ah$]”h&]”uh1h´hj  hžhhŸh³h KVubeh}”(h]”Œusage”ah ]”h"]”Œusage”ah$]”h&]”uh1h´hh¶hžhhŸh³h K!Œ
referenced”Kubhµ)”}”(hhh]”(hº)”}”(hŒDesign/Implementation”h]”hŒDesign/Implementation”…””}”(hjF  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hjC  hžhhŸh³h K^ubhÊ)”}”(hŒ´Each task that is tagged is assigned a cookie internally in the kernel. As
mentioned in `Usage`_, tasks with the same cookie value are assumed to trust
each other and share a core.”h]”(hŒXEach task that is tagged is assigned a cookie internally in the kernel. As
mentioned in ”…””}”(hjT  hžhhŸNh NubhŒ	reference”“”)”}”(hŒ`Usage`_”h]”hŒUsage”…””}”(hj^  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”Œname”ŒUsage”Œrefid”j<  uh1j\  hjT  Œresolved”KubhŒT, tasks with the same cookie value are assumed to trust
each other and share a core.”…””}”(hjT  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K_hjC  hžhubhÊ)”}”(hX?  The basic idea is that, every schedule event tries to select tasks for all the
siblings of a core such that all the selected tasks running on a core are
trusted (same cookie) at any point in time. Kernel threads are assumed trusted.
The idle task is considered special, as it trusts everything and everything
trusts it.”h]”hX?  The basic idea is that, every schedule event tries to select tasks for all the
siblings of a core such that all the selected tasks running on a core are
trusted (same cookie) at any point in time. Kernel threads are assumed trusted.
The idle task is considered special, as it trusts everything and everything
trusts it.”…””}”(hjz  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KchjC  hžhubhÊ)”}”(hXÁ  During a schedule() event on any sibling of a core, the highest priority task on
the sibling's core is picked and assigned to the sibling calling schedule(), if
the sibling has the task enqueued. For rest of the siblings in the core,
highest priority task with the same cookie is selected if there is one runnable
in their individual run queues. If a task with same cookie is not available,
the idle task is selected.  Idle task is globally trusted.”h]”hXÃ  During a schedule() event on any sibling of a core, the highest priority task on
the siblingâ€™s core is picked and assigned to the sibling calling schedule(), if
the sibling has the task enqueued. For rest of the siblings in the core,
highest priority task with the same cookie is selected if there is one runnable
in their individual run queues. If a task with same cookie is not available,
the idle task is selected.  Idle task is globally trusted.”…””}”(hjˆ  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KihjC  hžhubhÊ)”}”(hXª  Once a task has been selected for all the siblings in the core, an IPI is sent to
siblings for whom a new task was selected. Siblings on receiving the IPI will
switch to the new task immediately. If an idle task is selected for a sibling,
then the sibling is considered to be in a `forced idle` state. I.e., it may
have tasks on its on runqueue to run, however it will still have to run idle.
More on this in the next section.”h]”(hX  Once a task has been selected for all the siblings in the core, an IPI is sent to
siblings for whom a new task was selected. Siblings on receiving the IPI will
switch to the new task immediately. If an idle task is selected for a sibling,
then the sibling is considered to be in a ”…””}”(hj–  hžhhŸNh NubjÓ  )”}”(hŒ`forced idle`”h]”hŒforced idle”…””}”(hjž  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  hj–  ubhŒ„ state. I.e., it may
have tasks on its on runqueue to run, however it will still have to run idle.
More on this in the next section.”…””}”(hj–  hžhhŸNh Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KphjC  hžhubhµ)”}”(hhh]”(hº)”}”(hŒForced-idling of hyperthreads”h]”hŒForced-idling of hyperthreads”…””}”(hj¹  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hj¶  hžhhŸh³h KxubhÊ)”}”(hXS  The scheduler tries its best to find tasks that trust each other such that all
tasks selected to be scheduled are of the highest priority in a core.  However,
it is possible that some runqueues had tasks that were incompatible with the
highest priority ones in the core. Favoring security over fairness, one or more
siblings could be forced to select a lower priority task if the highest
priority task is not trusted with respect to the core wide highest priority
task.  If a sibling does not have a trusted task to run, it will be forced idle
by the scheduler (idle thread is scheduled to run).”h]”hXS  The scheduler tries its best to find tasks that trust each other such that all
tasks selected to be scheduled are of the highest priority in a core.  However,
it is possible that some runqueues had tasks that were incompatible with the
highest priority ones in the core. Favoring security over fairness, one or more
siblings could be forced to select a lower priority task if the highest
priority task is not trusted with respect to the core wide highest priority
task.  If a sibling does not have a trusted task to run, it will be forced idle
by the scheduler (idle thread is scheduled to run).”…””}”(hjÇ  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h Kyhj¶  hžhubhÊ)”}”(hŒøWhen the highest priority task is selected to run, a reschedule-IPI is sent to
the sibling to force it into idle. This results in 4 cases which need to be
considered depending on whether a VM or a regular usermode process was running
on either HT::”h]”hŒ÷When the highest priority task is selected to run, a reschedule-IPI is sent to
the sibling to force it into idle. This results in 4 cases which need to be
considered depending on whether a VM or a regular usermode process was running
on either HT:”…””}”(hjÕ  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K‚hj¶  hžhubjf  )”}”(hŒé       HT1 (attack)            HT2 (victim)
A      idle -> user space      user space -> idle
B      idle -> user space      guest -> idle
C      idle -> guest           user space -> idle
D      idle -> guest           guest -> idle”h]”hŒé       HT1 (attack)            HT2 (victim)
A      idle -> user space      user space -> idle
B      idle -> user space      guest -> idle
C      idle -> guest           user space -> idle
D      idle -> guest           guest -> idle”…””}”hjã  sbah}”(h]”h ]”h"]”h$]”h&]”h±h²uh1je  hŸh³h K‡hj¶  hžhubhÊ)”}”(hXÔ  Note that for better performance, we do not wait for the destination CPU
(victim) to enter idle mode. This is because the sending of the IPI would bring
the destination CPU immediately into kernel mode from user space, or VMEXIT
in the case of guests. At best, this would only leak some scheduler metadata
which may not be worth protecting. It is also possible that the IPI is received
too late on some architectures, but this has not been observed in the case of
x86.”h]”hXÔ  Note that for better performance, we do not wait for the destination CPU
(victim) to enter idle mode. This is because the sending of the IPI would bring
the destination CPU immediately into kernel mode from user space, or VMEXIT
in the case of guests. At best, this would only leak some scheduler metadata
which may not be worth protecting. It is also possible that the IPI is received
too late on some architectures, but this has not been observed in the case of
x86.”…””}”(hjñ  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h Khj¶  hžhubeh}”(h]”Œforced-idling-of-hyperthreads”ah ]”h"]”Œforced-idling of hyperthreads”ah$]”h&]”uh1h´hjC  hžhhŸh³h Kxubhµ)”}”(hhh]”(hº)”}”(hŒTrust model”h]”hŒTrust model”…””}”(hj
  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hj  hžhhŸh³h K–ubhÊ)”}”(hX  Core scheduling maintains trust relationships amongst groups of tasks by
assigning them a tag that is the same cookie value.
When a system with core scheduling boots, all tasks are considered to trust
each other. This is because the core scheduler does not have information about
trust relationships until userspace uses the above mentioned interfaces, to
communicate them. In other words, all tasks have a default cookie value of 0.
and are considered system-wide trusted. The forced-idling of siblings running
cookie-0 tasks is also avoided.”h]”hX  Core scheduling maintains trust relationships amongst groups of tasks by
assigning them a tag that is the same cookie value.
When a system with core scheduling boots, all tasks are considered to trust
each other. This is because the core scheduler does not have information about
trust relationships until userspace uses the above mentioned interfaces, to
communicate them. In other words, all tasks have a default cookie value of 0.
and are considered system-wide trusted. The forced-idling of siblings running
cookie-0 tasks is also avoided.”…””}”(hj  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K—hj  hžhubhÊ)”}”(hŒÞOnce userspace uses the above mentioned interfaces to group sets of tasks, tasks
within such groups are considered to trust each other, but do not trust those
outside. Tasks outside the group also don't trust tasks within.”h]”hŒàOnce userspace uses the above mentioned interfaces to group sets of tasks, tasks
within such groups are considered to trust each other, but do not trust those
outside. Tasks outside the group also donâ€™t trust tasks within.”…””}”(hj&  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K hj  hžhubeh}”(h]”Œtrust-model”ah ]”h"]”Œtrust model”ah$]”h&]”uh1h´hjC  hžhhŸh³h K–ubeh}”(h]”Œdesign-implementation”ah ]”h"]”Œdesign/implementation”ah$]”h&]”uh1h´hh¶hžhhŸh³h K^ubhµ)”}”(hhh]”(hº)”}”(hŒLimitations of core-scheduling”h]”hŒLimitations of core-scheduling”…””}”(hjG  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hjD  hžhhŸh³h K¥ubhÊ)”}”(hŒ÷Core scheduling tries to guarantee that only trusted tasks run concurrently on a
core. But there could be small window of time during which untrusted tasks run
concurrently or kernel could be running concurrently with a task not trusted by
kernel.”h]”hŒ÷Core scheduling tries to guarantee that only trusted tasks run concurrently on a
core. But there could be small window of time during which untrusted tasks run
concurrently or kernel could be running concurrently with a task not trusted by
kernel.”…””}”(hjU  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K¦hjD  hžhubhµ)”}”(hhh]”(hº)”}”(hŒIPI processing delays”h]”hŒIPI processing delays”…””}”(hjf  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hjc  hžhhŸh³h K¬ubhÊ)”}”(hX  Core scheduling selects only trusted tasks to run together. IPI is used to notify
the siblings to switch to the new task. But there could be hardware delays in
receiving of the IPI on some arch (on x86, this has not been observed). This may
cause an attacker task to start running on a CPU before its siblings receive the
IPI. Even though cache is flushed on entry to user mode, victim tasks on siblings
may populate data in the cache and micro architectural buffers after the attacker
starts to run and this is a possibility for data leak.”h]”hX  Core scheduling selects only trusted tasks to run together. IPI is used to notify
the siblings to switch to the new task. But there could be hardware delays in
receiving of the IPI on some arch (on x86, this has not been observed). This may
cause an attacker task to start running on a CPU before its siblings receive the
IPI. Even though cache is flushed on entry to user mode, victim tasks on siblings
may populate data in the cache and micro architectural buffers after the attacker
starts to run and this is a possibility for data leak.”…””}”(hjt  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K­hjc  hžhubeh}”(h]”Œipi-processing-delays”ah ]”h"]”Œipi processing delays”ah$]”h&]”uh1h´hjD  hžhhŸh³h K¬ubeh}”(h]”Œlimitations-of-core-scheduling”ah ]”h"]”Œlimitations of core-scheduling”ah$]”h&]”uh1h´hh¶hžhhŸh³h K¥ubhµ)”}”(hhh]”(hº)”}”(hŒ8Open cross-HT issues that core scheduling does not solve”h]”hŒ8Open cross-HT issues that core scheduling does not solve”…””}”(hj•  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hj’  hžhhŸh³h K¶ubhµ)”}”(hhh]”(hº)”}”(hŒ
1. For MDS”h]”hŒ
1. For MDS”…””}”(hj¦  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hj£  hžhhŸh³h K¸ubhÊ)”}”(hX  Core scheduling cannot protect against MDS attacks between the siblings
running in user mode and the others running in kernel mode. Even though all
siblings run tasks which trust each other, when the kernel is executing
code on behalf of a task, it cannot trust the code running in the
sibling. Such attacks are possible for any combination of sibling CPU modes
(host or guest mode).”h]”hX  Core scheduling cannot protect against MDS attacks between the siblings
running in user mode and the others running in kernel mode. Even though all
siblings run tasks which trust each other, when the kernel is executing
code on behalf of a task, it cannot trust the code running in the
sibling. Such attacks are possible for any combination of sibling CPU modes
(host or guest mode).”…””}”(hj´  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h K¹hj£  hžhubeh}”(h]”Œfor-mds”ah ]”h"]”Œ
1. for mds”ah$]”h&]”uh1h´hj’  hžhhŸh³h K¸ubhµ)”}”(hhh]”(hº)”}”(hŒ2. For L1TF”h]”hŒ2. For L1TF”…””}”(hjÍ  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hjÊ  hžhhŸh³h KÁubhÊ)”}”(hX  Core scheduling cannot protect against an L1TF guest attacker exploiting a
guest or host victim. This is because the guest attacker can craft invalid
PTEs which are not inverted due to a vulnerable guest kernel. The only
solution is to disable EPT (Extended Page Tables).”h]”hX  Core scheduling cannot protect against an L1TF guest attacker exploiting a
guest or host victim. This is because the guest attacker can craft invalid
PTEs which are not inverted due to a vulnerable guest kernel. The only
solution is to disable EPT (Extended Page Tables).”…””}”(hjÛ  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KÂhjÊ  hžhubhÊ)”}”(hŒöFor both MDS and L1TF, if the guest vCPU is configured to not trust each
other (by tagging separately), then the guest to guest attacks would go away.
Or it could be a system admin policy which considers guest to guest attacks as
a guest problem.”h]”hŒöFor both MDS and L1TF, if the guest vCPU is configured to not trust each
other (by tagging separately), then the guest to guest attacks would go away.
Or it could be a system admin policy which considers guest to guest attacks as
a guest problem.”…””}”(hjé  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KÇhjÊ  hžhubhÊ)”}”(hX!  Another approach to resolve these would be to make every untrusted task on the
system to not trust every other untrusted task. While this could reduce
parallelism of the untrusted tasks, it would still solve the above issues while
allowing system processes (trusted tasks) to share a core.”h]”hX!  Another approach to resolve these would be to make every untrusted task on the
system to not trust every other untrusted task. While this could reduce
parallelism of the untrusted tasks, it would still solve the above issues while
allowing system processes (trusted tasks) to share a core.”…””}”(hj÷  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KÌhjÊ  hžhubeh}”(h]”Œfor-l1tf”ah ]”h"]”Œ2. for l1tf”ah$]”h&]”uh1h´hj’  hžhhŸh³h KÁubhµ)”}”(hhh]”(hº)”}”(hŒ/3. Protecting the kernel (IRQ, syscall, VMEXIT)”h]”hŒ/3. Protecting the kernel (IRQ, syscall, VMEXIT)”…””}”(hj  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hj  hžhhŸh³h KÒubhÊ)”}”(hXj  Unfortunately, core scheduling does not protect kernel contexts running on
sibling hyperthreads from one another. Prototypes of mitigations have been posted
to LKML to solve this, but it is debatable whether such windows are practically
exploitable, and whether the performance overhead of the prototypes are worth
it (not to mention, the added code complexity).”h]”hXj  Unfortunately, core scheduling does not protect kernel contexts running on
sibling hyperthreads from one another. Prototypes of mitigations have been posted
to LKML to solve this, but it is debatable whether such windows are practically
exploitable, and whether the performance overhead of the prototypes are worth
it (not to mention, the added code complexity).”…””}”(hj  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KÓhj  hžhubeh}”(h]”Œ(protecting-the-kernel-irq-syscall-vmexit”ah ]”h"]”Œ/3. protecting the kernel (irq, syscall, vmexit)”ah$]”h&]”uh1h´hj’  hžhhŸh³h KÒubeh}”(h]”Œ8open-cross-ht-issues-that-core-scheduling-does-not-solve”ah ]”h"]”Œ8open cross-ht issues that core scheduling does not solve”ah$]”h&]”uh1h´hh¶hžhhŸh³h K¶ubhµ)”}”(hhh]”(hº)”}”(hŒOther Use cases”h]”hŒOther Use cases”…””}”(hj?  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1h¹hj<  hžhhŸh³h KÚubhÊ)”}”(hŒžThe main use case for Core scheduling is mitigating the cross-HT vulnerabilities
with SMT enabled. There are other use cases where this feature could be used:”h]”hŒžThe main use case for Core scheduling is mitigating the cross-HT vulnerabilities
with SMT enabled. There are other use cases where this feature could be used:”…””}”(hjM  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KÛhj<  hžhubjÜ  )”}”(hhh]”(já  )”}”(hŒpIsolating tasks that needs a whole core: Examples include realtime tasks, tasks
that uses SIMD instructions etc.”h]”hÊ)”}”(hŒpIsolating tasks that needs a whole core: Examples include realtime tasks, tasks
that uses SIMD instructions etc.”h]”hŒpIsolating tasks that needs a whole core: Examples include realtime tasks, tasks
that uses SIMD instructions etc.”…””}”(hjb  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h KÞhj^  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jà  hj[  hžhhŸh³h Nubjá  )”}”(hŒ¢Gang scheduling: Requirements for a group of tasks that needs to be scheduled
together could also be realized using core scheduling. One example is vCPUs of
a VM.”h]”hÊ)”}”(hŒ¢Gang scheduling: Requirements for a group of tasks that needs to be scheduled
together could also be realized using core scheduling. One example is vCPUs of
a VM.”h]”hŒ¢Gang scheduling: Requirements for a group of tasks that needs to be scheduled
together could also be realized using core scheduling. One example is vCPUs of
a VM.”…””}”(hjz  hžhhŸNh Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÉhŸh³h Kàhjv  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jà  hj[  hžhhŸh³h Nubeh}”(h]”h ]”h"]”h$]”h&]”jÅ  jÆ  uh1jÛ  hŸh³h KÞhj<  hžhubeh}”(h]”Œother-use-cases”ah ]”h"]”Œother use cases”ah$]”h&]”uh1h´hh¶hžhhŸh³h KÚubeh}”(h]”Œcore-scheduling”ah ]”h"]”Œcore scheduling”ah$]”h&]”uh1h´hhhžhhŸh³h Kubeh}”(h]”h ]”h"]”h$]”h&]”Œsource”h³uh1hŒcurrent_source”NŒcurrent_line”NŒsettings”Œdocutils.frontend”ŒValues”“”)”}”(h¹NŒ	generator”NŒ	datestamp”NŒsource_link”NŒ
source_url”NŒtoc_backlinks”Œentry”Œfootnote_backlinks”KŒsectnum_xform”KŒstrip_comments”NŒstrip_elements_with_classes”NŒstrip_classes”NŒreport_level”KŒ
halt_level”KŒexit_status_level”KŒdebug”NŒwarning_stream”NŒ	traceback”ˆŒinput_encoding”Œ	utf-8-sig”Œinput_encoding_error_handler”Œstrict”Œoutput_encoding”Œutf-8”Œoutput_encoding_error_handler”jÇ  Œerror_encoding”Œutf-8”Œerror_encoding_error_handler”Œbackslashreplace”Œlanguage_code”Œen”Œrecord_dependencies”NŒconfig”NŒ	id_prefix”hŒauto_id_prefix”Œid”Œdump_settings”NŒdump_internals”NŒdump_transforms”NŒdump_pseudo_xml”NŒexpose_internals”NŒstrict_visitor”NŒ_disable_config”NŒ_source”h³Œ_destination”NŒ_config_files”]”Œ7/var/lib/git/docbuild/linux/Documentation/docutils.conf”aŒfile_insertion_enabled”ˆŒraw_enabled”KŒline_length_limit”M'Œpep_references”NŒpep_base_url”Œhttps://peps.python.org/”Œpep_file_url_template”Œpep-%04d”Œrfc_references”NŒrfc_base_url”Œ&https://datatracker.ietf.org/doc/html/”Œ	tab_width”KŒtrim_footnote_reference_space”‰Œsyntax_highlight”Œlong”Œsmart_quotes”ˆŒsmartquotes_locales”]”Œcharacter_level_inline_markup”‰Œdoctitle_xform”‰Œdocinfo_xform”KŒsectsubtitle_xform”‰Œimage_loading”Œlink”Œembed_stylesheet”‰Œcloak_email_addresses”ˆŒsection_self_link”‰Œenv”NubŒreporter”NŒindirect_targets”]”Œsubstitution_defs”}”Œsubstitution_names”}”Œrefnames”}”Œusage”]”j^  asŒrefids”}”Œnameids”}”(j¡  jž  j  j  j?  j<  j  j  j7  j4  jA  j>  j  j  j9  j6  j  jŒ  j‡  j„  j9  j6  jÇ  jÄ  j
  j  j1  j.  j™  j–  uŒ	nametypes”}”(j¡  ‰j  ‰j?  ‰j  ‰j7  ‰jA  ‰j  ‰j9  ‰j  ‰j‡  ‰j9  ‰jÇ  ‰j
  ‰j1  ‰j™  ‰uh}”(jž  h¶j  hÙj<  j  j  jì  j4  j  j>  jC  j  j¶  j6  j  jŒ  jD  j„  jc  j6  j’  jÄ  j£  j  jÊ  j.  j  j–  j<  uŒfootnote_refs”}”Œcitation_refs”}”Œautofootnotes”]”Œautofootnote_refs”]”Œsymbol_footnotes”]”Œsymbol_footnote_refs”]”Œ	footnotes”]”Œ	citations”]”Œautofootnote_start”KŒsymbol_footnote_start”K Œ
id_counter”Œcollections”ŒCounter”“”}”…”R”Œparse_messages”]”Œtransform_messages”]”Œtransformer”NŒinclude_log”]”Œ
decoration”Nhžhub.