€•H˜Œsphinx.addnodes”Œdocument”“”)”}”(Œ rawsource”Œ”Œchildren”]”(Œ translations”Œ LanguagesNode”“”)”}”(hhh]”(hŒ pending_xref”“”)”}”(hhh]”Œdocutils.nodes”ŒText”“”ŒChinese (Simplified)”…””}”Œparent”hsbaŒ attributes”}”(Œids”]”Œclasses”]”Œnames”]”Œdupnames”]”Œbackrefs”]”Œ refdomain”Œstd”Œreftype”Œdoc”Œ reftarget”Œ7/translations/zh_CN/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ classname”NŒ refexplicit”ˆuŒtagname”hhh ubh)”}”(hhh]”hŒChinese (Traditional)”…””}”hh2sbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ7/translations/zh_TW/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒItalian”…””}”hhFsbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ7/translations/it_IT/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒJapanese”…””}”hhZsbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ7/translations/ja_JP/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒKorean”…””}”hhnsbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ7/translations/ko_KR/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒPortuguese (Brazilian)”…””}”hh‚sbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ7/translations/pt_BR/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubh)”}”(hhh]”hŒSpanish”…””}”hh–sbah}”(h]”h ]”h"]”h$]”h&]”Œ refdomain”h)Œreftype”h+Œ reftarget”Œ7/translations/sp_SP/admin-guide/hw-vuln/core-scheduling”Œmodname”NŒ classname”NŒ refexplicit”ˆuh1hhh ubeh}”(h]”h ]”h"]”h$]”h&]”Œcurrent_language”ŒEnglish”uh1h hhŒ _document”hŒsource”NŒline”NubhŒcomment”“”)”}”(hŒ SPDX-License-Identifier: GPL-2.0”h]”hŒ SPDX-License-Identifier: GPL-2.0”…””}”hh·sbah}”(h]”h ]”h"]”h$]”h&]”Œ xml:space”Œpreserve”uh1hµhhh²hh³ŒQ/var/lib/git/docbuild/linux/Documentation/admin-guide/hw-vuln/core-scheduling.rst”h´KubhŒsection”“”)”}”(hhh]”(hŒtitle”“”)”}”(hŒCore Scheduling”h]”hŒCore Scheduling”…””}”(hhÏh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhhÊh²hh³hÇh´KubhŒ paragraph”“”)”}”(hXÓCore scheduling support allows userspace to define groups of tasks that can share a core. These groups can be specified either for security usecases (one group of tasks don't trust another), or for performance usecases (some workloads may benefit from running on the same core as they don't need the same hardware resources of the shared core, or may prefer different cores if they do share hardware resource needs). This document only describes the security usecase.”h]”hX×Core scheduling support allows userspace to define groups of tasks that can share a core. These groups can be specified either for security usecases (one group of tasks don’t trust another), or for performance usecases (some workloads may benefit from running on the same core as they don’t need the same hardware resources of the shared core, or may prefer different cores if they do share hardware resource needs). This document only describes the security usecase.”…””}”(hhßh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KhhÊh²hubhÉ)”}”(hhh]”(hÎ)”}”(hŒSecurity usecase”h]”hŒSecurity usecase”…””}”(hhðh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhhíh²hh³hÇh´KubhÞ)”}”(hXA cross-HT attack involves the attacker and victim running on different Hyper Threads of the same core. MDS and L1TF are examples of such attacks. The only full mitigation of cross-HT attacks is to disable Hyper Threading (HT). Core scheduling is a scheduler feature that can mitigate some (not all) cross-HT attacks. It allows HT to be turned on safely by ensuring that only tasks in a user-designated trusted group can share a core. This increase in core sharing can also improve performance, however it is not guaranteed that performance will always improve, though that is seen to be the case with a number of real world workloads. In theory, core scheduling aims to perform at least as good as when Hyper Threading is disabled. In practice, this is mostly the case though not always: as synchronizing scheduling decisions across 2 or more CPUs in a core involves additional overhead - especially when the system is lightly loaded. When ``total_threads <= N_CPUS/2``, the extra overhead may cause core scheduling to perform more poorly compared to SMT-disabled, where N_CPUS is the total number of CPUs. Please measure the performance of your workloads always.”h]”(hX®A cross-HT attack involves the attacker and victim running on different Hyper Threads of the same core. MDS and L1TF are examples of such attacks. The only full mitigation of cross-HT attacks is to disable Hyper Threading (HT). Core scheduling is a scheduler feature that can mitigate some (not all) cross-HT attacks. It allows HT to be turned on safely by ensuring that only tasks in a user-designated trusted group can share a core. This increase in core sharing can also improve performance, however it is not guaranteed that performance will always improve, though that is seen to be the case with a number of real world workloads. In theory, core scheduling aims to perform at least as good as when Hyper Threading is disabled. In practice, this is mostly the case though not always: as synchronizing scheduling decisions across 2 or more CPUs in a core involves additional overhead - especially when the system is lightly loaded. When ”…””}”(hhþh²hh³Nh´NubhŒliteral”“”)”}”(hŒ``total_threads <= N_CPUS/2``”h]”hŒtotal_threads <= N_CPUS/2”…””}”(hjh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhhþubhŒÂ, the extra overhead may cause core scheduling to perform more poorly compared to SMT-disabled, where N_CPUS is the total number of CPUs. Please measure the performance of your workloads always.”…””}”(hhþh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´Khhíh²hubeh}”(h]”Œsecurity-usecase”ah ]”h"]”Œsecurity usecase”ah$]”h&]”uh1hÈhhÊh²hh³hÇh´KubhÉ)”}”(hhh]”(hÎ)”}”(hŒUsage”h]”hŒUsage”…””}”(hj+h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhj(h²hh³hÇh´K!ubhÞ)”}”(hXzCore scheduling support is enabled via the ``CONFIG_SCHED_CORE`` config option. Using this feature, userspace defines groups of tasks that can be co-scheduled on the same core. The core scheduler uses this information to make sure that tasks that are not in the same group never run simultaneously on a core, while doing its best to satisfy the system's scheduling requirements.”h]”(hŒ+Core scheduling support is enabled via the ”…””}”(hj9h²hh³Nh´Nubj)”}”(hŒ``CONFIG_SCHED_CORE``”h]”hŒCONFIG_SCHED_CORE”…””}”(hjAh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj9ubhX< config option. Using this feature, userspace defines groups of tasks that can be co-scheduled on the same core. The core scheduler uses this information to make sure that tasks that are not in the same group never run simultaneously on a core, while doing its best to satisfy the system’s scheduling requirements.”…””}”(hj9h²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K"hj(h²hubhÞ)”}”(hŒÕCore scheduling can be enabled via the ``PR_SCHED_CORE`` prctl interface. This interface provides support for the creation of core scheduling groups, as well as admission and removal of tasks from created groups::”h]”(hŒ'Core scheduling can be enabled via the ”…””}”(hjYh²hh³Nh´Nubj)”}”(hŒ``PR_SCHED_CORE``”h]”hŒ PR_SCHED_CORE”…””}”(hjah²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhjYubhŒœ prctl interface. This interface provides support for the creation of core scheduling groups, as well as admission and removal of tasks from created groups:”…””}”(hjYh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K(hj(h²hubhŒ literal_block”“”)”}”(hŒ†#include int prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5);”h]”hŒ†#include int prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5);”…””}”hj{sbah}”(h]”h ]”h"]”h$]”h&]”hÅhÆuh1jyh³hÇh´K,hj(h²hubhŒdefinition_list”“”)”}”(hhh]”(hŒdefinition_list_item”“”)”}”(hŒoption: ``PR_SCHED_CORE`` ”h]”(hŒterm”“”)”}”(hŒoption:”h]”hŒoption:”…””}”(hj–h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j”h³hÇh´K2hjubhŒ definition”“”)”}”(hhh]”hÞ)”}”(hŒ``PR_SCHED_CORE``”h]”j)”}”(hj«h]”hŒ PR_SCHED_CORE”…””}”(hj­h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj©ubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K2hj¦ubah}”(h]”h ]”h"]”h$]”h&]”uh1j¤hjubeh}”(h]”h ]”h"]”h$]”h&]”uh1jŽh³hÇh´K2hj‹ubj)”}”(hX<arg2: Command for operation, must be one off: - ``PR_SCHED_CORE_GET`` -- get core_sched cookie of ``pid``. - ``PR_SCHED_CORE_CREATE`` -- create a new unique cookie for ``pid``. - ``PR_SCHED_CORE_SHARE_TO`` -- push core_sched cookie to ``pid``. - ``PR_SCHED_CORE_SHARE_FROM`` -- pull core_sched cookie from ``pid``. ”h]”(j•)”}”(hŒarg2:”h]”hŒarg2:”…””}”(hjÐh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j”h³hÇh´K:hjÌubj¥)”}”(hhh]”(hÞ)”}”(hŒ'Command for operation, must be one off:”h]”hŒ'Command for operation, must be one off:”…””}”(hjáh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K5hjÞubhŒ bullet_list”“”)”}”(hhh]”(hŒ list_item”“”)”}”(hŒ:``PR_SCHED_CORE_GET`` -- get core_sched cookie of ``pid``.”h]”hÞ)”}”(hjøh]”(j)”}”(hŒ``PR_SCHED_CORE_GET``”h]”hŒPR_SCHED_CORE_GET”…””}”(hjýh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhjúubhŒ -- get core_sched cookie of ”…””}”(hjúh²hh³Nh´Nubj)”}”(hŒ``pid``”h]”hŒpid”…””}”(hjh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhjúubhŒ.”…””}”(hjúh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K7hjöubah}”(h]”h ]”h"]”h$]”h&]”uh1jôhjñubjõ)”}”(hŒC``PR_SCHED_CORE_CREATE`` -- create a new unique cookie for ``pid``.”h]”hÞ)”}”(hj/h]”(j)”}”(hŒ``PR_SCHED_CORE_CREATE``”h]”hŒPR_SCHED_CORE_CREATE”…””}”(hj4h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj1ubhŒ# -- create a new unique cookie for ”…””}”(hj1h²hh³Nh´Nubj)”}”(hŒ``pid``”h]”hŒpid”…””}”(hjFh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj1ubhŒ.”…””}”(hj1h²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K8hj-ubah}”(h]”h ]”h"]”h$]”h&]”uh1jôhjñubjõ)”}”(hŒ@``PR_SCHED_CORE_SHARE_TO`` -- push core_sched cookie to ``pid``.”h]”hÞ)”}”(hjfh]”(j)”}”(hŒ``PR_SCHED_CORE_SHARE_TO``”h]”hŒPR_SCHED_CORE_SHARE_TO”…””}”(hjkh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhjhubhŒ -- push core_sched cookie to ”…””}”(hjhh²hh³Nh´Nubj)”}”(hŒ``pid``”h]”hŒpid”…””}”(hj}h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhjhubhŒ.”…””}”(hjhh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K9hjdubah}”(h]”h ]”h"]”h$]”h&]”uh1jôhjñubjõ)”}”(hŒE``PR_SCHED_CORE_SHARE_FROM`` -- pull core_sched cookie from ``pid``. ”h]”hÞ)”}”(hŒD``PR_SCHED_CORE_SHARE_FROM`` -- pull core_sched cookie from ``pid``.”h]”(j)”}”(hŒ``PR_SCHED_CORE_SHARE_FROM``”h]”hŒPR_SCHED_CORE_SHARE_FROM”…””}”(hj£h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhjŸubhŒ -- pull core_sched cookie from ”…””}”(hjŸh²hh³Nh´Nubj)”}”(hŒ``pid``”h]”hŒpid”…””}”(hjµh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhjŸubhŒ.”…””}”(hjŸh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K:hj›ubah}”(h]”h ]”h"]”h$]”h&]”uh1jôhjñubeh}”(h]”h ]”h"]”h$]”h&]”Œbullet”Œ-”uh1jïh³hÇh´K7hjÞubeh}”(h]”h ]”h"]”h$]”h&]”uh1j¤hjÌubeh}”(h]”h ]”h"]”h$]”h&]”uh1jŽh³hÇh´K:hj‹h²hubj)”}”(hŒ;arg3: ``pid`` of the task for which the operation applies. ”h]”(j•)”}”(hŒarg3:”h]”hŒarg3:”…””}”(hjëh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j”h³hÇh´K=hjçubj¥)”}”(hhh]”hÞ)”}”(hŒ4``pid`` of the task for which the operation applies.”h]”(j)”}”(hŒ``pid``”h]”hŒpid”…””}”(hjh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhjüubhŒ- of the task for which the operation applies.”…””}”(hjüh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K=hjùubah}”(h]”h ]”h"]”h$]”h&]”uh1j¤hjçubeh}”(h]”h ]”h"]”h$]”h&]”uh1jŽh³hÇh´K=hj‹h²hubj)”}”(hXarg4: ``pid_type`` for which the operation applies. It is one of ``PR_SCHED_CORE_SCOPE_``-prefixed macro constants. For example, if arg4 is ``PR_SCHED_CORE_SCOPE_THREAD_GROUP``, then the operation of this command will be performed for all tasks in the task group of ``pid``. ”h]”(j•)”}”(hŒarg4:”h]”hŒarg4:”…””}”(hj(h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j”h³hÇh´KChj$ubj¥)”}”(hhh]”hÞ)”}”(hX ``pid_type`` for which the operation applies. It is one of ``PR_SCHED_CORE_SCOPE_``-prefixed macro constants. For example, if arg4 is ``PR_SCHED_CORE_SCOPE_THREAD_GROUP``, then the operation of this command will be performed for all tasks in the task group of ``pid``.”h]”(j)”}”(hŒ ``pid_type``”h]”hŒpid_type”…””}”(hj=h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj9ubhŒ/ for which the operation applies. It is one of ”…””}”(hj9h²hh³Nh´Nubj)”}”(hŒ``PR_SCHED_CORE_SCOPE_``”h]”hŒPR_SCHED_CORE_SCOPE_”…””}”(hjOh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj9ubhŒ4-prefixed macro constants. For example, if arg4 is ”…””}”(hj9h²hh³Nh´Nubj)”}”(hŒ$``PR_SCHED_CORE_SCOPE_THREAD_GROUP``”h]”hŒ PR_SCHED_CORE_SCOPE_THREAD_GROUP”…””}”(hjah²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj9ubhŒZ, then the operation of this command will be performed for all tasks in the task group of ”…””}”(hj9h²hh³Nh´Nubj)”}”(hŒ``pid``”h]”hŒpid”…””}”(hjsh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj9ubhŒ.”…””}”(hj9h²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K@hj6ubah}”(h]”h ]”h"]”h$]”h&]”uh1j¤hj$ubeh}”(h]”h ]”h"]”h$]”h&]”uh1jŽh³hÇh´KChj‹h²hubj)”}”(hŒ—arg5: userspace pointer to an unsigned long long for storing the cookie returned by ``PR_SCHED_CORE_GET`` command. Should be 0 for all other commands. ”h]”(j•)”}”(hŒarg5:”h]”hŒarg5:”…””}”(hj›h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1j”h³hÇh´KGhj—ubj¥)”}”(hhh]”hÞ)”}”(hŒuserspace pointer to an unsigned long long for storing the cookie returned by ``PR_SCHED_CORE_GET`` command. Should be 0 for all other commands.”h]”(hŒNuserspace pointer to an unsigned long long for storing the cookie returned by ”…””}”(hj¬h²hh³Nh´Nubj)”}”(hŒ``PR_SCHED_CORE_GET``”h]”hŒPR_SCHED_CORE_GET”…””}”(hj´h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jhj¬ubhŒ- command. Should be 0 for all other commands.”…””}”(hj¬h²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KFhj©ubah}”(h]”h ]”h"]”h$]”h&]”uh1j¤hj—ubeh}”(h]”h ]”h"]”h$]”h&]”uh1jŽh³hÇh´KGhj‹h²hubeh}”(h]”h ]”h"]”h$]”h&]”uh1j‰hj(h²hh³hÇh´NubhÞ)”}”(hŒ¨In order for a process to push a cookie to, or pull a cookie from a process, it is required to have the ptrace access mode: `PTRACE_MODE_READ_REALCREDS` to the process.”h]”(hŒ|In order for a process to push a cookie to, or pull a cookie from a process, it is required to have the ptrace access mode: ”…””}”(hjÞh²hh³Nh´NubhŒtitle_reference”“”)”}”(hŒ`PTRACE_MODE_READ_REALCREDS`”h]”hŒPTRACE_MODE_READ_REALCREDS”…””}”(hjèh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jæhjÞubhŒ to the process.”…””}”(hjÞh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KIhj(h²hubhÉ)”}”(hhh]”(hÎ)”}”(hŒBuilding hierarchies of tasks”h]”hŒBuilding hierarchies of tasks”…””}”(hjh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhjh²hh³hÇh´KNubhÞ)”}”(hX;The simplest way to build hierarchies of threads/processes which share a cookie and thus a core is to rely on the fact that the core-sched cookie is inherited across forks/clones and execs, thus setting a cookie for the 'initial' script/executable/daemon will place every spawned child in the same core-sched group.”h]”hX?The simplest way to build hierarchies of threads/processes which share a cookie and thus a core is to rely on the fact that the core-sched cookie is inherited across forks/clones and execs, thus setting a cookie for the ‘initial’ script/executable/daemon will place every spawned child in the same core-sched group.”…””}”(hjh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KOhjh²hubeh}”(h]”Œbuilding-hierarchies-of-tasks”ah ]”h"]”Œbuilding hierarchies of tasks”ah$]”h&]”uh1hÈhj(h²hh³hÇh´KNubhÉ)”}”(hhh]”(hÎ)”}”(hŒCookie Transferral”h]”hŒCookie Transferral”…””}”(hj*h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhj'h²hh³hÇh´KVubhÞ)”}”(hXhTransferring a cookie between the current and other tasks is possible using PR_SCHED_CORE_SHARE_FROM and PR_SCHED_CORE_SHARE_TO to inherit a cookie from a specified task or a share a cookie with a task. In combination this allows a simple helper program to pull a cookie from a task in an existing core scheduling group and share it with already running tasks.”h]”hXhTransferring a cookie between the current and other tasks is possible using PR_SCHED_CORE_SHARE_FROM and PR_SCHED_CORE_SHARE_TO to inherit a cookie from a specified task or a share a cookie with a task. In combination this allows a simple helper program to pull a cookie from a task in an existing core scheduling group and share it with already running tasks.”…””}”(hj8h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KWhj'h²hubeh}”(h]”Œcookie-transferral”ah ]”h"]”Œcookie transferral”ah$]”h&]”uh1hÈhj(h²hh³hÇh´KVubeh}”(h]”Œusage”ah ]”h"]”Œusage”ah$]”h&]”uh1hÈhhÊh²hh³hÇh´K!Œ referenced”KubhÉ)”}”(hhh]”(hÎ)”}”(hŒDesign/Implementation”h]”hŒDesign/Implementation”…””}”(hjZh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhjWh²hh³hÇh´K^ubhÞ)”}”(hŒ´Each task that is tagged is assigned a cookie internally in the kernel. As mentioned in `Usage`_, tasks with the same cookie value are assumed to trust each other and share a core.”h]”(hŒXEach task that is tagged is assigned a cookie internally in the kernel. As mentioned in ”…””}”(hjhh²hh³Nh´NubhŒ reference”“”)”}”(hŒ`Usage`_”h]”hŒUsage”…””}”(hjrh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”Œname”ŒUsage”Œrefid”jPuh1jphjhŒresolved”KubhŒT, tasks with the same cookie value are assumed to trust each other and share a core.”…””}”(hjhh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K_hjWh²hubhÞ)”}”(hX?The basic idea is that, every schedule event tries to select tasks for all the siblings of a core such that all the selected tasks running on a core are trusted (same cookie) at any point in time. Kernel threads are assumed trusted. The idle task is considered special, as it trusts everything and everything trusts it.”h]”hX?The basic idea is that, every schedule event tries to select tasks for all the siblings of a core such that all the selected tasks running on a core are trusted (same cookie) at any point in time. Kernel threads are assumed trusted. The idle task is considered special, as it trusts everything and everything trusts it.”…””}”(hjŽh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KchjWh²hubhÞ)”}”(hXÁDuring a schedule() event on any sibling of a core, the highest priority task on the sibling's core is picked and assigned to the sibling calling schedule(), if the sibling has the task enqueued. For rest of the siblings in the core, highest priority task with the same cookie is selected if there is one runnable in their individual run queues. If a task with same cookie is not available, the idle task is selected. Idle task is globally trusted.”h]”hXÃDuring a schedule() event on any sibling of a core, the highest priority task on the sibling’s core is picked and assigned to the sibling calling schedule(), if the sibling has the task enqueued. For rest of the siblings in the core, highest priority task with the same cookie is selected if there is one runnable in their individual run queues. If a task with same cookie is not available, the idle task is selected. Idle task is globally trusted.”…””}”(hjœh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KihjWh²hubhÞ)”}”(hXªOnce a task has been selected for all the siblings in the core, an IPI is sent to siblings for whom a new task was selected. Siblings on receiving the IPI will switch to the new task immediately. If an idle task is selected for a sibling, then the sibling is considered to be in a `forced idle` state. I.e., it may have tasks on its on runqueue to run, however it will still have to run idle. More on this in the next section.”h]”(hXOnce a task has been selected for all the siblings in the core, an IPI is sent to siblings for whom a new task was selected. Siblings on receiving the IPI will switch to the new task immediately. If an idle task is selected for a sibling, then the sibling is considered to be in a ”…””}”(hjªh²hh³Nh´Nubjç)”}”(hŒ `forced idle`”h]”hŒ forced idle”…””}”(hj²h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1jæhjªubhŒ„ state. I.e., it may have tasks on its on runqueue to run, however it will still have to run idle. More on this in the next section.”…””}”(hjªh²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KphjWh²hubhÉ)”}”(hhh]”(hÎ)”}”(hŒForced-idling of hyperthreads”h]”hŒForced-idling of hyperthreads”…””}”(hjÍh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhjÊh²hh³hÇh´KxubhÞ)”}”(hXSThe scheduler tries its best to find tasks that trust each other such that all tasks selected to be scheduled are of the highest priority in a core. However, it is possible that some runqueues had tasks that were incompatible with the highest priority ones in the core. Favoring security over fairness, one or more siblings could be forced to select a lower priority task if the highest priority task is not trusted with respect to the core wide highest priority task. If a sibling does not have a trusted task to run, it will be forced idle by the scheduler (idle thread is scheduled to run).”h]”hXSThe scheduler tries its best to find tasks that trust each other such that all tasks selected to be scheduled are of the highest priority in a core. However, it is possible that some runqueues had tasks that were incompatible with the highest priority ones in the core. Favoring security over fairness, one or more siblings could be forced to select a lower priority task if the highest priority task is not trusted with respect to the core wide highest priority task. If a sibling does not have a trusted task to run, it will be forced idle by the scheduler (idle thread is scheduled to run).”…””}”(hjÛh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KyhjÊh²hubhÞ)”}”(hŒøWhen the highest priority task is selected to run, a reschedule-IPI is sent to the sibling to force it into idle. This results in 4 cases which need to be considered depending on whether a VM or a regular usermode process was running on either HT::”h]”hŒ÷When the highest priority task is selected to run, a reschedule-IPI is sent to the sibling to force it into idle. This results in 4 cases which need to be considered depending on whether a VM or a regular usermode process was running on either HT:”…””}”(hjéh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K‚hjÊh²hubjz)”}”(hŒé HT1 (attack) HT2 (victim) A idle -> user space user space -> idle B idle -> user space guest -> idle C idle -> guest user space -> idle D idle -> guest guest -> idle”h]”hŒé HT1 (attack) HT2 (victim) A idle -> user space user space -> idle B idle -> user space guest -> idle C idle -> guest user space -> idle D idle -> guest guest -> idle”…””}”hj÷sbah}”(h]”h ]”h"]”h$]”h&]”hÅhÆuh1jyh³hÇh´K‡hjÊh²hubhÞ)”}”(hXÔNote that for better performance, we do not wait for the destination CPU (victim) to enter idle mode. This is because the sending of the IPI would bring the destination CPU immediately into kernel mode from user space, or VMEXIT in the case of guests. At best, this would only leak some scheduler metadata which may not be worth protecting. It is also possible that the IPI is received too late on some architectures, but this has not been observed in the case of x86.”h]”hXÔNote that for better performance, we do not wait for the destination CPU (victim) to enter idle mode. This is because the sending of the IPI would bring the destination CPU immediately into kernel mode from user space, or VMEXIT in the case of guests. At best, this would only leak some scheduler metadata which may not be worth protecting. It is also possible that the IPI is received too late on some architectures, but this has not been observed in the case of x86.”…””}”(hjh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KhjÊh²hubeh}”(h]”Œforced-idling-of-hyperthreads”ah ]”h"]”Œforced-idling of hyperthreads”ah$]”h&]”uh1hÈhjWh²hh³hÇh´KxubhÉ)”}”(hhh]”(hÎ)”}”(hŒ Trust model”h]”hŒ Trust model”…””}”(hjh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhjh²hh³hÇh´K–ubhÞ)”}”(hXCore scheduling maintains trust relationships amongst groups of tasks by assigning them a tag that is the same cookie value. When a system with core scheduling boots, all tasks are considered to trust each other. This is because the core scheduler does not have information about trust relationships until userspace uses the above mentioned interfaces, to communicate them. In other words, all tasks have a default cookie value of 0. and are considered system-wide trusted. The forced-idling of siblings running cookie-0 tasks is also avoided.”h]”hXCore scheduling maintains trust relationships amongst groups of tasks by assigning them a tag that is the same cookie value. When a system with core scheduling boots, all tasks are considered to trust each other. This is because the core scheduler does not have information about trust relationships until userspace uses the above mentioned interfaces, to communicate them. In other words, all tasks have a default cookie value of 0. and are considered system-wide trusted. The forced-idling of siblings running cookie-0 tasks is also avoided.”…””}”(hj,h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K—hjh²hubhÞ)”}”(hŒÞOnce userspace uses the above mentioned interfaces to group sets of tasks, tasks within such groups are considered to trust each other, but do not trust those outside. Tasks outside the group also don't trust tasks within.”h]”hŒàOnce userspace uses the above mentioned interfaces to group sets of tasks, tasks within such groups are considered to trust each other, but do not trust those outside. Tasks outside the group also don’t trust tasks within.”…””}”(hj:h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K hjh²hubeh}”(h]”Œ trust-model”ah ]”h"]”Œ trust model”ah$]”h&]”uh1hÈhjWh²hh³hÇh´K–ubeh}”(h]”Œdesign-implementation”ah ]”h"]”Œdesign/implementation”ah$]”h&]”uh1hÈhhÊh²hh³hÇh´K^ubhÉ)”}”(hhh]”(hÎ)”}”(hŒLimitations of core-scheduling”h]”hŒLimitations of core-scheduling”…””}”(hj[h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhjXh²hh³hÇh´K¥ubhÞ)”}”(hŒ÷Core scheduling tries to guarantee that only trusted tasks run concurrently on a core. But there could be small window of time during which untrusted tasks run concurrently or kernel could be running concurrently with a task not trusted by kernel.”h]”hŒ÷Core scheduling tries to guarantee that only trusted tasks run concurrently on a core. But there could be small window of time during which untrusted tasks run concurrently or kernel could be running concurrently with a task not trusted by kernel.”…””}”(hjih²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K¦hjXh²hubhÉ)”}”(hhh]”(hÎ)”}”(hŒIPI processing delays”h]”hŒIPI processing delays”…””}”(hjzh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhjwh²hh³hÇh´K¬ubhÞ)”}”(hXCore scheduling selects only trusted tasks to run together. IPI is used to notify the siblings to switch to the new task. But there could be hardware delays in receiving of the IPI on some arch (on x86, this has not been observed). This may cause an attacker task to start running on a CPU before its siblings receive the IPI. Even though cache is flushed on entry to user mode, victim tasks on siblings may populate data in the cache and micro architectural buffers after the attacker starts to run and this is a possibility for data leak.”h]”hXCore scheduling selects only trusted tasks to run together. IPI is used to notify the siblings to switch to the new task. But there could be hardware delays in receiving of the IPI on some arch (on x86, this has not been observed). This may cause an attacker task to start running on a CPU before its siblings receive the IPI. Even though cache is flushed on entry to user mode, victim tasks on siblings may populate data in the cache and micro architectural buffers after the attacker starts to run and this is a possibility for data leak.”…””}”(hjˆh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K­hjwh²hubeh}”(h]”Œipi-processing-delays”ah ]”h"]”Œipi processing delays”ah$]”h&]”uh1hÈhjXh²hh³hÇh´K¬ubeh}”(h]”Œlimitations-of-core-scheduling”ah ]”h"]”Œlimitations of core-scheduling”ah$]”h&]”uh1hÈhhÊh²hh³hÇh´K¥ubhÉ)”}”(hhh]”(hÎ)”}”(hŒ8Open cross-HT issues that core scheduling does not solve”h]”hŒ8Open cross-HT issues that core scheduling does not solve”…””}”(hj©h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhj¦h²hh³hÇh´K¶ubhÉ)”}”(hhh]”(hÎ)”}”(hŒ 1. For MDS”h]”hŒ 1. For MDS”…””}”(hjºh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhj·h²hh³hÇh´K¸ubhÞ)”}”(hXCore scheduling cannot protect against MDS attacks between the siblings running in user mode and the others running in kernel mode. Even though all siblings run tasks which trust each other, when the kernel is executing code on behalf of a task, it cannot trust the code running in the sibling. Such attacks are possible for any combination of sibling CPU modes (host or guest mode).”h]”hXCore scheduling cannot protect against MDS attacks between the siblings running in user mode and the others running in kernel mode. Even though all siblings run tasks which trust each other, when the kernel is executing code on behalf of a task, it cannot trust the code running in the sibling. Such attacks are possible for any combination of sibling CPU modes (host or guest mode).”…””}”(hjÈh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´K¹hj·h²hubeh}”(h]”Œfor-mds”ah ]”h"]”Œ 1. for mds”ah$]”h&]”uh1hÈhj¦h²hh³hÇh´K¸ubhÉ)”}”(hhh]”(hÎ)”}”(hŒ 2. For L1TF”h]”hŒ 2. For L1TF”…””}”(hjáh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhjÞh²hh³hÇh´KÁubhÞ)”}”(hXCore scheduling cannot protect against an L1TF guest attacker exploiting a guest or host victim. This is because the guest attacker can craft invalid PTEs which are not inverted due to a vulnerable guest kernel. The only solution is to disable EPT (Extended Page Tables).”h]”hXCore scheduling cannot protect against an L1TF guest attacker exploiting a guest or host victim. This is because the guest attacker can craft invalid PTEs which are not inverted due to a vulnerable guest kernel. The only solution is to disable EPT (Extended Page Tables).”…””}”(hjïh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KÂhjÞh²hubhÞ)”}”(hŒöFor both MDS and L1TF, if the guest vCPU is configured to not trust each other (by tagging separately), then the guest to guest attacks would go away. Or it could be a system admin policy which considers guest to guest attacks as a guest problem.”h]”hŒöFor both MDS and L1TF, if the guest vCPU is configured to not trust each other (by tagging separately), then the guest to guest attacks would go away. Or it could be a system admin policy which considers guest to guest attacks as a guest problem.”…””}”(hjýh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KÇhjÞh²hubhÞ)”}”(hX!Another approach to resolve these would be to make every untrusted task on the system to not trust every other untrusted task. While this could reduce parallelism of the untrusted tasks, it would still solve the above issues while allowing system processes (trusted tasks) to share a core.”h]”hX!Another approach to resolve these would be to make every untrusted task on the system to not trust every other untrusted task. While this could reduce parallelism of the untrusted tasks, it would still solve the above issues while allowing system processes (trusted tasks) to share a core.”…””}”(hj h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KÌhjÞh²hubeh}”(h]”Œfor-l1tf”ah ]”h"]”Œ 2. for l1tf”ah$]”h&]”uh1hÈhj¦h²hh³hÇh´KÁubhÉ)”}”(hhh]”(hÎ)”}”(hŒ/3. Protecting the kernel (IRQ, syscall, VMEXIT)”h]”hŒ/3. Protecting the kernel (IRQ, syscall, VMEXIT)”…””}”(hj$h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhj!h²hh³hÇh´KÒubhÞ)”}”(hXjUnfortunately, core scheduling does not protect kernel contexts running on sibling hyperthreads from one another. Prototypes of mitigations have been posted to LKML to solve this, but it is debatable whether such windows are practically exploitable, and whether the performance overhead of the prototypes are worth it (not to mention, the added code complexity).”h]”hXjUnfortunately, core scheduling does not protect kernel contexts running on sibling hyperthreads from one another. Prototypes of mitigations have been posted to LKML to solve this, but it is debatable whether such windows are practically exploitable, and whether the performance overhead of the prototypes are worth it (not to mention, the added code complexity).”…””}”(hj2h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KÓhj!h²hubeh}”(h]”Œ(protecting-the-kernel-irq-syscall-vmexit”ah ]”h"]”Œ/3. protecting the kernel (irq, syscall, vmexit)”ah$]”h&]”uh1hÈhj¦h²hh³hÇh´KÒubeh}”(h]”Œ8open-cross-ht-issues-that-core-scheduling-does-not-solve”ah ]”h"]”Œ8open cross-ht issues that core scheduling does not solve”ah$]”h&]”uh1hÈhhÊh²hh³hÇh´K¶ubhÉ)”}”(hhh]”(hÎ)”}”(hŒOther Use cases”h]”hŒOther Use cases”…””}”(hjSh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÍhjPh²hh³hÇh´KÚubhÞ)”}”(hŒžThe main use case for Core scheduling is mitigating the cross-HT vulnerabilities with SMT enabled. There are other use cases where this feature could be used:”h]”hŒžThe main use case for Core scheduling is mitigating the cross-HT vulnerabilities with SMT enabled. There are other use cases where this feature could be used:”…””}”(hjah²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KÛhjPh²hubjð)”}”(hhh]”(jõ)”}”(hŒpIsolating tasks that needs a whole core: Examples include realtime tasks, tasks that uses SIMD instructions etc.”h]”hÞ)”}”(hŒpIsolating tasks that needs a whole core: Examples include realtime tasks, tasks that uses SIMD instructions etc.”h]”hŒpIsolating tasks that needs a whole core: Examples include realtime tasks, tasks that uses SIMD instructions etc.”…””}”(hjvh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KÞhjrubah}”(h]”h ]”h"]”h$]”h&]”uh1jôhjoh²hh³hÇh´Nubjõ)”}”(hŒ¢Gang scheduling: Requirements for a group of tasks that needs to be scheduled together could also be realized using core scheduling. One example is vCPUs of a VM.”h]”hÞ)”}”(hŒ¢Gang scheduling: Requirements for a group of tasks that needs to be scheduled together could also be realized using core scheduling. One example is vCPUs of a VM.”h]”hŒ¢Gang scheduling: Requirements for a group of tasks that needs to be scheduled together could also be realized using core scheduling. One example is vCPUs of a VM.”…””}”(hjŽh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hÝh³hÇh´KàhjŠubah}”(h]”h ]”h"]”h$]”h&]”uh1jôhjoh²hh³hÇh´Nubeh}”(h]”h ]”h"]”h$]”h&]”jÙjÚuh1jïh³hÇh´KÞhjPh²hubeh}”(h]”Œother-use-cases”ah ]”h"]”Œother use cases”ah$]”h&]”uh1hÈhhÊh²hh³hÇh´KÚubeh}”(h]”Œcore-scheduling”ah ]”h"]”Œcore scheduling”ah$]”h&]”uh1hÈhhh²hh³hÇh´Kubeh}”(h]”h ]”h"]”h$]”h&]”Œsource”hÇuh1hŒcurrent_source”NŒ current_line”NŒsettings”Œdocutils.frontend”ŒValues”“”)”}”(hÍNŒ generator”NŒ datestamp”NŒ source_link”NŒ source_url”NŒ toc_backlinks”Œentry”Œfootnote_backlinks”KŒ sectnum_xform”KŒstrip_comments”NŒstrip_elements_with_classes”NŒ strip_classes”NŒ report_level”KŒ halt_level”KŒexit_status_level”KŒdebug”NŒwarning_stream”NŒ traceback”ˆŒinput_encoding”Œ utf-8-sig”Œinput_encoding_error_handler”Œstrict”Œoutput_encoding”Œutf-8”Œoutput_encoding_error_handler”jÛŒerror_encoding”Œutf-8”Œerror_encoding_error_handler”Œbackslashreplace”Œ language_code”Œen”Œrecord_dependencies”NŒconfig”NŒ id_prefix”hŒauto_id_prefix”Œid”Œ dump_settings”NŒdump_internals”NŒdump_transforms”NŒdump_pseudo_xml”NŒexpose_internals”NŒstrict_visitor”NŒ_disable_config”NŒ_source”hÇŒ _destination”NŒ _config_files”]”Œ7/var/lib/git/docbuild/linux/Documentation/docutils.conf”aŒfile_insertion_enabled”ˆŒ raw_enabled”KŒline_length_limit”M'Œpep_references”NŒ pep_base_url”Œhttps://peps.python.org/”Œpep_file_url_template”Œpep-%04d”Œrfc_references”NŒ rfc_base_url”Œ&https://datatracker.ietf.org/doc/html/”Œ tab_width”KŒtrim_footnote_reference_space”‰Œsyntax_highlight”Œlong”Œ smart_quotes”ˆŒsmartquotes_locales”]”Œcharacter_level_inline_markup”‰Œdoctitle_xform”‰Œ docinfo_xform”KŒsectsubtitle_xform”‰Œ image_loading”Œlink”Œembed_stylesheet”‰Œcloak_email_addresses”ˆŒsection_self_link”‰Œenv”NubŒreporter”NŒindirect_targets”]”Œsubstitution_defs”}”Œsubstitution_names”}”Œrefnames”}”Œusage”]”jrasŒrefids”}”Œnameids”}”(jµj²j%j"jSjPj$j!jKjHjUjRjjjMjJj£j j›j˜jMjJjÛjØjjjEjBj­jªuŒ nametypes”}”(jµ‰j%‰jS‰j$‰jK‰jU‰j‰jM‰j£‰j›‰jM‰jÛ‰j‰jE‰j­‰uh}”(j²hÊj"híjPj(j!jjHj'jRjWjjÊjJjj jXj˜jwjJj¦jØj·jjÞjBj!jªjPuŒ footnote_refs”}”Œ citation_refs”}”Œ autofootnotes”]”Œautofootnote_refs”]”Œsymbol_footnotes”]”Œsymbol_footnote_refs”]”Œ footnotes”]”Œ citations”]”Œautofootnote_start”KŒsymbol_footnote_start”KŒ id_counter”Œ collections”ŒCounter”“”}”…”R”Œparse_messages”]”Œtransform_messages”]”Œ transformer”NŒ include_log”]”Œ decoration”Nh²hub.