sphinx.addnodesdocument)}( rawsourcechildren]( translations LanguagesNode)}(hhh](h pending_xref)}(hhh]docutils.nodesTextChinese (Simplified)}parenthsba attributes}(ids]classes]names]dupnames]backrefs] refdomainstdreftypedoc reftarget(/translations/zh_CN/accel/amdxdna/amdnpumodnameN classnameN refexplicitutagnamehhh ubh)}(hhh]hChinese (Traditional)}hh2sbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/zh_TW/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hItalian}hhFsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/it_IT/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hJapanese}hhZsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/ja_JP/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hKorean}hhnsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/ko_KR/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hPortuguese (Brazilian)}hhsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/pt_BR/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hSpanish}hhsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/sp_SP/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubeh}(h]h ]h"]h$]h&]current_languageEnglishuh1h hh _documenthsourceNlineNubhcomment)}(h%SPDX-License-Identifier: GPL-2.0-onlyh]h%SPDX-License-Identifier: GPL-2.0-only}hhsbah}(h]h ]h"]h$]h&] xml:spacepreserveuh1hhhhhhB/var/lib/git/docbuild/linux/Documentation/accel/amdxdna/amdnpu.rsthKubh)}(h4This data file has been placed in the public domain.h]h4This data file has been placed in the public domain.}hhsbah}(h]h ]h"]h$]h&]hhuh1hhhhhho/srv/docbuild/lib/venvs/build-kernel-docs/lib64/python3.9/site-packages/docutils/parsers/rst/include/isonum.txthKubh)}(hDerived from the Unicode character mappings available from . Processed by unicode2rstsubs.py, part of Docutils: .h]hDerived from the Unicode character mappings available from . Processed by unicode2rstsubs.py, part of Docutils: .}hhsbah}(h]h ]h"]h$]h&]hhuh1hhhhhhhhKubhsubstitution_definition)}(h*.. |amp| unicode:: U+00026 .. AMPERSANDh]h&}hhsbah}(h]h ]h"]ampah$]h&]uh1hhhhKhhhhubh)}(h+.. |apos| unicode:: U+00027 .. APOSTROPHEh]h'}hhsbah}(h]h ]h"]aposah$]h&]uh1hhhhKhhhhubh)}(h).. |ast| unicode:: U+0002A .. ASTERISKh]h*}hjsbah}(h]h ]h"]astah$]h&]uh1hhhhK hhhhubh)}(h+.. |brvbar| unicode:: U+000A6 .. BROKEN BARh]h¦}hjsbah}(h]h ]h"]brvbarah$]h&]uh1hhhhK hhhhubh)}(h0.. |bsol| unicode:: U+0005C .. REVERSE SOLIDUSh]h\}hj#sbah}(h]h ]h"]bsolah$]h&]uh1hhhhK hhhhubh)}(h*.. |cent| unicode:: U+000A2 .. CENT SIGNh]h¢}hj2sbah}(h]h ]h"]centah$]h&]uh1hhhhK hhhhubh)}(h&.. |colon| unicode:: U+0003A .. COLONh]h:}hjAsbah}(h]h ]h"]colonah$]h&]uh1hhhhK hhhhubh)}(h&.. |comma| unicode:: U+0002C .. COMMAh]h,}hjPsbah}(h]h ]h"]commaah$]h&]uh1hhhhKhhhhubh)}(h... |commat| unicode:: U+00040 .. COMMERCIAL ATh]h@}hj_sbah}(h]h ]h"]commatah$]h&]uh1hhhhKhhhhubh)}(h/.. |copy| unicode:: U+000A9 .. COPYRIGHT SIGNh]h©}hjnsbah}(h]h ]h"]copyah$]h&]uh1hhhhKhhhhubh)}(h... |curren| unicode:: U+000A4 .. CURRENCY SIGNh]h¤}hj}sbah}(h]h ]h"]currenah$]h&]uh1hhhhKhhhhubh)}(h0.. |darr| unicode:: U+02193 .. DOWNWARDS ARROWh]h↓}hjsbah}(h]h ]h"]darrah$]h&]uh1hhhhKhhhhubh)}(h,.. |deg| unicode:: U+000B0 .. DEGREE SIGNh]h°}hjsbah}(h]h ]h"]degah$]h&]uh1hhhhKhhhhubh)}(h... |divide| unicode:: U+000F7 .. DIVISION SIGNh]h÷}hjsbah}(h]h ]h"]divideah$]h&]uh1hhhhKhhhhubh)}(h,.. |dollar| unicode:: U+00024 .. DOLLAR SIGNh]h$}hjsbah}(h]h ]h"]dollarah$]h&]uh1hhhhKhhhhubh)}(h,.. |equals| unicode:: U+0003D .. EQUALS SIGNh]h=}hjsbah}(h]h ]h"]equalsah$]h&]uh1hhhhKhhhhubh)}(h1.. |excl| unicode:: U+00021 .. EXCLAMATION MARKh]h!}hjsbah}(h]h ]h"]exclah$]h&]uh1hhhhKhhhhubh)}(h9.. |frac12| unicode:: U+000BD .. VULGAR FRACTION ONE HALFh]h½}hjsbah}(h]h ]h"]frac12ah$]h&]uh1hhhhKhhhhubh)}(h<.. |frac14| unicode:: U+000BC .. VULGAR FRACTION ONE QUARTERh]h¼}hjsbah}(h]h ]h"]frac14ah$]h&]uh1hhhhKhhhhubh)}(h;.. |frac18| unicode:: U+0215B .. VULGAR FRACTION ONE EIGHTHh]h⅛}hjsbah}(h]h ]h"]frac18ah$]h&]uh1hhhhKhhhhubh)}(h?.. |frac34| unicode:: U+000BE .. VULGAR FRACTION THREE QUARTERSh]h¾}hjsbah}(h]h ]h"]frac34ah$]h&]uh1hhhhKhhhhubh)}(h>.. |frac38| unicode:: U+0215C .. VULGAR FRACTION THREE EIGHTHSh]h⅜}hj"sbah}(h]h ]h"]frac38ah$]h&]uh1hhhhKhhhhubh)}(h=.. |frac58| unicode:: U+0215D .. VULGAR FRACTION FIVE EIGHTHSh]h⅝}hj1sbah}(h]h ]h"]frac58ah$]h&]uh1hhhhKhhhhubh)}(h>.. |frac78| unicode:: U+0215E .. VULGAR FRACTION SEVEN EIGHTHSh]h⅞}hj@sbah}(h]h ]h"]frac78ah$]h&]uh1hhhhKhhhhubh)}(h2.. |gt| unicode:: U+0003E .. GREATER-THAN SIGNh]h>}hjOsbah}(h]h ]h"]gtah$]h&]uh1hhhhKhhhhubh)}(h9.. |half| unicode:: U+000BD .. VULGAR FRACTION ONE HALFh]h½}hj^sbah}(h]h ]h"]halfah$]h&]uh1hhhhK hhhhubh)}(h/.. |horbar| unicode:: U+02015 .. HORIZONTAL BARh]h―}hjmsbah}(h]h ]h"]horbarah$]h&]uh1hhhhK!hhhhubh)}(h'.. |hyphen| unicode:: U+02010 .. HYPHENh]h‐}hj|sbah}(h]h ]h"]hyphenah$]h&]uh1hhhhK"hhhhubh)}(h:.. |iexcl| unicode:: U+000A1 .. INVERTED EXCLAMATION MARKh]h¡}hjsbah}(h]h ]h"]iexclah$]h&]uh1hhhhK#hhhhubh)}(h7.. |iquest| unicode:: U+000BF .. INVERTED QUESTION MARKh]h¿}hjsbah}(h]h ]h"]iquestah$]h&]uh1hhhhK$hhhhubh)}(hJ.. |laquo| unicode:: U+000AB .. LEFT-POINTING DOUBLE ANGLE QUOTATION MARKh]h«}hjsbah}(h]h ]h"]laquoah$]h&]uh1hhhhK%hhhhubh)}(h0.. |larr| unicode:: U+02190 .. LEFTWARDS ARROWh]h←}hjsbah}(h]h ]h"]larrah$]h&]uh1hhhhK&hhhhubh)}(h3.. |lcub| unicode:: U+0007B .. LEFT CURLY BRACKETh]h{}hjsbah}(h]h ]h"]lcubah$]h&]uh1hhhhK'hhhhubh)}(h;.. |ldquo| unicode:: U+0201C .. LEFT DOUBLE QUOTATION MARKh]h“}hjsbah}(h]h ]h"]ldquoah$]h&]uh1hhhhK(hhhhubh)}(h).. |lowbar| unicode:: U+0005F .. LOW LINEh]h_}hjsbah}(h]h ]h"]lowbarah$]h&]uh1hhhhK)hhhhubh)}(h1.. |lpar| unicode:: U+00028 .. LEFT PARENTHESISh]h(}hjsbah}(h]h ]h"]lparah$]h&]uh1hhhhK*hhhhubh)}(h4.. |lsqb| unicode:: U+0005B .. LEFT SQUARE BRACKETh]h[}hjsbah}(h]h ]h"]lsqbah$]h&]uh1hhhhK+hhhhubh)}(h;.. |lsquo| unicode:: U+02018 .. LEFT SINGLE QUOTATION MARKh]h‘}hjsbah}(h]h ]h"]lsquoah$]h&]uh1hhhhK,hhhhubh)}(h/.. |lt| unicode:: U+0003C .. LESS-THAN SIGNh]h<}hj!sbah}(h]h ]h"]ltah$]h&]uh1hhhhK-hhhhubh)}(h+.. |micro| unicode:: U+000B5 .. MICRO SIGNh]hµ}hj0sbah}(h]h ]h"]microah$]h&]uh1hhhhK.hhhhubh)}(h+.. |middot| unicode:: U+000B7 .. MIDDLE DOTh]h·}hj?sbah}(h]h ]h"]middotah$]h&]uh1hhhhK/hhhhubh)}(h/.. |nbsp| unicode:: U+000A0 .. NO-BREAK SPACEh]h }hjNsbah}(h]h ]h"]nbspah$]h&]uh1hhhhK0hhhhubh)}(h).. |not| unicode:: U+000AC .. NOT SIGNh]h¬}hj]sbah}(h]h ]h"]notah$]h&]uh1hhhhK1hhhhubh)}(h,.. |num| unicode:: U+00023 .. NUMBER SIGNh]h#}hjlsbah}(h]h ]h"]numah$]h&]uh1hhhhK2hhhhubh)}(h).. |ohm| unicode:: U+02126 .. OHM SIGNh]hΩ}hj{sbah}(h]h ]h"]ohmah$]h&]uh1hhhhK3hhhhubh)}(h;.. |ordf| unicode:: U+000AA .. FEMININE ORDINAL INDICATORh]hª}hjsbah}(h]h ]h"]ordfah$]h&]uh1hhhhK4hhhhubh)}(h<.. |ordm| unicode:: U+000BA .. MASCULINE ORDINAL INDICATORh]hº}hjsbah}(h]h ]h"]ordmah$]h&]uh1hhhhK5hhhhubh)}(h-.. |para| unicode:: U+000B6 .. PILCROW SIGNh]h¶}hjsbah}(h]h ]h"]paraah$]h&]uh1hhhhK6hhhhubh)}(h-.. |percnt| unicode:: U+00025 .. PERCENT SIGNh]h%}hjsbah}(h]h ]h"]percntah$]h&]uh1hhhhK7hhhhubh)}(h*.. |period| unicode:: U+0002E .. FULL STOPh]h.}hjsbah}(h]h ]h"]periodah$]h&]uh1hhhhK8hhhhubh)}(h*.. |plus| unicode:: U+0002B .. PLUS SIGNh]h+}hjsbah}(h]h ]h"]plusah$]h&]uh1hhhhK9hhhhubh)}(h0.. |plusmn| unicode:: U+000B1 .. PLUS-MINUS SIGNh]h±}hjsbah}(h]h ]h"]plusmnah$]h&]uh1hhhhK:hhhhubh)}(h+.. |pound| unicode:: U+000A3 .. POUND SIGNh]h£}hjsbah}(h]h ]h"]poundah$]h&]uh1hhhhK;hhhhubh)}(h... |quest| unicode:: U+0003F .. QUESTION MARKh]h?}hjsbah}(h]h ]h"]questah$]h&]uh1hhhhKhhhhubh)}(h1.. |rarr| unicode:: U+02192 .. RIGHTWARDS ARROWh]h→}hj/sbah}(h]h ]h"]rarrah$]h&]uh1hhhhK?hhhhubh)}(h4.. |rcub| unicode:: U+0007D .. RIGHT CURLY BRACKETh]h}}hj>sbah}(h]h ]h"]rcubah$]h&]uh1hhhhK@hhhhubh)}(h<.. |rdquo| unicode:: U+0201D .. RIGHT DOUBLE QUOTATION MARKh]h”}hjMsbah}(h]h ]h"]rdquoah$]h&]uh1hhhhKAhhhhubh)}(h0.. |reg| unicode:: U+000AE .. REGISTERED SIGNh]h®}hj\sbah}(h]h ]h"]regah$]h&]uh1hhhhKBhhhhubh)}(h2.. |rpar| unicode:: U+00029 .. RIGHT PARENTHESISh]h)}hjksbah}(h]h ]h"]rparah$]h&]uh1hhhhKChhhhubh)}(h5.. |rsqb| unicode:: U+0005D .. RIGHT SQUARE BRACKETh]h]}hjzsbah}(h]h ]h"]rsqbah$]h&]uh1hhhhKDhhhhubh)}(h<.. |rsquo| unicode:: U+02019 .. RIGHT SINGLE QUOTATION MARKh]h’}hjsbah}(h]h ]h"]rsquoah$]h&]uh1hhhhKEhhhhubh)}(h-.. |sect| unicode:: U+000A7 .. SECTION SIGNh]h§}hjsbah}(h]h ]h"]sectah$]h&]uh1hhhhKFhhhhubh)}(h*.. |semi| unicode:: U+0003B .. SEMICOLONh]h;}hjsbah}(h]h ]h"]semiah$]h&]uh1hhhhKGhhhhubh)}(h,.. |shy| unicode:: U+000AD .. SOFT HYPHENh]h­}hjsbah}(h]h ]h"]shyah$]h&]uh1hhhhKHhhhhubh)}(h(.. |sol| unicode:: U+0002F .. SOLIDUSh]h/}hjsbah}(h]h ]h"]solah$]h&]uh1hhhhKIhhhhubh)}(h,.. |sung| unicode:: U+0266A .. EIGHTH NOTEh]h♪}hjsbah}(h]h ]h"]sungah$]h&]uh1hhhhKJhhhhubh)}(h0.. |sup1| unicode:: U+000B9 .. SUPERSCRIPT ONEh]h¹}hjsbah}(h]h ]h"]sup1ah$]h&]uh1hhhhKKhhhhubh)}(h0.. |sup2| unicode:: U+000B2 .. SUPERSCRIPT TWOh]h²}hjsbah}(h]h ]h"]sup2ah$]h&]uh1hhhhKLhhhhubh)}(h2.. |sup3| unicode:: U+000B3 .. SUPERSCRIPT THREEh]h³}hjsbah}(h]h ]h"]sup3ah$]h&]uh1hhhhKMhhhhubh)}(h4.. |times| unicode:: U+000D7 .. MULTIPLICATION SIGNh]h×}hjsbah}(h]h ]h"]timesah$]h&]uh1hhhhKNhhhhubh)}(h0.. |trade| unicode:: U+02122 .. TRADE MARK SIGNh]h™}hjsbah}(h]h ]h"]tradeah$]h&]uh1hhhhKOhhhhubh)}(h... |uarr| unicode:: U+02191 .. UPWARDS ARROWh]h↑}hj.sbah}(h]h ]h"]uarrah$]h&]uh1hhhhKPhhhhubh)}(h... |verbar| unicode:: U+0007C .. VERTICAL LINEh]h|}hj=sbah}(h]h ]h"]verbarah$]h&]uh1hhhhKQhhhhubh)}(h*.. |yen| unicode:: U+000A5 .. YEN SIGN h]h¥}hjLsbah}(h]h ]h"]yenah$]h&]uh1hhhhKRhhhhubhsection)}(hhh](htitle)}(hAMD NPUh]hAMD NPU}(hjbhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj]hhhhhKubh field_list)}(hhh](hfield)}(hhh](h field_name)}(h Copyrighth]h Copyright}(hj|hhhNhNubah}(h]h ]h"]h$]h&]uh1jzhjwhhhKubh field_body)}(h(|copy| 2024 Advanced Micro Devices, Inc.h]h paragraph)}(hjh](h©}(hjhhhNhNubh" 2024 Advanced Micro Devices, Inc.}(hjhhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhK hjubah}(h]h ]h"]h$]h&]uh1jhjwubeh}(h]h ]h"]h$]h&]uh1juhhhK hjrhhubjv)}(hhh](j{)}(hAuthorh]hAuthor}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jzhjhhhKubj)}(h$Sonal Santan h]j)}(h#Sonal Santan h](hSonal Santan <}(hjhhhNhNubh reference)}(hsonal.santan@amd.comh]hsonal.santan@amd.com}(hjhhhNhNubah}(h]h ]h"]h$]h&]refurimailto:sonal.santan@amd.comuh1jhjubh>}(hjhhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhK hjubah}(h]h ]h"]h$]h&]uh1jhjubeh}(h]h ]h"]h$]h&]uh1juhhhK hjrhhubeh}(h]h ]h"]h$]h&]uh1jphj]hhhhhK ubj\)}(hhh](ja)}(hOverviewh]hOverview}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjhhhhhK ubj)}(hXAMD NPU (Neural Processing Unit) is a multi-user AI inference accelerator integrated into AMD client APU. NPU enables efficient execution of Machine Learning applications like CNN, LLM, etc. NPU is based on `AMD XDNA Architecture`_. NPU is managed by **amdxdna** driver.h](hAMD NPU (Neural Processing Unit) is a multi-user AI inference accelerator integrated into AMD client APU. NPU enables efficient execution of Machine Learning applications like CNN, LLM, etc. NPU is based on }(hj hhhNhNubj)}(h`AMD XDNA Architecture`_h]hAMD XDNA Architecture}(hjhhhNhNubah}(h]h ]h"]h$]h&]nameAMD XDNA Architecturerefuri-https://www.amd.com/en/technologies/xdna.htmluh1jhj resolvedKubh. NPU is managed by }(hj hhhNhNubhstrong)}(h **amdxdna**h]hamdxdna}(hj,hhhNhNubah}(h]h ]h"]h$]h&]uh1j*hj ubh driver.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhjhhubeh}(h]overviewah ]h"]overviewah$]h&]uh1j[hj]hhhhhK ubj\)}(hhh](ja)}(hHardware Descriptionh]hHardware Description}(hjOhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjLhhhhhKubj)}(h6AMD NPU consists of the following hardware components:h]h6AMD NPU consists of the following hardware components:}(hj]hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhjLhhubj\)}(hhh](ja)}(hAMD XDNA Arrayh]hAMD XDNA Array}(hjnhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjkhhhhhKubj)}(hXAMD XDNA Array comprises of 2D array of compute and memory tiles built with `AMD AI Engine Technology`_. Each column has 4 rows of compute tiles and 1 row of memory tile. Each compute tile contains a VLIW processor with its own dedicated program and data memory. The memory tile acts as L2 memory. The 2D array can be partitioned at a column boundary creating a spatially isolated partition which can be bound to a workload context.h](hLAMD XDNA Array comprises of 2D array of compute and memory tiles built with }(hj|hhhNhNubj)}(h`AMD AI Engine Technology`_h]hAMD AI Engine Technology}(hjhhhNhNubah}(h]h ]h"]h$]h&]nameAMD AI Engine Technologyj#9https://www.xilinx.com/products/technology/ai-engine.htmluh1jhj|j%KubhXI. Each column has 4 rows of compute tiles and 1 row of memory tile. Each compute tile contains a VLIW processor with its own dedicated program and data memory. The memory tile acts as L2 memory. The 2D array can be partitioned at a column boundary creating a spatially isolated partition which can be bound to a workload context.}(hj|hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhjkhhubj)}(hYEach column also has dedicated DMA engines to move data between host DDR and memory tile.h]hYEach column also has dedicated DMA engines to move data between host DDR and memory tile.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK$hjkhhubj)}(hAMD Phoenix and AMD Hawk Point client NPU have a 4x5 topology, i.e., 4 rows of compute tiles arranged into 5 columns. AMD Strix Point client APU have 4x8 topology, i.e., 4 rows of compute tiles arranged into 8 columns.h]hAMD Phoenix and AMD Hawk Point client NPU have a 4x5 topology, i.e., 4 rows of compute tiles arranged into 5 columns. AMD Strix Point client APU have 4x8 topology, i.e., 4 rows of compute tiles arranged into 8 columns.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK'hjkhhubeh}(h]amd-xdna-arrayah ]h"]amd xdna arrayah$]h&]uh1j[hjLhhhhhKubj\)}(hhh](ja)}(hShared L2 Memoryh]hShared L2 Memory}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjhhhhhK,ubj)}(hXThe single row of memory tiles create a pool of software managed on chip L2 memory. DMA engines are used to move data between host DDR and memory tiles. AMD Phoenix and AMD Hawk Point NPUs have a total of 2560 KB of L2 memory. AMD Strix Point NPU has a total of 4096 KB of L2 memory.h]hXThe single row of memory tiles create a pool of software managed on chip L2 memory. DMA engines are used to move data between host DDR and memory tiles. AMD Phoenix and AMD Hawk Point NPUs have a total of 2560 KB of L2 memory. AMD Strix Point NPU has a total of 4096 KB of L2 memory.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK.hjhhubeh}(h]shared-l2-memoryah ]h"]shared l2 memoryah$]h&]uh1j[hjLhhhhhK,ubj\)}(hhh](ja)}(hMicrocontrollerh]hMicrocontroller}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjhhhhhK4ubj)}(hA microcontroller runs NPU Firmware which is responsible for command processing, XDNA Array partition setup, XDNA Array configuration, workload context management and workload orchestration.h]hA microcontroller runs NPU Firmware which is responsible for command processing, XDNA Array partition setup, XDNA Array configuration, workload context management and workload orchestration.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK6hjhhubj)}(hNPU Firmware uses a dedicated instance of an isolated non-privileged context called ERT to service each workload context. ERT is also used to execute user provided ``ctrlcode`` associated with the workload context.h](hNPU Firmware uses a dedicated instance of an isolated non-privileged context called ERT to service each workload context. ERT is also used to execute user provided }(hj hhhNhNubhliteral)}(h ``ctrlcode``h]hctrlcode}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh& associated with the workload context.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhK:hjhhubj)}(hzNPU Firmware uses a single isolated privileged context called MERT to service management commands from the amdxdna driver.h]hzNPU Firmware uses a single isolated privileged context called MERT to service management commands from the amdxdna driver.}(hj+hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK>hjhhubeh}(h]microcontrollerah ]h"]microcontrollerah$]h&]uh1j[hjLhhhhhK4ubj\)}(hhh](ja)}(h Mailboxesh]h Mailboxes}(hjDhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjAhhhhhKBubj)}(hX-The microcontroller and amdxdna driver use a privileged channel for management tasks like setting up of contexts, telemetry, query, error handling, setting up user channel, etc. As mentioned before, privileged channel requests are serviced by MERT. The privileged channel is bound to a single mailbox.h]hX-The microcontroller and amdxdna driver use a privileged channel for management tasks like setting up of contexts, telemetry, query, error handling, setting up user channel, etc. As mentioned before, privileged channel requests are serviced by MERT. The privileged channel is bound to a single mailbox.}(hjRhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKDhjAhhubj)}(hX&The microcontroller and amdxdna driver use a dedicated user channel per workload context. The user channel is primarily used for submitting work to the NPU. As mentioned before, a user channel requests are serviced by an instance of ERT. Each user channel is bound to its own dedicated mailbox.h]hX&The microcontroller and amdxdna driver use a dedicated user channel per workload context. The user channel is primarily used for submitting work to the NPU. As mentioned before, a user channel requests are serviced by an instance of ERT. Each user channel is bound to its own dedicated mailbox.}(hj`hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKIhjAhhubeh}(h] mailboxesah ]h"] mailboxesah$]h&]uh1j[hjLhhhhhKBubj\)}(hhh](ja)}(hPCIe EPh]hPCIe EP}(hjyhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjvhhhhhKOubj)}(hX3NPU is visible to the x86 host CPU as a PCIe device with multiple BARs and some MSI-X interrupt vectors. NPU uses a dedicated high bandwidth SoC level fabric for reading or writing into host memory. Each instance of ERT gets its own dedicated MSI-X interrupt. MERT gets a single instance of MSI-X interrupt.h]hX3NPU is visible to the x86 host CPU as a PCIe device with multiple BARs and some MSI-X interrupt vectors. NPU uses a dedicated high bandwidth SoC level fabric for reading or writing into host memory. Each instance of ERT gets its own dedicated MSI-X interrupt. MERT gets a single instance of MSI-X interrupt.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKQhjvhhubj)}(hThe number of PCIe BARs varies depending on the specific device. Based on their functions, PCIe BARs can generally be categorized into the following types.h]hThe number of PCIe BARs varies depending on the specific device. Based on their functions, PCIe BARs can generally be categorized into the following types.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKVhjvhhubh bullet_list)}(hhh](h list_item)}(hBPSP BAR: Expose the AMD PSP (Platform Security Processor) functionh]j)}(hjh]hBPSP BAR: Expose the AMD PSP (Platform Security Processor) function}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKYhjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubj)}(h=SMU BAR: Expose the AMD SMU (System Management Unit) functionh]j)}(hjh]h=SMU BAR: Expose the AMD SMU (System Management Unit) function}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKZhjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubj)}(h-SRAM BAR: Expose ring buffers for the mailboxh]j)}(hjh]h-SRAM BAR: Expose ring buffers for the mailbox}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK[hjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubj)}(hUMailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.)h]j)}(hUMailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.)h]hUMailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.)}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK\hjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubj)}(h-Public Register BAR: Expose public registers h]j)}(h,Public Register BAR: Expose public registersh]h,Public Register BAR: Expose public registers}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK^hjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubeh}(h]h ]h"]h$]h&]bullet*uh1jhhhKYhjvhhubj)}(hOn specific devices, the above-mentioned BAR type might be combined into a single physical PCIe BAR. Or a module might require two physical PCIe BARs to be fully functional. For example,h]hOn specific devices, the above-mentioned BAR type might be combined into a single physical PCIe BAR. Or a module might require two physical PCIe BARs to be fully functional. For example,}(hj'hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhK`hjvhhubj)}(hhh](j)}(hNOn AMD Phoenix device, PSP, SMU, Public Register BARs are on PCIe BAR index 0.h]j)}(hj:h]hNOn AMD Phoenix device, PSP, SMU, Public Register BARs are on PCIe BAR index 0.}(hj<hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKdhj8ubah}(h]h ]h"]h$]h&]uh1jhj5hhhhhNubj)}(hOn AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0. The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR). h]j)}(hOn AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0. The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR).h]hOn AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0. The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR).}(hjShhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKehjOubah}(h]h ]h"]h$]h&]uh1jhj5hhhhhNubeh}(h]h ]h"]h$]h&]j%j&uh1jhhhKdhjvhhubeh}(h]pcie-epah ]h"]pcie epah$]h&]uh1j[hjLhhhhhKOubj\)}(hhh](ja)}(hProcess Isolation Hardwareh]hProcess Isolation Hardware}(hjxhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjuhhhhhKjubj)}(hXAs explained before, XDNA Array can be dynamically divided into isolated spatial partitions, each of which may have one or more columns. The spatial partition is setup by programming the column isolation registers by the microcontroller. Each spatial partition is associated with a PASID which is also programmed by the microcontroller. Hence multiple spatial partitions in the NPU can make concurrent host access protected by PASID.h]hXAs explained before, XDNA Array can be dynamically divided into isolated spatial partitions, each of which may have one or more columns. The spatial partition is setup by programming the column isolation registers by the microcontroller. Each spatial partition is associated with a PASID which is also programmed by the microcontroller. Hence multiple spatial partitions in the NPU can make concurrent host access protected by PASID.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKlhjuhhubj)}(hyThe NPU FW itself uses microcontroller MMU enforced isolated contexts for servicing user and privileged channel requests.h]hyThe NPU FW itself uses microcontroller MMU enforced isolated contexts for servicing user and privileged channel requests.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKshjuhhubeh}(h]process-isolation-hardwareah ]h"]process isolation hardwareah$]h&]uh1j[hjLhhhhhKjubeh}(h]hardware-descriptionah ]h"]hardware descriptionah$]h&]uh1j[hj]hhhhhKubj\)}(hhh](ja)}(h%Mixed Spatial and Temporal Schedulingh]h%Mixed Spatial and Temporal Scheduling}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjhhhhhKxubj)}(hXAMD XDNA architecture supports mixed spatial and temporal (time sharing) scheduling of 2D array. This means that spatial partitions may be setup and torn down dynamically to accommodate various workloads. A *spatial* partition may be *exclusively* bound to one workload context while another partition may be *temporarily* bound to more than one workload contexts. The microcontroller updates the PASID for a temporarily shared partition to match the context that has been bound to the partition at any moment.h](hAMD XDNA architecture supports mixed spatial and temporal (time sharing) scheduling of 2D array. This means that spatial partitions may be setup and torn down dynamically to accommodate various workloads. A }(hjhhhNhNubhemphasis)}(h *spatial*h]hspatial}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhjubh partition may be }(hjhhhNhNubj)}(h *exclusively*h]h exclusively}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhjubh> bound to one workload context while another partition may be }(hjhhhNhNubj)}(h *temporarily*h]h temporarily}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhjubh bound to more than one workload contexts. The microcontroller updates the PASID for a temporarily shared partition to match the context that has been bound to the partition at any moment.}(hjhhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKzhjhhubj\)}(hhh](ja)}(hResource Solverh]hResource Solver}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj hhhhhKubj)}(hXThe Resource Solver component of the amdxdna driver manages the allocation of 2D array among various workloads. Every workload describes the number of columns required to run the NPU binary in its metadata. The Resource Solver component uses hints passed by the workload and its own heuristics to decide 2D array (re)partition strategy and mapping of workloads for spatial and temporal sharing of columns. The FW enforces the context-to-column(s) resource binding decisions made by the Resource Solver.h]hXThe Resource Solver component of the amdxdna driver manages the allocation of 2D array among various workloads. Every workload describes the number of columns required to run the NPU binary in its metadata. The Resource Solver component uses hints passed by the workload and its own heuristics to decide 2D array (re)partition strategy and mapping of workloads for spatial and temporal sharing of columns. The FW enforces the context-to-column(s) resource binding decisions made by the Resource Solver.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj hhubj)}(hAMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent workload contexts. AMD Strix Point can support 16 concurrent workload contexts.h]hAMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent workload contexts. AMD Strix Point can support 16 concurrent workload contexts.}(hj( hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj hhubeh}(h]resource-solverah ]h"]resource solverah$]h&]uh1j[hjhhhhhKubeh}(h]%mixed-spatial-and-temporal-schedulingah ]h"]%mixed spatial and temporal schedulingah$]h&]uh1j[hj]hhhhhKxubj\)}(hhh](ja)}(hApplication Binariesh]hApplication Binaries}(hjI hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjF hhhhhKubj)}(hiA NPU application workload is comprised of two separate binaries which are generated by the NPU compiler.h]hiA NPU application workload is comprised of two separate binaries which are generated by the NPU compiler.}(hjW hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhjF hhubhenumerated_list)}(hhh](j)}(hXzAMD XDNA Array overlay, which is used to configure a NPU spatial partition. The overlay contains instructions for setting up the stream switch configuration and ELF for the compute tiles. The overlay is loaded on the spatial partition bound to the workload by the associated ERT instance. Refer to the `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details. h]j)}(hXyAMD XDNA Array overlay, which is used to configure a NPU spatial partition. The overlay contains instructions for setting up the stream switch configuration and ELF for the compute tiles. The overlay is loaded on the spatial partition bound to the workload by the associated ERT instance. Refer to the `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details.h](hX.AMD XDNA Array overlay, which is used to configure a NPU spatial partition. The overlay contains instructions for setting up the stream switch configuration and ELF for the compute tiles. The overlay is loaded on the spatial partition bound to the workload by the associated ERT instance. Refer to the }(hjn hhhNhNubj)}(h9`Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_h]h6Versal Adaptive SoC AIE-ML Architecture Manual (AM020)}(hjv hhhNhNubah}(h]h ]h"]h$]h&]name6Versal Adaptive SoC AIE-ML Architecture Manual (AM020)j#0https://docs.amd.com/r/en-US/am020-versal-aie-mluh1jhjn j%Kubh for more details.}(hjn hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhjj ubah}(h]h ]h"]h$]h&]uh1jhjg hhhhhNubj)}(hXH``ctrlcode``, used for orchestrating the overlay loaded on the spatial partition. ``ctrlcode`` is executed by the ERT running in protected mode on the microcontroller in the context of the workload. ``ctrlcode`` is made up of a sequence of opcodes named ``XAie_TxnOpcode``. Refer to the `AI Engine Run Time`_ for more details. h]j)}(hXF``ctrlcode``, used for orchestrating the overlay loaded on the spatial partition. ``ctrlcode`` is executed by the ERT running in protected mode on the microcontroller in the context of the workload. ``ctrlcode`` is made up of a sequence of opcodes named ``XAie_TxnOpcode``. Refer to the `AI Engine Run Time`_ for more details.h](j)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubhF, used for orchestrating the overlay loaded on the spatial partition. }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubhi is executed by the ERT running in protected mode on the microcontroller in the context of the workload. }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh+ is made up of a sequence of opcodes named }(hj hhhNhNubj)}(h``XAie_TxnOpcode``h]hXAie_TxnOpcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh. Refer to the }(hj hhhNhNubj)}(h`AI Engine Run Time`_h]hAI Engine Run Time}(hj hhhNhNubah}(h]h ]h"]h$]h&]nameAI Engine Run Timej#6https://github.com/Xilinx/aie-rt/tree/release/main_aiguh1jhj j%Kubh for more details.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhjg hhhhhNubeh}(h]h ]h"]h$]h&]enumtypearabicprefixhsuffix.uh1je hjF hhhhhKubeh}(h]application-binariesah ]h"]application binariesah$]h&]uh1j[hj]hhhhhKubj\)}(hhh](ja)}(hSpecial Host Buffersh]hSpecial Host Buffers}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj hhhhhKubj\)}(hhh](ja)}(hPer-context Instruction Bufferh]hPer-context Instruction Buffer}(hj/ hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj, hhhhhKubj)}(hXtEvery workload context uses a host resident 64 MB buffer which is memory mapped into the ERT instance created to service the workload. The ``ctrlcode`` used by the workload is copied into this special memory. This buffer is protected by PASID like all other input/output buffers used by that workload. Instruction buffer is also mapped into the user space of the workload.h](hEvery workload context uses a host resident 64 MB buffer which is memory mapped into the ERT instance created to service the workload. The }(hj= hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hjE hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj= ubh used by the workload is copied into this special memory. This buffer is protected by PASID like all other input/output buffers used by that workload. Instruction buffer is also mapped into the user space of the workload.}(hj= hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhj, hhubeh}(h]per-context-instruction-bufferah ]h"]per-context instruction bufferah$]h&]uh1j[hj hhhhhKubj\)}(hhh](ja)}(hGlobal Privileged Bufferh]hGlobal Privileged Buffer}(hjh hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hje hhhhhKubj)}(hIn addition, the driver also allocates a single buffer for maintenance tasks like recording errors from MERT. This global buffer uses the global IOMMU domain and is only accessible by MERT.h]hIn addition, the driver also allocates a single buffer for maintenance tasks like recording errors from MERT. This global buffer uses the global IOMMU domain and is only accessible by MERT.}(hjv hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhje hhubeh}(h]global-privileged-bufferah ]h"]global privileged bufferah$]h&]uh1j[hj hhhhhKubeh}(h]special-host-buffersah ]h"]special host buffersah$]h&]uh1j[hj]hhhhhKubj\)}(hhh](ja)}(hHigh-level Use Flowh]hHigh-level Use Flow}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj hhhhhKubj)}(h0Here are the steps to run a workload on AMD NPU:h]h0Here are the steps to run a workload on AMD NPU:}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj hhubjf )}(hhh](j)}(h?Compile the workload into an overlay and a ``ctrlcode`` binary.h]j)}(hj h](h+Compile the workload into an overlay and a }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh binary.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hAUserspace opens a context in the driver and provides the overlay.h]j)}(hj h]hAUserspace opens a context in the driver and provides the overlay.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h^The driver checks with the Resource Solver for provisioning a set of columns for the workload.h]j)}(h^The driver checks with the Resource Solver for provisioning a set of columns for the workload.h]h^The driver checks with the Resource Solver for provisioning a set of columns for the workload.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hUThe driver then asks MERT to create a context on the device with the desired columns.h]j)}(hUThe driver then asks MERT to create a context on the device with the desired columns.h]hUThe driver then asks MERT to create a context on the device with the desired columns.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h\MERT then creates an instance of ERT. MERT also maps the Instruction Buffer into ERT memory.h]j)}(h\MERT then creates an instance of ERT. MERT also maps the Instruction Buffer into ERT memory.h]h\MERT then creates an instance of ERT. MERT also maps the Instruction Buffer into ERT memory.}(hj* hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj& ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hEThe userspace then copies the ``ctrlcode`` to the Instruction Buffer.h]j)}(hj@ h](hThe userspace then copies the }(hjB hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hjI hhhNhNubah}(h]h ]h"]h$]h&]uh1jhjB ubh to the Instruction Buffer.}(hjB hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhj> ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hUserspace then creates a command buffer with pointers to input, output, and instruction buffer; it then submits command buffer with the driver and goes to sleep waiting for completion.h]j)}(hUserspace then creates a command buffer with pointers to input, output, and instruction buffer; it then submits command buffer with the driver and goes to sleep waiting for completion.h]hUserspace then creates a command buffer with pointers to input, output, and instruction buffer; it then submits command buffer with the driver and goes to sleep waiting for completion.}(hjk hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhjg ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h5The driver sends the command over the Mailbox to ERT.h]j)}(hj h]h5The driver sends the command over the Mailbox to ERT.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h:ERT *executes* the ``ctrlcode`` in the instruction buffer.h]j)}(hj h](hERT }(hj hhhNhNubj)}(h *executes*h]hexecutes}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh the }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh in the instruction buffer.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hfExecution of the ``ctrlcode`` kicks off DMAs to and from the host DDR while AMD XDNA Array is running.h]j)}(hfExecution of the ``ctrlcode`` kicks off DMAs to and from the host DDR while AMD XDNA Array is running.h](hExecution of the }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubhI kicks off DMAs to and from the host DDR while AMD XDNA Array is running.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hWhen ERT reaches end of ``ctrlcode``, it raises an MSI-X to send completion signal to the driver which then wakes up the waiting workload. h]j)}(hWhen ERT reaches end of ``ctrlcode``, it raises an MSI-X to send completion signal to the driver which then wakes up the waiting workload.h](hWhen ERT reaches end of }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubhf, it raises an MSI-X to send completion signal to the driver which then wakes up the waiting workload.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubeh}(h]h ]h"]h$]h&]j j j hj j uh1je hj hhhhhKubeh}(h]high-level-use-flowah ]h"]high-level use flowah$]h&]uh1j[hj]hhhhhKubj\)}(hhh](ja)}(h Boot Flowh]h Boot Flow}(hj6 hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj3 hhhhhKubj)}(hXAamdxdna driver uses PSP to securely load signed NPU FW and kick off the boot of the NPU microcontroller. amdxdna driver then waits for the alive signal in a special location on BAR 0. The NPU is switched off during SoC suspend and turned on after resume where the NPU FW is reloaded, and the handshake is performed again.h]hXAamdxdna driver uses PSP to securely load signed NPU FW and kick off the boot of the NPU microcontroller. amdxdna driver then waits for the alive signal in a special location on BAR 0. The NPU is switched off during SoC suspend and turned on after resume where the NPU FW is reloaded, and the handshake is performed again.}(hjD hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhKhj3 hhubeh}(h] boot-flowah ]h"] boot flowah$]h&]uh1j[hj]hhhhhKubj\)}(hhh](ja)}(hUserspace componentsh]hUserspace components}(hj] hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjZ hhhhhKubj\)}(hhh](ja)}(hCompilerh]hCompiler}(hjn hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjk hhhhhKubj)}(hPeano is an LLVM based open-source single core compiler for AMD XDNA Array compute tile. Peano is available at: https://github.com/Xilinx/llvm-aieh](hpPeano is an LLVM based open-source single core compiler for AMD XDNA Array compute tile. Peano is available at: }(hj| hhhNhNubj)}(h"https://github.com/Xilinx/llvm-aieh]h"https://github.com/Xilinx/llvm-aie}(hj hhhNhNubah}(h]h ]h"]h$]h&]refurij uh1jhj| ubeh}(h]h ]h"]h$]h&]uh1jhhhKhjk hhubj)}(hIRON is an open-source array compiler for AMD XDNA Array based NPU which uses Peano underneath. IRON is available at: https://github.com/Xilinx/mlir-aieh](hvIRON is an open-source array compiler for AMD XDNA Array based NPU which uses Peano underneath. IRON is available at: }(hj hhhNhNubj)}(h"https://github.com/Xilinx/mlir-aieh]h"https://github.com/Xilinx/mlir-aie}(hj hhhNhNubah}(h]h ]h"]h$]h&]refurij uh1jhj ubeh}(h]h ]h"]h$]h&]uh1jhhhKhjk hhubeh}(h]compilerah ]h"]compilerah$]h&]uh1j[hjZ hhhhhKubj\)}(hhh](ja)}(hUsermode Driver (UMD)h]hUsermode Driver (UMD)}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj hhhhhKubj)}(h{The open-source XRT runtime stack interfaces with amdxdna kernel driver. XRT can be found at: https://github.com/Xilinx/XRTh](h^The open-source XRT runtime stack interfaces with amdxdna kernel driver. XRT can be found at: }(hj hhhNhNubj)}(hhttps://github.com/Xilinx/XRTh]hhttps://github.com/Xilinx/XRT}(hj hhhNhNubah}(h]h ]h"]h$]h&]refurij uh1jhj ubeh}(h]h ]h"]h$]h&]uh1jhhhKhj hhubj)}(hWThe open-source XRT shim for NPU is can be found at: https://github.com/amd/xdna-driverh](h5The open-source XRT shim for NPU is can be found at: }(hj hhhNhNubj)}(h"https://github.com/amd/xdna-driverh]h"https://github.com/amd/xdna-driver}(hj hhhNhNubah}(h]h ]h"]h$]h&]refurij uh1jhj ubeh}(h]h ]h"]h$]h&]uh1jhhhKhj hhubeh}(h]usermode-driver-umdah ]h"]usermode driver (umd)ah$]h&]uh1j[hjZ hhhhhKubeh}(h]userspace-componentsah ]h"]userspace componentsah$]h&]uh1j[hj]hhhhhKubj\)}(hhh](ja)}(h DMA Operationh]h DMA Operation}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj hhhhhKubj)}(hDMA operation instructions are encoded in the ``ctrlcode`` as ``XAIE_IO_BLOCKWRITE`` opcode. When ERT executes ``XAIE_IO_BLOCKWRITE``, DMA operations between host DDR and L2 memory are effected.h](h.DMA operation instructions are encoded in the }(hj* hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj2 hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj* ubh as }(hj* hhhNhNubj)}(h``XAIE_IO_BLOCKWRITE``h]hXAIE_IO_BLOCKWRITE}(hjD hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj* ubh opcode. When ERT executes }(hj* hhhNhNubj)}(h``XAIE_IO_BLOCKWRITE``h]hXAIE_IO_BLOCKWRITE}(hjV hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj* ubh=, DMA operations between host DDR and L2 memory are effected.}(hj* hhhNhNubeh}(h]h ]h"]h$]h&]uh1jhhhKhj hhubeh}(h] dma-operationah ]h"] dma operationah$]h&]uh1j[hj]hhhhhKubj\)}(hhh](ja)}(hError Handlingh]hError Handling}(hjy hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hjv hhhhhKubj)}(hX}When MERT detects an error in AMD XDNA Array, it pauses execution for that workload context and sends an asynchronous message to the driver over the privileged channel. The driver then sends a buffer pointer to MERT to capture the register states for the partition bound to faulting workload context. The driver then decodes the error by reading the contents of the buffer pointer.h]hX}When MERT detects an error in AMD XDNA Array, it pauses execution for that workload context and sends an asynchronous message to the driver over the privileged channel. The driver then sends a buffer pointer to MERT to capture the register states for the partition bound to faulting workload context. The driver then decodes the error by reading the contents of the buffer pointer.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhMhjv hhubeh}(h]error-handlingah ]h"]error handlingah$]h&]uh1j[hj]hhhhhKubj\)}(hhh](ja)}(h Telemetryh]h Telemetry}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj hhhhhMubj)}(hJMERT can report various kinds of telemetry information like the following:h]hJMERT can report various kinds of telemetry information like the following:}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhM hj hhubj)}(hhh](j)}(hL1 interrupt counterh]j)}(hj h]hL1 interrupt counter}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhM hj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h DMA counterh]j)}(hj h]h DMA counter}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhM hj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hDeep Sleep counterh]j)}(hj h]hDeep Sleep counter}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhMhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hetc. h]j)}(hetc.h]hetc.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhhhMhjubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubeh}(h]h ]h"]h$]h&]j%j&uh1jhhhM hj hhubeh}(h] telemetryah ]h"] telemetryah$]h&]uh1j[hj]hhhhhMubj\)}(hhh](ja)}(h Referencesh]h References}(hj-hhhNhNubah}(h]h ]h"]h$]h&]uh1j`hj*hhhhhMubj)}(hhh](j)}(hH`AMD XDNA Architecture `_h]j)}(hj@h](j)}(hj@h]hAMD XDNA Architecture}(hjEhhhNhNubah}(h]h ]h"]h$]h&]nameAMD XDNA Architecturej#j$uh1jhjBubhtarget)}(h0 h]h}(h]amd-xdna-architectureah ]h"]amd xdna architectureah$]h&]refurij$uh1jT referencedKhjBubeh}(h]h ]h"]h$]h&]uh1jhhhMhj>ubah}(h]h ]h"]h$]h&]uh1jhj;hhhhhNubj)}(hW`AMD AI Engine Technology `_h]j)}(hjrh](j)}(hjrh]hAMD AI Engine Technology}(hjwhhhNhNubah}(h]h ]h"]h$]h&]nameAMD AI Engine Technologyj#juh1jhjtubjU)}(h< h]h}(h]amd-ai-engine-technologyah ]h"]amd ai engine technologyah$]h&]refurijuh1jTjcKhjtubeh}(h]h ]h"]h$]h&]uh1jhhhMhjpubah}(h]h ]h"]h$]h&]uh1jhj;hhhhhNubj)}(h-`Peano `_h]j)}(hjh](j)}(hjh]hPeano}(hjhhhNhNubah}(h]h ]h"]h$]h&]namePeanoj#"https://github.com/Xilinx/llvm-aieuh1jhjubjU)}(h% h]h}(h]peanoah ]h"]peanoah$]h&]refurijuh1jTjcKhjubeh}(h]h ]h"]h$]h&]uh1jhhhMhjubah}(h]h ]h"]h$]h&]uh1jhj;hhhhhNubj)}(hl`Versal Adaptive SoC AIE-ML Architecture Manual (AM020) `_h]j)}(hjh](j)}(hjh]h6Versal Adaptive SoC AIE-ML Architecture Manual (AM020)}(hjhhhNhNubah}(h]h ]h"]h$]h&]name6Versal Adaptive SoC AIE-ML Architecture Manual (AM020)j#j uh1jhjubjU)}(h3 h]h}(h]4versal-adaptive-soc-aie-ml-architecture-manual-am020ah ]h"]6versal adaptive soc aie-ml architecture manual (am020)ah$]h&]refurij uh1jTjcKhjubeh}(h]h ]h"]h$]h&]uh1jhhhMhjubah}(h]h ]h"]h$]h&]uh1jhj;hhhhhNubj)}(hN`AI Engine Run Time `_h]j)}(hjh](j)}(hjh]hAI Engine Run Time}(hjhhhNhNubah}(h]h ]h"]h$]h&]nameAI Engine Run Timej#j uh1jhjubjU)}(h9 h]h}(h]ai-engine-run-timeah ]h"]ai engine run timeah$]h&]refurij uh1jTjcKhjubeh}(h]h ]h"]h$]h&]uh1jhhhMhjubah}(h]h ]h"]h$]h&]uh1jhj;hhhhhNubeh}(h]h ]h"]h$]h&]j%-uh1jhhhMhj*hhubeh}(h] referencesah ]h"] referencesah$]h&]uh1j[hj]hhhhhMubeh}(h]amd-npuah ]h"]amd npuah$]h&]uh1j[hhhhhhhKubeh}(h]h ]h"]h$]h&]sourcehuh1hcurrent_sourceN current_lineNsettingsdocutils.frontendValues)}(j`N generatorN datestampN source_linkN source_urlN toc_backlinksentryfootnote_backlinksK sectnum_xformKstrip_commentsNstrip_elements_with_classesN strip_classesN report_levelK halt_levelKexit_status_levelKdebugNwarning_streamN tracebackinput_encoding utf-8-siginput_encoding_error_handlerstrictoutput_encodingutf-8output_encoding_error_handlerjgerror_encodingutf-8error_encoding_error_handlerbackslashreplace language_codeenrecord_dependenciesNconfigN id_prefixhauto_id_prefixid dump_settingsNdump_internalsNdump_transformsNdump_pseudo_xmlNexpose_internalsNstrict_visitorN_disable_configN_sourcehnj _destinationN _config_files]7/var/lib/git/docbuild/linux/Documentation/docutils.confafile_insertion_enabled raw_enabledKline_length_limitM'pep_referencesN pep_base_urlhttps://peps.python.org/pep_file_url_templatepep-%04drfc_referencesN rfc_base_url&https://datatracker.ietf.org/doc/html/ tab_widthKtrim_footnote_reference_spacesyntax_highlightlong smart_quotessmartquotes_locales]character_level_inline_markupdoctitle_xform docinfo_xformKsectsubtitle_xform image_loadinglinkembed_stylesheetcloak_email_addressessection_self_linkenvNubreporterNindirect_targets]substitution_defs}(hhjhjjj jj/j#j>j2jMjAj\jPjkj_jzjnjj}jjjjjjjjjjjjjjjjjjjjj.j"j=j1jLj@j[jOjjj^jyjmjj|jjjjjjjjjjjjjjjjjjjjj-j!j<j0jKj?jZjNjij]jxjljj{jjjjjjjjjjjjjjjjjjjjj,j j;j/jJj>jYjMjhj\jwjkjjzjjjjjjjjjjjjjjjjj jjjj+jj:j.jIj=jXjLusubstitution_names}(amphaposjastjbrvbarj bsolj/centj>colonjMcommaj\commatjkcopyjzcurrenjdarrjdegjdividejdollarjequalsjexcljfrac12jfrac14jfrac18jfrac34jfrac38j.frac58j=frac78jLgtj[halfjjhorbarjyhyphenjiexcljiquestjlaquojlarrjlcubjldquojlowbarjlparjlsqbjlsquojltj-microj<middotjKnbspjZnotjinumjxohmjordfjordmjparajpercntjperiodjplusjplusmnjpoundjquestjquotjraquoj,rarrj;rcubjJrdquojYregjhrparjwrsqbjrsquojsectjsemijshyjsoljsungjsup1jsup2jsup3j timesjtradej+uarrj:verbarjIyenjXurefnames}(amd xdna architecture]jaamd ai engine technology]ja6versal adaptive soc aie-ml architecture manual (am020)]jv aai engine run time]j aurefids}nameids}(jAj>jIjFjjjjjjj>j;jsjpjrjojjjC j@ j; j8 j j j j jb j_ j j j0 j- jW jT j j j j j j js jp j j j'j$j9j6j_j\jjjjjjjju nametypes}(jAjIjjjj>jsjrjjC j; j j jb j j0 jW j j j js j j'j9j_jjjjuh}(j>j]jFjjjLjjkjjj;jjpjAjojvjjuj@ jj8 j j jF j j j_ j, j je j- j jT j3 j jZ j jk j j jp j j jv j$j j6j*j\jVjjjjjjjju footnote_refs} citation_refs} autofootnotes]autofootnote_refs]symbol_footnotes]symbol_footnote_refs] footnotes] citations]autofootnote_startKsymbol_footnote_startK id_counter collectionsCounter}Rparse_messages]transform_messages] transformerN include_log]&Documentation/accel/amdxdna/amdnpu.rst(NNNNta decorationNhhub.