sphinx.addnodesdocument)}( rawsourcechildren]( translations LanguagesNode)}(hhh](h pending_xref)}(hhh]docutils.nodesTextChinese (Simplified)}parenthsba attributes}(ids]classes]names]dupnames]backrefs] refdomainstdreftypedoc reftarget(/translations/zh_CN/accel/amdxdna/amdnpumodnameN classnameN refexplicitutagnamehhh ubh)}(hhh]hChinese (Traditional)}hh2sbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/zh_TW/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hItalian}hhFsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/it_IT/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hJapanese}hhZsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/ja_JP/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hKorean}hhnsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/ko_KR/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubh)}(hhh]hSpanish}hhsbah}(h]h ]h"]h$]h&] refdomainh)reftypeh+ reftarget(/translations/sp_SP/accel/amdxdna/amdnpumodnameN classnameN refexplicituh1hhh ubeh}(h]h ]h"]h$]h&]current_languageEnglishuh1h hh _documenthsourceNlineNubhcomment)}(h%SPDX-License-Identifier: GPL-2.0-onlyh]h%SPDX-License-Identifier: GPL-2.0-only}hhsbah}(h]h ]h"]h$]h&] xml:spacepreserveuh1hhhhhhB/var/lib/git/docbuild/linux/Documentation/accel/amdxdna/amdnpu.rsthKubh)}(h4This data file has been placed in the public domain.h]h4This data file has been placed in the public domain.}hhsbah}(h]h ]h"]h$]h&]hhuh1hhhhhho/srv/docbuild/lib/venvs/build-kernel-docs/lib64/python3.9/site-packages/docutils/parsers/rst/include/isonum.txthKubh)}(hDerived from the Unicode character mappings available from . Processed by unicode2rstsubs.py, part of Docutils: .h]hDerived from the Unicode character mappings available from . Processed by unicode2rstsubs.py, part of Docutils: .}hhsbah}(h]h ]h"]h$]h&]hhuh1hhhhhhhhKubhsubstitution_definition)}(h*.. |amp| unicode:: U+00026 .. AMPERSANDh]h&}hhsbah}(h]h ]h"]ampah$]h&]uh1hhhhKhhhhubh)}(h+.. |apos| unicode:: U+00027 .. APOSTROPHEh]h'}hhsbah}(h]h ]h"]aposah$]h&]uh1hhhhKhhhhubh)}(h).. |ast| unicode:: U+0002A .. ASTERISKh]h*}hhsbah}(h]h ]h"]astah$]h&]uh1hhhhK hhhhubh)}(h+.. |brvbar| unicode:: U+000A6 .. BROKEN BARh]h¦}hjsbah}(h]h ]h"]brvbarah$]h&]uh1hhhhK hhhhubh)}(h0.. |bsol| unicode:: U+0005C .. REVERSE SOLIDUSh]h\}hjsbah}(h]h ]h"]bsolah$]h&]uh1hhhhK hhhhubh)}(h*.. |cent| unicode:: U+000A2 .. CENT SIGNh]h¢}hjsbah}(h]h ]h"]centah$]h&]uh1hhhhK hhhhubh)}(h&.. |colon| unicode:: U+0003A .. COLONh]h:}hj-sbah}(h]h ]h"]colonah$]h&]uh1hhhhK hhhhubh)}(h&.. |comma| unicode:: U+0002C .. COMMAh]h,}hj<sbah}(h]h ]h"]commaah$]h&]uh1hhhhKhhhhubh)}(h... |commat| unicode:: U+00040 .. COMMERCIAL ATh]h@}hjKsbah}(h]h ]h"]commatah$]h&]uh1hhhhKhhhhubh)}(h/.. |copy| unicode:: U+000A9 .. COPYRIGHT SIGNh]h©}hjZsbah}(h]h ]h"]copyah$]h&]uh1hhhhKhhhhubh)}(h... |curren| unicode:: U+000A4 .. CURRENCY SIGNh]h¤}hjisbah}(h]h ]h"]currenah$]h&]uh1hhhhKhhhhubh)}(h0.. |darr| unicode:: U+02193 .. DOWNWARDS ARROWh]h↓}hjxsbah}(h]h ]h"]darrah$]h&]uh1hhhhKhhhhubh)}(h,.. |deg| unicode:: U+000B0 .. DEGREE SIGNh]h°}hjsbah}(h]h ]h"]degah$]h&]uh1hhhhKhhhhubh)}(h... |divide| unicode:: U+000F7 .. DIVISION SIGNh]h÷}hjsbah}(h]h ]h"]divideah$]h&]uh1hhhhKhhhhubh)}(h,.. |dollar| unicode:: U+00024 .. DOLLAR SIGNh]h$}hjsbah}(h]h ]h"]dollarah$]h&]uh1hhhhKhhhhubh)}(h,.. |equals| unicode:: U+0003D .. EQUALS SIGNh]h=}hjsbah}(h]h ]h"]equalsah$]h&]uh1hhhhKhhhhubh)}(h1.. |excl| unicode:: U+00021 .. EXCLAMATION MARKh]h!}hjsbah}(h]h ]h"]exclah$]h&]uh1hhhhKhhhhubh)}(h9.. |frac12| unicode:: U+000BD .. VULGAR FRACTION ONE HALFh]h½}hjsbah}(h]h ]h"]frac12ah$]h&]uh1hhhhKhhhhubh)}(h<.. |frac14| unicode:: U+000BC .. VULGAR FRACTION ONE QUARTERh]h¼}hjsbah}(h]h ]h"]frac14ah$]h&]uh1hhhhKhhhhubh)}(h;.. |frac18| unicode:: U+0215B .. VULGAR FRACTION ONE EIGHTHh]h⅛}hjsbah}(h]h ]h"]frac18ah$]h&]uh1hhhhKhhhhubh)}(h?.. |frac34| unicode:: U+000BE .. VULGAR FRACTION THREE QUARTERSh]h¾}hjsbah}(h]h ]h"]frac34ah$]h&]uh1hhhhKhhhhubh)}(h>.. |frac38| unicode:: U+0215C .. VULGAR FRACTION THREE EIGHTHSh]h⅜}hjsbah}(h]h ]h"]frac38ah$]h&]uh1hhhhKhhhhubh)}(h=.. |frac58| unicode:: U+0215D .. VULGAR FRACTION FIVE EIGHTHSh]h⅝}hjsbah}(h]h ]h"]frac58ah$]h&]uh1hhhhKhhhhubh)}(h>.. |frac78| unicode:: U+0215E .. VULGAR FRACTION SEVEN EIGHTHSh]h⅞}hj,sbah}(h]h ]h"]frac78ah$]h&]uh1hhhhKhhhhubh)}(h2.. |gt| unicode:: U+0003E .. GREATER-THAN SIGNh]h>}hj;sbah}(h]h ]h"]gtah$]h&]uh1hhhhKhhhhubh)}(h9.. |half| unicode:: U+000BD .. VULGAR FRACTION ONE HALFh]h½}hjJsbah}(h]h ]h"]halfah$]h&]uh1hhhhK hhhhubh)}(h/.. |horbar| unicode:: U+02015 .. HORIZONTAL BARh]h―}hjYsbah}(h]h ]h"]horbarah$]h&]uh1hhhhK!hhhhubh)}(h'.. |hyphen| unicode:: U+02010 .. HYPHENh]h‐}hjhsbah}(h]h ]h"]hyphenah$]h&]uh1hhhhK"hhhhubh)}(h:.. |iexcl| unicode:: U+000A1 .. INVERTED EXCLAMATION MARKh]h¡}hjwsbah}(h]h ]h"]iexclah$]h&]uh1hhhhK#hhhhubh)}(h7.. |iquest| unicode:: U+000BF .. INVERTED QUESTION MARKh]h¿}hjsbah}(h]h ]h"]iquestah$]h&]uh1hhhhK$hhhhubh)}(hJ.. |laquo| unicode:: U+000AB .. LEFT-POINTING DOUBLE ANGLE QUOTATION MARKh]h«}hjsbah}(h]h ]h"]laquoah$]h&]uh1hhhhK%hhhhubh)}(h0.. |larr| unicode:: U+02190 .. LEFTWARDS ARROWh]h←}hjsbah}(h]h ]h"]larrah$]h&]uh1hhhhK&hhhhubh)}(h3.. |lcub| unicode:: U+0007B .. LEFT CURLY BRACKETh]h{}hjsbah}(h]h ]h"]lcubah$]h&]uh1hhhhK'hhhhubh)}(h;.. |ldquo| unicode:: U+0201C .. LEFT DOUBLE QUOTATION MARKh]h“}hjsbah}(h]h ]h"]ldquoah$]h&]uh1hhhhK(hhhhubh)}(h).. |lowbar| unicode:: U+0005F .. LOW LINEh]h_}hjsbah}(h]h ]h"]lowbarah$]h&]uh1hhhhK)hhhhubh)}(h1.. |lpar| unicode:: U+00028 .. LEFT PARENTHESISh]h(}hjsbah}(h]h ]h"]lparah$]h&]uh1hhhhK*hhhhubh)}(h4.. |lsqb| unicode:: U+0005B .. LEFT SQUARE BRACKETh]h[}hjsbah}(h]h ]h"]lsqbah$]h&]uh1hhhhK+hhhhubh)}(h;.. |lsquo| unicode:: U+02018 .. LEFT SINGLE QUOTATION MARKh]h‘}hjsbah}(h]h ]h"]lsquoah$]h&]uh1hhhhK,hhhhubh)}(h/.. |lt| unicode:: U+0003C .. LESS-THAN SIGNh]h<}hj sbah}(h]h ]h"]ltah$]h&]uh1hhhhK-hhhhubh)}(h+.. |micro| unicode:: U+000B5 .. MICRO SIGNh]hµ}hjsbah}(h]h ]h"]microah$]h&]uh1hhhhK.hhhhubh)}(h+.. |middot| unicode:: U+000B7 .. MIDDLE DOTh]h·}hj+sbah}(h]h ]h"]middotah$]h&]uh1hhhhK/hhhhubh)}(h/.. |nbsp| unicode:: U+000A0 .. NO-BREAK SPACEh]h }hj:sbah}(h]h ]h"]nbspah$]h&]uh1hhhhK0hhhhubh)}(h).. |not| unicode:: U+000AC .. NOT SIGNh]h¬}hjIsbah}(h]h ]h"]notah$]h&]uh1hhhhK1hhhhubh)}(h,.. |num| unicode:: U+00023 .. NUMBER SIGNh]h#}hjXsbah}(h]h ]h"]numah$]h&]uh1hhhhK2hhhhubh)}(h).. |ohm| unicode:: U+02126 .. OHM SIGNh]hΩ}hjgsbah}(h]h ]h"]ohmah$]h&]uh1hhhhK3hhhhubh)}(h;.. |ordf| unicode:: U+000AA .. FEMININE ORDINAL INDICATORh]hª}hjvsbah}(h]h ]h"]ordfah$]h&]uh1hhhhK4hhhhubh)}(h<.. |ordm| unicode:: U+000BA .. MASCULINE ORDINAL INDICATORh]hº}hjsbah}(h]h ]h"]ordmah$]h&]uh1hhhhK5hhhhubh)}(h-.. |para| unicode:: U+000B6 .. PILCROW SIGNh]h¶}hjsbah}(h]h ]h"]paraah$]h&]uh1hhhhK6hhhhubh)}(h-.. |percnt| unicode:: U+00025 .. PERCENT SIGNh]h%}hjsbah}(h]h ]h"]percntah$]h&]uh1hhhhK7hhhhubh)}(h*.. |period| unicode:: U+0002E .. FULL STOPh]h.}hjsbah}(h]h ]h"]periodah$]h&]uh1hhhhK8hhhhubh)}(h*.. |plus| unicode:: U+0002B .. PLUS SIGNh]h+}hjsbah}(h]h ]h"]plusah$]h&]uh1hhhhK9hhhhubh)}(h0.. |plusmn| unicode:: U+000B1 .. PLUS-MINUS SIGNh]h±}hjsbah}(h]h ]h"]plusmnah$]h&]uh1hhhhK:hhhhubh)}(h+.. |pound| unicode:: U+000A3 .. POUND SIGNh]h£}hjsbah}(h]h ]h"]poundah$]h&]uh1hhhhK;hhhhubh)}(h... |quest| unicode:: U+0003F .. QUESTION MARKh]h?}hjsbah}(h]h ]h"]questah$]h&]uh1hhhhKhhhhubh)}(h1.. |rarr| unicode:: U+02192 .. RIGHTWARDS ARROWh]h→}hjsbah}(h]h ]h"]rarrah$]h&]uh1hhhhK?hhhhubh)}(h4.. |rcub| unicode:: U+0007D .. RIGHT CURLY BRACKETh]h}}hj*sbah}(h]h ]h"]rcubah$]h&]uh1hhhhK@hhhhubh)}(h<.. |rdquo| unicode:: U+0201D .. RIGHT DOUBLE QUOTATION MARKh]h”}hj9sbah}(h]h ]h"]rdquoah$]h&]uh1hhhhKAhhhhubh)}(h0.. |reg| unicode:: U+000AE .. REGISTERED SIGNh]h®}hjHsbah}(h]h ]h"]regah$]h&]uh1hhhhKBhhhhubh)}(h2.. |rpar| unicode:: U+00029 .. RIGHT PARENTHESISh]h)}hjWsbah}(h]h ]h"]rparah$]h&]uh1hhhhKChhhhubh)}(h5.. |rsqb| unicode:: U+0005D .. RIGHT SQUARE BRACKETh]h]}hjfsbah}(h]h ]h"]rsqbah$]h&]uh1hhhhKDhhhhubh)}(h<.. |rsquo| unicode:: U+02019 .. RIGHT SINGLE QUOTATION MARKh]h’}hjusbah}(h]h ]h"]rsquoah$]h&]uh1hhhhKEhhhhubh)}(h-.. |sect| unicode:: U+000A7 .. SECTION SIGNh]h§}hjsbah}(h]h ]h"]sectah$]h&]uh1hhhhKFhhhhubh)}(h*.. |semi| unicode:: U+0003B .. SEMICOLONh]h;}hjsbah}(h]h ]h"]semiah$]h&]uh1hhhhKGhhhhubh)}(h,.. |shy| unicode:: U+000AD .. SOFT HYPHENh]h­}hjsbah}(h]h ]h"]shyah$]h&]uh1hhhhKHhhhhubh)}(h(.. |sol| unicode:: U+0002F .. SOLIDUSh]h/}hjsbah}(h]h ]h"]solah$]h&]uh1hhhhKIhhhhubh)}(h,.. |sung| unicode:: U+0266A .. EIGHTH NOTEh]h♪}hjsbah}(h]h ]h"]sungah$]h&]uh1hhhhKJhhhhubh)}(h0.. |sup1| unicode:: U+000B9 .. SUPERSCRIPT ONEh]h¹}hjsbah}(h]h ]h"]sup1ah$]h&]uh1hhhhKKhhhhubh)}(h0.. |sup2| unicode:: U+000B2 .. SUPERSCRIPT TWOh]h²}hjsbah}(h]h ]h"]sup2ah$]h&]uh1hhhhKLhhhhubh)}(h2.. |sup3| unicode:: U+000B3 .. SUPERSCRIPT THREEh]h³}hjsbah}(h]h ]h"]sup3ah$]h&]uh1hhhhKMhhhhubh)}(h4.. |times| unicode:: U+000D7 .. MULTIPLICATION SIGNh]h×}hjsbah}(h]h ]h"]timesah$]h&]uh1hhhhKNhhhhubh)}(h0.. |trade| unicode:: U+02122 .. TRADE MARK SIGNh]h™}hj sbah}(h]h ]h"]tradeah$]h&]uh1hhhhKOhhhhubh)}(h... |uarr| unicode:: U+02191 .. UPWARDS ARROWh]h↑}hjsbah}(h]h ]h"]uarrah$]h&]uh1hhhhKPhhhhubh)}(h... |verbar| unicode:: U+0007C .. VERTICAL LINEh]h|}hj)sbah}(h]h ]h"]verbarah$]h&]uh1hhhhKQhhhhubh)}(h*.. |yen| unicode:: U+000A5 .. YEN SIGN h]h¥}hj8sbah}(h]h ]h"]yenah$]h&]uh1hhhhKRhhhhubhsection)}(hhh](htitle)}(hAMD NPUh]hAMD NPU}(hjNhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjIhhhhhKubh field_list)}(hhh](hfield)}(hhh](h field_name)}(h Copyrighth]h Copyright}(hjhhhhNhNubah}(h]h ]h"]h$]h&]uh1jfhjchhhKubh field_body)}(h(|copy| 2024 Advanced Micro Devices, Inc.h]h paragraph)}(hjzh](h©}(hj~hhhNhNubh" 2024 Advanced Micro Devices, Inc.}(hj~hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhK hjxubah}(h]h ]h"]h$]h&]uh1jvhjcubeh}(h]h ]h"]h$]h&]uh1jahhhK hj^hhubjb)}(hhh](jg)}(hAuthorh]hAuthor}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jfhjhhhKubjw)}(h$Sonal Santan h]j})}(h#Sonal Santan h](hSonal Santan <}(hjhhhNhNubh reference)}(hsonal.santan@amd.comh]hsonal.santan@amd.com}(hjhhhNhNubah}(h]h ]h"]h$]h&]refurimailto:sonal.santan@amd.comuh1jhjubh>}(hjhhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhK hjubah}(h]h ]h"]h$]h&]uh1jvhjubeh}(h]h ]h"]h$]h&]uh1jahhhK hj^hhubeh}(h]h ]h"]h$]h&]uh1j\hjIhhhhhK ubjH)}(hhh](jM)}(hOverviewh]hOverview}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjhhhhhK ubj})}(hXAMD NPU (Neural Processing Unit) is a multi-user AI inference accelerator integrated into AMD client APU. NPU enables efficient execution of Machine Learning applications like CNN, LLM, etc. NPU is based on `AMD XDNA Architecture`_. NPU is managed by **amdxdna** driver.h](hAMD NPU (Neural Processing Unit) is a multi-user AI inference accelerator integrated into AMD client APU. NPU enables efficient execution of Machine Learning applications like CNN, LLM, etc. NPU is based on }(hjhhhNhNubj)}(h`AMD XDNA Architecture`_h]hAMD XDNA Architecture}(hjhhhNhNubah}(h]h ]h"]h$]h&]nameAMD XDNA Architecturerefuri-https://www.amd.com/en/technologies/xdna.htmluh1jhjresolvedKubh. NPU is managed by }(hjhhhNhNubhstrong)}(h **amdxdna**h]hamdxdna}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhjubh driver.}(hjhhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhjhhubeh}(h]overviewah ]h"]overviewah$]h&]uh1jGhjIhhhhhK ubjH)}(hhh](jM)}(hHardware Descriptionh]hHardware Description}(hj;hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj8hhhhhKubj})}(h6AMD NPU consists of the following hardware components:h]h6AMD NPU consists of the following hardware components:}(hjIhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhj8hhubjH)}(hhh](jM)}(hAMD XDNA Arrayh]hAMD XDNA Array}(hjZhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjWhhhhhKubj})}(hXAMD XDNA Array comprises of 2D array of compute and memory tiles built with `AMD AI Engine Technology`_. Each column has 4 rows of compute tiles and 1 row of memory tile. Each compute tile contains a VLIW processor with its own dedicated program and data memory. The memory tile acts as L2 memory. The 2D array can be partitioned at a column boundary creating a spatially isolated partition which can be bound to a workload context.h](hLAMD XDNA Array comprises of 2D array of compute and memory tiles built with }(hjhhhhNhNubj)}(h`AMD AI Engine Technology`_h]hAMD AI Engine Technology}(hjphhhNhNubah}(h]h ]h"]h$]h&]nameAMD AI Engine Technologyj9https://www.xilinx.com/products/technology/ai-engine.htmluh1jhjhjKubhXI. Each column has 4 rows of compute tiles and 1 row of memory tile. Each compute tile contains a VLIW processor with its own dedicated program and data memory. The memory tile acts as L2 memory. The 2D array can be partitioned at a column boundary creating a spatially isolated partition which can be bound to a workload context.}(hjhhhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhjWhhubj})}(hYEach column also has dedicated DMA engines to move data between host DDR and memory tile.h]hYEach column also has dedicated DMA engines to move data between host DDR and memory tile.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK$hjWhhubj})}(hAMD Phoenix and AMD Hawk Point client NPU have a 4x5 topology, i.e., 4 rows of compute tiles arranged into 5 columns. AMD Strix Point client APU have 4x8 topology, i.e., 4 rows of compute tiles arranged into 8 columns.h]hAMD Phoenix and AMD Hawk Point client NPU have a 4x5 topology, i.e., 4 rows of compute tiles arranged into 5 columns. AMD Strix Point client APU have 4x8 topology, i.e., 4 rows of compute tiles arranged into 8 columns.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK'hjWhhubeh}(h]amd-xdna-arrayah ]h"]amd xdna arrayah$]h&]uh1jGhj8hhhhhKubjH)}(hhh](jM)}(hShared L2 Memoryh]hShared L2 Memory}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjhhhhhK,ubj})}(hXThe single row of memory tiles create a pool of software managed on chip L2 memory. DMA engines are used to move data between host DDR and memory tiles. AMD Phoenix and AMD Hawk Point NPUs have a total of 2560 KB of L2 memory. AMD Strix Point NPU has a total of 4096 KB of L2 memory.h]hXThe single row of memory tiles create a pool of software managed on chip L2 memory. DMA engines are used to move data between host DDR and memory tiles. AMD Phoenix and AMD Hawk Point NPUs have a total of 2560 KB of L2 memory. AMD Strix Point NPU has a total of 4096 KB of L2 memory.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK.hjhhubeh}(h]shared-l2-memoryah ]h"]shared l2 memoryah$]h&]uh1jGhj8hhhhhK,ubjH)}(hhh](jM)}(hMicrocontrollerh]hMicrocontroller}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjhhhhhK4ubj})}(hA microcontroller runs NPU Firmware which is responsible for command processing, XDNA Array partition setup, XDNA Array configuration, workload context management and workload orchestration.h]hA microcontroller runs NPU Firmware which is responsible for command processing, XDNA Array partition setup, XDNA Array configuration, workload context management and workload orchestration.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK6hjhhubj})}(hNPU Firmware uses a dedicated instance of an isolated non-privileged context called ERT to service each workload context. ERT is also used to execute user provided ``ctrlcode`` associated with the workload context.h](hNPU Firmware uses a dedicated instance of an isolated non-privileged context called ERT to service each workload context. ERT is also used to execute user provided }(hjhhhNhNubhliteral)}(h ``ctrlcode``h]hctrlcode}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhjubh& associated with the workload context.}(hjhhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhK:hjhhubj})}(hzNPU Firmware uses a single isolated privileged context called MERT to service management commands from the amdxdna driver.h]hzNPU Firmware uses a single isolated privileged context called MERT to service management commands from the amdxdna driver.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK>hjhhubeh}(h]microcontrollerah ]h"]microcontrollerah$]h&]uh1jGhj8hhhhhK4ubjH)}(hhh](jM)}(h Mailboxesh]h Mailboxes}(hj0hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj-hhhhhKBubj})}(hX-The microcontroller and amdxdna driver use a privileged channel for management tasks like setting up of contexts, telemetry, query, error handling, setting up user channel, etc. As mentioned before, privileged channel requests are serviced by MERT. The privileged channel is bound to a single mailbox.h]hX-The microcontroller and amdxdna driver use a privileged channel for management tasks like setting up of contexts, telemetry, query, error handling, setting up user channel, etc. As mentioned before, privileged channel requests are serviced by MERT. The privileged channel is bound to a single mailbox.}(hj>hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKDhj-hhubj})}(hX&The microcontroller and amdxdna driver use a dedicated user channel per workload context. The user channel is primarily used for submitting work to the NPU. As mentioned before, a user channel requests are serviced by an instance of ERT. Each user channel is bound to its own dedicated mailbox.h]hX&The microcontroller and amdxdna driver use a dedicated user channel per workload context. The user channel is primarily used for submitting work to the NPU. As mentioned before, a user channel requests are serviced by an instance of ERT. Each user channel is bound to its own dedicated mailbox.}(hjLhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKIhj-hhubeh}(h] mailboxesah ]h"] mailboxesah$]h&]uh1jGhj8hhhhhKBubjH)}(hhh](jM)}(hPCIe EPh]hPCIe EP}(hjehhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjbhhhhhKOubj})}(hX3NPU is visible to the x86 host CPU as a PCIe device with multiple BARs and some MSI-X interrupt vectors. NPU uses a dedicated high bandwidth SoC level fabric for reading or writing into host memory. Each instance of ERT gets its own dedicated MSI-X interrupt. MERT gets a single instance of MSI-X interrupt.h]hX3NPU is visible to the x86 host CPU as a PCIe device with multiple BARs and some MSI-X interrupt vectors. NPU uses a dedicated high bandwidth SoC level fabric for reading or writing into host memory. Each instance of ERT gets its own dedicated MSI-X interrupt. MERT gets a single instance of MSI-X interrupt.}(hjshhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKQhjbhhubj})}(hThe number of PCIe BARs varies depending on the specific device. Based on their functions, PCIe BARs can generally be categorized into the following types.h]hThe number of PCIe BARs varies depending on the specific device. Based on their functions, PCIe BARs can generally be categorized into the following types.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKVhjbhhubh bullet_list)}(hhh](h list_item)}(hBPSP BAR: Expose the AMD PSP (Platform Security Processor) functionh]j})}(hjh]hBPSP BAR: Expose the AMD PSP (Platform Security Processor) function}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKYhjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubj)}(h=SMU BAR: Expose the AMD SMU (System Management Unit) functionh]j})}(hjh]h=SMU BAR: Expose the AMD SMU (System Management Unit) function}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKZhjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubj)}(h-SRAM BAR: Expose ring buffers for the mailboxh]j})}(hjh]h-SRAM BAR: Expose ring buffers for the mailbox}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK[hjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubj)}(hUMailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.)h]j})}(hUMailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.)h]hUMailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.)}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK\hjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubj)}(h-Public Register BAR: Expose public registers h]j})}(h,Public Register BAR: Expose public registersh]h,Public Register BAR: Expose public registers}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK^hjubah}(h]h ]h"]h$]h&]uh1jhjhhhhhNubeh}(h]h ]h"]h$]h&]bullet*uh1jhhhKYhjbhhubj})}(hOn specific devices, the above-mentioned BAR type might be combined into a single physical PCIe BAR. Or a module might require two physical PCIe BARs to be fully functional. For example,h]hOn specific devices, the above-mentioned BAR type might be combined into a single physical PCIe BAR. Or a module might require two physical PCIe BARs to be fully functional. For example,}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhK`hjbhhubj)}(hhh](j)}(hNOn AMD Phoenix device, PSP, SMU, Public Register BARs are on PCIe BAR index 0.h]j})}(hj&h]hNOn AMD Phoenix device, PSP, SMU, Public Register BARs are on PCIe BAR index 0.}(hj(hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKdhj$ubah}(h]h ]h"]h$]h&]uh1jhj!hhhhhNubj)}(hOn AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0. The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR). h]j})}(hOn AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0. The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR).h]hOn AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0. The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR).}(hj?hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKehj;ubah}(h]h ]h"]h$]h&]uh1jhj!hhhhhNubeh}(h]h ]h"]h$]h&]jjuh1jhhhKdhjbhhubeh}(h]pcie-epah ]h"]pcie epah$]h&]uh1jGhj8hhhhhKOubjH)}(hhh](jM)}(hProcess Isolation Hardwareh]hProcess Isolation Hardware}(hjdhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjahhhhhKjubj})}(hXAs explained before, XDNA Array can be dynamically divided into isolated spatial partitions, each of which may have one or more columns. The spatial partition is setup by programming the column isolation registers by the microcontroller. Each spatial partition is associated with a PASID which is also programmed by the microcontroller. Hence multiple spatial partitions in the NPU can make concurrent host access protected by PASID.h]hXAs explained before, XDNA Array can be dynamically divided into isolated spatial partitions, each of which may have one or more columns. The spatial partition is setup by programming the column isolation registers by the microcontroller. Each spatial partition is associated with a PASID which is also programmed by the microcontroller. Hence multiple spatial partitions in the NPU can make concurrent host access protected by PASID.}(hjrhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKlhjahhubj})}(hyThe NPU FW itself uses microcontroller MMU enforced isolated contexts for servicing user and privileged channel requests.h]hyThe NPU FW itself uses microcontroller MMU enforced isolated contexts for servicing user and privileged channel requests.}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKshjahhubeh}(h]process-isolation-hardwareah ]h"]process isolation hardwareah$]h&]uh1jGhj8hhhhhKjubeh}(h]hardware-descriptionah ]h"]hardware descriptionah$]h&]uh1jGhjIhhhhhKubjH)}(hhh](jM)}(h%Mixed Spatial and Temporal Schedulingh]h%Mixed Spatial and Temporal Scheduling}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjhhhhhKxubj})}(hXAMD XDNA architecture supports mixed spatial and temporal (time sharing) scheduling of 2D array. This means that spatial partitions may be setup and torn down dynamically to accommodate various workloads. A *spatial* partition may be *exclusively* bound to one workload context while another partition may be *temporarily* bound to more than one workload contexts. The microcontroller updates the PASID for a temporarily shared partition to match the context that has been bound to the partition at any moment.h](hAMD XDNA architecture supports mixed spatial and temporal (time sharing) scheduling of 2D array. This means that spatial partitions may be setup and torn down dynamically to accommodate various workloads. A }(hjhhhNhNubhemphasis)}(h *spatial*h]hspatial}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhjubh partition may be }(hjhhhNhNubj)}(h *exclusively*h]h exclusively}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhjubh> bound to one workload context while another partition may be }(hjhhhNhNubj)}(h *temporarily*h]h temporarily}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jhjubh bound to more than one workload contexts. The microcontroller updates the PASID for a temporarily shared partition to match the context that has been bound to the partition at any moment.}(hjhhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKzhjhhubjH)}(hhh](jM)}(hResource Solverh]hResource Solver}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjhhhhhKubj})}(hXThe Resource Solver component of the amdxdna driver manages the allocation of 2D array among various workloads. Every workload describes the number of columns required to run the NPU binary in its metadata. The Resource Solver component uses hints passed by the workload and its own heuristics to decide 2D array (re)partition strategy and mapping of workloads for spatial and temporal sharing of columns. The FW enforces the context-to-column(s) resource binding decisions made by the Resource Solver.h]hXThe Resource Solver component of the amdxdna driver manages the allocation of 2D array among various workloads. Every workload describes the number of columns required to run the NPU binary in its metadata. The Resource Solver component uses hints passed by the workload and its own heuristics to decide 2D array (re)partition strategy and mapping of workloads for spatial and temporal sharing of columns. The FW enforces the context-to-column(s) resource binding decisions made by the Resource Solver.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhjhhubj})}(hAMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent workload contexts. AMD Strix Point can support 16 concurrent workload contexts.h]hAMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent workload contexts. AMD Strix Point can support 16 concurrent workload contexts.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhjhhubeh}(h]resource-solverah ]h"]resource solverah$]h&]uh1jGhjhhhhhKubeh}(h]%mixed-spatial-and-temporal-schedulingah ]h"]%mixed spatial and temporal schedulingah$]h&]uh1jGhjIhhhhhKxubjH)}(hhh](jM)}(hApplication Binariesh]hApplication Binaries}(hj5 hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj2 hhhhhKubj})}(hiA NPU application workload is comprised of two separate binaries which are generated by the NPU compiler.h]hiA NPU application workload is comprised of two separate binaries which are generated by the NPU compiler.}(hjC hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhj2 hhubhenumerated_list)}(hhh](j)}(hXzAMD XDNA Array overlay, which is used to configure a NPU spatial partition. The overlay contains instructions for setting up the stream switch configuration and ELF for the compute tiles. The overlay is loaded on the spatial partition bound to the workload by the associated ERT instance. Refer to the `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details. h]j})}(hXyAMD XDNA Array overlay, which is used to configure a NPU spatial partition. The overlay contains instructions for setting up the stream switch configuration and ELF for the compute tiles. The overlay is loaded on the spatial partition bound to the workload by the associated ERT instance. Refer to the `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details.h](hX.AMD XDNA Array overlay, which is used to configure a NPU spatial partition. The overlay contains instructions for setting up the stream switch configuration and ELF for the compute tiles. The overlay is loaded on the spatial partition bound to the workload by the associated ERT instance. Refer to the }(hjZ hhhNhNubj)}(h9`Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_h]h6Versal Adaptive SoC AIE-ML Architecture Manual (AM020)}(hjb hhhNhNubah}(h]h ]h"]h$]h&]name6Versal Adaptive SoC AIE-ML Architecture Manual (AM020)j0https://docs.amd.com/r/en-US/am020-versal-aie-mluh1jhjZ jKubh for more details.}(hjZ hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhjV ubah}(h]h ]h"]h$]h&]uh1jhjS hhhhhNubj)}(hXH``ctrlcode``, used for orchestrating the overlay loaded on the spatial partition. ``ctrlcode`` is executed by the ERT running in protected mode on the microcontroller in the context of the workload. ``ctrlcode`` is made up of a sequence of opcodes named ``XAie_TxnOpcode``. Refer to the `AI Engine Run Time`_ for more details. h]j})}(hXF``ctrlcode``, used for orchestrating the overlay loaded on the spatial partition. ``ctrlcode`` is executed by the ERT running in protected mode on the microcontroller in the context of the workload. ``ctrlcode`` is made up of a sequence of opcodes named ``XAie_TxnOpcode``. Refer to the `AI Engine Run Time`_ for more details.h](j)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubhF, used for orchestrating the overlay loaded on the spatial partition. }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubhi is executed by the ERT running in protected mode on the microcontroller in the context of the workload. }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh+ is made up of a sequence of opcodes named }(hj hhhNhNubj)}(h``XAie_TxnOpcode``h]hXAie_TxnOpcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh. Refer to the }(hj hhhNhNubj)}(h`AI Engine Run Time`_h]hAI Engine Run Time}(hj hhhNhNubah}(h]h ]h"]h$]h&]nameAI Engine Run Timej6https://github.com/Xilinx/aie-rt/tree/release/main_aiguh1jhj jKubh for more details.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhjS hhhhhNubeh}(h]h ]h"]h$]h&]enumtypearabicprefixhsuffix.uh1jQ hj2 hhhhhKubeh}(h]application-binariesah ]h"]application binariesah$]h&]uh1jGhjIhhhhhKubjH)}(hhh](jM)}(hSpecial Host Buffersh]hSpecial Host Buffers}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj hhhhhKubjH)}(hhh](jM)}(hPer-context Instruction Bufferh]hPer-context Instruction Buffer}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj hhhhhKubj})}(hXtEvery workload context uses a host resident 64 MB buffer which is memory mapped into the ERT instance created to service the workload. The ``ctrlcode`` used by the workload is copied into this special memory. This buffer is protected by PASID like all other input/output buffers used by that workload. Instruction buffer is also mapped into the user space of the workload.h](hEvery workload context uses a host resident 64 MB buffer which is memory mapped into the ERT instance created to service the workload. The }(hj) hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj1 hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj) ubh used by the workload is copied into this special memory. This buffer is protected by PASID like all other input/output buffers used by that workload. Instruction buffer is also mapped into the user space of the workload.}(hj) hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj hhubeh}(h]per-context-instruction-bufferah ]h"]per-context instruction bufferah$]h&]uh1jGhj hhhhhKubjH)}(hhh](jM)}(hGlobal Privileged Bufferh]hGlobal Privileged Buffer}(hjT hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjQ hhhhhKubj})}(hIn addition, the driver also allocates a single buffer for maintenance tasks like recording errors from MERT. This global buffer uses the global IOMMU domain and is only accessible by MERT.h]hIn addition, the driver also allocates a single buffer for maintenance tasks like recording errors from MERT. This global buffer uses the global IOMMU domain and is only accessible by MERT.}(hjb hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhjQ hhubeh}(h]global-privileged-bufferah ]h"]global privileged bufferah$]h&]uh1jGhj hhhhhKubeh}(h]special-host-buffersah ]h"]special host buffersah$]h&]uh1jGhjIhhhhhKubjH)}(hhh](jM)}(hHigh-level Use Flowh]hHigh-level Use Flow}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj hhhhhKubj})}(h0Here are the steps to run a workload on AMD NPU:h]h0Here are the steps to run a workload on AMD NPU:}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhj hhubjR )}(hhh](j)}(h?Compile the workload into an overlay and a ``ctrlcode`` binary.h]j})}(hj h](h+Compile the workload into an overlay and a }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh binary.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hAUserspace opens a context in the driver and provides the overlay.h]j})}(hj h]hAUserspace opens a context in the driver and provides the overlay.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h^The driver checks with the Resource Solver for provisioning a set of columns for the workload.h]j})}(h^The driver checks with the Resource Solver for provisioning a set of columns for the workload.h]h^The driver checks with the Resource Solver for provisioning a set of columns for the workload.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hUThe driver then asks MERT to create a context on the device with the desired columns.h]j})}(hUThe driver then asks MERT to create a context on the device with the desired columns.h]hUThe driver then asks MERT to create a context on the device with the desired columns.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h\MERT then creates an instance of ERT. MERT also maps the Instruction Buffer into ERT memory.h]j})}(h\MERT then creates an instance of ERT. MERT also maps the Instruction Buffer into ERT memory.h]h\MERT then creates an instance of ERT. MERT also maps the Instruction Buffer into ERT memory.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hEThe userspace then copies the ``ctrlcode`` to the Instruction Buffer.h]j})}(hj, h](hThe userspace then copies the }(hj. hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj5 hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj. ubh to the Instruction Buffer.}(hj. hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj* ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hUserspace then creates a command buffer with pointers to input, output, and instruction buffer; it then submits command buffer with the driver and goes to sleep waiting for completion.h]j})}(hUserspace then creates a command buffer with pointers to input, output, and instruction buffer; it then submits command buffer with the driver and goes to sleep waiting for completion.h]hUserspace then creates a command buffer with pointers to input, output, and instruction buffer; it then submits command buffer with the driver and goes to sleep waiting for completion.}(hjW hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhjS ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h5The driver sends the command over the Mailbox to ERT.h]j})}(hjm h]h5The driver sends the command over the Mailbox to ERT.}(hjo hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhjk ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h:ERT *executes* the ``ctrlcode`` in the instruction buffer.h]j})}(hj h](hERT }(hj hhhNhNubj)}(h *executes*h]hexecutes}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh the }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh in the instruction buffer.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hfExecution of the ``ctrlcode`` kicks off DMAs to and from the host DDR while AMD XDNA Array is running.h]j})}(hfExecution of the ``ctrlcode`` kicks off DMAs to and from the host DDR while AMD XDNA Array is running.h](hExecution of the }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubhI kicks off DMAs to and from the host DDR while AMD XDNA Array is running.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hWhen ERT reaches end of ``ctrlcode``, it raises an MSI-X to send completion signal to the driver which then wakes up the waiting workload. h]j})}(hWhen ERT reaches end of ``ctrlcode``, it raises an MSI-X to send completion signal to the driver which then wakes up the waiting workload.h](hWhen ERT reaches end of }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubhf, it raises an MSI-X to send completion signal to the driver which then wakes up the waiting workload.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubeh}(h]h ]h"]h$]h&]j j j hj j uh1jQ hj hhhhhKubeh}(h]high-level-use-flowah ]h"]high-level use flowah$]h&]uh1jGhjIhhhhhKubjH)}(hhh](jM)}(h Boot Flowh]h Boot Flow}(hj" hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj hhhhhKubj})}(hXAamdxdna driver uses PSP to securely load signed NPU FW and kick off the boot of the NPU microcontroller. amdxdna driver then waits for the alive signal in a special location on BAR 0. The NPU is switched off during SoC suspend and turned on after resume where the NPU FW is reloaded, and the handshake is performed again.h]hXAamdxdna driver uses PSP to securely load signed NPU FW and kick off the boot of the NPU microcontroller. amdxdna driver then waits for the alive signal in a special location on BAR 0. The NPU is switched off during SoC suspend and turned on after resume where the NPU FW is reloaded, and the handshake is performed again.}(hj0 hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhKhj hhubeh}(h] boot-flowah ]h"] boot flowah$]h&]uh1jGhjIhhhhhKubjH)}(hhh](jM)}(hUserspace componentsh]hUserspace components}(hjI hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjF hhhhhKubjH)}(hhh](jM)}(hCompilerh]hCompiler}(hjZ hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjW hhhhhKubj})}(h|Peano is an LLVM based open-source compiler for AMD XDNA Array compute tile available at: https://github.com/Xilinx/llvm-aieh](hZPeano is an LLVM based open-source compiler for AMD XDNA Array compute tile available at: }(hjh hhhNhNubj)}(h"https://github.com/Xilinx/llvm-aieh]h"https://github.com/Xilinx/llvm-aie}(hjp hhhNhNubah}(h]h ]h"]h$]h&]refurijr uh1jhjh ubeh}(h]h ]h"]h$]h&]uh1j|hhhKhjW hhubj})}(hThe open-source IREE compiler supports graph compilation of ML models for AMD NPU and uses Peano underneath. It is available at: https://github.com/nod-ai/iree-amd-aieh](hThe open-source IREE compiler supports graph compilation of ML models for AMD NPU and uses Peano underneath. It is available at: }(hj hhhNhNubj)}(h&https://github.com/nod-ai/iree-amd-aieh]h&https://github.com/nod-ai/iree-amd-aie}(hj hhhNhNubah}(h]h ]h"]h$]h&]refurij uh1jhj ubeh}(h]h ]h"]h$]h&]uh1j|hhhKhjW hhubeh}(h]compilerah ]h"]compilerah$]h&]uh1jGhjF hhhhhKubjH)}(hhh](jM)}(hUsermode Driver (UMD)h]hUsermode Driver (UMD)}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj hhhhhKubj})}(h{The open-source XRT runtime stack interfaces with amdxdna kernel driver. XRT can be found at: https://github.com/Xilinx/XRTh](h^The open-source XRT runtime stack interfaces with amdxdna kernel driver. XRT can be found at: }(hj hhhNhNubj)}(hhttps://github.com/Xilinx/XRTh]hhttps://github.com/Xilinx/XRT}(hj hhhNhNubah}(h]h ]h"]h$]h&]refurij uh1jhj ubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj hhubj})}(hWThe open-source XRT shim for NPU is can be found at: https://github.com/amd/xdna-driverh](h5The open-source XRT shim for NPU is can be found at: }(hj hhhNhNubj)}(h"https://github.com/amd/xdna-driverh]h"https://github.com/amd/xdna-driver}(hj hhhNhNubah}(h]h ]h"]h$]h&]refurij uh1jhj ubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj hhubeh}(h]usermode-driver-umdah ]h"]usermode driver (umd)ah$]h&]uh1jGhjF hhhhhKubeh}(h]userspace-componentsah ]h"]userspace componentsah$]h&]uh1jGhjIhhhhhKubjH)}(hhh](jM)}(h DMA Operationh]h DMA Operation}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj hhhhhKubj})}(hDMA operation instructions are encoded in the ``ctrlcode`` as ``XAIE_IO_BLOCKWRITE`` opcode. When ERT executes ``XAIE_IO_BLOCKWRITE``, DMA operations between host DDR and L2 memory are effected.h](h.DMA operation instructions are encoded in the }(hj hhhNhNubj)}(h ``ctrlcode``h]hctrlcode}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh as }(hj hhhNhNubj)}(h``XAIE_IO_BLOCKWRITE``h]hXAIE_IO_BLOCKWRITE}(hj0 hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh opcode. When ERT executes }(hj hhhNhNubj)}(h``XAIE_IO_BLOCKWRITE``h]hXAIE_IO_BLOCKWRITE}(hjB hhhNhNubah}(h]h ]h"]h$]h&]uh1jhj ubh=, DMA operations between host DDR and L2 memory are effected.}(hj hhhNhNubeh}(h]h ]h"]h$]h&]uh1j|hhhKhj hhubeh}(h] dma-operationah ]h"] dma operationah$]h&]uh1jGhjIhhhhhKubjH)}(hhh](jM)}(hError Handlingh]hError Handling}(hje hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjb hhhhhKubj})}(hX}When MERT detects an error in AMD XDNA Array, it pauses execution for that workload context and sends an asynchronous message to the driver over the privileged channel. The driver then sends a buffer pointer to MERT to capture the register states for the partition bound to faulting workload context. The driver then decodes the error by reading the contents of the buffer pointer.h]hX}When MERT detects an error in AMD XDNA Array, it pauses execution for that workload context and sends an asynchronous message to the driver over the privileged channel. The driver then sends a buffer pointer to MERT to capture the register states for the partition bound to faulting workload context. The driver then decodes the error by reading the contents of the buffer pointer.}(hjs hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhMhjb hhubeh}(h]error-handlingah ]h"]error handlingah$]h&]uh1jGhjIhhhhhKubjH)}(hhh](jM)}(h Telemetryh]h Telemetry}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1jLhj hhhhhMubj})}(hJMERT can report various kinds of telemetry information like the following:h]hJMERT can report various kinds of telemetry information like the following:}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhM hj hhubj)}(hhh](j)}(hL1 interrupt counterh]j})}(hj h]hL1 interrupt counter}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhM hj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(h DMA counterh]j})}(hj h]h DMA counter}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhM hj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hDeep Sleep counterh]j})}(hj h]hDeep Sleep counter}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhMhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubj)}(hetc. h]j})}(hetc.h]hetc.}(hj hhhNhNubah}(h]h ]h"]h$]h&]uh1j|hhhMhj ubah}(h]h ]h"]h$]h&]uh1jhj hhhhhNubeh}(h]h ]h"]h$]h&]jjuh1jhhhM hj hhubeh}(h] telemetryah ]h"] telemetryah$]h&]uh1jGhjIhhhhhMubjH)}(hhh](jM)}(h Referencesh]h References}(hjhhhNhNubah}(h]h ]h"]h$]h&]uh1jLhjhhhhhMubj)}(hhh](j)}(hH`AMD XDNA Architecture `_h]j})}(hj,h](j)}(hj,h]hAMD XDNA Architecture}(hj1hhhNhNubah}(h]h ]h"]h$]h&]nameAMD XDNA Architecturejjuh1jhj.ubhtarget)}(h0 h]h}(h]amd-xdna-architectureah ]h"]amd xdna architectureah$]h&]refurijuh1j@ referencedKhj.ubeh}(h]h ]h"]h$]h&]uh1j|hhhMhj*ubah}(h]h ]h"]h$]h&]uh1jhj'hhhhhNubj)}(hW`AMD AI Engine Technology `_h]j})}(hj^h](j)}(hj^h]hAMD AI Engine Technology}(hjchhhNhNubah}(h]h ]h"]h$]h&]nameAMD AI Engine Technologyjjuh1jhj`ubjA)}(h< h]h}(h]amd-ai-engine-technologyah ]h"]amd ai engine technologyah$]h&]refurijuh1j@jOKhj`ubeh}(h]h ]h"]h$]h&]uh1j|hhhMhj\ubah}(h]h ]h"]h$]h&]uh1jhj'hhhhhNubj)}(h-`Peano `_h]j})}(hjh](j)}(hjh]hPeano}(hjhhhNhNubah}(h]h ]h"]h$]h&]namePeanoj"https://github.com/Xilinx/llvm-aieuh1jhjubjA)}(h% h]h}(h]peanoah ]h"]peanoah$]h&]refurijuh1j@jOKhjubeh}(h]h ]h"]h$]h&]uh1j|hhhMhjubah}(h]h ]h"]h$]h&]uh1jhj'hhhhhNubj)}(hl`Versal Adaptive SoC AIE-ML Architecture Manual (AM020) `_h]j})}(hjh](j)}(hjh]h6Versal Adaptive SoC AIE-ML Architecture Manual (AM020)}(hjhhhNhNubah}(h]h ]h"]h$]h&]name6Versal Adaptive SoC AIE-ML Architecture Manual (AM020)jjr uh1jhjubjA)}(h3 h]h}(h]4versal-adaptive-soc-aie-ml-architecture-manual-am020ah ]h"]6versal adaptive soc aie-ml architecture manual (am020)ah$]h&]refurijr uh1j@jOKhjubeh}(h]h ]h"]h$]h&]uh1j|hhhMhjubah}(h]h ]h"]h$]h&]uh1jhj'hhhhhNubj)}(hN`AI Engine Run Time `_h]j})}(hjh](j)}(hjh]hAI Engine Run Time}(hjhhhNhNubah}(h]h ]h"]h$]h&]nameAI Engine Run Timejj uh1jhjubjA)}(h9 h]h}(h]ai-engine-run-timeah ]h"]ai engine run timeah$]h&]refurij uh1j@jOKhjubeh}(h]h ]h"]h$]h&]uh1j|hhhMhjubah}(h]h ]h"]h$]h&]uh1jhj'hhhhhNubeh}(h]h ]h"]h$]h&]j-uh1jhhhMhjhhubeh}(h] referencesah ]h"] referencesah$]h&]uh1jGhjIhhhhhMubeh}(h]amd-npuah ]h"]amd npuah$]h&]uh1jGhhhhhhhKubeh}(h]h ]h"]h$]h&]sourcehuh1hcurrent_sourceN current_lineNsettingsdocutils.frontendValues)}(jLN generatorN datestampN source_linkN source_urlN toc_backlinksentryfootnote_backlinksK sectnum_xformKstrip_commentsNstrip_elements_with_classesN strip_classesN report_levelK halt_levelKexit_status_levelKdebugNwarning_streamN tracebackinput_encoding utf-8-siginput_encoding_error_handlerstrictoutput_encodingutf-8output_encoding_error_handlerjSerror_encodingutf-8error_encoding_error_handlerbackslashreplace language_codeenrecord_dependenciesNconfigN id_prefixhauto_id_prefixid dump_settingsNdump_internalsNdump_transformsNdump_pseudo_xmlNexpose_internalsNstrict_visitorN_disable_configN_sourceh _destinationN _config_files]7/var/lib/git/docbuild/linux/Documentation/docutils.confafile_insertion_enabled raw_enabledKline_length_limitM'pep_referencesN pep_base_urlhttps://peps.python.org/pep_file_url_templatepep-%04drfc_referencesN rfc_base_url&https://datatracker.ietf.org/doc/html/ tab_widthKtrim_footnote_reference_spacesyntax_highlightlong smart_quotessmartquotes_locales]character_level_inline_markupdoctitle_xform docinfo_xformKsectsubtitle_xform image_loadinglinkembed_stylesheetcloak_email_addressessection_self_linkenvNubreporterNindirect_targets]substitution_defs}(hhhhhhj jjjj*jj9j-jHj<jWjKjfjZjujijjxjjjjjjjjjjjjjjjjj jjjj)jj8j,jGj;jVjJjejYjtjhjjwjjjjjjjjjjjjjjjjj jjj j(jj7j+jFj:jUjIjdjXjsjgjjvjjjjjjjjjjjjjjjjj jjj j'jj6j*jEj9jTjHjcjWjrjfjjujjjjjjjjjjjjjjjjjjjj j&jj5j)jDj8usubstitution_names}(amphߌaposhasthbrvbarj bsoljcentj*colonj9commajHcommatjWcopyjfcurrenjudarrjdegjdividejdollarjequalsjexcljfrac12jfrac14jfrac18jfrac34j frac38jfrac58j)frac78j8gtjGhalfjVhorbarjehyphenjtiexcljiquestjlaquojlarrjlcubjldquojlowbarjlparjlsqbjlsquoj ltjmicroj(middotj7nbspjFnotjUnumjdohmjsordfjordmjparajpercntjperiodjplusjplusmnjpoundjquestjquotj raquojrarrj'rcubj6rdquojEregjTrparjcrsqbjrrsquojsectjsemijshyjsoljsungjsup1jsup2jsup3jtimesjtradejuarrj&verbarj5yenjDurefnames}(amd xdna architecture]jaamd ai engine technology]jpa6versal adaptive soc aie-ml architecture manual (am020)]jb aai engine run time]j aurefids}nameids}(j-j*j5j2jjjjjjj*j'j_j\j^j[jjj/ j, j' j$ j j j} jz jN jK ju jr j j jC j@ j j j j j j j_ j\ j j jjj%j"jKjHj{jxjjjjj ju nametypes}(j-j5jjjj*j_j^jj/ j' j j} jN ju j jC j j j j_ j jj%jKj{jjj uh}(j*jIj2jjj8jjWjjj'jj\j-j[jbjjaj, jj$ jj j2 jz j jK j jr jQ j j j@ j j jF j jW j j j\ j j jb jj j"jjHjBjxjrjjjjjju footnote_refs} citation_refs} autofootnotes]autofootnote_refs]symbol_footnotes]symbol_footnote_refs] footnotes] citations]autofootnote_startKsymbol_footnote_startK id_counter collectionsCounter}Rparse_messages]transform_messages] transformerN include_log]&Documentation/accel/amdxdna/amdnpu.rst(NNNNta decorationNhhub.