€•      Œsphinx.addnodes”Œdocument”“”)”}”(Œ	rawsource”Œ ”Œchildren”]”(Œtranslations”ŒLanguagesNode”“”)”}”(hhh]”(h Œpending_xref”“”)”}”(hhh]”Œdocutils.nodes”ŒText”“”ŒChinese (Simplified)”…””}”Œparent”hsbaŒ
attributes”}”(Œids”]”Œclasses”]”Œnames”]”Œdupnames”]”Œbackrefs”]”Œ	refdomain”Œstd”Œreftype”Œdoc”Œ	reftarget”Œ8/translations/zh_CN/admin-guide/perf/nvidia-tegra410-pmu”Œmodname”NŒ	classname”NŒrefexplicit”ˆuŒtagname”hhhubh)”}”(hhh]”hŒChinese (Traditional)”…””}”hh2sbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ8/translations/zh_TW/admin-guide/perf/nvidia-tegra410-pmu”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒItalian”…””}”hhFsbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ8/translations/it_IT/admin-guide/perf/nvidia-tegra410-pmu”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒJapanese”…””}”hhZsbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ8/translations/ja_JP/admin-guide/perf/nvidia-tegra410-pmu”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒKorean”…””}”hhnsbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ8/translations/ko_KR/admin-guide/perf/nvidia-tegra410-pmu”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒPortuguese (Brazilian)”…””}”hh‚sbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ8/translations/pt_BR/admin-guide/perf/nvidia-tegra410-pmu”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubh)”}”(hhh]”hŒSpanish”…””}”hh–sbah}”(h]”h ]”h"]”h$]”h&]”Œ	refdomain”h)Œreftype”h+Œ	reftarget”Œ8/translations/sp_SP/admin-guide/perf/nvidia-tegra410-pmu”Œmodname”NŒ	classname”NŒrefexplicit”ˆuh1hhhubeh}”(h]”h ]”h"]”h$]”h&]”Œcurrent_language”ŒEnglish”uh1h
hhŒ	_document”hŒsource”NŒline”NubhŒsection”“”)”}”(hhh]”(hŒtitle”“”)”}”(hŒ<NVIDIA Tegra410 SoC Uncore Performance Monitoring Unit (PMU)”h]”hŒ<NVIDIA Tegra410 SoC Uncore Performance Monitoring Unit (PMU)”…””}”(hh¼h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhh·h²hh³ŒR/var/lib/git/docbuild/linux/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst”h´KubhŒ	paragraph”“”)”}”(hŒˆThe NVIDIA Tegra410 SoC includes various system PMUs to measure key performance
metrics like memory bandwidth, latency, and utilization:”h]”hŒˆThe NVIDIA Tegra410 SoC includes various system PMUs to measure key performance
metrics like memory bandwidth, latency, and utilization:”…””}”(hhÍh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Khh·h²hubhŒbullet_list”“”)”}”(hhh]”(hŒ	list_item”“”)”}”(hŒUnified Coherence Fabric (UCF)”h]”hÌ)”}”(hhäh]”hŒUnified Coherence Fabric (UCF)”…””}”(hhæh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Khhâubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhhÝh²hh³hÊh´Nubhá)”}”(hŒPCIE”h]”hÌ)”}”(hhûh]”hŒPCIE”…””}”(hhýh²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K	hhùubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhhÝh²hh³hÊh´Nubhá)”}”(hŒPCIE-TGT”h]”hÌ)”}”(hj  h]”hŒPCIE-TGT”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K
hj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhhÝh²hh³hÊh´Nubhá)”}”(hŒCPU Memory (CMEM) Latency”h]”hÌ)”}”(hj)  h]”hŒCPU Memory (CMEM) Latency”…””}”(hj+  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Khj'  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhhÝh²hh³hÊh´Nubhá)”}”(hŒ
NVLink-C2C”h]”hÌ)”}”(hj@  h]”hŒ
NVLink-C2C”…””}”(hjB  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Khj>  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhhÝh²hh³hÊh´Nubhá)”}”(hŒNV-CLink”h]”hÌ)”}”(hjW  h]”hŒNV-CLink”…””}”(hjY  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KhjU  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhhÝh²hh³hÊh´Nubhá)”}”(hŒ	NV-DLink
”h]”hÌ)”}”(hŒNV-DLink”h]”hŒNV-DLink”…””}”(hjp  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Khjl  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhhÝh²hh³hÊh´Nubeh}”(h]”h ]”h"]”h$]”h&]”Œbullet”Œ*”uh1hÛh³hÊh´Khh·h²hubh¶)”}”(hhh]”(h»)”}”(hŒ
PMU Driver”h]”hŒ
PMU Driver”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhjŒ  h²hh³hÊh´KubhÌ)”}”(hX…  The PMU driver describes the available events and configuration of each PMU in
sysfs. Please see the sections below to get the sysfs path of each PMU. Like
other uncore PMU drivers, the driver provides "cpumask" sysfs attribute to show
the CPU id used to handle the PMU event. There is also "associated_cpus"
sysfs attribute, which contains a list of CPUs associated with the PMU instance.”h]”hX  The PMU driver describes the available events and configuration of each PMU in
sysfs. Please see the sections below to get the sysfs path of each PMU. Like
other uncore PMU drivers, the driver provides â€œcpumaskâ€ sysfs attribute to show
the CPU id used to handle the PMU event. There is also â€œassociated_cpusâ€
sysfs attribute, which contains a list of CPUs associated with the PMU instance.”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KhjŒ  h²hubeh}”(h]”Œ
pmu-driver”ah ]”h"]”Œ
pmu driver”ah$]”h&]”uh1hµhh·h²hh³hÊh´Kubh¶)”}”(hhh]”(h»)”}”(hŒUCF PMU”h]”hŒUCF PMU”…””}”(hj¶  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhj³  h²hh³hÊh´KubhÌ)”}”(hŒýThe Unified Coherence Fabric (UCF) in the NVIDIA Tegra410 SoC serves as a
distributed cache, last level for CPU Memory and CXL Memory, and cache coherent
interconnect that supports hardware coherence across multiple coherently caching
agents, including:”h]”hŒýThe Unified Coherence Fabric (UCF) in the NVIDIA Tegra410 SoC serves as a
distributed cache, last level for CPU Memory and CXL Memory, and cache coherent
interconnect that supports hardware coherence across multiple coherently caching
agents, including:”…””}”(hjÄ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Khj³  h²hubhŒblock_quote”“”)”}”(hŒZ* CPU clusters
* GPU
* PCIe Ordering Controller Unit (OCU)
* Other IO-coherent requesters
”h]”hÜ)”}”(hhh]”(há)”}”(hŒCPU clusters”h]”hÌ)”}”(hjÝ  h]”hŒCPU clusters”…””}”(hjß  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K!hjÛ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjØ  ubhá)”}”(hŒGPU”h]”hÌ)”}”(hjô  h]”hŒGPU”…””}”(hjö  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K"hjò  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjØ  ubhá)”}”(hŒ#PCIe Ordering Controller Unit (OCU)”h]”hÌ)”}”(hj  h]”hŒ#PCIe Ordering Controller Unit (OCU)”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K#hj	  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjØ  ubhá)”}”(hŒOther IO-coherent requesters
”h]”hÌ)”}”(hŒOther IO-coherent requesters”h]”hŒOther IO-coherent requesters”…””}”(hj$  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K$hj   ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjØ  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´K!hjÔ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´K!hj³  h²hubhÌ)”}”(hŒThe events and configuration options of this PMU device are described in sysfs,
see /sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>.”h]”hŒThe events and configuration options of this PMU device are described in sysfs,
see /sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>.”…””}”(hjD  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K&hj³  h²hubhÌ)”}”(hŒZSome of the events available in this PMU can be used to measure bandwidth and
utilization:”h]”hŒZSome of the events available in this PMU can be used to measure bandwidth and
utilization:”…””}”(hjR  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K)hj³  h²hubjÓ  )”}”(hXU  * slc_access_rd: count the number of read requests to SLC.
* slc_access_wr: count the number of write requests to SLC.
* slc_bytes_rd: count the number of bytes transferred by slc_access_rd.
* slc_bytes_wr: count the number of bytes transferred by slc_access_wr.
* mem_access_rd: count the number of read requests to local or remote memory.
* mem_access_wr: count the number of write requests to local or remote memory.
* mem_bytes_rd: count the number of bytes transferred by mem_access_rd.
* mem_bytes_wr: count the number of bytes transferred by mem_access_wr.
* cycles: counts the UCF cycles.
”h]”hÜ)”}”(hhh]”(há)”}”(hŒ8slc_access_rd: count the number of read requests to SLC.”h]”hÌ)”}”(hji  h]”hŒ8slc_access_rd: count the number of read requests to SLC.”…””}”(hjk  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K,hjg  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubhá)”}”(hŒ9slc_access_wr: count the number of write requests to SLC.”h]”hÌ)”}”(hj€  h]”hŒ9slc_access_wr: count the number of write requests to SLC.”…””}”(hj‚  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K-hj~  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubhá)”}”(hŒEslc_bytes_rd: count the number of bytes transferred by slc_access_rd.”h]”hÌ)”}”(hj—  h]”hŒEslc_bytes_rd: count the number of bytes transferred by slc_access_rd.”…””}”(hj™  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K.hj•  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubhá)”}”(hŒEslc_bytes_wr: count the number of bytes transferred by slc_access_wr.”h]”hÌ)”}”(hj®  h]”hŒEslc_bytes_wr: count the number of bytes transferred by slc_access_wr.”…””}”(hj°  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K/hj¬  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubhá)”}”(hŒKmem_access_rd: count the number of read requests to local or remote memory.”h]”hÌ)”}”(hjÅ  h]”hŒKmem_access_rd: count the number of read requests to local or remote memory.”…””}”(hjÇ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K0hjÃ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubhá)”}”(hŒLmem_access_wr: count the number of write requests to local or remote memory.”h]”hÌ)”}”(hjÜ  h]”hŒLmem_access_wr: count the number of write requests to local or remote memory.”…””}”(hjÞ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K1hjÚ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubhá)”}”(hŒEmem_bytes_rd: count the number of bytes transferred by mem_access_rd.”h]”hÌ)”}”(hjó  h]”hŒEmem_bytes_rd: count the number of bytes transferred by mem_access_rd.”…””}”(hjõ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K2hjñ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubhá)”}”(hŒEmem_bytes_wr: count the number of bytes transferred by mem_access_wr.”h]”hÌ)”}”(hj
  h]”hŒEmem_bytes_wr: count the number of bytes transferred by mem_access_wr.”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K3hj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubhá)”}”(hŒcycles: counts the UCF cycles.
”h]”hÌ)”}”(hŒcycles: counts the UCF cycles.”h]”hŒcycles: counts the UCF cycles.”…””}”(hj#  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K4hj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjd  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´K,hj`  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´K,hj³  h²hubhÌ)”}”(hŒ(The average bandwidth is calculated as::”h]”hŒ'The average bandwidth is calculated as:”…””}”(hjC  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K6hj³  h²hubhŒliteral_block”“”)”}”(hX  AVG_SLC_READ_BANDWIDTH_IN_GBPS = SLC_BYTES_RD / ELAPSED_TIME_IN_NS
AVG_SLC_WRITE_BANDWIDTH_IN_GBPS = SLC_BYTES_WR / ELAPSED_TIME_IN_NS
AVG_MEM_READ_BANDWIDTH_IN_GBPS = MEM_BYTES_RD / ELAPSED_TIME_IN_NS
AVG_MEM_WRITE_BANDWIDTH_IN_GBPS = MEM_BYTES_WR / ELAPSED_TIME_IN_NS”h]”hX  AVG_SLC_READ_BANDWIDTH_IN_GBPS = SLC_BYTES_RD / ELAPSED_TIME_IN_NS
AVG_SLC_WRITE_BANDWIDTH_IN_GBPS = SLC_BYTES_WR / ELAPSED_TIME_IN_NS
AVG_MEM_READ_BANDWIDTH_IN_GBPS = MEM_BYTES_RD / ELAPSED_TIME_IN_NS
AVG_MEM_WRITE_BANDWIDTH_IN_GBPS = MEM_BYTES_WR / ELAPSED_TIME_IN_NS”…””}”hjS  sbah}”(h]”h ]”h"]”h$]”h&]”Œ	xml:space”Œpreserve”uh1jQ  h³hÊh´K8hj³  h²hubhÌ)”}”(hŒ+The average request rate is calculated as::”h]”hŒ*The average request rate is calculated as:”…””}”(hjc  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K=hj³  h²hubjR  )”}”(hŒÍAVG_SLC_READ_REQUEST_RATE = SLC_ACCESS_RD / CYCLES
AVG_SLC_WRITE_REQUEST_RATE = SLC_ACCESS_WR / CYCLES
AVG_MEM_READ_REQUEST_RATE = MEM_ACCESS_RD / CYCLES
AVG_MEM_WRITE_REQUEST_RATE = MEM_ACCESS_WR / CYCLES”h]”hŒÍAVG_SLC_READ_REQUEST_RATE = SLC_ACCESS_RD / CYCLES
AVG_SLC_WRITE_REQUEST_RATE = SLC_ACCESS_WR / CYCLES
AVG_MEM_READ_REQUEST_RATE = MEM_ACCESS_RD / CYCLES
AVG_MEM_WRITE_REQUEST_RATE = MEM_ACCESS_WR / CYCLES”…””}”hjq  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´K?hj³  h²hubhÌ)”}”(hŒkMore details about what other events are available can be found in Tegra410 SoC
technical reference manual.”h]”hŒkMore details about what other events are available can be found in Tegra410 SoC
technical reference manual.”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KDhj³  h²hubhÌ)”}”(hX'  The events can be filtered based on source or destination. The source filter
indicates the traffic initiator to the SLC, e.g local CPU, non-CPU device, or
remote socket. The destination filter specifies the destination memory type,
e.g. local system memory (CMEM), local GPU memory (GMEM), or remote memory. The
local/remote classification of the destination filter is based on the home
socket of the address, not where the data actually resides. The available
filters are described in
/sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>/format/.”h]”hX'  The events can be filtered based on source or destination. The source filter
indicates the traffic initiator to the SLC, e.g local CPU, non-CPU device, or
remote socket. The destination filter specifies the destination memory type,
e.g. local system memory (CMEM), local GPU memory (GMEM), or remote memory. The
local/remote classification of the destination filter is based on the home
socket of the address, not where the data actually resides. The available
filters are described in
/sys/bus/event_source/devices/nvidia_ucf_pmu_<socket-id>/format/.”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KGhj³  h²hubhÌ)”}”(hŒ"The list of UCF PMU event filters:”h]”hŒ"The list of UCF PMU event filters:”…””}”(hj›  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KPhj³  h²hubhÜ)”}”(hhh]”(há)”}”(hŒÑSource filter:

* src_loc_cpu: if set, count events from local CPU
* src_loc_noncpu: if set, count events from local non-CPU device
* src_rem: if set, count events from CPU, GPU, PCIE devices of remote socket
”h]”(hÌ)”}”(hŒSource filter:”h]”hŒSource filter:”…””}”(hj°  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KRhj¬  ubhÜ)”}”(hhh]”(há)”}”(hŒ0src_loc_cpu: if set, count events from local CPU”h]”hÌ)”}”(hjÃ  h]”hŒ0src_loc_cpu: if set, count events from local CPU”…””}”(hjÅ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KThjÁ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¾  ubhá)”}”(hŒ>src_loc_noncpu: if set, count events from local non-CPU device”h]”hÌ)”}”(hjÚ  h]”hŒ>src_loc_noncpu: if set, count events from local non-CPU device”…””}”(hjÜ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KUhjØ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¾  ubhá)”}”(hŒKsrc_rem: if set, count events from CPU, GPU, PCIE devices of remote socket
”h]”hÌ)”}”(hŒJsrc_rem: if set, count events from CPU, GPU, PCIE devices of remote socket”h]”hŒJsrc_rem: if set, count events from CPU, GPU, PCIE devices of remote socket”…””}”(hjó  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KVhjï  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¾  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´KThj¬  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj©  h²hh³Nh´Nubhá)”}”(hX?  Destination filter:

* dst_loc_cmem: if set, count events to local system memory (CMEM) address
* dst_loc_gmem: if set, count events to local GPU memory (GMEM) address
* dst_loc_other: if set, count events to local CXL memory address
* dst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket
”h]”(hÌ)”}”(hŒDestination filter:”h]”hŒDestination filter:”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KXhj  ubhÜ)”}”(hhh]”(há)”}”(hŒHdst_loc_cmem: if set, count events to local system memory (CMEM) address”h]”hÌ)”}”(hj*  h]”hŒHdst_loc_cmem: if set, count events to local system memory (CMEM) address”…””}”(hj,  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KZhj(  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj%  ubhá)”}”(hŒEdst_loc_gmem: if set, count events to local GPU memory (GMEM) address”h]”hÌ)”}”(hjA  h]”hŒEdst_loc_gmem: if set, count events to local GPU memory (GMEM) address”…””}”(hjC  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K[hj?  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj%  ubhá)”}”(hŒ?dst_loc_other: if set, count events to local CXL memory address”h]”hÌ)”}”(hjX  h]”hŒ?dst_loc_other: if set, count events to local CXL memory address”…””}”(hjZ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K\hjV  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj%  ubhá)”}”(hŒSdst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket
”h]”hÌ)”}”(hŒRdst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket”h]”hŒRdst_rem: if set, count events to CPU, GPU, and CXL memory address of remote socket”…””}”(hjq  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K]hjm  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj%  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´KZhj  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj©  h²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´KRhj³  h²hubhÌ)”}”(hŒŸIf the source is not specified, the PMU will count events from all sources. If
the destination is not specified, the PMU will count events to all destinations.”h]”hŒŸIf the source is not specified, the PMU will count events from all sources. If
the destination is not specified, the PMU will count events to all destinations.”…””}”(hj—  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K_hj³  h²hubhÌ)”}”(hŒExample usage:”h]”hŒExample usage:”…””}”(hj¥  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kbhj³  h²hubhÜ)”}”(hhh]”(há)”}”(hŒyCount event id 0x0 in socket 0 from all sources and to all destinations::

  perf stat -a -e nvidia_ucf_pmu_0/event=0x0/
”h]”(hÌ)”}”(hŒICount event id 0x0 in socket 0 from all sources and to all destinations::”h]”hŒHCount event id 0x0 in socket 0 from all sources and to all destinations:”…””}”(hjº  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kdhj¶  ubjR  )”}”(hŒ+perf stat -a -e nvidia_ucf_pmu_0/event=0x0/”h]”hŒ+perf stat -a -e nvidia_ucf_pmu_0/event=0x0/”…””}”hjÈ  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´Kfhj¶  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj³  h²hh³hÊh´Nubhá)”}”(hŒÄCount event id 0x0 in socket 0 with source filter = local CPU and destination
filter = local system memory (CMEM)::

  perf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/
”h]”(hÌ)”}”(hŒsCount event id 0x0 in socket 0 with source filter = local CPU and destination
filter = local system memory (CMEM)::”h]”hŒrCount event id 0x0 in socket 0 with source filter = local CPU and destination
filter = local system memory (CMEM):”…””}”(hjà  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KhhjÜ  ubjR  )”}”(hŒLperf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/”h]”hŒLperf stat -a -e nvidia_ucf_pmu_0/event=0x0,src_loc_cpu=0x1,dst_loc_cmem=0x1/”…””}”hjî  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´KkhjÜ  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj³  h²hh³hÊh´Nubhá)”}”(hŒÀCount event id 0x0 in socket 1 with source filter = local non-CPU device and
destination filter = remote memory::

  perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/
”h]”(hÌ)”}”(hŒqCount event id 0x0 in socket 1 with source filter = local non-CPU device and
destination filter = remote memory::”h]”hŒpCount event id 0x0 in socket 1 with source filter = local non-CPU device and
destination filter = remote memory:”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kmhj  ubjR  )”}”(hŒJperf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/”h]”hŒJperf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/”…””}”hj  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´Kphj  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj³  h²hh³hÊh´Nubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´Kdhj³  h²hubeh}”(h]”Œucf-pmu”ah ]”h"]”Œucf pmu”ah$]”h&]”uh1hµhh·h²hh³hÊh´Kubh¶)”}”(hhh]”(h»)”}”(hŒPCIE PMU”h]”hŒPCIE PMU”…””}”(hj9  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhj6  h²hh³hÊh´KsubhÌ)”}”(hX"  This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and
the memory subsystem. It monitors all read/write traffic from the root port(s)
or a particular BDF in a PCIE RC to local or remote memory. There is one PMU per
PCIE RC in the SoC. Each RC can have up to 16 lanes that can be bifurcated into
up to 8 root ports. The traffic from each root port can be filtered using RP or
BDF filter. For example, specifying "src_rp_mask=0xFF" means the PMU counter will
capture traffic from all RPs. Please see below for more details.”h]”hX&  This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and
the memory subsystem. It monitors all read/write traffic from the root port(s)
or a particular BDF in a PCIE RC to local or remote memory. There is one PMU per
PCIE RC in the SoC. Each RC can have up to 16 lanes that can be bifurcated into
up to 8 root ports. The traffic from each root port can be filtered using RP or
BDF filter. For example, specifying â€œsrc_rp_mask=0xFFâ€ means the PMU counter will
capture traffic from all RPs. Please see below for more details.”…””}”(hjG  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kuhj6  h²hubhÌ)”}”(hŒžThe events and configuration options of this PMU device are described in sysfs,
see /sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>.”h]”hŒžThe events and configuration options of this PMU device are described in sysfs,
see /sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>.”…””}”(hjU  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K}hj6  h²hubhÌ)”}”(hŒRThe events in this PMU can be used to measure bandwidth, utilization, and
latency:”h]”hŒRThe events in this PMU can be used to measure bandwidth, utilization, and
latency:”…””}”(hjc  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K€hj6  h²hubjÓ  )”}”(hXw  * rd_req: count the number of read requests by PCIE device.
* wr_req: count the number of write requests by PCIE device.
* rd_bytes: count the number of bytes transferred by rd_req.
* wr_bytes: count the number of bytes transferred by wr_req.
* rd_cum_outs: count outstanding rd_req each cycle.
* cycles: count the clock cycles of SOC fabric connected to the PCIE interface.
”h]”hÜ)”}”(hhh]”(há)”}”(hŒ9rd_req: count the number of read requests by PCIE device.”h]”hÌ)”}”(hjz  h]”hŒ9rd_req: count the number of read requests by PCIE device.”…””}”(hj|  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kƒhjx  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhju  ubhá)”}”(hŒ:wr_req: count the number of write requests by PCIE device.”h]”hÌ)”}”(hj‘  h]”hŒ:wr_req: count the number of write requests by PCIE device.”…””}”(hj“  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K„hj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhju  ubhá)”}”(hŒ:rd_bytes: count the number of bytes transferred by rd_req.”h]”hÌ)”}”(hj¨  h]”hŒ:rd_bytes: count the number of bytes transferred by rd_req.”…””}”(hjª  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K…hj¦  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhju  ubhá)”}”(hŒ:wr_bytes: count the number of bytes transferred by wr_req.”h]”hÌ)”}”(hj¿  h]”hŒ:wr_bytes: count the number of bytes transferred by wr_req.”…””}”(hjÁ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K†hj½  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhju  ubhá)”}”(hŒ1rd_cum_outs: count outstanding rd_req each cycle.”h]”hÌ)”}”(hjÖ  h]”hŒ1rd_cum_outs: count outstanding rd_req each cycle.”…””}”(hjØ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K‡hjÔ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhju  ubhá)”}”(hŒNcycles: count the clock cycles of SOC fabric connected to the PCIE interface.
”h]”hÌ)”}”(hŒMcycles: count the clock cycles of SOC fabric connected to the PCIE interface.”h]”hŒMcycles: count the clock cycles of SOC fabric connected to the PCIE interface.”…””}”(hjï  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kˆhjë  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhju  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´Kƒhjq  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´Kƒhj6  h²hubhÌ)”}”(hŒ(The average bandwidth is calculated as::”h]”hŒ'The average bandwidth is calculated as:”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KŠhj6  h²hubjR  )”}”(hŒqAVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS
AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS”h]”hŒqAVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS
AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS”…””}”hj  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´KŒhj6  h²hubhÌ)”}”(hŒ+The average request rate is calculated as::”h]”hŒ*The average request rate is calculated as:”…””}”(hj+  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Khj6  h²hubjR  )”}”(hŒKAVG_RD_REQUEST_RATE = RD_REQ / CYCLES
AVG_WR_REQUEST_RATE = WR_REQ / CYCLES”h]”hŒKAVG_RD_REQUEST_RATE = RD_REQ / CYCLES
AVG_WR_REQUEST_RATE = WR_REQ / CYCLES”…””}”hj9  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´K‘hj6  h²hubhÌ)”}”(hŒ&The average latency is calculated as::”h]”hŒ%The average latency is calculated as:”…””}”(hjG  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K•hj6  h²hubjR  )”}”(hŒ’FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ
AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ”h]”hŒ’FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ
AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ”…””}”hjU  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´K—hj6  h²hubhÌ)”}”(hX  The PMU events can be filtered based on the traffic source and destination.
The source filter indicates the PCIE devices that will be monitored. The
destination filter specifies the destination memory type, e.g. local system
memory (CMEM), local GPU memory (GMEM), or remote memory. The local/remote
classification of the destination filter is based on the home socket of the
address, not where the data actually resides. These filters can be found in
/sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>/format/.”h]”hX  The PMU events can be filtered based on the traffic source and destination.
The source filter indicates the PCIE devices that will be monitored. The
destination filter specifies the destination memory type, e.g. local system
memory (CMEM), local GPU memory (GMEM), or remote memory. The local/remote
classification of the destination filter is based on the home socket of the
address, not where the data actually resides. These filters can be found in
/sys/bus/event_source/devices/nvidia_pcie_pmu_<socket-id>_rc_<pcie-rc-id>/format/.”…””}”(hjc  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K›hj6  h²hubhÌ)”}”(hŒThe list of event filters:”h]”hŒThe list of event filters:”…””}”(hjq  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K£hj6  h²hubhÜ)”}”(hhh]”(há)”}”(hX  Source filter:

* src_rp_mask: bitmask of root ports that will be monitored. Each bit in this
  bitmask represents the RP index in the RC. If the bit is set, all devices under
  the associated RP will be monitored. E.g "src_rp_mask=0xF" will monitor
  devices in root port 0 to 3.
* src_bdf: the BDF that will be monitored. This is a 16-bit value that
  follows formula: (bus << 8) + (device << 3) + (function). For example, the
  value of BDF 27:01.1 is 0x2781.
* src_bdf_en: enable the BDF filter. If this is set, the BDF filter value in
  "src_bdf" is used to filter the traffic.

Note that Root-Port and BDF filters are mutually exclusive and the PMU in
each RC can only have one BDF filter for the whole counters. If BDF filter
is enabled, the BDF filter value will be applied to all events.
”h]”(hÌ)”}”(hŒSource filter:”h]”hŒSource filter:”…””}”(hj†  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K¥hj‚  ubhÜ)”}”(hhh]”(há)”}”(hX   src_rp_mask: bitmask of root ports that will be monitored. Each bit in this
bitmask represents the RP index in the RC. If the bit is set, all devices under
the associated RP will be monitored. E.g "src_rp_mask=0xF" will monitor
devices in root port 0 to 3.”h]”hÌ)”}”(hX   src_rp_mask: bitmask of root ports that will be monitored. Each bit in this
bitmask represents the RP index in the RC. If the bit is set, all devices under
the associated RP will be monitored. E.g "src_rp_mask=0xF" will monitor
devices in root port 0 to 3.”h]”hX  src_rp_mask: bitmask of root ports that will be monitored. Each bit in this
bitmask represents the RP index in the RC. If the bit is set, all devices under
the associated RP will be monitored. E.g â€œsrc_rp_mask=0xFâ€ will monitor
devices in root port 0 to 3.”…””}”(hj›  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K§hj—  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj”  ubhá)”}”(hŒ¯src_bdf: the BDF that will be monitored. This is a 16-bit value that
follows formula: (bus << 8) + (device << 3) + (function). For example, the
value of BDF 27:01.1 is 0x2781.”h]”hÌ)”}”(hŒ¯src_bdf: the BDF that will be monitored. This is a 16-bit value that
follows formula: (bus << 8) + (device << 3) + (function). For example, the
value of BDF 27:01.1 is 0x2781.”h]”hŒ¯src_bdf: the BDF that will be monitored. This is a 16-bit value that
follows formula: (bus << 8) + (device << 3) + (function). For example, the
value of BDF 27:01.1 is 0x2781.”…””}”(hj³  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K«hj¯  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj”  ubhá)”}”(hŒtsrc_bdf_en: enable the BDF filter. If this is set, the BDF filter value in
"src_bdf" is used to filter the traffic.
”h]”hÌ)”}”(hŒssrc_bdf_en: enable the BDF filter. If this is set, the BDF filter value in
"src_bdf" is used to filter the traffic.”h]”hŒwsrc_bdf_en: enable the BDF filter. If this is set, the BDF filter value in
â€œsrc_bdfâ€ is used to filter the traffic.”…””}”(hjË  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K®hjÇ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj”  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´K§hj‚  ubhÌ)”}”(hŒÔNote that Root-Port and BDF filters are mutually exclusive and the PMU in
each RC can only have one BDF filter for the whole counters. If BDF filter
is enabled, the BDF filter value will be applied to all events.”h]”hŒÔNote that Root-Port and BDF filters are mutually exclusive and the PMU in
each RC can only have one BDF filter for the whole counters. If BDF filter
is enabled, the BDF filter value will be applied to all events.”…””}”(hjå  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K±hj‚  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  h²hh³hÊh´Nubhá)”}”(hXj  Destination filter:

* dst_loc_cmem: if set, count events to local system memory (CMEM) address
* dst_loc_gmem: if set, count events to local GPU memory (GMEM) address
* dst_loc_pcie_p2p: if set, count events to local PCIE peer address
* dst_loc_pcie_cxl: if set, count events to local CXL memory address
* dst_rem: if set, count events to remote memory address
”h]”(hÌ)”}”(hŒDestination filter:”h]”hŒDestination filter:”…””}”(hjý  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kµhjù  ubhÜ)”}”(hhh]”(há)”}”(hŒHdst_loc_cmem: if set, count events to local system memory (CMEM) address”h]”hÌ)”}”(hj  h]”hŒHdst_loc_cmem: if set, count events to local system memory (CMEM) address”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K·hj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒEdst_loc_gmem: if set, count events to local GPU memory (GMEM) address”h]”hÌ)”}”(hj'  h]”hŒEdst_loc_gmem: if set, count events to local GPU memory (GMEM) address”…””}”(hj)  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K¸hj%  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒAdst_loc_pcie_p2p: if set, count events to local PCIE peer address”h]”hÌ)”}”(hj>  h]”hŒAdst_loc_pcie_p2p: if set, count events to local PCIE peer address”…””}”(hj@  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K¹hj<  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒBdst_loc_pcie_cxl: if set, count events to local CXL memory address”h]”hÌ)”}”(hjU  h]”hŒBdst_loc_pcie_cxl: if set, count events to local CXL memory address”…””}”(hjW  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KºhjS  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒ7dst_rem: if set, count events to remote memory address
”h]”hÌ)”}”(hŒ6dst_rem: if set, count events to remote memory address”h]”hŒ6dst_rem: if set, count events to remote memory address”…””}”(hjn  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K»hjj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´K·hjù  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  h²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´K¥hj6  h²hubhÌ)”}”(hŒ°If the source filter is not specified, the PMU will count events from all root
ports. If the destination filter is not specified, the PMU will count events
to all destinations.”h]”hŒ°If the source filter is not specified, the PMU will count events from all root
ports. If the destination filter is not specified, the PMU will count events
to all destinations.”…””}”(hj”  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K½hj6  h²hubhÌ)”}”(hŒExample usage:”h]”hŒExample usage:”…””}”(hj¢  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KÁhj6  h²hubhÜ)”}”(hhh]”(há)”}”(hŒŸCount event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all
destinations::

  perf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/
”h]”(hÌ)”}”(hŒYCount event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all
destinations::”h]”hŒXCount event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all
destinations:”…””}”(hj·  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KÃhj³  ubjR  )”}”(hŒAperf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/”h]”hŒAperf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/”…””}”hjÅ  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´KÆhj³  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj°  h²hh³hÊh´Nubhá)”}”(hŒÅCount event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and
targeting just local CMEM of socket 0::

  perf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/
”h]”(hÌ)”}”(hŒnCount event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and
targeting just local CMEM of socket 0::”h]”hŒmCount event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and
targeting just local CMEM of socket 0:”…””}”(hjÝ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KÈhjÙ  ubjR  )”}”(hŒRperf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/”h]”hŒRperf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/”…””}”hjë  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´KËhjÙ  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj°  h²hh³hÊh´Nubhá)”}”(hŒŸCount event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all
destinations::

  perf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/
”h]”(hÌ)”}”(hŒYCount event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all
destinations::”h]”hŒXCount event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all
destinations:”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KÍhjÿ  ubjR  )”}”(hŒAperf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/”h]”hŒAperf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/”…””}”hj  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´KÐhjÿ  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj°  h²hh³hÊh´Nubhá)”}”(hŒÅCount event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and
targeting just local CMEM of socket 1::

  perf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/
”h]”(hÌ)”}”(hŒnCount event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and
targeting just local CMEM of socket 1::”h]”hŒmCount event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and
targeting just local CMEM of socket 1:”…””}”(hj)  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KÒhj%  ubjR  )”}”(hŒRperf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/”h]”hŒRperf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/”…””}”hj7  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´KÕhj%  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj°  h²hh³hÊh´Nubhá)”}”(hŒ­Count event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all
destinations::

  perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/
”h]”(hÌ)”}”(hŒYCount event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all
destinations::”h]”hŒXCount event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all
destinations:”…””}”(hjO  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´K×hjK  ubjR  )”}”(hŒOperf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/”h]”hŒOperf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/”…””}”hj]  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´KÚhjK  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj°  h²hh³hÊh´Nubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´KÃhj6  h²hubhŒtarget”“”)”}”(hŒ,.. _NVIDIA_T410_PCIE_PMU_RC_Mapping_Section:”h]”h}”(h]”h ]”h"]”h$]”h&]”Œrefid”Œ'nvidia-t410-pcie-pmu-rc-mapping-section”uh1jw  h´KÜhj6  h²hh³hÊubh¶)”}”(hhh]”(h»)”}”(hŒ'Mapping the RC# to lspci segment number”h]”hŒ'Mapping the RC# to lspci segment number”…””}”(hjˆ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhj…  h²hh³hÊh´KßubhÌ)”}”(hXV  Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA
Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space
for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". The DVSEC register
contains the following information to map PCIE devices under the RP back to its RC# :”h]”hX^  Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA
Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space
for each RP. This DVSEC has vendor id â€œ10deâ€ and DVSEC id of â€œ0x4â€. The DVSEC register
contains the following information to map PCIE devices under the RP back to its RC# :”…””}”(hj–  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Káhj…  h²hubjÓ  )”}”(hXp  - Bus# (byte 0xc) : bus number as reported by the lspci output
- Segment# (byte 0xd) : segment number as reported by the lspci output
- RP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability
- RC# (byte 0xf): root complex number associated with the RP
- Socket# (byte 0x10): socket number associated with the RP
”h]”hÜ)”}”(hhh]”(há)”}”(hŒ<Bus# (byte 0xc) : bus number as reported by the lspci output”h]”hÌ)”}”(hj­  h]”hŒ<Bus# (byte 0xc) : bus number as reported by the lspci output”…””}”(hj¯  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kæhj«  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¨  ubhá)”}”(hŒDSegment# (byte 0xd) : segment number as reported by the lspci output”h]”hÌ)”}”(hjÄ  h]”hŒDSegment# (byte 0xd) : segment number as reported by the lspci output”…””}”(hjÆ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KçhjÂ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¨  ubhá)”}”(hŒnRP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability”h]”hÌ)”}”(hjÛ  h]”hŒnRP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability”…””}”(hjÝ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´KèhjÙ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¨  ubhá)”}”(hŒ:RC# (byte 0xf): root complex number associated with the RP”h]”hÌ)”}”(hjò  h]”hŒ:RC# (byte 0xf): root complex number associated with the RP”…””}”(hjô  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kéhjð  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¨  ubhá)”}”(hŒ:Socket# (byte 0x10): socket number associated with the RP
”h]”hÌ)”}”(hŒ9Socket# (byte 0x10): socket number associated with the RP”h]”hŒ9Socket# (byte 0x10): socket number associated with the RP”…””}”(hj	  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kêhj	  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¨  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  Œ-”uh1hÛh³hÊh´Kæhj¤  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´Kæhj…  h²hubhÌ)”}”(hŒ9Example script for mapping lspci BDF to RC# and socket#::”h]”hŒ8Example script for mapping lspci BDF to RC# and socket#:”…””}”(hj,	  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Kìhj…  h²hubjR  )”}”(hXË  #!/bin/bash
while read bdf rest; do
  dvsec4_reg=$(lspci -vv -s $bdf | awk '
    /Designated Vendor-Specific: Vendor=10de ID=0004/ {
      match($0, /\[([0-9a-fA-F]+)/, arr);
      print "0x" arr[1];
      exit
    }
  ')
  if [ -n "$dvsec4_reg" ]; then
    bus=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xc))).b)
    segment=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xd))).b)
    rp=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xe))).b)
    rc=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xf))).b)
    socket=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0x10))).b)
    echo "$bdf: Bus=$bus, Segment=$segment, RP=$rp, RC=$rc, Socket=$socket"
  fi
done < <(lspci -d 10de:)”h]”hXË  #!/bin/bash
while read bdf rest; do
  dvsec4_reg=$(lspci -vv -s $bdf | awk '
    /Designated Vendor-Specific: Vendor=10de ID=0004/ {
      match($0, /\[([0-9a-fA-F]+)/, arr);
      print "0x" arr[1];
      exit
    }
  ')
  if [ -n "$dvsec4_reg" ]; then
    bus=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xc))).b)
    segment=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xd))).b)
    rp=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xe))).b)
    rc=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xf))).b)
    socket=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0x10))).b)
    echo "$bdf: Bus=$bus, Segment=$segment, RP=$rp, RC=$rc, Socket=$socket"
  fi
done < <(lspci -d 10de:)”…””}”hj:	  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´Kîhj…  h²hubhÌ)”}”(hŒExample output::”h]”hŒExample output:”…””}”(hjH	  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mhj…  h²hubjR  )”}”(hXÁ  0001:00:00.0: Bus=00, Segment=01, RP=00, RC=00, Socket=00
0002:80:00.0: Bus=80, Segment=02, RP=01, RC=01, Socket=00
0002:a0:00.0: Bus=a0, Segment=02, RP=02, RC=01, Socket=00
0002:c0:00.0: Bus=c0, Segment=02, RP=03, RC=01, Socket=00
0002:e0:00.0: Bus=e0, Segment=02, RP=04, RC=01, Socket=00
0003:00:00.0: Bus=00, Segment=03, RP=00, RC=02, Socket=00
0004:00:00.0: Bus=00, Segment=04, RP=00, RC=03, Socket=00
0005:00:00.0: Bus=00, Segment=05, RP=00, RC=04, Socket=00
0005:40:00.0: Bus=40, Segment=05, RP=01, RC=04, Socket=00
0005:c0:00.0: Bus=c0, Segment=05, RP=02, RC=04, Socket=00
0006:00:00.0: Bus=00, Segment=06, RP=00, RC=05, Socket=00
0009:00:00.0: Bus=00, Segment=09, RP=00, RC=00, Socket=01
000a:80:00.0: Bus=80, Segment=0a, RP=01, RC=01, Socket=01
000a:a0:00.0: Bus=a0, Segment=0a, RP=02, RC=01, Socket=01
000a:e0:00.0: Bus=e0, Segment=0a, RP=03, RC=01, Socket=01
000b:00:00.0: Bus=00, Segment=0b, RP=00, RC=02, Socket=01
000c:00:00.0: Bus=00, Segment=0c, RP=00, RC=03, Socket=01
000d:00:00.0: Bus=00, Segment=0d, RP=00, RC=04, Socket=01
000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01
000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01
000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01”h]”hXÁ  0001:00:00.0: Bus=00, Segment=01, RP=00, RC=00, Socket=00
0002:80:00.0: Bus=80, Segment=02, RP=01, RC=01, Socket=00
0002:a0:00.0: Bus=a0, Segment=02, RP=02, RC=01, Socket=00
0002:c0:00.0: Bus=c0, Segment=02, RP=03, RC=01, Socket=00
0002:e0:00.0: Bus=e0, Segment=02, RP=04, RC=01, Socket=00
0003:00:00.0: Bus=00, Segment=03, RP=00, RC=02, Socket=00
0004:00:00.0: Bus=00, Segment=04, RP=00, RC=03, Socket=00
0005:00:00.0: Bus=00, Segment=05, RP=00, RC=04, Socket=00
0005:40:00.0: Bus=40, Segment=05, RP=01, RC=04, Socket=00
0005:c0:00.0: Bus=c0, Segment=05, RP=02, RC=04, Socket=00
0006:00:00.0: Bus=00, Segment=06, RP=00, RC=05, Socket=00
0009:00:00.0: Bus=00, Segment=09, RP=00, RC=00, Socket=01
000a:80:00.0: Bus=80, Segment=0a, RP=01, RC=01, Socket=01
000a:a0:00.0: Bus=a0, Segment=0a, RP=02, RC=01, Socket=01
000a:e0:00.0: Bus=e0, Segment=0a, RP=03, RC=01, Socket=01
000b:00:00.0: Bus=00, Segment=0b, RP=00, RC=02, Socket=01
000c:00:00.0: Bus=00, Segment=0c, RP=00, RC=03, Socket=01
000d:00:00.0: Bus=00, Segment=0d, RP=00, RC=04, Socket=01
000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01
000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01
000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01”…””}”hjV	  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´Mhj…  h²hubeh}”(h]”(Œ&mapping-the-rc-to-lspci-segment-number”j„  eh ]”h"]”(Œ'mapping the rc# to lspci segment number”Œ'nvidia_t410_pcie_pmu_rc_mapping_section”eh$]”h&]”uh1hµhj6  h²hh³hÊh´KßŒexpect_referenced_by_name”}”jj	  jy  sŒexpect_referenced_by_id”}”j„  jy  subeh}”(h]”Œpcie-pmu”ah ]”h"]”Œpcie pmu”ah$]”h&]”uh1hµhh·h²hh³hÊh´Ksubh¶)”}”(hhh]”(h»)”}”(hŒPCIE-TGT PMU”h]”hŒPCIE-TGT PMU”…””}”(hj|	  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhjy	  h²hh³hÊh´MubhÌ)”}”(hXý  This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and
the memory subsystem. It monitors traffic targeting PCIE BAR and CXL HDM ranges.
There is one PCIE-TGT PMU per PCIE RC in the SoC. Each RC in Tegra410 SoC can
have up to 16 lanes that can be bifurcated into up to 8 root ports (RP). The PMU
provides RP filter to count PCIE BAR traffic to each RP and address filter to
count access to PCIE BAR or CXL HDM ranges. The details of the filters are
described in the following sections.”h]”hXý  This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and
the memory subsystem. It monitors traffic targeting PCIE BAR and CXL HDM ranges.
There is one PCIE-TGT PMU per PCIE RC in the SoC. Each RC in Tegra410 SoC can
have up to 16 lanes that can be bifurcated into up to 8 root ports (RP). The PMU
provides RP filter to count PCIE BAR traffic to each RP and address filter to
count access to PCIE BAR or CXL HDM ranges. The details of the filters are
described in the following sections.”…””}”(hjŠ	  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mhjy	  h²hubhÌ)”}”(hŒŒMapping the RC# to lspci segment number is similar to the PCIE PMU. Please see
:ref:`NVIDIA_T410_PCIE_PMU_RC_Mapping_Section` for more info.”h]”(hŒOMapping the RC# to lspci segment number is similar to the PCIE PMU. Please see
”…””}”(hj˜	  h²hh³Nh´Nubh)”}”(hŒ.:ref:`NVIDIA_T410_PCIE_PMU_RC_Mapping_Section`”h]”hŒinline”“”)”}”(hj¢	  h]”hŒ'NVIDIA_T410_PCIE_PMU_RC_Mapping_Section”…””}”(hj¦	  h²hh³Nh´Nubah}”(h]”h ]”(Œxref”Œstd”Œstd-ref”eh"]”h$]”h&]”uh1j¤	  hj 	  ubah}”(h]”h ]”h"]”h$]”h&]”Œrefdoc”Œ$admin-guide/perf/nvidia-tegra410-pmu”Œ	refdomain”j±	  Œreftype”Œref”Œrefexplicit”‰Œrefwarn”ˆŒ	reftarget”Œ'nvidia_t410_pcie_pmu_rc_mapping_section”uh1hh³hÊh´M$hj˜	  ubhŒ for more info.”…””}”(hj˜	  h²hh³Nh´Nubeh}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M$hjy	  h²hubhÌ)”}”(hŒ¢The events and configuration options of this PMU device are available in sysfs,
see /sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>.”h]”hŒ¢The events and configuration options of this PMU device are available in sysfs,
see /sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>.”…””}”(hjÏ	  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M'hjy	  h²hubhÌ)”}”(hŒHThe events in this PMU can be used to measure bandwidth and utilization:”h]”hŒHThe events in this PMU can be used to measure bandwidth and utilization:”…””}”(hjÝ	  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M*hjy	  h²hubjÓ  )”}”(hX5  * rd_req: count the number of read requests to PCIE.
* wr_req: count the number of write requests to PCIE.
* rd_bytes: count the number of bytes transferred by rd_req.
* wr_bytes: count the number of bytes transferred by wr_req.
* cycles: count the clock cycles of SOC fabric connected to the PCIE interface.
”h]”hÜ)”}”(hhh]”(há)”}”(hŒ2rd_req: count the number of read requests to PCIE.”h]”hÌ)”}”(hjô	  h]”hŒ2rd_req: count the number of read requests to PCIE.”…””}”(hjö	  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M,hjò	  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjï	  ubhá)”}”(hŒ3wr_req: count the number of write requests to PCIE.”h]”hÌ)”}”(hj
  h]”hŒ3wr_req: count the number of write requests to PCIE.”…””}”(hj
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M-hj	
  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjï	  ubhá)”}”(hŒ:rd_bytes: count the number of bytes transferred by rd_req.”h]”hÌ)”}”(hj"
  h]”hŒ:rd_bytes: count the number of bytes transferred by rd_req.”…””}”(hj$
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M.hj 
  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjï	  ubhá)”}”(hŒ:wr_bytes: count the number of bytes transferred by wr_req.”h]”hÌ)”}”(hj9
  h]”hŒ:wr_bytes: count the number of bytes transferred by wr_req.”…””}”(hj;
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M/hj7
  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjï	  ubhá)”}”(hŒNcycles: count the clock cycles of SOC fabric connected to the PCIE interface.
”h]”hÌ)”}”(hŒMcycles: count the clock cycles of SOC fabric connected to the PCIE interface.”h]”hŒMcycles: count the clock cycles of SOC fabric connected to the PCIE interface.”…””}”(hjR
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M0hjN
  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjï	  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´M,hjë	  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´M,hjy	  h²hubhÌ)”}”(hŒ(The average bandwidth is calculated as::”h]”hŒ'The average bandwidth is calculated as:”…””}”(hjr
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M2hjy	  h²hubjR  )”}”(hŒqAVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS
AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS”h]”hŒqAVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS
AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS”…””}”hj€
  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M4hjy	  h²hubhÌ)”}”(hŒ+The average request rate is calculated as::”h]”hŒ*The average request rate is calculated as:”…””}”(hjŽ
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M7hjy	  h²hubjR  )”}”(hŒKAVG_RD_REQUEST_RATE = RD_REQ / CYCLES
AVG_WR_REQUEST_RATE = WR_REQ / CYCLES”h]”hŒKAVG_RD_REQUEST_RATE = RD_REQ / CYCLES
AVG_WR_REQUEST_RATE = WR_REQ / CYCLES”…””}”hjœ
  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M9hjy	  h²hubhÌ)”}”(hXS  The PMU events can be filtered based on the destination root port or target
address range. Filtering based on RP is only available for PCIE BAR traffic.
Address filter works for both PCIE BAR and CXL HDM ranges. These filters can be
found in sysfs, see
/sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>/format/.”h]”hXS  The PMU events can be filtered based on the destination root port or target
address range. Filtering based on RP is only available for PCIE BAR traffic.
Address filter works for both PCIE BAR and CXL HDM ranges. These filters can be
found in sysfs, see
/sys/bus/event_source/devices/nvidia_pcie_tgt_pmu_<socket-id>_rc_<pcie-rc-id>/format/.”…””}”(hjª
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M<hjy	  h²hubhÌ)”}”(hŒDestination filter settings:”h]”hŒDestination filter settings:”…””}”(hj¸
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MBhjy	  h²hubhÜ)”}”(hhh]”(há)”}”(hŒÍdst_rp_mask: bitmask to select the root port(s) to monitor. E.g. "dst_rp_mask=0xFF"
corresponds to all root ports (from 0 to 7) in the PCIE RC. Note that this filter is
only available for PCIE BAR traffic.”h]”hÌ)”}”(hŒÍdst_rp_mask: bitmask to select the root port(s) to monitor. E.g. "dst_rp_mask=0xFF"
corresponds to all root ports (from 0 to 7) in the PCIE RC. Note that this filter is
only available for PCIE BAR traffic.”h]”hŒÑdst_rp_mask: bitmask to select the root port(s) to monitor. E.g. â€œdst_rp_mask=0xFFâ€
corresponds to all root ports (from 0 to 7) in the PCIE RC. Note that this filter is
only available for PCIE BAR traffic.”…””}”(hjÍ
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MDhjÉ
  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÆ
  h²hh³hÊh´Nubhá)”}”(hŒ2dst_addr_base: BAR or CXL HDM filter base address.”h]”hÌ)”}”(hjã
  h]”hŒ2dst_addr_base: BAR or CXL HDM filter base address.”…””}”(hjå
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MGhjá
  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÆ
  h²hh³hÊh´Nubhá)”}”(hŒ2dst_addr_mask: BAR or CXL HDM filter address mask.”h]”hÌ)”}”(hjú
  h]”hŒ2dst_addr_mask: BAR or CXL HDM filter address mask.”…””}”(hjü
  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MHhjø
  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÆ
  h²hh³hÊh´Nubhá)”}”(hXÃ  dst_addr_en: enable BAR or CXL HDM address range filter. If this is set, the
address range specified by "dst_addr_base" and "dst_addr_mask" will be used to filter
the PCIE BAR and CXL HDM traffic address. The PMU uses the following comparison
to determine if the traffic destination address falls within the filter range::

  (txn's addr & dst_addr_mask) == (dst_addr_base & dst_addr_mask)

If the comparison succeeds, then the event will be counted.
”h]”(hÌ)”}”(hXB  dst_addr_en: enable BAR or CXL HDM address range filter. If this is set, the
address range specified by "dst_addr_base" and "dst_addr_mask" will be used to filter
the PCIE BAR and CXL HDM traffic address. The PMU uses the following comparison
to determine if the traffic destination address falls within the filter range::”h]”hXI  dst_addr_en: enable BAR or CXL HDM address range filter. If this is set, the
address range specified by â€œdst_addr_baseâ€ and â€œdst_addr_maskâ€ will be used to filter
the PCIE BAR and CXL HDM traffic address. The PMU uses the following comparison
to determine if the traffic destination address falls within the filter range:”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MIhj  ubjR  )”}”(hŒ?(txn's addr & dst_addr_mask) == (dst_addr_base & dst_addr_mask)”h]”hŒ?(txn's addr & dst_addr_mask) == (dst_addr_base & dst_addr_mask)”…””}”hj!  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´MNhj  ubhÌ)”}”(hŒ;If the comparison succeeds, then the event will be counted.”h]”hŒ;If the comparison succeeds, then the event will be counted.”…””}”(hj/  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MPhj  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÆ
  h²hh³hÊh´Nubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´MDhjy	  h²hubhÌ)”}”(hŒ„If the destination filter is not specified, the RP filter will be configured by default
to count PCIE BAR traffic to all root ports.”h]”hŒ„If the destination filter is not specified, the RP filter will be configured by default
to count PCIE BAR traffic to all root ports.”…””}”(hjI  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MRhjy	  h²hubhÌ)”}”(hŒExample usage:”h]”hŒExample usage:”…””}”(hjW  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MUhjy	  h²hubhÜ)”}”(hhh]”(há)”}”(hŒŒCount event id 0x0 to root port 0 and 1 of PCIE RC-0 on socket 0::

  perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_0/event=0x0,dst_rp_mask=0x3/
”h]”(hÌ)”}”(hŒBCount event id 0x0 to root port 0 and 1 of PCIE RC-0 on socket 0::”h]”hŒACount event id 0x0 to root port 0 and 1 of PCIE RC-0 on socket 0:”…””}”(hjl  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MWhjh  ubjR  )”}”(hŒEperf stat -a -e nvidia_pcie_tgt_pmu_0_rc_0/event=0x0,dst_rp_mask=0x3/”h]”hŒEperf stat -a -e nvidia_pcie_tgt_pmu_0_rc_0/event=0x0,dst_rp_mask=0x3/”…””}”hjz  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´MYhjh  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhje  h²hh³hÊh´Nubhá)”}”(hŒçCount event id 0x1 for accesses to PCIE BAR or CXL HDM address range
0x10000 to 0x100FF on socket 0's PCIE RC-1::

  perf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/
”h]”(hÌ)”}”(hŒqCount event id 0x1 for accesses to PCIE BAR or CXL HDM address range
0x10000 to 0x100FF on socket 0's PCIE RC-1::”h]”hŒrCount event id 0x1 for accesses to PCIE BAR or CXL HDM address range
0x10000 to 0x100FF on socket 0â€™s PCIE RC-1:”…””}”(hj’  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M[hjŽ  ubjR  )”}”(hŒqperf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/”h]”hŒqperf stat -a -e nvidia_pcie_tgt_pmu_0_rc_1/event=0x1,dst_addr_base=0x10000,dst_addr_mask=0xFFF00,dst_addr_en=0x1/”…””}”hj   sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M^hjŽ  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhje  h²hh³hÊh´Nubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´MWhjy	  h²hubeh}”(h]”Œpcie-tgt-pmu”ah ]”h"]”Œpcie-tgt pmu”ah$]”h&]”uh1hµhh·h²hh³hÊh´Mubh¶)”}”(hhh]”(h»)”}”(hŒCPU Memory (CMEM) Latency PMU”h]”hŒCPU Memory (CMEM) Latency PMU”…””}”(hjÅ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhjÂ  h²hh³hÊh´MaubhÌ)”}”(hŒThis PMU monitors latency events of memory read requests from the edge of the
Unified Coherence Fabric (UCF) to local CPU DRAM:”h]”hŒThis PMU monitors latency events of memory read requests from the edge of the
Unified Coherence Fabric (UCF) to local CPU DRAM:”…””}”(hjÓ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MchjÂ  h²hubjÓ  )”}”(hŒñ* RD_REQ counters: count read requests (32B per request).
* RD_CUM_OUTS counters: accumulated outstanding request counter, which track
  how many cycles the read requests are in flight.
* CYCLES counter: counts the number of elapsed cycles.
”h]”hÜ)”}”(hhh]”(há)”}”(hŒ7RD_REQ counters: count read requests (32B per request).”h]”hÌ)”}”(hjê  h]”hŒ7RD_REQ counters: count read requests (32B per request).”…””}”(hjì  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mfhjè  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjå  ubhá)”}”(hŒ{RD_CUM_OUTS counters: accumulated outstanding request counter, which track
how many cycles the read requests are in flight.”h]”hÌ)”}”(hŒ{RD_CUM_OUTS counters: accumulated outstanding request counter, which track
how many cycles the read requests are in flight.”h]”hŒ{RD_CUM_OUTS counters: accumulated outstanding request counter, which track
how many cycles the read requests are in flight.”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mghjÿ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjå  ubhá)”}”(hŒ5CYCLES counter: counts the number of elapsed cycles.
”h]”hÌ)”}”(hŒ4CYCLES counter: counts the number of elapsed cycles.”h]”hŒ4CYCLES counter: counts the number of elapsed cycles.”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mihj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjå  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´Mfhjá  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´MfhjÂ  h²hubhÌ)”}”(hŒ&The average latency is calculated as::”h]”hŒ%The average latency is calculated as:”…””}”(hj;  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MkhjÂ  h²hubjR  )”}”(hŒ’FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ
AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ”h]”hŒ’FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS
AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ
AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ”…””}”hjI  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´MmhjÂ  h²hubhÌ)”}”(hŒ–The events and configuration options of this PMU device are described in sysfs,
see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_<socket-id>.”h]”hŒ–The events and configuration options of this PMU device are described in sysfs,
see /sys/bus/event_source/devices/nvidia_cmem_latency_pmu_<socket-id>.”…””}”(hjW  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MqhjÂ  h²hubhÌ)”}”(hŒExample usage::”h]”hŒExample usage:”…””}”(hje  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MthjÂ  h²hubjR  )”}”(hŒ~perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}'”h]”hŒ~perf stat -a -e '{nvidia_cmem_latency_pmu_0/rd_req/,nvidia_cmem_latency_pmu_0/rd_cum_outs/,nvidia_cmem_latency_pmu_0/cycles/}'”…””}”hjs  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´MvhjÂ  h²hubeh}”(h]”Œcpu-memory-cmem-latency-pmu”ah ]”h"]”Œcpu memory (cmem) latency pmu”ah$]”h&]”uh1hµhh·h²hh³hÊh´Maubh¶)”}”(hhh]”(h»)”}”(hŒNVLink-C2C PMU”h]”hŒNVLink-C2C PMU”…””}”(hjŒ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhj‰  h²hh³hÊh´MyubhÌ)”}”(hŒÕThis PMU monitors latency events of memory read/write requests that pass through
the NVIDIA Chip-to-Chip (C2C) interface. Bandwidth events are not available
in this PMU, unlike the C2C PMU in Grace (Tegra241 SoC).”h]”hŒÕThis PMU monitors latency events of memory read/write requests that pass through
the NVIDIA Chip-to-Chip (C2C) interface. Bandwidth events are not available
in this PMU, unlike the C2C PMU in Grace (Tegra241 SoC).”…””}”(hjš  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M{hj‰  h²hubhÌ)”}”(hŒ”The events and configuration options of this PMU device are available in sysfs,
see /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>.”h]”hŒ”The events and configuration options of this PMU device are available in sysfs,
see /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>.”…””}”(hj¨  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mhj‰  h²hubhÌ)”}”(hŒThe list of events:”h]”hŒThe list of events:”…””}”(hj¶  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M‚hj‰  h²hubjÓ  )”}”(hXe  * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests.
* IN_RD_REQ: the number of incoming read requests.
* IN_WR_CUM_OUTS: accumulated outstanding request (in cycles) of incoming write requests.
* IN_WR_REQ: the number of incoming write requests.
* OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests.
* OUT_RD_REQ: the number of outgoing read requests.
* OUT_WR_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing write requests.
* OUT_WR_REQ: the number of outgoing write requests.
* CYCLES: NVLink-C2C interface cycle counts.
”h]”hÜ)”}”(hhh]”(há)”}”(hŒVIN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests.”h]”hÌ)”}”(hjÍ  h]”hŒVIN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests.”…””}”(hjÏ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M„hjË  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubhá)”}”(hŒ0IN_RD_REQ: the number of incoming read requests.”h]”hÌ)”}”(hjä  h]”hŒ0IN_RD_REQ: the number of incoming read requests.”…””}”(hjæ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M…hjâ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubhá)”}”(hŒWIN_WR_CUM_OUTS: accumulated outstanding request (in cycles) of incoming write requests.”h]”hÌ)”}”(hjû  h]”hŒWIN_WR_CUM_OUTS: accumulated outstanding request (in cycles) of incoming write requests.”…””}”(hjý  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M†hjù  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubhá)”}”(hŒ1IN_WR_REQ: the number of incoming write requests.”h]”hÌ)”}”(hj  h]”hŒ1IN_WR_REQ: the number of incoming write requests.”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M‡hj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubhá)”}”(hŒWOUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests.”h]”hÌ)”}”(hj)  h]”hŒWOUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests.”…””}”(hj+  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mˆhj'  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubhá)”}”(hŒ1OUT_RD_REQ: the number of outgoing read requests.”h]”hÌ)”}”(hj@  h]”hŒ1OUT_RD_REQ: the number of outgoing read requests.”…””}”(hjB  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M‰hj>  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubhá)”}”(hŒXOUT_WR_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing write requests.”h]”hÌ)”}”(hjW  h]”hŒXOUT_WR_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing write requests.”…””}”(hjY  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MŠhjU  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubhá•’V      )”}”(hŒ2OUT_WR_REQ: the number of outgoing write requests.”h]”hÌ)”}”(hjn  h]”hŒ2OUT_WR_REQ: the number of outgoing write requests.”…””}”(hjp  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M‹hjl  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubhá)”}”(hŒ+CYCLES: NVLink-C2C interface cycle counts.
”h]”hÌ)”}”(hŒ*CYCLES: NVLink-C2C interface cycle counts.”h]”hŒ*CYCLES: NVLink-C2C interface cycle counts.”…””}”(hj‡  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MŒhjƒ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjÈ  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´M„hjÄ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´M„hj‰  h²hubhÌ)”}”(hŒ“The incoming events count the reads/writes from remote device to the SoC.
The outgoing events count the reads/writes from the SoC to remote device.”h]”hŒ“The incoming events count the reads/writes from remote device to the SoC.
The outgoing events count the reads/writes from the SoC to remote device.”…””}”(hj§  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MŽhj‰  h²hubhÌ)”}”(hŒƒThe sysfs /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>/peer
contains the information about the connected device.”h]”hŒƒThe sysfs /sys/bus/event_source/devices/nvidia_nvlink_c2c_pmu_<socket-id>/peer
contains the information about the connected device.”…””}”(hjµ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M‘hj‰  h²hubhÌ)”}”(hX1  When the C2C interface is connected to GPU(s), the user can use the
"gpu_mask" parameter to filter traffic to/from specific GPU(s). Each bit represents the GPU
index, e.g. "gpu_mask=0x1" corresponds to GPU 0 and "gpu_mask=0x3" is for GPU 0 and 1.
The PMU will monitor all GPUs by default if not specified.”h]”hX=  When the C2C interface is connected to GPU(s), the user can use the
â€œgpu_maskâ€ parameter to filter traffic to/from specific GPU(s). Each bit represents the GPU
index, e.g. â€œgpu_mask=0x1â€ corresponds to GPU 0 and â€œgpu_mask=0x3â€ is for GPU 0 and 1.
The PMU will monitor all GPUs by default if not specified.”…””}”(hjÃ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M”hj‰  h²hubhÌ)”}”(hŒBWhen connected to another SoC, only the read events are available.”h]”hŒBWhen connected to another SoC, only the read events are available.”…””}”(hjÑ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M™hj‰  h²hubhÌ)”}”(hŒTThe events can be used to calculate the average latency of the read/write requests::”h]”hŒSThe events can be used to calculate the average latency of the read/write requests:”…””}”(hjß  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M›hj‰  h²hubjR  )”}”(hX?  C2C_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS

IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ

IN_WR_AVG_LATENCY_IN_CYCLES = IN_WR_CUM_OUTS / IN_WR_REQ
IN_WR_AVG_LATENCY_IN_NS = IN_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ

OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ
OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ

OUT_WR_AVG_LATENCY_IN_CYCLES = OUT_WR_CUM_OUTS / OUT_WR_REQ
OUT_WR_AVG_LATENCY_IN_NS = OUT_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ”h]”hX?  C2C_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS

IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ

IN_WR_AVG_LATENCY_IN_CYCLES = IN_WR_CUM_OUTS / IN_WR_REQ
IN_WR_AVG_LATENCY_IN_NS = IN_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ

OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ
OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ

OUT_WR_AVG_LATENCY_IN_CYCLES = OUT_WR_CUM_OUTS / OUT_WR_REQ
OUT_WR_AVG_LATENCY_IN_NS = OUT_WR_AVG_LATENCY_IN_CYCLES / C2C_FREQ_IN_GHZ”…””}”hjí  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´Mhj‰  h²hubhÌ)”}”(hŒExample usage:”h]”hŒExample usage:”…””}”(hjû  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M«hj‰  h²hubjÓ  )”}”(hX  * Count incoming traffic from all GPUs connected via NVLink-C2C::

    perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/

* Count incoming traffic from GPU 0 connected via NVLink-C2C::

    perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/

* Count incoming traffic from GPU 1 connected via NVLink-C2C::

    perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/

* Count outgoing traffic to all GPUs connected via NVLink-C2C::

    perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/

* Count outgoing traffic to GPU 0 connected via NVLink-C2C::

    perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/

* Count outgoing traffic to GPU 1 connected via NVLink-C2C::

    perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/
”h]”hÜ)”}”(hhh]”(há)”}”(hŒvCount incoming traffic from all GPUs connected via NVLink-C2C::

  perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/
”h]”(hÌ)”}”(hŒ?Count incoming traffic from all GPUs connected via NVLink-C2C::”h]”hŒ>Count incoming traffic from all GPUs connected via NVLink-C2C:”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M­hj  ubjR  )”}”(hŒ2perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/”h]”hŒ2perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_req/”…””}”hj"  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M¯hj  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒ…Count incoming traffic from GPU 0 connected via NVLink-C2C::

  perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/
”h]”(hÌ)”}”(hŒ<Count incoming traffic from GPU 0 connected via NVLink-C2C::”h]”hŒ;Count incoming traffic from GPU 0 connected via NVLink-C2C:”…””}”(hj:  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M±hj6  ubjR  )”}”(hŒDperf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/”h]”hŒDperf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x1/”…””}”hjH  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M³hj6  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒ…Count incoming traffic from GPU 1 connected via NVLink-C2C::

  perf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/
”h]”(hÌ)”}”(hŒ<Count incoming traffic from GPU 1 connected via NVLink-C2C::”h]”hŒ;Count incoming traffic from GPU 1 connected via NVLink-C2C:”…””}”(hj`  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mµhj\  ubjR  )”}”(hŒDperf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/”h]”hŒDperf stat -a -e nvidia_nvlink_c2c_pmu_0/in_rd_cum_outs,gpu_mask=0x2/”…””}”hjn  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M·hj\  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒuCount outgoing traffic to all GPUs connected via NVLink-C2C::

  perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/
”h]”(hÌ)”}”(hŒ=Count outgoing traffic to all GPUs connected via NVLink-C2C::”h]”hŒ<Count outgoing traffic to all GPUs connected via NVLink-C2C:”…””}”(hj†  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M¹hj‚  ubjR  )”}”(hŒ3perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/”h]”hŒ3perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_req/”…””}”hj”  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M»hj‚  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒ„Count outgoing traffic to GPU 0 connected via NVLink-C2C::

  perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/
”h]”(hÌ)”}”(hŒ:Count outgoing traffic to GPU 0 connected via NVLink-C2C::”h]”hŒ9Count outgoing traffic to GPU 0 connected via NVLink-C2C:”…””}”(hj¬  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´M½hj¨  ubjR  )”}”(hŒEperf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/”h]”hŒEperf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x1/”…””}”hjº  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M¿hj¨  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒ„Count outgoing traffic to GPU 1 connected via NVLink-C2C::

  perf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/
”h]”(hÌ)”}”(hŒ:Count outgoing traffic to GPU 1 connected via NVLink-C2C::”h]”hŒ9Count outgoing traffic to GPU 1 connected via NVLink-C2C:”…””}”(hjÒ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÁhjÎ  ubjR  )”}”(hŒEperf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/”h]”hŒEperf stat -a -e nvidia_nvlink_c2c_pmu_0/out_rd_cum_outs,gpu_mask=0x2/”…””}”hjà  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´MÃhjÎ  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´M­hj	  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´M­hj‰  h²hubeh}”(h]”Œnvlink-c2c-pmu”ah ]”h"]”Œnvlink-c2c pmu”ah$]”h&]”uh1hµhh·h²hh³hÊh´Myubh¶)”}”(hhh]”(h»)”}”(hŒNV-CLink PMU”h]”hŒNV-CLink PMU”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhj  h²hh³hÊh´MÆubhÌ)”}”(hX  This PMU monitors latency events of memory read requests that pass through
the NV-CLINK interface. Bandwidth events are not available in this PMU.
In Tegra410 SoC, the NV-CLink interface is used to connect to another Tegra410
SoC and this PMU only counts read traffic.”h]”hX  This PMU monitors latency events of memory read requests that pass through
the NV-CLINK interface. Bandwidth events are not available in this PMU.
In Tegra410 SoC, the NV-CLink interface is used to connect to another Tegra410
SoC and this PMU only counts read traffic.”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÈhj  h²hubhÌ)”}”(hŒ‘The events and configuration options of this PMU device are available in sysfs,
see /sys/bus/event_source/devices/nvidia_nvclink_pmu_<socket-id>.”h]”hŒ‘The events and configuration options of this PMU device are available in sysfs,
see /sys/bus/event_source/devices/nvidia_nvclink_pmu_<socket-id>.”…””}”(hj'  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÍhj  h²hubhÌ)”}”(hŒThe list of events:”h]”hŒThe list of events:”…””}”(hj5  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÐhj  h²hubjÓ  )”}”(hXE  * IN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests.
* IN_RD_REQ: the number of incoming read requests.
* OUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests.
* OUT_RD_REQ: the number of outgoing read requests.
* CYCLES: NV-CLINK interface cycle counts.
”h]”hÜ)”}”(hhh]”(há)”}”(hŒVIN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests.”h]”hÌ)”}”(hjL  h]”hŒVIN_RD_CUM_OUTS: accumulated outstanding request (in cycles) of incoming read requests.”…””}”(hjN  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÒhjJ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjG  ubhá)”}”(hŒ0IN_RD_REQ: the number of incoming read requests.”h]”hÌ)”}”(hjc  h]”hŒ0IN_RD_REQ: the number of incoming read requests.”…””}”(hje  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÓhja  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjG  ubhá)”}”(hŒWOUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests.”h]”hÌ)”}”(hjz  h]”hŒWOUT_RD_CUM_OUTS: accumulated outstanding request (in cycles) of outgoing read requests.”…””}”(hj|  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÔhjx  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjG  ubhá)”}”(hŒ1OUT_RD_REQ: the number of outgoing read requests.”h]”hÌ)”}”(hj‘  h]”hŒ1OUT_RD_REQ: the number of outgoing read requests.”…””}”(hj“  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÕhj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjG  ubhá)”}”(hŒ)CYCLES: NV-CLINK interface cycle counts.
”h]”hÌ)”}”(hŒ(CYCLES: NV-CLINK interface cycle counts.”h]”hŒ(CYCLES: NV-CLINK interface cycle counts.”…””}”(hjª  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÖhj¦  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhjG  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´MÒhjC  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´MÒhj  h²hubhÌ)”}”(hŒ…The incoming events count the reads from remote device to the SoC.
The outgoing events count the reads from the SoC to remote device.”h]”hŒ…The incoming events count the reads from remote device to the SoC.
The outgoing events count the reads from the SoC to remote device.”…””}”(hjÊ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MØhj  h²hubhÌ)”}”(hŒNThe events can be used to calculate the average latency of the read requests::”h]”hŒMThe events can be used to calculate the average latency of the read requests:”…””}”(hjØ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MÛhj  h²hubjR  )”}”(hX<  CLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS

IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ

OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ
OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ”h]”hX<  CLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS

IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ

OUT_RD_AVG_LATENCY_IN_CYCLES = OUT_RD_CUM_OUTS / OUT_RD_REQ
OUT_RD_AVG_LATENCY_IN_NS = OUT_RD_AVG_LATENCY_IN_CYCLES / CLINK_FREQ_IN_GHZ”…””}”hjæ  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´MÝhj  h²hubhÌ)”}”(hŒExample usage:”h]”hŒExample usage:”…””}”(hjô  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Måhj  h²hubjÓ  )”}”(hŒø* Count incoming read traffic from remote SoC connected via NV-CLINK::

    perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/

* Count outgoing read traffic to remote SoC connected via NV-CLINK::

    perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/
”h]”hÜ)”}”(hhh]”(há)”}”(hŒxCount incoming read traffic from remote SoC connected via NV-CLINK::

  perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/
”h]”(hÌ)”}”(hŒDCount incoming read traffic from remote SoC connected via NV-CLINK::”h]”hŒCCount incoming read traffic from remote SoC connected via NV-CLINK:”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mçhj	  ubjR  )”}”(hŒ/perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/”h]”hŒ/perf stat -a -e nvidia_nvclink_pmu_0/in_rd_req/”…””}”hj  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´Méhj	  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubhá)”}”(hŒwCount outgoing read traffic to remote SoC connected via NV-CLINK::

  perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/
”h]”(hÌ)”}”(hŒBCount outgoing read traffic to remote SoC connected via NV-CLINK::”h]”hŒACount outgoing read traffic to remote SoC connected via NV-CLINK:”…””}”(hj3  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mëhj/  ubjR  )”}”(hŒ0perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/”h]”hŒ0perf stat -a -e nvidia_nvclink_pmu_0/out_rd_req/”…””}”hjA  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´Míhj/  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´Mçhj  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´Mçhj  h²hubeh}”(h]”Œnv-clink-pmu”ah ]”h"]”Œnv-clink pmu”ah$]”h&]”uh1hµhh·h²hh³hÊh´MÆubh¶)”}”(hhh]”(h»)”}”(hŒNV-DLink PMU”h]”hŒNV-DLink PMU”…””}”(hjl  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hºhji  h²hh³hÊh´MðubhÌ)”}”(hŒÒThis PMU monitors latency events of memory read requests that pass through
the NV-DLINK interface.  Bandwidth events are not available in this PMU.
In Tegra410 SoC, this PMU only counts CXL memory read traffic.”h]”hŒÒThis PMU monitors latency events of memory read requests that pass through
the NV-DLINK interface.  Bandwidth events are not available in this PMU.
In Tegra410 SoC, this PMU only counts CXL memory read traffic.”…””}”(hjz  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mòhji  h²hubhÌ)”}”(hŒ‘The events and configuration options of this PMU device are available in sysfs,
see /sys/bus/event_source/devices/nvidia_nvdlink_pmu_<socket-id>.”h]”hŒ‘The events and configuration options of this PMU device are available in sysfs,
see /sys/bus/event_source/devices/nvidia_nvdlink_pmu_<socket-id>.”…””}”(hjˆ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Möhji  h²hubhÌ)”}”(hŒThe list of events:”h]”hŒThe list of events:”…””}”(hj–  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mùhji  h²hubjÓ  )”}”(hŒ¶* IN_RD_CUM_OUTS: accumulated outstanding read requests (in cycles) to CXL memory.
* IN_RD_REQ: the number of read requests to CXL memory.
* CYCLES: NV-DLINK interface cycle counts.
”h]”hÜ)”}”(hhh]”(há)”}”(hŒPIN_RD_CUM_OUTS: accumulated outstanding read requests (in cycles) to CXL memory.”h]”hÌ)”}”(hj­  h]”hŒPIN_RD_CUM_OUTS: accumulated outstanding read requests (in cycles) to CXL memory.”…””}”(hj¯  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mûhj«  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¨  ubhá)”}”(hŒ5IN_RD_REQ: the number of read requests to CXL memory.”h]”hÌ)”}”(hjÄ  h]”hŒ5IN_RD_REQ: the number of read requests to CXL memory.”…””}”(hjÆ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MühjÂ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¨  ubhá)”}”(hŒ)CYCLES: NV-DLINK interface cycle counts.
”h]”hÌ)”}”(hŒ(CYCLES: NV-DLINK interface cycle counts.”h]”hŒ(CYCLES: NV-DLINK interface cycle counts.”…””}”(hjÝ  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´MýhjÙ  ubah}”(h]”h ]”h"]”h$]”h&]”uh1hàhj¨  ubeh}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´Mûhj¤  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´Mûhji  h²hubhÌ)”}”(hŒNThe events can be used to calculate the average latency of the read requests::”h]”hŒMThe events can be used to calculate the average latency of the read requests:”…””}”(hjý  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mÿhji  h²hubjR  )”}”(hŒ³DLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS

IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / DLINK_FREQ_IN_GHZ”h]”hŒ³DLINK_FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS

IN_RD_AVG_LATENCY_IN_CYCLES = IN_RD_CUM_OUTS / IN_RD_REQ
IN_RD_AVG_LATENCY_IN_NS = IN_RD_AVG_LATENCY_IN_CYCLES / DLINK_FREQ_IN_GHZ”…””}”hj  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´Mhji  h²hubhÌ)”}”(hŒExample usage:”h]”hŒExample usage:”…””}”(hj  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mhji  h²hubjÓ  )”}”(hŒ* Count read events to CXL memory::

    perf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}'”h]”hÜ)”}”(hhh]”há)”}”(hŒ}Count read events to CXL memory::

  perf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}'”h]”(hÌ)”}”(hŒ!Count read events to CXL memory::”h]”hŒ Count read events to CXL memory:”…””}”(hj2  h²hh³Nh´Nubah}”(h]”h ]”h"]”h$]”h&]”uh1hËh³hÊh´Mhj.  ubjR  )”}”(hŒXperf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}'”h]”hŒXperf stat -a -e '{nvidia_nvdlink_pmu_0/in_rd_req/,nvidia_nvdlink_pmu_0/in_rd_cum_outs/}'”…””}”hj@  sbah}”(h]”h ]”h"]”h$]”h&]”ja  jb  uh1jQ  h³hÊh´M
hj.  ubeh}”(h]”h ]”h"]”h$]”h&]”uh1hàhj+  ubah}”(h]”h ]”h"]”h$]”h&]”jŠ  j‹  uh1hÛh³hÊh´Mhj'  ubah}”(h]”h ]”h"]”h$]”h&]”uh1jÒ  h³hÊh´Mhji  h²hubeh}”(h]”Œnv-dlink-pmu”ah ]”h"]”Œnv-dlink pmu”ah$]”h&]”uh1hµhh·h²hh³hÊh´Mðubeh}”(h]”Œ:nvidia-tegra410-soc-uncore-performance-monitoring-unit-pmu”ah ]”h"]”Œ<nvidia tegra410 soc uncore performance monitoring unit (pmu)”ah$]”h&]”uh1hµhhh²hh³hÊh´Kubeh}”(h]”h ]”h"]”h$]”h&]”Œsource”hÊuh1hŒcurrent_source”NŒcurrent_line”NŒsettings”Œdocutils.frontend”ŒValues”“”)”}”(hºNŒ	generator”NŒ	datestamp”NŒsource_link”NŒ
source_url”NŒtoc_backlinks”Œentry”Œfootnote_backlinks”KŒsectnum_xform”KŒstrip_comments”NŒstrip_elements_with_classes”NŒstrip_classes”NŒreport_level”KŒ
halt_level”KŒexit_status_level”KŒdebug”NŒwarning_stream”NŒ	traceback”ˆŒinput_encoding”Œ	utf-8-sig”Œinput_encoding_error_handler”Œstrict”Œoutput_encoding”Œutf-8”Œoutput_encoding_error_handler”j“  Œerror_encoding”Œutf-8”Œerror_encoding_error_handler”Œbackslashreplace”Œlanguage_code”Œen”Œrecord_dependencies”NŒconfig”NŒ	id_prefix”hŒauto_id_prefix”Œid”Œdump_settings”NŒdump_internals”NŒdump_transforms”NŒdump_pseudo_xml”NŒexpose_internals”NŒstrict_visitor”NŒ_disable_config”NŒ_source”hÊŒ_destination”NŒ_config_files”]”Œ7/var/lib/git/docbuild/linux/Documentation/docutils.conf”aŒfile_insertion_enabled”ˆŒraw_enabled”KŒline_length_limit”M'Œpep_references”NŒpep_base_url”Œhttps://peps.python.org/”Œpep_file_url_template”Œpep-%04d”Œrfc_references”NŒrfc_base_url”Œ&https://datatracker.ietf.org/doc/html/”Œ	tab_width”KŒtrim_footnote_reference_space”‰Œsyntax_highlight”Œlong”Œsmart_quotes”ˆŒsmartquotes_locales”]”Œcharacter_level_inline_markup”‰Œdoctitle_xform”‰Œdocinfo_xform”KŒsectsubtitle_xform”‰Œimage_loading”Œlink”Œembed_stylesheet”‰Œcloak_email_addresses”ˆŒsection_self_link”‰Œenv”NubŒreporter”NŒindirect_targets”]”Œsubstitution_defs”}”Œsubstitution_names”}”Œrefnames”}”Œrefids”}”j„  ]”jy  asŒnameids”}”(jm  jj  j°  j­  j3  j0  jv	  js	  jj	  j„  ji	  jf	  j¿  j¼  j†  jƒ  j  j  jf  jc  je  jb  uŒ	nametypes”}”(jm  ‰j°  ‰j3  ‰jv	  ‰jj	  ˆji	  ‰j¿  ‰j†  ‰j  ‰jf  ‰je  ‰uh}”(jj  h·j­  jŒ  j0  j³  js	  j6  j„  j…  jf	  j…  j¼  jy	  jƒ  jÂ  j  j‰  jc  j  jb  ji  uŒfootnote_refs”}”Œcitation_refs”}”Œautofootnotes”]”Œautofootnote_refs”]”Œsymbol_footnotes”]”Œsymbol_footnote_refs”]”Œ	footnotes”]”Œ	citations”]”Œautofootnote_start”KŒsymbol_footnote_start”K Œ
id_counter”Œcollections”ŒCounter”“”}”…”R”Œparse_messages”]”Œtransform_messages”]”hŒsystem_message”“”)”}”(hhh]”hÌ)”}”(hhh]”hŒMHyperlink target "nvidia-t410-pcie-pmu-rc-mapping-section" is not referenced.”…””}”hjý  sbah}”(h]”h ]”h"]”h$]”h&]”uh1hËhjú  ubah}”(h]”h ]”h"]”h$]”h&]”Œlevel”KŒtype”ŒINFO”Œsource”hÊŒline”KÜuh1jø  ubaŒtransformer”NŒinclude_log”]”Œ
decoration”Nh²hub.