====================================================
PCI Express I/O Virtualization Resource on PowerNV
====================================================

Wei Yang <weiyang@linux.vnet.ibm.com>

Benjamin Herrenschmidt <benh@au1.ibm.com>

Bjorn Helgaas <bhelgaas@google.com>

26 Aug 2014

This document describes the hardware requirements for PCI MMIO resource
sizing and assignment on PowerKVM and how the generic PCI code handles
those requirements.  The first two sections describe the concept of
Partitionable Endpoints and their implementation on P8 (IODA2).  The last
two sections talk about the considerations for enabling SR-IOV on IODA2.

1. Introduction to Partitionable Endpoints
==========================================

A Partitionable Endpoint (PE) is a way to group the various resources
associated with a device or a set of devices to provide isolation between
partitions (i.e., filtering of DMA, MSIs etc.) and to provide a mechanism
to freeze a device that is causing errors in order to limit the
possibility of propagation of bad data.

There is thus, in HW, a table of PE states that contains a pair of
"frozen" state bits (one for MMIO and one for DMA; they get set together
but can be cleared independently) for each PE.

When a PE is frozen, all stores in any direction are dropped and all
loads return all 1's.  MSIs are also blocked.  There is a bit more state
that captures things like the details of the error that caused the
freeze, but that is not critical.

The interesting part is how the various PCIe transactions (MMIO, DMA,
...) are matched to their corresponding PEs.

The following section provides a rough description of what we have on P8
(IODA2).  Keep in mind that this is all per PHB (PCI host bridge).  Each
PHB is a completely separate HW entity that replicates the entire logic,
so it has its own set of PEs, etc.

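
The PE state table mentioned above can be pictured, very roughly, as an
array indexed by PE number.  The following is a purely illustrative
sketch; the names and layout are made up and do not correspond to the
actual hardware registers or to any kernel structure::

   /* Illustrative sketch only -- not the HW layout or a kernel API. */
   #define NUM_PES 256     /* per PHB on IODA2, see the next section */

   struct pe_state {
           unsigned int mmio_frozen : 1;   /* set together with dma_frozen, */
           unsigned int dma_frozen  : 1;   /* but each can be cleared alone */
   };

   static struct pe_state pe_state_table[NUM_PES];
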
2. Implementation of Partitionable Endpoints on P8 (IODA2)
============================================================

P8 supports up to 256 Partitionable Endpoints per PHB.

  * Inbound

    For DMA, MSIs and inbound PCIe error messages, we have a table (in
    memory but accessed in HW by the chip) that provides a direct
    correspondence between a PCIe RID (bus/dev/fn) and a PE number.  We
    call this the RTT.

    - For DMA we then provide an entire address space for each PE that
      can contain two "windows", depending on the value of PCI address
      bit 59.  Each window can be configured to be remapped via a "TCE
      table" (IOMMU translation table), which has various configurable
      characteristics not described here.

    - For MSIs, we have two windows in the address space (one at the top
      of the 32-bit space and one much higher) which, via a combination
      of the address and MSI value, will result in one of the 2048
      interrupts per bridge being triggered.  There is a PE# in the
      interrupt controller descriptor table as well, which is compared
      with the PE# obtained from the RTT to "authorize" the device to
      emit that specific interrupt.

    - Error messages just use the RTT.

  * Outbound.  That's where the tricky part is.

    Like other PCI host bridges, the Power8 IODA2 PHB supports "windows"
    from the CPU address space to the PCI address space.  There is one
    M32 window and sixteen M64 windows.  They have different
    characteristics.  First what they have in common: they forward a
    configurable portion of the CPU address space to the PCIe bus, and
    they must be naturally aligned and a power of two in size.  The rest
    is different:

    - The M32 window:

      * Is limited to 4GB in size.

      * Drops the top bits of the address (above the size) and replaces
        them with a configurable value.  This is typically used to
        generate 32-bit PCIe accesses.  We configure that window at boot
        from FW and don't touch it from Linux; it's usually set to
        forward a 2GB portion of address space from the CPU to PCIe
        0x8000_0000..0xffff_ffff.  (Note: the top 64KB are actually
        reserved for MSIs, but this is not a problem at this point; we
        just need to ensure Linux doesn't assign anything there.  The M32
        logic ignores that, however, and will forward in that space if we
        try.)

      * Is divided into 256 segments of equal size.  A table in the chip
        maps each segment to a PE#.  That allows portions of the MMIO
        space to be assigned to PEs on a segment granularity.  For a 2GB
        window, the segment granularity is 2GB/256 = 8MB (see the sketch
        at the end of this section).

      Now, this is the "main" window we use in Linux today (excluding
      SR-IOV).  We basically use the trick of forcing the bridge MMIO
      windows onto a segment alignment/granularity so that the space
      behind a bridge can be assigned to a PE.

      Ideally we would like to be able to have individual functions in
      PEs, but that would mean using a completely different address
      allocation scheme where individual function BARs can be "grouped"
      to fit in one or more segments.

    - The M64 windows:

      * Must be at least 256MB in size.

      * Do not translate addresses (the address on PCIe is the same as
        the address on the PowerBus).  There is a way to also set the top
        14 bits, which are not conveyed by PowerBus, but we don't use
        this.

      * Can be configured to be segmented.  When not segmented, we can
        specify the PE# for the entire window.  When segmented, a window
        has 256 segments; however, there is no table for mapping a
        segment to a PE#.  The segment number *is* the PE#.

      * Support overlaps.  If an address is covered by multiple windows,
        there's a defined ordering for which window applies.

    We have code (fairly new compared to the M32 stuff) that exploits
    that for large BARs in 64-bit space:

    We configure an M64 window to cover the entire region of address
    space that has been assigned by FW for the PHB (about 64GB; ignore
    the space for the M32, it comes out of a different "reserve").  We
    configure it as segmented.

    Then we do the same thing as with M32, using the bridge alignment
    trick, to match to those giant segments.

    Since we cannot remap, we have two additional constraints:

    - We do the PE# allocation *after* the 64-bit space has been
      assigned, because the addresses we use directly determine the PE#.
      We then update the M32 PE# for the devices that use both 32-bit and
      64-bit spaces, or assign the remaining PE#s to 32-bit-only devices.

    - We cannot "group" segments in HW, so if a device ends up using more
      than one segment, we end up with more than one PE#.  There is a HW
      mechanism to make the freeze state cascade to "companion" PEs, but
      that only works for PCIe error messages (typically used so that if
      you freeze a switch, it freezes all its children).  So we do it in
      SW.  We lose a bit of effectiveness of EEH in that case, but that's
      the best we found.  So when any of the PEs freezes, we freeze the
      other ones for that "domain".  We thus introduce the concept of a
      "master PE", which is the one used for DMA, MSIs, etc., and
      "secondary PEs" that are used for the remaining M64 segments.

    We would like to investigate using additional M64 windows in "single
    PE" mode to overlay over specific BARs to work around some of that,
    for example for devices with very large BARs, e.g., GPUs.  It would
    make sense, but we haven't done it yet.

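
To make the mapping rules above concrete, here is a sketch of how a PE#
could be derived on the inbound and outbound paths described in this
section.  This only illustrates the arithmetic; the names, types and
window parameters are made-up assumptions, not the powernv
implementation::

   #include <stdint.h>

   #define NUM_SEGMENTS 256

   /* Inbound: the RTT maps a RID (bus/dev/fn) directly to a PE#. */
   static uint16_t rtt[65536];               /* indexed by (bus << 8) | devfn */

   static int pe_for_rid(uint16_t rid)
   {
           return rtt[rid];
   }

   /* Outbound, M32: the segment index goes through a lookup table. */
   static uint8_t m32_segment_to_pe[NUM_SEGMENTS];

   static int m32_pe_for_addr(uint64_t addr, uint64_t base, uint64_t size)
   {
           uint64_t seg_size = size / NUM_SEGMENTS;  /* 2GB / 256 = 8MB */

           return m32_segment_to_pe[(addr - base) / seg_size];
   }

   /* Outbound, segmented M64: no table, the segment number is the PE#. */
   static int m64_pe_for_addr(uint64_t addr, uint64_t base, uint64_t size)
   {
           uint64_t seg_size = size / NUM_SEGMENTS;  /* e.g. 64GB / 256 = 256MB */

           return (int)((addr - base) / seg_size);
   }
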
3. Considerations for SR-IOV on PowerKVM
==========================================

  * SR-IOV Background

    The PCIe SR-IOV feature allows a single Physical Function (PF) to
    support several Virtual Functions (VFs).  Registers in the PF's
    SR-IOV Capability control the number of VFs and whether they are
    enabled.

    When VFs are enabled, they appear in Configuration Space like normal
    PCI devices, but the BARs in VF config space headers are unusual.
    For a non-VF device, software uses BARs in the config space header to
    discover the BAR sizes and assign addresses for them.  For VF
    devices, software uses VF BAR registers in the *PF* SR-IOV Capability
    to discover sizes and assign addresses.  The BARs in the VF's config
    space header are read-only zeros.

    When a VF BAR in the PF SR-IOV Capability is programmed, it sets the
    base address for all the corresponding VF(n) BARs.  For example, if
    the PF SR-IOV Capability is programmed to enable eight VFs, and it
    has a 1MB VF BAR0, the address in that VF BAR sets the base of an 8MB
    region.  This region is divided into eight contiguous 1MB regions,
    each of which is a BAR0 for one of the VFs.  Note that even though
    the VF BAR describes an 8MB region, the alignment requirement is for
    a single VF, i.e., 1MB in this example.

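
    The arithmetic in this example can be sketched as follows; this is
    only an illustration of what the SR-IOV spec implies, with made-up
    names rather than a kernel interface::

       #include <stdint.h>

       /* Sketch of the example above: eight VFs with a 1MB VF BAR0. */
       static const uint64_t vf_bar0_size = 1ULL << 20;  /* 1MB per VF       */
       static const unsigned int num_vfs  = 8;           /* 8MB region total */

       /* BAR0 of VF(n) sits at a fixed offset from the address programmed
        * into the PF's VF BAR0 register. */
       static uint64_t vf_n_bar0(uint64_t vf_bar0_base, unsigned int n)
       {
               return vf_bar0_base + (uint64_t)n * vf_bar0_size;
       }

       /* Only vf_bar0_base itself must be 1MB (vf_bar0_size) aligned,
        * not 8MB (num_vfs * vf_bar0_size) aligned. */
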
  There are several strategies for isolating VFs in PEs:

  - M32 window: There's one M32 window, and it is split into 256
    equally-sized segments.  The finest granularity possible is a 256MB
    window with 1MB segments.

    VF BARs that are 1MB or larger could be mapped to separate PEs in
    this window.  Each segment can be individually mapped to a PE via the
    lookup table, so this is quite flexible, but it works best when all
    the VF BARs are the same size.  If they are different sizes, the
    entire window has to be small enough that the segment size matches
    the smallest VF BAR, which means larger VF BARs span several
    segments.

  - Non-segmented M64 window: A non-segmented M64 window is mapped
    entirely to a single PE, so it could only isolate one VF.

  - Single segmented M64 windows: A segmented M64 window could be used
    just like the M32 window, but the segments can't be individually
    mapped to PEs (the segment number is the PE#), so there isn't as much
    flexibility.  A VF with multiple BARs would have to be in a "domain"
    of multiple PEs, which is not as well isolated as a single PE.

  - Multiple segmented M64 windows: As usual, each window is split into
    256 equally-sized segments, and the segment number is the PE#.  But
    if we use several M64 windows, they can be set to different base
    addresses and different segment sizes.  If we have VFs that each have
    a 1MB BAR and a 32MB BAR, we could use one M64 window to assign 1MB
    segments and another M64 window to assign 32MB segments (see the
    sketch after this list).

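
  As a rough sketch of the multiple-window strategy just described (the
  numbers and names below are illustrative assumptions, not how the
  powernv code is structured): each VF BAR gets its own segmented M64
  window whose segment size equals that BAR's size, and the window bases
  are chosen so that VF(i) decodes to the same segment number, and thus
  the same PE#, in both windows::

     #include <stdint.h>

     /* Two VF BARs per VF (illustrative): a 1MB one and a 32MB one. */
     static const uint64_t small_bar_size = 1ULL << 20;   /* window 0, 1MB segments  */
     static const uint64_t large_bar_size = 32ULL << 20;  /* window 1, 32MB segments */

     /*
      * If the VF(n) spaces start at segment 'first_pe' of each window, an
      * access to VF(i) falls in segment (first_pe + i) of either window,
      * i.e., both BARs of VF(i) map to the single PE# (first_pe + i).
      */
     static uint64_t vf_small_bar(uint64_t win0_base, unsigned int first_pe,
                                  unsigned int i)
     {
             return win0_base + (uint64_t)(first_pe + i) * small_bar_size;
     }

     static uint64_t vf_large_bar(uint64_t win1_base, unsigned int first_pe,
                                  unsigned int i)
     {
             return win1_base + (uint64_t)(first_pe + i) * large_bar_size;
     }
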
  Finally, we plan to use M64 windows for SR-IOV, which will be described
  in more detail in the next two sections.  For a given VF BAR, we need
  to effectively reserve the entire 256 segments (256 * VF BAR size) and
  position the VF BAR to start at the beginning of a free range of
  segments/PEs inside that M64 window.

  The goal is of course to be able to give a separate PE for each VF.

  The IODA2 platform has 16 M64 windows, which are used to map MMIO
  ranges to PE#s.  Each M64 window defines one MMIO range, and this range
  is divided into 256 segments, with each segment corresponding to one
  PE.

  We decided to leverage this M64 window to map VFs to individual PEs,
  since SR-IOV VF BARs are all the same size.

  But doing so introduces another problem: total_VFs is usually smaller
  than the number of M64 window segments, so if we map one VF BAR
  directly to one M64 window, some part of the M64 window will map to
  another device's MMIO range.

  IODA supports 256 PEs, so segmented windows contain 256 segments, which
  means that if total_VFs is less than 256, we have the situation shown
  in Figure 1.0, where segments [total_VFs, 255] of the M64 window may
  map to some MMIO range on other devices::

     0      1                     total_VFs - 1
     +------+------+-     -+------+------+
     |      |      |  ...  |      |      |
     +------+------+-     -+------+------+

                           VF(n) BAR space

     0      1                     total_VFs - 1                255
     +------+------+-     -+------+------+-      -+------+------+
     |      |      |  ...  |      |      |   ...  |      |      |
     +------+------+-     -+------+------+-      -+------+------+

                           M64 window

                Figure 1.0 Direct map VF(n) BAR space

  Our current solution is to allocate 256 segments even if the VF(n) BAR
  space doesn't need that much, as shown in Figure 1.1::

     0      1                     total_VFs - 1                255
     +------+------+-     -+------+------+-      -+------+------+
     |      |      |  ...  |      |      |   ...  |      |      |
     +------+------+-     -+------+------+-      -+------+------+

                           VF(n) BAR space + extra

     0      1                     total_VFs - 1                255
     +------+------+-     -+------+------+-      -+------+------+
     |      |      |  ...  |      |      |   ...  |      |      |
     +------+------+-     -+------+------+-      -+------+------+

                           M64 window

                Figure 1.1 Map VF(n) BAR space + extra

  Allocating the extra space ensures that the entire M64 window will be
  assigned to this one SR-IOV device and none of the space will be
  available for other devices.  Note that this only expands the space
  reserved in software; there are still only total_VFs VFs, and they only
  respond to segments [0, total_VFs - 1].  There's nothing in hardware
  that responds to segments [total_VFs, 255].

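
  A minimal sketch of the reservation arithmetic above, assuming one
  segment per VF (the names are made up, not a kernel interface)::

     #include <stdint.h>

     #define NUM_SEGMENTS 256

     /* Space the VFs actually use: one segment per VF. */
     static uint64_t vf_bar_space(uint64_t vf_bar_size, unsigned int total_vfs)
     {
             return vf_bar_size * total_vfs;
     }

     /* Space we reserve so that the whole M64 window belongs to this one
      * device; segments [total_vfs, 255] are covered by the reservation,
      * but nothing in hardware responds there. */
     static uint64_t vf_bar_reservation(uint64_t vf_bar_size)
     {
             return vf_bar_size * NUM_SEGMENTS;
     }
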
4. Implications for the Generic PCI Code
==========================================

The PCIe SR-IOV spec requires that the base of the VF(n) BAR space be
aligned to the size of an individual VF BAR.

In IODA2, the MMIO address determines the PE#.  If the address is in an
M32 window, we can set the PE# by updating the table that translates
segments to PE#s.  Similarly, if the address is in an unsegmented M64
window, we can set the PE# for the window.  But if it's in a segmented
M64 window, the segment number is the PE#.

Therefore, the only way to control the PE# for a VF is to change the base
of the VF(n) BAR space in the VF BAR.  If the PCI core allocates the
exact amount of space required for the VF(n) BAR space, the VF BAR value
is fixed and cannot be changed.

On the other hand, if the PCI core allocates additional space, the VF BAR
value can be changed as long as the entire VF(n) BAR space remains inside
the space allocated by the core.

Ideally the segment size will be the same as an individual VF BAR size.
Then each VF will be in its own PE.  The VF BARs (and therefore the PE#s)
are contiguous.  If VF0 is in PE(x), then VF(n) is in PE(x+n).  If we
allocate 256 segments, there are (256 - numVFs) choices for the PE# of
VF0.

If the segment size is smaller than the VF BAR size, it will take several
segments to cover a VF BAR, and a VF will be in several PEs.  This is
possible, but the isolation isn't as good, and it reduces the number of
PE# choices: instead of consuming only numVFs segments, the VF(n) BAR
space will consume (numVFs * n) segments, where n is the number of
segments needed to cover one VF BAR.  That means there aren't as many
available segments for adjusting the base of the VF(n) BAR space.

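
As a rough illustration of the ideal case above (a sketch assuming the
segment size equals the VF BAR size; the names are made up and this is
not the generic PCI API), choosing the PE# of VF0 amounts to choosing
where the VF(n) BAR space starts inside the 256-segment reservation::

   #include <assert.h>
   #include <stdint.h>

   #define NUM_SEGMENTS 256

   /*
    * Ideal case: segment size == VF BAR size, so segment number == PE#.
    * Placing the VF(n) BAR space at offset pe_of_vf0 * seg_size puts VF0
    * in PE(pe_of_vf0) and VF(n) in PE(pe_of_vf0 + n).
    */
   static uint64_t vf_space_offset(uint64_t seg_size, unsigned int pe_of_vf0,
                                   unsigned int num_vfs)
   {
           /* The whole VF(n) BAR space must stay inside the window. */
           assert(pe_of_vf0 + num_vfs <= NUM_SEGMENTS);

           return (uint64_t)pe_of_vf0 * seg_size;
   }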