aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/sysfs-class-net-queues23
-rw-r--r--Documentation/devicetree/bindings/leds/leds-pwm-multicolor.yaml2
-rw-r--r--Documentation/devicetree/bindings/net/renesas,etheravb.yaml1
-rw-r--r--Documentation/devicetree/bindings/net/renesas,ethertsn.yaml2
-rw-r--r--Documentation/devicetree/bindings/net/ti,dp83822.yaml8
-rw-r--r--Documentation/devicetree/bindings/net/wireless/mediatek,mt76.yaml33
-rw-r--r--Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml1
-rw-r--r--Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml1
-rw-r--r--Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml1
-rw-r--r--Documentation/driver-api/dpll.rst2
-rw-r--r--Documentation/netlink/genetlink-c.yaml41
-rw-r--r--Documentation/netlink/genetlink-legacy.yaml41
-rw-r--r--Documentation/netlink/genetlink.yaml21
-rw-r--r--Documentation/netlink/netlink-raw.yaml22
-rw-r--r--Documentation/netlink/specs/dpll.yaml1
-rw-r--r--Documentation/netlink/specs/mptcp_pm.yaml3
-rw-r--r--Documentation/netlink/specs/netdev.yaml91
-rw-r--r--Documentation/netlink/specs/nlctrl.yaml206
-rw-r--r--Documentation/networking/device_drivers/ethernet/intel/ice.rst21
-rw-r--r--Documentation/networking/index.rst1
-rw-r--r--Documentation/networking/multi-pf-netdev.rst174
-rw-r--r--Documentation/networking/netconsole.rst8
-rw-r--r--Documentation/networking/sfp-phylink.rst147
-rw-r--r--Documentation/networking/statistics.rst15
-rw-r--r--Documentation/virt/hyperv/index.rst1
-rw-r--r--Documentation/virt/hyperv/vpci.rst316
26 files changed, 1128 insertions, 55 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-net-queues b/Documentation/ABI/testing/sysfs-class-net-queues
index 5bff64d256c207..84aa25e0d14d12 100644
--- a/Documentation/ABI/testing/sysfs-class-net-queues
+++ b/Documentation/ABI/testing/sysfs-class-net-queues
@@ -96,3 +96,26 @@ Description:
Indicates the absolute minimum limit of bytes allowed to be
queued on this network device transmit queue. Default value is
0.
+
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_thrs
+Date: Jan 2024
+KernelVersion: 6.9
+Contact: netdev@vger.kernel.org
+Description:
+ Tx completion stall detection threshold in ms. Kernel will
+ guarantee to detect all stalls longer than this threshold but
+ may also detect stalls longer than half of the threshold.
+
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_cnt
+Date: Jan 2024
+KernelVersion: 6.9
+Contact: netdev@vger.kernel.org
+Description:
+ Number of detected Tx completion stalls.
+
+What: /sys/class/net/<iface>/queues/tx-<queue>/byte_queue_limits/stall_max
+Date: Jan 2024
+KernelVersion: 6.9
+Contact: netdev@vger.kernel.org
+Description:
+ Longest detected Tx completion stall. Write 0 to clear.
diff --git a/Documentation/devicetree/bindings/leds/leds-pwm-multicolor.yaml b/Documentation/devicetree/bindings/leds/leds-pwm-multicolor.yaml
index 5edfbe347341cd..a31a202afe5ccf 100644
--- a/Documentation/devicetree/bindings/leds/leds-pwm-multicolor.yaml
+++ b/Documentation/devicetree/bindings/leds/leds-pwm-multicolor.yaml
@@ -41,6 +41,8 @@ properties:
pwm-names: true
+ active-low: true
+
color: true
required:
diff --git a/Documentation/devicetree/bindings/net/renesas,etheravb.yaml b/Documentation/devicetree/bindings/net/renesas,etheravb.yaml
index 890f7858d0dc4c..de7ba7f345a937 100644
--- a/Documentation/devicetree/bindings/net/renesas,etheravb.yaml
+++ b/Documentation/devicetree/bindings/net/renesas,etheravb.yaml
@@ -46,6 +46,7 @@ properties:
- enum:
- renesas,etheravb-r8a779a0 # R-Car V3U
- renesas,etheravb-r8a779g0 # R-Car V4H
+ - renesas,etheravb-r8a779h0 # R-Car V4M
- const: renesas,etheravb-rcar-gen4 # R-Car Gen4
- items:
diff --git a/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml b/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml
index 475aff7714d641..ea35d19be829a3 100644
--- a/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml
+++ b/Documentation/devicetree/bindings/net/renesas,ethertsn.yaml
@@ -65,9 +65,11 @@ properties:
rx-internal-delay-ps:
enum: [0, 1800]
+ default: 0
tx-internal-delay-ps:
enum: [0, 2000]
+ default: 0
'#address-cells':
const: 1
diff --git a/Documentation/devicetree/bindings/net/ti,dp83822.yaml b/Documentation/devicetree/bindings/net/ti,dp83822.yaml
index 8f23254c0458f7..784866ea392b20 100644
--- a/Documentation/devicetree/bindings/net/ti,dp83822.yaml
+++ b/Documentation/devicetree/bindings/net/ti,dp83822.yaml
@@ -84,10 +84,10 @@ properties:
description: |
If present, select the RMII operation mode. Two modes are
available:
- - RMII master, where the PHY operates from a 25MHz clock reference,
- provided by a crystal or a CMOS-level oscillator
- - RMII slave, where the PHY operates from a 50MHz clock reference,
- provided by a CMOS-level oscillator
+ - RMII master, where the PHY outputs a 50MHz reference clock which can
+ be connected to the MAC.
+ - RMII slave, where the PHY expects a 50MHz reference clock input
+ shared with the MAC.
The RMII operation mode can also be configured by its straps.
If the strap pin is not set correctly or not set at all, then this can be
used to configure it.
diff --git a/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.yaml b/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.yaml
index 252207adbc54c5..eabceb849537c4 100644
--- a/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.yaml
+++ b/Documentation/devicetree/bindings/net/wireless/mediatek,mt76.yaml
@@ -19,9 +19,6 @@ description: |
Alternatively, it can specify the wireless part of the MT7628/MT7688
or MT7622/MT7986 SoC.
-allOf:
- - $ref: ieee80211.yaml#
-
properties:
compatible:
enum:
@@ -38,7 +35,12 @@ properties:
MT7986 should contain 3 regions consys, dcm, and sku, in this order.
interrupts:
- maxItems: 1
+ minItems: 1
+ items:
+ - description: major interrupt for rings
+ - description: additional interrupt for ring 19
+ - description: additional interrupt for ring 4
+ - description: additional interrupt for ring 5
power-domains:
maxItems: 1
@@ -217,6 +219,24 @@ required:
- compatible
- reg
+allOf:
+ - $ref: ieee80211.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - mediatek,mt7981-wmac
+ - mediatek,mt7986-wmac
+ then:
+ properties:
+ interrupts:
+ minItems: 4
+ else:
+ properties:
+ interrupts:
+ maxItems: 1
+
unevaluatedProperties: false
examples:
@@ -293,7 +313,10 @@ examples:
reg = <0x18000000 0x1000000>,
<0x10003000 0x1000>,
<0x11d10000 0x1000>;
- interrupts = <GIC_SPI 213 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <GIC_SPI 213 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 214 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 215 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 216 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&topckgen 50>,
<&topckgen 62>;
clock-names = "mcu", "ap2conn";
diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml
index 7758a55dd32866..9b3ef4bc373258 100644
--- a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml
+++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml
@@ -8,6 +8,7 @@ title: Qualcomm Technologies ath10k wireless devices
maintainers:
- Kalle Valo <kvalo@kernel.org>
+ - Jeff Johnson <jjohnson@kernel.org>
description:
Qualcomm Technologies, Inc. IEEE 802.11ac devices.
diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml
index 817f02a8b481d8..41d023797d7d3b 100644
--- a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml
+++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml
@@ -9,6 +9,7 @@ title: Qualcomm Technologies ath11k wireless devices (PCIe)
maintainers:
- Kalle Valo <kvalo@kernel.org>
+ - Jeff Johnson <jjohnson@kernel.org>
description: |
Qualcomm Technologies IEEE 802.11ax PCIe devices
diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml
index 7d5f982a3d09db..672282cdfc2fc4 100644
--- a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml
+++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml
@@ -9,6 +9,7 @@ title: Qualcomm Technologies ath11k wireless devices
maintainers:
- Kalle Valo <kvalo@kernel.org>
+ - Jeff Johnson <jjohnson@kernel.org>
description: |
These are dt entries for Qualcomm Technologies, Inc. IEEE 802.11ax
diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst
index e3d593841aa7dd..ea8d16600e16a8 100644
--- a/Documentation/driver-api/dpll.rst
+++ b/Documentation/driver-api/dpll.rst
@@ -545,7 +545,7 @@ In such scenario, dpll device input signal shall be also configurable
to drive dpll with signal recovered from the PHY netdevice.
This is done by exposing a pin to the netdevice - attaching pin to the
netdevice itself with
-``netdev_dpll_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``.
+``dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin)``.
Exposed pin id handle ``DPLL_A_PIN_ID`` is then identifiable by the user
as it is attached to rtnetlink respond to get ``RTM_NEWLINK`` command in
nested attribute ``IFLA_DPLL_PIN``.
diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml
index c58f7153fcf828..4dfd899a1661ce 100644
--- a/Documentation/netlink/genetlink-c.yaml
+++ b/Documentation/netlink/genetlink-c.yaml
@@ -11,7 +11,7 @@ $defs:
minimum: 0
len-or-define:
type: [ string, integer ]
- pattern: ^[0-9A-Za-z_]+( - 1)?$
+ pattern: ^[0-9A-Za-z_-]+( - 1)?$
minimum: 0
len-or-limit:
# literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc.
@@ -126,8 +126,9 @@ properties:
Prefix for the C enum name of the attributes. Default family[name]-set[name]-a-
type: string
enum-name:
- description: Name for the enum type of the attribute.
- type: string
+ description: |
+ Name for the enum type of the attribute, if empty no name will be used.
+ type: [ string, "null" ]
doc:
description: Documentation of the space.
type: string
@@ -208,6 +209,11 @@ properties:
exact-len:
description: Exact length for a string or a binary attribute.
$ref: '#/$defs/len-or-define'
+ unterminated-ok:
+ description: |
+ For string attributes, do not check whether attribute
+ contains the terminating null character.
+ type: boolean
sub-type: *attr-type
display-hint: &display-hint
description: |
@@ -261,14 +267,16 @@ properties:
the prefix with the upper case name of the command, with dashes replaced by underscores.
type: string
enum-name:
- description: Name for the enum type with commands.
- type: string
+ description: |
+ Name for the enum type with commands, if empty no name will be used.
+ type: [ string, "null" ]
async-prefix:
description: Same as name-prefix but used to render notifications and events to separate enum.
type: string
async-enum:
- description: Name for the enum type with notifications/events.
- type: string
+ description: |
+ Name for the enum type with commands, if empty no name will be used.
+ type: [ string, "null" ]
list:
description: List of commands
type: array
@@ -370,3 +378,22 @@ properties:
type: string
# End genetlink-c
flags: *cmd_flags
+
+ kernel-family:
+ description: Additional global attributes used for kernel C code generation.
+ type: object
+ additionalProperties: False
+ properties:
+ headers:
+ description: |
+ List of extra headers which should be included in the source
+ of the generated code.
+ type: array
+ items:
+ type: string
+ sock-priv:
+ description: |
+ Literal name of the type which is used within the kernel
+ to store the socket state. The type / structure is internal
+ to the kernel, and is not defined in the spec.
+ type: string
diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml
index 938703088306ba..b48ad3b1cc32e2 100644
--- a/Documentation/netlink/genetlink-legacy.yaml
+++ b/Documentation/netlink/genetlink-legacy.yaml
@@ -11,7 +11,7 @@ $defs:
minimum: 0
len-or-define:
type: [ string, integer ]
- pattern: ^[0-9A-Za-z_]+( - 1)?$
+ pattern: ^[0-9A-Za-z_-]+( - 1)?$
minimum: 0
len-or-limit:
# literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc.
@@ -168,8 +168,9 @@ properties:
Prefix for the C enum name of the attributes. Default family[name]-set[name]-a-
type: string
enum-name:
- description: Name for the enum type of the attribute.
- type: string
+ description: |
+ Name for the enum type of the attribute, if empty no name will be used.
+ type: [ string, "null" ]
doc:
description: Documentation of the space.
type: string
@@ -251,6 +252,11 @@ properties:
exact-len:
description: Exact length for a string or a binary attribute.
$ref: '#/$defs/len-or-define'
+ unterminated-ok:
+ description: |
+ For string attributes, do not check whether attribute
+ contains the terminating null character.
+ type: boolean
sub-type: *attr-type
display-hint: *display-hint
# Start genetlink-c
@@ -304,14 +310,16 @@ properties:
the prefix with the upper case name of the command, with dashes replaced by underscores.
type: string
enum-name:
- description: Name for the enum type with commands.
- type: string
+ description: |
+ Name for the enum type with commands, if empty no name will be used.
+ type: [ string, "null" ]
async-prefix:
description: Same as name-prefix but used to render notifications and events to separate enum.
type: string
async-enum:
- description: Name for the enum type with notifications/events.
- type: string
+ description: |
+ Name for the enum type with commands, if empty no name will be used.
+ type: [ string, "null" ]
# Start genetlink-legacy
fixed-header: &fixed-header
description: |
@@ -431,3 +439,22 @@ properties:
type: string
# End genetlink-c
flags: *cmd_flags
+
+ kernel-family:
+ description: Additional global attributes used for kernel C code generation.
+ type: object
+ additionalProperties: False
+ properties:
+ headers:
+ description: |
+ List of extra headers which should be included in the source
+ of the generated code.
+ type: array
+ items:
+ type: string
+ sock-priv:
+ description: |
+ Literal name of the type which is used within the kernel
+ to store the socket state. The type / structure is internal
+ to the kernel, and is not defined in the spec.
+ type: string
diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml
index 3283bf458ff13d..ebd6ee743fccc2 100644
--- a/Documentation/netlink/genetlink.yaml
+++ b/Documentation/netlink/genetlink.yaml
@@ -11,7 +11,7 @@ $defs:
minimum: 0
len-or-define:
type: [ string, integer ]
- pattern: ^[0-9A-Za-z_]+( - 1)?$
+ pattern: ^[0-9A-Za-z_-]+( - 1)?$
minimum: 0
len-or-limit:
# literal int or limit based on fixed-width type e.g. u8-min, u16-max, etc.
@@ -328,3 +328,22 @@ properties:
The name for the group, used to form the define and the value of the define.
type: string
flags: *cmd_flags
+
+ kernel-family:
+ description: Additional global attributes used for kernel C code generation.
+ type: object
+ additionalProperties: False
+ properties:
+ headers:
+ description: |
+ List of extra headers which should be included in the source
+ of the generated code.
+ type: array
+ items:
+ type: string
+ sock-priv:
+ description: |
+ Literal name of the type which is used within the kernel
+ to store the socket state. The type / structure is internal
+ to the kernel, and is not defined in the spec.
+ type: string
diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml
index ac4e05415f2f5e..a76e54cbadbc44 100644
--- a/Documentation/netlink/netlink-raw.yaml
+++ b/Documentation/netlink/netlink-raw.yaml
@@ -11,7 +11,7 @@ $defs:
minimum: 0
len-or-define:
type: [ string, integer ]
- pattern: ^[0-9A-Za-z_]+( - 1)?$
+ pattern: ^[0-9A-Za-z_-]+( - 1)?$
minimum: 0
# Schema for specs
@@ -189,8 +189,9 @@ properties:
Prefix for the C enum name of the attributes. Default family[name]-set[name]-a-
type: string
enum-name:
- description: Name for the enum type of the attribute.
- type: string
+ description: |
+ Name for the enum type of the attribute, if empty no name will be used.
+ type: [ string, "null" ]
doc:
description: Documentation of the space.
type: string
@@ -270,6 +271,11 @@ properties:
exact-len:
description: Exact length for a string or a binary attribute.
$ref: '#/$defs/len-or-define'
+ unterminated-ok:
+ description: |
+ For string attributes, do not check whether attribute
+ contains the terminating null character.
+ type: boolean
sub-type: *attr-type
display-hint: *display-hint
# Start genetlink-c
@@ -371,14 +377,16 @@ properties:
the prefix with the upper case name of the command, with dashes replaced by underscores.
type: string
enum-name:
- description: Name for the enum type with commands.
- type: string
+ description: |
+ Name for the enum type with commands, if empty no name will be used.
+ type: [ string, "null" ]
async-prefix:
description: Same as name-prefix but used to render notifications and events to separate enum.
type: string
async-enum:
- description: Name for the enum type with notifications/events.
- type: string
+ description: |
+ Name for the enum type with commands, if empty no name will be used.
+ type: [ string, "null" ]
# Start genetlink-legacy
fixed-header: &fixed-header
description: |
diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index 8dc1df5cfae746..95b0eb1486bf7e 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -312,6 +312,7 @@ attribute-sets:
-
name: capabilities
type: u32
+ enum: pin-capabilities
-
name: parent-device
type: nest
diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml
index 49f90cfb469894..af525ed2979234 100644
--- a/Documentation/netlink/specs/mptcp_pm.yaml
+++ b/Documentation/netlink/specs/mptcp_pm.yaml
@@ -292,13 +292,14 @@ operations:
-
name: get-addr
doc: Get endpoint information
- attribute-set: endpoint
+ attribute-set: attr
dont-validate: [ strict ]
flags: [ uns-admin-perm ]
do: &get-addr-attrs
request:
attributes:
- addr
+ - token
reply:
attributes:
- addr
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 3addac97068048..76352dbd2be4eb 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -74,6 +74,10 @@ definitions:
name: queue-type
type: enum
entries: [ rx, tx ]
+ -
+ name: qstats-scope
+ type: flags
+ entries: [ queue ]
attribute-sets:
-
@@ -265,6 +269,73 @@ attribute-sets:
doc: ID of the NAPI instance which services this queue.
type: u32
+ -
+ name: qstats
+ doc: |
+ Get device statistics, scoped to a device or a queue.
+ These statistics extend (and partially duplicate) statistics available
+ in struct rtnl_link_stats64.
+ Value of the `scope` attribute determines how statistics are
+ aggregated. When aggregated for the entire device the statistics
+ represent the total number of events since last explicit reset of
+ the device (i.e. not a reconfiguration like changing queue count).
+ When reported per-queue, however, the statistics may not add
+ up to the total number of events, will only be reported for currently
+ active objects, and will likely report the number of events since last
+ reconfiguration.
+ attributes:
+ -
+ name: ifindex
+ doc: ifindex of the netdevice to which stats belong.
+ type: u32
+ checks:
+ min: 1
+ -
+ name: queue-type
+ doc: Queue type as rx, tx, for queue-id.
+ type: u32
+ enum: queue-type
+ -
+ name: queue-id
+ doc: Queue ID, if stats are scoped to a single queue instance.
+ type: u32
+ -
+ name: scope
+ doc: |
+ What object type should be used to iterate over the stats.
+ type: uint
+ enum: qstats-scope
+ -
+ name: rx-packets
+ doc: |
+ Number of wire packets successfully received and passed to the stack.
+ For drivers supporting XDP, XDP is considered the first layer
+ of the stack, so packets consumed by XDP are still counted here.
+ type: uint
+ value: 8 # reserve some attr ids in case we need more metadata later
+ -
+ name: rx-bytes
+ doc: Successfully received bytes, see `rx-packets`.
+ type: uint
+ -
+ name: tx-packets
+ doc: |
+ Number of wire packets successfully sent. Packet is considered to be
+ successfully sent once it is in device memory (usually this means
+ the device has issued a DMA completion for the packet).
+ type: uint
+ -
+ name: tx-bytes
+ doc: Successfully sent bytes, see `tx-packets`.
+ type: uint
+ -
+ name: rx-alloc-fail
+ doc: |
+ Number of times skb or buffer allocation failed on the Rx datapath.
+ Allocation failure may, or may not result in a packet drop, depending
+ on driver implementation and whether system recovers quickly.
+ type: uint
+
operations:
list:
-
@@ -405,6 +476,26 @@ operations:
attributes:
- ifindex
reply: *napi-get-op
+ -
+ name: qstats-get
+ doc: |
+ Get / dump fine grained statistics. Which statistics are reported
+ depends on the device and the driver, and whether the driver stores
+ software counters per-queue.
+ attribute-set: qstats
+ dump:
+ request:
+ attributes:
+ - scope
+ reply:
+ attributes:
+ - ifindex
+ - queue-type
+ - queue-id
+ - rx-packets
+ - rx-bytes
+ - tx-packets
+ - tx-bytes
mcast-groups:
list:
diff --git a/Documentation/netlink/specs/nlctrl.yaml b/Documentation/netlink/specs/nlctrl.yaml
new file mode 100644
index 00000000000000..b1632b95f725fe
--- /dev/null
+++ b/Documentation/netlink/specs/nlctrl.yaml
@@ -0,0 +1,206 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+
+name: nlctrl
+protocol: genetlink-legacy
+uapi-header: linux/genetlink.h
+
+doc: |
+ genetlink meta-family that exposes information about all genetlink
+ families registered in the kernel (including itself).
+
+definitions:
+ -
+ name: op-flags
+ type: flags
+ enum-name:
+ entries:
+ - admin-perm
+ - cmd-cap-do
+ - cmd-cap-dump
+ - cmd-cap-haspol
+ - uns-admin-perm
+ -
+ name: attr-type
+ enum-name: netlink-attribute-type
+ type: enum
+ entries:
+ - invalid
+ - flag
+ - u8
+ - u16
+ - u32
+ - u64
+ - s8
+ - s16
+ - s32
+ - s64
+ - binary
+ - string
+ - nul-string
+ - nested
+ - nested-array
+ - bitfield32
+ - sint
+ - uint
+
+attribute-sets:
+ -
+ name: ctrl-attrs
+ name-prefix: ctrl-attr-
+ attributes:
+ -
+ name: family-id
+ type: u16
+ -
+ name: family-name
+ type: string
+ -
+ name: version
+ type: u32
+ -
+ name: hdrsize
+ type: u32
+ -
+ name: maxattr
+ type: u32
+ -
+ name: ops
+ type: array-nest
+ nested-attributes: op-attrs
+ -
+ name: mcast-groups
+ type: array-nest
+ nested-attributes: mcast-group-attrs
+ -
+ name: policy
+ type: nest-type-value
+ type-value: [ policy-id, attr-id ]
+ nested-attributes: policy-attrs
+ -
+ name: op-policy
+ type: nest-type-value
+ type-value: [ op-id ]
+ nested-attributes: op-policy-attrs
+ -
+ name: op
+ type: u32
+ -
+ name: mcast-group-attrs
+ name-prefix: ctrl-attr-mcast-grp-
+ enum-name:
+ attributes:
+ -
+ name: name
+ type: string
+ -
+ name: id
+ type: u32
+ -
+ name: op-attrs
+ name-prefix: ctrl-attr-op-
+ enum-name:
+ attributes:
+ -
+ name: id
+ type: u32
+ -
+ name: flags
+ type: u32
+ enum: op-flags
+ enum-as-flags: true
+ -
+ name: policy-attrs
+ name-prefix: nl-policy-type-attr-
+ enum-name:
+ attributes:
+ -
+ name: type
+ type: u32
+ enum: attr-type
+ -
+ name: min-value-s
+ type: s64
+ -
+ name: max-value-s
+ type: s64
+ -
+ name: min-value-u
+ type: u64
+ -
+ name: max-value-u
+ type: u64
+ -
+ name: min-length
+ type: u32
+ -
+ name: max-length
+ type: u32
+ -
+ name: policy-idx
+ type: u32
+ -
+ name: policy-maxtype
+ type: u32
+ -
+ name: bitfield32-mask
+ type: u32
+ -
+ name: mask
+ type: u64
+ -
+ name: pad
+ type: pad
+ -
+ name: op-policy-attrs
+ name-prefix: ctrl-attr-policy-
+ enum-name:
+ attributes:
+ -
+ name: do
+ type: u32
+ -
+ name: dump
+ type: u32
+
+operations:
+ enum-model: directional
+ name-prefix: ctrl-cmd-
+ list:
+ -
+ name: getfamily
+ doc: Get / dump genetlink families
+ attribute-set: ctrl-attrs
+ do:
+ request:
+ value: 3
+ attributes:
+ - family-name
+ reply: &all-attrs
+ value: 1
+ attributes:
+ - family-id
+ - family-name
+ - hdrsize
+ - maxattr
+ - mcast-groups
+ - ops
+ - version
+ dump:
+ reply: *all-attrs
+ -
+ name: getpolicy
+ doc: Get / dump genetlink policies
+ attribute-set: ctrl-attrs
+ dump:
+ request:
+ value: 10
+ attributes:
+ - family-name
+ - family-id
+ - op
+ reply:
+ value: 10
+ attributes:
+ - family-id
+ - op-policy
+ - policy
diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
index 5038e54586af66..934752f675ba40 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
@@ -368,15 +368,28 @@ more options for Receive Side Scaling (RSS) hash byte configuration.
# ethtool -N <ethX> rx-flow-hash <type> <option>
Where <type> is:
- tcp4 signifying TCP over IPv4
- udp4 signifying UDP over IPv4
- tcp6 signifying TCP over IPv6
- udp6 signifying UDP over IPv6
+ tcp4 signifying TCP over IPv4
+ udp4 signifying UDP over IPv4
+ gtpc4 signifying GTP-C over IPv4
+ gtpc4t signifying GTP-C (include TEID) over IPv4
+ gtpu4 signifying GTP-U over IPV4
+ gtpu4e signifying GTP-U and Extension Header over IPV4
+ gtpu4u signifying GTP-U PSC Uplink over IPV4
+ gtpu4d signifying GTP-U PSC Downlink over IPV4
+ tcp6 signifying TCP over IPv6
+ udp6 signifying UDP over IPv6
+ gtpc6 signifying GTP-C over IPv6
+ gtpc6t signifying GTP-C (include TEID) over IPv6
+ gtpu6 signifying GTP-U over IPV6
+ gtpu6e signifying GTP-U and Extension Header over IPV6
+ gtpu6u signifying GTP-U PSC Uplink over IPV6
+ gtpu6d signifying GTP-U PSC Downlink over IPV6
And <option> is one or more of:
s Hash on the IP source address of the Rx packet.
d Hash on the IP destination address of the Rx packet.
f Hash on bytes 0 and 1 of the Layer 4 header of the Rx packet.
n Hash on bytes 2 and 3 of the Layer 4 header of the Rx packet.
+ e Hash on GTP Packet on TEID (4bytes) of the Rx packet.
Accelerated Receive Flow Steering (aRFS)
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 69f3d6dcd9fd81..473d72c36d6175 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -74,6 +74,7 @@ Contents:
mpls-sysctl
mptcp-sysctl
multiqueue
+ multi-pf-netdev
napi
net_cachelines/index
netconsole
diff --git a/Documentation/networking/multi-pf-netdev.rst b/Documentation/networking/multi-pf-netdev.rst
new file mode 100644
index 00000000000000..be8e4bcadf1190
--- /dev/null
+++ b/Documentation/networking/multi-pf-netdev.rst
@@ -0,0 +1,174 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===============
+Multi-PF Netdev
+===============
+
+Contents
+========
+
+- `Background`_
+- `Overview`_
+- `mlx5 implementation`_
+- `Channels distribution`_
+- `Observability`_
+- `Steering`_
+- `Mutually exclusive features`_
+
+Background
+==========
+
+The Multi-PF NIC technology enables several CPUs within a multi-socket server to connect directly to
+the network, each through its own dedicated PCIe interface. Through either a connection harness that
+splits the PCIe lanes between two cards or by bifurcating a PCIe slot for a single card. This
+results in eliminating the network traffic traversing over the internal bus between the sockets,
+significantly reducing overhead and latency, in addition to reducing CPU utilization and increasing
+network throughput.
+
+Overview
+========
+
+The feature adds support for combining multiple PFs of the same port in a Multi-PF environment under
+one netdev instance. It is implemented in the netdev layer. Lower-layer instances like pci func,
+sysfs entry, and devlink are kept separate.
+Passing traffic through different devices belonging to different NUMA sockets saves cross-NUMA
+traffic and allows apps running on the same netdev from different NUMAs to still feel a sense of
+proximity to the device and achieve improved performance.
+
+mlx5 implementation
+===================
+
+Multi-PF or Socket-direct in mlx5 is achieved by grouping PFs together which belong to the same
+NIC and has the socket-direct property enabled, once all PFs are probed, we create a single netdev
+to represent all of them, symmetrically, we destroy the netdev whenever any of the PFs is removed.
+
+The netdev network channels are distributed between all devices, a proper configuration would utilize
+the correct close NUMA node when working on a certain app/CPU.
+
+We pick one PF to be a primary (leader), and it fills a special role. The other devices
+(secondaries) are disconnected from the network at the chip level (set to silent mode). In silent
+mode, no south <-> north traffic flowing directly through a secondary PF. It needs the assistance of
+the leader PF (east <-> west traffic) to function. All Rx/Tx traffic is steered through the primary
+to/from the secondaries.
+
+Currently, we limit the support to PFs only, and up to two PFs (sockets).
+
+Channels distribution
+=====================
+
+We distribute the channels between the different PFs to achieve local NUMA node performance
+on multiple NUMA nodes.
+
+Each combined channel works against one specific PF, creating all its datapath queues against it. We
+distribute channels to PFs in a round-robin policy.
+
+::
+
+ Example for 2 PFs and 5 channels:
+ +--------+--------+
+ | ch idx | PF idx |
+ +--------+--------+
+ | 0 | 0 |
+ | 1 | 1 |
+ | 2 | 0 |
+ | 3 | 1 |
+ | 4 | 0 |
+ +--------+--------+
+
+
+The reason we prefer round-robin is, it is less influenced by changes in the number of channels. The
+mapping between a channel index and a PF is fixed, no matter how many channels the user configures.
+As the channel stats are persistent across channel's closure, changing the mapping every single time
+would turn the accumulative stats less representing of the channel's history.
+
+This is achieved by using the correct core device instance (mdev) in each channel, instead of them
+all using the same instance under "priv->mdev".
+
+Observability
+=============
+The relation between PF, irq, napi, and queue can be observed via netlink spec:
+
+$ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump queue-get --json='{"ifindex": 13}'
+[{'id': 0, 'ifindex': 13, 'napi-id': 539, 'type': 'rx'},
+ {'id': 1, 'ifindex': 13, 'napi-id': 540, 'type': 'rx'},
+ {'id': 2, 'ifindex': 13, 'napi-id': 541, 'type': 'rx'},
+ {'id': 3, 'ifindex': 13, 'napi-id': 542, 'type': 'rx'},
+ {'id': 4, 'ifindex': 13, 'napi-id': 543, 'type': 'rx'},
+ {'id': 0, 'ifindex': 13, 'napi-id': 539, 'type': 'tx'},
+ {'id': 1, 'ifindex': 13, 'napi-id': 540, 'type': 'tx'},
+ {'id': 2, 'ifindex': 13, 'napi-id': 541, 'type': 'tx'},
+ {'id': 3, 'ifindex': 13, 'napi-id': 542, 'type': 'tx'},
+ {'id': 4, 'ifindex': 13, 'napi-id': 543, 'type': 'tx'}]
+
+$ ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump napi-get --json='{"ifindex": 13}'
+[{'id': 543, 'ifindex': 13, 'irq': 42},
+ {'id': 542, 'ifindex': 13, 'irq': 41},
+ {'id': 541, 'ifindex': 13, 'irq': 40},
+ {'id': 540, 'ifindex': 13, 'irq': 39},
+ {'id': 539, 'ifindex': 13, 'irq': 36}]
+
+Here you can clearly observe our channels distribution policy:
+
+$ ls /proc/irq/{36,39,40,41,42}/mlx5* -d -1
+/proc/irq/36/mlx5_comp1@pci:0000:08:00.0
+/proc/irq/39/mlx5_comp1@pci:0000:09:00.0
+/proc/irq/40/mlx5_comp2@pci:0000:08:00.0
+/proc/irq/41/mlx5_comp2@pci:0000:09:00.0
+/proc/irq/42/mlx5_comp3@pci:0000:08:00.0
+
+Steering
+========
+Secondary PFs are set to "silent" mode, meaning they are disconnected from the network.
+
+In Rx, the steering tables belong to the primary PF only, and it is its role to distribute incoming
+traffic to other PFs, via cross-vhca steering capabilities. Still maintain a single default RSS table,
+that is capable of pointing to the receive queues of a different PF.
+
+In Tx, the primary PF creates a new Tx flow table, which is aliased by the secondaries, so they can
+go out to the network through it.
+
+In addition, we set default XPS configuration that, based on the CPU, selects an SQ belonging to the
+PF on the same node as the CPU.
+
+XPS default config example:
+
+NUMA node(s): 2
+NUMA node0 CPU(s): 0-11
+NUMA node1 CPU(s): 12-23
+
+PF0 on node0, PF1 on node1.
+
+- /sys/class/net/eth2/queues/tx-0/xps_cpus:000001
+- /sys/class/net/eth2/queues/tx-1/xps_cpus:001000
+- /sys/class/net/eth2/queues/tx-2/xps_cpus:000002
+- /sys/class/net/eth2/queues/tx-3/xps_cpus:002000
+- /sys/class/net/eth2/queues/tx-4/xps_cpus:000004
+- /sys/class/net/eth2/queues/tx-5/xps_cpus:004000
+- /sys/class/net/eth2/queues/tx-6/xps_cpus:000008
+- /sys/class/net/eth2/queues/tx-7/xps_cpus:008000
+- /sys/class/net/eth2/queues/tx-8/xps_cpus:000010
+- /sys/class/net/eth2/queues/tx-9/xps_cpus:010000
+- /sys/class/net/eth2/queues/tx-10/xps_cpus:000020
+- /sys/class/net/eth2/queues/tx-11/xps_cpus:020000
+- /sys/class/net/eth2/queues/tx-12/xps_cpus:000040
+- /sys/class/net/eth2/queues/tx-13/xps_cpus:040000
+- /sys/class/net/eth2/queues/tx-14/xps_cpus:000080
+- /sys/class/net/eth2/queues/tx-15/xps_cpus:080000
+- /sys/class/net/eth2/queues/tx-16/xps_cpus:000100
+- /sys/class/net/eth2/queues/tx-17/xps_cpus:100000
+- /sys/class/net/eth2/queues/tx-18/xps_cpus:000200
+- /sys/class/net/eth2/queues/tx-19/xps_cpus:200000
+- /sys/class/net/eth2/queues/tx-20/xps_cpus:000400
+- /sys/class/net/eth2/queues/tx-21/xps_cpus:400000
+- /sys/class/net/eth2/queues/tx-22/xps_cpus:000800
+- /sys/class/net/eth2/queues/tx-23/xps_cpus:800000
+
+Mutually exclusive features
+===========================
+
+The nature of Multi-PF, where different channels work with different PFs, conflicts with
+stateful features where the state is maintained in one of the PFs.
+For example, in the TLS device-offload feature, special context objects are created per connection
+and maintained in the PF. Transitioning between different RQs/SQs would break the feature. Hence,
+we disable this combination for now.
diff --git a/Documentation/networking/netconsole.rst b/Documentation/networking/netconsole.rst
index b28c525e5d1e22..d55c2a22ec7af0 100644
--- a/Documentation/networking/netconsole.rst
+++ b/Documentation/networking/netconsole.rst
@@ -180,7 +180,7 @@ Custom user data can be appended to the end of messages with netconsole
dynamic configuration enabled. User data entries can be modified without
changing the "enabled" attribute of a target.
-Directories (keys) under `userdata` are limited to 54 character length, and
+Directories (keys) under `userdata` are limited to 53 character length, and
data in `userdata/<key>/value` are limited to 200 bytes::
cd /sys/kernel/config/netconsole && mkdir cmdline0
@@ -197,8 +197,8 @@ Messages will now include this additional user data::
Sends::
12,607,22085407756,-;This is a message
- foo=bar
- qux=baz
+ foo=bar
+ qux=baz
Preview the userdata that will be appended with::
@@ -218,7 +218,7 @@ The `qux` key is omitted since it has no value::
echo "This is a message" > /dev/kmsg
12,607,22085407756,-;This is a message
- foo=bar
+ foo=bar
Delete `userdata` entries with `rmdir`::
diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index 8054d33f449f56..5bf285d73e8a1a 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -231,16 +231,136 @@ this documentation.
For further information on these methods, please see the inline
documentation in :c:type:`struct phylink_mac_ops <phylink_mac_ops>`.
-9. Remove calls to of_parse_phandle() for the PHY,
- of_phy_register_fixed_link() for fixed links etc. from the probe
- function, and replace with:
+9. Fill-in the :c:type:`struct phylink_config <phylink_config>` fields with
+ a reference to the :c:type:`struct device <device>` associated to your
+ :c:type:`struct net_device <net_device>`:
.. code-block:: c
- struct phylink *phylink;
priv->phylink_config.dev = &dev.dev;
priv->phylink_config.type = PHYLINK_NETDEV;
+ Fill-in the various speeds, pause and duplex modes your MAC can handle:
+
+ .. code-block:: c
+
+ priv->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD;
+
+10. Some Ethernet controllers work in pair with a PCS (Physical Coding Sublayer)
+ block, that can handle among other things the encoding/decoding, link
+ establishment detection and autonegotiation. While some MACs have internal
+ PCS whose operation is transparent, some other require dedicated PCS
+ configuration for the link to become functional. In that case, phylink
+ provides a PCS abstraction through :c:type:`struct phylink_pcs <phylink_pcs>`.
+
+ Identify if your driver has one or more internal PCS blocks, and/or if
+ your controller can use an external PCS block that might be internally
+ connected to your controller.
+
+ If your controller doesn't have any internal PCS, you can go to step 11.
+
+ If your Ethernet controller contains one or several PCS blocks, create
+ one :c:type:`struct phylink_pcs <phylink_pcs>` instance per PCS block within
+ your driver's private data structure:
+
+ .. code-block:: c
+
+ struct phylink_pcs pcs;
+
+ Populate the relevant :c:type:`struct phylink_pcs_ops <phylink_pcs_ops>` to
+ configure your PCS. Create a :c:func:`pcs_get_state` function that reports
+ the inband link state, a :c:func:`pcs_config` function to configure your
+ PCS according to phylink-provided parameters, and a :c:func:`pcs_validate`
+ function that report to phylink all accepted configuration parameters for
+ your PCS:
+
+ .. code-block:: c
+
+ struct phylink_pcs_ops foo_pcs_ops = {
+ .pcs_validate = foo_pcs_validate,
+ .pcs_get_state = foo_pcs_get_state,
+ .pcs_config = foo_pcs_config,
+ };
+
+ Arrange for PCS link state interrupts to be forwarded into
+ phylink, via:
+
+ .. code-block:: c
+
+ phylink_pcs_change(pcs, link_is_up);
+
+ where ``link_is_up`` is true if the link is currently up or false
+ otherwise. If a PCS is unable to provide these interrupts, then
+ it should set ``pcs->pcs_poll = true;`` when creating the PCS.
+
+11. If your controller relies on, or accepts the presence of an external PCS
+ controlled through its own driver, add a pointer to a phylink_pcs instance
+ in your driver private data structure:
+
+ .. code-block:: c
+
+ struct phylink_pcs *pcs;
+
+ The way of getting an instance of the actual PCS depends on the platform,
+ some PCS sit on an MDIO bus and are grabbed by passing a pointer to the
+ corresponding :c:type:`struct mii_bus <mii_bus>` and the PCS's address on
+ that bus. In this example, we assume the controller attaches to a Lynx PCS
+ instance:
+
+ .. code-block:: c
+
+ priv->pcs = lynx_pcs_create_mdiodev(bus, 0);
+
+ Some PCS can be recovered based on firmware information:
+
+ .. code-block:: c
+
+ priv->pcs = lynx_pcs_create_fwnode(of_fwnode_handle(node));
+
+12. Populate the :c:func:`mac_select_pcs` callback and add it to your
+ :c:type:`struct phylink_mac_ops <phylink_mac_ops>` set of ops. This function
+ must return a pointer to the relevant :c:type:`struct phylink_pcs <phylink_pcs>`
+ that will be used for the requested link configuration:
+
+ .. code-block:: c
+
+ static struct phylink_pcs *foo_select_pcs(struct phylink_config *config,
+ phy_interface_t interface)
+ {
+ struct foo_priv *priv = container_of(config, struct foo_priv,
+ phylink_config);
+
+ if ( /* 'interface' needs a PCS to function */ )
+ return priv->pcs;
+
+ return NULL;
+ }
+
+ See :c:func:`mvpp2_select_pcs` for an example of a driver that has multiple
+ internal PCS.
+
+13. Fill-in all the :c:type:`phy_interface_t <phy_interface_t>` (i.e. all MAC to
+ PHY link modes) that your MAC can output. The following example shows a
+ configuration for a MAC that can handle all RGMII modes, SGMII and 1000BaseX.
+ You must adjust these according to what your MAC and all PCS associated
+ with this MAC are capable of, and not just the interface you wish to use:
+
+ .. code-block:: c
+
+ phy_interface_set_rgmii(priv->phylink_config.supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_SGMII,
+ priv->phylink_config.supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_1000BASEX,
+ priv->phylink_config.supported_interfaces);
+
+14. Remove calls to of_parse_phandle() for the PHY,
+ of_phy_register_fixed_link() for fixed links etc. from the probe
+ function, and replace with:
+
+ .. code-block:: c
+
+ struct phylink *phylink;
+
phylink = phylink_create(&priv->phylink_config, node, phy_mode, &phylink_ops);
if (IS_ERR(phylink)) {
err = PTR_ERR(phylink);
@@ -249,14 +369,14 @@ this documentation.
priv->phylink = phylink;
- and arrange to destroy the phylink in the probe failure path as
- appropriate and the removal path too by calling:
+ and arrange to destroy the phylink in the probe failure path as
+ appropriate and the removal path too by calling:
- .. code-block:: c
+ .. code-block:: c
phylink_destroy(priv->phylink);
-10. Arrange for MAC link state interrupts to be forwarded into
+15. Arrange for MAC link state interrupts to be forwarded into
phylink, via:
.. code-block:: c
@@ -264,17 +384,16 @@ this documentation.
phylink_mac_change(priv->phylink, link_is_up);
where ``link_is_up`` is true if the link is currently up or false
- otherwise. If a MAC is unable to provide these interrupts, then
- it should set ``priv->phylink_config.pcs_poll = true;`` in step 9.
+ otherwise.
-11. Verify that the driver does not call::
+16. Verify that the driver does not call::
netif_carrier_on()
netif_carrier_off()
- as these will interfere with phylink's tracking of the link state,
- and cause phylink to omit calls via the :c:func:`mac_link_up` and
- :c:func:`mac_link_down` methods.
+ as these will interfere with phylink's tracking of the link state,
+ and cause phylink to omit calls via the :c:func:`mac_link_up` and
+ :c:func:`mac_link_down` methods.
Network drivers should call phylink_stop() and phylink_start() via their
suspend/resume paths, which ensures that the appropriate
diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst
index 551b3cc29a4132..75e017dfa8251d 100644
--- a/Documentation/networking/statistics.rst
+++ b/Documentation/networking/statistics.rst
@@ -41,6 +41,15 @@ If `-s` is specified once the detailed errors won't be shown.
`ip` supports JSON formatting via the `-j` option.
+Queue statistics
+~~~~~~~~~~~~~~~~
+
+Queue statistics are accessible via the netdev netlink family.
+
+Currently no widely distributed CLI exists to access those statistics.
+Kernel development tools (ynl) can be used to experiment with them,
+see `Documentation/userspace-api/netlink/intro-specs.rst`.
+
Protocol-specific statistics
----------------------------
@@ -147,6 +156,12 @@ Statistics are reported both in the responses to link information
requests (`RTM_GETLINK`) and statistic requests (`RTM_GETSTATS`,
when `IFLA_STATS_LINK_64` bit is set in the `.filter_mask` of the request).
+netdev (netlink)
+~~~~~~~~~~~~~~~~
+
+`netdev` generic netlink family allows accessing page pool and per queue
+statistics.
+
ethtool
-------
diff --git a/Documentation/virt/hyperv/index.rst b/Documentation/virt/hyperv/index.rst
index 4a7a1b738bbead..de447e11b4a5c3 100644
--- a/Documentation/virt/hyperv/index.rst
+++ b/Documentation/virt/hyperv/index.rst
@@ -10,3 +10,4 @@ Hyper-V Enlightenments
overview
vmbus
clocks
+ vpci
diff --git a/Documentation/virt/hyperv/vpci.rst b/Documentation/virt/hyperv/vpci.rst
new file mode 100644
index 00000000000000..b65b2126ede3e3
--- /dev/null
+++ b/Documentation/virt/hyperv/vpci.rst
@@ -0,0 +1,316 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PCI pass-thru devices
+=========================
+In a Hyper-V guest VM, PCI pass-thru devices (also called
+virtual PCI devices, or vPCI devices) are physical PCI devices
+that are mapped directly into the VM's physical address space.
+Guest device drivers can interact directly with the hardware
+without intermediation by the host hypervisor. This approach
+provides higher bandwidth access to the device with lower
+latency, compared with devices that are virtualized by the
+hypervisor. The device should appear to the guest just as it
+would when running on bare metal, so no changes are required
+to the Linux device drivers for the device.
+
+Hyper-V terminology for vPCI devices is "Discrete Device
+Assignment" (DDA). Public documentation for Hyper-V DDA is
+available here: `DDA`_
+
+.. _DDA: https://learn.microsoft.com/en-us/windows-server/virtualization/hyper-v/plan/plan-for-deploying-devices-using-discrete-device-assignment
+
+DDA is typically used for storage controllers, such as NVMe,
+and for GPUs. A similar mechanism for NICs is called SR-IOV
+and produces the same benefits by allowing a guest device
+driver to interact directly with the hardware. See Hyper-V
+public documentation here: `SR-IOV`_
+
+.. _SR-IOV: https://learn.microsoft.com/en-us/windows-hardware/drivers/network/overview-of-single-root-i-o-virtualization--sr-iov-
+
+This discussion of vPCI devices includes DDA and SR-IOV
+devices.
+
+Device Presentation
+-------------------
+Hyper-V provides full PCI functionality for a vPCI device when
+it is operating, so the Linux device driver for the device can
+be used unchanged, provided it uses the correct Linux kernel
+APIs for accessing PCI config space and for other integration
+with Linux. But the initial detection of the PCI device and
+its integration with the Linux PCI subsystem must use Hyper-V
+specific mechanisms. Consequently, vPCI devices on Hyper-V
+have a dual identity. They are initially presented to Linux
+guests as VMBus devices via the standard VMBus "offer"
+mechanism, so they have a VMBus identity and appear under
+/sys/bus/vmbus/devices. The VMBus vPCI driver in Linux at
+drivers/pci/controller/pci-hyperv.c handles a newly introduced
+vPCI device by fabricating a PCI bus topology and creating all
+the normal PCI device data structures in Linux that would
+exist if the PCI device were discovered via ACPI on a bare-
+metal system. Once those data structures are set up, the
+device also has a normal PCI identity in Linux, and the normal
+Linux device driver for the vPCI device can function as if it
+were running in Linux on bare-metal. Because vPCI devices are
+presented dynamically through the VMBus offer mechanism, they
+do not appear in the Linux guest's ACPI tables. vPCI devices
+may be added to a VM or removed from a VM at any time during
+the life of the VM, and not just during initial boot.
+
+With this approach, the vPCI device is a VMBus device and a
+PCI device at the same time. In response to the VMBus offer
+message, the hv_pci_probe() function runs and establishes a
+VMBus connection to the vPCI VSP on the Hyper-V host. That
+connection has a single VMBus channel. The channel is used to
+exchange messages with the vPCI VSP for the purpose of setting
+up and configuring the vPCI device in Linux. Once the device
+is fully configured in Linux as a PCI device, the VMBus
+channel is used only if Linux changes the vCPU to be interrupted
+in the guest, or if the vPCI device is removed from
+the VM while the VM is running. The ongoing operation of the
+device happens directly between the Linux device driver for
+the device and the hardware, with VMBus and the VMBus channel
+playing no role.
+
+PCI Device Setup
+----------------
+PCI device setup follows a sequence that Hyper-V originally
+created for Windows guests, and that can be ill-suited for
+Linux guests due to differences in the overall structure of
+the Linux PCI subsystem compared with Windows. Nonetheless,
+with a bit of hackery in the Hyper-V virtual PCI driver for
+Linux, the virtual PCI device is setup in Linux so that
+generic Linux PCI subsystem code and the Linux driver for the
+device "just work".
+
+Each vPCI device is set up in Linux to be in its own PCI
+domain with a host bridge. The PCI domainID is derived from
+bytes 4 and 5 of the instance GUID assigned to the VMBus vPCI
+device. The Hyper-V host does not guarantee that these bytes
+are unique, so hv_pci_probe() has an algorithm to resolve
+collisions. The collision resolution is intended to be stable
+across reboots of the same VM so that the PCI domainIDs don't
+change, as the domainID appears in the user space
+configuration of some devices.
+
+hv_pci_probe() allocates a guest MMIO range to be used as PCI
+config space for the device. This MMIO range is communicated
+to the Hyper-V host over the VMBus channel as part of telling
+the host that the device is ready to enter d0. See
+hv_pci_enter_d0(). When the guest subsequently accesses this
+MMIO range, the Hyper-V host intercepts the accesses and maps
+them to the physical device PCI config space.
+
+hv_pci_probe() also gets BAR information for the device from
+the Hyper-V host, and uses this information to allocate MMIO
+space for the BARs. That MMIO space is then setup to be
+associated with the host bridge so that it works when generic
+PCI subsystem code in Linux processes the BARs.
+
+Finally, hv_pci_probe() creates the root PCI bus. At this
+point the Hyper-V virtual PCI driver hackery is done, and the
+normal Linux PCI machinery for scanning the root bus works to
+detect the device, to perform driver matching, and to
+initialize the driver and device.
+
+PCI Device Removal
+------------------
+A Hyper-V host may initiate removal of a vPCI device from a
+guest VM at any time during the life of the VM. The removal
+is instigated by an admin action taken on the Hyper-V host and
+is not under the control of the guest OS.
+
+A guest VM is notified of the removal by an unsolicited
+"Eject" message sent from the host to the guest over the VMBus
+channel associated with the vPCI device. Upon receipt of such
+a message, the Hyper-V virtual PCI driver in Linux
+asynchronously invokes Linux kernel PCI subsystem calls to
+shutdown and remove the device. When those calls are
+complete, an "Ejection Complete" message is sent back to
+Hyper-V over the VMBus channel indicating that the device has
+been removed. At this point, Hyper-V sends a VMBus rescind
+message to the Linux guest, which the VMBus driver in Linux
+processes by removing the VMBus identity for the device. Once
+that processing is complete, all vestiges of the device having
+been present are gone from the Linux kernel. The rescind
+message also indicates to the guest that Hyper-V has stopped
+providing support for the vPCI device in the guest. If the
+guest were to attempt to access that device's MMIO space, it
+would be an invalid reference. Hypercalls affecting the device
+return errors, and any further messages sent in the VMBus
+channel are ignored.
+
+After sending the Eject message, Hyper-V allows the guest VM
+60 seconds to cleanly shutdown the device and respond with
+Ejection Complete before sending the VMBus rescind
+message. If for any reason the Eject steps don't complete
+within the allowed 60 seconds, the Hyper-V host forcibly
+performs the rescind steps, which will likely result in
+cascading errors in the guest because the device is now no
+longer present from the guest standpoint and accessing the
+device MMIO space will fail.
+
+Because ejection is asynchronous and can happen at any point
+during the guest VM lifecycle, proper synchronization in the
+Hyper-V virtual PCI driver is very tricky. Ejection has been
+observed even before a newly offered vPCI device has been
+fully setup. The Hyper-V virtual PCI driver has been updated
+several times over the years to fix race conditions when
+ejections happen at inopportune times. Care must be taken when
+modifying this code to prevent re-introducing such problems.
+See comments in the code.
+
+Interrupt Assignment
+--------------------
+The Hyper-V virtual PCI driver supports vPCI devices using
+MSI, multi-MSI, or MSI-X. Assigning the guest vCPU that will
+receive the interrupt for a particular MSI or MSI-X message is
+complex because of the way the Linux setup of IRQs maps onto
+the Hyper-V interfaces. For the single-MSI and MSI-X cases,
+Linux calls hv_compse_msi_msg() twice, with the first call
+containing a dummy vCPU and the second call containing the
+real vCPU. Furthermore, hv_irq_unmask() is finally called
+(on x86) or the GICD registers are set (on arm64) to specify
+the real vCPU again. Each of these three calls interact
+with Hyper-V, which must decide which physical CPU should
+receive the interrupt before it is forwarded to the guest VM.
+Unfortunately, the Hyper-V decision-making process is a bit
+limited, and can result in concentrating the physical
+interrupts on a single CPU, causing a performance bottleneck.
+See details about how this is resolved in the extensive
+comment above the function hv_compose_msi_req_get_cpu().
+
+The Hyper-V virtual PCI driver implements the
+irq_chip.irq_compose_msi_msg function as hv_compose_msi_msg().
+Unfortunately, on Hyper-V the implementation requires sending
+a VMBus message to the Hyper-V host and awaiting an interrupt
+indicating receipt of a reply message. Since
+irq_chip.irq_compose_msi_msg can be called with IRQ locks
+held, it doesn't work to do the normal sleep until awakened by
+the interrupt. Instead hv_compose_msi_msg() must send the
+VMBus message, and then poll for the completion message. As
+further complexity, the vPCI device could be ejected/rescinded
+while the polling is in progress, so this scenario must be
+detected as well. See comments in the code regarding this
+very tricky area.
+
+Most of the code in the Hyper-V virtual PCI driver (pci-
+hyperv.c) applies to Hyper-V and Linux guests running on x86
+and on arm64 architectures. But there are differences in how
+interrupt assignments are managed. On x86, the Hyper-V
+virtual PCI driver in the guest must make a hypercall to tell
+Hyper-V which guest vCPU should be interrupted by each
+MSI/MSI-X interrupt, and the x86 interrupt vector number that
+the x86_vector IRQ domain has picked for the interrupt. This
+hypercall is made by hv_arch_irq_unmask(). On arm64, the
+Hyper-V virtual PCI driver manages the allocation of an SPI
+for each MSI/MSI-X interrupt. The Hyper-V virtual PCI driver
+stores the allocated SPI in the architectural GICD registers,
+which Hyper-V emulates, so no hypercall is necessary as with
+x86. Hyper-V does not support using LPIs for vPCI devices in
+arm64 guest VMs because it does not emulate a GICv3 ITS.
+
+The Hyper-V virtual PCI driver in Linux supports vPCI devices
+whose drivers create managed or unmanaged Linux IRQs. If the
+smp_affinity for an unmanaged IRQ is updated via the /proc/irq
+interface, the Hyper-V virtual PCI driver is called to tell
+the Hyper-V host to change the interrupt targeting and
+everything works properly. However, on x86 if the x86_vector
+IRQ domain needs to reassign an interrupt vector due to
+running out of vectors on a CPU, there's no path to inform the
+Hyper-V host of the change, and things break. Fortunately,
+guest VMs operate in a constrained device environment where
+using all the vectors on a CPU doesn't happen. Since such a
+problem is only a theoretical concern rather than a practical
+concern, it has been left unaddressed.
+
+DMA
+---
+By default, Hyper-V pins all guest VM memory in the host
+when the VM is created, and programs the physical IOMMU to
+allow the VM to have DMA access to all its memory. Hence
+it is safe to assign PCI devices to the VM, and allow the
+guest operating system to program the DMA transfers. The
+physical IOMMU prevents a malicious guest from initiating
+DMA to memory belonging to the host or to other VMs on the
+host. From the Linux guest standpoint, such DMA transfers
+are in "direct" mode since Hyper-V does not provide a virtual
+IOMMU in the guest.
+
+Hyper-V assumes that physical PCI devices always perform
+cache-coherent DMA. When running on x86, this behavior is
+required by the architecture. When running on arm64, the
+architecture allows for both cache-coherent and
+non-cache-coherent devices, with the behavior of each device
+specified in the ACPI DSDT. But when a PCI device is assigned
+to a guest VM, that device does not appear in the DSDT, so the
+Hyper-V VMBus driver propagates cache-coherency information
+from the VMBus node in the ACPI DSDT to all VMBus devices,
+including vPCI devices (since they have a dual identity as a VMBus
+device and as a PCI device). See vmbus_dma_configure().
+Current Hyper-V versions always indicate that the VMBus is
+cache coherent, so vPCI devices on arm64 always get marked as
+cache coherent and the CPU does not perform any sync
+operations as part of dma_map/unmap_*() calls.
+
+vPCI protocol versions
+----------------------
+As previously described, during vPCI device setup and teardown
+messages are passed over a VMBus channel between the Hyper-V
+host and the Hyper-v vPCI driver in the Linux guest. Some
+messages have been revised in newer versions of Hyper-V, so
+the guest and host must agree on the vPCI protocol version to
+be used. The version is negotiated when communication over
+the VMBus channel is first established. See
+hv_pci_protocol_negotiation(). Newer versions of the protocol
+extend support to VMs with more than 64 vCPUs, and provide
+additional information about the vPCI device, such as the
+guest virtual NUMA node to which it is most closely affined in
+the underlying hardware.
+
+Guest NUMA node affinity
+------------------------
+When the vPCI protocol version provides it, the guest NUMA
+node affinity of the vPCI device is stored as part of the Linux
+device information for subsequent use by the Linux driver. See
+hv_pci_assign_numa_node(). If the negotiated protocol version
+does not support the host providing NUMA affinity information,
+the Linux guest defaults the device NUMA node to 0. But even
+when the negotiated protocol version includes NUMA affinity
+information, the ability of the host to provide such
+information depends on certain host configuration options. If
+the guest receives NUMA node value "0", it could mean NUMA
+node 0, or it could mean "no information is available".
+Unfortunately it is not possible to distinguish the two cases
+from the guest side.
+
+PCI config space access in a CoCo VM
+------------------------------------
+Linux PCI device drivers access PCI config space using a
+standard set of functions provided by the Linux PCI subsystem.
+In Hyper-V guests these standard functions map to functions
+hv_pcifront_read_config() and hv_pcifront_write_config()
+in the Hyper-V virtual PCI driver. In normal VMs,
+these hv_pcifront_*() functions directly access the PCI config
+space, and the accesses trap to Hyper-V to be handled.
+But in CoCo VMs, memory encryption prevents Hyper-V
+from reading the guest instruction stream to emulate the
+access, so the hv_pcifront_*() functions must invoke
+hypercalls with explicit arguments describing the access to be
+made.
+
+Config Block back-channel
+-------------------------
+The Hyper-V host and Hyper-V virtual PCI driver in Linux
+together implement a non-standard back-channel communication
+path between the host and guest. The back-channel path uses
+messages sent over the VMBus channel associated with the vPCI
+device. The functions hyperv_read_cfg_blk() and
+hyperv_write_cfg_blk() are the primary interfaces provided to
+other parts of the Linux kernel. As of this writing, these
+interfaces are used only by the Mellanox mlx5 driver to pass
+diagnostic data to a Hyper-V host running in the Azure public
+cloud. The functions hyperv_read_cfg_blk() and
+hyperv_write_cfg_blk() are implemented in a separate module
+(pci-hyperv-intf.c, under CONFIG_PCI_HYPERV_INTERFACE) that
+effectively stubs them out when running in non-Hyper-V
+environments.