summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael S. Tsirkin <mst@redhat.com>2012-06-26 13:26:21 +0300
committerMichael S. Tsirkin <mst@redhat.com>2012-06-26 13:26:21 +0300
commit612aa99cfe80f02e1460eb041f7b9f30826065d8 (patch)
tree52735adb2f19c2d0b53a21dd2474cc309e2cdf02
parent4b2916a37009c9b3d62082f6e0b0f66760cf7efb (diff)
downloadvirtio-spec-612aa99cfe80f02e1460eb041f7b9f30826065d8.tar.gz
virtio spec 0.8.10
-rw-r--r--virtio.lyx5903
1 files changed, 5903 insertions, 0 deletions
diff --git a/virtio.lyx b/virtio.lyx
new file mode 100644
index 0000000..f7c9c38
--- /dev/null
+++ b/virtio.lyx
@@ -0,0 +1,5903 @@
+#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
+\lyxformat 345
+\begin_document
+\begin_header
+\textclass report
+\use_default_options false
+\language english
+\inputencoding auto
+\font_roman default
+\font_sans default
+\font_typewriter default
+\font_default_family default
+\font_sc false
+\font_osf false
+\font_sf_scale 100
+\font_tt_scale 100
+
+\graphics default
+\paperfontsize default
+\spacing single
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_amsmath 1
+\use_esint 1
+\cite_engine basic
+\use_bibtopic false
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip medskip
+\quotes_language english
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tracking_changes true
+\output_changes true
+\author ""
+\author ""
+\end_header
+
+\begin_body
+
+\begin_layout Title
+Virtio PCI Card Specification
+\begin_inset Newline newline
+\end_inset
+
+v0.8.10 DRAFT
+\begin_inset Newline newline
+\end_inset
+
+-
+\end_layout
+
+\begin_layout Author
+Rusty Russell <rusty@rustcorp.com.au>
+\begin_inset Newline newline
+\end_inset
+
+IBM Corporation
+\end_layout
+
+\begin_layout Date
+2010 October 6.
+\end_layout
+
+\begin_layout Chapter
+Purpose and Description
+\end_layout
+
+\begin_layout Standard
+This document describes the specifications of the
+\begin_inset Quotes eld
+\end_inset
+
+virtio
+\begin_inset Quotes erd
+\end_inset
+
+ family of
+\emph on
+PCI
+\emph default
+
+\begin_inset CommandInset nomenclature
+LatexCommand nomenclature
+symbol "PCI"
+description "Peripheral Component Interconnect; a common device bus. See\\\\http://en.wikipedia.org/wiki/Peripheral Component Interconnect"
+
+\end_inset
+
+ devices.
+ These devices are found in
+\emph on
+virtual
+\emph default
+
+\emph on
+environments
+\begin_inset CommandInset nomenclature
+LatexCommand nomenclature
+symbol "virtualized"
+description "Environments where access to hardware is restricted (and often emulated) by a hypervisor."
+
+\end_inset
+
+
+\emph default
+, yet by design they are not all that different from physical PCI devices,
+ and this document treats them as such.
+ This allows the guest to use standard PCI drivers and discovery mechanisms.
+\end_layout
+
+\begin_layout Standard
+The purpose of virtio and this specification is that virtual environments
+ and guests should have a straightforward, efficient, standard and extensible
+ mechanism for virtual devices, rather than boutique per-environment or
+ per-OS mechanisms.
+\end_layout
+
+\begin_layout Description
+Straightforward: Virtio PCI devices use normal PCI mechanisms of interrupts
+ and DMA which should be familiar to any device driver author.
+ There is no exotic page-flipping or COW mechanism: it's just a PCI device.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+This lack of page-sharing implies that the implementation of the device
+ (e.g.
+ the hypervisor or host) needs full access to the guest memory.
+ Communication with untrusted parties (i.e.
+ inter-guest communication) requires copying.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Description
+Efficient: Virtio PCI devices consist of rings of descriptors for input
+ and output, which are neatly separated to avoid cache effects from both
+ guest and device writing to the same cache lines.
+\end_layout
+
+\begin_layout Description
+Standard: Virtio PCI makes no assumptions about the environment in which
+ it operates, beyond supporting PCI.
+ In fact the virtio devices specified in the appendices do not require PCI
+ at all: they have been implemented on non-PCI buses.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The Linux implementation further separates the PCI virtio code from the
+ specific virtio drivers: these drivers are shared with the non-PCI implementati
+ons (currently lguest and S/390).
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Description
+Extensible: Virtio PCI devices contain feature bits which are acknowledged
+ by the guest operating system during device setup.
+ This allows forwards and backwards compatibility: the device offers all
+ the features it knows about, and the driver acknowledges those it understands
+ and wishes to use.
+\end_layout
+
+\begin_layout Section
+Virtqueues
+\end_layout
+
+\begin_layout Standard
+The mechanism for bulk data transport on virtio PCI devices is pretentiously
+ called a virtqueue.
+ Each device can have zero or more virtqueues: for example, the network
+ device has one for transmit and one for receive.
+\end_layout
+
+\begin_layout Standard
+Each virtqueue occupies two or more physically-contiguous pages (defined,
+ for the purposes of this specification, as 4096 bytes), and consists of
+ three parts:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="1" columns="4">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Descriptor Table
+\end_layout
+
+\end_inset
+</cell>
+<cell multicolumn="1" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Available Ring
+\begin_inset space ~
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+
+\begin_inset space ~
+\end_inset
+
+
+\emph on
+(padding)
+\end_layout
+
+\end_inset
+</cell>
+<cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Used Ring
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+When the driver wants to send buffers to the device, it puts them in one
+ or more slots in the descriptor table, and writes the descriptor indices
+ into the available ring.
+ It then notifies the device.
+ When the device has finished with the buffers, it writes the descriptors
+ into the used ring, and sends an interrupt.
+\end_layout
+
+\begin_layout Chapter
+Specification
+\end_layout
+
+\begin_layout Section
+PCI Discovery
+\end_layout
+
+\begin_layout Standard
+Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 through 0x103F
+ inclusive is a virtio device
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The actual value within this range is ignored
+\end_layout
+
+\end_inset
+
+.
+ The device must also have a Revision ID of 0 to match this specification.
+\end_layout
+
+\begin_layout Standard
+The Subsystem Device ID indicates which virtio device is supported by the
+ device.
+ The Subsystem Vendor ID should reflect the PCI Vendor ID of the environment
+ (it's currently only used for informational purposes by the guest).
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="8" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="top" width="0">
+<column alignment="center" valignment="bottom" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Subsystem Device ID
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Virtio Device
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Specification
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+1
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+network card
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Appendix C
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+2
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+block device
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Appendix D
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+3
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+console
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Appendix E
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+4
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+entropy source
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Appendix F
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+5
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+memory ballooning
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Appendix G
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+6
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+ioMemory
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+9
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+9P transport
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+-
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section
+Device Configuration
+\end_layout
+
+\begin_layout Standard
+To configure the device, we use the first I/O region of the PCI device.
+ This contains a
+\emph on
+virtio header
+\emph default
+ followed by a
+\emph on
+device-specific region.
+\end_layout
+
+\begin_layout Standard
+There may be different widths of accesses to the I/O region; the
+\begin_inset Quotes eld
+\end_inset
+
+natural
+\begin_inset Quotes erd
+\end_inset
+
+ access method for each field in the virtio header must be used (i.e.
+ 32-bit accesses for 32-bit fields, etc), but the device-specific region
+ can be accessed using any width accesses, and should obtain the same results.
+\end_layout
+
+\begin_layout Standard
+Note that this is possible because while the virtio header is PCI (i.e.
+ little) endian, the device-specific region is encoded in the native endian
+ of the guest (where such distinction is applicable).
+\end_layout
+
+\begin_layout Subsection
+Device Initialization Sequence
+\end_layout
+
+\begin_layout Standard
+We start with an overview of device initialization, then expand on the details
+ of the device and how each step is performed.
+\end_layout
+
+\begin_layout Enumerate
+Reset the device.
+ This is not required on initial start up.
+\end_layout
+
+\begin_layout Enumerate
+The ACKNOWLEDGE status bit is set: we have noticed the device.
+\end_layout
+
+\begin_layout Enumerate
+The DRIVER status bit is set: we know how to drive the device.
+\end_layout
+
+\begin_layout Enumerate
+Device-specific setup, including reading the Device Feature Bits, discovery
+ of virtqueues for the device, optional MSI-X setup, and reading and possibly
+ writing the virtio configuration space.
+\end_layout
+
+\begin_layout Enumerate
+The subset of Device Feature Bits understood by the driver is written to
+ the device.
+\end_layout
+
+\begin_layout Enumerate
+The DRIVER_OK status bit is set.
+\end_layout
+
+\begin_layout Enumerate
+The device can now be used (i.e.
+ buffers added to the virtqueues)
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Historically, drivers have used the device before steps 5 and 6.
+ This is only allowed if the driver does not use any features which would
+ alter this early use of the device.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+If any of these steps go irrecoverably wrong, the guest should set the FAILED
+ status bit to indicate that it has given up on the device (it can reset
+ the device later to restart if desired).
+\end_layout
+
+\begin_layout Standard
+We now cover the fields required for general setup in detail.
+\end_layout
+
+\begin_layout Subsection
+Virtio Header
+\end_layout
+
+\begin_layout Standard
+The virtio header looks as follows:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="4" columns="9">
+<features>
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Bits
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+32
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+16
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+16
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+16
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+8
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+8
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Read/Write
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R+W
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R+W
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R+W
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R+W
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R+W
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Purpose
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Device
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Guest
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Queue
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Queue
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Queue
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Queue
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Device
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+ISR
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Features
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Features
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Address
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Size
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Select
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Notify
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Status
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Status
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+If MSI-X is enabled for the device, two additional fields immediately follow
+ this header:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Bits
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+16
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+16
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Read/Write
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R+W
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+R+W
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Purpose
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Configuration
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Queue
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+(MSI-X)
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Vector
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Vector
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Immediately following these general headers, there may be device-specific
+ headers:
+\end_layout
+
+\begin_layout Standard
+\begin_inset Tabular
+<lyxtabular version="3" rows="4" columns="2">
+<features>
+<column alignment="left" valignment="top" width="0">
+<column alignment="left" valignment="top" width="0">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Bits
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Device Specific
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Read/Write
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Device Specific
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+Purpose
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\size footnotesize
+Device Specific...
+\end_layout
+
+\end_inset
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+</cell>
+<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\end_inset
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Device Status
+\end_layout
+
+\begin_layout Standard
+The Device Status field is updated by the guest to indicate its progress.
+ This provides a simple low-level diagnostic: it's most useful to imagine
+ them hooked up to traffic lights on the console indicating the status of
+ each device.
+\end_layout
+
+\begin_layout Standard
+The device can be reset by writing a 0 to this field, otherwise at least
+ one bit should be set:
+\end_layout
+
+\begin_layout Description
+ACKNOWLEDGE
+\begin_inset space ~
+\end_inset
+
+(1) Indicates that the guest OS has found the device and recognized it as
+ a valid virtio device.
+\end_layout
+
+\begin_layout Description
+DRIVER
+\begin_inset space ~
+\end_inset
+
+(2) Indicates that the guest OS knows how to drive the device.
+ Under Linux, drivers can be loadable modules so there may be a significant
+ (or infinite) delay before setting this bit.
+\end_layout
+
+\begin_layout Description
+DRIVER_OK
+\begin_inset space ~
+\end_inset
+
+(4) Indicates that the driver is set up and ready to drive the device.
+\end_layout
+
+\begin_layout Description
+FAILED
+\begin_inset space ~
+\end_inset
+
+(128) Indicates that something went wrong in the guest, and it has given
+ up on the device.
+ This could be an internal error, or the driver didn't like the device for
+ some reason, or even a fatal error during device operation.
+ The device must be reset before attempting to re-initialize.
+\end_layout
+
+\begin_layout Subsubsection
+Feature Bits
+\end_layout
+
+\begin_layout Standard
+The least significant 31 bits of the first configuration field indicate
+ the features that the device supports (the high bit is reserved, and will
+ be used to indicate the presence of future feature bits elsewhere).
+ The bits are allocated as follows:
+\end_layout
+
+\begin_layout Description
+0
+\begin_inset space ~
+\end_inset
+
+to
+\begin_inset space ~
+\end_inset
+
+23 Feature bits for the specific device type
+\end_layout
+
+\begin_layout Description
+24
+\begin_inset space \space{}
+\end_inset
+
+to
+\begin_inset space ~
+\end_inset
+
+30 Feature bits reserved for extensions to the queue mechanism
+\end_layout
+
+\begin_layout Standard
+For example, feature bit 0 for a network device (i.e.
+ Subsystem Device ID 1) indicates that the device supports checksumming
+ of packets.
+\end_layout
+
+\begin_layout Standard
+The feature bits are
+\emph on
+negotiated:
+\emph default
+ the device lists all the features it understands in the Device Features
+ field, and the guest writes the subset that it understands into the Guest
+ Features field.
+ The only way to renegotiate is to reset the device.
+\end_layout
+
+\begin_layout Standard
+In particular, new fields in the device configuration header are indicated
+ by offering a feature bit, so the guest can check before accessing that
+ part of the configuration space.
+\end_layout
+
+\begin_layout Standard
+This allows for forwards and backwards compatibility: if the device is enhanced
+ with a new feature bit, older guests will not write that feature bit back
+ to the Guest Features field and it can go into backwards compatibility
+ mode.
+ Similarly, if a guest is enhanced with a feature that the device doesn't
+ support, it will not see that feature bit in the Device Features field
+ and can go into backwards compatibility mode (or, for poor implementations,
+ set the FAILED Device Status bit).
+\end_layout
+
+\begin_layout Subsubsection
+Configuration/Queue Vectors
+\end_layout
+
+\begin_layout Standard
+When MSI-X capability is present and enabled in the device (through standard
+ PCI configuration space) 4 bytes at byte offset 20 are used to map configuratio
+n change and queue interrupts to MSI-X vectors.
+ In this case, the ISR Status field is unused, and device specific configuration
+ starts at byte offset 24 in virtio header structure.
+ When MSI-X capability is not enabled, device specific configuration starts
+ at byte offset 20 in virtio header.
+\end_layout
+
+\begin_layout Standard
+Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of Configuration/Qu
+eue Vector registers,
+\emph on
+maps
+\emph default
+ interrupts triggered by the configuration change/selected queue events
+ respectively to the corresponding MSI-X vector.
+ To disable interrupts for a specific event type, unmap it by writing a
+ special NO_VECTOR value:
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+/* Vector value used to disable MSI for queue */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_MSI_NO_VECTOR 0xffff
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Reading these registers returns the vector mapped to a given event, or NO_VECTOR
+ if unmapped.
+ All queue and configuration change events are unmapped by default.
+\end_layout
+
+\begin_layout Standard
+Note that mapping an event to vector might require allocating internal device
+ resources, and might fail.
+ Devices report such failures by returning the NO_VECTOR value when the
+ relevant Vector field is read.
+ After mapping an event to vector, the driver must verify success by reading
+ the Vector field value: on success, the previously written value is returned,
+ and on failure, NO_VECTOR is returned.
+ If a mapping failure is detected, the driver can retry mapping with fewer
+ vectors, or disable MSI-X.
+\end_layout
+
+\begin_layout Section
+Virtqueue Configuration
+\end_layout
+
+\begin_layout Standard
+As a device can have zero or more virtqueues for bulk data transport (for
+ example, the network driver has two), the driver needs to configure them
+ as part of the device-specific configuration.
+\end_layout
+
+\begin_layout Standard
+This is done as follows, for each virtqueue a device has:
+\end_layout
+
+\begin_layout Enumerate
+Write the virtqueue index (first queue is 0) to the Queue Select field.
+\end_layout
+
+\begin_layout Enumerate
+Read the virtqueue size from the Queue Size field, which is always a power
+ of 2.
+ This controls how big the virtqueue is (see below).
+ If this field is 0, the virtqueue does not exist.
+
+\end_layout
+
+\begin_layout Enumerate
+Allocate and zero virtqueue in contiguous physical memory, on a 4096 byte
+ alignment.
+ Write the physical address, divided by 4096, to the Queue Address field.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The 4096 is based on the x86 page size, but it's also large enough to ensure
+ that the separate parts of the virtqueue are on separate cache lines.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Enumerate
+Optionally, if MSI-X capability is present and enabled on the device, select
+ a vector to use to request interrupts triggered by virtqueue events.
+ Write the MSI-X Table entry number corresponding to this vector in Queue
+ Vector field.
+ Read the Queue Vector field: on success, previously written value is returned;
+ on failure, NO_VECTOR value is returned.
+\end_layout
+
+\begin_layout Standard
+The Queue Size field controls the total number of bytes required for the
+ virtqueue according to the following formula:
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+#define ALIGN(x) (((x) + 4095) & ~4095)
+\end_layout
+
+\begin_layout Plain Layout
+
+static inline unsigned vring_size(unsigned int qsz)
+\end_layout
+
+\begin_layout Plain Layout
+
+{
+\end_layout
+
+\begin_layout Plain Layout
+
+ return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2 + qsz))
+\end_layout
+
+\begin_layout Plain Layout
+
+ + ALIGN(sizeof(struct vring_used_elem)*qsz);
+\end_layout
+
+\begin_layout Plain Layout
+
+}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+This currently wastes some space with padding, but also allows future extensions.
+ The virtqueue layout structure looks like this (qsz is the Queue Size field,
+ which is a variable, so this code won't compile):
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct vring {
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* The actual descriptors (16 bytes each) */
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_desc desc[qsz];
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* A ring of available descriptor heads with free-running index.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_avail avail;
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+ // Padding to the next 4096 boundary.
+\end_layout
+
+\begin_layout Plain Layout
+
+ char pad[];
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+ // A ring of used descriptor heads with free-running index.
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_used used;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+A Note on Virtqueue Endianness
+\end_layout
+
+\begin_layout Standard
+Note that the
+\emph on
+endian
+\emph default
+ of these fields and everything else in the virtqueue is the native endian
+ of the guest, not little-endian as PCI normally is.
+ This makes for simpler guest code, and it is assumed that the host already
+ has to be deeply aware of the guest endian so such an
+\begin_inset Quotes eld
+\end_inset
+
+endian-aware
+\begin_inset Quotes erd
+\end_inset
+
+ device is not a significant issue.
+\end_layout
+
+\begin_layout Subsection
+Descriptor Table
+\end_layout
+
+\begin_layout Standard
+The descriptor table refers to the buffers the guest is using for the device.
+ The addresses are physical addresses, and the buffers can be chained via
+ the next field.
+ Each descriptor describes a buffer which is read-only or write-only, but
+ a chain of descriptors can contain both read-only and write-only buffers.
+\end_layout
+
+\begin_layout Standard
+No descriptor chain may be more than 2^32 bytes long in total.
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct vring_desc {
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Address (guest-physical).
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ u64 addr;
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Length.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 len;
+\end_layout
+
+\begin_layout Plain Layout
+
+/* This marks a buffer as continuing via the next field.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_DESC_F_NEXT 1
+\end_layout
+
+\begin_layout Plain Layout
+
+/* This marks a buffer as write-only (otherwise read-only).
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_DESC_F_WRITE 2
+\end_layout
+
+\begin_layout Plain Layout
+
+/* This means the buffer contains a list of buffer descriptors.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_DESC_F_INDIRECT 4
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* The flags as indicated above.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Next field if flags & NEXT */
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 next;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The number of descriptors in the table is specified by the Queue Size field
+ for this virtqueue.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Indirect-Descriptors"
+
+\end_inset
+
+Indirect Descriptors
+\end_layout
+
+\begin_layout Standard
+Some devices benefit by concurrently dispatching a large number of large
+ requests.
+ The VIRTIO_RING_F_INDIRECT_DESC feature can be used to allow this (see
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "cha:Reserved-Feature-Bits"
+
+\end_inset
+
+).
+ To increase ring capacity it is possible to store a table of
+\emph on
+indirect descriptors
+\emph default
+ anywhere in memory, and insert a descriptor in main virtqueue (with flags&INDIR
+ECT on) that refers to memory buffer containing this
+\emph on
+indirect descriptor table
+\emph default
+; fields
+\emph on
+addr
+\emph default
+ and
+\emph on
+len
+\emph default
+ refer to the indirect table address and length in bytes, respectively.
+ The indirect table layout structure looks like this (len is the length
+ of the descriptor that refers to this table, which is a variable, so this
+ code won't compile):
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct indirect_descriptor_table {
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* The actual descriptors (16 bytes each) */
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_desc desc[len / 16];
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The first indirect descriptor is located at start of the indirect descriptor
+ table (index 0), additional indirect descriptors are chained by next field.
+ An indirect descriptor without next field (with flags&NEXT off) signals
+ the end of the indirect descriptor table, and transfers control back to
+ the main virtqueue.
+ An indirect descriptor can not refer to another indirect descriptor table
+ (flags&INDIRECT must be off).
+ A single indirect descriptor table can include both read-only and write-only
+ descriptors; write-only flag (flags&WRITE) in the descriptor that refers
+ to it is ignored.
+\end_layout
+
+\begin_layout Subsection
+Available Ring
+\end_layout
+
+\begin_layout Standard
+The available ring refers to what descriptors we are offering the device:
+ it refers to the head of a descriptor chain.
+ The
+\begin_inset Quotes eld
+\end_inset
+
+flags
+\begin_inset Quotes erd
+\end_inset
+
+ field is currently 0 or 1: 1 indicating that we do not need an interrupt
+ when the device consumes a descriptor from the available ring.
+ This interrupt suppression is merely an optimization; it may not suppress
+ interrupts entirely.
+\end_layout
+
+\begin_layout Standard
+The
+\begin_inset Quotes eld
+\end_inset
+
+idx
+\begin_inset Quotes erd
+\end_inset
+
+ field indicates where we would put the
+\emph on
+next
+\emph default
+ descriptor entry (modulo the ring size).
+ This starts at 0, and increases.
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct vring_avail {
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 idx;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 ring[qsz]; /* qsz is the Queue Size field read from device */
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Used Ring
+\end_layout
+
+\begin_layout Standard
+The used ring is where the device returns buffers once it is done with them.
+ The flags field can be used by the device to hint that no notification
+ is necessary when the guest adds to the
+\emph on
+available
+\emph default
+ ring (the flag is kept here because this is the only part of the virtqueue
+ written by the device).
+\end_layout
+
+\begin_layout Standard
+Each entry in the ring is a pair: the head entry of the descriptor chain
+ describing the buffer (this matches an entry placed in the available ring
+ by the guest earlier), and the total number of bytes written into the buffer.
+ The latter is extremely useful for guests using untrusted buffers: if you
+ do not know exactly how much has been written by the device, you usually
+ have to zero the buffer to ensure no data leakage occurs.
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+/* u32 is used here for ids for padding reasons.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+struct vring_used_elem {
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Index of start of used descriptor chain.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 id;
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Total length of the descriptor chain which was used (written to)
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 len;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+struct vring_used {
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_USED_F_NO_NOTIFY 1
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 idx;
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_used_elem ring[qsz];
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Helpers for Managing Virtqueues
+\end_layout
+
+\begin_layout Standard
+The Linux Kernel Source code contains the definitions above and helper routines
+ in a more usable form, in include/linux/virtio_ring.h.
+ This was explicitly licensed by IBM under the (3-clause) BSD license so
+ that it can be freely used by all other projects, and is reproduced (with
+ slight variation to remove Linux assumptions) in Appendix A.
+\end_layout
+
+\begin_layout Section
+Device Operation
+\end_layout
+
+\begin_layout Standard
+There are two parts to device operation: supplying new buffers to the device,
+ and processing used buffers from the device.
+ As an example, the virtio network device has two virtqueues: the transmit
+ virtqueue and the receive virtqueue.
+ The driver adds outgoing (read-only) packets to the transmit virtqueue,
+ and then frees them after they are used.
+ Similarly, incoming (write-only) buffers are added to the receive virtqueue,
+ and processed after they are used.
+\end_layout
+
+\begin_layout Subsection
+Supplying Buffers to The Device
+\end_layout
+
+\begin_layout Standard
+Actual transfer of buffers from the guest OS to the device operates as follows:
+\end_layout
+
+\begin_layout Enumerate
+Place the buffer(s) into free descriptor(s).
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+If there are no free descriptors, the guest may choose to notify the device
+ even if notifications are suppressed (to reduce latency).
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The Linux drivers do this only for read-only buffers: for write-only buffers,
+ it is assumed that the driver is merely trying to keep the receive buffer
+ ring full, and no notification of this expected condition is necessary.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+Place the id of the buffer in the next ring entry of the available ring.
+\end_layout
+
+\begin_layout Enumerate
+The steps (1) and (2) may be performed repeatedly if batching is possible.
+\end_layout
+
+\begin_layout Enumerate
+A memory barrier should be executed to ensure the device sees the updated
+ descriptor table and available ring before the next step.
+\end_layout
+
+\begin_layout Enumerate
+The available
+\begin_inset Quotes eld
+\end_inset
+
+idx
+\begin_inset Quotes erd
+\end_inset
+
+ field should be increased by the number of entries added to the available
+ ring.
+\end_layout
+
+\begin_layout Enumerate
+A memory barrier should be executed to ensure that we update the idx field
+ before checking for notification suppression.
+\end_layout
+
+\begin_layout Enumerate
+If notifications are not suppressed, the device should be notified of the
+ new buffers.
+\end_layout
+
+\begin_layout Standard
+Note that the above code does not take precautions against the available
+ ring buffer wrapping around: this is not possible since the ring buffer
+ is the same size as the descriptor table, so step (1) will prevent such
+ a condition.
+\end_layout
+
+\begin_layout Standard
+In addition, the maximum queue size is 32768 (it must be a power of 2 which
+ fits in 16 bits), so the 16-bit
+\begin_inset Quotes eld
+\end_inset
+
+idx
+\begin_inset Quotes erd
+\end_inset
+
+ value can always distinguish between a full and empty buffer.
+\end_layout
+
+\begin_layout Standard
+Here is a description of each stage in more detail.
+\end_layout
+
+\begin_layout Subsubsection
+Placing Buffers Into The Descriptor Table
+\end_layout
+
+\begin_layout Standard
+A buffer consists of zero or more read-only physically-contiguous elements
+ followed by zero or more physically-contiguous write-only elements (it
+ must have at least one element).
+ This algorithm maps it into the descriptor table:
+\end_layout
+
+\begin_layout Enumerate
+for each buffer element,
+\family typewriter
+b
+\family default
+:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Get the next free descriptor table entry,
+\family typewriter
+d
+\end_layout
+
+\begin_layout Enumerate
+Set
+\family typewriter
+d.addr
+\family default
+ to the physical address of the start of
+\family typewriter
+b
+\end_layout
+
+\begin_layout Enumerate
+Set
+\family typewriter
+d.len
+\family default
+ to the length of
+\family typewriter
+b
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+If
+\family typewriter
+b
+\family default
+ is write-only, set
+\family typewriter
+d.flags
+\family default
+ to VRING_DESC_F_WRITE, otherwise 0.
+\end_layout
+
+\begin_layout Enumerate
+If there is a buffer element after this:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Set
+\family typewriter
+d.next
+\family default
+ to the index of the next free descriptor element.
+\end_layout
+
+\begin_layout Enumerate
+Set the VRING_DESC_F_NEXT bit in
+\family typewriter
+d.flags
+\family default
+.
+\end_layout
+
+\end_deeper
+\end_deeper
+\begin_layout Standard
+In practice, the d.next fields are usually used to chain free descriptors,
+ and a separate count kept to check there are enough free descriptors before
+ beginning the mappings.
+\end_layout
+
+\begin_layout Subsubsection
+Updating The Available Ring
+\end_layout
+
+\begin_layout Standard
+The head of the buffer we mapped is the first
+\family typewriter
+d
+\family default
+ in the algorithm above.
+ A naive implementation would do the following:
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+avail->ring[avail->idx % qsz] = head;
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+However, in general we can add many descriptors before we update the
+\begin_inset Quotes eld
+\end_inset
+
+idx
+\begin_inset Quotes erd
+\end_inset
+
+ field (at which point they become visible to the device), so we keep a
+ counter of how many we've added:
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+avail->ring[(avail->idx + added++) % qsz] = head;
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+Updating The Index Field
+\end_layout
+
+\begin_layout Standard
+Once the idx field of the virtqueue is updated, the device will be able
+ to access the descriptor entries we've created and the memory they refer
+ to.
+ This is why a memory barrier is generally used before the idx update, to
+ ensure the device sees the most up-to-date copy.
+\end_layout
+
+\begin_layout Standard
+The idx field always increments, and we let it wrap naturally at 65536:
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+avail->idx += added;
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsubsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Notifying-The-Device"
+
+\end_inset
+
+Notifying The Device
+\end_layout
+
+\begin_layout Standard
+Device notification occurs by writing the 16-bit virtqueue index of this
+ virtqueue to the Queue Notify field of the virtio header in the first I/O
+ region of the PCI device.
+ This can be expensive, however, so the device can suppress such notifications
+ if it doesn't need them.
+ We have to be careful to expose the new idx value
+\emph on
+before
+\emph default
+ checking the suppression flag: it's OK to notify gratuitously, but not
+ to omit a required notification.
+ So again, we use a memory barrier here before reading the flags.
+\end_layout
+
+\begin_layout Standard
+If the VRING_USED_F_NO_NOTIFY flag is not set, we go ahead and write to the
+ PCI configuration space.
+\end_layout
+
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Receiving-Used-Buffers"
+
+\end_inset
+
+Receiving Used Buffers From The Device
+\end_layout
+
+\begin_layout Standard
+Once the device has used a buffer (read from or written to it, or parts
+ of both, depending on the nature of the virtqueue and the device), it sends
+ an interrupt, following an algorithm very similar to the algorithm used
+ for the driver to send the device a buffer:
+\end_layout
+
+\begin_layout Enumerate
+Write the head descriptor number to the next field in the used ring.
+\end_layout
+
+\begin_layout Enumerate
+Update the used ring idx.
+\end_layout
+
+\begin_layout Enumerate
+If the VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail\SpecialChar \nobreakdash-
+>flags:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+If MSI-X capability is disabled:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Set the lower bit of the ISR Status field for the device.
+\end_layout
+
+\begin_layout Enumerate
+Send the appropriate PCI interrupt for the device.
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+If MSI-X capability is enabled:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Request the appropriate MSI-X interrupt message for the device, Queue Vector
+ field sets the MSI-X Table entry number.
+\end_layout
+
+\begin_layout Enumerate
+If Queue Vector field value is NO_VECTOR, no interrupt message is requested
+ for this event.
+\end_layout
+
+\end_deeper
+\end_deeper
+\begin_layout Standard
+The guest interrupt handler should:
+\end_layout
+
+\begin_layout Enumerate
+If MSI-X capability is disabled: read the ISR Status field, which will reset
+ it to zero.
+ If the lower bit is zero, the interrupt was not for this device.
+ Otherwise, the guest driver should look through the used rings of each
+ virtqueue for the device, to see if any progress has been made by the device
+ which requires servicing.
+\end_layout
+
+\begin_layout Enumerate
+If MSI-X capability is enabled: look through the used rings of each virtqueue
+ mapped to the specific MSI-X vector for the device, to see if any progress
+ has been made by the device which requires servicing.
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+while (vq->last_seen_used != vring->used.idx) {
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_used_elem *e = &vring->used.ring[vq->last_seen_used%qsz];
+\end_layout
+
+\begin_layout Plain Layout
+
+ process_buffer(e);
+\end_layout
+
+\begin_layout Plain Layout
+
+ vq->last_seen_used++;
+\end_layout
+
+\begin_layout Plain Layout
+
+}
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection
+Dealing With Configuration Changes
+\end_layout
+
+\begin_layout Standard
+Some virtio PCI devices can change the device configuration state, as reflected
+ in the virtio header in the PCI configuration space.
+ In this case:
+\end_layout
+
+\begin_layout Enumerate
+If MSI-X capability is disabled: an interrupt is delivered and the second
+ lowest bit is set in the ISR Status field to indicate that the driver
+ should re-examine the configuration space.
+ Note that a single interrupt can indicate both that one or more virtqueues
+ have been used and that the configuration space has changed: even if the
+ config bit is set, virtqueues must be scanned.
+\end_layout
+
+\begin_layout Enumerate
+If MSI-X capability is enabled: an interrupt message is requested.
+ The Configuration Vector field sets the MSI-X Table entry number to use.
+ If Configuration Vector field value is NO_VECTOR, no interrupt message
+ is requested for this event.
+\end_layout
+
+\begin_layout Chapter
+Creating New Device Types
+\end_layout
+
+\begin_layout Standard
+Various considerations are necessary when creating a new device type:
+\end_layout
+
+\begin_layout Section*
+How Many Virtqueues?
+\end_layout
+
+\begin_layout Standard
+It is possible that a very simple device will operate entirely through its
+ configuration space, but most will need at least one virtqueue in which
+ it will place requests.
+ A device with both input and output (e.g.
+ console and network devices described here) needs two queues: one which
+ the driver fills with buffers to receive input, and one in which the driver
+ places buffers to transmit output.
+\end_layout
+
+\begin_layout Section*
+What Configuration Space Layout?
+\end_layout
+
+\begin_layout Standard
+Configuration space is generally used for rarely-changing or initialization-time
+ parameters.
+ But it is a limited resource, so it might be better to use a virtqueue
+ to update configuration information (the network device does this for filtering
+, otherwise the table in the config space could potentially be very large).
+\end_layout
+
+\begin_layout Standard
+Note that this space is generally the guest's native endian, rather than
+ PCI's little-endian.
+\end_layout
+
+\begin_layout Section*
+What Device Number?
+\end_layout
+
+\begin_layout Standard
+Currently device numbers are assigned quite freely: a simple request mail
+ to the author of this document or the Linux virtualization mailing list
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+https://lists.linux-foundation.org/mailman/listinfo/virtualization
+\end_layout
+
+\end_inset
+
+ will be sufficient to secure a unique one.
+\end_layout
+
+\begin_layout Standard
+Meanwhile for experimental drivers, use 65535 and work backwards.
+\end_layout
+
+\begin_layout Section*
+How many MSI-X vectors?
+\end_layout
+
+\begin_layout Standard
+Using the optional MSI-X capability, devices can speed up interrupt processing
+ by removing the need for the guest driver to read the ISR Status register
+ (which might be an expensive operation), reducing interrupt sharing between devices
+ and queues within the device, and handling interrupts from multiple CPUs.
+ However, some systems impose a limit (which might be as low as 256) on
+ the total number of MSI-X vectors that can be allocated to all devices.
+ Devices and/or device drivers should take this into account, limiting the
+ number of vectors used unless the device is expected to cause a high volume
+ of interrupts.
+ Devices can control the number of vectors used by limiting the MSI-X Table
+ Size or not presenting MSI-X capability in PCI configuration space.
+ Drivers can control this by mapping events to as small a number of vectors
+ as possible, or disabling MSI-X capability altogether.
+\end_layout
+
+\begin_layout Section*
+Message Framing
+\end_layout
+
+\begin_layout Standard
+The descriptors used for a buffer should not affect the semantics of the
+ message, except for the total length of the buffer.
+ For example, a network buffer consists of a 10 byte header followed by
+ the network packet.
+ Whether this is presented in the ring descriptor chain as (say) a 10 byte
+ buffer and a 1514 byte buffer, or a single 1524 byte buffer, or even three
+ buffers, should have no effect.
+\end_layout
+
+\begin_layout Standard
+In particular, no implementation should use the descriptor boundaries to
+ determine the size of any header in a request.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+The current qemu device implementations mistakenly insist that the first
+ descriptor cover the header in these cases exactly, so a cautious driver
+ should arrange it so.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section*
+Device Improvements
+\end_layout
+
+\begin_layout Standard
+Any change to configuration space, or new virtqueues, or behavioural changes,
+ should be indicated by negotiation of a new feature bit.
+ This establishes clarity
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Even if it does mean documenting design or implementation mistakes!
+\end_layout
+
+\end_inset
+
+ and avoids future expansion problems.
+\end_layout
+
+\begin_layout Standard
+Clusters of functionality which are always implemented together can use
+ a single bit, but if one feature makes sense without the others they should
+ not be gratuitously grouped together to conserve feature bits.
+ We can always extend the spec when the first person needs more than 24
+ feature bits for their device.
+\end_layout
+
+\begin_layout Standard
+\begin_inset CommandInset nomencl_print
+LatexCommand printnomenclature
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Chapter*
+Appendix A: virtio_ring.h
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+#ifndef VIRTIO_RING_H
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_RING_H
+\end_layout
+
+\begin_layout Plain Layout
+
+/* An interface for efficient virtio implementation.
+\end_layout
+
+\begin_layout Plain Layout
+
+ *
+\end_layout
+
+\begin_layout Plain Layout
+
+ * This header is BSD licensed so anyone can use the definitions
+\end_layout
+
+\begin_layout Plain Layout
+
+ * to implement compatible drivers/servers.
+\end_layout
+
+\begin_layout Plain Layout
+
+ *
+\end_layout
+
+\begin_layout Plain Layout
+
+ * Copyright 2007, 2009, IBM Corporation
+\end_layout
+
+\begin_layout Plain Layout
+
+ * All rights reserved.
+\end_layout
+
+\begin_layout Plain Layout
+
+ *
+\end_layout
+
+\begin_layout Plain Layout
+
+ * Redistribution and use in source and binary forms, with or without
+\end_layout
+
+\begin_layout Plain Layout
+
+ * modification, are permitted provided that the following conditions
+\end_layout
+
+\begin_layout Plain Layout
+
+ * are met:
+\end_layout
+
+\begin_layout Plain Layout
+
+ * 1.
+ Redistributions of source code must retain the above copyright
+\end_layout
+
+\begin_layout Plain Layout
+
+ * notice, this list of conditions and the following disclaimer.
+\end_layout
+
+\begin_layout Plain Layout
+
+ * 2.
+ Redistributions in binary form must reproduce the above copyright
+\end_layout
+
+\begin_layout Plain Layout
+
+ * notice, this list of conditions and the following disclaimer in the
+\end_layout
+
+\begin_layout Plain Layout
+
+ * documentation and/or other materials provided with the distribution.
+\end_layout
+
+\begin_layout Plain Layout
+
+ * 3.
+ Neither the name of IBM nor the names of its contributors
+\end_layout
+
+\begin_layout Plain Layout
+
+ * may be used to endorse or promote products derived from this software
+\end_layout
+
+\begin_layout Plain Layout
+
+ * without specific prior written permission.
+\end_layout
+
+\begin_layout Plain Layout
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
+ IS'' AND
+\end_layout
+
+\begin_layout Plain Layout
+
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+\end_layout
+
+\begin_layout Plain Layout
+
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+\end_layout
+
+\begin_layout Plain Layout
+
+ * ARE DISCLAIMED.
+ IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+\end_layout
+
+\begin_layout Plain Layout
+
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+\end_layout
+
+\begin_layout Plain Layout
+
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+\end_layout
+
+\begin_layout Plain Layout
+
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+\end_layout
+
+\begin_layout Plain Layout
+
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+\end_layout
+
+\begin_layout Plain Layout
+
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY
+\end_layout
+
+\begin_layout Plain Layout
+
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+\end_layout
+
+\begin_layout Plain Layout
+
+ * SUCH DAMAGE.
+\end_layout
+
+\begin_layout Plain Layout
+
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+/* This marks a buffer as continuing via the next field.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_DESC_F_NEXT 1
+\end_layout
+
+\begin_layout Plain Layout
+
+/* This marks a buffer as write-only (otherwise read-only).
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_DESC_F_WRITE 2
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+\end_layout
+
+\begin_layout Plain Layout
+
+ * when you add a buffer.
+ It's unreliable, so it's simply an
+\end_layout
+
+\begin_layout Plain Layout
+
+ * optimization.
+ Guest will still kick if it's out of buffers.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_USED_F_NO_NOTIFY 1
+\end_layout
+
+\begin_layout Plain Layout
+
+/* The Guest uses this in avail->flags to advise the Host: don't
+\end_layout
+
+\begin_layout Plain Layout
+
+ * interrupt me when you consume a buffer.
+ It's unreliable, so it's
+\end_layout
+
+\begin_layout Plain Layout
+
+ * simply an optimization.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+/* Virtio ring descriptors: 16 bytes.
+\end_layout
+
+\begin_layout Plain Layout
+
+ * These can chain together via "next".
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+struct vring_desc {
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Address (guest-physical).
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint64_t addr;
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Length.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint32_t len;
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* The flags as indicated above.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* We chain unused descriptors via this, too */
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t next;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+struct vring_avail {
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t idx;
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t ring[];
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+/* u32 is used here for ids for padding reasons.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+struct vring_used_elem {
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Index of start of used descriptor chain.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint32_t id;
+\end_layout
+
+\begin_layout Plain Layout
+
+ /* Total length of the descriptor chain which was written to.
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint32_t len;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+struct vring_used {
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t idx;
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_used_elem ring[];
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+struct vring {
+\end_layout
+
+\begin_layout Plain Layout
+
+ unsigned int num;
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_desc *desc;
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_avail *avail;
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct vring_used *used;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+/* The standard layout for the ring is a continuous chunk of memory which
+\end_layout
+
+\begin_layout Plain Layout
+
+ * looks like this.
+ We assume num is a power of 2.
+\end_layout
+
+\begin_layout Plain Layout
+
+ *
+\end_layout
+
+\begin_layout Plain Layout
+
+ * struct vring {
+\end_layout
+
+\begin_layout Plain Layout
+
+ * // The actual descriptors (16 bytes each)
+\end_layout
+
+\begin_layout Plain Layout
+
+ * struct vring_desc desc[num];
+\end_layout
+
+\begin_layout Plain Layout
+
+ *
+\end_layout
+
+\begin_layout Plain Layout
+
+ * // A ring of available descriptor heads with free-running index.
+\end_layout
+
+\begin_layout Plain Layout
+
+ * __u16 avail_flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+ * __u16 avail_idx;
+\end_layout
+
+\begin_layout Plain Layout
+
+ * __u16 available[num];
+\end_layout
+
+\begin_layout Plain Layout
+
+ *
+\end_layout
+
+\begin_layout Plain Layout
+
+ * // Padding to the next align boundary.
+\end_layout
+
+\begin_layout Plain Layout
+
+ * char pad[];
+\end_layout
+
+\begin_layout Plain Layout
+
+ *
+\end_layout
+
+\begin_layout Plain Layout
+
+ * // A ring of used descriptor heads with free-running index.
+\end_layout
+
+\begin_layout Plain Layout
+
+ * __u16 used_flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+ * __u16 used_idx;
+\end_layout
+
+\begin_layout Plain Layout
+
+ * struct vring_used_elem used[num];
+\end_layout
+
+\begin_layout Plain Layout
+
+ * };
+\end_layout
+
+\begin_layout Plain Layout
+
+ * Note: for virtio PCI, align is 4096.
+\end_layout
+
+\begin_layout Plain Layout
+
+ */
+\end_layout
+
+\begin_layout Plain Layout
+
+static inline void vring_init(struct vring *vr, unsigned int num, void *p,
+\end_layout
+
+\begin_layout Plain Layout
+
+ unsigned long align)
+\end_layout
+
+\begin_layout Plain Layout
+
+{
+\end_layout
+
+\begin_layout Plain Layout
+
+ vr->num = num;
+\end_layout
+
+\begin_layout Plain Layout
+
+ vr->desc = p;
+\end_layout
+
+\begin_layout Plain Layout
+
+ vr->avail = p + num*sizeof(struct vring_desc);
+\end_layout
+
+\begin_layout Plain Layout
+
+ vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
+\end_layout
+
+\begin_layout Plain Layout
+
+ + align-1)
+\end_layout
+
+\begin_layout Plain Layout
+
+ & ~(align - 1));
+\end_layout
+
+\begin_layout Plain Layout
+
+}
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+static inline unsigned vring_size(unsigned int num, unsigned long align)
+\end_layout
+
+\begin_layout Plain Layout
+
+{
+\end_layout
+
+\begin_layout Plain Layout
+
+ return ((sizeof(struct vring_desc)*num + sizeof(uint16_t)*(2+num)
+\end_layout
+
+\begin_layout Plain Layout
+
+ + align - 1) & ~(align - 1))
+\end_layout
+
+\begin_layout Plain Layout
+
+ + sizeof(uint16_t)*2 + sizeof(struct vring_used_elem)*num;
+\end_layout
+
+\begin_layout Plain Layout
+
+}
+\end_layout
+
+\begin_layout Plain Layout
+
+#endif /* VIRTIO_RING_H */
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Chapter*
+\begin_inset CommandInset label
+LatexCommand label
+name "cha:Reserved-Feature-Bits"
+
+\end_inset
+
+Appendix B: Reserved Feature Bits
+\end_layout
+
+\begin_layout Standard
+Currently there are three device-independent feature bits defined:
+\end_layout
+
+\begin_layout Description
+VIRTIO_F_NOTIFY_ON_EMPTY
+\begin_inset space ~
+\end_inset
+
+(24) Negotiating this feature indicates that the driver wants an interrupt
+ if the device runs out of available descriptors on a virtqueue, even though
+ interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT flag.
+ An example of this is the networking driver: it doesn't need to know every
+ time a packet is transmitted, but it does need to free the transmitted
+ packets a finite time after they are transmitted.
+ It can avoid using a timer if the device interrupts it when all the packets
+ are transmitted.
+\end_layout
+
+\begin_layout Description
+VIRTIO_F_RING_INDIRECT_DESC
+\begin_inset space ~
+\end_inset
+
+(28) Negotiating this feature indicates that the driver can use descriptors
+ with the VRING_DESC_F_INDIRECT flag set, as described in
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Indirect-Descriptors"
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Description
+VIRTIO_F_BAD_FEATURE
+\begin_inset space ~
+\end_inset
+
+(30) This feature should never be negotiated by the
+ guest; doing so is an indication that the guest is faulty.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+An experimental virtio PCI driver contained in Linux version 2.6.25 had this
+ problem, and this feature bit can be used to detect it.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Chapter*
+Appendix C: Network Device
+\end_layout
+
+\begin_layout Standard
+The virtio network device is a virtual ethernet card, and is the most complex
+ of the devices supported so far by virtio.
+ It has been enhanced rapidly and demonstrates clearly how support for new features
+ should be added to an existing device.
+ Empty buffers are placed in one virtqueue for receiving packets, and outgoing
+ packets are enqueued into another for transmission in that order.
+ A third command queue is used to control advanced filtering features.
+\end_layout
+
+\begin_layout Section*
+Configuration
+\end_layout
+
+\begin_layout Description
+Subsystem
+\begin_inset space ~
+\end_inset
+
+Device
+\begin_inset space ~
+\end_inset
+
+ID 1
+\end_layout
+
+\begin_layout Description
+Virtqueues 0:receiveq.
+ 1:transmitq.
+ 2:controlq
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Only if VIRTIO_NET_F_CTRL_VQ set
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Description
+Feature
+\begin_inset space ~
+\end_inset
+
+bits
+\end_layout
+
+\begin_deeper
+\begin_layout Description
+VIRTIO_NET_F_CSUM
+\begin_inset space ~
+\end_inset
+
+(0) Device handles packets with partial checksum
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_GUEST_CSUM
+\begin_inset space ~
+\end_inset
+
+(1) Guest handles packets with partial checksum
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_MAC
+\begin_inset space ~
+\end_inset
+
+(5) Device has given MAC address.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_GSO
+\begin_inset space ~
+\end_inset
+
+(6) (Deprecated) device handles packets with any GSO type.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+It was supposed to indicate segmentation offload support, but upon further
+ investigation it became clear that multiple bits were required.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_GUEST_TSO4
+\begin_inset space ~
+\end_inset
+
+(7) Guest can receive TSOv4.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_GUEST_TSO6
+\begin_inset space ~
+\end_inset
+
+(8) Guest can receive TSOv6.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_GUEST_ECN
+\begin_inset space ~
+\end_inset
+
+(9) Guest can receive TSO with ECN.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_GUEST_UFO
+\begin_inset space ~
+\end_inset
+
+(10) Guest can receive UFO.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_HOST_TSO4
+\begin_inset space ~
+\end_inset
+
+(11) Device can receive TSOv4.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_HOST_TSO6
+\begin_inset space ~
+\end_inset
+
+(12) Device can receive TSOv6.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_HOST_ECN
+\begin_inset space ~
+\end_inset
+
+(13) Device can receive TSO with ECN.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_HOST_UFO
+\begin_inset space ~
+\end_inset
+
+(14) Device can receive UFO.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_MRG_RXBUF
+\begin_inset space ~
+\end_inset
+
+(15) Guest can merge receive buffers.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_STATUS
+\begin_inset space ~
+\end_inset
+
+(16) Configuration status field is available.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_CTRL_VQ
+\begin_inset space ~
+\end_inset
+
+(17) Control channel is available.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_CTRL_RX
+\begin_inset space ~
+\end_inset
+
+(18) Control channel RX mode support.
+\end_layout
+
+\begin_layout Description
+VIRTIO_NET_F_CTRL_VLAN
+\begin_inset space ~
+\end_inset
+
+(19) Control channel VLAN filtering.
+\end_layout
+
+\end_deeper
+\begin_layout Description
+Device
+\begin_inset space ~
+\end_inset
+
+configuration
+\begin_inset space ~
+\end_inset
+
+layout Two configuration fields are currently defined.
+ The mac address field always exists (though is only valid if VIRTIO_NET_F_MAC
+ is set), and the status field only exists if VIRTIO_NET_F_STATUS is set.
+ Only one bit is currently defined for the status field: VIRTIO_NET_S_LINK_UP.
+
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_S_LINK_UP 1
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+struct virtio_net_config {
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 mac[6];
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 status;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section*
+Device Initialization
+\end_layout
+
+\begin_layout Enumerate
+The initialization routine should identify the receive and transmission
+ virtqueues.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_NET_F_MAC feature bit is set, the configuration space
+\begin_inset Quotes eld
+\end_inset
+
+mac
+\begin_inset Quotes erd
+\end_inset
+
+ entry indicates the
+\begin_inset Quotes eld
+\end_inset
+
+physical
+\begin_inset Quotes erd
+\end_inset
+
+ address of the network card, otherwise a private MAC address should
+ be assigned.
+ All guests are expected to negotiate this feature if it is set.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify the control
+ virtqueue.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link status can
+ be read from the bottom bit of the
+\begin_inset Quotes eld
+\end_inset
+
+status
+\begin_inset Quotes erd
+\end_inset
+
+ config field.
+ Otherwise, the link should be assumed active.
+\end_layout
+
+\begin_layout Enumerate
+The receive virtqueue should be filled with receive buffers.
+ This is described in detail below in
+\begin_inset Quotes eld
+\end_inset
+
+Setting Up Receive Buffers
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Enumerate
+A driver can indicate that it will generate checksumless packets by negotiating
+ the VIRTIO_NET_F_CSUM feature.
+ This
+\begin_inset Quotes eld
+\end_inset
+
+checksum offload
+\begin_inset Quotes erd
+\end_inset
+
+ is a common feature on modern network cards.
+\end_layout
+
+\begin_layout Enumerate
+If that feature is negotiated, a driver can use TCP or UDP segmentation
+ offload by negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP), VIRTIO_NET_F_HOST
+_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO (UDP fragmentation) features.
+ It should not send TCP packets requiring segmentation offload which have
+ the Explicit Congestion Notification bit set, unless the VIRTIO_NET_F_HOST_ECN
+ feature is negotiated.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+This is a common restriction in real, older network cards.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Enumerate
+The converse features are also available: a driver can save the virtual
+ device some work by negotiating these features.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+For example, a network packet transported between two guests on the same
+ system may not require checksumming at all, nor segmentation, if both guests
+ are amenable.
+\end_layout
+
+\end_inset
+
+ The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially checksummed
+ packets can be received, and if it can do that then the VIRTIO_NET_F_GUEST_TSO4
+, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN
+ are the input equivalents of the features described above.
+ See
+\begin_inset Quotes eld
+\end_inset
+
+Receiving Packets
+\begin_inset Quotes erd
+\end_inset
+
+ below.
+\end_layout
+
+\begin_layout Section*
+Device Operation
+\end_layout
+
+\begin_layout Standard
+Packets are transmitted by placing them in the transmitq, and buffers for
+ incoming packets are placed in the receiveq.
+ In each case, the packet itself is preceded by a header:
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_net_hdr {
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 flags;
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_HDR_GSO_NONE 0
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_HDR_GSO_TCPV4 1
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_HDR_GSO_UDP 3
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_HDR_GSO_TCPV6 4
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_HDR_GSO_ECN 0x80
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 gso_type;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 hdr_len;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 gso_size;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 csum_start;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 csum_offset;
+\end_layout
+
+\begin_layout Plain Layout
+
+/* Only if VIRTIO_NET_F_MRG_RXBUF: */
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 num_buffers;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The controlq is used to control device features such as filtering.
+\end_layout
+
+\begin_layout Subsection*
+Packet Transmission
+\end_layout
+
+\begin_layout Standard
+Transmitting a single packet is simple, but varies depending on the different
+ features the driver negotiated.
+\end_layout
+
+\begin_layout Enumerate
+If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has not been
+ fully checksummed, then the virtio_net_hdr's fields are set as follows.
+ Otherwise, the packet must be fully checksummed, and flags is zero.
+\end_layout
+
+\begin_deeper
+\begin_layout Itemize
+flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
+\end_layout
+
+\begin_layout Itemize
+\begin_inset CommandInset label
+LatexCommand label
+name "ite:csum_start-is-set"
+
+\end_inset
+
+csum_start is set to the offset within the packet to begin checksumming,
+ and
+\end_layout
+
+\begin_layout Itemize
+csum_offset indicates how many bytes after the csum_start the new (16 bit
+ ones' complement) checksum should be placed.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+For example, consider a partially checksummed TCP (IPv4) packet.
+ It will have a 14 byte ethernet header and 20 byte IP header followed by
+ the TCP header (with the TCP checksum field 16 bytes into that header).
+ csum_start will be 14+20 = 34 (the TCP checksum includes the header), and
+ csum_offset will be 16.
+ The value in the TCP checksum field will be the sum of the TCP pseudo header,
+ so that replacing it by the ones' complement checksum of the TCP header
+ and body will give the correct result.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+\begin_inset CommandInset label
+LatexCommand label
+name "enu:If-the-driver"
+
+\end_inset
+
+If the driver negotiated VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet
+ requires TCP segmentation or UDP fragmentation, then the
+\begin_inset Quotes eld
+\end_inset
+
+gso_type
+\begin_inset Quotes erd
+\end_inset
+
+ field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP.
+ (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE).
+ In this case, packets larger than 1514 bytes can be transmitted: the metadata
+ indicates how to replicate the packet header to cut it into smaller packets.
+ The other gso fields are set:
+\end_layout
+
+\begin_deeper
+\begin_layout Itemize
+hdr_len is a hint to the device as to how much of the header needs to be
+ kept to copy into each packet, usually set to the length of the headers,
+ including the transport header.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Due to various bugs in implementations, this field is not useful as a guarantee
+ of the transport header size.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Itemize
+gso_size is the size of the packet beyond that header (ie.
+ MSS).
+\end_layout
+
+\begin_layout Itemize
+If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the VIRTIO_NET_HDR_G
+SO_ECN bit may be set in
+\begin_inset Quotes eld
+\end_inset
+
+gso_type
+\begin_inset Quotes erd
+\end_inset
+
+ as well, indicating that the TCP packet has the ECN bit set.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+This case is not handled by some older hardware, so is called out specifically
+ in the protocol.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, the num_buffers
+ field is set to zero.
+\end_layout
+
+\begin_layout Enumerate
+The header and packet are added as one output buffer to the transmitq, and
+ the device is notified of the new entry (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Notifying-The-Device"
+
+\end_inset
+
+).
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Note that the header will be two bytes longer for the VIRTIO_NET_F_MRG_RXBUF
+ case.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Subsection*
+Packet Transmission Interrupt
+\end_layout
+
+\begin_layout Standard
+Often a driver will suppress transmission interrupts using the VRING_AVAIL_F_NO_
+INTERRUPT flag (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Receiving-Used-Buffers"
+
+\end_inset
+
+) and check for used packets in the transmit path of following packets.
+ However, it will still receive interrupts if the VIRTIO_F_NOTIFY_ON_EMPTY
+ feature is negotiated, indicating that the transmission queue is completely
+ emptied.
+\end_layout
+
+\begin_layout Standard
+The normal behavior in this interrupt handler is to retrieve any new descriptors
+ from the used ring and free the corresponding headers and packets.
+\end_layout
+
+\begin_layout Subsection*
+Setting Up Receive Buffers
+\end_layout
+
+\begin_layout Standard
+It is generally a good idea to keep the receive virtqueue as fully populated
+ as possible: if it runs out, network performance will suffer.
+\end_layout
+
+\begin_layout Standard
+If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or VIRTIO_NET_F_GUEST_UF
+O features are used, the Guest will need to accept packets of up to 65550
+ bytes long (the maximum size of a TCP or UDP packet, plus the 14 byte ethernet
+ header), otherwise 1514 bytes.
+ So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every buffer in the receive
+ queue needs to be at least this length
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Obviously each one can be split across multiple descriptor elements.
+\end_layout
+
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at least the
+ size of the
+\family typewriter
+struct virtio_net_hdr
+\family default
+.
+\end_layout
+
+\begin_layout Subsection*
+Packet Receive Interrupt
+\end_layout
+
+\begin_layout Standard
+When a packet is copied into a buffer in the receiveq, the optimal path
+ is to disable further interrupts for the receiveq (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Receiving-Used-Buffers"
+
+\end_inset
+
+) and process packets until no more are found, then re-enable them.
+\end_layout
+
+\begin_layout Standard
+Processing a packet involves:
+\end_layout
+
+\begin_layout Enumerate
+If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, then the
+\begin_inset Quotes eld
+\end_inset
+
+num_buffers
+\begin_inset Quotes erd
+\end_inset
+
+ field indicates how many descriptors this packet is spread over (including
+ this one).
+ This allows receipt of large packets without having to allocate large buffers.
+ In this case, there will be at least
+\begin_inset Quotes eld
+\end_inset
+
+num_buffers
+\begin_inset Quotes erd
+\end_inset
+
+ in the used ring, and they should be chained together to form a single
+ packet.
+ The other buffers will
+\emph on
+not
+\emph default
+ begin with a
+\family typewriter
+struct virtio_net_hdr
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or the
+\begin_inset Quotes eld
+\end_inset
+
+num_buffers
+\begin_inset Quotes erd
+\end_inset
+
+ field is one, then the entire packet will be contained within this buffer,
+ immediately following the
+\family typewriter
+struct virtio_net_hdr
+\family default
+.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the VIRTIO_NET_HDR_F_NEED
+S_CSUM bit in the
+\begin_inset Quotes eld
+\end_inset
+
+flags
+\begin_inset Quotes erd
+\end_inset
+
+ field may be set: if so, the checksum on the packet is incomplete and the
+
+\begin_inset Quotes eld
+\end_inset
+
+csum_start
+\begin_inset Quotes erd
+\end_inset
+
+ and
+\begin_inset Quotes eld
+\end_inset
+
+csum_offset
+\begin_inset Quotes erd
+\end_inset
+
+ fields indicate how to calculate it (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "ite:csum_start-is-set"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were negotiated, then
+ the
+\begin_inset Quotes eld
+\end_inset
+
+gso_type
+\begin_inset Quotes erd
+\end_inset
+
+ may be something other than VIRTIO_NET_HDR_GSO_NONE, and the
+\begin_inset Quotes eld
+\end_inset
+
+gso_size
+\begin_inset Quotes erd
+\end_inset
+
+ field indicates the desired MSS (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "enu:If-the-driver"
+
+\end_inset
+
+).
+\end_layout
+
+\begin_layout Subsection*
+Control Virtqueue
+\end_layout
+
+\begin_layout Standard
+The driver uses the control virtqueue (if VIRTIO_NET_F_CTRL_VQ is negotiated)
+ to send commands to manipulate various features of the device which would
+ not easily map into the configuration space.
+\end_layout
+
+\begin_layout Standard
+All commands are of the following form:
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_net_ctrl {
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 class;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 command;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 command-specific-data[];
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 ack;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+/* ack values */
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_OK 0
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_ERR 1
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The class, command and command-specific-data are set by the driver, and
+ the device sets the ack byte.
+ There is little it can do except issue a diagnostic if the ack byte is
+ not VIRTIO_NET_OK.
+\end_layout
+
+\begin_layout Subsection*
+Packet Receive Filtering
+\end_layout
+
+\begin_layout Standard
+If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can send control
+ commands for promiscuous mode, multicast receiving, and filtering of MAC
+ addresses.
+\end_layout
+
+\begin_layout Standard
+Note that in general, these commands are best-effort: unwanted packets may
+ still arrive.
+
+\end_layout
+
+\begin_layout Subsubsection*
+Setting Promiscuous Mode
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_CTRL_RX 0
+\end_layout
+
+\begin_layout Plain Layout
+
+ #define VIRTIO_NET_CTRL_RX_PROMISC 0
+\end_layout
+
+\begin_layout Plain Layout
+
+ #define VIRTIO_NET_CTRL_RX_ALLMULTI 1
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The class VIRTIO_NET_CTRL_RX has two commands: VIRTIO_NET_CTRL_RX_PROMISC
+ turns promiscuous mode on and off, and VIRTIO_NET_CTRL_RX_ALLMULTI turns
+ all-multicast receive on and off.
+ The command-specific-data is one byte containing 0 (off) or 1 (on).
+\end_layout
+
+\begin_layout Subsubsection*
+Setting MAC Address Filtering
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_net_ctrl_mac {
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 entries;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 macs[entries][ETH_ALEN];
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_CTRL_MAC 1
+\end_layout
+
+\begin_layout Plain Layout
+
+ #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The device can filter incoming packets by any number of destination MAC
+ addresses.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Since there are no guarantees, it can use a hash filter or silently switch
+ to allmulti or promiscuous mode if it is given too many addresses.
+\end_layout
+
+\end_inset
+
+ This table is set using the class VIRTIO_NET_CTRL_MAC and the command VIRTIO_NE
+T_CTRL_MAC_TABLE_SET.
+ The command-specific-data is two variable length tables of 6-byte MAC addresses.
+ The first table contains unicast addresses, and the second contains multicast
+ addresses.
+\end_layout
+
+\begin_layout Subsection*
+VLAN Filtering
+\end_layout
+
+\begin_layout Standard
+If the driver negotiates the VIRTIO_NET_F_CTRL_VLAN feature, it can control
+ a VLAN filter table in the device.
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+#define VIRTIO_NET_CTRL_VLAN 2
+\end_layout
+
+\begin_layout Plain Layout
+
+ #define VIRTIO_NET_CTRL_VLAN_ADD 0
+\end_layout
+
+\begin_layout Plain Layout
+
+ #define VIRTIO_NET_CTRL_VLAN_DEL 1
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL commands take
+ a 16-bit VLAN id as the command-specific-data.
+\end_layout
+
+\begin_layout Chapter*
+Appendix D: Block Device
+\end_layout
+
+\begin_layout Standard
+The virtio block device is a simple virtual block device (i.e.
+ disk).
+ Read and write requests (and other exotic requests) are placed in the queue,
+ and serviced (probably out of order) by the device except where noted.
+\end_layout
+
+\begin_layout Section*
+Configuration
+\end_layout
+
+\begin_layout Description
+Subsystem
+\begin_inset space ~
+\end_inset
+
+Device
+\begin_inset space ~
+\end_inset
+
+ID 2
+\end_layout
+
+\begin_layout Description
+Virtqueues 0:requestq.
+\end_layout
+
+\begin_layout Description
+Feature
+\begin_inset space ~
+\end_inset
+
+bits
+\end_layout
+
+\begin_deeper
+\begin_layout Description
+VIRTIO_BLK_F_BARRIER
+\begin_inset space ~
+\end_inset
+
+(0) Host supports request barriers.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BLK_F_SIZE_MAX
+\begin_inset space ~
+\end_inset
+
+(1) Maximum size of any single segment is in
+\begin_inset Quotes eld
+\end_inset
+
+size_max
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BLK_F_SEG_MAX
+\begin_inset space ~
+\end_inset
+
+(2) Maximum number of segments in a request is in
+\begin_inset Quotes eld
+\end_inset
+
+seg_max
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BLK_F_GEOMETRY
+\begin_inset space ~
+\end_inset
+
+(4) Disk-style geometry specified in
+\begin_inset Quotes eld
+\end_inset
+
+geometry
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BLK_F_RO
+\begin_inset space \space{}
+\end_inset
+
+(5) Device is read-only.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BLK_F_BLK_SIZE
+\begin_inset space ~
+\end_inset
+
+(6) Block size of disk is in
+\begin_inset Quotes eld
+\end_inset
+
+blk_size
+\begin_inset Quotes erd
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BLK_F_FLUSH (9) Cache flush command support.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BLK_F_SECTOR_MAX
+\begin_inset space ~
+\end_inset
+
+(10) Maximum total sectors in an I/O.
+\end_layout
+
+\end_deeper
+\begin_layout Description
+Device
+\begin_inset space ~
+\end_inset
+
+configuration
+\begin_inset space ~
+\end_inset
+
+layout The capacity of the device (expressed in 512-byte sectors) is always
+ present.
+ The availability of the others depends on various feature bits as indicated
+ above.
+
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_blk_config {
+\end_layout
+
+\begin_layout Plain Layout
+
+ u64 capacity;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 size_max;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 seg_max;
+\end_layout
+
+\begin_layout Plain Layout
+
+ struct virtio_blk_geometry {
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 cylinders;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 heads;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 sectors;
+\end_layout
+
+\begin_layout Plain Layout
+
+ } geometry;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 blk_size;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 sectors_max;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section*
+Device Initialization
+\end_layout
+
+\begin_layout Enumerate
+The device size should be read from the
+\begin_inset Quotes eld
+\end_inset
+
+capacity
+\begin_inset Quotes erd
+\end_inset
+
+ configuration field.
+ No requests should be submitted which go beyond this limit.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the blk_size field can
+ be read to determine the optimal sector size for the driver to use.
+ This does not affect the units used in the protocol (always 512 bytes),
+ but awareness of the correct value can affect performance.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_BLK_F_RO feature is set by the device, any write requests
+ will fail.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_BLK_F_SECTOR_MAX feature is negotiated, the sectors_max field
+ should be read to determine the maximum I/O size for the driver to use.
+ No requests should be submitted which go beyond this limit.
+\end_layout
+
+\begin_layout Section*
+Device Operation
+\end_layout
+
+\begin_layout Standard
+The driver queues requests to the virtqueue, and they are used by the device
+ (not necessarily in order).
+ Each request is of form:
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_blk_req {
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 type;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 ioprio;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u64 sector;
+\end_layout
+
+\begin_layout Plain Layout
+
+ char data[][512];
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 status;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+If the device has VIRTIO_BLK_F_SCSI feature, it can also support scsi packet
+ command requests, each of these requests is of form:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_scsi_pc_req {
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 type;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 ioprio;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u64 sector;
+\end_layout
+
+\begin_layout Plain Layout
+
+ char cmd[];
+\end_layout
+
+\begin_layout Plain Layout
+
+ char data[][512];
+\end_layout
+
+\begin_layout Plain Layout
+
+#define SCSI_SENSE_BUFFERSIZE 96
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 sense[SCSI_SENSE_BUFFERSIZE];
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 errors;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 data_len;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 sense_len;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 residual;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u8 status;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The
+\emph on
+type
+\emph default
+ of the request is either a read (VIRTIO_BLK_T_IN), a write (VIRTIO_BLK_T_OUT),
+ a scsi packet command (VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device does not
+ distinguish between them
+\end_layout
+
+\end_inset
+
+) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+the FLUSH and FLUSH_OUT types are equivalent, the device does not distinguish
+ between them
+\end_layout
+
+\end_inset
+
+).
+ If the device has VIRTIO_BLK_F_BARRIER feature
+\begin_inset space ~
+\end_inset
+
+the high bit (VIRTIO_BLK_T_BARRIER) indicates that this request acts as
+ a barrier and that all preceding requests must be complete before this
+ one, and all following requests must not be started until this is complete.
+ Note that a barrier does not flush caches in the underlying backend device
+ in the host, and thus does not serve as a data consistency guarantee.
+ The driver must use a FLUSH request to flush the host cache.
+\end_layout
+
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_T_IN 0
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_T_OUT 1
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_T_SCSI_CMD 2
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_T_FLUSH 4
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_T_FLUSH_OUT 5
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_T_BARRIER 0x80000000
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+The
+\emph on
+ioprio
+\emph default
+ field is a hint about the relative priorities of requests to the device:
+ higher numbers indicate more important requests.
+\end_layout
+
+\begin_layout Standard
+The
+\emph on
+sector
+\emph default
+ number indicates the offset (multiplied by 512) where the read or write
+ is to occur.
+ This field is unused and set to 0 for scsi packet commands and for flush
+ commands.
+\end_layout
+
+\begin_layout Standard
+The
+\emph on
+cmd
+\emph default
+ field is only present for scsi packet command requests, and indicates the
+ command to perform.
+ This field must reside in a single, separate read-only buffer; command
+ length can be derived from the length of this buffer.
+
+\end_layout
+
+\begin_layout Standard
+Note that these first three (four for scsi packet commands) fields are always
+ read-only: the
+\emph on
+data
+\emph default
+ field is either read-only or write-only, depending on the request.
+ The size of the read or write can be derived from the total size of the
+ request buffers.
+\end_layout
+
+\begin_layout Standard
+The
+\emph on
+ sense
+\emph default
+ field is only present for scsi packet command requests, and indicates the
+ buffer for scsi sense data.
+\end_layout
+
+\begin_layout Standard
+The
+\emph on
+data_len
+\emph default
+ field is only present for scsi packet command requests; this field is deprecate
+d, and should be ignored by the driver.
+ Historically, devices copied data length there.
+\end_layout
+
+\begin_layout Standard
+The
+\emph on
+sense_len
+\emph default
+ field is only present for scsi packet command requests and indicates the
+ number of bytes actually written to the
+\emph on
+sense
+\emph default
+ buffer.
+\end_layout
+
+\begin_layout Standard
+The
+\emph on
+residual
+\emph default
+ field is only present for scsi packet command requests and indicates the
+ residual size, calculated as data length - number of bytes actually transferred.
+\end_layout
+
+\begin_layout Standard
+The final
+\emph on
+status
+\emph default
+ byte is written by the device: either VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S
+_IOERR for host or guest error or VIRTIO_BLK_S_UNSUPP for a request unsupported
+ by host:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_S_OK 0
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_S_IOERR 1
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BLK_S_UNSUPP 2
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Historically, devices assumed that the fields
+\emph on
+type
+\emph default
+,
+\emph on
+ioprio
+\emph default
+ and
+\emph on
+sector
+\emph default
+ reside in a single, separate read-only buffer; the fields
+\emph on
+errors
+\emph default
+,
+\emph on
+data_len
+\emph default
+,
+\emph on
+sense_len
+\emph default
+ and
+\emph on
+residual
+\emph default
+ reside in a single, separate write-only buffer; the
+\emph on
+sense
+\emph default
+ field in a separate write-only buffer of size 96 bytes, by itself; the
+ fields
+\emph on
+errors
+\emph default
+,
+\emph on
+data_len
+\emph default
+,
+\emph on
+sense_len
+\emph default
+ and
+\emph on
+residual
+\emph default
+ in a single write-only buffer; and the
+\emph on
+status
+\emph default
+ field is a separate write-only buffer of size 1 byte, by itself.
+\end_layout
+
+\begin_layout Chapter*
+Appendix E: Console Device
+\end_layout
+
+\begin_layout Standard
+The virtio console device is a simple device for data input and output.
+ A device may have one or more ports.
+ Each port has a pair of input and output virtqueues.
+ Moreover, a device has a pair of control IO virtqueues.
+ The control virtqueues are used to communicate information between the
+ device and the driver about ports being opened and closed on either side
+ of the connection, indication from the host about whether a particular
+ port is a console port, adding new ports, port hot-plug/unplug, etc., and
+ indication from the guest about whether a port or a device was successfully
+ added, port open/close, etc.
+ For data IO, one or more empty buffers are placed in the receive queue
+ for incoming data and outgoing characters are placed in the transmit queue.
+\end_layout
+
+\begin_layout Section*
+Configuration
+\end_layout
+
+\begin_layout Description
+Subsystem
+\begin_inset space ~
+\end_inset
+
+Device
+\begin_inset space ~
+\end_inset
+
+ID 3
+\end_layout
+
+\begin_layout Description
+Virtqueues 0:receiveq(port0),
+ 1:transmitq(port0), 2:control receiveq
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
+\end_layout
+
+\end_inset
+
+, 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1), ...
+\end_layout
+
+\begin_layout Description
+Feature
+\begin_inset space ~
+\end_inset
+
+bits
+\end_layout
+
+\begin_deeper
+\begin_layout Description
+VIRTIO_CONSOLE_F_SIZE
+\begin_inset space ~
+\end_inset
+
+(0) Configuration cols and rows fields are valid.
+\end_layout
+
+\begin_layout Description
+VIRTIO_CONSOLE_F_MULTIPORT (1) Device has support for multiple ports; the
+ configuration field max_nr_ports is valid and control virtqueues will
+ be used.
+\end_layout
+
+\end_deeper
+\begin_layout Description
+Device
+\begin_inset space ~
+\end_inset
+
+configuration
+\begin_inset space ~
+\end_inset
+
+layout The size of the console is supplied in the configuration space if
+ the VIRTIO_CONSOLE_F_SIZE feature is set.
+ Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature is set, the maximum
+ number of ports supported by the device can be fetched.
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_console_config {
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 cols;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 rows;
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 max_nr_ports;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section*
+Device Initialization
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver can read
+ the console dimensions from the configuration fields.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the driver can
+ spawn multiple ports, not all of which may be attached to a console.
+ Some could be generic ports.
+ In this case, the control virtqueues are enabled and according to the max_nr_po
+rts configuration-space value, the appropriate number of virtqueues are
+ created.
+ A control message indicating the driver is ready is sent to the host.
+ The host can then send control messages for adding new ports to the device.
+ After creating and initializing each port, a VIRTIO_CONSOLE_PORT_READY
+ control message is sent to the host for that port so the host can let us
+ know of any additional configuration options set for that port.
+\end_layout
+
+\begin_layout Enumerate
+The receiveq for each port is populated with one or more receive buffers.
+\end_layout
+
+\begin_layout Section*
+Device Operation
+\end_layout
+
+\begin_layout Enumerate
+For output, a buffer containing the characters is placed in the port's transmitq.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Because this is high importance and low bandwidth, the current Linux implementat
+ion polls for the buffer to be used, rather than waiting for an interrupt,
+ simplifying the implementation significantly.
+ However, for generic serial ports with the O_NONBLOCK flag set, the polling
+ limitation is relaxed and the consumed buffers are freed upon the next
+ write or poll call or when a port is closed or hot-unplugged.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Enumerate
+When a buffer is used in the receiveq (signalled by an interrupt), the contents
+ are the input to the port associated with the virtqueue for which the notificati
+on was received.
+\end_layout
+
+\begin_layout Enumerate
+If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a configuration
+ change interrupt may occur.
+ The updated size can be read from the configuration fields.
+\end_layout
+
+\begin_layout Enumerate
+If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT feature, active
+ ports are announced by the host using the VIRTIO_CONSOLE_PORT_ADD control
+ message.
+ The same message is used for port hot-plug as well.
+\end_layout
+
+\begin_layout Enumerate
+If the host specified a port `name', a sysfs attribute is created with the
+ name filled in, so that udev rules can be written that can create a symlink
+ from the port's name to the char device for port discovery by applications
+ in the guest.
+\end_layout
+
+\begin_layout Enumerate
+Changes to ports' state are effected by control messages.
+ Appropriate action is taken on the port indicated in the control message.
+ The layout of the structure of the control buffer and the events associated
+ are:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_console_control {
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint32_t id; /* Port number */
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t event; /* The kind of control event */
+\end_layout
+
+\begin_layout Plain Layout
+
+ uint16_t value; /* Extra information for the event */
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+/* Some events for the internal messages (control packets) */
+\end_layout
+
+\begin_layout Plain Layout
+
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_CONSOLE_DEVICE_READY 0
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_CONSOLE_PORT_ADD 1
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_CONSOLE_PORT_REMOVE 2
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_CONSOLE_PORT_READY 3
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_CONSOLE_CONSOLE_PORT 4
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_CONSOLE_RESIZE 5
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_CONSOLE_PORT_OPEN 6
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_CONSOLE_PORT_NAME 7
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Chapter*
+Appendix F: Entropy Device
+\end_layout
+
+\begin_layout Standard
+The virtio entropy device supplies high-quality randomness for guest use.
+\end_layout
+
+\begin_layout Section*
+Configuration
+\end_layout
+
+\begin_layout Description
+Subsystem
+\begin_inset space ~
+\end_inset
+
+Device
+\begin_inset space ~
+\end_inset
+
+ID 4
+\end_layout
+
+\begin_layout Description
+Virtqueues 0:requestq.
+\end_layout
+
+\begin_layout Description
+Feature
+\begin_inset space ~
+\end_inset
+
+bits None currently defined
+\end_layout
+
+\begin_layout Description
+Device
+\begin_inset space ~
+\end_inset
+
+configuration
+\begin_inset space ~
+\end_inset
+
+layout None currently defined.
+\end_layout
+
+\begin_layout Section*
+Device Initialization
+\end_layout
+
+\begin_layout Enumerate
+The virtqueue is initialized
+\end_layout
+
+\begin_layout Section*
+Device Operation
+\end_layout
+
+\begin_layout Standard
+When the driver requires random bytes, it places the descriptor of one or
+ more buffers in the queue.
+ The buffers will be completely filled with random data by the device.
+\end_layout
+
+\begin_layout Chapter*
+Appendix G: Memory Balloon Device
+\end_layout
+
+\begin_layout Standard
+The virtio memory balloon device is a primitive device for managing guest
+ memory: the device asks for a certain amount of memory, and the guest supplies
+ it (or withdraws it, if the device has more than it asks for).
+ This allows the guest to adapt to changes in allowance of underlying physical
+ memory.
+ If the feature is negotiated, the device can also be used to communicate
+ guest memory statistics to the host.
+\end_layout
+
+\begin_layout Section*
+Configuration
+\end_layout
+
+\begin_layout Description
+Subsystem
+\begin_inset space ~
+\end_inset
+
+Device
+\begin_inset space ~
+\end_inset
+
+ID 5
+\end_layout
+
+\begin_layout Description
+Virtqueues 0:inflateq.
+ 1:deflateq.
+ 2:statsq.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+Only if VIRTIO_BALLOON_F_STATS_VQ set
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Description
+Feature
+\begin_inset space ~
+\end_inset
+
+bits
+\end_layout
+
+\begin_deeper
+\begin_layout Description
+VIRTIO_BALLOON_F_MUST_TELL_HOST
+\begin_inset space ~
+\end_inset
+
+(0) Host must be told before pages from the balloon are used.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BALLOON_F_STATS_VQ
+\begin_inset space \space{}
+\end_inset
+
+(1) A virtqueue for reporting guest memory statistics is present.
+\end_layout
+
+\end_deeper
+\begin_layout Description
+Device
+\begin_inset space ~
+\end_inset
+
+configuration
+\begin_inset space ~
+\end_inset
+
+layout Both fields of this configuration are always available.
+ Note that they are little endian, despite convention that device fields
+ are guest endian:
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_balloon_config {
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 num_pages;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u32 actual;
+\end_layout
+
+\begin_layout Plain Layout
+
+};
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Section*
+Device Initialization
+\end_layout
+
+\begin_layout Enumerate
+The inflate and deflate virtqueues are identified.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated:
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+Identify the stats virtqueue.
+\end_layout
+
+\begin_layout Enumerate
+Add one empty buffer to the stats virtqueue and notify the host.
+\end_layout
+
+\end_deeper
+\begin_layout Standard
+Device operation begins immediately.
+\end_layout
+
+\begin_layout Section*
+Device Operation
+\end_layout
+
+\begin_layout Description
+Memory
+\begin_inset space \space{}
+\end_inset
+
+Ballooning The device is driven by the receipt of a configuration change
+ interrupt.
+\end_layout
+
+\begin_layout Enumerate
+The
+\begin_inset Quotes eld
+\end_inset
+
+num_pages
+\begin_inset Quotes erd
+\end_inset
+
+ configuration field is examined.
+ If this is greater than the
+\begin_inset Quotes eld
+\end_inset
+
+actual
+\begin_inset Quotes erd
+\end_inset
+
+ number of pages, memory must be given to the balloon.
+ If it is less than the
+\begin_inset Quotes eld
+\end_inset
+
+actual
+\begin_inset Quotes erd
+\end_inset
+
+ number of pages, memory may be taken back from the balloon for general
+ use.
+\end_layout
+
+\begin_layout Enumerate
+To supply memory to the balloon (aka.
+ inflate):
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+The driver constructs an array of addresses of unused memory pages.
+ These addresses are divided by 4096
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+This is historical, and independent of the guest page size
+\end_layout
+
+\end_inset
+
+ and the descriptor describing the resulting 32-bit array is added to the
+ inflateq.
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+To remove memory from the balloon (aka.
+ deflate):
+\end_layout
+
+\begin_deeper
+\begin_layout Enumerate
+The driver constructs an array of addresses of memory pages it has previously
+ given to the balloon, as described above.
+ This descriptor is added to the deflateq.
+\end_layout
+
+\begin_layout Enumerate
+If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the guest may not
+ use these requested pages until that descriptor in the deflateq has been
+ used by the device.
+\end_layout
+
+\begin_layout Enumerate
+Otherwise, the guest may begin to re-use pages previously given to the balloon
+ before the device has acknowledged their withdrawal.
+
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+In this case, deflation advice is merely a courtesy
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_deeper
+\begin_layout Enumerate
+In either case, once the device has completed the inflation or deflation,
+ the
+\begin_inset Quotes eld
+\end_inset
+
+actual
+\begin_inset Quotes erd
+\end_inset
+
+ field of the configuration should be updated to reflect the new number
+ of pages in the balloon.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+As updates to configuration space are not atomic, this field isn't particularly
+ reliable, but can be used to diagnose buggy guests.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Description
+Memory
+\begin_inset space \space{}
+\end_inset
+
+Statistics
+\end_layout
+
+\begin_layout Standard
+The stats virtqueue is atypical because communication is driven by the device
+ (not the driver).
+ The channel becomes active at driver initialization time when the driver
+ adds an empty buffer and notifies the device.
+ A request for memory statistics proceeds as follows:
+\end_layout
+
+\begin_layout Enumerate
+The device pushes the buffer onto the used ring and sends an interrupt.
+\end_layout
+
+\begin_layout Enumerate
+The driver pops the used buffer and discards it.
+\end_layout
+
+\begin_layout Enumerate
+The driver collects memory statistics and writes them into a new buffer.
+\end_layout
+
+\begin_layout Enumerate
+The driver adds the buffer to the virtqueue and notifies the device.
+\end_layout
+
+\begin_layout Enumerate
+The device pops the buffer (retaining it to initiate a subsequent request)
+ and consumes the statistics.
+\end_layout
+
+\begin_layout Description
+Memory
+\begin_inset space \space{}
+\end_inset
+
+Statistics
+\begin_inset space \space{}
+\end_inset
+
+Format Each statistic consists of a 16 bit tag and a 64 bit value.
+ Both quantities are represented in the native endian of the guest.
+ All statistics are optional and the driver may choose which ones to supply.
+ To guarantee backwards compatibility, unsupported statistics should be
+ omitted.
+\end_layout
+
+\begin_deeper
+\begin_layout Standard
+\begin_inset listings
+inline false
+status open
+
+\begin_layout Plain Layout
+
+struct virtio_balloon_stat {
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BALLOON_S_SWAP_IN 0
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BALLOON_S_SWAP_OUT 1
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BALLOON_S_MAJFLT 2
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BALLOON_S_MINFLT 3
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BALLOON_S_MEMFREE 4
+\end_layout
+
+\begin_layout Plain Layout
+
+#define VIRTIO_BALLOON_S_MEMTOT 5
+\end_layout
+
+\begin_layout Plain Layout
+
+ u16 tag;
+\end_layout
+
+\begin_layout Plain Layout
+
+ u64 val;
+\end_layout
+
+\begin_layout Plain Layout
+
+} __attribute__((packed));
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_deeper
+\begin_layout Description
+Tags
+\end_layout
+
+\begin_layout Description
+VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been swapped in (in
+ bytes).
+\end_layout
+
+\begin_layout Description
+VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been swapped out
+ to disk (in bytes).
+\end_layout
+
+\begin_layout Description
+VIRTIO_BALLOON_S_MAJFLT The number of major page faults that have occurred.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BALLOON_S_MINFLT The number of minor page faults that have occurred.
+\end_layout
+
+\begin_layout Description
+VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used for any purpose
+ (in bytes).
+\end_layout
+
+\begin_layout Description
+VIRTIO_BALLOON_S_MEMTOT The total amount of memory available (in bytes).
+\end_layout
+
+\end_body
+\end_document