diff options
author | Michael S. Tsirkin <mst@redhat.com> | 2012-06-26 13:26:21 +0300 |
---|---|---|
committer | Michael S. Tsirkin <mst@redhat.com> | 2012-06-26 13:26:21 +0300 |
commit | 612aa99cfe80f02e1460eb041f7b9f30826065d8 (patch) | |
tree | 52735adb2f19c2d0b53a21dd2474cc309e2cdf02 | |
parent | 4b2916a37009c9b3d62082f6e0b0f66760cf7efb (diff) | |
download | virtio-spec-612aa99cfe80f02e1460eb041f7b9f30826065d8.tar.gz |
virtio spec 0.8.10
-rw-r--r-- | virtio.lyx | 5903 |
1 files changed, 5903 insertions, 0 deletions
diff --git a/virtio.lyx b/virtio.lyx new file mode 100644 index 0000000..f7c9c38 --- /dev/null +++ b/virtio.lyx @@ -0,0 +1,5903 @@ +#LyX 1.6.7 created this file. For more info see http://www.lyx.org/ +\lyxformat 345 +\begin_document +\begin_header +\textclass report +\use_default_options false +\language english +\inputencoding auto +\font_roman default +\font_sans default +\font_typewriter default +\font_default_family default +\font_sc false +\font_osf false +\font_sf_scale 100 +\font_tt_scale 100 + +\graphics default +\paperfontsize default +\spacing single +\use_hyperref false +\papersize default +\use_geometry false +\use_amsmath 1 +\use_esint 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation skip +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes true +\output_changes true +\author "" +\author "" +\end_header + +\begin_body + +\begin_layout Title +Virtio PCI Card Specification +\begin_inset Newline newline +\end_inset + +v0.8.10 DRAFT +\begin_inset Newline newline +\end_inset + +- +\end_layout + +\begin_layout Author +Rusty Russell <rusty@rustcorp.com.au> +\begin_inset Newline newline +\end_inset + +IBM Corporation +\end_layout + +\begin_layout Date +2010 October 6. +\end_layout + +\begin_layout Chapter +Purpose and Description +\end_layout + +\begin_layout Standard +This document describes the specifications of the +\begin_inset Quotes eld +\end_inset + +virtio +\begin_inset Quotes erd +\end_inset + + family of +\emph on +PCI +\emph default + +\begin_inset CommandInset nomenclature +LatexCommand nomenclature +symbol "PCI" +description "Peripheral Component Interconnect; a common device bus. See\\\\http://en.wikipedia.org/wiki/Peripheral Component Interconnect" + +\end_inset + + devices. 
+ These devices are found in +\emph on +virtual +\emph default + +\emph on +environments +\begin_inset CommandInset nomenclature +LatexCommand nomenclature +symbol "virtualized" +description "Environments where access to hardware is restricted (and often emulated) by a hypervisor." + +\end_inset + + +\emph default +, yet by design they are not all that different from physical PCI devices, + and this document treats them as such. + This allows the guest to use standard PCI drivers and discovery mechanisms. +\end_layout + +\begin_layout Standard +The purpose of virtio and this specification is that virtual environments + and guests should have a straightforward, efficient, standard and extensible + mechanism for virtual devices, rather than boutique per-environment or + per-OS mechanisms. +\end_layout + +\begin_layout Description +Straightforward: Virtio PCI devices use normal PCI mechanisms of interrupts + and DMA which should be familiar to any device driver author. + There is no exotic page-flipping or COW mechanism: it's just a PCI device. +\begin_inset Foot +status open + +\begin_layout Plain Layout +This lack of page-sharing implies that the implementation of the device + (e.g. + the hypervisor or host) needs full access to the guest memory. + Communication with untrusted parties (i.e. + inter-guest communication) requires copying. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Description +Efficient: Virtio PCI devices consist of rings of descriptors for input + and output, which are neatly separated to avoid cache effects from both + guest and device writing to the same cache lines. +\end_layout + +\begin_layout Description +Standard: Virtio PCI makes no assumptions about the environment in which + it operates, beyond supporting PCI. + In fact the virtio devices specified in the appendices do not require PCI + at all: they have been implemented on non-PCI buses. 
+\begin_inset Foot +status open + +\begin_layout Plain Layout +The Linux implementation further separates the PCI virtio code from the + specific virtio drivers: these drivers are shared with the non-PCI implementati +ons (currently lguest and S/390). +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Description +Extensible: Virtio PCI devices contain feature bits which are acknowledged + by the guest operating system during device setup. + This allows forwards and backwards compatibility: the device offers all + the features it knows about, and the driver acknowledges those it understands + and wishes to use. +\end_layout + +\begin_layout Section +Virtqueues +\end_layout + +\begin_layout Standard +The mechanism for bulk data transport on virtio PCI devices is pretentiously + called a virtqueue. + Each device can have zero or more virtqueues: for example, the network + device has one for transmit and one for receive. +\end_layout + +\begin_layout Standard +Each virtqueue occupies two or more physically-contiguous pages (defined, + for the purposes of this specification, as 4096 bytes), and consists of + three parts: +\end_layout + +\begin_layout Standard +\begin_inset Tabular +<lyxtabular version="3" rows="1" columns="4"> +<features> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="top" width="0"> +<row> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Descriptor Table +\end_layout + +\end_inset +</cell> +<cell multicolumn="1" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Available Ring +\begin_inset space ~ +\end_inset + + +\begin_inset space ~ +\end_inset + + +\begin_inset space ~ 
+\end_inset + + +\begin_inset space ~ +\end_inset + + +\begin_inset space ~ +\end_inset + + +\emph on +(padding) +\end_layout + +\end_inset +</cell> +<cell multicolumn="2" alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Used Ring +\end_layout + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_layout + +\begin_layout Standard +When the driver wants to send buffers to the device, it puts them in one + or more slots in the descriptor table, and writes the descriptor indices + into the available ring. + It then notifies the device. + When the device has finished with the buffers, it writes the descriptors + into the used ring, and sends an interrupt. +\end_layout + +\begin_layout Chapter +Specification +\end_layout + +\begin_layout Section +PCI Discovery +\end_layout + +\begin_layout Standard +Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000 through 0x103F + inclusive is a virtio device +\begin_inset Foot +status open + +\begin_layout Plain Layout +The actual value within this range is ignored +\end_layout + +\end_inset + +. + The device must also have a Revision ID of 0 to match this specification. +\end_layout + +\begin_layout Standard +The Subsystem Device ID indicates which virtio device is supported by the + device. + The Subsystem Vendor ID should reflect the PCI Vendor ID of the environment + (it's currently only used for informational purposes by the guest). 
+\end_layout + +\begin_layout Standard +\begin_inset Tabular +<lyxtabular version="3" rows="8" columns="3"> +<features> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="top" width="0"> +<column alignment="center" valignment="bottom" width="0"> +<row> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Subsystem Device ID +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Virtio Device +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Specification +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +1 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +network card +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Appendix C +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +2 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +block device +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" 
usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Appendix D +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +3 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +console +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Appendix E +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +4 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +entropy source +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Appendix F +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +5 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +memory ballooning +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Appendix G +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + 
+\begin_layout Plain Layout +6 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +ioMemory +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +9 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +9P transport +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +- +\end_layout + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_layout + +\begin_layout Section +Device Configuration +\end_layout + +\begin_layout Standard +To configure the device, we use the first I/O region of the PCI device. + This contains a +\emph on +virtio header +\emph default + followed by a +\emph on +device-specific region. +\end_layout + +\begin_layout Standard +There may be different widths of accesses to the I/O region; the +\begin_inset Quotes eld +\end_inset + +natural +\begin_inset Quotes erd +\end_inset + + access method for each field in the virtio header must be used (i.e. + 32-bit accesses for 32-bit fields, etc), but the device-specific region + can be accessed using any width accesses, and should obtain the same results. +\end_layout + +\begin_layout Standard +Note that this is possible because while the virtio header is PCI (i.e. 
+ little) endian, the device-specific region is encoded in the native endian + of the guest (where such distinction is applicable). +\end_layout + +\begin_layout Subsection +Device Initialization Sequence +\end_layout + +\begin_layout Standard +We start with an overview of device initialization, then expand on the details + of the device and how each step is performed. +\end_layout + +\begin_layout Enumerate +Reset the device. + This is not required on initial start up. +\end_layout + +\begin_layout Enumerate +The ACKNOWLEDGE status bit is set: we have noticed the device. +\end_layout + +\begin_layout Enumerate +The DRIVER status bit is set: we know how to drive the device. +\end_layout + +\begin_layout Enumerate +Device-specific setup, including reading the Device Feature Bits, discovery + of virtqueues for the device, optional MSI-X setup, and reading and possibly + writing the virtio configuration space. +\end_layout + +\begin_layout Enumerate +The subset of Device Feature Bits understood by the driver is written to + the device. +\end_layout + +\begin_layout Enumerate +The DRIVER_OK status bit is set. +\end_layout + +\begin_layout Enumerate +The device can now be used (i.e. + buffers added to the virtqueues) +\begin_inset Foot +status open + +\begin_layout Plain Layout +Historically, drivers have used the device before steps 5 and 6. + This is only allowed if the driver does not use any features which would + alter this early use of the device. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +If any of these steps go irrecoverably wrong, the guest should set the FAILED + status bit to indicate that it has given up on the device (it can reset + the device later to restart if desired). +\end_layout + +\begin_layout Standard +We now cover the fields required for general setup in detail. 
+\end_layout + +\begin_layout Subsection +Virtio Header +\end_layout + +\begin_layout Standard +The virtio header looks as follows: +\end_layout + +\begin_layout Standard +\begin_inset Tabular +<lyxtabular version="3" rows="4" columns="9"> +<features> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Bits +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +32 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +32 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +32 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +16 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +16 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +16 +\end_layout + 
+\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +8 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +8 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Read/Write +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R+W +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R+W +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R+W +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R+W +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R+W +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + 
+\begin_layout Plain Layout +R +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Purpose +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Device +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Guest +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Queue +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Queue +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Queue +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Queue +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Device +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +ISR +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" bottomline="true" 
leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Features +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Features +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Address +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Size +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Select +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Notify +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Status +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Status +\end_layout + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_layout + +\begin_layout Standard +If MSI-X is enabled for the device, two additional fields immediately follow + this header: +\end_layout + 
+\begin_layout Standard +\begin_inset Tabular +<lyxtabular version="3" rows="4" columns="3"> +<features> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Bits +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +16 +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +16 +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Read/Write +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R+W +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +R+W +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Purpose +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Configuration +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" 
usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Queue +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +(MSI-X) +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Vector +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Vector +\end_layout + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_layout + +\begin_layout Standard +Immediately following these general headers, there may be device-specific + headers: +\end_layout + +\begin_layout Standard +\begin_inset Tabular +<lyxtabular version="3" rows="4" columns="2"> +<features> +<column alignment="left" valignment="top" width="0"> +<column alignment="left" valignment="top" width="0"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Bits +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Device Specific +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Read/Write +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Device Specific 
+\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout +Purpose +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\size footnotesize +Device Specific... +\end_layout + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset +</cell> +<cell alignment="center" valignment="top" bottomline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\begin_layout Plain Layout + +\end_layout + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Device Status +\end_layout + +\begin_layout Standard +The Device Status field is updated by the guest to indicate its progress. + This provides a simple low-level diagnostic: it's most useful to imagine + them hooked up to traffic lights on the console indicating the status of + each device. +\end_layout + +\begin_layout Standard +The device can be reset by writing a 0 to this field, otherwise at least + one bit should be set: +\end_layout + +\begin_layout Description +ACKNOWLEDGE +\begin_inset space ~ +\end_inset + +(1) Indicates that the guest OS has found the device and recognized it as + a valid virtio device. +\end_layout + +\begin_layout Description +DRIVER +\begin_inset space ~ +\end_inset + +(2) Indicates that the guest OS knows how to drive the device. + Under Linux, drivers can be loadable modules so there may be a significant + (or infinite) delay before setting this bit. 
+\end_layout + +\begin_layout Description +DRIVER_OK +\begin_inset space ~ +\end_inset + +(4) Indicates that the driver is set up and ready to drive the device. +\end_layout + +\begin_layout Description +FAILED +\begin_inset space ~ +\end_inset + +(128) Indicates that something went wrong in the guest, and it has given + up on the device. + This could be an internal error, or the driver didn't like the device for + some reason, or even a fatal error during device operation. + The device must be reset before attempting to re-initialize. +\end_layout + +\begin_layout Subsubsection +Feature Bits +\end_layout + +\begin_layout Standard +The least significant 31 bits of the first configuration field indicate + the features that the device supports (the high bit is reserved, and will + be used to indicate the presence of future feature bits elsewhere). + The bits are allocated as follows: +\end_layout + +\begin_layout Description +0 +\begin_inset space ~ +\end_inset + +to +\begin_inset space ~ +\end_inset + +23 Feature bits for the specific device type +\end_layout + +\begin_layout Description +24 +\begin_inset space \space{} +\end_inset + +to +\begin_inset space ~ +\end_inset + +30 Feature bits reserved for extensions to the queue mechanism +\end_layout + +\begin_layout Standard +For example, feature bit 0 for a network device (i.e. + Subsystem Device ID 1) indicates that the device supports checksumming + of packets. +\end_layout + +\begin_layout Standard +The feature bits are +\emph on +negotiated: +\emph default + the device lists all the features it understands in the Device Features + field, and the guest writes the subset that it understands into the Guest + Features field. + The only way to renegotiate is to reset the device. +\end_layout + +\begin_layout Standard +In particular, new fields in the device configuration header are indicated + by offering a feature bit, so the guest can check before accessing that + part of the configuration space. 
+\end_layout + +\begin_layout Standard +This allows for forwards and backwards compatibility: if the device is enhanced + with a new feature bit, older guests will not write that feature bit back + to the Guest Features field and it can go into backwards compatibility + mode. + Similarly, if a guest is enhanced with a feature that the device doesn't + support, it will not see that feature bit in the Device Features field + and can go into backwards compatibility mode (or, for poor implementations, + set the FAILED Device Status bit). +\end_layout + +\begin_layout Subsubsection +Configuration/Queue Vectors +\end_layout + +\begin_layout Standard +When MSI-X capability is present and enabled in the device (through standard + PCI configuration space) 4 bytes at byte offset 20 are used to map configuratio +n change and queue interrupts to MSI-X vectors. + In this case, the ISR Status field is unused, and device specific configuration + starts at byte offset 24 in virtio header structure. + When MSI-X capability is not enabled, device specific configuration starts + at byte offset 20 in virtio header. +\end_layout + +\begin_layout Standard +Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of Configuration/Qu +eue Vector registers, +\emph on +maps +\emph default + interrupts triggered by the configuration change/selected queue events + respectively to the corresponding MSI-X vector. + To disable interrupts for a specific event type, unmap it by writing a + special NO_VECTOR value: +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +/* Vector value used to disable MSI for queue */ +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_MSI_NO_VECTOR 0xffff +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +Reading these registers returns vector mapped to a given event, or NO_VECTOR + if unmapped. + All queue and configuration change events are unmapped by default. 
+\end_layout + +\begin_layout Standard +Note that mapping an event to vector might require allocating internal device + resources, and might fail. + Devices report such failures by returning the NO_VECTOR value when the + relevant Vector field is read. + After mapping an event to vector, the driver must verify success by reading + the Vector field value: on success, the previously written value is returned, + and on failure, NO_VECTOR is returned. + If a mapping failure is detected, the driver can retry mapping with fewer vector +s, or disable MSI-X. +\end_layout + +\begin_layout Section +Virtqueue Configuration +\end_layout + +\begin_layout Standard +As a device can have zero or more virtqueues for bulk data transport (for + example, the network driver has two), the driver needs to configure them + as part of the device-specific configuration. +\end_layout + +\begin_layout Standard +This is done as follows, for each virtqueue a device has: +\end_layout + +\begin_layout Enumerate +Write the virtqueue index (first queue is 0) to the Queue Select field. +\end_layout + +\begin_layout Enumerate +Read the virtqueue size from the Queue Size field, which is always a power + of 2. + This controls how big the virtqueue is (see below). + If this field is 0, the virtqueue does not exist. + +\end_layout + +\begin_layout Enumerate +Allocate and zero virtqueue in contiguous physical memory, on a 4096 byte + alignment. + Write the physical address, divided by 4096 to the Queue Address field. +\begin_inset Foot +status open + +\begin_layout Plain Layout +The 4096 is based on the x86 page size, but it's also large enough to ensure + that the separate parts of the virtqueue are on separate cache lines. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Enumerate +Optionally, if MSI-X capability is present and enabled on the device, select + a vector to use to request interrupts triggered by virtqueue events. 
+ Write the MSI-X Table entry number corresponding to this vector in Queue + Vector field. + Read the Queue Vector field: on success, previously written value is returned; + on failure, NO_VECTOR value is returned. +\end_layout + +\begin_layout Standard +The Queue Size field controls the total number of bytes required for the + virtqueue according to the following formula: +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +#define ALIGN(x) (((x) + 4095) & ~4095) +\end_layout + +\begin_layout Plain Layout + +static inline unsigned vring_size(unsigned int qsz) +\end_layout + +\begin_layout Plain Layout + +{ +\end_layout + +\begin_layout Plain Layout + + return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2 + qsz)) +\end_layout + +\begin_layout Plain Layout + + + ALIGN(sizeof(struct vring_used_elem)*qsz); +\end_layout + +\begin_layout Plain Layout + +} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +This currently wastes some space with padding, but also allows future extensions. + The virtqueue layout structure looks like this (qsz is the Queue Size field, + which is a variable, so this code won't compile): +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct vring { +\end_layout + +\begin_layout Plain Layout + + /* The actual descriptors (16 bytes each) */ +\end_layout + +\begin_layout Plain Layout + + struct vring_desc desc[qsz]; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + + /* A ring of available descriptor heads with free-running index. + */ +\end_layout + +\begin_layout Plain Layout + + struct vring_avail avail; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + + // Padding to the next 4096 boundary. 
+\end_layout + +\begin_layout Plain Layout + + char pad[]; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + + // A ring of used descriptor heads with free-running index. +\end_layout + +\begin_layout Plain Layout + + struct vring_used used; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +A Note on Virtqueue Endianness +\end_layout + +\begin_layout Standard +Note that the +\emph on +endian +\emph default + of these fields and everything else in the virtqueue is the native endian + of the guest, not little-endian as PCI normally is. + This makes for simpler guest code, and it is assumed that the host already + has to be deeply aware of the guest endian so such an +\begin_inset Quotes eld +\end_inset + +endian-aware +\begin_inset Quotes erd +\end_inset + + device is not a significant issue. +\end_layout + +\begin_layout Subsection +Descriptor Table +\end_layout + +\begin_layout Standard +The descriptor table refers to the buffers the guest is using for the device. + The addresses are physical addresses, and the buffers can be chained via + the next field. + Each descriptor describes a buffer which is read-only or write-only, but + a chain of descriptors can contain both read-only and write-only buffers. +\end_layout + +\begin_layout Standard +No descriptor chain may be more than 2^32 bytes long in total. +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct vring_desc { +\end_layout + +\begin_layout Plain Layout + + /* Address (guest-physical). + */ +\end_layout + +\begin_layout Plain Layout + + u64 addr; +\end_layout + +\begin_layout Plain Layout + + /* Length. + */ +\end_layout + +\begin_layout Plain Layout + + u32 len; +\end_layout + +\begin_layout Plain Layout + +/* This marks a buffer as continuing via the next field. 
+ */ +\end_layout + +\begin_layout Plain Layout + +#define VRING_DESC_F_NEXT 1 +\end_layout + +\begin_layout Plain Layout + +/* This marks a buffer as write-only (otherwise read-only). + */ +\end_layout + +\begin_layout Plain Layout + +#define VRING_DESC_F_WRITE 2 +\end_layout + +\begin_layout Plain Layout + +/* This means the buffer contains a list of buffer descriptors. + */ +\end_layout + +\begin_layout Plain Layout + +#define VRING_DESC_F_INDIRECT 4 +\end_layout + +\begin_layout Plain Layout + + /* The flags as indicated above. + */ +\end_layout + +\begin_layout Plain Layout + + u16 flags; +\end_layout + +\begin_layout Plain Layout + + /* Next field if flags & NEXT */ +\end_layout + +\begin_layout Plain Layout + + u16 next; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +The number of descriptors in the table is specified by the Queue Size field + for this virtqueue. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Indirect-Descriptors" + +\end_inset + +Indirect Descriptors +\end_layout + +\begin_layout Standard +Some devices benefit by concurrently dispatching a large number of large + requests. + The VIRTIO_RING_F_INDIRECT_DESC feature can be used to allow this (see + +\begin_inset CommandInset ref +LatexCommand ref +reference "cha:Reserved-Feature-Bits" + +\end_inset + +). + To increase ring capacity it is possible to store a table of +\emph on +indirect descriptors +\emph default + anywhere in memory, and insert a descriptor in main virtqueue (with flags&INDIR +ECT on) that refers to memory buffer containing this +\emph on +indirect descriptor table +\emph default +; fields +\emph on +addr +\emph default + and +\emph on +len +\emph default + refer to the indirect table address and length in bytes, respectively. 
+ The indirect table layout structure looks like this (len is the length + of the descriptor that refers to this table, which is a variable, so this + code won't compile): +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct indirect_descriptor_table { +\end_layout + +\begin_layout Plain Layout + + /* The actual descriptors (16 bytes each) */ +\end_layout + +\begin_layout Plain Layout + + struct vring_desc desc[len / 16]; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +The first indirect descriptor is located at start of the indirect descriptor + table (index 0), additional indirect descriptors are chained by next field. + An indirect descriptor without next field (with flags&NEXT off) signals + the end of the indirect descriptor table, and transfers control back to + the main virtqueue. + An indirect descriptor can not refer to another indirect descriptor table + (flags&INDIRECT must be off). + A single indirect descriptor table can include both read-only and write-only + descriptors; write-only flag (flags&WRITE) in the descriptor that refers + to it is ignored. +\end_layout + +\begin_layout Subsection +Available Ring +\end_layout + +\begin_layout Standard +The available ring refers to what descriptors we are offering the device: + it refers to the head of a descriptor chain. + The +\begin_inset Quotes eld +\end_inset + +flags +\begin_inset Quotes erd +\end_inset + + field is currently 0 or 1: 1 indicating that we do not need an interrupt + when the device consumes a descriptor from the available ring. + This interrupt suppression is merely an optimization; it may not suppress + interrupts entirely. 
+\end_layout + +\begin_layout Standard +The +\begin_inset Quotes eld +\end_inset + +idx +\begin_inset Quotes erd +\end_inset + + field indicates where we would put the +\emph on +next +\emph default + descriptor entry (modulo the ring size). + This starts at 0, and increases. +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct vring_avail { +\end_layout + +\begin_layout Plain Layout + +#define VRING_AVAIL_F_NO_INTERRUPT 1 +\end_layout + +\begin_layout Plain Layout + + u16 flags; +\end_layout + +\begin_layout Plain Layout + + u16 idx; +\end_layout + +\begin_layout Plain Layout + + u16 ring[qsz]; /* qsz is the Queue Size field read from device */ +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +Used Ring +\end_layout + +\begin_layout Standard +The used ring is where the device returns buffers once it is done with them. + The flags field can be used by the device to hint that no notification + is necessary when the guest adds to the +\emph on +available +\emph default + ring (the flag is kept here because this is the only part of the virtqueue + written by the device). +\end_layout + +\begin_layout Standard +Each entry in the ring is a pair: the head entry of the descriptor chain + describing the buffer (this matches an entry placed in the available ring + by the guest earlier), and the total of bytes written into the buffer. + The latter is extremely useful for guests using untrusted buffers: if you + do not know exactly how much has been written by the device, you usually + have to zero the buffer to ensure no data leakage occurs. +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +/* u32 is used here for ids for padding reasons. 
+ */ +\end_layout + +\begin_layout Plain Layout + +struct vring_used_elem { +\end_layout + +\begin_layout Plain Layout + + /* Index of start of used descriptor chain. + */ +\end_layout + +\begin_layout Plain Layout + + u32 id; +\end_layout + +\begin_layout Plain Layout + + /* Total length of the descriptor chain which was used (written to) + */ +\end_layout + +\begin_layout Plain Layout + + u32 len; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +struct vring_used { +\end_layout + +\begin_layout Plain Layout + +#define VRING_USED_F_NO_NOTIFY 1 +\end_layout + +\begin_layout Plain Layout + + u16 flags; +\end_layout + +\begin_layout Plain Layout + + u16 idx; +\end_layout + +\begin_layout Plain Layout + + struct vring_used_elem ring[qsz]; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +Helpers for Managing Virtqueues +\end_layout + +\begin_layout Standard +The Linux Kernel Source code contains the definitions above and helper routines + in a more usable form, in include/linux/virtio_ring.h. + This was explicitly licensed by IBM under the (3-clause) BSD license so + that it can be freely used by all other projects, and is reproduced (with + slight variation to remove Linux assumptions) in Appendix A. +\end_layout + +\begin_layout Section +Device Operation +\end_layout + +\begin_layout Standard +There are two parts to device operation: supplying new buffers to the device, + and processing used buffers from the device. + As an example, the virtio network device has two virtqueues: the transmit + virtqueue and the receive virtqueue. + The driver adds outgoing (read-only) packets to the transmit virtqueue, + and then frees them after they are used. + Similarly, incoming (write-only) buffers are added to the receive virtqueue, + and processed after they are used. 
+\end_layout + +\begin_layout Subsection +Supplying Buffers to The Device +\end_layout + +\begin_layout Standard +Actual transfer of buffers from the guest OS to the device operates as follows: +\end_layout + +\begin_layout Enumerate +Place the buffer(s) into free descriptor(s). +\end_layout + +\begin_deeper +\begin_layout Enumerate +If there are no free descriptors, the guest may choose to notify the device + even if notifications are suppressed (to reduce latency). +\begin_inset Foot +status open + +\begin_layout Plain Layout +The Linux drivers do this only for read-only buffers: for write-only buffers, + it is assumed that the driver is merely trying to keep the receive buffer + ring full, and no notification of this expected condition is necessary. +\end_layout + +\end_inset + + +\end_layout + +\end_deeper +\begin_layout Enumerate +Place the id of the buffer in the next ring entry of the available ring. +\end_layout + +\begin_layout Enumerate +The steps (1) and (2) may be performed repeatedly if batching is possible. +\end_layout + +\begin_layout Enumerate +A memory barrier should be executed to ensure the device sees the updated + descriptor table and available ring before the next step. +\end_layout + +\begin_layout Enumerate +The available +\begin_inset Quotes eld +\end_inset + +idx +\begin_inset Quotes erd +\end_inset + + field should be increased by the number of entries added to the available + ring. +\end_layout + +\begin_layout Enumerate +A memory barrier should be executed to ensure that we update the idx field + before checking for notification suppression. +\end_layout + +\begin_layout Enumerate +If notifications are not suppressed, the device should be notified of the + new buffers. 
+\end_layout + +\begin_layout Standard +Note that the above code does not take precautions against the available + ring buffer wrapping around: this is not possible since the ring buffer + is the same size as the descriptor table, so step (1) will prevent such + a condition. +\end_layout + +\begin_layout Standard +In addition, the maximum queue size is 32768 (it must be a power of 2 which + fits in 16 bits), so the 16-bit +\begin_inset Quotes eld +\end_inset + +idx +\begin_inset Quotes erd +\end_inset + + value can always distinguish between a full and empty buffer. +\end_layout + +\begin_layout Standard +Here is a description of each stage in more detail. +\end_layout + +\begin_layout Subsubsection +Placing Buffers Into The Descriptor Table +\end_layout + +\begin_layout Standard +A buffer consists of zero or more read-only physically-contiguous elements + followed by zero or more physically-contiguous write-only elements (it + must have at least one element). + This algorithm maps it into the descriptor table: +\end_layout + +\begin_layout Enumerate +for each buffer element, +\family typewriter +b +\family default +: +\end_layout + +\begin_deeper +\begin_layout Enumerate +Get the next free descriptor table entry, +\family typewriter +d +\end_layout + +\begin_layout Enumerate +Set +\family typewriter +d.addr +\family default + to the physical address of the start of +\family typewriter +b +\end_layout + +\begin_layout Enumerate +Set +\family typewriter +d.len +\family default + to the length of +\family typewriter +b +\family default +. +\end_layout + +\begin_layout Enumerate +If +\family typewriter +b +\family default + is write-only, set +\family typewriter +d.flags +\family default + to VRING_DESC_F_WRITE, otherwise 0. +\end_layout + +\begin_layout Enumerate +If there is a buffer element after this: +\end_layout + +\begin_deeper +\begin_layout Enumerate +Set +\family typewriter +d.next +\family default + to the index of the next free descriptor element. 
+\end_layout + +\begin_layout Enumerate +Set the VRING_DESC_F_NEXT bit in +\family typewriter +d.flags +\family default +. +\end_layout + +\end_deeper +\end_deeper +\begin_layout Standard +In practice, the d.next fields are usually used to chain free descriptors, + and a separate count kept to check there are enough free descriptors before + beginning the mappings. +\end_layout + +\begin_layout Subsubsection +Updating The Available Ring +\end_layout + +\begin_layout Standard +The head of the buffer we mapped is the first +\family typewriter +d +\family default + in the algorithm above. + A naive implementation would do the following: +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +avail->ring[avail->idx % qsz] = head; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +However, in general we can add many descriptors before we update the +\begin_inset Quotes eld +\end_inset + +idx +\begin_inset Quotes erd +\end_inset + + field (at which point they become visible to the device), so we keep a + counter of how many we've added: +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +avail->ring[(avail->idx + added++) % qsz] = head; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +Updating The Index Field +\end_layout + +\begin_layout Standard +Once the idx field of the virtqueue is updated, the device will be able + to access the descriptor entries we've created and the memory they refer + to. + This is why a memory barrier is generally used before the idx update, to + ensure it sees the most up-to-date copy. 
+\end_layout + +\begin_layout Standard +The idx field always increments, and we let it wrap naturally at 65536: +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +avail->idx += added; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Notifying-The-Device" + +\end_inset + +Notifying The Device +\end_layout + +\begin_layout Standard +Device notification occurs by writing the 16-bit virtqueue index of this + virtqueue to the Queue Notify field of the virtio header in the first I/O + region of the PCI device. + This can be expensive, however, so the device can suppress such notifications + if it doesn't need them. + We have to be careful to expose the new idx value +\emph on +before +\emph default + checking the suppression flag: it's OK to notify gratuitously, but not + to omit a required notification. + So again, we use a memory barrier here before reading the flags. +\end_layout + +\begin_layout Standard +If the VRING_USED_F_NO_NOTIFY flag is not set, we go ahead and write to the + PCI configuration space. +\end_layout + +\begin_layout Subsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Receiving-Used-Buffers" + +\end_inset + +Receiving Used Buffers From The Device +\end_layout + +\begin_layout Standard +Once the device has used a buffer (read from or written to it, or parts + of both, depending on the nature of the virtqueue and the device), it sends + an interrupt, following an algorithm very similar to the algorithm used + for the driver to send the device a buffer: +\end_layout + +\begin_layout Enumerate +Write the head descriptor number to the next field in the used ring. +\end_layout + +\begin_layout Enumerate +Update the used ring idx. 
+\end_layout + +\begin_layout Enumerate +If the VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail\SpecialChar \nobreakdash- +>flags: +\end_layout + +\begin_deeper +\begin_layout Enumerate +If MSI-X capability is disabled: +\end_layout + +\begin_deeper +\begin_layout Enumerate +Set the lower bit of the ISR Status field for the device. +\end_layout + +\begin_layout Enumerate +Send the appropriate PCI interrupt for the device. +\end_layout + +\end_deeper +\begin_layout Enumerate +If MSI-X capability is enabled: +\end_layout + +\begin_deeper +\begin_layout Enumerate +Request the appropriate MSI-X interrupt message for the device, Queue Vector + field sets the MSI-X Table entry number. +\end_layout + +\begin_layout Enumerate +If Queue Vector field value is NO_VECTOR, no interrupt message is requested + for this event. +\end_layout + +\end_deeper +\end_deeper +\begin_layout Standard +The guest interrupt handler should: +\end_layout + +\begin_layout Enumerate +If MSI-X capability is disabled: read the ISR Status field, which will reset + it to zero. + If the lower bit is zero, the interrupt was not for this device. + Otherwise, the guest driver should look through the used rings of each + virtqueue for the device, to see if any progress has been made by the device + which requires servicing. +\end_layout + +\begin_layout Enumerate +If MSI-X capability is enabled: look through the used rings of each virtqueue + mapped to the specific MSI-X vector for the device, to see if any progress + has been made by the device which requires servicing. 
+\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +while (vq->last_seen_used != vring->used.idx) { +\end_layout + +\begin_layout Plain Layout + + struct vring_used_elem *e = vring.used->ring[vq->last_seen_used%vsz]; +\end_layout + +\begin_layout Plain Layout + + process_buffer(e); +\end_layout + +\begin_layout Plain Layout + + vq->last_seen_used++; +\end_layout + +\begin_layout Plain Layout + +} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection +Dealing With Configuration Changes +\end_layout + +\begin_layout Standard +Some virtio PCI devices can change the device configuration state, as reflected + in the virtio header in the PCI configuration space. + In this case: +\end_layout + +\begin_layout Enumerate +If MSI-X capability is disabled: an interrupt is delivered and the second + lowest bit is set in the ISR Status field to indicate that the driver + should re-examine the configuration space. Note that a single interrupt can + indicate both that one or more virtqueues have been used and that the configurati +on space has changed: even if the config bit is set, virtqueues must be + scanned. +\end_layout + +\begin_layout Enumerate +If MSI-X capability is enabled: an interrupt message is requested. + The Configuration Vector field sets the MSI-X Table entry number to use. + If Configuration Vector field value is NO_VECTOR, no interrupt message + is requested for this event. +\end_layout + +\begin_layout Chapter +Creating New Device Types +\end_layout + +\begin_layout Standard +Various considerations are necessary when creating a new device type: +\end_layout + +\begin_layout Section* +How Many Virtqueues? +\end_layout + +\begin_layout Standard +It is possible that a very simple device will operate entirely through its + configuration space, but most will need at least one virtqueue in which + it will place requests. + A device with both input and output (eg. 
+ console and network devices described here) needs two queues: one which + the driver fills with buffers to receive input, and one in which the driver + places buffers to transmit output. +\end_layout + +\begin_layout Section* +What Configuration Space Layout? +\end_layout + +\begin_layout Standard +Configuration space is generally used for rarely-changing or initialization-time + parameters. + But it is a limited resource, so it might be better to use a virtqueue + to update configuration information (the network device does this for filtering +, otherwise the table in the config space could potentially be very large). +\end_layout + +\begin_layout Standard +Note that this space is generally the guest's native endian, rather than + PCI's little-endian. +\end_layout + +\begin_layout Section* +What Device Number? +\end_layout + +\begin_layout Standard +Currently device numbers are assigned quite freely: a simple request mail + to the author of this document or the Linux virtualization mailing list +\begin_inset Foot +status open + +\begin_layout Plain Layout +https://lists.linux-foundation.org/mailman/listinfo/virtualization +\end_layout + +\end_inset + + will be sufficient to secure a unique one. +\end_layout + +\begin_layout Standard +Meanwhile for experimental drivers, use 65535 and work backwards. +\end_layout + +\begin_layout Section* +How many MSI-X vectors? +\end_layout + +\begin_layout Standard +Using the optional MSI-X capability devices can speed up interrupt processing + by removing the need to read ISR Status register by guest driver (which + might be an expensive operation), reducing interrupt sharing between devices + and queues within the device, and handling interrupts from multiple CPUs. + However, some systems impose a limit (which might be as low as 256) on + the total number of MSI-X vectors that can be allocated to all devices. 
+ Devices and/or device drivers should take this into account, limiting the + number of vectors used unless the device is expected to cause a high volume + of interrupts. + Devices can control the number of vectors used by limiting the MSI-X Table + Size or not presenting MSI-X capability in PCI configuration space. + Drivers can control this by mapping events to as small a number of vectors + as possible, or disabling MSI-X capability altogether. +\end_layout + +\begin_layout Section* +Message Framing +\end_layout + +\begin_layout Standard +The descriptors used for a buffer should not affect the semantics of the + message, except for the total length of the buffer. + For example, a network buffer consists of a 10 byte header followed by + the network packet. + Whether this is presented in the ring descriptor chain as (say) a 10 byte + buffer and a 1514 byte buffer, or a single 1524 byte buffer, or even three + buffers, should have no effect. +\end_layout + +\begin_layout Standard +In particular, no implementation should use the descriptor boundaries to + determine the size of any header in a request. +\begin_inset Foot +status open + +\begin_layout Plain Layout +The current qemu device implementations mistakenly insist that the first + descriptor cover the header in these cases exactly, so a cautious driver + should arrange it so. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section* +Device Improvements +\end_layout + +\begin_layout Standard +Any change to configuration space, or new virtqueues, or behavioural changes, + should be indicated by negotiation of a new feature bit. + This establishes clarity +\begin_inset Foot +status open + +\begin_layout Plain Layout +Even if it does mean documenting design or implementation mistakes! +\end_layout + +\end_inset + + and avoids future expansion problems. 
+\end_layout + +\begin_layout Standard +Clusters of functionality which are always implemented together can use + a single bit, but if one feature makes sense without the others they should + not be gratuitously grouped together to conserve feature bits. + We can always extend the spec when the first person needs more than 24 + feature bits for their device. +\end_layout + +\begin_layout Standard +\begin_inset CommandInset nomencl_print +LatexCommand printnomenclature + +\end_inset + + +\end_layout + +\begin_layout Chapter* +Appendix A: virtio_ring.h +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +#ifndef VIRTIO_RING_H +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_RING_H +\end_layout + +\begin_layout Plain Layout + +/* An interface for efficient virtio implementation. +\end_layout + +\begin_layout Plain Layout + + * +\end_layout + +\begin_layout Plain Layout + + * This header is BSD licensed so anyone can use the definitions +\end_layout + +\begin_layout Plain Layout + + * to implement compatible drivers/servers. +\end_layout + +\begin_layout Plain Layout + + * +\end_layout + +\begin_layout Plain Layout + + * Copyright 2007, 2009, IBM Corporation +\end_layout + +\begin_layout Plain Layout + + * All rights reserved. +\end_layout + +\begin_layout Plain Layout + + * +\end_layout + +\begin_layout Plain Layout + + * Redistribution and use in source and binary forms, with or without +\end_layout + +\begin_layout Plain Layout + + * modification, are permitted provided that the following conditions +\end_layout + +\begin_layout Plain Layout + + * are met: +\end_layout + +\begin_layout Plain Layout + + * 1. + Redistributions of source code must retain the above copyright +\end_layout + +\begin_layout Plain Layout + + * notice, this list of conditions and the following disclaimer. +\end_layout + +\begin_layout Plain Layout + + * 2. 
+ Redistributions in binary form must reproduce the above copyright +\end_layout + +\begin_layout Plain Layout + + * notice, this list of conditions and the following disclaimer in the +\end_layout + +\begin_layout Plain Layout + + * documentation and/or other materials provided with the distribution. +\end_layout + +\begin_layout Plain Layout + + * 3. + Neither the name of IBM nor the names of its contributors +\end_layout + +\begin_layout Plain Layout + + * may be used to endorse or promote products derived from this software +\end_layout + +\begin_layout Plain Layout + + * without specific prior written permission. +\end_layout + +\begin_layout Plain Layout + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS + IS'' AND +\end_layout + +\begin_layout Plain Layout + + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +\end_layout + +\begin_layout Plain Layout + + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +\end_layout + +\begin_layout Plain Layout + + * ARE DISCLAIMED. + IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE +\end_layout + +\begin_layout Plain Layout + + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +\end_layout + +\begin_layout Plain Layout + + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +\end_layout + +\begin_layout Plain Layout + + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +\end_layout + +\begin_layout Plain Layout + + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +\end_layout + +\begin_layout Plain Layout + + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + WAY +\end_layout + +\begin_layout Plain Layout + + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +\end_layout + +\begin_layout Plain Layout + + * SUCH DAMAGE. 
+\end_layout + +\begin_layout Plain Layout + + */ +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +/* This marks a buffer as continuing via the next field. + */ +\end_layout + +\begin_layout Plain Layout + +#define VRING_DESC_F_NEXT 1 +\end_layout + +\begin_layout Plain Layout + +/* This marks a buffer as write-only (otherwise read-only). + */ +\end_layout + +\begin_layout Plain Layout + +#define VRING_DESC_F_WRITE 2 +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +/* The Host uses this in used->flags to advise the Guest: don't kick me +\end_layout + +\begin_layout Plain Layout + + * when you add a buffer. + It's unreliable, so it's simply an +\end_layout + +\begin_layout Plain Layout + + * optimization. + Guest will still kick if it's out of buffers. + */ +\end_layout + +\begin_layout Plain Layout + +#define VRING_USED_F_NO_NOTIFY 1 +\end_layout + +\begin_layout Plain Layout + +/* The Guest uses this in avail->flags to advise the Host: don't +\end_layout + +\begin_layout Plain Layout + + * interrupt me when you consume a buffer. + It's unreliable, so it's +\end_layout + +\begin_layout Plain Layout + + * simply an optimization. + */ +\end_layout + +\begin_layout Plain Layout + +#define VRING_AVAIL_F_NO_INTERRUPT 1 +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +/* Virtio ring descriptors: 16 bytes. +\end_layout + +\begin_layout Plain Layout + + * These can chain together via "next". + */ +\end_layout + +\begin_layout Plain Layout + +struct vring_desc { +\end_layout + +\begin_layout Plain Layout + + /* Address (guest-physical). + */ +\end_layout + +\begin_layout Plain Layout + + uint64_t addr; +\end_layout + +\begin_layout Plain Layout + + /* Length. + */ +\end_layout + +\begin_layout Plain Layout + + uint32_t len; +\end_layout + +\begin_layout Plain Layout + + /* The flags as indicated above. 
+ */ +\end_layout + +\begin_layout Plain Layout + + uint16_t flags; +\end_layout + +\begin_layout Plain Layout + + /* We chain unused descriptors via this, too */ +\end_layout + +\begin_layout Plain Layout + + uint16_t next; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +struct vring_avail { +\end_layout + +\begin_layout Plain Layout + + uint16_t flags; +\end_layout + +\begin_layout Plain Layout + + uint16_t idx; +\end_layout + +\begin_layout Plain Layout + + uint16_t ring[]; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +/* u32 is used here for ids for padding reasons. + */ +\end_layout + +\begin_layout Plain Layout + +struct vring_used_elem { +\end_layout + +\begin_layout Plain Layout + + /* Index of start of used descriptor chain. + */ +\end_layout + +\begin_layout Plain Layout + + uint32_t id; +\end_layout + +\begin_layout Plain Layout + + /* Total length of the descriptor chain which was written to. 
+ */ +\end_layout + +\begin_layout Plain Layout + + uint32_t len; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +struct vring_used { +\end_layout + +\begin_layout Plain Layout + + uint16_t flags; +\end_layout + +\begin_layout Plain Layout + + uint16_t idx; +\end_layout + +\begin_layout Plain Layout + + struct vring_used_elem ring[]; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +struct vring { +\end_layout + +\begin_layout Plain Layout + + unsigned int num; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + + struct vring_desc *desc; +\end_layout + +\begin_layout Plain Layout + + struct vring_avail *avail; +\end_layout + +\begin_layout Plain Layout + + struct vring_used *used; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +/* The standard layout for the ring is a continuous chunk of memory which +\end_layout + +\begin_layout Plain Layout + + * looks like this. + We assume num is a power of 2. +\end_layout + +\begin_layout Plain Layout + + * +\end_layout + +\begin_layout Plain Layout + + * struct vring { +\end_layout + +\begin_layout Plain Layout + + * // The actual descriptors (16 bytes each) +\end_layout + +\begin_layout Plain Layout + + * struct vring_desc desc[num]; +\end_layout + +\begin_layout Plain Layout + + * +\end_layout + +\begin_layout Plain Layout + + * // A ring of available descriptor heads with free-running index. 
+\end_layout + +\begin_layout Plain Layout + + * __u16 avail_flags; +\end_layout + +\begin_layout Plain Layout + + * __u16 avail_idx; +\end_layout + +\begin_layout Plain Layout + + * __u16 available[num]; +\end_layout + +\begin_layout Plain Layout + + * +\end_layout + +\begin_layout Plain Layout + + * // Padding to the next align boundary. +\end_layout + +\begin_layout Plain Layout + + * char pad[]; +\end_layout + +\begin_layout Plain Layout + + * +\end_layout + +\begin_layout Plain Layout + + * // A ring of used descriptor heads with free-running index. +\end_layout + +\begin_layout Plain Layout + + * __u16 used_flags; +\end_layout + +\begin_layout Plain Layout + + * __u16 used_idx; +\end_layout + +\begin_layout Plain Layout + + * struct vring_used_elem used[num]; +\end_layout + +\begin_layout Plain Layout + + * }; +\end_layout + +\begin_layout Plain Layout + + * Note: for virtio PCI, align is 4096. +\end_layout + +\begin_layout Plain Layout + + */ +\end_layout + +\begin_layout Plain Layout + +static inline void vring_init(struct vring *vr, unsigned int num, void *p, +\end_layout + +\begin_layout Plain Layout + + unsigned long align) +\end_layout + +\begin_layout Plain Layout + +{ +\end_layout + +\begin_layout Plain Layout + + vr->num = num; +\end_layout + +\begin_layout Plain Layout + + vr->desc = p; +\end_layout + +\begin_layout Plain Layout + + vr->avail = p + num*sizeof(struct vring_desc); +\end_layout + +\begin_layout Plain Layout + + vr->used = (void *)(((unsigned long)&vr->avail->ring[num] +\end_layout + +\begin_layout Plain Layout + + + align-1) +\end_layout + +\begin_layout Plain Layout + + & ~(align - 1)); +\end_layout + +\begin_layout Plain Layout + +} +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +static inline unsigned vring_size(unsigned int num, unsigned long align) +\end_layout + +\begin_layout Plain Layout + +{ +\end_layout + +\begin_layout Plain Layout + + return ((sizeof(struct vring_desc)*num + 
sizeof(uint16_t)*(2+num) +\end_layout + +\begin_layout Plain Layout + + + align - 1) & ~(align - 1)) +\end_layout + +\begin_layout Plain Layout + + + sizeof(uint16_t)*2 + sizeof(struct vring_used_elem)*num; +\end_layout + +\begin_layout Plain Layout + +} +\end_layout + +\begin_layout Plain Layout + +#endif /* VIRTIO_RING_H */ +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Chapter* +\begin_inset CommandInset label +LatexCommand label +name "cha:Reserved-Feature-Bits" + +\end_inset + +Appendix B: Reserved Feature Bits +\end_layout + +\begin_layout Standard +Currently there are three device-independent feature bits defined: +\end_layout + +\begin_layout Description +VIRTIO_F_NOTIFY_ON_EMPTY +\begin_inset space ~ +\end_inset + +(24) Negotiating this feature indicates that the driver wants an interrupt + if the device runs out of available descriptors on a virtqueue, even though + interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT flag. + An example of this is the networking driver: it doesn't need to know every + time a packet is transmitted, but it does need to free the transmitted + packets a finite time after they are transmitted. + It can avoid using a timer if the device interrupts it when all the packets + are transmitted. +\end_layout + +\begin_layout Description +VIRTIO_F_RING_INDIRECT_DESC +\begin_inset space ~ +\end_inset + +(28) Negotiating this feature indicates that the driver can use descriptors + with the VRING_DESC_F_INDIRECT flag set, as described in +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Indirect-Descriptors" + +\end_inset + +. +\end_layout + +\begin_layout Description +VIRTIO_F_BAD_FEATURE(30) This feature should never be negotiated by the + guest; doing so is an indication that the guest is faulty +\begin_inset Foot +status open + +\begin_layout Plain Layout +An experimental virtio PCI driver contained in Linux version 2.6.25 had this + problem, and this feature bit can be used to detect it. 
+\end_layout + +\end_inset + + +\end_layout + +\begin_layout Chapter* +Appendix C: Network Device +\end_layout + +\begin_layout Standard +The virtio network device is a virtual ethernet card, and is the most complex + of the devices supported so far by virtio. + It has been enhanced rapidly and demonstrates clearly how support for new features + should be added to an existing device. + Empty buffers are placed in one virtqueue for receiving packets, and outgoing + packets are enqueued into another for transmission in that order. + A third command queue is used to control advanced filtering features. +\end_layout + +\begin_layout Section* +Configuration +\end_layout + +\begin_layout Description +Subsystem +\begin_inset space ~ +\end_inset + +Device +\begin_inset space ~ +\end_inset + +ID 1 +\end_layout + +\begin_layout Description +Virtqueues 0:receiveq. + 1:transmitq. + 2:controlq +\begin_inset Foot +status open + +\begin_layout Plain Layout +Only if VIRTIO_NET_F_CTRL_VQ set +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Description +Feature +\begin_inset space ~ +\end_inset + +bits +\end_layout + +\begin_deeper +\begin_layout Description +VIRTIO_NET_F_CSUM +\begin_inset space ~ +\end_inset + +(0) Device handles packets with partial checksum +\end_layout + +\begin_layout Description +VIRTIO_NET_F_GUEST_CSUM +\begin_inset space ~ +\end_inset + +(1) Guest handles packets with partial checksum +\end_layout + +\begin_layout Description +VIRTIO_NET_F_MAC +\begin_inset space ~ +\end_inset + +(5) Device has given MAC address. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_GSO +\begin_inset space ~ +\end_inset + +(6) (Deprecated) device handles packets with any GSO type. +\begin_inset Foot +status open + +\begin_layout Plain Layout +It was supposed to indicate segmentation offload support, but upon further + investigation it became clear that multiple bits were required.
+\end_layout + +\end_inset + + +\end_layout + +\begin_layout Description +VIRTIO_NET_F_GUEST_TSO4 +\begin_inset space ~ +\end_inset + +(7) Guest can receive TSOv4. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_GUEST_TSO6 +\begin_inset space ~ +\end_inset + +(8) Guest can receive TSOv6. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_GUEST_ECN +\begin_inset space ~ +\end_inset + +(9) Guest can receive TSO with ECN. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_GUEST_UFO +\begin_inset space ~ +\end_inset + +(10) Guest can receive UFO. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_HOST_TSO4 +\begin_inset space ~ +\end_inset + +(11) Device can receive TSOv4. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_HOST_TSO6 +\begin_inset space ~ +\end_inset + +(12) Device can receive TSOv6. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_HOST_ECN +\begin_inset space ~ +\end_inset + +(13) Device can receive TSO with ECN. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_HOST_UFO +\begin_inset space ~ +\end_inset + +(14) Device can receive UFO. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_MRG_RXBUF +\begin_inset space ~ +\end_inset + +(15) Guest can merge receive buffers. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_STATUS +\begin_inset space ~ +\end_inset + +(16) Configuration status field is available. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_CTRL_VQ +\begin_inset space ~ +\end_inset + +(17) Control channel is available. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_CTRL_RX +\begin_inset space ~ +\end_inset + +(18) Control channel RX mode support. +\end_layout + +\begin_layout Description +VIRTIO_NET_F_CTRL_VLAN +\begin_inset space ~ +\end_inset + +(19) Control channel VLAN filtering. 
+\end_layout + +\end_deeper +\begin_layout Description +Device +\begin_inset space ~ +\end_inset + +configuration +\begin_inset space ~ +\end_inset + +layout Two configuration fields are currently defined. + The mac address field always exists (though it is only valid if VIRTIO_NET_F_MAC + is set), and the status field only exists if VIRTIO_NET_F_STATUS is set. + Only one bit is currently defined for the status field: VIRTIO_NET_S_LINK_UP. + +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +#define VIRTIO_NET_S_LINK_UP 1 +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +struct virtio_net_config { +\end_layout + +\begin_layout Plain Layout + + u8 mac[6]; +\end_layout + +\begin_layout Plain Layout + + u16 status; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section* +Device Initialization +\end_layout + +\begin_layout Enumerate +The initialization routine should identify the receive and transmission + virtqueues. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_NET_F_MAC feature bit is set, the configuration space +\begin_inset Quotes eld +\end_inset + +mac +\begin_inset Quotes erd +\end_inset + + entry indicates the +\begin_inset Quotes eld +\end_inset + +physical +\begin_inset Quotes erd +\end_inset + + address of the network card, otherwise a private MAC address should + be assigned. + All guests are expected to negotiate this feature if it is set. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify the control + virtqueue. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link status can + be read from the bottom bit of the +\begin_inset Quotes eld +\end_inset + +status +\begin_inset Quotes erd +\end_inset + + config field. + Otherwise, the link should be assumed active.
+\end_layout + +\begin_layout Enumerate +The receive virtqueue should be filled with receive buffers. + This is described in detail below in +\begin_inset Quotes eld +\end_inset + +Setting Up Receive Buffers +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Enumerate +A driver can indicate that it will generate checksumless packets by negotiating + the VIRTIO_NET_F_CSUM feature. + This +\begin_inset Quotes eld +\end_inset + +checksum offload +\begin_inset Quotes erd +\end_inset + + is a common feature on modern network cards. +\end_layout + +\begin_layout Enumerate +If that feature is negotiated, a driver can use TCP or UDP segmentation + offload by negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP), VIRTIO_NET_F_HOST +_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO (UDP fragmentation) features. + It should not send TCP packets requiring segmentation offload which have + the Explicit Congestion Notification bit set, unless the VIRTIO_NET_F_HOST_ECN + feature is negotiated. +\begin_inset Foot +status open + +\begin_layout Plain Layout +This is a common restriction in real, older network cards. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Enumerate +The converse features are also available: a driver can save the virtual + device some work by negotiating these features. +\begin_inset Foot +status open + +\begin_layout Plain Layout +For example, a network packet transported between two guests on the same + system may not require checksumming at all, nor segmentation, if both guests + are amenable. +\end_layout + +\end_inset + + The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially checksummed + packets can be received, and if it can do that then the VIRTIO_NET_F_GUEST_TSO4 +, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN + are the input equivalents of the features described above. + See +\begin_inset Quotes eld +\end_inset + +Receiving Packets +\begin_inset Quotes erd +\end_inset + + below.
+\end_layout + +\begin_layout Section* +Device Operation +\end_layout + +\begin_layout Standard +Packets are transmitted by placing them in the transmitq, and buffers for + incoming packets are placed in the receiveq. + In each case, the packet itself is preceded by a header: +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_net_hdr { +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 +\end_layout + +\begin_layout Plain Layout + + u8 flags; +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_HDR_GSO_NONE 0 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_HDR_GSO_TCPV4 1 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_HDR_GSO_UDP 3 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_HDR_GSO_TCPV6 4 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_HDR_GSO_ECN 0x80 +\end_layout + +\begin_layout Plain Layout + + u8 gso_type; +\end_layout + +\begin_layout Plain Layout + + u16 hdr_len; +\end_layout + +\begin_layout Plain Layout + + u16 gso_size; +\end_layout + +\begin_layout Plain Layout + + u16 csum_start; +\end_layout + +\begin_layout Plain Layout + + u16 csum_offset; +\end_layout + +\begin_layout Plain Layout + +/* Only if VIRTIO_NET_F_MRG_RXBUF: */ +\end_layout + +\begin_layout Plain Layout + + u16 num_buffers; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +The controlq is used to control device features such as filtering. +\end_layout + +\begin_layout Subsection* +Packet Transmission +\end_layout + +\begin_layout Standard +Transmitting a single packet is simple, but varies depending on the different + features the driver negotiated.
+\end_layout + +\begin_layout Enumerate +If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has not been + fully checksummed, then the virtio_net_hdr's fields are set as follows. + Otherwise, the packet must be fully checksummed, and flags is zero. +\end_layout + +\begin_deeper +\begin_layout Itemize +flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set, +\end_layout + +\begin_layout Itemize +\begin_inset CommandInset label +LatexCommand label +name "ite:csum_start-is-set" + +\end_inset + +csum_start is set to the offset within the packet to begin checksumming, + and +\end_layout + +\begin_layout Itemize +csum_offset indicates how many bytes after the csum_start the new (16 bit + ones' complement) checksum should be placed. +\begin_inset Foot +status open + +\begin_layout Plain Layout +For example, consider a partially checksummed TCP (IPv4) packet. + It will have a 14 byte ethernet header and 20 byte IP header followed by + the TCP header (with the TCP checksum field 16 bytes into that header). + csum_start will be 14+20 = 34 (the TCP checksum includes the header), and + csum_offset will be 16. + The value in the TCP checksum field will be the sum of the TCP pseudo header, + so that replacing it by the ones' complement checksum of the TCP header + and body will give the correct result. +\end_layout + +\end_inset + + +\end_layout + +\end_deeper +\begin_layout Enumerate +\begin_inset CommandInset label +LatexCommand label +name "enu:If-the-driver" + +\end_inset + +If the driver negotiated VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet + requires TCP segmentation or UDP fragmentation, then the +\begin_inset Quotes eld +\end_inset + +gso_type +\begin_inset Quotes erd +\end_inset + + field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP. + (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). + In this case, packets larger than 1514 bytes can be transmitted: the metadata + indicates how to replicate the packet header to cut it into smaller packets. 
+ The other gso fields are set: +\end_layout + +\begin_deeper +\begin_layout Itemize +hdr_len is a hint to the device as to how much of the header needs to be + kept to copy into each packet, usually set to the length of the headers, + including the transport header. +\begin_inset Foot +status open + +\begin_layout Plain Layout +Due to various bugs in implementations, this field is not useful as a guarantee + of the transport header size. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Itemize +gso_size is the size of the packet beyond that header (ie. + MSS). +\end_layout + +\begin_layout Itemize +If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the VIRTIO_NET_HDR_G +SO_ECN bit may be set in +\begin_inset Quotes eld +\end_inset + +gso_type +\begin_inset Quotes erd +\end_inset + + as well, indicating that the TCP packet has the ECN bit set. +\begin_inset Foot +status open + +\begin_layout Plain Layout +This case is not handled by some older hardware, so is called out specifically + in the protocol. +\end_layout + +\end_inset + + +\end_layout + +\end_deeper +\begin_layout Enumerate +If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, the num_buffers + field is set to zero. +\end_layout + +\begin_layout Enumerate +The header and packet are added as one output buffer to the transmitq, and + the device is notified of the new entry (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Notifying-The-Device" + +\end_inset + +). +\begin_inset Foot +status open + +\begin_layout Plain Layout +Note that the header will be two bytes longer for the VIRTIO_NET_F_MRG_RXBUF + case. 
+\end_layout + +\end_inset + + +\end_layout + +\begin_layout Subsection* +Packet Transmission Interrupt +\end_layout + +\begin_layout Standard +Often a driver will suppress transmission interrupts using the VRING_AVAIL_F_NO_ +INTERRUPT flag (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Receiving-Used-Buffers" + +\end_inset + +) and check for used packets in the transmit path of following packets. + However, it will still receive interrupts if the VIRTIO_F_NOTIFY_ON_EMPTY + feature is negotiated, indicating that the transmission queue is completely + emptied. +\end_layout + +\begin_layout Standard +The normal behavior in this interrupt handler is to retrieve any new descriptors + from the used ring and free the corresponding headers and packets. +\end_layout + +\begin_layout Subsection* +Setting Up Receive Buffers +\end_layout + +\begin_layout Standard +It is generally a good idea to keep the receive virtqueue as fully populated + as possible: if it runs out, network performance will suffer. +\end_layout + +\begin_layout Standard +If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or VIRTIO_NET_F_GUEST_UF +O features are used, the Guest will need to accept packets of up to 65550 + bytes long (the maximum size of a TCP or UDP packet, plus the 14 byte ethernet + header), otherwise 1514 bytes. + So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every buffer in the receive + queue needs to be at least this length +\begin_inset Foot +status open + +\begin_layout Plain Layout +Obviously each one can be split across multiple descriptor elements. +\end_layout + +\end_inset + +. +\end_layout + +\begin_layout Standard +If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at least the + size of the +\family typewriter +struct virtio_net_hdr +\family default +.
+\end_layout + +\begin_layout Subsection* +Packet Receive Interrupt +\end_layout + +\begin_layout Standard +When a packet is copied into a buffer in the receiveq, the optimal path + is to disable further interrupts for the receiveq (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Receiving-Used-Buffers" + +\end_inset + +) and process packets until no more are found, then re-enable them. +\end_layout + +\begin_layout Standard +Processing a packet involves: +\end_layout + +\begin_layout Enumerate +If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature, then the +\begin_inset Quotes eld +\end_inset + +num_buffers +\begin_inset Quotes erd +\end_inset + + field indicates how many descriptors this packet is spread over (including + this one). + This allows receipt of large packets without having to allocate large buffers. + In this case, there will be at least +\begin_inset Quotes eld +\end_inset + +num_buffers +\begin_inset Quotes erd +\end_inset + + in the used ring, and they should be chained together to form a single + packet. + The other buffers will +\emph on +not +\emph default + begin with a +\family typewriter +struct virtio_net_hdr +\family default +. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or the +\begin_inset Quotes eld +\end_inset + +num_buffers +\begin_inset Quotes erd +\end_inset + + field is one, then the entire packet will be contained within this buffer, + immediately following the +\family typewriter +struct virtio_net_hdr +\family default +.
+\end_layout + +\begin_layout Enumerate +If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the VIRTIO_NET_HDR_F_NEED +S_CSUM bit in the +\begin_inset Quotes eld +\end_inset + +flags +\begin_inset Quotes erd +\end_inset + + field may be set: if so, the checksum on the packet is incomplete and the + +\begin_inset Quotes eld +\end_inset + +csum_start +\begin_inset Quotes erd +\end_inset + + and +\begin_inset Quotes eld +\end_inset + +csum_offset +\begin_inset Quotes erd +\end_inset + + fields indicate how to calculate it (see +\begin_inset CommandInset ref +LatexCommand ref +reference "ite:csum_start-is-set" + +\end_inset + +). +\end_layout + +\begin_layout Enumerate +If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were negotiated, then + the +\begin_inset Quotes eld +\end_inset + +gso_type +\begin_inset Quotes erd +\end_inset + + may be something other than VIRTIO_NET_HDR_GSO_NONE, and the +\begin_inset Quotes eld +\end_inset + +gso_size +\begin_inset Quotes erd +\end_inset + + field indicates the desired MSS (see +\begin_inset CommandInset ref +LatexCommand ref +reference "enu:If-the-driver" + +\end_inset + +). +\end_layout + +\begin_layout Subsection* +Control Virtqueue +\end_layout + +\begin_layout Standard +The driver uses the control virtqueue (if VIRTIO_NET_F_CTRL_VQ is negotiated) + to send commands to manipulate various features of the device which would + not easily map into the configuration space.
+\end_layout + +\begin_layout Standard +All commands are of the following form: +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_net_ctrl { +\end_layout + +\begin_layout Plain Layout + + u8 class; +\end_layout + +\begin_layout Plain Layout + + u8 command; +\end_layout + +\begin_layout Plain Layout + + u8 command-specific-data[]; +\end_layout + +\begin_layout Plain Layout + + u8 ack; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +/* ack values */ +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_OK 0 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_ERR 1 +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +The class, command and command-specific-data are set by the driver, and + the device sets the ack byte. + There is little it can do except issue a diagnostic if the ack byte is + not VIRTIO_NET_OK. +\end_layout + +\begin_layout Subsection* +Packet Receive Filtering +\end_layout + +\begin_layout Standard +If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can send control + commands for promiscuous mode, multicast receiving, and filtering of MAC + addresses. +\end_layout + +\begin_layout Standard +Note that in general, these commands are best-effort: unwanted packets may + still arrive. 
+ +\end_layout + +\begin_layout Subsubsection* +Setting Promiscuous Mode +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +#define VIRTIO_NET_CTRL_RX 0 +\end_layout + +\begin_layout Plain Layout + + #define VIRTIO_NET_CTRL_RX_PROMISC 0 +\end_layout + +\begin_layout Plain Layout + + #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +The class VIRTIO_NET_CTRL_RX has two commands: VIRTIO_NET_CTRL_RX_PROMISC + turns promiscuous mode on and off, and VIRTIO_NET_CTRL_RX_ALLMULTI turns + all-multicast receive on and off. + The command-specific-data is one byte containing 0 (off) or 1 (on). +\end_layout + +\begin_layout Subsubsection* +Setting MAC Address Filtering +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_net_ctrl_mac { +\end_layout + +\begin_layout Plain Layout + + u32 entries; +\end_layout + +\begin_layout Plain Layout + + u8 macs[entries][ETH_ALEN]; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_NET_CTRL_MAC 1 +\end_layout + +\begin_layout Plain Layout + + #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +The device can filter incoming packets by any number of destination MAC + addresses. +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +Since there are no guarantees, it can use a hash filter or silently switch + to allmulti or promiscuous mode if it is given too many addresses. +\end_layout + +\end_inset + + This table is set using the class VIRTIO_NET_CTRL_MAC and the command VIRTIO_NE +T_CTRL_MAC_TABLE_SET. + The command-specific-data is two variable length tables of 6-byte MAC addresses.
+ The first table contains unicast addresses, and the second contains multicast + addresses. +\end_layout + +\begin_layout Subsection* +VLAN Filtering +\end_layout + +\begin_layout Standard +If the driver negotiates the VIRTIO_NET_F_CTRL_VLAN feature, it can control + a VLAN filter table in the device. +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +#define VIRTIO_NET_CTRL_VLAN 2 +\end_layout + +\begin_layout Plain Layout + + #define VIRTIO_NET_CTRL_VLAN_ADD 0 +\end_layout + +\begin_layout Plain Layout + + #define VIRTIO_NET_CTRL_VLAN_DEL 1 +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL commands take + a 16-bit VLAN id as the command-specific-data. +\end_layout + +\begin_layout Chapter* +Appendix D: Block Device +\end_layout + +\begin_layout Standard +The virtio block device is a simple virtual block device (ie. + disk). + Read and write requests (and other exotic requests) are placed in the queue, + and serviced (probably out of order) by the device except where noted. +\end_layout + +\begin_layout Section* +Configuration +\end_layout + +\begin_layout Description +Subsystem +\begin_inset space ~ +\end_inset + +Device +\begin_inset space ~ +\end_inset + +ID 2 +\end_layout + +\begin_layout Description +Virtqueues 0:requestq. +\end_layout + +\begin_layout Description +Feature +\begin_inset space ~ +\end_inset + +bits +\end_layout + +\begin_deeper +\begin_layout Description +VIRTIO_BLK_F_BARRIER +\begin_inset space ~ +\end_inset + +(0) Host supports request barriers. +\end_layout + +\begin_layout Description +VIRTIO_BLK_F_SIZE_MAX +\begin_inset space ~ +\end_inset + +(1) Maximum size of any single segment is in +\begin_inset Quotes eld +\end_inset + +size_max +\begin_inset Quotes erd +\end_inset + +.
+\end_layout + +\begin_layout Description +VIRTIO_BLK_F_SEG_MAX +\begin_inset space ~ +\end_inset + +(2) Maximum number of segments in a request is in +\begin_inset Quotes eld +\end_inset + +seg_max +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Description +VIRTIO_BLK_F_GEOMETRY +\begin_inset space ~ +\end_inset + +(4) Disk-style geometry specified in +\begin_inset Quotes eld +\end_inset + +geometry +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Description +VIRTIO_BLK_F_RO +\begin_inset space \space{} +\end_inset + +(5) Device is read-only. +\end_layout + +\begin_layout Description +VIRTIO_BLK_F_BLK_SIZE +\begin_inset space ~ +\end_inset + +(6) Block size of disk is in +\begin_inset Quotes eld +\end_inset + +blk_size +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Description +VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands. +\end_layout + +\begin_layout Description +VIRTIO_BLK_F_FLUSH (9) Cache flush command support. +\end_layout + +\begin_layout Description +VIRTIO_BLK_F_SECTOR_MAX +\begin_inset space ~ +\end_inset + +(10) Maximum total sectors in an I/O. +\end_layout + +\end_deeper +\begin_layout Description +Device +\begin_inset space ~ +\end_inset + +configuration +\begin_inset space ~ +\end_inset + +layout The capacity of the device (expressed in 512-byte sectors) is always + present. + The availability of the others all depend on various feature bits as indicated + above. 
+ +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_blk_config { +\end_layout + +\begin_layout Plain Layout + + u64 capacity; +\end_layout + +\begin_layout Plain Layout + + u32 size_max; +\end_layout + +\begin_layout Plain Layout + + u32 seg_max; +\end_layout + +\begin_layout Plain Layout + + struct virtio_blk_geometry { +\end_layout + +\begin_layout Plain Layout + + u16 cylinders; +\end_layout + +\begin_layout Plain Layout + + u8 heads; +\end_layout + +\begin_layout Plain Layout + + u8 sectors; +\end_layout + +\begin_layout Plain Layout + + } geometry; +\end_layout + +\begin_layout Plain Layout + + u32 blk_size; +\end_layout + +\begin_layout Plain Layout + + u32 sectors_max; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section* +Device Initialization +\end_layout + +\begin_layout Enumerate +The device size should be read from the +\begin_inset Quotes eld +\end_inset + +capacity +\begin_inset Quotes erd +\end_inset + + configuration field. + No requests should be submitted which go beyond this limit. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the blk_size field can + be read to determine the optimal sector size for the driver to use. + This does not affect the units used in the protocol (always 512 bytes), + but awareness of the correct value can affect performance. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_BLK_F_RO feature is set by the device, any write requests + will fail. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_BLK_F_SECTOR_MAX feature is negotiated, the sectors_max field + should be read to determine the maximum I/O size for the driver to use. + No requests should be submitted which go beyond this limit.
+\end_layout + +\begin_layout Section* +Device Operation +\end_layout + +\begin_layout Standard +The driver queues requests to the virtqueue, and they are used by the device + (not necessarily in order). + Each request is of form: +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_blk_req { +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + + u32 type; +\end_layout + +\begin_layout Plain Layout + + u32 ioprio; +\end_layout + +\begin_layout Plain Layout + + u64 sector; +\end_layout + +\begin_layout Plain Layout + + char data[][512]; +\end_layout + +\begin_layout Plain Layout + + u8 status; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +If the device has VIRTIO_BLK_F_SCSI feature, it can also support scsi packet + command requests, each of these requests is of form: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_scsi_pc_req { +\end_layout + +\begin_layout Plain Layout + + u32 type; +\end_layout + +\begin_layout Plain Layout + + u32 ioprio; +\end_layout + +\begin_layout Plain Layout + + u64 sector; +\end_layout + +\begin_layout Plain Layout + + char cmd[]; +\end_layout + +\begin_layout Plain Layout + + char data[][512]; +\end_layout + +\begin_layout Plain Layout + +#define SCSI_SENSE_BUFFERSIZE 96 +\end_layout + +\begin_layout Plain Layout + + u8 sense[SCSI_SENSE_BUFFERSIZE]; +\end_layout + +\begin_layout Plain Layout + + u32 errors; +\end_layout + +\begin_layout Plain Layout + + u32 data_len; +\end_layout + +\begin_layout Plain Layout + + u32 sense_len; +\end_layout + +\begin_layout Plain Layout + + u32 residual; +\end_layout + +\begin_layout Plain Layout + + u8 status; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +The +\emph on +type +\emph default + 
+ of the request is either a read (VIRTIO_BLK_T_IN), a write (VIRTIO_BLK_T_OUT), + a scsi packet command (VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT +\begin_inset Foot +status open + +\begin_layout Plain Layout +the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device does not + distinguish between them +\end_layout + +\end_inset + +) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT +\begin_inset Foot +status open + +\begin_layout Plain Layout +the FLUSH and FLUSH_OUT types are equivalent, the device does not distinguish + between them +\end_layout + +\end_inset + +). + If the device has VIRTIO_BLK_F_BARRIER feature +\begin_inset space ~ +\end_inset + +the high bit (VIRTIO_BLK_T_BARRIER) indicates that this request acts as + a barrier and that all preceding requests must be complete before this + one, and all following requests must not be started until this is complete. + Note that a barrier does not flush caches in the underlying backend device + in host, and thus does not serve as data consistency guarantee. + Driver must use FLUSH request to flush the host cache. +\end_layout + +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +#define VIRTIO_BLK_T_IN 0 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BLK_T_OUT 1 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BLK_T_SCSI_CMD 2 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BLK_T_SCSI_CMD_OUT 3 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BLK_T_FLUSH 4 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BLK_T_FLUSH_OUT 5 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BLK_T_BARRIER 0x80000000 +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +The +\emph on +ioprio +\emph default + field is a hint about the relative priorities of requests to the device: + higher numbers indicate more important requests.
+\end_layout + +\begin_layout Standard +The +\emph on +sector +\emph default + number indicates the offset (multiplied by 512) where the read or write + is to occur. + This field is unused and set to 0 for scsi packet commands and for flush + commands. +\end_layout + +\begin_layout Standard +The +\emph on +cmd +\emph default + field is only present for scsi packet command requests, and indicates the + command to perform. + This field must reside in a single, separate read-only buffer; command + length can be derived from the length of this buffer. + +\end_layout + +\begin_layout Standard +Note that these first three (four for scsi packet commands) fields are always + read-only: the +\emph on +data +\emph default + field is either read-only or write-only, depending on the request. + The size of the read or write can be derived from the total size of the + request buffers. +\end_layout + +\begin_layout Standard +The +\emph on + sense +\emph default + field is only present for scsi packet command requests, and indicates the + buffer for scsi sense data. +\end_layout + +\begin_layout Standard +The +\emph on +data_len +\emph default + field is only present for scsi packet command requests, this field is deprecate +d, and should be ignored by the driver. + Historically, devices copied data length there. +\end_layout + +\begin_layout Standard +The +\emph on +sense_len +\emph default + field is only present for scsi packet command requests and indicates the + number of bytes actually written to the +\emph on +sense +\emph default + buffer. +\end_layout + +\begin_layout Standard +The +\emph on +residual +\emph default + field is only present for scsi packet command requests and indicates the + residual size, calculated as data length - number of bytes actually transferred. 
+\end_layout + +\begin_layout Standard +The final +\emph on +status +\emph default + byte is written by the device: either VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S +_IOERR for host or guest error or VIRTIO_BLK_S_UNSUPP for a request unsupported + by host: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +#define VIRTIO_BLK_S_OK 0 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BLK_S_IOERR 1 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BLK_S_UNSUPP 2 +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +Historically, devices assumed that the fields +\emph on +type +\emph default +, +\emph on +ioprio +\emph default + and +\emph on +sector +\emph default + reside in a single, separate read-only buffer; the fields +\emph on +errors +\emph default +, +\emph on +data_len +\emph default +, +\emph on +sense_len +\emph default + and +\emph on +residual +\emph default + reside in a single, separate write-only buffer; the +\emph on +sense +\emph default + field in a separate write-only buffer of size 96 bytes, by itself; the + fields +\emph on +errors +\emph default +, +\emph on +data_len +\emph default +, +\emph on +sense_len +\emph default + and +\emph on +residual +\emph default + in a single write-only buffer; and the +\emph on +status +\emph default + field is a separate write-only buffer of size 1 byte, by itself. +\end_layout + +\begin_layout Chapter* +Appendix E: Console Device +\end_layout + +\begin_layout Standard +The virtio console device is a simple device for data input and output. + A device may have one or more ports. + Each port has a pair of input and output virtqueues. + Moreover, a device has a pair of control IO virtqueues.
+ The control virtqueues are used to communicate information between the + device and the driver about ports being opened and closed on either side + of the connection, indication from the host about whether a particular + port is a console port, adding new ports, port hot-plug/unplug, etc., and + indication from the guest about whether a port or a device was successfully + added, port open/close, etc. + For data IO, one or more empty buffers are placed in the receive queue + for incoming data and outgoing characters are placed in the transmit queue. +\end_layout + +\begin_layout Section* +Configuration +\end_layout + +\begin_layout Description +Subsystem +\begin_inset space ~ +\end_inset + +Device +\begin_inset space ~ +\end_inset + +ID 3 +\end_layout + +\begin_layout Description +Virtqueues 0:receiveq(port0), + 1:transmitq(port0), 2:control receiveq +\begin_inset Foot +status open + +\begin_layout Plain Layout +Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set +\end_layout + +\end_inset + +, 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1), ... +\end_layout + +\begin_layout Description +Feature +\begin_inset space ~ +\end_inset + +bits +\end_layout + +\begin_deeper +\begin_layout Description +VIRTIO_CONSOLE_F_SIZE +\begin_inset space ~ +\end_inset + +(0) Configuration cols and rows fields are valid. +\end_layout + +\begin_layout Description +VIRTIO_CONSOLE_F_MULTIPORT +\begin_inset space ~ +\end_inset + +(1) Device has support for multiple ports; configurati +on fields nr_ports and max_nr_ports are valid and control virtqueues will + be used. +\end_layout + +\end_deeper +\begin_layout Description +Device +\begin_inset space ~ +\end_inset + +configuration +\begin_inset space ~ +\end_inset + +layout The size of the console is supplied in the configuration space if + the VIRTIO_CONSOLE_F_SIZE feature is set. + Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature is set, the maximum + number of ports supported by the device can be fetched.
+\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_console_config { +\end_layout + +\begin_layout Plain Layout + + u16 cols; +\end_layout + +\begin_layout Plain Layout + + u16 rows; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + + u32 max_nr_ports; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section* +Device Initialization +\end_layout + +\begin_layout Enumerate +If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver can read + the console dimensions from the configuration fields. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the driver can + spawn multiple ports, not all of which may be attached to a console. + Some could be generic ports. + In this case, the control virtqueues are enabled and according to the max_nr_po +rts configuration-space value, the appropriate number of virtqueues are + created. + A control message indicating the driver is ready is sent to the host. + The host can then send control messages for adding new ports to the device. + After creating and initializing each port, a VIRTIO_CONSOLE_PORT_READY + control message is sent to the host for that port so the host can let us + know of any additional configuration options set for that port. +\end_layout + +\begin_layout Enumerate +The receiveq for each port is populated with one or more receive buffers. +\end_layout + +\begin_layout Section* +Device Operation +\end_layout + +\begin_layout Enumerate +For output, a buffer containing the characters is placed in the port's transmitq. +\begin_inset Foot +status open + +\begin_layout Plain Layout +Because this is high importance and low bandwidth, the current Linux implementat +ion polls for the buffer to be used, rather than waiting for an interrupt, + simplifying the implementation significantly. 
+ However, for generic serial ports with the O_NONBLOCK flag set, the polling + limitation is relaxed and the consumed buffers are freed upon the next + write or poll call or when a port is closed or hot-unplugged. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Enumerate +When a buffer is used in the receiveq (signalled by an interrupt), the contents + is the input to the port associated with the virtqueue for which the notificati +on was received. +\end_layout + +\begin_layout Enumerate +If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a configuration + change interrupt may occur. + The updated size can be read from the configuration fields. +\end_layout + +\begin_layout Enumerate +If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT feature, active + ports are announced by the host using the VIRTIO_CONSOLE_PORT_ADD control + message. + The same message is used for port hot-plug as well. +\end_layout + +\begin_layout Enumerate +If the host specified a port `name', a sysfs attribute is created with the + name filled in, so that udev rules can be written that can create a symlink + from the port's name to the char device for port discovery by applications + in the guest. +\end_layout + +\begin_layout Enumerate +Changes to ports' state are effected by control messages. + Appropriate action is taken on the port indicated in the control message. 
+ The layout of the structure of the control buffer and the events associated + are: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_console_control { +\end_layout + +\begin_layout Plain Layout + + uint32_t id; /* Port number */ +\end_layout + +\begin_layout Plain Layout + + uint16_t event; /* The kind of control event */ +\end_layout + +\begin_layout Plain Layout + + uint16_t value; /* Extra information for the event */ +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +/* Some events for the internal messages (control packets) */ +\end_layout + +\begin_layout Plain Layout + +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_CONSOLE_DEVICE_READY 0 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_CONSOLE_PORT_ADD 1 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_CONSOLE_PORT_REMOVE 2 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_CONSOLE_PORT_READY 3 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_CONSOLE_CONSOLE_PORT 4 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_CONSOLE_RESIZE 5 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_CONSOLE_PORT_OPEN 6 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_CONSOLE_PORT_NAME 7 +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Chapter* +Appendix F: Entropy Device +\end_layout + +\begin_layout Standard +The virtio entropy device supplies high-quality randomness for guest use. +\end_layout + +\begin_layout Section* +Configuration +\end_layout + +\begin_layout Description +Subsystem +\begin_inset space ~ +\end_inset + +Device +\begin_inset space ~ +\end_inset + +ID 4 +\end_layout + +\begin_layout Description +Virtqueues 0:requestq. 
+\end_layout + +\begin_layout Description +Feature +\begin_inset space ~ +\end_inset + +bits None currently defined +\end_layout + +\begin_layout Description +Device +\begin_inset space ~ +\end_inset + +configuration +\begin_inset space ~ +\end_inset + +layout None currently defined. +\end_layout + +\begin_layout Section* +Device Initialization +\end_layout + +\begin_layout Enumerate +The virtqueue is initialized. +\end_layout + +\begin_layout Section* +Device Operation +\end_layout + +\begin_layout Standard +When the driver requires random bytes, it places the descriptor of one or + more buffers in the queue. + It will be completely filled by random data by the device. +\end_layout + +\begin_layout Chapter* +Appendix G: Memory Balloon Device +\end_layout + +\begin_layout Standard +The virtio memory balloon device is a primitive device for managing guest + memory: the device asks for a certain amount of memory, and the guest supplies + it (or withdraws it, if the device has more than it asks for). + This allows the guest to adapt to changes in allowance of underlying physical + memory. + If the feature is negotiated, the device can also be used to communicate + guest memory statistics to the host. +\end_layout + +\begin_layout Section* +Configuration +\end_layout + +\begin_layout Description +Subsystem +\begin_inset space ~ +\end_inset + +Device +\begin_inset space ~ +\end_inset + +ID 5 +\end_layout + +\begin_layout Description +Virtqueues 0:inflateq. + 1:deflateq. + 2:statsq. +\begin_inset Foot +status open + +\begin_layout Plain Layout +Only if VIRTIO_BALLOON_F_STATS_VQ set +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Description +Feature +\begin_inset space ~ +\end_inset + +bits +\end_layout + +\begin_deeper +\begin_layout Description +VIRTIO_BALLOON_F_MUST_TELL_HOST +\begin_inset space ~ +\end_inset + +(0) Host must be told before pages from the balloon are used.
+\end_layout + +\begin_layout Description +VIRTIO_BALLOON_F_STATS_VQ +\begin_inset space \space{} +\end_inset + +(1) A virtqueue for reporting guest memory statistics is present. +\end_layout + +\end_deeper +\begin_layout Description +Device +\begin_inset space ~ +\end_inset + +configuration +\begin_inset space ~ +\end_inset + +layout Both fields of this configuration are always available. + Note that they are little endian, despite convention that device fields + are guest endian: +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_balloon_config { +\end_layout + +\begin_layout Plain Layout + + u32 num_pages; +\end_layout + +\begin_layout Plain Layout + + u32 actual; +\end_layout + +\begin_layout Plain Layout + +}; +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section* +Device Initialization +\end_layout + +\begin_layout Enumerate +The inflate and deflate virtqueues are identified. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated: +\end_layout + +\begin_deeper +\begin_layout Enumerate +Identify the stats virtqueue. +\end_layout + +\begin_layout Enumerate +Add one empty buffer to the stats virtqueue and notify the host. +\end_layout + +\end_deeper +\begin_layout Standard +Device operation begins immediately. +\end_layout + +\begin_layout Section* +Device Operation +\end_layout + +\begin_layout Description +Memory +\begin_inset space \space{} +\end_inset + +Ballooning The device is driven by the receipt of a configuration change + interrupt. +\end_layout + +\begin_layout Enumerate +The +\begin_inset Quotes eld +\end_inset + +num_pages +\begin_inset Quotes erd +\end_inset + + configuration field is examined. + If this is greater than the +\begin_inset Quotes eld +\end_inset + +actual +\begin_inset Quotes erd +\end_inset + + number of pages, memory must be given to the balloon.
+ If it is less than the +\begin_inset Quotes eld +\end_inset + +actual +\begin_inset Quotes erd +\end_inset + + number of pages, memory may be taken back from the balloon for general + use. +\end_layout + +\begin_layout Enumerate +To supply memory to the balloon (aka. + inflate): +\end_layout + +\begin_deeper +\begin_layout Enumerate +The driver constructs an array of addresses of unused memory pages. + These addresses are divided by 4096 +\begin_inset Foot +status open + +\begin_layout Plain Layout +This is historical, and independent of the guest page size +\end_layout + +\end_inset + + and the descriptor describing the resulting 32-bit array is added to the + inflateq. +\end_layout + +\end_deeper +\begin_layout Enumerate +To remove memory from the balloon (aka. + deflate): +\end_layout + +\begin_deeper +\begin_layout Enumerate +The driver constructs an array of addresses of memory pages it has previously + given to the balloon, as described above. + This descriptor is added to the deflateq. +\end_layout + +\begin_layout Enumerate +If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the guest may not + use these requested pages until that descriptor in the deflateq has been + used by the device. +\end_layout + +\begin_layout Enumerate +Otherwise, the guest may begin to re-use pages previously given to the balloon + before the device has acknowledged their withdrawal. + +\begin_inset Foot +status open + +\begin_layout Plain Layout +In this case, deflation advice is merely a courtesy +\end_layout + +\end_inset + + +\end_layout + +\end_deeper +\begin_layout Enumerate +In either case, once the device has completed the inflation or deflation, + the +\begin_inset Quotes eld +\end_inset + +actual +\begin_inset Quotes erd +\end_inset + + field of the configuration should be updated to reflect the new number + of pages in the balloon.
+\begin_inset Foot +status open + +\begin_layout Plain Layout +As updates to configuration space are not atomic, this field isn't particularly + reliable, but can be used to diagnose buggy guests. +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Description +Memory +\begin_inset space \space{} +\end_inset + +Statistics +\end_layout + +\begin_layout Standard +The stats virtqueue is atypical because communication is driven by the device + (not the driver). + The channel becomes active at driver initialization time when the driver + adds an empty buffer and notifies the device. + A request for memory statistics proceeds as follows: +\end_layout + +\begin_layout Enumerate +The device pushes the buffer onto the used ring and sends an interrupt. +\end_layout + +\begin_layout Enumerate +The driver pops the used buffer and discards it. +\end_layout + +\begin_layout Enumerate +The driver collects memory statistics and writes them into a new buffer. +\end_layout + +\begin_layout Enumerate +The driver adds the buffer to the virtqueue and notifies the device. +\end_layout + +\begin_layout Enumerate +The device pops the buffer (retaining it to initiate a subsequent request) + and consumes the statistics. +\end_layout + +\begin_layout Description +Memory +\begin_inset space \space{} +\end_inset + +Statistics +\begin_inset space \space{} +\end_inset + +Format Each statistic consists of a 16 bit tag and a 64 bit value. + Both quantities are represented in the native endian of the guest. + All statistics are optional and the driver may choose which ones to supply. + To guarantee backwards compatibility, unsupported statistics should be + omitted. 
+\end_layout + +\begin_deeper +\begin_layout Standard +\begin_inset listings +inline false +status open + +\begin_layout Plain Layout + +struct virtio_balloon_stat { +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BALLOON_S_SWAP_IN 0 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BALLOON_S_SWAP_OUT 1 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BALLOON_S_MAJFLT 2 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BALLOON_S_MINFLT 3 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BALLOON_S_MEMFREE 4 +\end_layout + +\begin_layout Plain Layout + +#define VIRTIO_BALLOON_S_MEMTOT 5 +\end_layout + +\begin_layout Plain Layout + + u16 tag; +\end_layout + +\begin_layout Plain Layout + + u64 val; +\end_layout + +\begin_layout Plain Layout + +} __attribute__((packed)); +\end_layout + +\end_inset + + +\end_layout + +\end_deeper +\begin_layout Description +Tags +\end_layout + +\begin_layout Description +VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been swapped in (in + bytes). +\end_layout + +\begin_layout Description +VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been swapped out + to disk (in bytes). +\end_layout + +\begin_layout Description +VIRTIO_BALLOON_S_MAJFLT The number of major page faults that have occurred. +\end_layout + +\begin_layout Description +VIRTIO_BALLOON_S_MINFLT The number of minor page faults that have occurred. +\end_layout + +\begin_layout Description +VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used for any purpose + (in bytes). +\end_layout + +\begin_layout Description +VIRTIO_BALLOON_S_MEMTOT The total amount of memory available (in bytes). +\end_layout + +\end_body +\end_document |