virtio-fs: add multi-queue supportfor-next

This commit creates a multi-queue mapping at device bring-up. The driver first attempts to use the existing MSI-X interrupt affinities (previously disabled), and if not present, will distribute the request queues evenly over the CPUs. If the latter fails as well, all CPUs are mapped to request queue zero. When a request is handed from FUSE to the virtio-fs device driver, the driver will use the current CPU to index into the multi-queue mapping and determine the optimal request queue to use. We measured the performance of this patch with the fio benchmarking tool, increasing the number of queues results in a significant speedup for both read and write operations, demonstrating the effectiveness of multi-queue support. Host: - Dell PowerEdge R760 - CPU: Intel(R) Xeon(R) Gold 6438M, 128 cores - VM: KVM with 32 cores Virtio-fs device: - BlueField-3 DPU - CPU: ARM Cortex-A78AE, 16 cores - One thread per queue, each busy polling on one request queue - Each queue is 1024 descriptors deep Workload: - fio, sequential read or write, ioengine=libaio, numjobs=32, 4GiB file per job, iodepth=8, bs=256KiB, runtime=30s Performance Results: +===========================+==========+===========+ | Number of queues | Fio read | Fio write | +===========================+==========+===========+ | 1 request queue (GiB/s) | 6.1 | 4.6 | +---------------------------+----------+-----------+ | 8 request queues (GiB/s) | 25.8 | 10.3 | +---------------------------+----------+-----------+ | 16 request queues (GiB/s) | 30.9 | 19.5 | +---------------------------+----------+-----------+ | 32 request queue (GiB/s) | 33.2 | 22.6 | +---------------------------+----------+-----------+ | Speedup | 5.5x | 5x | +---------------=-----------+----------+-----------+ Signed-off-by: Peter-Jan Gootzen <pgootzen@nvidia.com> Signed-off-by: Yoray Zack <yorayz@nvidia.com> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
author: Peter-Jan Gootzen <pgootzen@nvidia.com> 2024-05-01 17:38:17 +0200
committer: Miklos Szeredi <mszeredi@redhat.com> 2024-05-10 13:38:14 +0200
commit: 529395d2ae6456c556405016ea0c43081fe607f3 (patch)
tree: b7f1b4967f42119ae7ad64e7bb596a946b5185f7
parent: 103c2de111bf32f7c36a0ce8f638b114a37e0b76 (diff)
download: fuse-for-next.tar.gz
1 files changed, 62 insertions, 8 deletions
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index fa4c16abfe101a..8ffa4f063a37b7 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -7,6 +7,8 @@
 #include <linux/fs.h>
 #include <linux/dax.h>
 #include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/group_cpus.h>
 #include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/module.h>
@@ -67,6 +69,8 @@ struct virtio_fs {
 	unsigned int num_request_queues; /* number of request queues */
 	struct dax_device *dax_dev;
 
+	unsigned int *mq_map; /* index = cpu id, value = request vq id */
+
 	/* DAX memory window where file contents are mapped */
 	void *window_kaddr;
 	phys_addr_t window_phys_addr;
@@ -185,6 +189,7 @@ static void virtio_fs_ktype_release(struct kobject *kobj)
 {
 	struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);
 
+	kfree(vfs->mq_map);
 	kfree(vfs->vqs);
 	kfree(vfs);
 }
@@ -706,6 +711,44 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
 	}
 }
 
+static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
+{
+	const struct cpumask *mask, *masks;
+	unsigned int q, cpu;
+
+	/* First attempt to map using existing transport layer affinities
+	 * e.g. PCIe MSI-X
+	 */
+	if (!vdev->config->get_vq_affinity)
+		goto fallback;
+
+	for (q = 0; q < fs->num_request_queues; q++) {
+		mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q);
+		if (!mask)
+			goto fallback;
+
+		for_each_cpu(cpu, mask)
+			fs->mq_map[cpu] = q;
+	}
+
+	return;
+fallback:
+	/* Attempt to map evenly in groups over the CPUs */
+	masks = group_cpus_evenly(fs->num_request_queues);
+	/* If even this fails we default to all CPUs use queue zero */
+	if (!masks) {
+		for_each_possible_cpu(cpu)
+			fs->mq_map[cpu] = 0;
+		return;
+	}
+
+	for (q = 0; q < fs->num_request_queues; q++) {
+		for_each_cpu(cpu, &masks[q])
+			fs->mq_map[cpu] = q;
+	}
+	kfree(masks);
+}
+
 /* Virtqueue interrupt handler */
 static void virtio_fs_vq_done(struct virtqueue *vq)
 {
@@ -742,6 +785,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
 {
 	struct virtqueue **vqs;
 	vq_callback_t **callbacks;
+	/* Specify pre_vectors to ensure that the queues before the
+	 * request queues (e.g. hiprio) don't claim any of the CPUs in
+	 * the multi-queue mapping and interrupt affinities
+	 */
+	struct irq_affinity desc = { .pre_vectors = VQ_REQUEST };
 	const char **names;
 	unsigned int i;
 	int ret = 0;
@@ -763,7 +811,9 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
 	callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]),
 					GFP_KERNEL);
 	names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL);
-	if (!vqs || !callbacks || !names) {
+	fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL,
+					dev_to_node(&vdev->dev));
+	if (!vqs || !callbacks || !names || !fs->mq_map) {
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -783,7 +833,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
 		names[i] = fs->vqs[i].name;
 	}
 
-	ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL);
+	ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, &desc);
 	if (ret < 0)
 		goto out;
 
@@ -795,8 +845,10 @@ out:
 	kfree(names);
 	kfree(callbacks);
 	kfree(vqs);
-	if (ret)
+	if (ret) {
 		kfree(fs->vqs);
+		kfree(fs->mq_map);
+	}
 	return ret;
 }
 
@@ -942,7 +994,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
 	if (ret < 0)
 		goto out;
 
-	/* TODO vq affinity */
+	virtio_fs_map_queues(vdev, fs);
 
 	ret = virtio_fs_setup_dax(vdev, fs);
 	if (ret < 0)
@@ -1291,7 +1343,7 @@ out:
 static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
 __releases(fiq->lock)
 {
-	unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */
+	unsigned int queue_id;
 	struct virtio_fs *fs;
 	struct fuse_req *req;
 	struct virtio_fs_vq *fsvq;
@@ -1305,11 +1357,13 @@ __releases(fiq->lock)
 	spin_unlock(&fiq->lock);
 
 	fs = fiq->priv;
+	queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()];
 
-	pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n",
-		  __func__, req->in.h.opcode, req->in.h.unique,
+	pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n",
+		 __func__, req->in.h.opcode, req->in.h.unique,
 		 req->in.h.nodeid, req->in.h.len,
-		 fuse_len_args(req->args->out_numargs, req->args->out_args));
+		 fuse_len_args(req->args->out_numargs, req->args->out_args),
+		 queue_id);
 
 	fsvq = &fs->vqs[queue_id];
 	ret = virtio_fs_enqueue_req(fsvq, req, false);
author	Peter-Jan Gootzen <pgootzen@nvidia.com>	2024-05-01 17:38:17 +0200
committer	Miklos Szeredi <mszeredi@redhat.com>	2024-05-10 13:38:14 +0200
commit	529395d2ae6456c556405016ea0c43081fe607f3 (patch)
tree	b7f1b4967f42119ae7ad64e7bb596a946b5185f7
parent	103c2de111bf32f7c36a0ce8f638b114a37e0b76 (diff)
download	fuse-for-next.tar.gz