aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author David Sterba <dsterba@suse.com> 2024-02-20 09:53:45 +0100
committer David Sterba <dsterba@suse.com> 2024-02-20 09:56:00 +0100
commit d03594b0313db71413b9dcb040f8d5c4da7213b1 (patch)
tree 3cb875166d023eddefa0b7939b9a5056b225b942
parent 06d0ed483dde86a1c7f5157f456e876778d4d9ad (diff)
download btrfs-progs-d03594b0313db71413b9dcb040f8d5c4da7213b1.tar.gz
btrfs-progs: fix exclusive op enqueue timeout
There's a report that 'btrfs balance start --enqueue' does not properly
wait when there are multiple instances started. The command does a busy
wait instead of timeouts.

Strace output:

  0.000006 pselect6(5, NULL, NULL, [4], {tv_sec=60, tv_nsec=0}, NULL) = 1 (except [4], left {tv_sec=59, tv_nsec=999999716})
  0.000008 pselect6(5, NULL, NULL, [4], {tv_sec=29, tv_nsec=999999000}, NULL) = 1 (except [4], left {tv_sec=29, tv_nsec=999998786})

After the first select there's almost the entire time left, the second
one starts right after it.

Polling/selecting sysfs files is possible under some conditions:

- the file descriptor must be reopened before each poll/select
- the whole buffer must be read too

With that in place it now works as expected. The remaining timeout logic
is slightly adjusted to wait at most 10 seconds so the pending jobs do
not wait too long if there's still a lot of time left from the first
select.

Issue: #746
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r-- common/utils.c 21
1 files changed, 20 insertions, 1 deletions
diff --git a/common/utils.c b/common/utils.c
index 8d860726..eccdd7f1 100644
--- a/common/utils.c
+++ b/common/utils.c
@@ -1345,26 +1345,45 @@ int check_running_fs_exclop(int fd, enum exclusive_operation start, bool enqueue
fflush(stdout);
}
+ /*
+ * The sysfs file descriptor needs to be reopened and all data read
+ * before each select().
+ */
while (exclop > 0) {
fd_set fds;
struct timeval tv = { .tv_sec = 60, .tv_usec = 0 };
+ char tmp[1024];
+ close(sysfs_fd);
+ sysfs_fd = sysfs_open_fsid_file(fd, "exclusive_operation");
+ if (sysfs_fd < 0)
+ return sysfs_fd;
FD_ZERO(&fds);
FD_SET(sysfs_fd, &fds);
+ ret = read(sysfs_fd, tmp, sizeof(tmp));
ret = select(sysfs_fd + 1, NULL, NULL, &fds, &tv);
if (ret < 0) {
ret = -errno;
break;
}
if (ret > 0) {
+ close(sysfs_fd);
+ sysfs_fd = sysfs_open_fsid_file(fd, "exclusive_operation");
+ if (sysfs_fd < 0)
+ return sysfs_fd;
+
+ FD_ZERO(&fds);
+ FD_SET(sysfs_fd, &fds);
+
+ ret = read(sysfs_fd, tmp, sizeof(tmp));
/*
* Notified before the timeout, check again before
* returning. In case there are more operations
* waiting, we want to reduce the chances to race so
* reuse the remaining time to randomize the order.
*/
- tv.tv_sec /= 2;
+ tv.tv_sec = (tv.tv_sec % 10) + 1;
ret = select(sysfs_fd + 1, NULL, NULL, &fds, &tv);
exclop = get_fs_exclop(fd);
if (exclop <= 0)