Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig       20
-rw-r--r--  block/badblocks.c    6
-rw-r--r--  block/bdev.c       258
-rw-r--r--  block/blk-core.c    14
-rw-r--r--  block/blk-mq.c      14
-rw-r--r--  block/blk-sysfs.c    2
-rw-r--r--  block/fops.c        23
7 files changed, 220 insertions(+), 117 deletions(-)
diff --git a/block/Kconfig b/block/Kconfig
index 55ae2286a4ded..1de4682d48ccb 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
select CRC_T10DIF
select CRC64_ROCKSOFT
+config BLK_DEV_WRITE_MOUNTED
+ bool "Allow writing to mounted block devices"
+ default y
+ help
+ When a block device is mounted, writing to its buffer cache is very
+ likely going to cause filesystem corruption. It is also rather easy to
+ crash the kernel this way, since the filesystem has no practical way
+ of detecting these writes to the buffer cache and verifying its
+ metadata integrity. However, there are some setups that need this
+ capability, such as running fsck on a read-only mounted root device,
+ modifying some features on a mounted ext4 filesystem, and similar. If
+ you say N, the kernel will prevent processes from writing to block
+ devices that are mounted by filesystems, which provides some more
+ protection from runaway privileged processes and generally makes it
+ much harder to crash filesystem drivers. Note however that this does
+ not prevent the underlying device(s) from being modified by other
+ means, e.g. by directly submitting SCSI commands or through access to
+ lower layers of the storage stack. If in doubt, say Y. The
+ configuration can be overridden with the bdev_allow_write_mounted
+ boot option.
+
config BLK_DEV_ZONED
bool "Zoned block device support"
select MQ_IOSCHED_DEADLINE
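If you say N here, a writable open of a device that a mounted filesystem has claimed is refused with EBUSY. A minimal user-space sketch of what callers can then expect (the device path is hypothetical, and this assumes a kernel built with CONFIG_BLK_DEV_WRITE_MOUNTED=n and no boot-time override):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Hypothetical node backing a currently mounted filesystem. */
	int fd = open("/dev/sda1", O_WRONLY);

	if (fd < 0)
		/* EBUSY is expected while the filesystem holds the device. */
		fprintf(stderr, "open: %s\n", strerror(errno));
	return 0;
}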
diff --git a/block/badblocks.c b/block/badblocks.c
index fc92d4e18aa3c..db4ec8b9b2a8c 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -1312,12 +1312,14 @@ re_check:
prev = prev_badblocks(bb, &bad, hint);
/* start after all badblocks */
- if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
+ if ((prev >= 0) &&
+ ((prev + 1) >= bb->count) && !overlap_front(bb, prev, &bad)) {
len = sectors;
goto update_sectors;
}
- if (overlap_front(bb, prev, &bad)) {
+ /* Overlaps with the front badblocks record */
+ if ((prev >= 0) && overlap_front(bb, prev, &bad)) {
if (BB_ACK(p[prev]))
acked_badblocks++;
else
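The added (prev >= 0) checks guard against prev_badblocks() returning -1 when the range under test starts before every recorded bad block; without them, p[prev] would index out of bounds. A toy user-space sketch of the pattern (names are illustrative, not the kernel's):

#include <stdio.h>

/* Returns the index of the last record starting at or before 'start',
 * or -1 if no record precedes it (mirroring prev_badblocks()). */
static int prev_record(const int *starts, int count, int start)
{
	int i;

	for (i = count - 1; i >= 0; i--)
		if (starts[i] <= start)
			return i;
	return -1;
}

int main(void)
{
	int starts[] = { 100, 200, 300 };
	int prev = prev_record(starts, 3, 50);

	/* Without the guard, starts[prev] would read starts[-1]. */
	if (prev >= 0)
		printf("candidate record starts at %d\n", starts[prev]);
	else
		printf("range precedes all records\n");
	return 0;
}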
diff --git a/block/bdev.c b/block/bdev.c
index 750aec178b6ab..e9f1b12bd75c7 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -30,6 +30,9 @@
#include "../fs/internal.h"
#include "blk.h"
+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
@@ -207,85 +210,88 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
EXPORT_SYMBOL(sync_blockdev_range);
/**
- * freeze_bdev - lock a filesystem and force it into a consistent state
+ * bdev_freeze - lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
* The reference counter (bd_fsfreeze_count) guarantees that only the last
* unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
+ * counts down in bdev_thaw(). When it becomes 0, bdev_thaw() will unfreeze
* actually.
+ *
+ * Return: On success zero is returned, negative error code on failure.
*/
-int freeze_bdev(struct block_device *bdev)
+int bdev_freeze(struct block_device *bdev)
{
- struct super_block *sb;
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (++bdev->bd_fsfreeze_count > 1)
- goto done;
-
- sb = get_active_super(bdev);
- if (!sb)
- goto sync;
- if (sb->s_op->freeze_super)
- error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- else
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- deactivate_super(sb);
- if (error) {
- bdev->bd_fsfreeze_count--;
- goto done;
+ if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return 0;
}
- bdev->bd_fsfreeze_sb = sb;
-sync:
- sync_blockdev(bdev);
-done:
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
+ error = bdev->bd_holder_ops->freeze(bdev);
+ lockdep_assert_not_held(&bdev->bd_holder_lock);
+ } else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ error = sync_blockdev(bdev);
+ }
+
+ if (error)
+ atomic_dec(&bdev->bd_fsfreeze_count);
+
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
-EXPORT_SYMBOL(freeze_bdev);
+EXPORT_SYMBOL(bdev_freeze);
/**
- * thaw_bdev - unlock filesystem
+ * bdev_thaw - unlock filesystem
* @bdev: blockdevice to unlock
*
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ * Unlocks the filesystem and marks it writeable again after bdev_freeze().
+ *
+ * Return: On success zero is returned, negative error code on failure.
*/
-int thaw_bdev(struct block_device *bdev)
+int bdev_thaw(struct block_device *bdev)
{
- struct super_block *sb;
- int error = -EINVAL;
+ int error = -EINVAL, nr_freeze;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (!bdev->bd_fsfreeze_count)
+
+ /*
+ * If this returns < 0 it means that @bd_fsfreeze_count was
+ * already 0 and no decrement was performed.
+ */
+ nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
+ if (nr_freeze < 0)
goto out;
error = 0;
- if (--bdev->bd_fsfreeze_count > 0)
+ if (nr_freeze > 0)
goto out;
- sb = bdev->bd_fsfreeze_sb;
- if (!sb)
- goto out;
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
+ error = bdev->bd_holder_ops->thaw(bdev);
+ lockdep_assert_not_held(&bdev->bd_holder_lock);
+ } else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ }
- if (sb->s_op->thaw_super)
- error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
- else
- error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
if (error)
- bdev->bd_fsfreeze_count++;
- else
- bdev->bd_fsfreeze_sb = NULL;
+ atomic_inc(&bdev->bd_fsfreeze_count);
out:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
-EXPORT_SYMBOL(thaw_bdev);
+EXPORT_SYMBOL(bdev_thaw);
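Callers migrating from freeze_bdev()/thaw_bdev() use the renamed pair the same way. A hedged, kernel-context sketch of a snapshot-style caller (error handling trimmed, the snapshot step is a placeholder, and bdev is assumed to be already opened):

static int snapshot_with_freeze(struct block_device *bdev)
{
	int error;

	/* Quiesce the filesystem; falls back to sync_blockdev() when
	 * no holder implements ->freeze. */
	error = bdev_freeze(bdev);
	if (error)
		return error;

	/* ... capture a consistent snapshot here ... */

	/* Drop our freeze reference; the last reference actually thaws. */
	return bdev_thaw(bdev);
}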
/*
* pseudo-fs
@@ -729,9 +735,60 @@ void blkdev_put_no_open(struct block_device *bdev)
{
put_device(&bdev->bd_device);
}
-
+
+static bool bdev_writes_blocked(struct block_device *bdev)
+{
+ return bdev->bd_writers == -1;
+}
+
+static void bdev_block_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = -1;
+}
+
+static void bdev_unblock_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = 0;
+}
+
+static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return true;
+ /* Writes blocked? */
+ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
+ return false;
+ if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
+ return false;
+ return true;
+}
+
+static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return;
+
+ /* Claim exclusive or shared write access. */
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_block_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers++;
+}
+
+static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return;
+
+ /* Yield exclusive or shared write access. */
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_unblock_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers--;
+}
+
/**
- * blkdev_get_by_dev - open a block device by device number
+ * bdev_open_by_dev - open a block device by device number
* @dev: device number of block device to open
* @mode: open mode (BLK_OPEN_*)
* @holder: exclusive holder identifier
@@ -743,32 +800,46 @@ void blkdev_put_no_open(struct block_device *bdev)
*
* Use this interface ONLY if you really do not have anything better - i.e. when
* you are behind a truly sucky interface and all you are given is a device
- * number. Everything else should use blkdev_get_by_path().
+ * number. Everything else should use bdev_open_by_path().
*
* CONTEXT:
* Might sleep.
*
* RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
*/
-struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
- const struct blk_holder_ops *hops)
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+ const struct blk_holder_ops *hops)
{
- bool unblock_events = true;
+ struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
+ GFP_KERNEL);
struct block_device *bdev;
+ bool unblock_events = true;
struct gendisk *disk;
int ret;
+ if (!handle)
+ return ERR_PTR(-ENOMEM);
+
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
MAJOR(dev), MINOR(dev),
((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
if (ret)
- return ERR_PTR(ret);
+ goto free_handle;
+
+ /* Blocking writes requires an exclusive opener */
+ if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) {
+ ret = -EINVAL;
+ goto free_handle;
+ }
bdev = blkdev_get_no_open(dev);
- if (!bdev)
- return ERR_PTR(-ENXIO);
+ if (!bdev) {
+ ret = -ENXIO;
+ goto free_handle;
+ }
disk = bdev->bd_disk;
if (holder) {
@@ -791,12 +862,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
goto abort_claiming;
if (!try_module_get(disk->fops->owner))
goto abort_claiming;
+ ret = -EBUSY;
+ if (!bdev_may_open(bdev, mode))
+ goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
goto put_module;
+ bdev_claim_write_access(bdev, mode);
if (holder) {
bd_finish_claiming(bdev, holder, hops);
@@ -817,7 +892,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (unblock_events)
disk_unblock_events(disk);
- return bdev;
+ handle->bdev = bdev;
+ handle->holder = holder;
+ handle->mode = mode;
+ return handle;
put_module:
module_put(disk->fops->owner);
abort_claiming:
@@ -827,34 +905,14 @@ abort_claiming:
disk_unblock_events(disk);
put_blkdev:
blkdev_put_no_open(bdev);
+free_handle:
+ kfree(handle);
return ERR_PTR(ret);
}
-EXPORT_SYMBOL(blkdev_get_by_dev);
-
-struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
- const struct blk_holder_ops *hops)
-{
- struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
- struct block_device *bdev;
-
- if (!handle)
- return ERR_PTR(-ENOMEM);
- bdev = blkdev_get_by_dev(dev, mode, holder, hops);
- if (IS_ERR(bdev)) {
- kfree(handle);
- return ERR_CAST(bdev);
- }
- handle->bdev = bdev;
- handle->holder = holder;
- if (holder)
- mode |= BLK_OPEN_EXCL;
- handle->mode = mode;
- return handle;
-}
EXPORT_SYMBOL(bdev_open_by_dev);
/**
- * blkdev_get_by_path - open a block device by name
+ * bdev_open_by_path - open a block device by name
* @path: path to the block device to open
* @mode: open mode (BLK_OPEN_*)
* @holder: exclusive holder identifier
@@ -868,29 +926,9 @@ EXPORT_SYMBOL(bdev_open_by_dev);
* Might sleep.
*
* RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
*/
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
- void *holder, const struct blk_holder_ops *hops)
-{
- struct block_device *bdev;
- dev_t dev;
- int error;
-
- error = lookup_bdev(path, &dev);
- if (error)
- return ERR_PTR(error);
-
- bdev = blkdev_get_by_dev(dev, mode, holder, hops);
- if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, holder);
- return ERR_PTR(-EACCES);
- }
-
- return bdev;
-}
-EXPORT_SYMBOL(blkdev_get_by_path);
-
struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
void *holder, const struct blk_holder_ops *hops)
{
@@ -913,8 +951,9 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
}
EXPORT_SYMBOL(bdev_open_by_path);
-void blkdev_put(struct block_device *bdev, void *holder)
+void bdev_release(struct bdev_handle *handle)
{
+ struct block_device *bdev = handle->bdev;
struct gendisk *disk = bdev->bd_disk;
/*
@@ -928,8 +967,10 @@ void blkdev_put(struct block_device *bdev, void *holder)
sync_blockdev(bdev);
mutex_lock(&disk->open_mutex);
- if (holder)
- bd_end_claim(bdev, holder);
+ bdev_yield_write_access(bdev, handle->mode);
+
+ if (handle->holder)
+ bd_end_claim(bdev, handle->holder);
/*
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
@@ -946,12 +987,6 @@ void blkdev_put(struct block_device *bdev, void *holder)
module_put(disk->fops->owner);
blkdev_put_no_open(bdev);
-}
-EXPORT_SYMBOL(blkdev_put);
-
-void bdev_release(struct bdev_handle *handle)
-{
- blkdev_put(handle->bdev, handle->holder);
kfree(handle);
}
EXPORT_SYMBOL(bdev_release);
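Taken together, the handle-based API pairs one bdev_open_by_dev() (or bdev_open_by_path()) call with one bdev_release() call. A hedged sketch of a filesystem-style opener (holder ops and error handling elided; the function names are illustrative):

static struct bdev_handle *example_open(dev_t dev, void *holder)
{
	/* BLK_OPEN_RESTRICT_WRITES requires a non-NULL holder and
	 * blocks other writable opens while the handle is held. */
	return bdev_open_by_dev(dev,
				BLK_OPEN_READ | BLK_OPEN_WRITE |
				BLK_OPEN_RESTRICT_WRITES,
				holder, NULL);
}

static void example_close(struct bdev_handle *handle)
{
	/* Yields the write claim, ends the holder claim, frees the handle. */
	bdev_release(handle);
}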
@@ -1102,3 +1137,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
blkdev_put_no_open(bdev);
}
+
+static int __init setup_bdev_allow_write_mounted(char *str)
+{
+ if (kstrtobool(str, &bdev_allow_write_mounted))
+ pr_warn("Invalid option string for bdev_allow_write_mounted:"
+ " '%s'\n", str);
+ return 1;
+}
+__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
diff --git a/block/blk-core.c b/block/blk-core.c
index 9520ccab30500..11342af420d0c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -501,9 +501,17 @@ static inline void bio_check_ro(struct bio *bio)
if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return;
- pr_warn_ratelimited("Trying to write to read-only block-device %pg\n",
- bio->bi_bdev);
- /* Older lvm-tools actually trigger this */
+
+ if (bio->bi_bdev->bd_ro_warned)
+ return;
+
+ bio->bi_bdev->bd_ro_warned = true;
+ /*
+ * Using an ioctl to set the underlying disk of a raid/dm device
+ * to read-only will trigger this.
+ */
+ pr_warn("Trying to write to read-only block-device %pg\n",
+ bio->bi_bdev);
}
}
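The situation the new comment describes can be reproduced from user space with the BLKROSET ioctl, which flips a device read-only underneath an already-assembled stack (the device path is hypothetical):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int ro = 1;
	int fd = open("/dev/sdb", O_RDONLY);	/* hypothetical raid member */

	if (fd < 0)
		return 1;
	/* Writes through the raid/dm device above this disk will now hit
	 * bio_check_ro() and log the one-shot warning. */
	if (ioctl(fd, BLKROSET, &ro))
		perror("BLKROSET");
	close(fd);
	return 0;
}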
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fb29ff5cc281d..c11c97afa0bc1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1513,14 +1513,26 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
+static bool blk_is_flush_data_rq(struct request *rq)
+{
+ return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq);
+}
+
static bool blk_mq_rq_inflight(struct request *rq, void *priv)
{
/*
* If we find a request that isn't idle we know the queue is busy
* as it's checked in the iter.
* Return false to stop the iteration.
+ *
+ * If the queue is quiesced and a flush data request has already
+ * completed, don't count it as inflight: the flush sequence is
+ * suspended and the original flush data request is invisible to
+ * the driver, just like other requests left pending by the quiesce.
*/
- if (blk_mq_request_started(rq)) {
+ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) &&
+ blk_is_flush_data_rq(rq) &&
+ blk_mq_request_completed(rq))) {
bool *busy = priv;
*busy = true;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 40bab5975c561..6b2429cad81af 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -610,6 +610,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
#endif
+/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
@@ -654,6 +655,7 @@ static struct attribute *queue_attrs[] = {
NULL,
};
+/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
&queue_requests_entry.attr,
&elv_iosched_entry.attr,
diff --git a/block/fops.c b/block/fops.c
index 0abaac705dafb..0cf8cf72cdfa1 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -410,9 +410,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
-static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
+/*
+ * We cannot call mpage_writepages() as it does not take the buffer lock.
+ * We must use block_write_full_folio() directly, which takes the buffer
+ * lock. The buffer lock provides the synchronisation with writeback
+ * that filesystems rely on when they use the blockdev's mapping.
+ */
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, blkdev_get_block, wbc);
+ struct blk_plug plug;
+ int err;
+
+ blk_start_plug(&plug);
+ err = write_cache_pages(mapping, wbc, block_write_full_folio,
+ blkdev_get_block);
+ blk_finish_plug(&plug);
+
+ return err;
}
static int blkdev_read_folio(struct file *file, struct folio *folio)
@@ -449,7 +464,7 @@ const struct address_space_operations def_blk_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = blkdev_read_folio,
.readahead = blkdev_readahead,
- .writepage = blkdev_writepage,
+ .writepages = blkdev_writepages,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.migrate_folio = buffer_migrate_folio_norefs,
@@ -500,7 +515,7 @@ const struct address_space_operations def_blk_aops = {
.readahead = blkdev_readahead,
.writepages = blkdev_writepages,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.migrate_folio = filemap_migrate_folio,
};
#endif /* CONFIG_BUFFER_HEAD */