If an ext3 file is truncated while its buffers are attached to the committing
transaction, JBD will refuse to detach those buffers from the page.  But the
VFS will still truncate the page from the address_space.  We end up with large
numbers of pages which have buffers and no other references.  These pages
float down the LRU until vmscan.c reclaims them.

The problem is that these pages are effectively "free", but the VM overcommit
accounting does not know that: it can and does make very wrong decisions.

The best fix for this is to change the JBD behaviour: if the to-be-truncated
ordered-data buffer is undergoing writeout in commit, we can simply snip it
off the committing transaction and allow the page to be freed at truncate
time.

Across a single `dbench 16' run this new code shot down 5,000 pages which
would otherwise have been left adrift on the LRU.


 fs/jbd/commit.c      |   37 ++++++++++++++++++++++++++++++++++---
 fs/jbd/transaction.c |   22 ++++++++++++++++++----
 2 files changed, 52 insertions(+), 7 deletions(-)

diff -puN fs/jbd/transaction.c~jbd-640-ordered-truncate-fix fs/jbd/transaction.c
--- 25/fs/jbd/transaction.c~jbd-640-ordered-truncate-fix	2003-06-08 23:20:53.000000000 -0700
+++ 25-akpm/fs/jbd/transaction.c	2003-06-08 23:20:53.000000000 -0700
@@ -1848,10 +1848,24 @@ static int journal_unmap_buffer(journal_
 			}
 		}
 	} else if (transaction == journal->j_committing_transaction) {
-		/* If it is committing, we simply cannot touch it.  We
-		 * can remove it's next_transaction pointer from the
-		 * running transaction if that is set, but nothing
-		 * else. */
+		if (jh->b_jlist == BJ_SyncData) {
+			/*
+			 * Ordered data buffer on committing transaction:
+			 * unfile it now.
+			 */
+			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+			__journal_unfile_buffer(jh);
+			jh->b_transaction = NULL;
+			journal_remove_journal_head(bh);
+			__brelse(bh);
+			goto zap_buffer;
+		}
+
+		/*
+		 * If it is committing, we simply cannot touch it.  We can
+		 * remove it's next_transaction pointer from the running
+		 * transaction if that is set, but nothing else.
+		 */
 		JBUFFER_TRACE(jh, "on committing transaction");
 		set_buffer_freed(bh);
 		if (jh->b_next_transaction) {
diff -puN fs/jbd/commit.c~jbd-640-ordered-truncate-fix fs/jbd/commit.c
--- 25/fs/jbd/commit.c~jbd-640-ordered-truncate-fix	2003-06-08 23:56:24.000000000 -0700
+++ 25-akpm/fs/jbd/commit.c	2003-06-09 00:16:05.000000000 -0700
@@ -34,6 +34,38 @@ static void journal_end_buffer_io_sync(s
 }
 
 /*
+ * Write an array of ordered-data buffers to the filesystem.  A ref was taken
+ * against each buffer as it was added to the array.  end_buffer_io_sync()
+ * will undo that ref, or it is undone here if we decide not to write it.
+ */
+static void jbd_write_buffers(int nr, struct buffer_head *bhs[])
+{
+	int i;
+
+	for (i = 0; i < nr; i++) {
+		struct buffer_head *bh = bhs[i];
+
+		if (test_set_buffer_locked(bh)) {
+			put_bh(bh);
+			continue;
+		}
+
+		/*
+		 * journal_unmap_buffer() may have got there first
+		 */
+		if (buffer_mapped(bh)) {
+			if (test_clear_buffer_dirty(bh)) {
+				bh->b_end_io = end_buffer_io_sync;
+				submit_bh(WRITE, bh);
+				continue;
+			}
+		}
+		unlock_buffer(bh);
+		put_bh(bh);
+	}
+}
+
+/*
  * journal_commit_transaction
  *
  * The primary function for committing a transaction to the log.  This
@@ -193,7 +225,7 @@ write_out_data_locked:
 		if (!buffer_locked(bh)) {
 			if (buffer_dirty(bh)) {
 				BUFFER_TRACE(bh, "start journal writeout");
-				atomic_inc(&bh->b_count);
+				get_bh(bh);
 				wbuf[bufs++] = bh;
 			} else {
 				BUFFER_TRACE(bh, "writeout complete: unfile");
@@ -227,9 +259,8 @@ write_out_data_locked:
 		jbd_debug(2, "submit %d writes\n", bufs);
 		spin_unlock(&journal->j_list_lock);
 		if (bufs)
-			ll_rw_block(WRITE, bufs, wbuf);
+			jbd_write_buffers(bufs, wbuf);
 		cond_resched();
-		journal_brelse_array(wbuf, bufs);
 		spin_lock(&journal->j_list_lock);
 		if (bufs)
 			goto write_out_data_locked;
_
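
A note on the locking pattern used by jbd_write_buffers() above: the commit
path takes each buffer's lock with a try-lock and only then re-checks
buffer_mapped() and the dirty bit under that lock, so that a concurrent
journal_unmap_buffer() which got there first makes the writer skip the I/O
and drop its ref instead.  The small userspace sketch below is only an
analogue of that "try-lock, then re-check under the lock" pattern, not kernel
code; all of the names in it (fake_buffer, write_one_buffer,
truncate_one_buffer) are hypothetical.

/*
 * Userspace analogue of the jbd_write_buffers() pattern.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_buffer {
	pthread_mutex_t lock;
	bool mapped;		/* cleared by the "truncate" path */
	bool dirty;
};

/* Commit-side writer: skip buffers that the truncate path got to first. */
static void write_one_buffer(struct fake_buffer *bh)
{
	if (pthread_mutex_trylock(&bh->lock) != 0)
		return;			/* someone else holds the lock */

	/* Re-check state under the lock before "submitting" the write. */
	if (bh->mapped && bh->dirty) {
		bh->dirty = false;
		printf("submitting write\n");	/* stands in for submit_bh() */
	} else {
		printf("skipping truncated or clean buffer\n");
	}
	pthread_mutex_unlock(&bh->lock);
}

/* Truncate-side path: unmap the buffer under its lock. */
static void truncate_one_buffer(struct fake_buffer *bh)
{
	pthread_mutex_lock(&bh->lock);
	bh->mapped = false;
	pthread_mutex_unlock(&bh->lock);
}

int main(void)
{
	struct fake_buffer bh = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.mapped = true,
		.dirty = true,
	};

	truncate_one_buffer(&bh);	/* truncate wins the race */
	write_one_buffer(&bh);		/* writer notices and skips the I/O */
	return 0;
}

In the real patch the same idea is expressed with test_set_buffer_locked(),
buffer_mapped() and test_clear_buffer_dirty() on the buffer_head, with the
extra ref taken in the write_out_data_locked loop dropped via put_bh() on the
skip paths.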