--- linux-2.2.17.ext3-0.0.4a/arch/i386/kdb/modules/kdbm_jfs.c.~1~ Thu Sep 7 14:26:44 2000 +++ linux-2.2.17.ext3-0.0.4a/arch/i386/kdb/modules/kdbm_jfs.c Mon Sep 11 17:26:51 2000 @@ -40,8 +40,8 @@ "T_FINISH" }; -const char *bh_statenames = "UDLRPTJ"; -int bh_states[] = {0,1,2,3,6,7,8,-1}; +const char *bh_statenames = "UDLRPTJW"; +int bh_states[] = {0,1,2,3,6,7,8,9,-1}; const char *bh_listname(int list) { @@ -69,7 +69,8 @@ const char *bh_state(int state) { - char *result=" "; + static char buffer[9]; + char *result = buffer; int i = 0; do { if (state & (1<= 0); + result[i] = 0; return result; } @@ -160,9 +162,9 @@ unsigned long where, const char *kind) { - kdb_printf("%s%s at %08lx for %d bytes:\n", + kdb_printf("%s%s at 0x%08lx (journal 0x%08lx) for %d bytes:\n", kind ? kind : "", kind ? " transaction" : "Transaction", - where, sizeof(*transaction)); + where, (unsigned long) transaction->t_journal, (int) sizeof(*transaction)); kdb_printf(" ID: %-5ld State: %9s " "Buffers: %-5d Log at: %-5ld\n", --- linux-2.2.17.ext3-0.0.4a/fs/buffer.c.~1~ Thu Sep 7 14:26:44 2000 +++ linux-2.2.17.ext3-0.0.4a/fs/buffer.c Thu Sep 14 16:36:48 2000 @@ -158,6 +158,15 @@ bh->b_count--; } +/* @@@ ext3 debugging only: a buffer about to be flushed must either be off every journal list with no transaction, or be on the BJ_Data list. */ +static void check_buffer_flushable(struct buffer_head *bh) +{ + if (bh->b_jlist == 0) + J_ASSERT (bh->b_transaction == NULL); + else + J_ASSERT (bh->b_jlist == BJ_Data); +} + /* Call sync_buffers with wait!=0 to ensure that the call does not * return until all buffer writes have completed. Sync() may return * before the writes have finished; fsync() may not. 
@@ -190,8 +199,7 @@ for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { if (bh->b_list != BUF_DIRTY) goto repeat; - J_ASSERT(!bh->b_transaction); - J_ASSERT(bh->b_jlist == 0); + check_buffer_flushable(bh); next = bh->b_next_free; if (!lru_list[BUF_DIRTY]) break; @@ -234,8 +242,7 @@ bh->b_count++; next->b_count++; bh->b_flushtime = 0; - J_ASSERT(!bh->b_transaction); - J_ASSERT(bh->b_jlist == 0); + check_buffer_flushable(bh); ll_rw_block(WRITE, 1, &bh); bh->b_count--; next->b_count--; @@ -836,7 +843,7 @@ *((char *) 0) = 0; return; } - if (buf->b_jlist != BJ_None) + if (buf->b_jlist != BJ_None && buf->b_jlist != BJ_Data) dispose = BUF_JOURNAL; else if (buffer_dirty(buf)) dispose = BUF_DIRTY; @@ -887,7 +894,7 @@ if (buf->b_count) { buf->b_count--; if (!buf->b_count && - (buf->b_jlist != BJ_None && buf->b_jlist != BJ_Shadow)) + (buf->b_jlist != BJ_None && buf->b_jlist != BJ_Shadow && buf->b_jlist != BJ_Data)) J_ASSERT (!test_bit(BH_JWrite, &buf->b_state)); return; } @@ -1758,8 +1765,7 @@ for (i = nr_buffers_type[nlist]; i-- > 0; bh = next) { /* We may have stalled while waiting for I/O to complete. */ if(bh->b_list != nlist) goto repeat; - J_ASSERT(!bh->b_transaction); - J_ASSERT(bh->b_jlist == 0); + check_buffer_flushable(bh); next = bh->b_next_free; if(!lru_list[nlist]) { printk("Dirty list empty %d\n", i); @@ -1790,8 +1796,7 @@ #ifdef DEBUG if(nlist != BUF_DIRTY) ncount++; #endif - J_ASSERT(!bh->b_transaction); - J_ASSERT(bh->b_jlist == 0); + check_buffer_flushable(bh); ll_rw_block(WRITE, 1, &bh); bh->b_count--; next->b_count--; @@ -1922,8 +1927,7 @@ bh = next) { /* We may have stalled while waiting for I/O to complete. 
*/ if(bh->b_list != nlist) goto repeat; - J_ASSERT(!bh->b_transaction); - J_ASSERT(bh->b_jlist == 0); + check_buffer_flushable(bh); next = bh->b_next_free; if(!lru_list[nlist]) { printk("Dirty list empty %d\n", i); @@ -1951,8 +1955,7 @@ bh->b_count++; ndirty++; bh->b_flushtime = 0; - J_ASSERT(!bh->b_transaction); - J_ASSERT(bh->b_jlist == 0); + check_buffer_flushable(bh); if (major == LOOP_MAJOR) { ll_rw_block(wrta_cmd,1, &bh); wrta_cmd = WRITEA; --- linux-2.2.17.ext3-0.0.4a/fs/ext3/balloc.c.~1~ Thu Sep 7 14:26:44 2000 +++ linux-2.2.17.ext3-0.0.4a/fs/ext3/balloc.c Wed Sep 13 18:27:20 2000 @@ -319,7 +319,7 @@ "Block = %lu, count = %lu", block, count); - journal_get_write_access(handle, bh); + journal_get_undo_access(handle, bh); journal_get_write_access(handle, bh2); journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); @@ -335,6 +335,27 @@ es->s_free_blocks_count = cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1); } + + /* @@@ This prevents newly-allocated data from being + * freed and then reallocated within the same + * transaction. + * + * Ideally we would want to allow that to happen, but to + * do so requires making journal_forget() capable of + * revoking the queued write of a data block, which + * implies blocking on the journal lock. *forget() + * cannot block due to truncate races. + * + * Eventually we can fix this by making journal_forget() + * return a status indicating whether or not it was able + * to revoke the buffer. On successful revoke, it is + * safe not to set the allocation bit in the committed + * bitmap, because we know that there is no outstanding + * activity on the buffer any more and so it is safe to + * reallocate it. + */ + J_ASSERT(bh->b_committed_data != NULL); + ext3_set_bit (bit + i, bh->b_committed_data); } journal_dirty_metadata(handle, bh); @@ -355,6 +376,30 @@ return; } +/* For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. 
This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes. + */ +static int ext3_test_allocatable(int nr, struct buffer_head *bh) +{ + if (ext3_test_bit(nr, bh->b_data)) + return 0; + if (!bh->b_committed_data) + return 1; + return !ext3_test_bit(nr, bh->b_committed_data); +} + /* * ext3_new_block uses a goal block to assist allocation. If the goal is * free, or there is a free block within 32 blocks of the goal, that block @@ -362,9 +407,11 @@ * each block group the search first looks for an entire free byte in the block * bitmap, and then for any free bit if that fails. 
*/ -int ext3_new_block (handle_t *handle, - const struct inode * inode, unsigned long goal, - u32 * prealloc_count, u32 * prealloc_block, int * err) +struct buffer_head * ext3_new_block (handle_t *handle, + const struct inode * inode, + unsigned long goal, + u32 * prealloc_count, + u32 * prealloc_block, int * err) { struct buffer_head * bh; struct buffer_head * bh2; @@ -381,7 +428,7 @@ sb = inode->i_sb; if (!sb) { printk ("ext3_new_block: nonexistent device"); - return 0; + return NULL; } lock_super (sb); @@ -392,7 +439,7 @@ !in_group_p (sb->u.ext3_sb.s_resgid)) && !capable(CAP_SYS_RESOURCE))) { unlock_super (sb); - return 0; + return NULL; } ext3_debug ("goal=%lu.\n", goal); @@ -423,7 +470,7 @@ ext3_debug ("goal is at %d:%d.\n", i, j); - if (!ext3_test_bit(j, bh->b_data)) { + if (ext3_test_allocatable(j, bh)) { #ifdef EXT3FS_DEBUG goal_hits++; ext3_debug ("goal bit allocated.\n"); @@ -441,7 +488,7 @@ */ int end_goal = (j + 63) & ~63; j = ext3_find_next_zero_bit(bh->b_data, end_goal, j); - if (j < end_goal) + if (j < end_goal && ext3_test_allocatable(j, bh)) goto got_block; } @@ -459,18 +506,34 @@ p = ((char *) bh->b_data) + (j >> 3); r = memscan(p, 0, (EXT3_BLOCKS_PER_GROUP(sb) - j + 7) >> 3); k = (r - ((char *) bh->b_data)) << 3; - if (k < EXT3_BLOCKS_PER_GROUP(sb)) { + if (k < EXT3_BLOCKS_PER_GROUP(sb) && + ext3_test_allocatable(k, bh)) { j = k; goto search_back; } + /* The bitmap search --- search forward alternately + * through the actual bitmap and the last-committed copy + * until we find a bit free in both. 
*/ + repeat_bit_search: k = ext3_find_next_zero_bit ((unsigned long *) bh->b_data, EXT3_BLOCKS_PER_GROUP(sb), j); - if (k < EXT3_BLOCKS_PER_GROUP(sb)) { + if (k < EXT3_BLOCKS_PER_GROUP(sb) && + ext3_test_allocatable(k, bh)) { j = k; goto got_block; } + + if (k < EXT3_BLOCKS_PER_GROUP(sb) && bh->b_committed_data) { + k = ext3_find_next_zero_bit + ((unsigned long *) bh->b_committed_data, + EXT3_BLOCKS_PER_GROUP(sb), k); + if (k < EXT3_BLOCKS_PER_GROUP(sb)) { + j = k; + goto repeat_bit_search; + } + } } ext3_debug ("Bit not found in block group %d.\n", i); @@ -487,14 +550,14 @@ if (!gdp) { *err = -EIO; unlock_super (sb); - return 0; + return NULL; } if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) break; } if (k >= sb->u.ext3_sb.s_groups_count) { unlock_super (sb); - return 0; + return NULL; } bitmap_nr = load_block_bitmap (sb, i); if (bitmap_nr < 0) @@ -512,7 +575,7 @@ ext3_error (sb, "ext3_new_block", "Free blocks count corrupted for block group %d", i); unlock_super (sb); - return 0; + return NULL; } search_back: @@ -533,7 +596,7 @@ if(DQUOT_ALLOC_BLOCK(sb, inode, 1)) { unlock_super(sb); *err = -EDQUOT; - return 0; + return NULL; } journal_get_write_access(handle, bh); @@ -604,23 +667,20 @@ "block >= blocks count - " "block_group = %d, block=%d", i, j); unlock_super (sb); - return 0; + return NULL; } if (!(bh = getblk (sb->s_dev, j, sb->s_blocksize))) { ext3_error (sb, "ext3_new_block", "cannot get block %d", j); unlock_super (sb); - return 0; + return NULL; } if (!buffer_uptodate(bh)) wait_on_buffer(bh); - /* @@@ This will eventually have to be a data-style operation, - not metadata */ - mark_buffer_uptodate(bh, 1); - journal_get_write_access(handle, bh); memset(bh->b_data, 0, sb->s_blocksize); - journal_dirty_metadata(handle, bh); - brelse (bh); + mark_buffer_uptodate(bh, 1); + /* Don't mark it dirty --- the caller has to decide whether the new + * block needs to be journaled as data or metadata. */ ext3_debug ("allocating block %d. 
" "Goal hits %d of %d.\n", j, goal_hits, goal_attempts); @@ -632,12 +692,12 @@ sb->s_dirt = 1; unlock_super (sb); *err = 0; - return j; + return bh; io_error: *err = -EIO; unlock_super (sb); - return 0; + return NULL; } --- linux-2.2.17.ext3-0.0.4a/fs/ext3/file.c.~1~ Thu Sep 7 14:26:44 2000 +++ linux-2.2.17.ext3-0.0.4a/fs/ext3/file.c Thu Sep 14 16:15:49 2000 @@ -165,6 +165,7 @@ int i, buffercount, write_error, new_buffer; unsigned long limit; handle_t *handle; + int will_journal_data; /* POSIX: mtime/ctime may not change for 0 count */ if (!count) @@ -210,12 +211,21 @@ offset = pos & (sb->s_blocksize - 1); c = sb->s_blocksize - offset; + /* Record this now so that we don't get confused if the user + * changes the flag half-way through! */ + will_journal_data = ext3_should_journal_data(inode); + /* How large a transaction might we need? We can always * underestimate and grow later for really large writes */ - needed = (count >> EXT3_BLOCK_SIZE_BITS(sb)) + 1; - if (needed > EXT3_MAX_TRANS_DATA) - needed = EXT3_MAX_TRANS_DATA; + if (will_journal_data) { + needed = (count >> EXT3_BLOCK_SIZE_BITS(sb)) + 1; + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + } else + needed = 0; + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); + /* Check for overflow.. */ @@ -335,12 +345,17 @@ if (new_buffer) { set_bit(BH_Lock, &bh->b_state); - journal_get_create_access(handle, bh); + if (will_journal_data) + journal_get_create_access(handle, bh); c -= copy_from_user (bh->b_data + offset, buf, c); if (c != sb->s_blocksize) { + /* On getting an EFAULT mid-copy, we end + * up throwing away the whole block. + * Too bad. 
*/ c = 0; unlock_buffer(bh); - journal_release_buffer(handle, bh); + if (will_journal_data) + journal_release_buffer(handle, bh); brelse(bh); if (!written) written = -EFAULT; @@ -353,14 +368,14 @@ ll_rw_block (READ, 1, &bh); wait_on_buffer (bh); if (!buffer_uptodate(bh)) { - journal_release_buffer(handle, bh); brelse (bh); if (!written) written = -EIO; break; } } - journal_get_write_access(handle, bh); + if (will_journal_data) + journal_get_write_access(handle, bh); c -= copy_from_user (bh->b_data + offset, buf, c); } if (!c) { @@ -370,7 +385,10 @@ written = -EFAULT; break; } - journal_dirty_metadata(handle, bh); + if (will_journal_data) + journal_dirty_metadata(handle, bh); + else + journal_dirty_data(handle, bh); update_vm_cache(inode, pos, bh->b_data + offset, c); pos += c; written += c; --- linux-2.2.17.ext3-0.0.4a/fs/ext3/inode.c.~1~ Fri Sep 8 14:12:17 2000 +++ linux-2.2.17.ext3-0.0.4a/fs/ext3/inode.c Wed Sep 13 18:43:27 2000 @@ -141,8 +141,9 @@ #endif } -static int ext3_alloc_block (handle_t *handle, struct inode * inode, - unsigned long goal, int * err) +static struct buffer_head * ext3_alloc_block (handle_t *handle, + struct inode * inode, + unsigned long goal, int * err) { #ifdef EXT3FS_DEBUG static unsigned long alloc_hits = 0, alloc_attempts = 0; @@ -299,10 +300,11 @@ ext3_debug ("goal = %d.\n", goal); - tmp = ext3_alloc_block (handle, inode, goal, err); - if (!tmp) + result = ext3_alloc_block (handle, inode, goal, err); + if (!result) return NULL; - result = getblk (inode->i_dev, tmp, inode->i_sb->s_blocksize); + tmp = result->b_blocknr; + if (*p) { ext3_free_blocks (handle, inode, tmp, 1); brelse (result); @@ -370,12 +372,14 @@ if (!goal) goal = bh->b_blocknr; } - tmp = ext3_alloc_block (handle, inode, goal, err); - if (!tmp) { + result = ext3_alloc_block (handle, inode, goal, err); + if (!result) { + /* Allocation failed: still release the indirect block bh on this path, as the pre-patch code did. */ brelse (bh); return NULL; } - result = getblk (bh->b_dev, tmp, blocksize); + tmp = result->b_blocknr; + journal_get_write_access(handle, bh); if 
(le32_to_cpu(*p)) { /* @@@ Major danger here: we are using up more and more --- linux-2.2.17.ext3-0.0.4a/fs/ext3/truncate.c.~1~ Thu Sep 7 14:26:44 2000 +++ linux-2.2.17.ext3-0.0.4a/fs/ext3/truncate.c Thu Sep 14 16:16:50 2000 @@ -103,16 +103,6 @@ */ -static inline void ext3_bforget(struct buffer_head *buf) -{ - if (buf) { - J_ASSERT(buf->b_cp_transaction == NULL); - J_ASSERT(buf->b_jlist == BJ_None); - J_ASSERT(!test_bit(BH_JWrite, &buf->b_state)); - __bforget(buf); - } -} - /* * The journaling doesn't have to break the rules above, as long as we * do a journal_get_write_access() on the appropriate indirect blocks --- linux-2.2.17.ext3-0.0.4a/fs/jfs/commit.c.~1~ Thu Sep 7 14:26:44 2000 +++ linux-2.2.17.ext3-0.0.4a/fs/jfs/commit.c Mon Sep 11 15:41:28 2000 @@ -138,7 +138,6 @@ if (bh) do { if (buffer_dirty(bh) && !buffer_locked(bh)) { - set_bit(BH_JWrite, &bh->b_state); wbuf[bufs++] = bh; } bh = bh->b_tnext; @@ -162,7 +161,6 @@ bh = commit_transaction->t_datalist; if (bh) do { - clear_bit(BH_JWrite, &bh->b_state); if (buffer_locked(bh)) { unlock_journal(journal); wait_on_buffer(bh); --- linux-2.2.17.ext3-0.0.4a/fs/jfs/journal.c.~1~ Thu Sep 7 14:26:44 2000 +++ linux-2.2.17.ext3-0.0.4a/fs/jfs/journal.c Mon Sep 11 16:05:07 2000 @@ -340,16 +340,26 @@ /* The call to lock_buffer() above should be the only place we ever lock - * a buffer which is being journaled (ignoring the checkpoint lists). */ + * a buffer which is being journaled (ignoring the checkpoint lists). + * + * @@@ This is heavily dependent on the big kernel lock in 2.2! 
*/ void jfs_prelock_buffer_check(struct buffer_head *bh) { - if (bh->b_jlist == 0 && bh->b_transaction == NULL) + transaction_t *transaction = bh->b_transaction; + journal_t *journal; + + if (bh->b_jlist == 0 && transaction == NULL) return; - J_ASSERT(bh->b_jlist == 0 || bh->b_jlist == BJ_LogCtl || bh->b_jlist == BJ_IO); - J_ASSERT(bh->b_transaction != NULL); - J_ASSERT(bh->b_transaction == bh->b_transaction->t_journal->j_committing_transaction); - J_ASSERT(test_bit(BH_JWrite, &bh->b_state)); + J_ASSERT(bh->b_jlist == 0 || bh->b_jlist == BJ_LogCtl || bh->b_jlist == BJ_IO || bh->b_jlist == BJ_Data); + J_ASSERT(transaction != NULL); + journal = transaction->t_journal; + /* Whatever list the buffer is on, its transaction must be the running or the committing one. */ + J_ASSERT(transaction == journal->j_running_transaction || transaction == journal->j_committing_transaction); + /* Buffers on BJ_Data are exempt from the BH_JWrite check; every other journaled buffer must have it set. */ + if (bh->b_jlist != BJ_Data) { + J_ASSERT(test_bit(BH_JWrite, &bh->b_state)); + } } /* We are not allowed to forget the dirty status on any buffer which is --- linux-2.2.17.ext3-0.0.4a/fs/jfs/transaction.c.~1~ Thu Sep 7 14:26:44 2000 +++ linux-2.2.17.ext3-0.0.4a/fs/jfs/transaction.c Thu Sep 14 16:13:53 2000 @@ -491,6 +491,10 @@ transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; + /* The buffer may already belong to this transaction due to + pre-zeroing in the filesystem's new_block code */ + J_ASSERT (bh->b_transaction == transaction || bh->b_transaction == NULL); + J_ASSERT (buffer_locked(bh)); lock_journal(journal); @@ -511,7 +515,7 @@ * rewindable consequences * * Sometimes there is a need to distinguish between metadata which has - * been committed to disk and that which has not. The ext2fs code uses + * been committed to disk and that which has not. 
The ext3fs code uses * this for freeing and allocating space: we have to make sure that we * do not reuse freed space until the deallocation has been committed, * since if we overwrote that space we would make the delete @@ -664,13 +668,11 @@ /* * journal_release_buffer: undo a get_write_access without any buffer - * updates, if the transaction decided in the end that it didn't need - * access. + * updates, if the update decided in the end that it didn't need access. * * journal_get_write_access() can block, so it is quite possible for a * journaling component to decide after the write access is returned - * that global state has changed and the update is no longer required. - */ + * that global state has changed and the update is no longer required. */ void journal_release_buffer (handle_t *handle, struct buffer_head *bh) { @@ -682,7 +684,8 @@ * transaction, then it is safe to release it. In all other * cases, just leave the buffer as it is. */ - if (bh->b_jlist == BJ_Reserved && bh->b_transaction == transaction) { + if (bh->b_jlist == BJ_Reserved && bh->b_transaction == transaction && + !buffer_dirty(bh)) { handle->h_buffer_credits++; journal_refile_buffer(bh); } --- linux-2.2.17.ext3-0.0.4a/include/linux/ext3_fs.h.~1~ Fri Sep 8 20:45:04 2000 +++ linux-2.2.17.ext3-0.0.4a/include/linux/ext3_fs.h Wed Sep 13 18:27:11 2000 @@ -36,7 +36,7 @@ /* * The second extended file system version */ -#define EXT3FS_DATE "2000/09/08" +#define EXT3FS_DATE "2000/09/11" #define EXT3FS_VERSION "0.0.4a" /* @@ -564,8 +564,9 @@ /* balloc.c */ extern int ext3_group_sparse(int group); -extern int ext3_new_block (handle_t *, const struct inode *, unsigned long, - __u32 *, __u32 *, int *); +extern struct buffer_head * ext3_new_block (handle_t *, const struct inode *, + unsigned long, + __u32 *, __u32 *, int *); extern void ext3_free_blocks (handle_t *, const struct inode *, unsigned long, unsigned long); extern unsigned long ext3_count_free_blocks (struct super_block *); @@ -597,6 +598,7 @@ 
 extern void ext3_check_inodes_bitmap (struct super_block *); /* inode.c */ + extern int ext3_bmap (struct inode *, int); extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); @@ -664,6 +666,13 @@ /* symlink.c */ extern struct inode_operations ext3_symlink_inode_operations; + +/* Report whether file data written to this inode should be journaled; currently always returns 0 (never). + * @@@ Fix this in the future to allow data-journaling to be re-enabled per-inode or per-filesystem */ +static inline int ext3_should_journal_data(struct inode *inode) +{ + return 0; +} #endif /* __KERNEL__ */