Merge tag 'ext4-for_linus-6.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Ext4 bug fixes and cleanups, including:

   - hardening against maliciously fuzzed file systems

   - backwards compatibility for the brief period when we attempted to
     ignore zero-width characters

   - avoid potentially BUG'ing if there is a file system corruption
     found during the file system unmount

   - fix free space reporting by statfs when project quotas are enabled
     and the free space is less than the remaining project quota

  Also improve performance when replaying a journal with a very large
  number of revoke records (applicable for Lustre volumes)"
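
The statfs fix listed above amounts to a clamp: with project quotas enabled, the reported free space should never exceed what the project quota still allows. A minimal userspace sketch of that clamping idea, with illustrative variable names and numbers rather than ext4's internal fields:

#include <stdint.h>
#include <stdio.h>

/* Illustration only: report the smaller of the filesystem's free space and
 * the space the project quota still permits, so statfs never over-reports. */
static uint64_t min_u64(uint64_t a, uint64_t b)
{
    return a < b ? a : b;
}

int main(void)
{
    uint64_t fs_free_bytes     = 8ULL << 30;  /* 8 GiB actually free       */
    uint64_t quota_limit_bytes = 5ULL << 30;  /* 5 GiB project quota limit */
    uint64_t quota_used_bytes  = 4ULL << 30;  /* 4 GiB already charged     */

    uint64_t quota_left = quota_limit_bytes > quota_used_bytes ?
                          quota_limit_bytes - quota_used_bytes : 0;
    uint64_t reported_free = min_u64(fs_free_bytes, quota_left);

    printf("reported free: %llu MiB\n",
           (unsigned long long)(reported_free >> 20));  /* prints 1024 MiB */
    return 0;
}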

* tag 'ext4-for_linus-6.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (71 commits)
  ext4: fix OOB read when checking dotdot dir
  ext4: on a remount, only log the ro or r/w state when it has changed
  ext4: correct the error handle in ext4_fallocate()
  ext4: Make sb update interval tunable
  ext4: avoid journaling sb update on error if journal is destroying
  ext4: define ext4_journal_destroy wrapper
  ext4: hash: simplify kzalloc(n * 1, ...) to kzalloc(n, ...)
  jbd2: add a missing data flush during file and fs synchronization
  ext4: don't over-report free space or inodes in statvfs
  ext4: clear DISCARD flag if device does not support discard
  jbd2: remove jbd2_journal_unfile_buffer()
  ext4: reorder capability check last
  ext4: update the comment about mb_optimize_scan
  jbd2: fix off-by-one while erasing journal
  ext4: remove references to bh->b_page
  ext4: goto right label 'out_mmap_sem' in ext4_setattr()
  ext4: fix out-of-bound read in ext4_xattr_inode_dec_ref_all()
  ext4: introduce ITAIL helper
  jbd2: remove redundant function jbd2_journal_has_csum_v2or3_feature
  ext4: remove redundant function ext4_has_metadata_csum
  ...
commit 5c2a430e85 by Linus Torvalds, 2025-03-27 13:27:08 -07:00
35 changed files with 1000 additions and 1145 deletions


@ -238,11 +238,10 @@ When mounting an ext4 filesystem, the following option are accepted:
configured using tune2fs)
data_err=ignore(*)
Just print an error message if an error occurs in a file data buffer in
ordered mode.
Just print an error message if an error occurs in a file data buffer.
data_err=abort
Abort the journal if an error occurs in a file data buffer in ordered
mode.
Abort the journal if an error occurs in a file data buffer.
grpid | bsdgroups
New objects have the group ID of their parent.
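
The data_err entries above are mount options; they travel in the data string of mount(2). A hedged sketch of selecting data_err=abort from C (placeholder paths, must run as root against a scratch ext4 device):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    const char *dev = "/dev/sdX1";      /* placeholder device     */
    const char *dir = "/mnt/scratch";   /* placeholder mount dir  */

    /* data_err=abort: abort the journal on an error in a file data buffer. */
    if (mount(dev, dir, "ext4", 0, "data_err=abort") != 0) {
        perror("mount");
        return 1;
    }
    return 0;
}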


@ -111,9 +111,7 @@ a callback function when the transaction is finally committed to disk,
so that you can do some of your own management. You ask the journalling
layer for calling the callback by simply setting
``journal->j_commit_callback`` function pointer and that function is
called after each transaction commit. You can also use
``transaction->t_private_list`` for attaching entries to a transaction
that need processing when the transaction commits.
called after each transaction commit.
JBD2 also provides a way to block all transaction updates via
jbd2_journal_lock_updates() /
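
The documentation hunk above now points readers only at the journal->j_commit_callback hook. A toy userspace model of that pattern, a single function pointer invoked after each commit, is sketched below; the types are stand-ins, not JBD2's:

#include <stdio.h>

/* Toy stand-ins for journal_t and a transaction commit, illustration only. */
struct toy_journal {
    void (*j_commit_callback)(struct toy_journal *j, int tid);
};

static void commit_transaction(struct toy_journal *j, int tid)
{
    /* ... write the transaction to the log ... */
    if (j->j_commit_callback)
        j->j_commit_callback(j, tid);   /* called after each commit */
}

static void my_commit_hook(struct toy_journal *j, int tid)
{
    (void)j;
    printf("transaction %d committed\n", tid);
}

int main(void)
{
    struct toy_journal j = { .j_commit_callback = my_commit_hook };

    commit_transaction(&j, 1);
    commit_transaction(&j, 2);
    return 0;
}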


@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
(flags & EXT4_MB_USE_ROOT_BLOCKS) ||
capable(CAP_SYS_RESOURCE)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
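
The reordering above ("ext4: reorder capability check last" in the shortlog) keeps capable(CAP_SYS_RESOURCE) at the end of the || chain, so the heavier, audit-visible capability lookup only runs when the cheaper tests have not already granted access. A small sketch of that short-circuit ordering, with made-up check functions:

#include <stdbool.h>
#include <stdio.h>

static int expensive_calls;

static bool cheap_flag_set(unsigned flags)   { return flags & 0x1; }
static bool expensive_capability_check(void) { expensive_calls++; return true; }

static bool may_use_reserve(unsigned flags)
{
    /* || short-circuits left to right, so put the costly check last. */
    return cheap_flag_set(flags) || expensive_capability_check();
}

int main(void)
{
    may_use_reserve(0x1);   /* decided by the cheap flag, no expensive call */
    may_use_reserve(0x0);   /* only now does the expensive check run        */
    printf("expensive checks performed: %d\n", expensive_calls);  /* 1 */
    return 0;
}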


@ -25,7 +25,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return 1;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
@ -48,7 +48,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
@ -67,7 +67,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
@ -89,7 +89,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);


@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
dir->i_sb->s_blocksize);
const int next_offset = ((char *) de - buf) + rlen;
bool fake = is_fake_dir_entry(de);
bool has_csum = ext4_has_metadata_csum(dir->i_sb);
bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb);
if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
error_msg = "rec_len is smaller than minimal";
@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
else if (unlikely(next_offset == size && de->name_len == 1 &&
de->name[0] == '.'))
error_msg = "'.' directory cannot be the last in data block";
else
return 0;
@ -145,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return err;
/* Can we just clear INDEX flag to ignore htree information? */
if (!ext4_has_metadata_csum(sb)) {
if (!ext4_has_feature_metadata_csum(sb)) {
/*
* We don't set the inode dirty flag since it's not
* critical that it gets flushed back to the disk.
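
The new check above rejects a directory block whose final entry is '.', which is what the "fix OOB read when checking dotdot dir" commit relies on: the '..' lookup assumes another entry follows '.' inside the same block. A simplified userspace sketch of that rule (struct layout and sizes are simplified, not ext4's on-disk format):

#include <stdbool.h>
#include <stdio.h>

/* Simplified directory entry: only what the check below needs. */
struct toy_dirent {
    unsigned short rec_len;    /* bytes this entry occupies */
    unsigned char  name_len;
    char           name[4];
};

/* A '.' entry that is the last one in its block is corrupt: the '..' entry
 * the dotdot lookup expects to follow it would be read out of bounds. */
static bool dot_is_last_in_block(unsigned entry_offset,
                                 const struct toy_dirent *de,
                                 unsigned block_size)
{
    unsigned next_offset = entry_offset + de->rec_len;

    return next_offset == block_size &&
           de->name_len == 1 && de->name[0] == '.';
}

int main(void)
{
    struct toy_dirent dot = { .rec_len = 4084, .name_len = 1, .name = "." };

    printf("corrupt block? %s\n",
           dot_is_last_in_block(12, &dot, 4096) ? "yes" : "no");  /* yes */
    return 0;
}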


@ -278,7 +278,10 @@ struct ext4_system_blocks {
/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
#define EXT4_IO_END_UNWRITTEN 0x0001
#define EXT4_IO_END_FAILED 0x0002
#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED)
struct ext4_io_end_vec {
struct list_head list; /* list of io_end_vec */
@ -367,6 +370,8 @@ struct ext4_io_submit {
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
blkbits))
#define EXT4_B_TO_LBLK(inode, offset) \
(round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
@ -1058,7 +1063,8 @@ struct ext4_inode_info {
/* Number of ongoing updates on this inode */
atomic_t i_fc_updates;
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_raw_lock; /* protects updates to the raw inode */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
@ -1097,8 +1103,6 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
spinlock_t i_raw_lock; /* protects updates to the raw inode */
/*
* File creation time. Its function is same as that of
* struct timespec64 i_{a,c,m}time in the generic inode.
@ -1141,6 +1145,7 @@ struct ext4_inode_info {
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
#endif
spinlock_t i_block_reservation_lock;
/* Lock protecting lists below */
spinlock_t i_completed_io_lock;
@ -1151,8 +1156,6 @@ struct ext4_inode_info {
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
spinlock_t i_block_reservation_lock;
/*
* Transactions that contain inode's metadata needed to complete
* fsync and fdatasync, respectively.
@ -1606,6 +1609,8 @@ struct ext4_sb_info {
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
unsigned int s_mb_best_avail_max_trim_order;
unsigned int s_sb_update_sec;
unsigned int s_sb_update_kb;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@ -1821,7 +1826,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/
enum {
EXT4_MF_MNTDIR_SAMPLED,
EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */
};
static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
@ -2232,15 +2238,32 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly);
/*
* Superblock flags
*/
#define EXT4_FLAGS_RESIZING 0
#define EXT4_FLAGS_SHUTDOWN 1
#define EXT4_FLAGS_BDEV_IS_DAX 2
enum {
EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */
EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */
EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */
EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */
};
static inline int ext4_forced_shutdown(struct super_block *sb)
{
return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}
static inline int ext4_emergency_ro(struct super_block *sb)
{
return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
}
static inline int ext4_emergency_state(struct super_block *sb)
{
if (unlikely(ext4_forced_shutdown(sb)))
return -EIO;
if (unlikely(ext4_emergency_ro(sb)))
return -EROFS;
return 0;
}
/*
* Default values for user and/or group using reserved blocks
*/
@ -2277,6 +2300,13 @@ static inline int ext4_forced_shutdown(struct super_block *sb)
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
/*
* Default values for superblock update
*/
#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */
#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */
/*
* Minimum number of groups in a flexgroup before we separate out
* directories into the first block group of a flexgroup
@ -2810,8 +2840,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct ext4_dir_entry_2 *dirent,
struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct buffer_head *bh,
extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
@ -3001,6 +3030,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_truncate_page_cache_block_range(struct inode *inode,
loff_t start, loff_t end);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
@ -3259,14 +3290,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
extern int ext4_register_li_request(struct super_block *sb,
ext4_group_t first_not_zeroed);
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
return ext4_has_feature_metadata_csum(sb);
}
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
return ext4_has_feature_gdt_csum(sb) ||
ext4_has_feature_metadata_csum(sb);
}
#define ext4_read_incompat_64bit_val(es, name) \
@ -3546,11 +3573,11 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct folio **foliop);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct folio *folio);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
struct folio **foliop,
void **fsdata);
extern int ext4_generic_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
struct folio **foliop,
void **fsdata, bool da);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
@ -3785,34 +3812,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
/* For ioend & aio unwritten conversion wait queues */
#define EXT4_WQ_HASH_SZ 37
#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
EXT4_WQ_HASH_SZ])
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
extern int ext4_resize_begin(struct super_block *sb);
extern int ext4_resize_end(struct super_block *sb, bool update_backups);
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
struct ext4_io_end *io_end)
static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end)
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
}
}
extern const struct iomap_ops ext4_iomap_ops;
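
The ext4.h changes above turn the superblock flag constants into an enum, add EXT4_FLAGS_EMERGENCY_RO, and introduce ext4_emergency_state(), which later hunks substitute for bare ext4_forced_shutdown() checks: a shutdown maps to -EIO, emergency read-only to -EROFS, otherwise 0. A compact userspace model of that helper (the flags word and bit numbers are stand-ins for struct ext4_sb_info):

#include <errno.h>
#include <stdio.h>

enum { FLAG_SHUTDOWN, FLAG_EMERGENCY_RO };  /* bit numbers, mirroring the enum above */

static int test_bit(int bit, unsigned long word) { return (word >> bit) & 1; }

/* Mirrors the decision order of ext4_emergency_state(): a full shutdown is
 * reported ahead of an emergency read-only condition. */
static int emergency_state(unsigned long flags)
{
    if (test_bit(FLAG_SHUTDOWN, flags))
        return -EIO;
    if (test_bit(FLAG_EMERGENCY_RO, flags))
        return -EROFS;
    return 0;
}

int main(void)
{
    printf("%d\n", emergency_state(0));                        /* 0      */
    printf("%d\n", emergency_state(1UL << FLAG_EMERGENCY_RO)); /* -EROFS */
    printf("%d\n", emergency_state(1UL << FLAG_SHUTDOWN));     /* -EIO   */
    return 0;
}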


@ -63,12 +63,14 @@ static void ext4_put_nojournal(handle_t *handle)
*/
static int ext4_journal_check_start(struct super_block *sb)
{
int ret;
journal_t *journal;
might_sleep();
if (unlikely(ext4_forced_shutdown(sb)))
return -EIO;
ret = ext4_emergency_state(sb);
if (unlikely(ret))
return ret;
if (WARN_ON_ONCE(sb_rdonly(sb)))
return -EROFS;
@ -244,7 +246,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
}
} else
ext4_check_bdev_write_error(sb);
if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
if (trigger_type == EXT4_JTR_NONE ||
!ext4_has_feature_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
jbd2_journal_set_triggers(bh,
@ -331,7 +334,8 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line,
err);
return err;
}
if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
if (trigger_type == EXT4_JTR_NONE ||
!ext4_has_feature_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
jbd2_journal_set_triggers(bh,


@ -122,90 +122,6 @@
#define EXT4_HT_EXT_CONVERT 11
#define EXT4_HT_MAX 12
/**
* struct ext4_journal_cb_entry - Base structure for callback information.
*
* This struct is a 'seed' structure for a using with your own callback
* structs. If you are using callbacks you must allocate one of these
* or another struct of your own definition which has this struct
* as it's first element and pass it to ext4_journal_callback_add().
*/
struct ext4_journal_cb_entry {
/* list information for other callbacks attached to the same handle */
struct list_head jce_list;
/* Function to call with this callback structure */
void (*jce_func)(struct super_block *sb,
struct ext4_journal_cb_entry *jce, int error);
/* user data goes here */
};
/**
* ext4_journal_callback_add: add a function to call after transaction commit
* @handle: active journal transaction handle to register callback on
* @func: callback function to call after the transaction has committed:
* @sb: superblock of current filesystem for transaction
* @jce: returned journal callback data
* @rc: journal state at commit (0 = transaction committed properly)
* @jce: journal callback data (internal and function private data struct)
*
* The registered function will be called in the context of the journal thread
* after the transaction for which the handle was created has completed.
*
* No locks are held when the callback function is called, so it is safe to
* call blocking functions from within the callback, but the callback should
* not block or run for too long, or the filesystem will be blocked waiting for
* the next transaction to commit. No journaling functions can be used, or
* there is a risk of deadlock.
*
* There is no guaranteed calling order of multiple registered callbacks on
* the same transaction.
*/
static inline void _ext4_journal_callback_add(handle_t *handle,
struct ext4_journal_cb_entry *jce)
{
/* Add the jce to transaction's private list */
list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
}
static inline void ext4_journal_callback_add(handle_t *handle,
void (*func)(struct super_block *sb,
struct ext4_journal_cb_entry *jce,
int rc),
struct ext4_journal_cb_entry *jce)
{
struct ext4_sb_info *sbi =
EXT4_SB(handle->h_transaction->t_journal->j_private);
/* Add the jce to transaction's private list */
jce->jce_func = func;
spin_lock(&sbi->s_md_lock);
_ext4_journal_callback_add(handle, jce);
spin_unlock(&sbi->s_md_lock);
}
/**
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
* @jce: registered journal callback entry to unregister
* Return true if object was successfully removed
*/
static inline bool ext4_journal_callback_try_del(handle_t *handle,
struct ext4_journal_cb_entry *jce)
{
bool deleted;
struct ext4_sb_info *sbi =
EXT4_SB(handle->h_transaction->t_journal->j_private);
spin_lock(&sbi->s_md_lock);
deleted = !list_empty(&jce->jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
return deleted;
}
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 1;
}
/*
* Pass journal explicitly as it may not be cached in the sbi->s_journal in some
* cases
*/
static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal)
{
int err = 0;
/*
* At this point only two things can be operating on the journal.
* JBD2 thread performing transaction commit and s_sb_upd_work
* issuing sb update through the journal. Once we set
* EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not
* queue s_sb_upd_work and ext4_force_commit() makes sure any
* ext4_handle_error() calls from the running transaction commit are
* finished. Hence no new s_sb_upd_work can be queued after we
* flush it here.
*/
ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);
ext4_force_commit(sbi->s_sb);
flush_work(&sbi->s_sb_upd_work);
err = jbd2_journal_destroy(journal);
sbi->s_journal = NULL;
return err;
}
#endif /* _EXT4_JBD2_H */
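
The ext4_journal_destroy() wrapper above encodes a teardown order: set EXT4_MF_JOURNAL_DESTROY so no new superblock-update work can be queued, force a commit, flush the work already queued, and only then call jbd2_journal_destroy(). A minimal userspace model of that "block new work, drain old work, then tear down" ordering (the flag and the work counter are toy stand-ins):

#include <stdbool.h>
#include <stdio.h>

static bool destroying;       /* models EXT4_MF_JOURNAL_DESTROY     */
static int  pending_updates;  /* models queued s_sb_upd_work items  */

static void queue_sb_update(void)
{
    if (destroying)           /* error paths refuse to queue new work */
        return;
    pending_updates++;
}

static void flush_sb_update_work(void)
{
    while (pending_updates > 0) {   /* models flush_work() draining the queue */
        pending_updates--;
        printf("superblock update written\n");
    }
}

int main(void)
{
    queue_sb_update();        /* work queued before teardown starts  */

    destroying = true;        /* 1. mark the journal as going away   */
    flush_sb_update_work();   /* 2. drain already-queued updates     */
    queue_sb_update();        /* 3. late requests are now ignored    */
    printf("pending after teardown: %d\n", pending_updates);  /* prints 0 */
    return 0;
}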


@ -63,7 +63,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
{
struct ext4_extent_tail *et;
if (!ext4_has_metadata_csum(inode->i_sb))
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
et = find_ext4_extent_tail(eh);
@ -77,7 +77,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
{
struct ext4_extent_tail *et;
if (!ext4_has_metadata_csum(inode->i_sb))
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
et = find_ext4_extent_tail(eh);
@ -4568,131 +4568,65 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
unsigned int max_blocks;
loff_t new_size = 0;
int ret = 0;
int flags;
int credits;
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
loff_t end = offset + len;
ext4_lblk_t start_lblk, end_lblk;
unsigned int blocksize = i_blocksize(inode);
unsigned int blkbits = inode->i_blkbits;
int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
WARN_ON_ONCE(!inode_is_locked(inode));
/*
* Round up offset. This is not fallocate, we need to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
* range. Here, start and partial_begin are inclusive, end and
* partial_end are exclusive.
*/
start = round_up(offset, 1 << blkbits);
end = round_down((offset + len), 1 << blkbits);
if (start < offset || end > offset + len)
return -EINVAL;
partial_begin = offset & ((1 << blkbits) - 1);
partial_end = (offset + len) & ((1 << blkbits) - 1);
lblk = start >> blkbits;
max_blocks = (end >> blkbits);
if (max_blocks < lblk)
max_blocks = 0;
else
max_blocks -= lblk;
inode_lock(inode);
/*
* Indirect files do not support unwritten extents
*/
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
ret = -EOPNOTSUPP;
goto out_mutex;
}
/* Indirect files do not support unwritten extents */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
(end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
goto out_mutex;
return ret;
}
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
ret = file_modified(file);
if (ret)
goto out_mutex;
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
round_down(offset, 1 << blkbits) >> blkbits,
(round_up((offset + len), 1 << blkbits) -
round_down(offset, 1 << blkbits)) >> blkbits,
new_size, flags);
if (ret)
goto out_mutex;
if (!IS_ALIGNED(offset | end, blocksize)) {
ext4_lblk_t alloc_lblk = offset >> blkbits;
ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
new_size, flags);
if (ret)
return ret;
}
ret = ext4_update_disksize_before_punch(inode, offset, len);
if (ret)
return ret;
/* Now release the pages and zero block aligned part of pages */
ret = ext4_truncate_page_cache_block_range(inode, offset, end);
if (ret)
return ret;
/* Zero range excluding the unaligned edges */
if (max_blocks > 0) {
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);
start_lblk = EXT4_B_TO_LBLK(inode, offset);
end_lblk = end >> blkbits;
if (end_lblk > start_lblk) {
ext4_lblk_t zero_blks = end_lblk - start_lblk;
/*
* Prevent page faults from reinstantiating pages we have
* released from page cache.
*/
filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret) {
filemap_invalidate_unlock(mapping);
goto out_mutex;
}
ret = ext4_update_disksize_before_punch(inode, offset, len);
if (ret) {
filemap_invalidate_unlock(mapping);
goto out_mutex;
}
/*
* For journalled data we need to write (and checkpoint) pages
* before discarding page cache to avoid inconsitent data on
* disk in case of crash before zeroing trans is committed.
*/
if (ext4_should_journal_data(inode)) {
ret = filemap_write_and_wait_range(mapping, start,
end - 1);
if (ret) {
filemap_invalidate_unlock(mapping);
goto out_mutex;
}
}
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
filemap_invalidate_unlock(mapping);
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
new_size, flags);
if (ret)
goto out_mutex;
return ret;
}
if (!partial_begin && !partial_end)
goto out_mutex;
/* Finish zeroing out if it doesn't contain partial block */
if (IS_ALIGNED(offset | end, blocksize))
return ret;
/*
* In worst case we have to writeout two nonadjacent unwritten
@ -4705,27 +4639,69 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
goto out_mutex;
return ret;
}
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
if (ret)
goto out_handle;
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
ext4_update_inode_fsync_trans(handle, inode, 1);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
out_mutex:
inode_unlock(inode);
return ret;
}
static long ext4_do_fallocate(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
loff_t end = offset + len;
loff_t new_size = 0;
ext4_lblk_t start_lblk, len_lblk;
int ret;
trace_ext4_fallocate_enter(inode, offset, len, mode);
WARN_ON_ONCE(!inode_is_locked(inode));
start_lblk = offset >> inode->i_blkbits;
len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
/* We only support preallocation for extent-based files only. */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
ret = -EOPNOTSUPP;
goto out;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
goto out;
}
ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
if (ret)
goto out;
if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
EXT4_I(inode)->i_sync_tid);
}
out:
trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
return ret;
}
@ -4739,12 +4715,8 @@ out_mutex:
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
loff_t new_size = 0;
unsigned int max_blocks;
int ret = 0;
int flags;
ext4_lblk_t lblk;
unsigned int blkbits = inode->i_blkbits;
struct address_space *mapping = file->f_mapping;
int ret;
/*
* Encrypted inodes can't handle collapse range or insert
@ -4764,73 +4736,47 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
inode_lock(inode);
ret = ext4_convert_inline_data(inode);
inode_unlock(inode);
if (ret)
goto exit;
if (mode & FALLOC_FL_PUNCH_HOLE) {
ret = ext4_punch_hole(file, offset, len);
goto exit;
}
if (mode & FALLOC_FL_COLLAPSE_RANGE) {
ret = ext4_collapse_range(file, offset, len);
goto exit;
}
if (mode & FALLOC_FL_INSERT_RANGE) {
ret = ext4_insert_range(file, offset, len);
goto exit;
}
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = ext4_zero_range(file, offset, len, mode);
goto exit;
}
trace_ext4_fallocate_enter(inode, offset, len, mode);
lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
inode_lock(inode);
/*
* We only support preallocation for extent-based files only
*/
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
ret = -EOPNOTSUPP;
goto out;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
if (ret)
goto out;
}
goto out_inode_lock;
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
ret = file_modified(file);
if (ret)
goto out;
goto out_inode_lock;
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
if (ret)
goto out;
if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
EXT4_I(inode)->i_sync_tid);
if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) {
ret = ext4_do_fallocate(file, offset, len, mode);
goto out_inode_lock;
}
out:
/*
* Follow-up operations will drop page cache, hold invalidate lock
* to prevent page faults from reinstantiating pages we have
* released from page cache.
*/
filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
goto out_invalidate_lock;
if (mode & FALLOC_FL_PUNCH_HOLE)
ret = ext4_punch_hole(file, offset, len);
else if (mode & FALLOC_FL_COLLAPSE_RANGE)
ret = ext4_collapse_range(file, offset, len);
else if (mode & FALLOC_FL_INSERT_RANGE)
ret = ext4_insert_range(file, offset, len);
else if (mode & FALLOC_FL_ZERO_RANGE)
ret = ext4_zero_range(file, offset, len, mode);
else
ret = -EOPNOTSUPP;
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
out_inode_lock:
inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
exit:
return ret;
}
@ -5332,109 +5278,72 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
ext4_lblk_t punch_start, punch_stop;
loff_t end = offset + len;
ext4_lblk_t start_lblk, end_lblk;
handle_t *handle;
unsigned int credits;
loff_t new_size, ioffset;
loff_t start, new_size;
int ret;
/*
* We need to test this early because xfstests assumes that a
* collapse range of (0, 1) will return EOPNOTSUPP if the file
* system does not support collapse range.
*/
trace_ext4_collapse_range(inode, offset, len);
WARN_ON_ONCE(!inode_is_locked(inode));
/* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
/* Collapse range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
trace_ext4_collapse_range(inode, offset, len);
punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
if (offset + len >= inode->i_size) {
ret = -EINVAL;
goto out_mutex;
}
/* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ret = -EOPNOTSUPP;
goto out_mutex;
}
/* Wait for existing dio to complete */
inode_dio_wait(inode);
ret = file_modified(file);
if (ret)
goto out_mutex;
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
goto out_mmap;
if (end >= inode->i_size)
return -EINVAL;
/*
* Write tail of the last page before removed range and data that
* will be shifted since they will get removed from the page cache
* below. We are also protected from pages becoming dirty by
* i_rwsem and invalidate_lock.
* Need to round down offset to be aligned with page size boundary
* for page size > block size.
*/
ioffset = round_down(offset, PAGE_SIZE);
/*
* Write tail of the last page before removed range since it will get
* removed from the page cache below.
*/
ret = filemap_write_and_wait_range(mapping, ioffset, offset);
start = round_down(offset, PAGE_SIZE);
ret = filemap_write_and_wait_range(mapping, start, offset);
if (!ret)
ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (ret)
goto out_mmap;
/*
* Write data that will be shifted to preserve them when discarding
* page cache below. We are also protected from pages becoming dirty
* by i_rwsem and invalidate_lock.
*/
ret = filemap_write_and_wait_range(mapping, offset + len,
LLONG_MAX);
if (ret)
goto out_mmap;
truncate_pagecache(inode, ioffset);
return ret;
truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_mmap;
}
if (IS_ERR(handle))
return PTR_ERR(handle);
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
start_lblk = offset >> inode->i_blkbits;
end_lblk = (offset + len) >> inode->i_blkbits;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
goto out_handle;
}
ext4_discard_preallocations(inode);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start, SHIFT_LEFT);
ret = ext4_ext_shift_extents(inode, handle, end_lblk,
end_lblk - start_lblk, SHIFT_LEFT);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
goto out_handle;
}
new_size = inode->i_size - len;
@ -5442,18 +5351,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
EXT4_I(inode)->i_disksize = new_size;
up_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_handle;
ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
out_handle:
ext4_journal_stop(handle);
out_mmap:
filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
}
@ -5473,100 +5380,63 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
int ret = 0, depth, split_flag = 0;
loff_t ioffset;
int ret, depth, split_flag = 0;
loff_t start;
/*
* We need to test this early because xfstests assumes that an
* insert range of (0, 1) will return EOPNOTSUPP if the file
* system does not support insert range.
*/
trace_ext4_insert_range(inode, offset, len);
WARN_ON_ONCE(!inode_is_locked(inode));
/* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
/* Insert range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
trace_ext4_insert_range(inode, offset, len);
offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
inode_lock(inode);
/* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ret = -EOPNOTSUPP;
goto out_mutex;
}
/* Check whether the maximum file size would be exceeded */
if (len > inode->i_sb->s_maxbytes - inode->i_size) {
ret = -EFBIG;
goto out_mutex;
}
/* Offset must be less than i_size */
if (offset >= inode->i_size) {
ret = -EINVAL;
goto out_mutex;
}
/* Wait for existing dio to complete */
inode_dio_wait(inode);
ret = file_modified(file);
if (ret)
goto out_mutex;
if (offset >= inode->i_size)
return -EINVAL;
/* Check whether the maximum file size would be exceeded */
if (len > inode->i_sb->s_maxbytes - inode->i_size)
return -EFBIG;
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
* Write out all dirty pages. Need to round down to align start offset
* to page size boundary for page size > block size.
*/
filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
start = round_down(offset, PAGE_SIZE);
ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
if (ret)
goto out_mmap;
return ret;
/*
* Need to round down to align start offset to page size boundary
* for page size > block size.
*/
ioffset = round_down(offset, PAGE_SIZE);
/* Write out all dirty pages */
ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
LLONG_MAX);
if (ret)
goto out_mmap;
truncate_pagecache(inode, ioffset);
truncate_pagecache(inode, start);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_mmap;
}
if (IS_ERR(handle))
return PTR_ERR(handle);
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
goto out_handle;
start_lblk = offset >> inode->i_blkbits;
len_lblk = len >> inode->i_blkbits;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
path = ext4_find_extent(inode, start_lblk, NULL, 0);
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
goto out_stop;
goto out_handle;
}
depth = ext_depth(inode);
@ -5576,16 +5446,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ee_len = ext4_ext_get_actual_len(extent);
/*
* If offset_lblk is not the starting block of extent, split
* the extent @offset_lblk
* If start_lblk is not the starting block of extent, split
* the extent @start_lblk
*/
if ((offset_lblk > ee_start_lblk) &&
(offset_lblk < (ee_start_lblk + ee_len))) {
if ((start_lblk > ee_start_lblk) &&
(start_lblk < (ee_start_lblk + ee_len))) {
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
path = ext4_split_extent_at(handle, inode, path,
offset_lblk, split_flag,
start_lblk, split_flag,
EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@ -5594,32 +5464,29 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
goto out_stop;
goto out_handle;
}
}
ext4_free_ext_path(path);
ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
/*
* if offset_lblk lies in a hole which is at start of file, use
* if start_lblk lies in a hole which is at start of file, use
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
if (ret)
goto out_handle;
ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
out_handle:
ext4_journal_stop(handle);
out_mmap:
filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
}
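
The extents.c rework above routes every fallocate mode through one locking scheme inside ext4_fallocate(); from userspace the interface is unchanged and remains fallocate(2). A short example exercising the modes touched here, assuming Linux with a glibc that exposes FALLOC_FL_ZERO_RANGE and an ext4 file at the placeholder path below:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/mnt/ext4/testfile", O_RDWR | O_CREAT, 0644);  /* placeholder path */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* Preallocate 1 MiB of unwritten extents (plain allocate mode). */
    if (fallocate(fd, 0, 0, 1 << 20) != 0)
        perror("fallocate allocate");

    /* Zero a block-aligned range without changing the file size. */
    if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
                  64 << 10, 128 << 10) != 0)
        perror("fallocate zero range");

    /* Punch a hole; PUNCH_HOLE must be combined with KEEP_SIZE. */
    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  256 << 10, 64 << 10) != 0)
        perror("fallocate punch hole");

    close(fd);
    return 0;
}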


@ -1551,7 +1551,6 @@ retry:
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
return;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,


@ -688,10 +688,12 @@ out:
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
int ret;
struct inode *inode = file_inode(iocb->ki_filp);
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
return ret;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
@ -700,7 +702,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_ATOMIC) {
size_t len = iov_iter_count(from);
int ret;
if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
len > EXT4_SB(inode->i_sb)->s_awu_max)
@ -800,11 +801,16 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
int ret;
struct inode *inode = file->f_mapping->host;
struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (file->f_mode & FMODE_WRITE)
ret = ext4_emergency_state(inode->i_sb);
else
ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
if (unlikely(ret))
return ret;
/*
* We don't support synchronous mappings for non-DAX files and
@ -835,7 +841,8 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
!sb_start_intwrite_trylock(sb))
return 0;
ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
@ -878,8 +885,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (filp->f_mode & FMODE_WRITE)
ret = ext4_emergency_state(inode->i_sb);
else
ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
if (unlikely(ret))
return ret;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
if (ret)


@ -132,20 +132,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
bool needs_barrier = false;
struct inode *inode = file->f_mapping->host;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
return ret;
ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
if (sb_rdonly(inode->i_sb)) {
/* Make sure that we read updated s_ext4_flags value */
smp_rmb();
if (ext4_forced_shutdown(inode->i_sb))
ret = -EROFS;
if (sb_rdonly(inode->i_sb))
goto out;
}
if (!EXT4_SB(inode->i_sb)->s_journal) {
ret = ext4_fsync_nojournal(file, start, end, datasync,


@ -302,7 +302,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
if (len && IS_CASEFOLDED(dir) &&
(!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
buff = kzalloc(PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
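
The hash.c cleanup relies on sizeof(char) being 1 by definition, so multiplying by it in the allocation size was pure noise. A trivial illustration:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    _Static_assert(sizeof(char) == 1, "guaranteed by the C standard");

    char *a = malloc(sizeof(char) * 4096);  /* same request ...             */
    char *b = malloc(4096);                 /* ... as this simpler spelling */

    printf("%zu %zu\n", sizeof(char) * 4096, (size_t)4096);  /* 4096 4096 */
    free(a);
    free(b);
    return 0;
}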


@ -951,8 +951,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
sb = dir->i_sb;
sbi = EXT4_SB(sb);
if (unlikely(ext4_forced_shutdown(sb)))
return ERR_PTR(-EIO);
ret2 = ext4_emergency_state(sb);
if (unlikely(ret2))
return ERR_PTR(ret2);
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
@ -1282,7 +1283,7 @@ got:
inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
if (ext4_has_metadata_csum(sb)) {
if (ext4_has_feature_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
@ -1298,7 +1299,7 @@ got:
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
if (ext4_has_feature_inline_data(sb) &&
(!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
(!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);


@ -20,6 +20,11 @@
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
struct inode *inode,
void **fsdata);
static int ext4_get_inline_size(struct inode *inode)
{
if (EXT4_I(inode)->i_inline_off)
@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
struct ext4_inode *raw_inode;
int cp_len = 0;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
BUG_ON(!EXT4_I(inode)->i_inline_off);
@ -652,6 +657,95 @@ out_nofolio:
return ret;
}
/*
* Prepare the write for the inline data.
* If the data can be written into the inode, we just read
* the page and make it uptodate, and start the journal.
* Otherwise read the page, makes it dirty so that it can be
* handle in writepages(the i_disksize update is left to the
* normal ext4_da_write_end).
*/
int ext4_generic_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
struct folio **foliop,
void **fsdata, bool da)
{
int ret;
handle_t *handle;
struct folio *folio;
struct ext4_iloc iloc;
int retries = 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_release_bh;
}
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
goto out_stop_journal;
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
if (!da) {
brelse(iloc.bh);
/* Retry inside */
return ext4_convert_inline_data_to_extent(mapping, inode);
}
ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_journal;
goto out_release_bh;
}
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
goto out_stop_journal;
}
down_read(&EXT4_I(inode)->xattr_sem);
/* Someone else had converted it to extent */
if (!ext4_has_inline_data(inode)) {
ret = 0;
goto out_release_folio;
}
if (!folio_test_uptodate(folio)) {
ret = ext4_read_inline_folio(inode, folio);
if (ret < 0)
goto out_release_folio;
}
ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE);
if (ret)
goto out_release_folio;
*foliop = folio;
up_read(&EXT4_I(inode)->xattr_sem);
brelse(iloc.bh);
return 1;
out_release_folio:
up_read(&EXT4_I(inode)->xattr_sem);
folio_unlock(folio);
folio_put(folio);
out_stop_journal:
ext4_journal_stop(handle);
out_release_bh:
brelse(iloc.bh);
return ret;
}
/*
* Try to write data in the inode.
* If the inode has inline data, check whether the new write can be
@ -663,81 +757,10 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop)
{
int ret;
handle_t *handle;
struct folio *folio;
struct ext4_iloc iloc;
if (pos + len > ext4_get_max_inline_size(inode))
goto convert;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
/*
* The possible write could happen in the inode,
* so try to reserve the space in inode first.
*/
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
goto out;
}
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
goto out;
/* We don't have space in inline inode, so convert it to extent. */
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
brelse(iloc.bh);
goto convert;
}
ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
EXT4_JTR_NONE);
if (ret)
goto out;
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
goto out;
}
*foliop = folio;
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
ret = 0;
folio_unlock(folio);
folio_put(folio);
goto out_up_read;
}
if (!folio_test_uptodate(folio)) {
ret = ext4_read_inline_folio(inode, folio);
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
goto out_up_read;
}
}
ret = 1;
handle = NULL;
out_up_read:
up_read(&EXT4_I(inode)->xattr_sem);
out:
if (handle && (ret != 1))
ext4_journal_stop(handle);
brelse(iloc.bh);
return ret;
convert:
return ext4_convert_inline_data_to_extent(mapping, inode);
return ext4_convert_inline_data_to_extent(mapping, inode);
return ext4_generic_write_inline_data(mapping, inode, pos, len,
foliop, NULL, false);
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
@ -881,94 +904,6 @@ out:
return ret;
}
/*
* Prepare the write for the inline data.
* If the data can be written into the inode, we just read
* the page and make it uptodate, and start the journal.
* Otherwise read the page, makes it dirty so that it can be
* handle in writepages(the i_disksize update is left to the
* normal ext4_da_write_end).
*/
int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
struct folio **foliop,
void **fsdata)
{
int ret;
handle_t *handle;
struct folio *folio;
struct ext4_iloc iloc;
int retries = 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
goto out_journal;
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
ret = ext4_da_convert_inline_data_to_extent(mapping,
inode,
fsdata);
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_journal;
goto out;
}
/*
* We cannot recurse into the filesystem as the transaction
* is already started.
*/
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
goto out_journal;
}
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
ret = 0;
goto out_release_page;
}
if (!folio_test_uptodate(folio)) {
ret = ext4_read_inline_folio(inode, folio);
if (ret < 0)
goto out_release_page;
}
ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
EXT4_JTR_NONE);
if (ret)
goto out_release_page;
up_read(&EXT4_I(inode)->xattr_sem);
*foliop = folio;
brelse(iloc.bh);
return 1;
out_release_page:
up_read(&EXT4_I(inode)->xattr_sem);
folio_unlock(folio);
folio_put(folio);
out_journal:
ext4_journal_stop(handle);
out:
brelse(iloc.bh);
return ret;
}
#ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
void *inline_start, int inline_size)
@ -1012,7 +947,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
int err;
struct ext4_dir_entry_2 *de;
err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
err = ext4_find_dest_de(dir, iloc->bh, inline_start,
inline_size, fname, &de);
if (err)
return err;
@ -1146,7 +1081,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
inline_size - EXT4_INLINE_DOTDOT_SIZE);
if (ext4_has_metadata_csum(inode->i_sb))
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
inode->i_size = inode->i_sb->s_blocksize;
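
The inline.c rework folds the delayed-allocation and regular write-begin paths into ext4_generic_write_inline_data() with a bool da argument, leaving the old entry points as thin wrappers. A generic sketch of that de-duplication pattern, with illustrative names rather than the ext4 functions:

#include <stdbool.h>
#include <stdio.h>

/* One shared implementation, parameterized on the only behavioural difference. */
static int write_begin_common(long pos, unsigned len, bool delayed_alloc)
{
    printf("prepare write at %ld..%ld (%s path)\n",
           pos, (long)(pos + len), delayed_alloc ? "delalloc" : "regular");
    return 0;
}

/* The former duplicate entry points become trivial wrappers. */
static int write_begin(long pos, unsigned len)
{
    return write_begin_common(pos, len, false);
}

static int da_write_begin(long pos, unsigned len)
{
    return write_begin_common(pos, len, true);
}

int main(void)
{
    write_begin(0, 512);
    da_write_begin(4096, 512);
    return 0;
}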


@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
@ -93,7 +94,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
!ext4_has_metadata_csum(inode->i_sb))
!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
provided = le16_to_cpu(raw->i_checksum_lo);
@ -114,7 +115,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
!ext4_has_metadata_csum(inode->i_sb))
!ext4_has_feature_metadata_csum(inode->i_sb))
return;
csum = ext4_inode_csum(inode, raw, ei);
@ -751,7 +752,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
flags &= EXT4_MAP_FLAGS;
/* Dummy buffer_head? Set non-atomically. */
if (!bh->b_page) {
if (!bh->b_folio) {
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
return;
}
@ -1149,8 +1150,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
return ret;
trace_ext4_write_begin(inode, pos, len);
/*
@ -2225,7 +2227,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
mpd->io_submit.io_end->handle = handle->h_rsv_handle;
handle->h_rsv_handle = NULL;
}
ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
ext4_set_io_unwritten_flag(mpd->io_submit.io_end);
}
BUG_ON(map->m_len == 0);
@ -2273,7 +2275,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
if (err < 0) {
struct super_block *sb = inode->i_sb;
if (ext4_forced_shutdown(sb))
if (ext4_emergency_state(sb))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
@ -2599,10 +2601,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
ret = -EROFS;
ret = ext4_emergency_state(mapping->host->i_sb);
if (unlikely(ret))
goto out_writepages;
}
/*
* If we have inline data and arrive here, it means that
@ -2817,8 +2818,9 @@ static int ext4_writepages(struct address_space *mapping,
int ret;
int alloc_ctx;
if (unlikely(ext4_forced_shutdown(sb)))
return -EIO;
ret = ext4_emergency_state(sb);
if (unlikely(ret))
return ret;
alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
@ -2858,8 +2860,9 @@ static int ext4_dax_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int alloc_ctx;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
return ret;
alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
@ -2915,8 +2918,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
struct inode *inode = mapping->host;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
return ret;
index = pos >> PAGE_SHIFT;
@ -2929,8 +2933,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
foliop, fsdata);
ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
foliop, fsdata, true);
if (ret < 0)
return ret;
if (ret == 1)
@ -3906,6 +3910,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
static inline void ext4_truncate_folio(struct inode *inode,
loff_t start, loff_t end)
{
unsigned long blocksize = i_blocksize(inode);
struct folio *folio;
/* Nothing to be done if no complete block needs to be truncated. */
if (round_up(start, blocksize) >= round_down(end, blocksize))
return;
folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
if (IS_ERR(folio))
return;
if (folio_mkclean(folio))
folio_mark_dirty(folio);
folio_unlock(folio);
folio_put(folio);
}
int ext4_truncate_page_cache_block_range(struct inode *inode,
loff_t start, loff_t end)
{
unsigned long blocksize = i_blocksize(inode);
int ret;
/*
* For journalled data we need to write (and checkpoint) pages
* before discarding page cache to avoid inconsitent data on disk
* in case of crash before freeing or unwritten converting trans
* is committed.
*/
if (ext4_should_journal_data(inode)) {
ret = filemap_write_and_wait_range(inode->i_mapping, start,
end - 1);
if (ret)
return ret;
goto truncate_pagecache;
}
/*
* If the block size is less than the page size, the file's mapped
* blocks within one page could be freed or converted to unwritten.
* So it's necessary to remove writable userspace mappings, and then
* ext4_page_mkwrite() can be called during subsequent write access
* to these partial folios.
*/
if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
blocksize < PAGE_SIZE && start < inode->i_size) {
loff_t page_boundary = round_up(start, PAGE_SIZE);
ext4_truncate_folio(inode, start, min(page_boundary, end));
if (end > page_boundary)
ext4_truncate_folio(inode,
round_down(end, PAGE_SIZE), end);
}
truncate_pagecache:
truncate_pagecache_range(inode, start, end - 1);
return 0;
}
static void ext4_wait_dax_page(struct inode *inode)
{
filemap_invalidate_unlock(inode->i_mapping);
@ -3950,91 +4016,50 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
loff_t first_block_offset, last_block_offset, max_length;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t start_lblk, end_lblk;
loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
int ret = 0, ret2 = 0;
int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
*/
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
ret = filemap_write_and_wait_range(mapping, offset,
offset + length - 1);
if (ret)
return ret;
}
inode_lock(inode);
WARN_ON_ONCE(!inode_is_locked(inode));
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
goto out_mutex;
return 0;
/*
* If the hole extends beyond i_size, set the hole
* to end after the page that contains i_size
* If the hole extends beyond i_size, set the hole to end after
* the page that contains i_size, and also make sure that the hole
* ends within one block before the last range.
*/
if (offset + length > inode->i_size) {
length = inode->i_size +
PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
offset;
}
if (end > inode->i_size)
end = round_up(inode->i_size, PAGE_SIZE);
if (end > max_end)
end = max_end;
length = end - offset;
/*
* For punch hole the length + offset needs to be within one block
* before last range. Adjust the length if it goes beyond that limit.
* Attach jinode to inode for jbd2 if we do any zeroing of partial
* block.
*/
max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
if (offset + length > max_length)
length = max_length - offset;
if (offset & (sb->s_blocksize - 1) ||
(offset + length) & (sb->s_blocksize - 1)) {
/*
* Attach jinode to inode for jbd2 if we do any zeroing of
* partial block
*/
if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
goto out_mutex;
return ret;
}
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
ret = file_modified(file);
ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
goto out_mutex;
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
filemap_invalidate_lock(mapping);
ret = ext4_break_layouts(inode);
if (ret)
goto out_dio;
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
return ret;
/* Now release the pages and zero block aligned part of pages*/
if (last_block_offset > first_block_offset) {
ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
goto out_dio;
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);
}
ret = ext4_truncate_page_cache_block_range(inode, offset, end);
if (ret)
return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@ -4044,54 +4069,51 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
goto out_dio;
return ret;
}
ret = ext4_zero_partial_blocks(handle, inode, offset,
length);
ret = ext4_zero_partial_blocks(handle, inode, offset, length);
if (ret)
goto out_stop;
first_block = (offset + sb->s_blocksize - 1) >>
EXT4_BLOCK_SIZE_BITS(sb);
stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
goto out_handle;
/* If there are blocks to remove, do it */
if (stop_block > first_block) {
ext4_lblk_t hole_len = stop_block - first_block;
start_lblk = EXT4_B_TO_LBLK(inode, offset);
end_lblk = end >> inode->i_blkbits;
if (end_lblk > start_lblk) {
ext4_lblk_t hole_len = end_lblk - start_lblk;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, first_block, hole_len);
ext4_es_remove_extent(inode, start_lblk, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_remove_space(inode, first_block,
stop_block - 1);
ret = ext4_ext_remove_space(inode, start_lblk,
end_lblk - 1);
else
ret = ext4_ind_remove_space(handle, inode, first_block,
stop_block);
ret = ext4_ind_remove_space(handle, inode, start_lblk,
end_lblk);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_handle;
}
ext4_es_insert_extent(inode, first_block, hole_len, ~0,
ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
ext4_fc_track_range(handle, inode, first_block, stop_block);
ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret2))
ret = ret2;
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
out_handle:
ext4_journal_stop(handle);
out_dio:
filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return ret;
}
@ -4678,6 +4700,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
int err;
err = xattr_check_inode(inode, IHDR(inode, raw_inode),
ITAIL(inode, raw_inode));
if (err)
return err;
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
err = ext4_find_inline_data_nolock(inode);
if (!err && ext4_has_inline_data(inode))
@ -4804,7 +4831,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_extra_isize = 0;
/* Precompute checksum seed for inode metadata */
if (ext4_has_metadata_csum(sb)) {
if (ext4_has_feature_metadata_csum(sb)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
@ -4891,7 +4918,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* we'd normally treat htree data as empty space. But with metadata
* checksumming that corrupts checksums so forbid that.
*/
if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
if (!ext4_has_feature_dir_index(sb) &&
ext4_has_feature_metadata_csum(sb) &&
ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
ext4_error_inode(inode, function, line, 0,
"iget: Dir with htree data on filesystem without dir_index feature.");
@ -5011,8 +5039,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1);
if (inode->i_size == 0 ||
inode->i_size >= sizeof(ei->i_data) ||
strnlen((char *)ei->i_data, inode->i_size + 1) !=
inode->i_size) {
ext4_error_inode(inode, function, line, 0,
"invalid fast symlink length %llu",
(unsigned long long)inode->i_size);
ret = -EFSCORRUPTED;
goto bad_inode;
}
inode_set_cached_link(inode, (char *)ei->i_data,
inode->i_size);
} else {
@ -5232,8 +5268,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
err = ext4_emergency_state(inode->i_sb);
if (unlikely(err))
return err;
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
@ -5355,8 +5392,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
const unsigned int ia_valid = attr->ia_valid;
bool inc_ivers = true;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
error = ext4_emergency_state(inode->i_sb);
if (unlikely(error))
return error;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@ -5468,7 +5506,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
oldsize & (inode->i_sb->s_blocksize - 1)) {
error = ext4_inode_attach_jinode(inode);
if (error)
goto err_out;
goto out_mmap_sem;
}
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
@ -5800,9 +5838,10 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
if (unlikely(ext4_forced_shutdown(inode->i_sb))) {
err = ext4_emergency_state(inode->i_sb);
if (unlikely(err)) {
put_bh(iloc->bh);
return -EIO;
return err;
}
ext4_fc_track_inode(handle, inode);
@ -5826,8 +5865,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
{
int err;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
err = ext4_emergency_state(inode->i_sb);
if (unlikely(err))
return err;
err = ext4_get_inode_loc(inode, iloc);
if (!err) {


@ -142,7 +142,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
es = (struct ext4_super_block *) (bh->b_data + offset);
lock_buffer(bh);
if (ext4_has_metadata_csum(sb) &&
if (ext4_has_feature_metadata_csum(sb) &&
es->s_checksum != ext4_superblock_csum(sb, es)) {
ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
"superblock %llu", sb_block);
@ -150,7 +150,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
goto out_bh;
}
func(es, arg);
if (ext4_has_metadata_csum(sb))
if (ext4_has_feature_metadata_csum(sb))
es->s_checksum = ext4_superblock_csum(sb, es);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@ -351,7 +351,7 @@ void ext4_reset_inode_seed(struct inode *inode)
__le32 gen = cpu_to_le32(inode->i_generation);
__u32 csum;
if (!ext4_has_metadata_csum(inode->i_sb))
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
@ -1205,7 +1205,8 @@ static int ext4_ioctl_setuuid(struct file *filp,
* If any checksums (group descriptors or metadata) are being used
* then the checksum seed feature is required to change the UUID.
*/
if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb))
if (((ext4_has_feature_gdt_csum(sb) ||
ext4_has_feature_metadata_csum(sb))
&& !ext4_has_feature_csum_seed(sb))
|| ext4_has_feature_stable_inodes(sb))
return -EOPNOTSUPP;
@ -1253,7 +1254,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!inode_owner_or_capable(idmap, inode))
return -EPERM;
if (ext4_has_metadata_csum(inode->i_sb)) {
if (ext4_has_feature_metadata_csum(inode->i_sb)) {
ext4_warning(sb, "Setting inode version is not "
"supported with metadata_csum enabled.");
return -ENOTTY;
@ -1705,7 +1706,7 @@ int ext4_update_overhead(struct super_block *sb, bool force)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sb_rdonly(sb))
if (ext4_emergency_state(sb) || sb_rdonly(sb))
return 0;
if (!force &&
(sbi->s_overhead == 0 ||


@ -796,6 +796,7 @@ static void test_mb_mark_used(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
@ -860,6 +861,7 @@ static void test_mb_free_blocks(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);


@ -187,7 +187,7 @@
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
* /sys/fs/ext4/<partition>/mb_order2_req
* /sys/fs/ext4/<partition>/mb_linear_limit
* /sys/fs/ext4/<partition>/mb_max_linear_groups
*
* The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
@ -209,7 +209,7 @@
* get traversed linearly. That may result in subsequent allocations being not
* close to each other. And so, the underlying device may get filled up in a
* non-linear fashion. While that may not matter on non-rotational devices, for
* rotational devices that may result in higher seek times. "mb_linear_limit"
* rotational devices that may result in higher seek times. "mb_max_linear_groups"
* tells mballoc how many groups mballoc should search linearly before
* performing consulting above data structures for more efficient lookups. For
* non rotational devices, this value defaults to 0 and for rotational devices
@ -5653,7 +5653,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
{
ext4_group_t i, ngroups;
if (ext4_forced_shutdown(sb))
if (ext4_emergency_state(sb))
return;
ngroups = ext4_get_groups_count(sb);
@ -5687,7 +5687,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
if (ext4_forced_shutdown(sb))
if (ext4_emergency_state(sb))
return;
mb_debug(sb, "Can't allocate:"


@ -21,7 +21,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return 1;
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return;
mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
@ -162,7 +162,7 @@ static int kmmpd(void *data)
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) {
while (!kthread_should_stop() && !ext4_emergency_state(sb)) {
if (!ext4_has_feature_mmp(sb)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");


@ -176,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
brelse(bh);
return ERR_PTR(-EFSCORRUPTED);
}
if (!ext4_has_metadata_csum(inode->i_sb) ||
if (!ext4_has_feature_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
return bh;
@ -291,36 +291,6 @@ struct dx_tail {
__le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
};
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
static void dx_set_hash(struct dx_entry *entry, unsigned value);
static unsigned dx_get_count(struct dx_entry *entries);
static unsigned dx_get_limit(struct dx_entry *entries);
static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
static struct dx_frame *dx_probe(struct ext4_filename *fname,
struct inode *dir,
struct dx_hash_info *hinfo,
struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct inode *dir, struct buffer_head *bh,
struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
char *to, struct dx_map_entry *offsets,
int count, unsigned int blocksize);
static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
unsigned int blocksize);
static void dx_insert_block(struct dx_frame *frame,
u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
__u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
@ -398,7 +368,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
if (!ext4_has_metadata_csum(inode->i_sb))
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
t = get_dirent_tail(inode, bh);
@ -419,7 +389,7 @@ static void ext4_dirblock_csum_set(struct inode *inode,
{
struct ext4_dir_entry_tail *t;
if (!ext4_has_metadata_csum(inode->i_sb))
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
t = get_dirent_tail(inode, bh);
@ -494,7 +464,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
struct dx_tail *t;
int count_offset, limit, count;
if (!ext4_has_metadata_csum(inode->i_sb))
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
c = get_dx_countlimit(inode, dirent, &count_offset);
@ -523,7 +493,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
struct dx_tail *t;
int count_offset, limit, count;
if (!ext4_has_metadata_csum(inode->i_sb))
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
c = get_dx_countlimit(inode, dirent, &count_offset);
@ -612,7 +582,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
ext4_dir_rec_len(1, NULL) -
ext4_dir_rec_len(2, NULL) - infosize;
if (ext4_has_metadata_csum(dir->i_sb))
if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@ -622,7 +592,7 @@ static inline unsigned dx_node_limit(struct inode *dir)
unsigned int entry_space = dir->i_sb->s_blocksize -
ext4_dir_rec_len(0, dir);
if (ext4_has_metadata_csum(dir->i_sb))
if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@ -1076,7 +1046,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
int csum = ext4_has_metadata_csum(dir->i_sb);
int csum = ext4_has_feature_metadata_csum(dir->i_sb);
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@ -1320,7 +1290,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
struct dx_hash_info h = *hinfo;
int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
if (ext4_has_metadata_csum(dir->i_sb))
if (ext4_has_feature_metadata_csum(dir->i_sb))
buflen -= sizeof(struct ext4_dir_entry_tail);
while ((char *) de < base + buflen) {
@ -1462,7 +1432,8 @@ static bool ext4_match(struct inode *parent,
* sure cf_name was properly initialized before
* considering the calculated hash.
*/
if (IS_ENCRYPTED(parent) && fname->cf_name.name &&
if (sb_no_casefold_compat_fallback(parent->i_sb) &&
IS_ENCRYPTED(parent) && fname->cf_name.name &&
(fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
return false;
@ -1595,10 +1566,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR)
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
else if (!sb_no_casefold_compat_fallback(dir->i_sb) &&
*res_dir == NULL && IS_CASEFOLDED(dir))
dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold "
"failed, falling back\n"));
else
goto cleanup_and_exit;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
ret = NULL;
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
@ -1945,7 +1921,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err = 0, i;
if (ext4_has_metadata_csum(dir->i_sb))
if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
bh2 = ext4_append(handle, dir, &newblock);
@ -2060,8 +2036,7 @@ out:
return ERR_PTR(err);
}
int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct buffer_head *bh,
int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de)
@ -2143,11 +2118,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
int csum_size = 0;
int err, err2;
if (ext4_has_metadata_csum(inode->i_sb))
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
err = ext4_find_dest_de(dir, bh, bh->b_data,
blocksize - csum_size, fname, &de);
if (err)
return err;
@ -2252,7 +2227,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
struct fake_dirent *fde;
int csum_size = 0;
if (ext4_has_metadata_csum(inode->i_sb))
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
@ -2396,7 +2371,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
ext4_lblk_t block, blocks;
int csum_size = 0;
if (ext4_has_metadata_csum(inode->i_sb))
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
@ -2427,7 +2402,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
/* Can we just ignore htree data? */
if (ext4_has_metadata_csum(sb)) {
if (ext4_has_feature_metadata_csum(sb)) {
EXT4_ERROR_INODE(dir,
"Directory has corrupted htree index.");
retval = -EFSCORRUPTED;
@ -2577,8 +2552,10 @@ again:
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, sb, frame->bh,
EXT4_JTR_NONE);
if (err)
if (err) {
brelse(bh2);
goto journal_error;
}
if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
@ -2589,8 +2566,10 @@ again:
err = ext4_journal_get_write_access(handle, sb,
(frame - 1)->bh,
EXT4_JTR_NONE);
if (err)
if (err) {
brelse(bh2);
goto journal_error;
}
memcpy((char *) entries2, (char *) (entries + icount1),
icount2 * sizeof(struct dx_entry));
@ -2609,8 +2588,10 @@ again:
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err)
if (err) {
brelse(bh2);
goto journal_error;
}
brelse (bh2);
err = ext4_handle_dirty_dx_node(handle, dir,
(frame - 1)->bh);
@ -2635,8 +2616,10 @@ again:
"Creating %d level index...\n",
dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
if (err) {
brelse(bh2);
goto journal_error;
}
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
brelse(bh2);
restart = 1;
@ -2733,7 +2716,7 @@ static int ext4_delete_entry(handle_t *handle,
return err;
}
if (ext4_has_metadata_csum(dir->i_sb))
if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
@ -2973,7 +2956,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err;
if (ext4_has_metadata_csum(dir->i_sb))
if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@ -3151,8 +3134,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
retval = ext4_emergency_state(dir->i_sb);
if (unlikely(retval))
return retval;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
@ -3309,8 +3293,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
retval = ext4_emergency_state(dir->i_sb);
if (unlikely(retval))
return retval;
trace_ext4_unlink_enter(dir, dentry);
/*
@ -3376,8 +3361,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct fscrypt_str disk_link;
int retries = 0;
if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
err = ext4_emergency_state(dir->i_sb);
if (unlikely(err))
return err;
err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
&disk_link);
@ -4199,8 +4185,9 @@ static int ext4_rename2(struct mnt_idmap *idmap,
{
int err;
if (unlikely(ext4_forced_shutdown(old_dir->i_sb)))
return -EIO;
err = ext4_emergency_state(old_dir->i_sb);
if (unlikely(err))
return err;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;


@ -537,7 +537,7 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
struct ext4_orphan_block_tail *ot;
__le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return 1;
ot = ext4_orphan_block_tail(sb, bh);


@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
}
/*
* Check a range of space and convert unwritten extents to written. Note that
* On successful IO, check a range of space and convert unwritten extents to
* written. On IO failure, check if journal abort is needed. Note that
* we are protected from truncate touching same part of extent tree by the
* fact that truncate code waits for all DIO to finish (thus exclusion from
* direct IO is achieved) and also waits for PageWriteback bits. Thus we
@ -175,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
handle_t *handle = io_end->handle;
struct super_block *sb = inode->i_sb;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
io_end->handle = NULL; /* Following call will use up the handle */
ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
ext4_msg(inode->i_sb, KERN_EMERG,
/*
* Do not convert the unwritten extents if data writeback fails,
* or stale data may be exposed.
*/
io_end->handle = NULL; /* Following call will use up the handle */
if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) {
ret = -EIO;
if (handle)
jbd2_journal_free_reserved(handle);
if (test_opt(sb, DATA_ERR_ABORT))
jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
} else {
ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
}
if (ret < 0 && !ext4_emergency_state(sb) &&
io_end->flag & EXT4_IO_END_UNWRITTEN) {
ext4_msg(sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
"(inode %lu, error %d)", inode->i_ino, ret);
}
ext4_clear_io_unwritten_flag(io_end);
ext4_release_io_end(io_end);
return ret;
@ -217,6 +234,16 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
#endif
}
static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
{
if (io_end->flag & EXT4_IO_END_UNWRITTEN)
return true;
if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
io_end->flag & EXT4_IO_END_FAILED)
return true;
return false;
}
/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
@ -225,9 +252,11 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
/* Only reserved conversions from writeback should enter here */
WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
WARN_ON(!io_end->handle && sbi->s_journal);
/* Only reserved conversions or pending IO errors will enter here. */
WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
!io_end->handle && sbi->s_journal);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
wq = sbi->rsv_conversion_wq;
if (list_empty(&ei->i_rsv_conversion_list))
@ -252,7 +281,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
while (!list_empty(&unwritten)) {
io_end = list_entry(unwritten.next, ext4_io_end_t, list);
BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
list_del_init(&io_end->list);
err = ext4_end_io_end(io_end);
@ -263,7 +292,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
}
/*
* work on completed IO, to convert unwritten extents to extents
* Used to convert unwritten extents to written extents upon IO completion,
* or used to abort the journal upon IO errors.
*/
void ext4_end_io_rsv_work(struct work_struct *work)
{
@ -288,29 +318,25 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (refcount_dec_and_test(&io_end->count)) {
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
if (io_end->flag & EXT4_IO_END_FAILED ||
(io_end->flag & EXT4_IO_END_UNWRITTEN &&
!list_empty(&io_end->list_vec))) {
ext4_add_complete_io(io_end);
return;
}
ext4_add_complete_io(io_end);
ext4_release_io_end(io_end);
}
}
int ext4_put_io_end(ext4_io_end_t *io_end)
{
int err = 0;
if (refcount_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
err = ext4_convert_unwritten_io_end_vec(io_end->handle,
io_end);
io_end->handle = NULL;
ext4_clear_io_unwritten_flag(io_end);
}
if (ext4_io_end_defer_completion(io_end))
return ext4_end_io_end(io_end);
ext4_release_io_end(io_end);
}
return err;
return 0;
}
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio)
bio->bi_status, inode->i_ino,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
io_end->flag |= EXT4_IO_END_FAILED;
mapping_set_error(inode->i_mapping,
blk_status_to_errno(bio->bi_status));
}
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
if (ext4_io_end_defer_completion(io_end)) {
/*
* Link bio into list hanging from io_end. We have to do it
* atomically as bio completions can be racing against each


@ -1118,7 +1118,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
struct ext4_super_block *es = (struct ext4_super_block *) data;
es->s_block_group_nr = cpu_to_le16(group);
if (ext4_has_metadata_csum(sb))
if (ext4_has_feature_metadata_csum(sb))
es->s_checksum = ext4_superblock_csum(sb, es);
}
@ -1315,7 +1315,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
{
struct buffer_head *bh;
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return 0;
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);

View File

@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
@ -302,7 +301,7 @@ __le32 ext4_superblock_csum(struct super_block *sb,
static int ext4_superblock_csum_verify(struct super_block *sb,
struct ext4_super_block *es)
{
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return 1;
return es->s_checksum == ext4_superblock_csum(sb, es);
@ -312,7 +311,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
if (!ext4_has_metadata_csum(sb))
if (!ext4_has_feature_metadata_csum(sb))
return;
es->s_checksum = ext4_superblock_csum(sb, es);
@ -448,9 +447,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */
/*
* The ext4_maybe_update_superblock() function checks and updates the
* superblock if needed.
@ -458,8 +454,10 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
* This function is designed to update the on-disk superblock only under
* certain conditions to prevent excessive disk writes and unnecessary
* waking of the disk from sleep. The superblock will be updated if:
* 1. More than an hour has passed since the last superblock update, and
* 2. More than 16MB have been written since the last superblock update.
* 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last
* superblock update
* 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the
* last superblock update.
*
* @sb: The superblock
*/
@ -473,14 +471,15 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
__u64 lifetime_write_kbytes;
__u64 diff_size;
if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) ||
!journal || (journal->j_flags & JBD2_UNMOUNT))
if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
!(sb->s_flags & SB_ACTIVE) || !journal ||
journal->j_flags & JBD2_UNMOUNT)
return;
now = ktime_get_real_seconds();
last_update = ext4_get_tstamp(es, s_wtime);
if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
if (likely(now - last_update < sbi->s_sb_update_sec))
return;
lifetime_write_kbytes = sbi->s_kbytes_written +
@ -495,32 +494,18 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
*/
diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB)
if (diff_size > sbi->s_sb_update_kb)
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int error = is_journal_aborted(journal);
struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
ext4_process_freed_data(sb, txn->t_tid);
ext4_maybe_update_superblock(sb);
spin_lock(&sbi->s_md_lock);
while (!list_empty(&txn->t_private_list)) {
jce = list_entry(txn->t_private_list.next,
struct ext4_journal_cb_entry, jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
jce->jce_func(sb, jce, error);
spin_lock(&sbi->s_md_lock);
}
spin_unlock(&sbi->s_md_lock);
}
/*
@ -707,11 +692,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
if (test_opt(sb, WARN_ON_ERROR))
WARN_ON_ONCE(1);
if (!continue_fs && !sb_rdonly(sb)) {
set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
if (journal)
jbd2_journal_abort(journal, -EIO);
}
if (!continue_fs && !ext4_emergency_ro(sb) && journal)
jbd2_journal_abort(journal, -EIO);
if (!bdev_read_only(sb->s_bdev)) {
save_error_info(sb, error, ino, block, func, line);
@ -719,9 +701,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
* In case the fs should keep running, we need to writeout
* superblock through the journal. Due to lock ordering
* constraints, it may not be safe to do it right here so we
* defer superblock flushing to a workqueue.
* defer superblock flushing to a workqueue. We just need to be
* careful when the journal is already shutting down. If we get
* here in that case, just update the sb directly as the last
* transaction won't commit anyway.
*/
if (continue_fs && journal)
if (continue_fs && journal &&
!ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY))
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
else
ext4_commit_super(sb);
@ -737,17 +723,17 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
sb->s_id);
}
if (sb_rdonly(sb) || continue_fs)
if (ext4_emergency_ro(sb) || continue_fs)
return;
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
* EXT4_FLAGS_SHUTDOWN was set which stops all filesystem
* modifications. We don't set SB_RDONLY because that requires
* sb->s_umount semaphore and setting it without proper remount
* procedure is confusing code such as freeze_super() leading to
* deadlocks and other problems.
* We don't set SB_RDONLY because that requires sb->s_umount
* semaphore and setting it without proper remount procedure is
* confusing code such as freeze_super() leading to deadlocks
* and other problems.
*/
set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
}
static void update_super_work(struct work_struct *work)
@ -765,7 +751,8 @@ static void update_super_work(struct work_struct *work)
* We use directly jbd2 functions here to avoid recursing back into
* ext4 error handling code during handling of previous errors.
*/
if (!sb_rdonly(sbi->s_sb) && journal) {
if (!ext4_emergency_state(sbi->s_sb) &&
!sb_rdonly(sbi->s_sb) && journal) {
struct buffer_head *sbh = sbi->s_sbh;
bool call_notify_err = false;
@ -819,7 +806,7 @@ void __ext4_error(struct super_block *sb, const char *function,
struct va_format vaf;
va_list args;
if (unlikely(ext4_forced_shutdown(sb)))
if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@ -844,7 +831,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
va_list args;
struct va_format vaf;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@ -879,7 +866,7 @@ void __ext4_error_file(struct file *file, const char *function,
struct inode *inode = file_inode(file);
char pathname[80], *path;
if (unlikely(ext4_forced_shutdown(inode->i_sb)))
if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@ -959,7 +946,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
char nbuf[16];
const char *errstr;
if (unlikely(ext4_forced_shutdown(sb)))
if (unlikely(ext4_emergency_state(sb)))
return;
/* Special case: if the error is EROFS, and we're not already
@ -1053,7 +1040,7 @@ __acquires(bitlock)
struct va_format vaf;
va_list args;
if (unlikely(ext4_forced_shutdown(sb)))
if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@ -1306,18 +1293,17 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
ext4_quotas_off(sb, EXT4_MAXQUOTAS);
flush_work(&sbi->s_sb_upd_work);
destroy_workqueue(sbi->rsv_conversion_wq);
ext4_release_orphan_info(sb);
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
err = ext4_journal_destroy(sbi, sbi->s_journal);
if ((err < 0) && !aborted) {
ext4_abort(sb, -err, "Couldn't clean up the journal");
}
}
} else
flush_work(&sbi->s_sb_upd_work);
ext4_es_unregister_shrinker(sbi);
timer_shutdown_sync(&sbi->s_err_report);
@ -1325,13 +1311,14 @@ static void ext4_put_super(struct super_block *sb)
ext4_mb_release(sb);
ext4_ext_release(sb);
if (!sb_rdonly(sb) && !aborted) {
ext4_clear_feature_journal_needs_recovery(sb);
ext4_clear_feature_orphan_present(sb);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
if (!sb_rdonly(sb))
if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) {
if (!aborted) {
ext4_clear_feature_journal_needs_recovery(sb);
ext4_clear_feature_orphan_present(sb);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
ext4_commit_super(sb);
}
ext4_group_desc_free(sbi);
ext4_flex_groups_free(sbi);
@ -1426,7 +1413,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
ext4_fc_init_inode(&ei->vfs_inode);
mutex_init(&ei->i_fc_lock);
@ -2785,6 +2771,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
}
if (is_remount) {
if (!sbi->s_journal &&
ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
ext4_msg(NULL, KERN_WARNING,
"Remounting fs w/o journal so ignoring data_err option");
ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
}
if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(NULL, KERN_ERR, "can't mount with "
@ -3038,6 +3031,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
SEQ_OPTS_PUTS("prefetch_block_bitmaps");
if (ext4_emergency_ro(sb))
SEQ_OPTS_PUTS("emergency_ro");
if (ext4_forced_shutdown(sb))
SEQ_OPTS_PUTS("shutdown");
ext4_show_quota_options(seq, sb);
return 0;
}
@ -3205,7 +3204,7 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
__le32 le_group = cpu_to_le32(block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (ext4_has_metadata_csum(sbi->s_sb)) {
if (ext4_has_feature_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
__u32 csum32;
__u16 dummy_csum = 0;
@ -3693,7 +3692,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
if (group >= elr->lr_next_group) {
ret = 1;
if (elr->lr_first_not_zeroed != ngroups &&
!sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
!ext4_emergency_state(sb) && !sb_rdonly(sb) &&
test_opt(sb, INIT_INODE_TABLE)) {
elr->lr_next_group = elr->lr_first_not_zeroed;
elr->lr_mode = EXT4_LI_MODE_ITABLE;
ret = 0;
@ -3998,7 +3998,7 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
if (sb_rdonly(sb) ||
if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
(test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
(first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
@ -4061,7 +4061,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
int compat, incompat;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (ext4_has_metadata_csum(sb)) {
if (ext4_has_feature_metadata_csum(sb)) {
/* journal checksum v3 */
compat = 0;
incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
@ -4349,7 +4349,7 @@ static void ext4_set_def_opts(struct super_block *sb,
if (ext4_has_feature_fast_commit(sb))
set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
if (ext4_has_metadata_csum(sb))
if (ext4_has_feature_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
@ -4642,7 +4642,8 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo
/* Precompute checksum seed for all metadata */
if (ext4_has_feature_csum_seed(sb))
sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
else if (ext4_has_feature_metadata_csum(sb) ||
ext4_has_feature_ea_inode(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
return 0;
@ -4973,10 +4974,7 @@ static int ext4_load_and_init_journal(struct super_block *sb,
return 0;
out:
/* flush s_sb_upd_work before destroying the journal. */
flush_work(&sbi->s_sb_upd_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
ext4_journal_destroy(sbi, sbi->s_journal);
return -EINVAL;
}
@ -5013,6 +5011,24 @@ static int ext4_check_journal_data_mode(struct super_block *sb)
return 0;
}
static const char *ext4_has_journal_option(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
return "journal_async_commit";
if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM))
return "journal_checksum";
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
return "commit=";
if (EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ sbi->s_def_mount_opt))
return "data=";
if (test_opt(sb, DATA_ERR_ABORT))
return "data_err=abort";
return NULL;
}
static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
int silent)
{
@ -5263,6 +5279,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB;
sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC;
/*
* set default s_li_wait_mult for lazyinit, for the case there is
@ -5404,30 +5422,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
"suppressed and not mounted read-only");
goto failed_mount3a;
} else {
const char *journal_option;
/* Nojournal mode, all journal mount options are illegal */
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_async_commit, fs mounted w/o journal");
journal_option = ext4_has_journal_option(sb);
if (journal_option != NULL) {
ext4_msg(sb, KERN_ERR,
"can't mount with %s, fs mounted w/o journal",
journal_option);
goto failed_mount3a;
}
if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_checksum, fs mounted w/o journal");
goto failed_mount3a;
}
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"commit=%lu, fs mounted w/o journal",
sbi->s_commit_interval / HZ);
goto failed_mount3a;
}
if (EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"data=, fs mounted w/o journal");
goto failed_mount3a;
}
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
@ -5616,9 +5621,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount9;
}
if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) {
ext4_msg(sb, KERN_WARNING,
"mounting with \"discard\" option, but the device does not support discard");
clear_opt(sb, DISCARD);
}
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@ -5665,10 +5672,7 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
/* flush s_sb_upd_work before journal destroy. */
flush_work(&sbi->s_sb_upd_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
ext4_journal_destroy(sbi, sbi->s_journal);
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
@ -5773,10 +5777,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
if (test_opt(sb, DATA_ERR_ABORT))
journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
else
journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
/*
* Always enable journal cycle record option, letting the journal
* records log transactions continuously between each mount.
@ -5973,7 +5973,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
return journal;
out_journal:
jbd2_journal_destroy(journal);
ext4_journal_destroy(EXT4_SB(sb), journal);
out_bdev:
bdev_fput(bdev_file);
return ERR_PTR(errno);
@ -6090,8 +6090,7 @@ static int ext4_load_journal(struct super_block *sb,
EXT4_SB(sb)->s_journal = journal;
err = ext4_clear_journal_err(sb, es);
if (err) {
EXT4_SB(sb)->s_journal = NULL;
jbd2_journal_destroy(journal);
ext4_journal_destroy(EXT4_SB(sb), journal);
return err;
}
@ -6109,7 +6108,7 @@ static int ext4_load_journal(struct super_block *sb,
return 0;
err_out:
jbd2_journal_destroy(journal);
ext4_journal_destroy(EXT4_SB(sb), journal);
return err;
}
@ -6336,8 +6335,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (unlikely(ext4_forced_shutdown(sb)))
return -EIO;
ret = ext4_emergency_state(sb);
if (unlikely(ret))
return ret;
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
@ -6419,7 +6419,7 @@ out:
*/
static int ext4_unfreeze(struct super_block *sb)
{
if (ext4_forced_shutdown(sb))
if (ext4_emergency_state(sb))
return 0;
if (EXT4_SB(sb)->s_journal) {
@ -6575,7 +6575,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
flush_work(&sbi->s_sb_upd_work);
if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
if (ext4_forced_shutdown(sb)) {
if (ext4_emergency_state(sb)) {
err = -EROFS;
goto restore_opts;
}
@ -6780,6 +6780,7 @@ static int ext4_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
int ret;
bool old_ro = sb_rdonly(sb);
fc->s_fs_info = EXT4_SB(sb);
@ -6791,9 +6792,9 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
&sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
ext4_quota_mode(sb));
ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.",
&sb->s_uuid,
(old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : "");
return 0;
}
@ -6817,22 +6818,29 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
if (limit && buf->f_blocks > limit) {
if (limit) {
uint64_t remaining = 0;
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
buf->f_blocks = limit;
buf->f_bfree = buf->f_bavail =
(buf->f_blocks > curblock) ?
(buf->f_blocks - curblock) : 0;
if (limit > curblock)
remaining = limit - curblock;
buf->f_blocks = min(buf->f_blocks, limit);
buf->f_bfree = min(buf->f_bfree, remaining);
buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
(buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
(buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
if (limit) {
uint64_t remaining = 0;
if (limit > dquot->dq_dqb.dqb_curinodes)
remaining = limit - dquot->dq_dqb.dqb_curinodes;
buf->f_files = min(buf->f_files, limit);
buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
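
A small numeric sketch (not part of the patch, hypothetical figures in filesystem blocks) of the clamping introduced above: when the remaining project quota is smaller than the filesystem-wide free space, statfs now reports the quota remainder instead of over-reporting, and never reports more total blocks than the limit:

#include <stdio.h>

static unsigned long long min_u64(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical filesystem-wide figures. */
	unsigned long long f_blocks = 1000000, f_bfree = 800000, f_bavail = 800000;
	/* Hypothetical project quota: hard limit and current usage. */
	unsigned long long limit = 50000, curblock = 49000;

	unsigned long long remaining = limit > curblock ? limit - curblock : 0;

	/* Same clamping as the ext4_statfs_project() change above. */
	f_blocks = min_u64(f_blocks, limit);
	f_bfree  = min_u64(f_bfree, remaining);
	f_bavail = min_u64(f_bavail, remaining);

	/* Prints: f_blocks=50000 f_bfree=1000 f_bavail=1000 */
	printf("f_blocks=%llu f_bfree=%llu f_bavail=%llu\n",
	       f_blocks, f_bfree, f_bavail);
	return 0;
}
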
@ -6935,12 +6943,25 @@ static int ext4_release_dquot(struct dquot *dquot)
{
int ret, err;
handle_t *handle;
bool freeze_protected = false;
/*
* Trying to sb_start_intwrite() in a running transaction
* can result in a deadlock. Further, running transactions
* are already protected from freezing.
*/
if (!ext4_journal_current_handle()) {
sb_start_intwrite(dquot->dq_sb);
freeze_protected = true;
}
handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
if (freeze_protected)
sb_end_intwrite(dquot->dq_sb);
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
@ -6951,6 +6972,10 @@ static int ext4_release_dquot(struct dquot *dquot)
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
if (freeze_protected)
sb_end_intwrite(dquot->dq_sb);
return ret;
}
@ -7288,7 +7313,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, len);
flush_dcache_page(bh->b_page);
flush_dcache_folio(bh->b_folio);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@ -7381,12 +7406,9 @@ static struct file_system_type ext4_fs_type = {
};
MODULE_ALIAS_FS("ext4");
/* Shared across all ext4 file systems */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
static int __init ext4_init_fs(void)
{
int i, err;
int err;
ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
ext4_li_info = NULL;
@ -7394,9 +7416,6 @@ static int __init ext4_init_fs(void)
/* Build-time check for flags consistency */
ext4_check_flag_values();
for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
init_waitqueue_head(&ext4__ioend_wq[i]);
err = ext4_init_es();
if (err)
return err;


@ -254,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task);
EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec);
EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@ -305,6 +307,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_prefetch),
ATTR_LIST(mb_prefetch_limit),
ATTR_LIST(last_trim_minblks),
ATTR_LIST(sb_update_sec),
ATTR_LIST(sb_update_kb),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
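
The two counters above become writable sysfs attributes under the /sys/fs/ext4/<partition>/ hierarchy referenced in the mballoc comment earlier; a minimal userspace sketch (not part of the patch; "sda1" is only an example device name) that stretches the periodic superblock update interval to two hours:

#include <stdio.h>

int main(void)
{
	/* Follows the /sys/fs/ext4/<partition>/ layout; "sda1" is an example. */
	FILE *f = fopen("/sys/fs/ext4/sda1/sb_update_sec", "w");

	if (!f) {
		perror("sb_update_sec");
		return 1;
	}
	/* Allow up to two hours between periodic on-disk superblock updates. */
	fprintf(f, "7200\n");
	return fclose(f) ? 1 : 0;
}
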


@ -156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
struct ext4_xattr_header *hdr = BHDR(bh);
int ret = 1;
if (ext4_has_metadata_csum(inode->i_sb)) {
if (ext4_has_feature_metadata_csum(inode->i_sb)) {
lock_buffer(bh);
ret = (hdr->h_checksum == ext4_xattr_block_csum(inode,
bh->b_blocknr, hdr));
@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
static void ext4_xattr_block_csum_set(struct inode *inode,
struct buffer_head *bh)
{
if (ext4_has_metadata_csum(inode->i_sb))
if (ext4_has_feature_metadata_csum(inode->i_sb))
BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode,
bh->b_blocknr, BHDR(bh));
}
@ -308,7 +308,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
__ext4_xattr_check_block((inode), (bh), __func__, __LINE__)
static inline int
int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
{
@ -316,9 +316,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
function, line);
}
#define xattr_check_inode(inode, header, end) \
__xattr_check_inode((inode), (header), (end), __func__, __LINE__)
static int
xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
void *end, int name_index, const char *name, int sorted)
@ -649,10 +646,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
end = ITAIL(inode, raw_inode);
entry = IFIRST(header);
error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
if (error)
@ -783,7 +777,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
void *end;
int error;
if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
@ -793,14 +786,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
buffer, buffer_size);
cleanup:
brelse(iloc.bh);
return error;
}
@ -868,7 +856,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_entry *entry;
qsize_t ea_inode_refs = 0;
void *end;
int ret;
lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
@ -879,10 +866,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
goto out;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
ret = xattr_check_inode(inode, header, end);
if (ret)
goto out;
for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry))
@ -1176,15 +1159,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
{
struct inode *ea_inode;
struct ext4_xattr_entry *entry;
struct ext4_iloc iloc;
bool dirty = false;
unsigned int ea_ino;
int err;
int credits;
void *end;
if (block_csum)
end = (void *)bh->b_data + bh->b_size;
else {
ext4_get_inode_loc(parent, &iloc);
end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
}
/* One credit for dec ref on ea_inode, one for orphan list addition, */
credits = 2 + extra_credits;
for (entry = first; !IS_LAST_ENTRY(entry);
for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry)) {
if (!entry->e_value_inum)
continue;
@ -2235,11 +2227,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
is->s.end = ITAIL(inode, raw_inode);
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
error = xattr_check_inode(inode, header, is->s.end);
if (error)
return error;
/* Find the named attribute. */
error = xattr_find_entry(inode, &is->s.here, is->s.end,
i->name_index, i->name, 0);
@ -2786,14 +2775,10 @@ retry:
*/
base = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
end = ITAIL(inode, raw_inode);
min_offs = end - base;
total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
if (ifree >= isize_diff)
goto shift;


@ -67,6 +67,9 @@ struct ext4_xattr_entry {
((void *)raw_inode + \
EXT4_GOOD_OLD_INODE_SIZE + \
EXT4_I(inode)->i_extra_isize))
#define ITAIL(inode, raw_inode) \
((void *)(raw_inode) + \
EXT4_SB((inode)->i_sb)->s_inode_size)
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
/*
@ -206,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
extern int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line);
#define xattr_check_inode(inode, header, end) \
__xattr_check_inode((inode), (header), (end), __func__, __LINE__)
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);


@ -57,8 +57,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
* Called under lock_journal(), and possibly under journal_datalist_lock. The
* caller provided us with a ref against the buffer, and we drop that here.
* Called under j_list_lock. The caller provided us with a ref against the
* buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
@ -738,10 +738,8 @@ start_journal_io:
err = journal_finish_inode_data_buffers(journal, commit_transaction);
if (err) {
printk(KERN_WARNING
"JBD2: Detected IO errors while flushing file data "
"on %s\n", journal->j_devname);
if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
jbd2_journal_abort(journal, err);
"JBD2: Detected IO errors %d while flushing file data on %s\n",
err, journal->j_devname);
err = 0;
}


@ -603,7 +603,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
int ret = 0;
transaction_t *commit_trans;
transaction_t *commit_trans, *running_trans;
if (!(journal->j_flags & JBD2_BARRIER))
return 0;
@ -613,6 +613,16 @@ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
goto out;
commit_trans = journal->j_committing_transaction;
if (!commit_trans || commit_trans->t_tid != tid) {
running_trans = journal->j_running_transaction;
/*
* The query transaction hasn't started committing,
* it must still be running.
*/
if (WARN_ON_ONCE(!running_trans ||
running_trans->t_tid != tid))
goto out;
running_trans->t_need_data_flush = 1;
ret = 1;
goto out;
}
@ -947,7 +957,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* descriptor blocks we do need to generate bona fide buffers.
*
* After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
* the buffer's contents they really should run flush_dcache_page(bh->b_page).
* the buffer's contents they really should run flush_dcache_folio(bh->b_folio).
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
@ -1361,7 +1371,7 @@ static int journal_check_superblock(journal_t *journal)
return err;
}
if (jbd2_journal_has_csum_v2or3_feature(journal) &&
if (jbd2_journal_has_csum_v2or3(journal) &&
jbd2_has_feature_checksum(journal)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
@ -1369,7 +1379,7 @@ static int journal_check_superblock(journal_t *journal)
return err;
}
if (jbd2_journal_has_csum_v2or3_feature(journal)) {
if (jbd2_journal_has_csum_v2or3(journal)) {
if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) {
printk(KERN_ERR "JBD2: Unknown checksum type\n");
return err;
@ -1869,7 +1879,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
WARN_ON(!sb->s_sequence);
journal->j_flags &= ~JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
@ -1965,17 +1974,15 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
return err;
}
if (block_start == ~0ULL) {
block_start = phys_block;
block_stop = block_start - 1;
}
if (block_start == ~0ULL)
block_stop = block_start = phys_block;
/*
* last block not contiguous with current block,
* process last contiguous region and return to this block on
* next loop
*/
if (phys_block != block_stop + 1) {
if (phys_block != block_stop) {
block--;
} else {
block_stop++;
@ -1994,11 +2001,10 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
*/
byte_start = block_start * journal->j_blocksize;
byte_stop = block_stop * journal->j_blocksize;
byte_count = (block_stop - block_start + 1) *
journal->j_blocksize;
byte_count = (block_stop - block_start) * journal->j_blocksize;
truncate_inode_pages_range(journal->j_dev->bd_mapping,
byte_start, byte_stop);
byte_start, byte_stop - 1);
if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
err = blkdev_issue_discard(journal->j_dev,
@ -2013,7 +2019,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
}
if (unlikely(err != 0)) {
pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)",
err, block_start, block_stop);
return err;
}
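
Editorial note: the erase hunks above switch block_stop from "last block in the run" to "one past the last block", so the run is a half-open interval [block_start, block_stop) and the byte_count and truncate-range calculations no longer need a +1. A small standalone sketch of the arithmetic with made-up numbers (blocks 100 through 103 contiguous, 4 KiB blocks):

    /* Worked example of the half-open block range used after the off-by-one
     * fix: block_stop now points one past the last contiguous block. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long long block_start = 100;  /* assumed values for illustration */
        unsigned long long block_stop  = 104;  /* one past the last block erased */
        unsigned long blocksize = 4096;

        unsigned long long byte_start = block_start * blocksize;
        unsigned long long byte_stop  = block_stop * blocksize;
        unsigned long long byte_count = (block_stop - block_start) * blocksize;

        /* truncate_inode_pages_range() takes an inclusive end, hence byte_stop - 1. */
        printf("erase blocks [%llu, %llu): %llu bytes, page range [%llu, %llu]\n",
               block_start, block_stop, byte_count, byte_start, byte_stop - 1);
        return 0;
    }
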


@ -39,7 +39,7 @@ struct recovery_info
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
@ -65,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n)
*/
#define MAXBUF 8
static int do_readahead(journal_t *journal, unsigned int start)
static void do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
unsigned long long blocknr;
struct buffer_head *bh;
@ -85,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
nbufs = 0;
for (next = start; next < max; next++) {
err = jbd2_journal_bmap(journal, next, &blocknr);
int err = jbd2_journal_bmap(journal, next, &blocknr);
if (err) {
printk(KERN_ERR "JBD2: bad block at offset %u\n",
@ -94,10 +93,8 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh) {
err = -ENOMEM;
if (!bh)
goto failed;
}
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
@ -112,12 +109,10 @@ static int do_readahead(journal_t *journal, unsigned int start)
if (nbufs)
bh_readahead_batch(nbufs, bufs, 0);
err = 0;
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
return err;
}
#endif /* __KERNEL__ */
@ -287,19 +282,20 @@ static int fc_do_one_pass(journal_t *journal,
int jbd2_journal_recover(journal_t *journal)
{
int err, err2;
journal_superblock_t * sb;
struct recovery_info info;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
* unmounted.
* unmounted. We use its in-memory version j_tail here because
* jbd2_journal_wipe() could have updated it without updating journal
* superblock.
*/
if (!sb->s_start) {
if (!journal->j_tail) {
journal_superblock_t *sb = journal->j_superblock;
jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n",
be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
@ -327,6 +323,12 @@ int jbd2_journal_recover(journal_t *journal)
journal->j_transaction_sequence, journal->j_head);
jbd2_journal_clear_revoke(journal);
/* Free revoke table allocated for replay */
if (journal->j_revoke != journal->j_revoke_table[0] &&
journal->j_revoke != journal->j_revoke_table[1]) {
jbd2_journal_destroy_revoke_table(journal->j_revoke);
journal->j_revoke = journal->j_revoke_table[1];
}
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
@ -612,6 +614,31 @@ static int do_one_pass(journal_t *journal,
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
else if (pass == PASS_REVOKE) {
/*
* Would the default revoke table have too long hash chains
* during replay?
*/
if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) {
unsigned int hash_size;
/*
* Aim for average chain length of 8, limit at 1M
* entries to avoid problems with malicious
* filesystems.
*/
hash_size = min(roundup_pow_of_two(info->nr_revokes / 8),
1U << 20);
journal->j_revoke =
jbd2_journal_init_revoke_table(hash_size);
if (!journal->j_revoke) {
printk(KERN_ERR
"JBD2: failed to allocate revoke table for replay with %u entries. "
"Journal replay may be slow.\n", hash_size);
journal->j_revoke = journal->j_revoke_table[1];
}
}
}
jbd2_debug(1, "Starting recovery pass %d\n", pass);
@ -851,6 +878,13 @@ chksum_ok:
continue;
case JBD2_REVOKE_BLOCK:
/*
* If we aren't in the SCAN or REVOKE pass, then we can
* just skip over this block.
*/
if (pass != PASS_REVOKE && pass != PASS_SCAN)
continue;
/*
* Check revoke block crc in pass_scan, if csum verify
* failed, check commit block time later.
@ -863,12 +897,7 @@ chksum_ok:
need_check_commit_time = true;
}
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE)
continue;
err = scan_revoke_records(journal, bh,
err = scan_revoke_records(journal, pass, bh,
next_commit_ID, info);
if (err)
goto failed;
@ -922,8 +951,9 @@ chksum_ok:
/* Scan a revoke record, marking all blocks mentioned as revoked. */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
tid_t sequence, struct recovery_info *info)
static int scan_revoke_records(journal_t *journal, enum passtype pass,
struct buffer_head *bh, tid_t sequence,
struct recovery_info *info)
{
jbd2_journal_revoke_header_t *header;
int offset, max;
@ -944,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
if (jbd2_has_feature_64bit(journal))
record_len = 8;
if (pass == PASS_SCAN) {
info->nr_revokes += (max - offset) / record_len;
return 0;
}
while (offset + record_len <= max) {
unsigned long long blocknr;
int err;
@ -956,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
err = jbd2_journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
++info->nr_revokes;
}
return 0;
}


@ -215,7 +215,7 @@ int __init jbd2_journal_init_revoke_table_cache(void)
return 0;
}
static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
{
int shift = 0;
int tmp = hash_size;
@ -231,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
table->hash_size = hash_size;
table->hash_shift = shift;
table->hash_table =
kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
if (!table->hash_table) {
kmem_cache_free(jbd2_revoke_table_cache, table);
table = NULL;
@ -245,7 +245,7 @@ out:
return table;
}
static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
{
int i;
struct list_head *hash_list;
@ -255,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
J_ASSERT(list_empty(hash_list));
}
kfree(table->hash_table);
kvfree(table->hash_table);
kmem_cache_free(jbd2_revoke_table_cache, table);
}
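
Editorial note: switching the bucket array to kvmalloc_array()/kvfree() matters because the replay path may now request a much larger table than the default; with the 1M-bucket cap and two-pointer list heads on a 64-bit kernel (an assumption for this sketch), that is a 16 MiB allocation, better served by the vmalloc fallback than by plain kmalloc. A quick sizing check:

    /* Rough sizing of the replay revoke hash table allocation;
     * sizeof(struct list_head) is taken as 16 bytes (64-bit assumption). */
    #include <stdio.h>

    int main(void)
    {
        unsigned long long buckets = 1ULL << 20;   /* the cap used during replay */
        unsigned long long list_head_size = 16;    /* two pointers on 64-bit */
        unsigned long long bytes = buckets * list_head_size;

        printf("%llu buckets * %llu bytes = %llu MiB\n",
               buckets, list_head_size, bytes >> 20);
        return 0;
    }
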
@ -420,12 +420,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*/
int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd2_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
@ -450,7 +449,6 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(jbd2_revoke_record_cache, record);
did_revoke = 1;
}
}
@ -473,11 +471,10 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
__brelse(bh2);
}
}
return did_revoke;
}
/*
* journal_clear_revoked_flag clears revoked flag of buffers in
* jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in
* revoke table to reflect there is no revoked buffers in the next
* transaction which is going to be started.
*/
@ -506,9 +503,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
}
}
/* journal_switch_revoke table select j_revoke for next transaction
* we do not want to suspend any processing until all revokes are
* written -bzzz
/* jbd2_journal_switch_revoke_table table select j_revoke for next
* transaction we do not want to suspend any processing until all
* revokes are written -bzzz
*/
void jbd2_journal_switch_revoke_table(journal_t *journal)
{


@ -92,7 +92,6 @@ static void jbd2_get_transaction(journal_t *journal,
atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
@ -114,12 +113,9 @@ static void jbd2_get_transaction(journal_t *journal,
*/
/*
* Update transaction's maximum wait time, if debugging is enabled.
*
* t_max_wait is carefully updated here with use of atomic compare exchange.
* Note that there could be multiplre threads trying to do this simultaneously
* hence using cmpxchg to avoid any use of locks in this case.
* With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
*/
static inline void update_t_max_wait(transaction_t *transaction,
unsigned long ts)
@ -2079,21 +2075,6 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
jh->b_transaction = NULL;
}
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
/* Get reference so that buffer cannot be freed before we unlock it */
get_bh(bh);
spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
__jbd2_journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
__brelse(bh);
}
/**
* jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
@ -2192,7 +2173,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
/*
* We don't want to write the buffer anymore, clear the
* bit so that we don't confuse checks in
* __journal_file_buffer
* __jbd2_journal_file_buffer
*/
clear_buffer_dirty(bh);
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);


@ -1248,11 +1248,19 @@ extern int send_sigurg(struct file *file);
#define SB_NOUSER BIT(31)
/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL (1 << 0)
#define SB_ENC_STRICT_MODE_FL (1 << 0)
#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1)
#define sb_has_strict_encoding(sb) \
(sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
#if IS_ENABLED(CONFIG_UNICODE)
#define sb_no_casefold_compat_fallback(sb) \
(sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
#else
#define sb_no_casefold_compat_fallback(sb) (1)
#endif
/*
* Umount options
*/
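
Editorial note: the new SB_ENC_NO_COMPAT_FALLBACK_FL bit and its sb_no_casefold_compat_fallback() predicate let a casefolding filesystem opt out of the compatibility fallback for names stored while zero-width characters were being ignored; without CONFIG_UNICODE the predicate is constant 1. A minimal usage sketch with a stand-in super_block (the flag values mirror the hunk above, but the struct here is illustration only):

    /* Minimal sketch of the new predicate with a stand-in super_block; the
     * real macro tests sb->s_encoding_flags on struct super_block. */
    #include <stdio.h>

    #define SB_ENC_STRICT_MODE_FL        (1 << 0)
    #define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1)

    struct fake_super_block { unsigned int s_encoding_flags; };

    #define sb_no_casefold_compat_fallback(sb) \
        ((sb)->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)

    int main(void)
    {
        struct fake_super_block sb = {
            .s_encoding_flags = SB_ENC_NO_COMPAT_FALLBACK_FL,
        };

        if (sb_no_casefold_compat_fallback(&sb))
            printf("skip the zero-width-character compatibility fallback\n");
        return 0;
    }
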


@ -459,7 +459,6 @@ struct jbd2_revoke_table_s;
* @h_ref: Reference count on this handle.
* @h_err: Field for caller's use to track errors through large fs operations.
* @h_sync: Flag for sync-on-close.
* @h_jdata: Flag to force data journaling.
* @h_reserved: Flag for handle for reserved credits.
* @h_aborted: Flag indicating fatal error on handle.
* @h_type: For handle statistics.
@ -491,7 +490,6 @@ struct jbd2_journal_handle
/* Flags [no locking] */
unsigned int h_sync: 1;
unsigned int h_jdata: 1;
unsigned int h_reserved: 1;
unsigned int h_aborted: 1;
unsigned int h_type: 8;
@ -700,12 +698,6 @@ struct transaction_s
/* Disk flush needs to be sent to fs partition [no locking] */
int t_need_data_flush;
/*
* For use by the filesystem to store fs-specific data
* structures associated with the transaction
*/
struct list_head t_private_list;
};
struct transaction_run_stats_s {
@ -1388,9 +1380,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT)
#define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */
#define JBD2_LOADED 0x010 /* The journal superblock has been loaded */
#define JBD2_BARRIER 0x020 /* Use IDE barriers */
#define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file
* data write error in ordered
* mode */
#define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on
* clean and empty filesystem
* logging area */
@ -1407,7 +1396,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT)
*/
/* Filing buffers */
extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
extern bool __jbd2_journal_refile_buffer(struct journal_head *);
extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
@ -1627,10 +1615,12 @@ extern void jbd2_journal_destroy_revoke_record_cache(void);
extern void jbd2_journal_destroy_revoke_table_cache(void);
extern int __init jbd2_journal_init_revoke_record_cache(void);
extern int __init jbd2_journal_init_revoke_table_cache(void);
struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size);
void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table);
extern void jbd2_journal_destroy_revoke(journal_t *);
extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
extern void jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
extern void jbd2_journal_write_revoke_records(transaction_t *transaction,
struct list_head *log_bufs);
@ -1736,14 +1726,10 @@ static inline int tid_geq(tid_t x, tid_t y)
extern int jbd2_journal_blocks_per_page(struct inode *inode);
extern size_t journal_tag_bytes(journal_t *journal);
static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j)
{
return jbd2_has_feature_csum2(j) || jbd2_has_feature_csum3(j);
}
static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
{
return jbd2_journal_has_csum_v2or3_feature(journal);
return jbd2_has_feature_csum2(journal) ||
jbd2_has_feature_csum3(journal);
}
static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb)