]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Apr 2009 17:57:49 +0000 (10:57 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Apr 2009 17:57:49 +0000 (10:57 -0700)
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (33 commits)
  ext4: Regularize mount options
  ext4: fix locking typo in mballoc which could cause soft lockup hangs
  ext4: fix typo which causes a memory leak on error path
  jbd2: Update locking coments
  ext4: Rename pa_linear to pa_type
  ext4: add checks of block references for non-extent inodes
  ext4: Check for an valid i_mode when reading the inode from disk
  ext4: Use WRITE_SYNC for commits which are caused by fsync()
  ext4: Add auto_da_alloc mount option
  ext4: Use struct flex_groups to calculate get_orlov_stats()
  ext4: Use atomic_t's in struct flex_groups
  ext4: remove /proc tuning knobs
  ext4: Add sysfs support
  ext4: Track lifetime disk writes
  ext4: Fix discard of inode prealloc space with delayed allocation.
  ext4: Automatically allocate delay allocated blocks on rename
  ext4: Automatically allocate delay allocated blocks on close
  ext4: add EXT4_IOC_ALLOC_DA_BLKS ioctl
  ext4: Simplify delalloc code by removing mpage_da_writepages()
  ext4: Save stack space by removing fake buffer heads
  ...

23 files changed:
Documentation/ABI/testing/sysfs-fs-ext4 [new file with mode: 0644]
Documentation/filesystems/ext4.txt
Documentation/filesystems/proc.txt
fs/ext4/balloc.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/ext4_i.h
fs/ext4/ext4_sb.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/namei.c
fs/ext4/resize.c
fs/ext4/super.c
fs/jbd2/commit.c
fs/jbd2/revoke.c
fs/jbd2/transaction.c
include/linux/jbd2.h

diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644 (file)
index 0000000..4e79074
--- /dev/null
@@ -0,0 +1,81 @@
+What:          /sys/fs/ext4/<disk>/mb_stats
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+                Controls whether the multiblock allocator should
+                collect statistics, which are shown during the unmount.
+                1 means to collect statistics, 0 means not to collect
+                statistics
+
+What:          /sys/fs/ext4/<disk>/mb_group_prealloc
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               The multiblock allocator will round up allocation
+               requests to a multiple of this tuning parameter if the
+               stripe size is not set in the ext4 superblock
+
+What:          /sys/fs/ext4/<disk>/mb_max_to_scan
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               The maximum number of extents the multiblock allocator
+               will search to find the best extent
+
+What:          /sys/fs/ext4/<disk>/mb_min_to_scan
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               The minimum number of extents the multiblock allocator
+               will search to find the best extent
+
+What:          /sys/fs/ext4/<disk>/mb_order2_req
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               Tuning parameter which controls the minimum size for 
+               requests (as a power of 2) where the buddy cache is
+               used
+
+What:          /sys/fs/ext4/<disk>/mb_stream_req
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               Files which have fewer blocks than this tunable
+               parameter will have their blocks allocated out of a
+               block group specific preallocation pool, so that small
+               files are packed closely together.  Each large file
+                will have its blocks allocated out of its own unique
+                preallocation pool.
+
+What:          /sys/fs/ext4/<disk>/inode_readahead
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               Tuning parameter which controls the maximum number of
+               inode table blocks that ext4's inode table readahead
+               algorithm will pre-read into the buffer cache
+
+What:          /sys/fs/ext4/<disk>/delayed_allocation_blocks
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               This file is read-only and shows the number of blocks
+               that are dirty in the page cache, but which do not
+               have their location in the filesystem allocated yet.
+
+What:          /sys/fs/ext4/<disk>/lifetime_write_kbytes
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               This file is read-only and shows the number of kilobytes
+               of data that have been written to this filesystem since it was
+               created.
+
+What:          /sys/fs/ext4/<disk>/session_write_kbytes
+Date:          March 2008
+Contact:       "Theodore Ts'o" <tytso@mit.edu>
+Description:
+               This file is read-only and shows the number of
+               kilobytes of data that have been written to this
+               filesystem since it was mounted.
index cec829bc7291623da6189de984fedd753ce51450..97882df0486567c229bb0ee867752da15b24089f 100644 (file)
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
 * extent format more robust in face of on-disk corruption due to magics,
 * internal redundancy in tree
 * improved file allocation (multi-block alloc)
-* fix 32000 subdirectory limit
+* lift 32000 subdirectory limit imposed by i_links_count[1]
 * nsec timestamps for mtime, atime, ctime, create time
 * inode version field on disk (NFSv4, Lustre)
 * reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
 * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
   the ordering)
 
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
 2.2 Candidate features for future inclusion
 
 * Online defrag (patches available but not well tested)
@@ -180,8 +183,8 @@ commit=nrsec        (*)     Ext4 can be told to sync all its data and metadata
                        performance.
 
 barrier=<0|1(*)>       This enables/disables the use of write barriers in
-                       the jbd code.  barrier=0 disables, barrier=1 enables.
-                       This also requires an IO stack which can support
+barrier(*)             the jbd code.  barrier=0 disables, barrier=1 enables.
+nobarrier              This also requires an IO stack which can support
                        barriers, and if jbd gets an error on a barrier
                        write, it will disable again with a warning.
                        Write barriers enforce proper on-disk ordering
@@ -189,6 +192,9 @@ barrier=<0|1(*)>    This enables/disables the use of write barriers in
                        safe to use, at some performance penalty.  If
                        your disks are battery-backed in one way or another,
                        disabling barriers may safely improve performance.
+                       The mount options "barrier" and "nobarrier" can
+                       also be used to enable or disable barriers, for
+                       consistency with other ext4 mount options.
 
 inode_readahead=n      This tuning parameter controls the maximum
                        number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio        The I/O priority (from 0 to 7, where 0 is the
                        a slightly higher priority than the default I/O
                        priority.
 
+auto_da_alloc(*)       Many broken applications don't use fsync() when 
+noauto_da_alloc                replacing existing files via patterns such as
+                       fd = open("foo.new")/write(fd,..)/close(fd)/
+                       rename("foo.new", "foo"), or worse yet,
+                       fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+                       If auto_da_alloc is enabled, ext4 will detect
+                       the replace-via-rename and replace-via-truncate
+                       patterns and force that any delayed allocation
+                       blocks are allocated such that at the next
+                       journal commit, in the default data=ordered
+                       mode, the data blocks of the new file are forced
+                       to disk before the rename() operation is
+                       commited.  This provides roughly the same level
+                       of guarantees as ext3, and avoids the
+                       "zero-length" problem that can happen when a
+                       system crashes before the delayed allocation
+                       blocks are forced to disk.
+
 Data Mode
 =========
 There are 3 different data modes:
index 830bad7cce0f946ba356e7aff283775d270063f1..efc4fd9f40cea608a8a67fb1c498552a67468e42 100644 (file)
@@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
  File            Content                                        
  mb_groups       details of multiblock allocator buddy cache of free blocks
  mb_history      multiblock allocation history
- stats           controls whether the multiblock allocator should start
-                 collecting statistics, which are shown during the unmount
- group_prealloc  the multiblock allocator will round up allocation
-                 requests to a multiple of this tuning parameter if the
-                 stripe size is not set in the ext4 superblock
- max_to_scan     The maximum number of extents the multiblock allocator
-                 will search to find the best extent
- min_to_scan     The minimum number of extents the multiblock allocator
-                 will search to find the best extent
- order2_req      Tuning parameter which controls the minimum size for 
-                 requests (as a power of 2) where the buddy cache is
-                 used
- stream_req      Files which have fewer blocks than this tunable
-                 parameter will have their blocks allocated out of a
-                 block group specific preallocation pool, so that small
-                 files are packed closely together.  Each large file
-                 will have its blocks allocated out of its own unique
-                 preallocation pool.
-inode_readahead  Tuning parameter which controls the maximum number of
-                 inode table blocks that ext4's inode table readahead
-                 algorithm will pre-read into the buffer cache
 ..............................................................................
 
 
index 38f40d55899c81e3f483c5237989bd68cf134db3..53c72ad85877314c16c0eef0f19fb3889a6743da 100644 (file)
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-                               ext4_group_t block_group)
+                                      ext4_group_t block_group,
+                                      struct ext4_group_desc *gdp)
 {
        ext4_fsblk_t tmp;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
        int used_blocks = sbi->s_itb_per_group + 2;
 
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-               struct ext4_group_desc *gdp;
-               struct buffer_head *bh;
-
-               gdp = ext4_get_group_desc(sb, block_group, &bh);
                if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
                                        block_group))
                        used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 */
                mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
        }
-       return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+       return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
 
 
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               spin_lock(sb_bgl_lock(sbi, flex_group));
-               sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
-               spin_unlock(sb_bgl_lock(sbi, flex_group));
+               atomic_add(blocks_freed,
+                          &sbi->s_flex_groups[flex_group].free_blocks);
        }
        /*
         * request to reload the buddy with the
index 2df2e40b01af0cc031d4546306079c0d4a6669ac..b64789929a65bc00243e9b10298f056aadaeb306 100644 (file)
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
                         unsigned int offset)
 {
        const char *error_msg = NULL;
-       const int rlen = ext4_rec_len_from_disk(de->rec_len);
+       const int rlen = ext4_rec_len_from_disk(de->rec_len,
+                                               dir->i_sb->s_blocksize);
 
        if (rlen < EXT4_DIR_REC_LEN(1))
                error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
                                 * least that it is non-zero.  A
                                 * failure will be detected in the
                                 * dirent test below. */
-                               if (ext4_rec_len_from_disk(de->rec_len)
-                                               < EXT4_DIR_REC_LEN(1))
+                               if (ext4_rec_len_from_disk(de->rec_len,
+                                       sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
                                        break;
-                               i += ext4_rec_len_from_disk(de->rec_len);
+                               i += ext4_rec_len_from_disk(de->rec_len,
+                                                           sb->s_blocksize);
                        }
                        offset = i;
                        filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
                                ret = stored;
                                goto out;
                        }
-                       offset += ext4_rec_len_from_disk(de->rec_len);
+                       offset += ext4_rec_len_from_disk(de->rec_len,
+                                       sb->s_blocksize);
                        if (le32_to_cpu(de->inode)) {
                                /* We might block in the next section
                                 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
                                        goto revalidate;
                                stored++;
                        }
-                       filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+                       filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+                                               sb->s_blocksize);
                }
                offset = 0;
                brelse(bh);
index 990c9400092414ed6cedeffe724afe103d116efc..d0f15ef56de1b1b6b325d0a86207e78260b1f40a 100644 (file)
  */
 #undef EXT4FS_DEBUG
 
-/*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS    8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS                1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
 /*
  * Debug code
  */
@@ -54,8 +46,6 @@
 #define ext4_debug(f, a...)    do {} while (0)
 #endif
 
-#define EXT4_MULTIBLOCK_ALLOCATOR      1
-
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE             1
 /* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
  */
 
 struct flex_groups {
-       __u32 free_inodes;
-       __u32 free_blocks;
+       atomic_t free_inodes;
+       atomic_t free_blocks;
+       atomic_t used_dirs;
 };
 
 #define EXT4_BG_INODE_UNINIT   0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
 #define EXT4_FL_USER_VISIBLE           0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE                0x000B80FF /* User modifiable flags */
 
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+                          EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+                          EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+                          EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+                          EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+       if (S_ISDIR(mode))
+               return flags;
+       else if (S_ISREG(mode))
+               return flags & EXT4_REG_FLMASK;
+       else
+               return flags & EXT4_OTHER_FLMASK;
+}
+
 /*
  * Inode dynamic state flags
  */
@@ -256,6 +271,7 @@ struct flex_groups {
 #define EXT4_STATE_NEW                 0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR               0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND           0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE      0x00000010 /* Alloc DA blks on close */
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GROUP_EXTEND          _IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD             _IOW('f', 8, struct ext4_new_group_input)
 #define EXT4_IOC_MIGRATE               _IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS         _IO('f', 12)
 
 /*
  * ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do {                                                                               \
 #define EXT4_MOUNT_NO_UID32            0x02000  /* Disable 32-bit UIDs */
 #define EXT4_MOUNT_XATTR_USER          0x04000 /* Extended user attributes */
 #define EXT4_MOUNT_POSIX_ACL           0x08000 /* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION         0x10000 /* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC    0x10000 /* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER             0x20000 /* Use block barriers */
 #define EXT4_MOUNT_NOBH                        0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA               0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
        __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
        __u8    s_reserved_char_pad2;
        __le16  s_reserved_pad;
-       __u32   s_reserved[162];        /* Padding to the end of the block */
+       __le64  s_kbytes_written;       /* nr of lifetime kilobytes written */
+       __u32   s_reserved[160];        /* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -813,6 +832,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEF_MIN_BATCH_TIME        0
 #define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */
 
+/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME        4
+
 /*
  * Structure of a directory entry
  */
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
                                         ~EXT4_DIR_ROUND)
 #define EXT4_MAX_REC_LEN               ((1<<16)-1)
 
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
-       unsigned len = le16_to_cpu(dlen);
-
-       if (len == EXT4_MAX_REC_LEN || len == 0)
-               return 1 << 16;
-       return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
-       if (len == (1 << 16))
-               return cpu_to_le16(EXT4_MAX_REC_LEN);
-       else if (len > (1 << 16))
-               BUG();
-       return cpu_to_le16(len);
-}
-
 /*
  * Hash Tree Directory indexing
  * (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 
 extern struct proc_dir_entry *ext4_proc_root;
 
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define        EXT4_PROC_HANDLER(name, var)                                    \
-do {                                                                   \
-       proc = proc_create_data(name, mode, sbi->s_proc,                \
-                               &ext4_ui_proc_fops, &sbi->s_##var);     \
-       if (proc == NULL) {                                             \
-               printk(KERN_ERR "EXT4-fs: can't create %s\n", name);    \
-               goto err_out;                                           \
-       }                                                               \
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
 /*
  * Function prototypes
  */
@@ -1092,6 +1083,7 @@ extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
+
 /* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
index 18cb67b2cbbc85c46b8c6e09efbeb13d1513b618..f0c3ec85bd488a83cfc31f8bb4f601306112e787 100644 (file)
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
                                                ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
 
index e69acc16f5c4300be5b5cd641f515fba320a98db..4ce2187123aad818de707f4e82a988fdb8ee14a3 100644 (file)
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
 /*
  * storage for cached extent
  */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;
 
+       /* ialloc */
+       ext4_group_t    i_last_alloc_group;
+
        /* allocation reservation info for delalloc */
        unsigned int i_reserved_data_blocks;
        unsigned int i_reserved_meta_blocks;
index 039b6ea1a0429b30e5e95253b42ed83d24494869..57b71fefbccf753fb05e2436eea4da2da7663b3a 100644 (file)
@@ -62,12 +62,10 @@ struct ext4_sb_info {
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyblocks_counter;
-       struct blockgroup_lock s_blockgroup_lock;
+       struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
-
-       /* root of the per fs reservation window tree */
-       spinlock_t s_rsv_window_lock;
-       struct rb_root s_rsv_window_root;
+       struct kobject s_kobj;
+       struct completion s_kobj_unregister;
 
        /* Journaling */
        struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
        /* locality groups */
        struct ext4_locality_group *s_locality_groups;
 
+       /* for write statistics */
+       unsigned long s_sectors_written_start;
+       u64 s_kbytes_written;
+
        unsigned int s_log_groups_per_flex;
        struct flex_groups *s_flex_groups;
 };
@@ -153,7 +155,7 @@ struct ext4_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
 {
-       return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+       return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 
 #endif /* _EXT4_SB */
index e0aa4fe4f596fcb11549de818436edced6887f3b..ac77d8b8251da9cb6ac720db55e8b7e72ec97a91 100644 (file)
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
+       ext4_group_t block_group;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
        int depth;
 
        if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        }
 
        /* OK. use inode's group */
-       bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+       block_group = ei->i_block_group;
+       if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+               /*
+                * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+                * block groups per flexgroup, reserve the first block 
+                * group for directories and special files.  Regular 
+                * files will start at the second block group.  This
+                * tends to speed up directory access and improves 
+                * fsck times.
+                */
+               block_group &= ~(flex_size-1);
+               if (S_ISREG(inode->i_mode))
+                       block_group++;
+       }
+       bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
                le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+       /*
+        * If we are doing delayed allocation, we don't need take
+        * colour into account.
+        */
+       if (test_opt(inode->i_sb, DELALLOC))
+               return bg_start;
+
        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
        return max;
 }
 
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+       ext4_fsblk_t block = ext_pblock(ext);
+       int len = ext4_ext_get_actual_len(ext);
+       struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+       if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+                       ((block + len) > ext4_blocks_count(es))))
+               return 0;
+       else
+               return 1;
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+                               struct ext4_extent_idx *ext_idx)
+{
+       ext4_fsblk_t block = idx_pblock(ext_idx);
+       struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+       if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+                       (block > ext4_blocks_count(es))))
+               return 0;
+       else
+               return 1;
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+                               struct ext4_extent_header *eh,
+                               int depth)
+{
+       struct ext4_extent *ext;
+       struct ext4_extent_idx *ext_idx;
+       unsigned short entries;
+       if (eh->eh_entries == 0)
+               return 1;
+
+       entries = le16_to_cpu(eh->eh_entries);
+
+       if (depth == 0) {
+               /* leaf entries */
+               ext = EXT_FIRST_EXTENT(eh);
+               while (entries) {
+                       if (!ext4_valid_extent(inode, ext))
+                               return 0;
+                       ext++;
+                       entries--;
+               }
+       } else {
+               ext_idx = EXT_FIRST_INDEX(eh);
+               while (entries) {
+                       if (!ext4_valid_extent_idx(inode, ext_idx))
+                               return 0;
+                       ext_idx++;
+                       entries--;
+               }
+       }
+       return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
                                        struct ext4_extent_header *eh,
                                        int depth)
 {
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
                error_msg = "invalid eh_entries";
                goto corrupted;
        }
+       if (!ext4_valid_extent_entries(inode, eh, depth)) {
+               error_msg = "invalid extent entries";
+               goto corrupted;
+       }
        return 0;
 
 corrupted:
        ext4_error(inode->i_sb, function,
-                       "bad header in inode #%lu: %s - magic %x, "
+                       "bad header/extent in inode #%lu: %s - magic %x, "
                        "entries %u, max %u(%u), depth %u(%u)",
                        inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
                        le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
        return -EIO;
 }
 
-#define ext4_ext_check_header(inode, eh, depth)        \
-       __ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth)       \
+       __ext4_ext_check(__func__, inode, eh, depth)
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+       return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
 
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 
        eh = ext_inode_hdr(inode);
        depth = ext_depth(inode);
-       if (ext4_ext_check_header(inode, eh, depth))
-               return ERR_PTR(-EIO);
-
 
        /* account possible depth increase */
        if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        i = depth;
        /* walk through the tree */
        while (i) {
+               int need_to_validate = 0;
+
                ext_debug("depth %d: num %d, max %d\n",
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;
 
-               bh = sb_bread(inode->i_sb, path[ppos].p_block);
-               if (!bh)
+               bh = sb_getblk(inode->i_sb, path[ppos].p_block);
+               if (unlikely(!bh))
                        goto err;
-
+               if (!bh_uptodate_or_lock(bh)) {
+                       if (bh_submit_read(bh) < 0) {
+                               put_bh(bh);
+                               goto err;
+                       }
+                       /* validate the extent entries */
+                       need_to_validate = 1;
+               }
                eh = ext_block_hdr(bh);
                ppos++;
                BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                path[ppos].p_hdr = eh;
                i--;
 
-               if (ext4_ext_check_header(inode, eh, i))
+               if (need_to_validate && ext4_ext_check(inode, eh, i))
                        goto err;
        }
 
@@ -1181,7 +1276,7 @@ got_index:
                        return -EIO;
                eh = ext_block_hdr(bh);
                /* subtract from p_depth to get proper eh_depth */
-               if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+               if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
                        put_bh(bh);
                        return -EIO;
                }
@@ -1194,7 +1289,7 @@ got_index:
        if (bh == NULL)
                return -EIO;
        eh = ext_block_hdr(bh);
-       if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+       if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
                put_bh(bh);
                return -EIO;
        }
@@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
                return -ENOMEM;
        }
        path[0].p_hdr = ext_inode_hdr(inode);
-       if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
+       if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
                err = -EIO;
                goto out;
        }
@@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
                                err = -EIO;
                                break;
                        }
-                       if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+                       if (ext4_ext_check(inode, ext_block_hdr(bh),
                                                        depth - i - 1)) {
                                err = -EIO;
                                break;
index f731cb545a0359cc664de79a3fe9ab6c7ca4f3b8..588af8c77246f71b9a2b088602dba33b88c43f52 100644 (file)
  */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
+       if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+               ext4_alloc_da_blocks(inode);
+               EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+       }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
-                       (atomic_read(&inode->i_writecount) == 1))
+                       (atomic_read(&inode->i_writecount) == 1) &&
+                       !EXT4_I(inode)->i_reserved_data_blocks)
        {
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);
index fb51b40e3e8f5a60960d8584815f390d71ae1124..47b84e8df56859ca1e741cfb6543b5966ee45657 100644 (file)
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
        int fatal = 0, err, count, cleared;
-       ext4_group_t flex_group;
 
        if (atomic_read(&inode->i_count) > 1) {
                printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                        if (is_directory) {
                                count = ext4_used_dirs_count(sb, gdp) - 1;
                                ext4_used_dirs_set(sb, gdp, count);
+                               if (sbi->s_log_groups_per_flex) {
+                                       ext4_group_t f;
+
+                                       f = ext4_flex_group(sbi, block_group);
+                                       atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+                               }
+
                        }
                        gdp->bg_checksum = ext4_group_desc_csum(sbi,
                                                        block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                                percpu_counter_dec(&sbi->s_dirs_counter);
 
                        if (sbi->s_log_groups_per_flex) {
-                               flex_group = ext4_flex_group(sbi, block_group);
-                               spin_lock(sb_bgl_lock(sbi, flex_group));
-                               sbi->s_flex_groups[flex_group].free_inodes++;
-                               spin_unlock(sb_bgl_lock(sbi, flex_group));
+                               ext4_group_t f;
+
+                               f = ext4_flex_group(sbi, block_group);
+                               atomic_inc(&sbi->s_flex_groups[f].free_inodes);
                        }
                }
                BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
                sbi->s_log_groups_per_flex;
 
 find_close_to_parent:
-       flexbg_free_blocks = flex_group[best_flex].free_blocks;
+       flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
        flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-       if (flex_group[best_flex].free_inodes &&
+       if (atomic_read(&flex_group[best_flex].free_inodes) &&
            flex_freeb_ratio > free_block_ratio)
                goto found_flexbg;
 
@@ -375,24 +381,24 @@ find_close_to_parent:
                if (i == parent_fbg_group || i == parent_fbg_group - 1)
                        continue;
 
-               flexbg_free_blocks = flex_group[i].free_blocks;
+               flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
                flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
 
                if (flex_freeb_ratio > free_block_ratio &&
-                   flex_group[i].free_inodes) {
+                   (atomic_read(&flex_group[i].free_inodes))) {
                        best_flex = i;
                        goto found_flexbg;
                }
 
-               if (flex_group[best_flex].free_inodes == 0 ||
-                   (flex_group[i].free_blocks >
-                    flex_group[best_flex].free_blocks &&
-                    flex_group[i].free_inodes))
+               if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
+                   ((atomic_read(&flex_group[i].free_blocks) >
+                     atomic_read(&flex_group[best_flex].free_blocks)) &&
+                    atomic_read(&flex_group[i].free_inodes)))
                        best_flex = i;
        }
 
-       if (!flex_group[best_flex].free_inodes ||
-           !flex_group[best_flex].free_blocks)
+       if (!atomic_read(&flex_group[best_flex].free_inodes) ||
+           !atomic_read(&flex_group[best_flex].free_blocks))
                return -1;
 
 found_flexbg:
@@ -410,6 +416,42 @@ out:
        return 0;
 }
 
+struct orlov_stats {
+       __u32 free_inodes;
+       __u32 free_blocks;
+       __u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg.  If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+                      int flex_size, struct orlov_stats *stats)
+{
+       struct ext4_group_desc *desc;
+       struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
+
+       if (flex_size > 1) {
+               stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
+               stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+               stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+               return;
+       }
+
+       desc = ext4_get_group_desc(sb, g, NULL);
+       if (desc) {
+               stats->free_inodes = ext4_free_inodes_count(sb, desc);
+               stats->free_blocks = ext4_free_blks_count(sb, desc);
+               stats->used_dirs = ext4_used_dirs_count(sb, desc);
+       } else {
+               stats->free_inodes = 0;
+               stats->free_blocks = 0;
+               stats->used_dirs = 0;
+       }
+}
+
 /*
  * Orlov's allocator for directories.
  *
@@ -425,35 +467,34 @@ out:
  * it has too many directories already (max_dirs) or
  * it has too few free inodes left (min_inodes) or
  * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
  * Parent's group is preferred, if it doesn't satisfy these
  * conditions we search cyclically through the rest. If none
  * of the groups look good we just look for a group with more
  * free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
  */
 
-#define INODE_COST 64
-#define BLOCK_COST 256
-
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-                               ext4_group_t *group)
+                           ext4_group_t *group, int mode)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct ext4_super_block *es = sbi->s_es;
        ext4_group_t ngroups = sbi->s_groups_count;
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei;
        ext4_fsblk_t freeb, avefreeb;
-       ext4_fsblk_t blocks_per_dir;
        unsigned int ndirs;
-       int max_debt, max_dirs, min_inodes;
+       int max_dirs, min_inodes;
        ext4_grpblk_t min_blocks;
-       ext4_group_t i;
+       ext4_group_t i, grp, g;
        struct ext4_group_desc *desc;
+       struct orlov_stats stats;
+       int flex_size = ext4_flex_bg_size(sbi);
+
+       if (flex_size > 1) {
+               ngroups = (ngroups + flex_size - 1) >>
+                       sbi->s_log_groups_per_flex;
+               parent_group >>= sbi->s_log_groups_per_flex;
+       }
 
        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
        avefreei = freei / ngroups;
@@ -462,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        do_div(avefreeb, ngroups);
        ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 
-       if ((parent == sb->s_root->d_inode) ||
-           (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+       if (S_ISDIR(mode) &&
+           ((parent == sb->s_root->d_inode) ||
+            (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
                int best_ndir = inodes_per_group;
-               ext4_group_t grp;
                int ret = -1;
 
                get_random_bytes(&grp, sizeof(grp));
                parent_group = (unsigned)grp % ngroups;
                for (i = 0; i < ngroups; i++) {
-                       grp = (parent_group + i) % ngroups;
-                       desc = ext4_get_group_desc(sb, grp, NULL);
-                       if (!desc || !ext4_free_inodes_count(sb, desc))
+                       g = (parent_group + i) % ngroups;
+                       get_orlov_stats(sb, g, flex_size, &stats);
+                       if (!stats.free_inodes)
                                continue;
-                       if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+                       if (stats.used_dirs >= best_ndir)
                                continue;
-                       if (ext4_free_inodes_count(sb, desc) < avefreei)
+                       if (stats.free_inodes < avefreei)
                                continue;
-                       if (ext4_free_blks_count(sb, desc) < avefreeb)
+                       if (stats.free_blocks < avefreeb)
                                continue;
-                       *group = grp;
+                       grp = g;
                        ret = 0;
-                       best_ndir = ext4_used_dirs_count(sb, desc);
+                       best_ndir = stats.used_dirs;
+               }
+               if (ret)
+                       goto fallback;
+       found_flex_bg:
+               if (flex_size == 1) {
+                       *group = grp;
+                       return 0;
+               }
+
+               /*
+                * We pack inodes at the beginning of the flexgroup's
+                * inode tables.  Block allocation decisions will do
+                * something similar, although regular files will
+                * start at 2nd block group of the flexgroup.  See
+                * ext4_ext_find_goal() and ext4_find_near().
+                */
+               grp *= flex_size;
+               for (i = 0; i < flex_size; i++) {
+                       if (grp+i >= sbi->s_groups_count)
+                               break;
+                       desc = ext4_get_group_desc(sb, grp+i, NULL);
+                       if (desc && ext4_free_inodes_count(sb, desc)) {
+                               *group = grp+i;
+                               return 0;
+                       }
                }
-               if (ret == 0)
-                       return ret;
                goto fallback;
        }
 
-       blocks_per_dir = ext4_blocks_count(es) - freeb;
-       do_div(blocks_per_dir, ndirs);
-
        max_dirs = ndirs / ngroups + inodes_per_group / 16;
-       min_inodes = avefreei - inodes_per_group / 4;
-       min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
-       max_debt = EXT4_BLOCKS_PER_GROUP(sb);
-       max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
-       if (max_debt * INODE_COST > inodes_per_group)
-               max_debt = inodes_per_group / INODE_COST;
-       if (max_debt > 255)
-               max_debt = 255;
-       if (max_debt == 0)
-               max_debt = 1;
+       min_inodes = avefreei - inodes_per_group*flex_size / 4;
+       if (min_inodes < 1)
+               min_inodes = 1;
+       min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+
+       /*
+        * Start looking in the flex group where we last allocated an
+        * inode for this parent directory
+        */
+       if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+               parent_group = EXT4_I(parent)->i_last_alloc_group;
+               if (flex_size > 1)
+                       parent_group >>= sbi->s_log_groups_per_flex;
+       }
 
        for (i = 0; i < ngroups; i++) {
-               *group = (parent_group + i) % ngroups;
-               desc = ext4_get_group_desc(sb, *group, NULL);
-               if (!desc || !ext4_free_inodes_count(sb, desc))
-                       continue;
-               if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+               grp = (parent_group + i) % ngroups;
+               get_orlov_stats(sb, grp, flex_size, &stats);
+               if (stats.used_dirs >= max_dirs)
                        continue;
-               if (ext4_free_inodes_count(sb, desc) < min_inodes)
+               if (stats.free_inodes < min_inodes)
                        continue;
-               if (ext4_free_blks_count(sb, desc) < min_blocks)
+               if (stats.free_blocks < min_blocks)
                        continue;
-               return 0;
+               goto found_flex_bg;
        }
 
 fallback:
+       ngroups = sbi->s_groups_count;
+       avefreei = freei / ngroups;
+       parent_group = EXT4_I(parent)->i_block_group;
        for (i = 0; i < ngroups; i++) {
-               *group = (parent_group + i) % ngroups;
-               desc = ext4_get_group_desc(sb, *group, NULL);
+               grp = (parent_group + i) % ngroups;
+               desc = ext4_get_group_desc(sb, grp, NULL);
                if (desc && ext4_free_inodes_count(sb, desc) &&
-                       ext4_free_inodes_count(sb, desc) >= avefreei)
+                   ext4_free_inodes_count(sb, desc) >= avefreei) {
+                       *group = grp;
                        return 0;
+               }
        }
 
        if (avefreei) {
@@ -542,12 +609,51 @@ fallback:
 }
 
 static int find_group_other(struct super_block *sb, struct inode *parent,
-                               ext4_group_t *group)
+                           ext4_group_t *group, int mode)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        struct ext4_group_desc *desc;
-       ext4_group_t i;
+       ext4_group_t i, last;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+       /*
+        * Try to place the inode is the same flex group as its
+        * parent.  If we can't find space, use the Orlov algorithm to
+        * find another flex group, and store that information in the
+        * parent directory's inode information so that use that flex
+        * group for future allocations.
+        */
+       if (flex_size > 1) {
+               int retry = 0;
+
+       try_again:
+               parent_group &= ~(flex_size-1);
+               last = parent_group + flex_size;
+               if (last > ngroups)
+                       last = ngroups;
+               for  (i = parent_group; i < last; i++) {
+                       desc = ext4_get_group_desc(sb, i, NULL);
+                       if (desc && ext4_free_inodes_count(sb, desc)) {
+                               *group = i;
+                               return 0;
+                       }
+               }
+               if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+                       retry = 1;
+                       parent_group = EXT4_I(parent)->i_last_alloc_group;
+                       goto try_again;
+               }
+               /*
+                * If this didn't work, use the Orlov search algorithm
+                * to find a new flex group; we pass in the mode to
+                * avoid the topdir algorithms.
+                */
+               *group = parent_group + flex_size;
+               if (*group > ngroups)
+                       *group = 0;
+               return find_group_orlov(sb, parent, group, mode);
+       }
 
        /*
         * Try to place the inode in its parent directory
@@ -665,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
        if (S_ISDIR(mode)) {
                count = ext4_used_dirs_count(sb, gdp) + 1;
                ext4_used_dirs_set(sb, gdp, count);
+               if (sbi->s_log_groups_per_flex) {
+                       ext4_group_t f = ext4_flex_group(sbi, group);
+
+                       atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+               }
        }
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
@@ -716,10 +827,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
        sbi = EXT4_SB(sb);
        es = sbi->s_es;
 
-       if (sbi->s_log_groups_per_flex) {
+       if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
                ret2 = find_group_flex(sb, dir, &group);
                if (ret2 == -1) {
-                       ret2 = find_group_other(sb, dir, &group);
+                       ret2 = find_group_other(sb, dir, &group, mode);
                        if (ret2 == 0 && once)
                                once = 0;
                                printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
                if (test_opt(sb, OLDALLOC))
                        ret2 = find_group_dir(sb, dir, &group);
                else
-                       ret2 = find_group_orlov(sb, dir, &group);
+                       ret2 = find_group_orlov(sb, dir, &group, mode);
        } else
-               ret2 = find_group_other(sb, dir, &group);
+               ret2 = find_group_other(sb, dir, &group, mode);
 
 got_group:
+       EXT4_I(dir)->i_last_alloc_group = group;
        err = -ENOSPC;
        if (ret2 == -1)
                goto out;
@@ -858,9 +970,7 @@ got:
 
        if (sbi->s_log_groups_per_flex) {
                flex_group = ext4_flex_group(sbi, group);
-               spin_lock(sb_bgl_lock(sbi, flex_group));
-               sbi->s_flex_groups[flex_group].free_inodes--;
-               spin_unlock(sb_bgl_lock(sbi, flex_group));
+               atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
        }
 
        inode->i_uid = current_fsuid();
@@ -885,19 +995,16 @@ got:
        ei->i_disksize = 0;
 
        /*
-        * Don't inherit extent flag from directory. We set extent flag on
-        * newly created directory and file only if -o extent mount option is
-        * specified
+        * Don't inherit extent flag from directory, amongst others. We set
+        * extent flag on newly created directory and file only if -o extent
+        * mount option is specified
         */
-       ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
-       if (S_ISLNK(mode))
-               ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
-       /* dirsync only applies to directories */
-       if (!S_ISDIR(mode))
-               ei->i_flags &= ~EXT4_DIRSYNC_FL;
+       ei->i_flags =
+               ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
        ei->i_file_acl = 0;
        ei->i_dtime = 0;
        ei->i_block_group = group;
+       ei->i_last_alloc_group = ~0;
 
        ext4_set_inode_flags(inode);
        if (IS_DIRSYNC(inode))
index dd82ff390067482ac31fac5f5a9e3414d45cbcdf..a2e7952bc5f9a6e3f3ed877dde89fb93ef4b4035 100644 (file)
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
        return n;
 }
 
+static int __ext4_check_blockref(const char *function, struct inode *inode,
+                                unsigned int *p, unsigned int max) {
+
+       unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+       unsigned int *bref = p;
+       while (bref < p+max) {
+               if (unlikely(*bref >= maxblocks)) {
+                       ext4_error(inode->i_sb, function,
+                                  "block reference %u >= max (%u) "
+                                  "in inode #%lu, offset=%d",
+                                  *bref, maxblocks,
+                                  inode->i_ino, (int)(bref-p));
+                       return -EIO;
+               }
+               bref++;
+       }
+       return 0;
+}
+
+
+#define ext4_check_indirect_blockref(inode, bh)                         \
+        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+                             EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_check_inode_blockref(inode)                                \
+        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+                             EXT4_NDIR_BLOCKS)
+
 /**
  *     ext4_get_branch - read the chain of indirect blocks leading to data
  *     @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
        if (!p->key)
                goto no_block;
        while (--depth) {
-               bh = sb_bread(sb, le32_to_cpu(p->key));
-               if (!bh)
+               bh = sb_getblk(sb, le32_to_cpu(p->key));
+               if (unlikely(!bh))
                        goto failure;
+                  
+               if (!bh_uptodate_or_lock(bh)) {
+                       if (bh_submit_read(bh) < 0) {
+                               put_bh(bh);
+                               goto failure;
+                       }
+                       /* validate block references */
+                       if (ext4_check_indirect_blockref(inode, bh)) {
+                               put_bh(bh);
+                               goto failure;
+                       }
+               }
+               
                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
+       ext4_group_t block_group;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 
        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
         * It is going to be referred to from the inode itself? OK, just put it
         * into the same cylinder group then.
         */
-       bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+       block_group = ei->i_block_group;
+       if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+               block_group &= ~(flex_size-1);
+               if (S_ISREG(inode->i_mode))
+                       block_group++;
+       }
+       bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
+       /*
+        * If we are doing delayed allocation, we don't need take
+        * colour into account.
+        */
+       if (test_opt(inode->i_sb, DELALLOC))
+               return bg_start;
+
        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
        /*
         * free those over-booking quota for metadata blocks
         */
-
        if (mdb_free)
                vfs_dq_release_reservation_block(inode, mdb_free);
+
+       /*
+        * If we have done all the pending block allocations and if
+        * there aren't any writers on the inode, we can discard the
+        * inode's preallocations.
+        */
+       if (!total && (atomic_read(&inode->i_writecount) == 0))
+               ext4_discard_preallocations(inode);
 }
 
 /*
@@ -1688,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
 
 struct mpage_da_data {
        struct inode *inode;
-       struct buffer_head lbh;                 /* extent of blocks */
+       sector_t b_blocknr;             /* start block number of extent */
+       size_t b_size;                  /* size of extent */
+       unsigned long b_state;          /* state of the extent */
        unsigned long first_page, next_page;    /* extent of pages */
-       get_block_t *get_block;
        struct writeback_control *wbc;
        int io_done;
        int pages_written;
@@ -1704,7 +1768,6 @@ struct mpage_da_data {
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
  * @mpd->next_page: page after the last page of the extent
- * @mpd->get_block: the filesystem's block mapper function
  *
  * By the time mpage_da_submit_io() is called we expect all blocks
  * to be allocated. this may be wrong if allocation failed.
@@ -1724,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
-        * If we look at mpd->lbh.b_blocknr we would only be looking
+        * If we look at mpd->b_blocknr we would only be looking
         * at the currently mapped buffer_heads.
         */
        index = mpd->first_page;
@@ -1914,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
        return;
 }
 
+#define                EXT4_DELALLOC_RSVED     1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+       int ret;
+       unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+       loff_t disksize = EXT4_I(inode)->i_disksize;
+       handle_t *handle = NULL;
+
+       handle = ext4_journal_current_handle();
+       BUG_ON(!handle);
+       ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                  bh_result, create, 0, EXT4_DELALLOC_RSVED);
+       if (ret <= 0)
+               return ret;
+
+       bh_result->b_size = (ret << inode->i_blkbits);
+
+       if (ext4_should_order_data(inode)) {
+               int retval;
+               retval = ext4_jbd2_file_inode(handle, inode);
+               if (retval)
+                       /*
+                        * Failed to add inode for ordered mode. Don't
+                        * update file size
+                        */
+                       return retval;
+       }
+
+       /*
+        * Update on-disk size along with block allocation we don't
+        * use 'extend_disksize' as size may change within already
+        * allocated block -bzzz
+        */
+       disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+       if (disksize > i_size_read(inode))
+               disksize = i_size_read(inode);
+       if (disksize > EXT4_I(inode)->i_disksize) {
+               ext4_update_i_disksize(inode, disksize);
+               ret = ext4_mark_inode_dirty(handle, inode);
+               return ret;
+       }
+       return 0;
+}
+
 /*
  * mpage_da_map_blocks - go through given space
  *
- * @mpd->lbh - bh describing space
- * @mpd->get_block - the filesystem's block mapper function
+ * @mpd - bh describing space
  *
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
        int err = 0;
        struct buffer_head new;
-       struct buffer_head *lbh = &mpd->lbh;
        sector_t next;
 
        /*
         * We consider only non-mapped and non-allocated blocks
         */
-       if (buffer_mapped(lbh) && !buffer_delay(lbh))
+       if ((mpd->b_state  & (1 << BH_Mapped)) &&
+           !(mpd->b_state & (1 << BH_Delay)))
                return 0;
-       new.b_state = lbh->b_state;
+       new.b_state = mpd->b_state;
        new.b_blocknr = 0;
-       new.b_size = lbh->b_size;
-       next = lbh->b_blocknr;
+       new.b_size = mpd->b_size;
+       next = mpd->b_blocknr;
        /*
         * If we didn't accumulate anything
         * to write simply return
         */
        if (!new.b_size)
                return 0;
-       err = mpd->get_block(mpd->inode, next, &new, 1);
-       if (err) {
 
-               /* If get block returns with error
-                * we simply return. Later writepage
-                * will redirty the page and writepages
-                * will find the dirty page again
+       err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+       if (err) {
+               /*
+                * If get block returns with error we simply
+                * return. Later writepage will redirty the page and
+                * writepages will find the dirty page again
                 */
                if (err == -EAGAIN)
                        return 0;
 
                if (err == -ENOSPC &&
-                               ext4_count_free_blocks(mpd->inode->i_sb)) {
+                   ext4_count_free_blocks(mpd->inode->i_sb)) {
                        mpd->retval = err;
                        return 0;
                }
 
                /*
-                * get block failure will cause us
-                * to loop in writepages. Because
-                * a_ops->writepage won't be able to
-                * make progress. The page will be redirtied
-                * by writepage and writepages will again
-                * try to write the same.
+                * get block failure will cause us to loop in
+                * writepages, because a_ops->writepage won't be able
+                * to make progress. The page will be redirtied by
+                * writepage and writepages will again try to write
+                * the same.
                 */
                printk(KERN_EMERG "%s block allocation failed for inode %lu "
                                  "at logical offset %llu with max blocks "
                                  "%zd with error %d\n",
                                  __func__, mpd->inode->i_ino,
                                  (unsigned long long)next,
-                                 lbh->b_size >> mpd->inode->i_blkbits, err);
+                                 mpd->b_size >> mpd->inode->i_blkbits, err);
                printk(KERN_EMERG "This should not happen.!! "
                                        "Data will be lost\n");
                if (err == -ENOSPC) {
@@ -1983,7 +2089,7 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
                }
                /* invlaidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
-                               lbh->b_size >> mpd->inode->i_blkbits);
+                               mpd->b_size >> mpd->inode->i_blkbits);
                return err;
        }
        BUG_ON(new.b_size == 0);
@@ -1995,7 +2101,8 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
         * If blocks are delayed marked, we need to
         * put actual blocknr and drop delayed bit
         */
-       if (buffer_delay(lbh) || buffer_unwritten(lbh))
+       if ((mpd->b_state & (1 << BH_Delay)) ||
+           (mpd->b_state & (1 << BH_Unwritten)))
                mpage_put_bnr_to_bhs(mpd, next, &new);
 
        return 0;
@@ -2014,12 +2121,11 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
  * the function is used to collect contig. blocks in same state
  */
 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-                                  sector_t logical, struct buffer_head *bh)
+                                  sector_t logical, size_t b_size,
+                                  unsigned long b_state)
 {
        sector_t next;
-       size_t b_size = bh->b_size;
-       struct buffer_head *lbh = &mpd->lbh;
-       int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
+       int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 
        /* check if thereserved journal credits might overflow */
        if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2046,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
        /*
         * First block in the extent
         */
-       if (lbh->b_size == 0) {
-               lbh->b_blocknr = logical;
-               lbh->b_size = b_size;
-               lbh->b_state = bh->b_state & BH_FLAGS;
+       if (mpd->b_size == 0) {
+               mpd->b_blocknr = logical;
+               mpd->b_size = b_size;
+               mpd->b_state = b_state & BH_FLAGS;
                return;
        }
 
-       next = lbh->b_blocknr + nrblocks;
+       next = mpd->b_blocknr + nrblocks;
        /*
         * Can we merge the block to our big extent?
         */
-       if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
-               lbh->b_size += b_size;
+       if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
+               mpd->b_size += b_size;
                return;
        }
 
@@ -2087,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
 {
        struct mpage_da_data *mpd = data;
        struct inode *inode = mpd->inode;
-       struct buffer_head *bh, *head, fake;
+       struct buffer_head *bh, *head;
        sector_t logical;
 
        if (mpd->io_done) {
@@ -2129,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
                /*
                 * ... and blocks
                 */
-               mpd->lbh.b_size = 0;
-               mpd->lbh.b_state = 0;
-               mpd->lbh.b_blocknr = 0;
+               mpd->b_size = 0;
+               mpd->b_state = 0;
+               mpd->b_blocknr = 0;
        }
 
        mpd->next_page = page->index + 1;
@@ -2139,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
                  (PAGE_CACHE_SHIFT - inode->i_blkbits);
 
        if (!page_has_buffers(page)) {
-               /*
-                * There is no attached buffer heads yet (mmap?)
-                * we treat the page asfull of dirty blocks
-                */
-               bh = &fake;
-               bh->b_size = PAGE_CACHE_SIZE;
-               bh->b_state = 0;
-               set_buffer_dirty(bh);
-               set_buffer_uptodate(bh);
-               mpage_add_bh_to_extent(mpd, logical, bh);
+               mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
+                                      (1 << BH_Dirty) | (1 << BH_Uptodate));
                if (mpd->io_done)
                        return MPAGE_DA_EXTENT_TAIL;
        } else {
@@ -2166,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
                         * with the page in ext4_da_writepage
                         */
                        if (buffer_dirty(bh) &&
-                               (!buffer_mapped(bh) || buffer_delay(bh))) {
-                               mpage_add_bh_to_extent(mpd, logical, bh);
+                           (!buffer_mapped(bh) || buffer_delay(bh))) {
+                               mpage_add_bh_to_extent(mpd, logical,
+                                                      bh->b_size,
+                                                      bh->b_state);
                                if (mpd->io_done)
                                        return MPAGE_DA_EXTENT_TAIL;
                        } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2179,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
                                 * unmapped buffer_head later we need to
                                 * use the b_state flag of that buffer_head.
                                 */
-                               if (mpd->lbh.b_size == 0)
-                                       mpd->lbh.b_state =
-                                               bh->b_state & BH_FLAGS;
+                               if (mpd->b_size == 0)
+                                       mpd->b_state = bh->b_state & BH_FLAGS;
                        }
                        logical++;
                } while ((bh = bh->b_this_page) != head);
@@ -2190,51 +2289,6 @@ static int __mpage_da_writepage(struct page *page,
        return 0;
 }
 
-/*
- * mpage_da_writepages - walk the list of dirty pages of the given
- * address space, allocates non-allocated blocks, maps newly-allocated
- * blocks to existing bhs and issue IO them
- *
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @get_block: the filesystem's block mapper function.
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- */
-static int mpage_da_writepages(struct address_space *mapping,
-                              struct writeback_control *wbc,
-                              struct mpage_da_data *mpd)
-{
-       int ret;
-
-       if (!mpd->get_block)
-               return generic_writepages(mapping, wbc);
-
-       mpd->lbh.b_size = 0;
-       mpd->lbh.b_state = 0;
-       mpd->lbh.b_blocknr = 0;
-       mpd->first_page = 0;
-       mpd->next_page = 0;
-       mpd->io_done = 0;
-       mpd->pages_written = 0;
-       mpd->retval = 0;
-
-       ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-       /*
-        * Handle last extent of pages
-        */
-       if (!mpd->io_done && mpd->next_page != mpd->first_page) {
-               if (mpage_da_map_blocks(mpd) == 0)
-                       mpage_da_submit_io(mpd);
-
-               mpd->io_done = 1;
-               ret = MPAGE_DA_EXTENT_TAIL;
-       }
-       wbc->nr_to_write -= mpd->pages_written;
-       return ret;
-}
-
 /*
  * this is a special callback for ->write_begin() only
  * it's intention is to return mapped block or reserve space
@@ -2274,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 
        return ret;
 }
-#define                EXT4_DELALLOC_RSVED     1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-                                  struct buffer_head *bh_result, int create)
-{
-       int ret;
-       unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-       loff_t disksize = EXT4_I(inode)->i_disksize;
-       handle_t *handle = NULL;
-
-       handle = ext4_journal_current_handle();
-       BUG_ON(!handle);
-       ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-                       bh_result, create, 0, EXT4_DELALLOC_RSVED);
-       if (ret > 0) {
-
-               bh_result->b_size = (ret << inode->i_blkbits);
-
-               if (ext4_should_order_data(inode)) {
-                       int retval;
-                       retval = ext4_jbd2_file_inode(handle, inode);
-                       if (retval)
-                               /*
-                                * Failed to add inode for ordered
-                                * mode. Don't update file size
-                                */
-                               return retval;
-               }
-
-               /*
-                * Update on-disk size along with block allocation
-                * we don't use 'extend_disksize' as size may change
-                * within already allocated block -bzzz
-                */
-               disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-               if (disksize > i_size_read(inode))
-                       disksize = i_size_read(inode);
-               if (disksize > EXT4_I(inode)->i_disksize) {
-                       ext4_update_i_disksize(inode, disksize);
-                       ret = ext4_mark_inode_dirty(handle, inode);
-                       return ret;
-               }
-               ret = 0;
-       }
-       return ret;
-}
 
 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
 {
@@ -2569,8 +2578,38 @@ retry:
                        dump_stack();
                        goto out_writepages;
                }
-               mpd.get_block = ext4_da_get_block_write;
-               ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+               /*
+                * Now call __mpage_da_writepage to find the next
+                * contiguous region of logical blocks that need
+                * blocks to be allocated by ext4.  We don't actually
+                * submit the blocks for I/O here, even though
+                * write_cache_pages thinks it will, and will set the
+                * pages as clean for write before calling
+                * __mpage_da_writepage().
+                */
+               mpd.b_size = 0;
+               mpd.b_state = 0;
+               mpd.b_blocknr = 0;
+               mpd.first_page = 0;
+               mpd.next_page = 0;
+               mpd.io_done = 0;
+               mpd.pages_written = 0;
+               mpd.retval = 0;
+               ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+                                       &mpd);
+               /*
+                * If we have a contigous extent of pages and we
+                * haven't done the I/O yet, map the blocks and submit
+                * them for I/O.
+                */
+               if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+                       if (mpage_da_map_blocks(&mpd) == 0)
+                               mpage_da_submit_io(&mpd);
+                       mpd.io_done = 1;
+                       ret = MPAGE_DA_EXTENT_TAIL;
+               }
+               wbc->nr_to_write -= mpd.pages_written;
 
                ext4_journal_stop(handle);
 
@@ -2846,6 +2885,48 @@ out:
        return;
 }
 
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+       if (!EXT4_I(inode)->i_reserved_data_blocks &&
+           !EXT4_I(inode)->i_reserved_meta_blocks)
+               return 0;
+
+       /*
+        * We do something simple for now.  The filemap_flush() will
+        * also start triggering a write of the data blocks, which is
+        * not strictly speaking necessary (and for users of
+        * laptop_mode, not even desirable).  However, to do otherwise
+        * would require replicating code paths in:
+        * 
+        * ext4_da_writepages() ->
+        *    write_cache_pages() ---> (via passed in callback function)
+        *        __mpage_da_writepage() -->
+        *           mpage_add_bh_to_extent()
+        *           mpage_da_map_blocks()
+        *
+        * The problem is that write_cache_pages(), located in
+        * mm/page-writeback.c, marks pages clean in preparation for
+        * doing I/O, which is not desirable if we're not planning on
+        * doing I/O at all.
+        *
+        * We could call write_cache_pages(), and then redirty all of
+        * the pages by calling redirty_page_for_writeback() but that
+        * would be ugly in the extreme.  So instead we would need to
+        * replicate parts of the code in the above functions,
+        * simplifying them becuase we wouldn't actually intend to
+        * write out the pages, but rather only collect contiguous
+        * logical block extents, call the multi-block allocator, and
+        * then update the buffer heads with the block allocations.
+        * 
+        * For now, though, we'll cheat by calling filemap_flush(),
+        * which will map the blocks, and start the I/O, but not
+        * actually wait for the I/O to complete.
+        */
+       return filemap_flush(inode->i_mapping);
+}
 
 /*
  * bmap() is special.  It gets used by applications such as lilo and by
@@ -3868,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
        if (!ext4_can_truncate(inode))
                return;
 
+       if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+               ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                ext4_ext_truncate(inode);
                return;
@@ -4110,12 +4194,7 @@ make_io:
                        unsigned num;
 
                        table = ext4_inode_table(sb, gdp);
-                       /* Make sure s_inode_readahead_blks is a power of 2 */
-                       while (EXT4_SB(sb)->s_inode_readahead_blks &
-                              (EXT4_SB(sb)->s_inode_readahead_blks-1))
-                               EXT4_SB(sb)->s_inode_readahead_blks = 
-                                  (EXT4_SB(sb)->s_inode_readahead_blks &
-                                   (EXT4_SB(sb)->s_inode_readahead_blks-1));
+                       /* s_inode_readahead_blks is always a power of 2 */
                        b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
                        if (table > b)
                                b = table;
@@ -4287,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ei->i_disksize = inode->i_size;
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
+       ei->i_last_alloc_group = ~0;
        /*
         * NOTE! The in-memory inode i_data array is in little-endian order
         * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4329,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
        }
 
+       if (ei->i_flags & EXT4_EXTENTS_FL) {
+               /* Validate extent which is part of inode */
+               ret = ext4_ext_check_inode(inode);
+       } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+                  (S_ISLNK(inode->i_mode) &&
+                   !ext4_inode_is_fast_symlink(inode))) {
+               /* Validate block references which are part of inode */
+               ret = ext4_check_inode_blockref(inode);
+       }
+       if (ret) {
+               brelse(bh);
+               goto bad_inode;
+       }
+
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
@@ -4345,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        inode->i_op = &ext4_symlink_inode_operations;
                        ext4_set_aops(inode);
                }
-       } else {
+       } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+             S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                inode->i_op = &ext4_special_inode_operations;
                if (raw_inode->i_block[0])
                        init_special_inode(inode, inode->i_mode,
@@ -4353,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                else
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+       } else {
+               brelse(bh);
+               ret = -EIO;
+               ext4_error(inode->i_sb, __func__, 
+                          "bogus i_mode (%o) for inode=%lu",
+                          inode->i_mode, inode->i_ino);
+               goto bad_inode;
        }
        brelse(iloc.bh);
        ext4_set_inode_flags(inode);
index 42dc83fb247a98f6dd4450314bf2f96d4e36c175..91e75f7a9e732c4ba6cdf909cfd3607f32dbd800 100644 (file)
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                if (err)
                        return err;
 
-               if (!S_ISDIR(inode->i_mode))
-                       flags &= ~EXT4_DIRSYNC_FL;
+               flags = ext4_mask_flags(inode->i_mode, flags);
 
                err = -EPERM;
                mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
                return err;
        }
 
+       case EXT4_IOC_ALLOC_DA_BLKS:
+       {
+               int err;
+               if (!is_owner_or_cap(inode))
+                       return -EACCES;
+
+               err = mnt_want_write(filp->f_path.mnt);
+               if (err)
+                       return err;
+               err = ext4_alloc_da_blocks(inode);
+               mnt_drop_write(filp->f_path.mnt);
+               return err;
+       }
+
        default:
                return -ENOTTY;
        }
index b038188bd039e8f8c2762c13318c078fd353b17a..f871677a798499c4327629cf282d4f75b38d44b3 100644 (file)
  * The allocation request involve request for multiple number of blocks
  * near to the goal(block) value specified.
  *
- * During initialization phase of the allocator we decide to use the group
- * preallocation or inode preallocation depending on the size file. The
- * size of the file could be the resulting file size we would have after
- * allocation or the current file size which ever is larger. If the size is
- * less that sbi->s_mb_stream_request we select the group
- * preallocation. The default value of s_mb_stream_request is 16
- * blocks. This can also be tuned via
- * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
- * of number of blocks.
+ * During initialization phase of the allocator we decide to use the
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, which ever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select to use the group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
  *
  * The main motivation for having small file use group preallocation is to
- * ensure that we have small file closer in the disk.
+ * ensure that we have small files closer together on the disk.
  *
- * First stage the allocator looks at the inode prealloc list
- * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
- * this particular inode. The inode prealloc space is represented as:
+ * First stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains list of prealloc
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
  *
  * pa_lstart -> the logical start block for this prealloc space
  * pa_pstart -> the physical start block for this prealloc space
  * list. In case of inode preallocation we follow a list of heuristics
  * based on file size. This can be found in ext4_mb_normalize_request. If
  * we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
  * 512 blocks. This can be tuned via
- * /proc/fs/ext4/<partition/group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
  * terms of number of blocks. If we have mounted the file system with -O
  * stripe=<value> option the group prealloc request is normalized to the
  * stripe value (sbi->s_stripe)
  *
- * The regular allocator(using the buddy cache) support few tunables.
+ * The regular allocator(using the buddy cache) supports few tunables.
  *
- * /proc/fs/ext4/<partition>/min_to_scan
- * /proc/fs/ext4/<partition>/max_to_scan
- * /proc/fs/ext4/<partition>/order2_req
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
  *
- * The regular allocator use buddy scan only if the request len is power of
+ * The regular allocator uses buddy scan only if the request len is power of
  * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  * value of s_mb_order2_reqs can be tuned via
- * /proc/fs/ext4/<partition>/order2_req.  If the request len is equal to
+ * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
  * stripe size (sbi->s_stripe), we try to search for contigous block in
- * stripe size. This should result in better allocation on RAID setup. If
- * not we search in the specific group using bitmap for best extents. The
- * tunable min_to_scan and max_to_scan controll the behaviour here.
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not, we search in the specific group using bitmap for best extents. The
+ * tunable min_to_scan and max_to_scan control the behaviour here.
  * min_to_scan indicate how long the mballoc __must__ look for a best
- * extent and max_to_scanindicate how long the mballoc __can__ look for a
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
  * best extent in the found extents. Searching for the blocks starts with
  * the group specified as the goal value in allocation context via
  * ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 
 
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 {
        unsigned free, fragments;
        unsigned i, bits;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
        struct ext4_group_desc *desc;
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
                        return 0;
 
+               /* Avoid using the first bg of a flexgroup for data files */
+               if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+                   (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+                   ((group % flex_size) == 0))
+                       return 0;
+
                bits = ac->ac_sb->s_blocksize_bits + 1;
                for (i = ac->ac_2order; i <= bits; i++)
                        if (grp->bb_counters[i] > 0)
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
        /*
         * We search using buddy data only if the order of the request
         * is greater than equal to the sbi_s_mb_order2_reqs
-        * You can tune it via /proc/fs/ext4/<partition>/order2_req
+        * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
         */
        if (i >= sbi->s_mb_order2_reqs) {
                /*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
-               kfree(sbi->s_mb_maxs);
+               kfree(sbi->s_mb_offsets);
                return -ENOMEM;
        }
 
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
                spin_lock_init(&lg->lg_prealloc_lock);
        }
 
-       ext4_mb_init_per_dev_proc(sb);
        ext4_mb_history_init(sb);
 
        if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
 
        free_percpu(sbi->s_locality_groups);
        ext4_mb_history_release(sb);
-       ext4_mb_destroy_per_dev_proc(sb);
 
        return 0;
 }
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        mb_debug("freed %u blocks in %u structures\n", count, count2);
 }
 
-#define EXT4_MB_STATS_NAME             "stats"
-#define EXT4_MB_MAX_TO_SCAN_NAME       "max_to_scan"
-#define EXT4_MB_MIN_TO_SCAN_NAME       "min_to_scan"
-#define EXT4_MB_ORDER2_REQ             "order2_req"
-#define EXT4_MB_STREAM_REQ             "stream_req"
-#define EXT4_MB_GROUP_PREALLOC         "group_prealloc"
-
-static int ext4_mb_init_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
-       mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-       struct proc_dir_entry *proc;
-
-       if (sbi->s_proc == NULL)
-               return -EINVAL;
-
-       EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
-       EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
-       EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
-       EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
-       EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
-       EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
-       return 0;
-
-err_out:
-       remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-       return -ENOMEM;
-#else
-       return 0;
-#endif
-}
-
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-       if (sbi->s_proc == NULL)
-               return -EINVAL;
-
-       remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
-       remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-#endif
-       return 0;
-}
-
 int __init init_ext4_mballoc(void)
 {
        ext4_pspace_cachep =
@@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi,
                                                          ac->ac_b_ex.fe_group);
-               spin_lock(sb_bgl_lock(sbi, flex_group));
-               sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
-               spin_unlock(sb_bgl_lock(sbi, flex_group));
+               atomic_sub(ac->ac_b_ex.fe_len,
+                          &sbi->s_flex_groups[flex_group].free_blocks);
        }
 
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3116,7 +3063,7 @@ out_err:
  * here we normalize request for locality group
  * Group request are normalized to s_strip size if we set the same via mount
  * option. If not we set it to s_mb_group_prealloc which can be configured via
- * /proc/fs/ext4/<partition>/group_prealloc
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
  *
  * XXX: should we try to preallocate more than the group has now?
  */
@@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
        spin_unlock(&pa->pa_lock);
 
        grp_blk = pa->pa_pstart;
-       /* If linear, pa_pstart may be in the next group when pa is used up */
-       if (pa->pa_linear)
+       /* 
+        * If doing group-based preallocation, pa_pstart may be in the
+        * next group when pa is used up
+        */
+       if (pa->pa_type == MB_GROUP_PA)
                grp_blk--;
 
        ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
-       pa->pa_linear = 0;
+       pa->pa_type = MB_INODE_PA;
 
        mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
-       pa->pa_linear = 1;
+       pa->pa_type = MB_GROUP_PA;
 
        mb_debug("new group pa %p: %llu/%u for %u\n", pa,
                 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4021,7 +3971,7 @@ repeat:
                list_del_rcu(&pa->pa_inode_list);
                spin_unlock(pa->pa_obj_lock);
 
-               if (pa->pa_linear)
+               if (pa->pa_type == MB_GROUP_PA)
                        ext4_mb_release_group_pa(&e4b, pa, ac);
                else
                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4121,7 +4071,7 @@ repeat:
        spin_unlock(&ei->i_prealloc_lock);
 
        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
-               BUG_ON(pa->pa_linear != 0);
+               BUG_ON(pa->pa_type != MB_INODE_PA);
                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 
                err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
  * file is determined by the current size or the resulting size after
  * allocation which ever is larger
  *
- * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
  */
 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 {
@@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                        continue;
                }
                /* only lg prealloc space */
-               BUG_ON(!pa->pa_linear);
+               BUG_ON(pa->pa_type != MB_GROUP_PA);
 
                /* seems this one can be freed ... */
                pa->pa_deleted = 1;
@@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
                                                pa_inode_list) {
                spin_lock(&tmp_pa->pa_lock);
                if (tmp_pa->pa_deleted) {
-                       spin_unlock(&pa->pa_lock);
+                       spin_unlock(&tmp_pa->pa_lock);
                        continue;
                }
                if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
        struct ext4_prealloc_space *pa = ac->ac_pa;
        if (pa) {
-               if (pa->pa_linear) {
+               if (pa->pa_type == MB_GROUP_PA) {
                        /* see comment in ext4_mb_use_group_pa() */
                        spin_lock(&pa->pa_lock);
                        pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                 * doesn't grow big.  We need to release
                 * alloc_semp before calling ext4_mb_add_n_trim()
                 */
-               if (pa->pa_linear && likely(pa->pa_free)) {
+               if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
                        spin_lock(pa->pa_obj_lock);
                        list_del_rcu(&pa->pa_inode_list);
                        spin_unlock(pa->pa_obj_lock);
@@ -4936,9 +4886,7 @@ do_more:
 
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-               spin_lock(sb_bgl_lock(sbi, flex_group));
-               sbi->s_flex_groups[flex_group].free_blocks += count;
-               spin_unlock(sb_bgl_lock(sbi, flex_group));
+               atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
        }
 
        ext4_mb_release_desc(&e4b);
index 10a2921baf14c986c809a8746fb4b90d3a7f8fbe..dd9e6cd5f6cf4dc97ae6c97b4c6c5c9d5bee9e4d 100644 (file)
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
        ext4_lblk_t             pa_lstart;      /* log. block */
        unsigned short          pa_len;         /* len of preallocated chunk */
        unsigned short          pa_free;        /* how many blocks are free */
-       unsigned short          pa_linear;      /* consumed in one direction
-                                                * strictly, for grp prealloc */
+       unsigned short          pa_type;        /* pa type. inode or group */
        spinlock_t              *pa_obj_lock;
        struct inode            *pa_inode;      /* hack, for history only */
 };
 
+enum {
+       MB_INODE_PA = 0,
+       MB_GROUP_PA = 1
+};
 
 struct ext4_free_extent {
        ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 
 #define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
 
-struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                        struct ext4_free_extent *fex)
 {
index 83410244d3ee11f483db465373665c9843df91cc..22098e1cd085911a7aeffc975ac7a0773a32bbc0 100644 (file)
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
                                 struct dx_frame *frame,
                                 int *err);
 static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
                       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
-               struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
+               struct dx_map_entry *offsets, int count, unsigned blocksize);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block(struct dx_frame *frame,
                                        u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
 
+unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+       unsigned len = le16_to_cpu(dlen);
+
+       if (len == EXT4_MAX_REC_LEN || len == 0)
+               return blocksize;
+       return (len & 65532) | ((len & 3) << 16);
+}
+  
+__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+       if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+               BUG();
+       if (len < 65536)
+               return cpu_to_le16(len);
+       if (len == blocksize) {
+               if (blocksize == 65536)
+                       return cpu_to_le16(EXT4_MAX_REC_LEN);
+               else 
+                       return cpu_to_le16(0);
+       }
+       return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+}
+
 /*
  * p is at least 6 bytes before the end of page
  */
 static inline struct ext4_dir_entry_2 *
-ext4_next_entry(struct ext4_dir_entry_2 *p)
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
 {
        return (struct ext4_dir_entry_2 *)((char *)p +
-               ext4_rec_len_from_disk(p->rec_len));
+               ext4_rec_len_from_disk(p->rec_len, blocksize));
 }
 
 /*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
                        space += EXT4_DIR_REC_LEN(de->name_len);
                        names++;
                }
-               de = ext4_next_entry(de);
+               de = ext4_next_entry(de, size);
        }
        printk("(%i)\n", names);
        return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
        top = (struct ext4_dir_entry_2 *) ((char *) de +
                                           dir->i_sb->s_blocksize -
                                           EXT4_DIR_REC_LEN(0));
-       for (; de < top; de = ext4_next_entry(de)) {
+       for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
                if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
                                        (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
                                                +((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        }
        if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-               de = ext4_next_entry(de);
+               de = ext4_next_entry(de, dir->i_sb->s_blocksize);
                if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
                        goto errout;
                count++;
@@ -713,15 +737,15 @@ errout:
  * Create map of hash values, offsets, and sizes, stored at end of block.
  * Returns number of entries mapped.
  */
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
-                       struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+                      struct dx_hash_info *hinfo,
+                      struct dx_map_entry *map_tail)
 {
        int count = 0;
        char *base = (char *) de;
        struct dx_hash_info h = *hinfo;
 
-       while ((char *) de < base + size)
-       {
+       while ((char *) de < base + blocksize) {
                if (de->name_len && de->inode) {
                        ext4fs_dirhash(de->name, de->name_len, &h);
                        map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
                        cond_resched();
                }
                /* XXX: do we need to check rec_len == 0 case? -Chris */
-               de = ext4_next_entry(de);
+               de = ext4_next_entry(de, blocksize);
        }
        return count;
 }
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
                        return 1;
                }
                /* prevent looping on a bad block */
-               de_len = ext4_rec_len_from_disk(de->rec_len);
+               de_len = ext4_rec_len_from_disk(de->rec_len,
+                                               dir->i_sb->s_blocksize);
                if (de_len <= 0)
                        return -1;
                offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
                de = (struct ext4_dir_entry_2 *) bh->b_data;
                top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
                                       EXT4_DIR_REC_LEN(0));
-               for (; de < top; de = ext4_next_entry(de)) {
+               for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
                        int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
                                  + ((char *) de - bh->b_data);
 
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                        return ERR_PTR(-EIO);
                }
                inode = ext4_iget(dir->i_sb, ino);
-               if (IS_ERR(inode))
-                       return ERR_CAST(inode);
+               if (unlikely(IS_ERR(inode))) {
+                       if (PTR_ERR(inode) == -ESTALE) {
+                               ext4_error(dir->i_sb, __func__,
+                                               "deleted inode referenced: %u",
+                                               ino);
+                               return ERR_PTR(-EIO);
+                       } else {
+                               return ERR_CAST(inode);
+                       }
+               }
        }
        return d_splice_alias(inode, dentry);
 }
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
  * Returns pointer to last entry moved.
  */
 static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+               unsigned blocksize)
 {
        unsigned rec_len = 0;
 
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
                rec_len = EXT4_DIR_REC_LEN(de->name_len);
                memcpy (to, de, rec_len);
                ((struct ext4_dir_entry_2 *) to)->rec_len =
-                               ext4_rec_len_to_disk(rec_len);
+                               ext4_rec_len_to_disk(rec_len, blocksize);
                de->inode = 0;
                map++;
                to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
  * Compact each dir entry in the range to the minimal rec_len.
  * Returns pointer to last entry in range.
  */
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
 {
        struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
        unsigned rec_len = 0;
 
        prev = to = de;
-       while ((char*)de < base + size) {
-               next = ext4_next_entry(de);
+       while ((char*)de < base + blocksize) {
+               next = ext4_next_entry(de, blocksize);
                if (de->inode && de->name_len) {
                        rec_len = EXT4_DIR_REC_LEN(de->name_len);
                        if (de > to)
                                memmove(to, de, rec_len);
-                       to->rec_len = ext4_rec_len_to_disk(rec_len);
+                       to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
                        prev = to;
                        to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
                }
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                                        hash2, split, count-split));
 
        /* Fancy dance to stay within two buffers */
-       de2 = dx_move_dirents(data1, data2, map + split, count - split);
+       de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
        de = dx_pack_dirents(data1, blocksize);
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
-       de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
+       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+                                          blocksize);
+       de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+                                           blocksize);
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
 
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        const char      *name = dentry->d_name.name;
        int             namelen = dentry->d_name.len;
        unsigned int    offset = 0;
+       unsigned int    blocksize = dir->i_sb->s_blocksize;
        unsigned short  reclen;
        int             nlen, rlen, err;
        char            *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        reclen = EXT4_DIR_REC_LEN(namelen);
        if (!de) {
                de = (struct ext4_dir_entry_2 *)bh->b_data;
-               top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+               top = bh->b_data + blocksize - reclen;
                while ((char *) de <= top) {
                        if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
                                                  bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                                return -EEXIST;
                        }
                        nlen = EXT4_DIR_REC_LEN(de->name_len);
-                       rlen = ext4_rec_len_from_disk(de->rec_len);
+                       rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
                        if ((de->inode? rlen - nlen: rlen) >= reclen)
                                break;
                        de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 
        /* By now the buffer is marked for journaling */
        nlen = EXT4_DIR_REC_LEN(de->name_len);
-       rlen = ext4_rec_len_from_disk(de->rec_len);
+       rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
        if (de->inode) {
                struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
-               de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
-               de->rec_len = ext4_rec_len_to_disk(nlen);
+               de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
+               de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
                de = de1;
        }
        de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        /* The 0th block becomes the root, move the dirents out */
        fde = &root->dotdot;
        de = (struct ext4_dir_entry_2 *)((char *)fde +
-               ext4_rec_len_from_disk(fde->rec_len));
+               ext4_rec_len_from_disk(fde->rec_len, blocksize));
        if ((char *) de >= (((char *) root) + blocksize)) {
                ext4_error(dir->i_sb, __func__,
                           "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        memcpy (data1, de, len);
        de = (struct ext4_dir_entry_2 *) data1;
        top = data1 + len;
-       while ((char *)(de2 = ext4_next_entry(de)) < top)
+       while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
                de = de2;
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+                                          blocksize);
        /* Initialize the root; the dot dirents already exist */
        de = (struct ext4_dir_entry_2 *) (&root->dotdot);
-       de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
+       de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+                                          blocksize);
        memset (&root->info, 0, sizeof(root->info));
        root->info.info_length = sizeof(root->info);
        root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                return retval;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
-       de->rec_len = ext4_rec_len_to_disk(blocksize);
+       de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
        return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        goto cleanup;
                node2 = (struct dx_node *)(bh2->b_data);
                entries2 = node2->entries;
-               node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
+               node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+                                                          sb->s_blocksize);
                node2->fake.inode = 0;
                BUFFER_TRACE(frame->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
                             struct buffer_head *bh)
 {
        struct ext4_dir_entry_2 *de, *pde;
+       unsigned int blocksize = dir->i_sb->s_blocksize;
        int i;
 
        i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
                        ext4_journal_get_write_access(handle, bh);
                        if (pde)
                                pde->rec_len = ext4_rec_len_to_disk(
-                                       ext4_rec_len_from_disk(pde->rec_len) +
-                                       ext4_rec_len_from_disk(de->rec_len));
+                                       ext4_rec_len_from_disk(pde->rec_len,
+                                                              blocksize) +
+                                       ext4_rec_len_from_disk(de->rec_len,
+                                                              blocksize),
+                                       blocksize);
                        else
                                de->inode = 0;
                        dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
                        ext4_handle_dirty_metadata(handle, dir, bh);
                        return 0;
                }
-               i += ext4_rec_len_from_disk(de->rec_len);
+               i += ext4_rec_len_from_disk(de->rec_len, blocksize);
                pde = de;
-               de = ext4_next_entry(de);
+               de = ext4_next_entry(de, blocksize);
        }
        return -ENOENT;
 }
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct inode *inode;
        struct buffer_head *dir_block;
        struct ext4_dir_entry_2 *de;
+       unsigned int blocksize = dir->i_sb->s_blocksize;
        int err, retries = 0;
 
        if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
        de = (struct ext4_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
-       de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
+       de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+                                          blocksize);
        strcpy(de->name, ".");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-       de = ext4_next_entry(de);
+       de = ext4_next_entry(de, blocksize);
        de->inode = cpu_to_le32(dir->i_ino);
-       de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
-                                               EXT4_DIR_REC_LEN(1));
+       de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+                                          blocksize);
        de->name_len = 2;
        strcpy(de->name, "..");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
                return 1;
        }
        de = (struct ext4_dir_entry_2 *) bh->b_data;
-       de1 = ext4_next_entry(de);
+       de1 = ext4_next_entry(de, sb->s_blocksize);
        if (le32_to_cpu(de->inode) != inode->i_ino ||
                        !le32_to_cpu(de1->inode) ||
                        strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
                brelse(bh);
                return 1;
        }
-       offset = ext4_rec_len_from_disk(de->rec_len) +
-                ext4_rec_len_from_disk(de1->rec_len);
-       de = ext4_next_entry(de1);
+       offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+                ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+       de = ext4_next_entry(de1, sb->s_blocksize);
        while (offset < inode->i_size) {
                if (!bh ||
                        (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
                        brelse(bh);
                        return 0;
                }
-               offset += ext4_rec_len_from_disk(de->rec_len);
-               de = ext4_next_entry(de);
+               offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+               de = ext4_next_entry(de, sb->s_blocksize);
        }
        brelse(bh);
        return 1;
@@ -2297,8 +2343,8 @@ retry:
        return err;
 }
 
-#define PARENT_INO(buffer) \
-       (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
+#define PARENT_INO(buffer, size) \
+       (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
 
 /*
  * Anybody can rename anything with this: the permission checks are left to the
@@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct inode *old_inode, *new_inode;
        struct buffer_head *old_bh, *new_bh, *dir_bh;
        struct ext4_dir_entry_2 *old_de, *new_de;
-       int retval;
+       int retval, force_da_alloc = 0;
 
        old_bh = new_bh = dir_bh = NULL;
 
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
                if (!dir_bh)
                        goto end_rename;
-               if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+               if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+                               old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
                        goto end_rename;
                retval = -EMLINK;
                if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (dir_bh) {
                BUFFER_TRACE(dir_bh, "get_write_access");
                ext4_journal_get_write_access(handle, dir_bh);
-               PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+               PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+                                               cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
                ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
                ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                ext4_mark_inode_dirty(handle, new_inode);
                if (!new_inode->i_nlink)
                        ext4_orphan_add(handle, new_inode);
+               if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+                       force_da_alloc = 1;
        }
        retval = 0;
 
@@ -2457,6 +2507,8 @@ end_rename:
        brelse(old_bh);
        brelse(new_bh);
        ext4_journal_stop(handle);
+       if (retval == 0 && force_da_alloc)
+               ext4_alloc_da_blocks(old_inode);
        return retval;
 }
 
index c06886abd6588b3c975ee92dd1eb76a88be29e3a..546c7dd869e19176e77cb54278e9bdfc2b520da4 100644 (file)
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
                ext4_group_t flex_group;
                flex_group = ext4_flex_group(sbi, input->group);
-               sbi->s_flex_groups[flex_group].free_blocks +=
-                       input->free_blocks_count;
-               sbi->s_flex_groups[flex_group].free_inodes +=
-                       EXT4_INODES_PER_GROUP(sb);
+               atomic_add(input->free_blocks_count,
+                          &sbi->s_flex_groups[flex_group].free_blocks);
+               atomic_add(EXT4_INODES_PER_GROUP(sb),
+                          &sbi->s_flex_groups[flex_group].free_inodes);
        }
 
        ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
index f7371a6a923d359480108517c1a3358276a42eb7..9987bba99db3cc62c2faa887b95783676f83b480 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/ctype.h>
 #include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
@@ -48,6 +49,7 @@
 #include "group.h"
 
 struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
                ext4_commit_super(sb, es, 1);
        }
        if (sbi->s_proc) {
-               remove_proc_entry("inode_readahead_blks", sbi->s_proc);
                remove_proc_entry(sb->s_id, ext4_proc_root);
        }
+       kobject_del(&sbi->s_kobj);
 
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
                ext4_blkdev_remove(sbi);
        }
        sb->s_fs_info = NULL;
+       /*
+        * Now that we are completely done shutting down the
+        * superblock, we need to actually destroy the kobject.
+        */
+       unlock_kernel();
+       unlock_super(sb);
+       kobject_put(&sbi->s_kobj);
+       wait_for_completion(&sbi->s_kobj_unregister);
+       lock_super(sb);
+       lock_kernel();
+       kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return;
 }
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
                seq_puts(seq, ",noacl");
 #endif
-       if (!test_opt(sb, RESERVATION))
-               seq_puts(seq, ",noreservation");
        if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                seq_printf(seq, ",commit=%u",
                           (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (test_opt(sb, DATA_ERR_ABORT))
                seq_puts(seq, ",data_err=abort");
 
+       if (test_opt(sb, NO_AUTO_DA_ALLOC))
+               seq_puts(seq, ",noauto_da_alloc");
+
        ext4_show_quota_options(seq, sb);
        return 0;
 }
@@ -1004,7 +1018,7 @@ enum {
        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
        Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-       Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+       Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
        Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
        Opt_journal_update, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
@@ -1012,8 +1026,8 @@ enum {
        Opt_data_err_abort, Opt_data_err_ignore,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-       Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-       Opt_grpquota, Opt_i_version,
+       Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
+       Opt_usrquota, Opt_grpquota, Opt_i_version,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
        Opt_inode_readahead_blks, Opt_journal_ioprio
 };
@@ -1039,8 +1053,6 @@ static const match_table_t tokens = {
        {Opt_nouser_xattr, "nouser_xattr"},
        {Opt_acl, "acl"},
        {Opt_noacl, "noacl"},
-       {Opt_reservation, "reservation"},
-       {Opt_noreservation, "noreservation"},
        {Opt_noload, "noload"},
        {Opt_nobh, "nobh"},
        {Opt_bh, "bh"},
@@ -1068,6 +1080,8 @@ static const match_table_t tokens = {
        {Opt_quota, "quota"},
        {Opt_usrquota, "usrquota"},
        {Opt_barrier, "barrier=%u"},
+       {Opt_barrier, "barrier"},
+       {Opt_nobarrier, "nobarrier"},
        {Opt_i_version, "i_version"},
        {Opt_stripe, "stripe=%u"},
        {Opt_resize, "resize"},
@@ -1075,6 +1089,9 @@ static const match_table_t tokens = {
        {Opt_nodelalloc, "nodelalloc"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
        {Opt_journal_ioprio, "journal_ioprio=%u"},
+       {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+       {Opt_auto_da_alloc, "auto_da_alloc"},
+       {Opt_noauto_da_alloc, "noauto_da_alloc"},
        {Opt_err, NULL},
 };
 
@@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
                               "not supported\n");
                        break;
 #endif
-               case Opt_reservation:
-                       set_opt(sbi->s_mount_opt, RESERVATION);
-                       break;
-               case Opt_noreservation:
-                       clear_opt(sbi->s_mount_opt, RESERVATION);
-                       break;
                case Opt_journal_update:
                        /* @@@ FIXME */
                        /* Eventually we will want to be able to create
@@ -1415,9 +1426,14 @@ set_qf_format:
                case Opt_abort:
                        set_opt(sbi->s_mount_opt, ABORT);
                        break;
+               case Opt_nobarrier:
+                       clear_opt(sbi->s_mount_opt, BARRIER);
+                       break;
                case Opt_barrier:
-                       if (match_int(&args[0], &option))
-                               return 0;
+                       if (match_int(&args[0], &option)) {
+                               set_opt(sbi->s_mount_opt, BARRIER);
+                               break;
+                       }
                        if (option)
                                set_opt(sbi->s_mount_opt, BARRIER);
                        else
@@ -1463,6 +1479,11 @@ set_qf_format:
                                return 0;
                        if (option < 0 || option > (1 << 30))
                                return 0;
+                       if (option & (option - 1)) {
+                               printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
+                                      " must be a power of 2\n");
+                               return 0;
+                       }
                        sbi->s_inode_readahead_blks = option;
                        break;
                case Opt_journal_ioprio:
@@ -1473,6 +1494,19 @@ set_qf_format:
                        *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
                                                            option);
                        break;
+               case Opt_noauto_da_alloc:
+                       set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                       break;
+               case Opt_auto_da_alloc:
+                       if (match_int(&args[0], &option)) {
+                               clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                               break;
+                       }
+                       if (option)
+                               clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                       else
+                               set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                       break;
                default:
                        printk(KERN_ERR
                               "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, &bh);
 
                flex_group = ext4_flex_group(sbi, i);
-               sbi->s_flex_groups[flex_group].free_inodes +=
-                       ext4_free_inodes_count(sb, gdp);
-               sbi->s_flex_groups[flex_group].free_blocks +=
-                       ext4_free_blks_count(sb, gdp);
+               atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
+                          ext4_free_inodes_count(sb, gdp));
+               atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
+                          ext4_free_blks_count(sb, gdp));
+               atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
+                          ext4_used_dirs_count(sb, gdp));
        }
 
        return 1;
@@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
        return 0;
 }
 
+/* sysfs supprt */
+
+struct ext4_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+       ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+                        const char *, size_t);
+       int offset;
+};
+
+static int parse_strtoul(const char *buf,
+               unsigned long max, unsigned long *value)
+{
+       char *endp;
+
+       while (*buf && isspace(*buf))
+               buf++;
+       *value = simple_strtoul(buf, &endp, 0);
+       while (*endp && isspace(*endp))
+               endp++;
+       if (*endp || *value > max)
+               return -EINVAL;
+
+       return 0;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+                                             struct ext4_sb_info *sbi,
+                                             char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+                       (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+                                        struct ext4_sb_info *sbi, char *buf)
+{
+       struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+       return snprintf(buf, PAGE_SIZE, "%lu\n",
+                       (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                        sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+                                         struct ext4_sb_info *sbi, char *buf)
+{
+       struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+                       sbi->s_kbytes_written + 
+                       ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                         EXT4_SB(sb)->s_sectors_written_start) >> 1));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+                                         struct ext4_sb_info *sbi,
+                                         const char *buf, size_t count)
+{
+       unsigned long t;
+
+       if (parse_strtoul(buf, 0x40000000, &t))
+               return -EINVAL;
+
+       /* inode_readahead_blks must be a power of 2 */
+       if (t & (t-1))
+               return -EINVAL;
+
+       sbi->s_inode_readahead_blks = t;
+       return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+                               struct ext4_sb_info *sbi, char *buf)
+{
+       unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+
+       return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+                           struct ext4_sb_info *sbi,
+                           const char *buf, size_t count)
+{
+       unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+       unsigned long t;
+
+       if (parse_strtoul(buf, 0xffffffff, &t))
+               return -EINVAL;
+       *ui = t;
+       return count;
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = {                  \
+       .attr = {.name = __stringify(_name), .mode = _mode },   \
+       .show   = _show,                                        \
+       .store  = _store,                                       \
+       .offset = offsetof(struct ext4_sb_info, _elname),       \
+}
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+#define EXT4_RW_ATTR_SBI_UI(name, elname)      \
+       EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+                inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+
+static struct attribute *ext4_attrs[] = {
+       ATTR_LIST(delayed_allocation_blocks),
+       ATTR_LIST(session_write_kbytes),
+       ATTR_LIST(lifetime_write_kbytes),
+       ATTR_LIST(inode_readahead_blks),
+       ATTR_LIST(mb_stats),
+       ATTR_LIST(mb_max_to_scan),
+       ATTR_LIST(mb_min_to_scan),
+       ATTR_LIST(mb_order2_req),
+       ATTR_LIST(mb_stream_req),
+       ATTR_LIST(mb_group_prealloc),
+       NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+                             struct attribute *attr, char *buf)
+{
+       struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+                                               s_kobj);
+       struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+       return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+                              struct attribute *attr,
+                              const char *buf, size_t len)
+{
+       struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+                                               s_kobj);
+       struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+       return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+       struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+                                               s_kobj);
+       complete(&sbi->s_kobj_unregister);
+}
+
+
+static struct sysfs_ops ext4_attr_ops = {
+       .show   = ext4_attr_show,
+       .store  = ext4_attr_store,
+};
+
+static struct kobj_type ext4_ktype = {
+       .default_attrs  = ext4_attrs,
+       .sysfs_ops      = &ext4_attr_ops,
+       .release        = ext4_sb_release,
+};
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
@@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
+
+       sbi->s_blockgroup_lock =
+               kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+       if (!sbi->s_blockgroup_lock) {
+               kfree(sbi);
+               return -ENOMEM;
+       }
        sb->s_fs_info = sbi;
        sbi->s_mount_opt = 0;
        sbi->s_resuid = EXT4_DEF_RESUID;
        sbi->s_resgid = EXT4_DEF_RESGID;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sb_block = sb_block;
+       sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
+                                                     sectors[1]);
 
        unlock_kernel();
 
@@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_magic = le16_to_cpu(es->s_magic);
        if (sb->s_magic != EXT4_SUPER_MAGIC)
                goto cantfind_ext4;
+       sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
 
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
-       set_opt(sbi->s_mount_opt, RESERVATION);
        set_opt(sbi->s_mount_opt, BARRIER);
 
        /*
@@ -2325,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_PROC_FS
        if (ext4_proc_root)
                sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
-       if (sbi->s_proc)
-               proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
-                                &ext4_ui_proc_fops,
-                                &sbi->s_inode_readahead_blks);
 #endif
 
-       bgl_lock_init(&sbi->s_blockgroup_lock);
+       bgl_lock_init(sbi->s_blockgroup_lock);
 
        for (i = 0; i < db_count; i++) {
                block = descriptor_loc(sb, logical_sb_block, i);
@@ -2564,6 +2779,16 @@ no_journal:
                goto failed_mount4;
        }
 
+       sbi->s_kobj.kset = ext4_kset;
+       init_completion(&sbi->s_kobj_unregister);
+       err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+                                  "%s", sb->s_id);
+       if (err) {
+               ext4_mb_release(sb);
+               ext4_ext_release(sb);
+               goto failed_mount4;
+       };
+
        /*
         * akpm: core read_super() calls in here with the superblock locked.
         * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2618,7 +2843,6 @@ failed_mount2:
        kfree(sbi->s_group_desc);
 failed_mount:
        if (sbi->s_proc) {
-               remove_proc_entry("inode_readahead_blks", sbi->s_proc);
                remove_proc_entry(sb->s_id, ext4_proc_root);
        }
 #ifdef CONFIG_QUOTA
@@ -2913,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb,
                set_buffer_uptodate(sbh);
        }
        es->s_wtime = cpu_to_le32(get_seconds());
+       es->s_kbytes_written =
+               cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+                           ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                             EXT4_SB(sb)->s_sectors_written_start) >> 1));
        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeblocks_counter));
        es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3647,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
 }
 
-#ifdef CONFIG_PROC_FS
-static int ext4_ui_proc_show(struct seq_file *m, void *v)
-{
-       unsigned int *p = m->private;
-
-       seq_printf(m, "%u\n", *p);
-       return 0;
-}
-
-static int ext4_ui_proc_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
-}
-
-static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
-                              size_t cnt, loff_t *ppos)
-{
-       unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
-       char str[32];
-
-       if (cnt >= sizeof(str))
-               return -EINVAL;
-       if (copy_from_user(str, buf, cnt))
-               return -EFAULT;
-
-       *p = simple_strtoul(str, NULL, 0);
-       return cnt;
-}
-
-const struct file_operations ext4_ui_proc_fops = {
-       .owner          = THIS_MODULE,
-       .open           = ext4_ui_proc_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-       .write          = ext4_ui_proc_write,
-};
-#endif
-
 static struct file_system_type ext4_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext4",
@@ -3719,6 +3908,9 @@ static int __init init_ext4_fs(void)
 {
        int err;
 
+       ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+       if (!ext4_kset)
+               return -ENOMEM;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
        err = init_ext4_mballoc();
        if (err)
@@ -3760,6 +3952,7 @@ static void __exit exit_ext4_fs(void)
        exit_ext4_xattr();
        exit_ext4_mballoc();
        remove_proc_entry("fs/ext4", NULL);
+       kset_unregister(ext4_kset);
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
index 62804e57a44caf2f893d16cc481f7ac347efe19a..4ea72377c7a28797b609c77084b03be21485800d 100644 (file)
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
+       int write_op = WRITE;
 
        /*
         * First job: lock down the current transaction and wait for
@@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;
 
+       if (commit_transaction->t_synchronous_commit)
+               write_op = WRITE_SYNC;
        stats.u.run.rs_wait = commit_transaction->t_max_wait;
        stats.u.run.rs_locked = jiffies;
        stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +683,7 @@ start_journal_io:
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
-                               submit_bh(WRITE, bh);
+                               submit_bh(write_op, bh);
                        }
                        cond_resched();
                        stats.u.run.rs_blocks_logged += bufs;
index 257ff26257655f4573669a5ab6cb7ee817e0371b..bbe6d592d8b3723a17e2ef7be46098e3bdc10ebf 100644 (file)
  *                     need do nothing.
  * RevokeValid set, Revoked set:
  *                     buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table.  Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment noone else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
  */
 
 #ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
  * the second time we would still have a pending revoke to cancel.  So,
  * do not trust the Revoked bit on buffers unless RevokeValid is also
  * set.
- *
- * The caller must have the journal locked.
  */
 int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 {
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
 /*
  * Write revoke records to the journal for all entries in the current
  * revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
  */
-
 void jbd2_journal_write_revoke_records(journal_t *journal,
                                  transaction_t *transaction)
 {
index 28ce21d8598e11f16182a973d31d7e8bef83c145..996ffda06bf306fde630e833f91332b745107689 100644 (file)
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
                }
        }
 
+       if (handle->h_sync)
+               transaction->t_synchronous_commit = 1;
        current->journal_info = NULL;
        spin_lock(&journal->j_state_lock);
        spin_lock(&transaction->t_handle_lock);
index 4d248b3f1323d828170a9de92d2aba1db7c5f0f4..8815a3456b3bf9a78915cabed347779667aefba3 100644 (file)
@@ -648,6 +648,12 @@ struct transaction_s
         */
        int t_handle_count;
 
+       /*
+        * This transaction is being forced and some process is
+        * waiting for it to finish.
+        */
+       int t_synchronous_commit:1;
+
        /*
         * For use by the filesystem to store fs-specific data
         * structures associated with the transaction