ext4: Add multi block allocator for ext4
author Alex Tomas <alex@clusterfs.com>
Tue, 29 Jan 2008 05:19:52 +0000 (00:19 -0500)
committer Theodore Ts'o <tytso@mit.edu>
Tue, 29 Jan 2008 05:19:52 +0000 (00:19 -0500)
Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
13 files changed:
Documentation/filesystems/ext4.txt
Documentation/filesystems/proc.txt
fs/ext4/Makefile
fs/ext4/balloc.c
fs/ext4/extents.c
fs/ext4/inode.c
fs/ext4/mballoc.c [new file with mode: 0644]
fs/ext4/migrate.c
fs/ext4/super.c
fs/ext4/xattr.c
include/linux/ext4_fs.h
include/linux/ext4_fs_i.h
include/linux/ext4_fs_sb.h

index 4f329afe20ec33a13d1c4d08a76763e3d9263ab8..560f88dc7090dadcfd7c2db4fc6821c64b8da3fe 100644 (file)
@@ -86,9 +86,11 @@ Alex is working on a new set of patches right now.
 When mounting an ext4 filesystem, the following options are accepted:
 (*) == default
 
-extents                        ext4 will use extents to address file data.  The
+extents                (*)     ext4 will use extents to address file data.  The
                        file system will no longer be mountable by ext3.
 
+noextents              ext4 will not use extents for newly created files
+
 journal_checksum       Enable checksumming of the journal transactions.
                        This will allow the recovery code in e2fsck and the
                        kernel to detect corruption in the kernel.  It is a
@@ -206,6 +208,12 @@ nobh                       (a) cache disk block mapping information
                        "nobh" option tries to avoid associating buffer
                        heads (supported only for "writeback" mode).
 
                        "nobh" option tries to avoid associating buffer
                        heads (supported only for "writeback" mode).
 
+mballoc                (*)     Use the multiple block allocator for block allocation.
+nomballoc              Disable the multiple block allocator for block allocation.
+stripe=n               Number of filesystem blocks that mballoc will try
+                       to use for allocation size and alignment. For RAID5/6
+                       systems this should be the number of data
+                       disks * RAID chunk size in file system blocks. For
+                       example, 4 data disks with 64 KiB chunks on a
+                       4 KiB-block filesystem give stripe=64.
 
 Data Mode
 ---------
 
index dec99455321fdf86018ca5b2b62936103fd73031..4413a2d4646fc4fd5305cb60e06a57bd5243bd16 100644 (file)
@@ -857,6 +857,45 @@ CPUs.
 The   "procs_blocked" line gives  the  number of  processes currently blocked,
 waiting for I/O to complete.
 
+1.9 Ext4 file system parameters
+------------------------------
+The ext4 file system has one directory per partition under /proc/fs/ext4/
+# ls /proc/fs/ext4/hdc/
+group_prealloc  max_to_scan  mb_groups  mb_history  min_to_scan  order2_req
+stats  stream_req
+
+mb_groups:
+This file gives the details of the multiblock allocator buddy cache of free blocks.
+
+mb_history:
+Multiblock allocation history.
+
+stats:
+This file indicates whether the multiblock allocator should start collecting
+statistics. The statistics are shown during unmount.
+
+group_prealloc:
+The multiblock allocator normalizes the block allocation request to
+group_prealloc filesystem blocks if we don't have the stripe value set.
+The stripe value can be specified at mount time or during mke2fs.
+
+max_to_scan:
+How long the multiblock allocator can look for the best extent (in found extents)
+
+min_to_scan:
+How long the multiblock allocator must look for the best extent
+
+order2_req:
+The multiblock allocator uses a 2^N (buddy) search only for requests greater
+than or equal to order2_req. The request size is specified in file system
+blocks. A value of 2 indicates that the buddy search is used only for
+requests of 4 or more blocks.
+
+stream_req:
+Files smaller than stream_req are served by the stream allocator, whose
+purpose is to pack requests as close to each other as possible to
+produce smooth I/O traffic. A value of 16 indicates that files smaller than
+16 filesystem blocks will use group-based preallocation.
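As a minimal userspace sketch (illustrative only, not part of the patch) of how these tunables can be read, assuming the example device hdc from the listing above:

#include <stdio.h>

/* read one mballoc tunable; the path assumes the "hdc" device shown above */
int main(void)
{
        char buf[64];
        FILE *f = fopen("/proc/fs/ext4/hdc/stream_req", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("stream_req = %s", buf);
        fclose(f);
        return 0;
}

Writing a new value to the same file (with an ordinary write) tunes the allocator at runtime.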
 
 ------------------------------------------------------------------------------
 Summary
 
index d5fd80bc0d04d9b6496b649c1e37a9ee7f140f09..ac6fa8ca0a2f1b7b3a99f03accce58f6f666e209 100644 (file)
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
 
 ext4dev-y      := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
                   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-                  ext4_jbd2.o migrate.o
+                  ext4_jbd2.o migrate.o mballoc.o
 
 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)     += xattr.o xattr_user.o xattr_trusted.o
 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
index 80a4616c82448bee3b0c68120fb6cde9bfc230b4..ac75ea953d83fbfe3d3919783f85c963d905c264 100644 (file)
@@ -577,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode)
        struct ext4_reserve_window_node *rsv;
        spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
 
+       ext4_mb_discard_inode_preallocations(inode);
+
        if (!block_i)
                return;
 
@@ -785,19 +787,29 @@ error_return:
  * @inode:             inode
  * @block:             start physical block to free
  * @count:             number of blocks to count
+ * @metadata:          Are these metadata blocks
  */
 void ext4_free_blocks(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t block, unsigned long count)
+                       ext4_fsblk_t block, unsigned long count,
+                       int metadata)
 {
        struct super_block * sb;
        unsigned long dquot_freed_blocks;
 
+       /* this isn't the right place to decide whether block is metadata
+        * inode.c/extents.c knows better, but for safety ... */
+       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
+                       ext4_should_journal_data(inode))
+               metadata = 1;
+
        sb = inode->i_sb;
-       if (!sb) {
-               printk ("ext4_free_blocks: nonexistent device");
-               return;
-       }
-       ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+
+       if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
+               ext4_free_blocks_sb(handle, sb, block, count,
+                                               &dquot_freed_blocks);
+       else
+               ext4_mb_free_blocks(handle, inode, block, count,
+                                               metadata, &dquot_freed_blocks);
        if (dquot_freed_blocks)
                DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
        return;
@@ -1576,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 }
 
 /**
- * ext4_new_blocks() -- core block(s) allocation function
+ * ext4_new_blocks_old() -- core block(s) allocation function
  * @handle:            handle to this transaction
  * @inode:             file inode
  * @goal:              given target block(filesystem wide)
@@ -1589,7 +1601,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * any specific goal block.
  *
  */
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
        struct buffer_head *bitmap_bh = NULL;
@@ -1849,13 +1861,46 @@ out:
 }
 
 ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t goal, int *errp)
+               ext4_fsblk_t goal, int *errp)
+{
+       struct ext4_allocation_request ar;
+       ext4_fsblk_t ret;
+
+       if (!test_opt(inode->i_sb, MBALLOC)) {
+               unsigned long count = 1;
+               ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
+               return ret;
+       }
+
+       memset(&ar, 0, sizeof(ar));
+       ar.inode = inode;
+       ar.goal = goal;
+       ar.len = 1;
+       ret = ext4_mb_new_blocks(handle, &ar, errp);
+       return ret;
+}
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+               ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
-       unsigned long count = 1;
+       struct ext4_allocation_request ar;
+       ext4_fsblk_t ret;
 
-       return ext4_new_blocks(handle, inode, goal, &count, errp);
+       if (!test_opt(inode->i_sb, MBALLOC)) {
+               ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
+               return ret;
+       }
+
+       memset(&ar, 0, sizeof(ar));
+       ar.inode = inode;
+       ar.goal = goal;
+       ar.len = *count;
+       ret = ext4_mb_new_blocks(handle, &ar, errp);
+       *count = ar.len;
+       return ret;
 }
 
+
 /**
  * ext4_count_free_blocks() -- count filesystem free blocks
  * @sb:                superblock
index f5cf2a94b6fc3d678c71ab3feec80fed8f9a3dad..0cffb59fff46853f52e0f21684507a7dafb5d6fc 100644 (file)
@@ -853,7 +853,7 @@ cleanup:
                for (i = 0; i < depth; i++) {
                        if (!ablocks[i])
                                continue;
-                       ext4_free_blocks(handle, inode, ablocks[i], 1);
+                       ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
                }
        }
        kfree(ablocks);
@@ -1698,7 +1698,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
        ext_debug("index is empty, remove it, free block %llu\n", leaf);
        bh = sb_find_get_block(inode->i_sb, leaf);
        ext4_forget(handle, 1, inode, bh, leaf);
-       ext4_free_blocks(handle, inode, leaf, 1);
+       ext4_free_blocks(handle, inode, leaf, 1, 1);
        return err;
 }
 
@@ -1759,8 +1759,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 {
        struct buffer_head *bh;
        unsigned short ee_len =  ext4_ext_get_actual_len(ex);
-       int i;
+       int i, metadata = 0;
 
+       if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+               metadata = 1;
 #ifdef EXTENTS_STATS
        {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -1789,7 +1791,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                        bh = sb_find_get_block(inode->i_sb, start + i);
                        ext4_forget(handle, 0, inode, bh, start + i);
                }
-               ext4_free_blocks(handle, inode, start, num);
+               ext4_free_blocks(handle, inode, start, num, metadata);
        } else if (from == le32_to_cpu(ex->ee_block)
                   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
                printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2287,6 +2289,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        ext4_fsblk_t goal, newblock;
        int err = 0, depth, ret;
        unsigned long allocated = 0;
+       struct ext4_allocation_request ar;
 
        __clear_bit(BH_New, &bh_result->b_state);
        ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2397,8 +2400,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
                ext4_init_block_alloc_info(inode);
 
-       /* allocate new block */
-       goal = ext4_ext_find_goal(inode, path, iblock);
+       /* find neighbour allocated blocks */
+       ar.lleft = iblock;
+       err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
+       if (err)
+               goto out2;
+       ar.lright = iblock;
+       err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
+       if (err)
+               goto out2;
 
        /*
         * See if request is beyond maximum number of blocks we can have in
@@ -2421,7 +2431,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                allocated = le16_to_cpu(newex.ee_len);
        else
                allocated = max_blocks;
-       newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
+
+       /* allocate new block */
+       ar.inode = inode;
+       ar.goal = ext4_ext_find_goal(inode, path, iblock);
+       ar.logical = iblock;
+       ar.len = allocated;
+       if (S_ISREG(inode->i_mode))
+               ar.flags = EXT4_MB_HINT_DATA;
+       else
+               /* disable in-core preallocation for non-regular files */
+               ar.flags = 0;
+       newblock = ext4_mb_new_blocks(handle, &ar, &err);
        if (!newblock)
                goto out2;
        ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
@@ -2429,14 +2450,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
        /* try to insert new extent into found leaf and return */
        ext4_ext_store_pblock(&newex, newblock);
-       newex.ee_len = cpu_to_le16(allocated);
+       newex.ee_len = cpu_to_le16(ar.len);
        if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
                ext4_ext_mark_uninitialized(&newex);
        err = ext4_ext_insert_extent(handle, inode, path, &newex);
        if (err) {
                /* free data blocks we just allocated */
+               /* not a good idea to call discard here directly,
+                * but otherwise we'd need to call it every free() */
+               ext4_mb_discard_inode_preallocations(inode);
                ext4_free_blocks(handle, inode, ext_pblock(&newex),
-                                       le16_to_cpu(newex.ee_len));
+                                       le16_to_cpu(newex.ee_len), 0);
                goto out2;
        }
 
@@ -2445,6 +2469,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 
        /* previous routine could use block we allocated */
        newblock = ext_pblock(&newex);
+       allocated = le16_to_cpu(newex.ee_len);
 outnew:
        __set_bit(BH_New, &bh_result->b_state);
 
@@ -2496,6 +2521,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_ext_invalidate_cache(inode);
 
+       ext4_mb_discard_inode_preallocations(inode);
+
        /*
         * TODO: optimization is possible here.
         * Probably we need not scan at all,
index a06a3b7cfc34f663269e7d4d06698975c6f17a2a..bb717cbb749c822265bccdd8159de7af456ad745 100644 (file)
@@ -551,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
        return ret;
 failed_out:
        for (i = 0; i <index; i++)
-               ext4_free_blocks(handle, inode, new_blocks[i], 1);
+               ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
        return ret;
 }
 
@@ -650,9 +650,9 @@ failed:
                ext4_journal_forget(handle, branch[i].bh);
        }
        for (i = 0; i <indirect_blks; i++)
-               ext4_free_blocks(handle, inode, new_blocks[i], 1);
+               ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 
-       ext4_free_blocks(handle, inode, new_blocks[i], num);
+       ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
 
        return err;
 }
@@ -749,9 +749,10 @@ err_out:
        for (i = 1; i <= num; i++) {
                BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
                ext4_journal_forget(handle, where[i].bh);
-               ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
+               ext4_free_blocks(handle, inode,
+                                       le32_to_cpu(where[i-1].key), 1, 0);
        }
-       ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
+       ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
 
        return err;
 }
@@ -2052,7 +2053,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
                }
        }
 
-       ext4_free_blocks(handle, inode, block_to_free, count);
+       ext4_free_blocks(handle, inode, block_to_free, count, 0);
 }
 
 /**
@@ -2225,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                                ext4_journal_test_restart(handle, inode);
                        }
 
-                       ext4_free_blocks(handle, inode, nr, 1);
+                       ext4_free_blocks(handle, inode, nr, 1, 1);
 
                        if (parent_bh) {
                                /*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
new file mode 100644 (file)
index 0000000..76e5fed
--- /dev/null
@@ -0,0 +1,4552 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA
+ */
+
+
+/*
+ * mballoc.c contains the multiblock allocation routines
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+#include "group.h"
+
+/*
+ * MUSTDO:
+ *   - test ext4_ext_search_left() and ext4_ext_search_right()
+ *   - search for metadata in few groups
+ *
+ * TODO v4:
+ *   - normalization should take into account whether file is still open
+ *   - discard preallocations if no free space left (policy?)
+ *   - don't normalize tails
+ *   - quota
+ *   - reservation for superuser
+ *
+ * TODO v3:
+ *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ *   - track min/max extents in each group for better group selection
+ *   - mb_mark_used() may allocate chunk right after splitting buddy
+ *   - tree of groups sorted by number of free blocks
+ *   - error handling
+ */
+
+/*
+ * An allocation request involves a request for multiple blocks near
+ * the specified goal (block) value.
+ *
+ * During the initialization phase of the allocator we decide to use group
+ * preallocation or inode preallocation depending on the file size. The
+ * size of the file could be the resulting file size we would have after
+ * allocation, or the current file size, whichever is larger. If the size is
+ * less than sbi->s_mb_stream_request we select group
+ * preallocation. The default value of s_mb_stream_request is 16
+ * blocks. This can also be tuned via
+ * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
+ * of number of blocks.
+ *
+ * The main motivation for having small files use group preallocation is to
+ * ensure that we keep small files closer together on the disk.
+ *
+ * In the first stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains a list of prealloc
+ * spaces for this particular inode. The inode prealloc space is represented as:
+ *
+ * pa_lstart -> the logical start block for this prealloc space
+ * pa_pstart -> the physical start block for this prealloc space
+ * pa_len    -> length of this prealloc space
+ * pa_free   ->  free space available in this prealloc space
+ *
+ * The inode preallocation space is used based on the _logical_ start
+ * block. Only if the logical file block falls within the range of the
+ * prealloc space do we consume that prealloc space. This makes sure
+ * that we have contiguous physical blocks representing the file blocks.
+ *
+ * The important thing to note about the inode prealloc space is that
+ * we don't modify the values associated with it except pa_free.
+ *
+ * If we are not able to find blocks in the inode prealloc space, and if we
+ * have the group allocation flag set, then we look at the locality group
+ * prealloc space. These are per-CPU prealloc lists, represented as
+ *
+ * ext4_sb_info.s_locality_groups[smp_processor_id()]
+ *
+ * The reason for having a per-CPU locality group is to reduce the contention
+ * between CPUs. It is possible to get scheduled at this point.
+ *
+ * The locality group prealloc space is used based on whether we have
+ * enough free space (pa_free) within the prealloc space.
+ *
+ * If we can't allocate blocks via inode prealloc and/or locality group
+ * prealloc then we look at the buddy cache. The buddy cache is represented
+ * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
+ * mapped to the buddy and bitmap information regarding different
+ * groups. The buddy information is attached to the buddy cache inode so that
+ * we can access it through the page cache. The information regarding
+ * each group is loaded via ext4_mb_load_buddy. It involves the
+ * block bitmap and the buddy information, stored in the
+ * inode as:
+ *
+ *  {                        page                        }
+ *  [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information.  So for each group we
+ * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
+ * blocksize) blocks.  So it can hold information for groups_per_page groups,
+ * which is blocks_per_page/2.
+ *
+ * The buddy cache inode is not stored on disk. The inode is thrown
+ * away when the filesystem is unmounted.
+ *
+ * We look for count number of blocks in the buddy cache. If we were able
+ * to locate that many free blocks we return with additional information
+ * regarding the rest of the contiguous physical blocks available.
+ *
+ * Before allocating blocks via the buddy cache we normalize the request
+ * blocks. This ensures we ask for more blocks than we need. The extra
+ * blocks that we get after allocation are added to the respective prealloc
+ * list. In case of inode preallocation we follow a list of heuristics
+ * based on file size. This can be found in ext4_mb_normalize_request. If
+ * we are doing a group prealloc we try to normalize the request to
+ * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
+ * 512 blocks. This can be tuned via
+ * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in
+ * terms of number of blocks. If we have mounted the file system with the
+ * -o stripe=<value> option the group prealloc request is normalized to the
+ * stripe value (sbi->s_stripe).
+ *
+ * The regular allocator (using the buddy cache) supports a few tunables.
+ *
+ * /proc/fs/ext4/<partition>/min_to_scan
+ * /proc/fs/ext4/<partition>/max_to_scan
+ * /proc/fs/ext4/<partition>/order2_req
+ *
+ * The regular allocator uses the buddy scan only if the request length is a
+ * power of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs.
+ * The value of s_mb_order2_reqs can be tuned via
+ * /proc/fs/ext4/<partition>/order2_req.  If the request length is equal to
+ * the stripe size (sbi->s_stripe), we search for contiguous blocks in
+ * stripe-size chunks. This should result in better allocation on RAID setups.
+ * If not, we search in the specific group using the bitmap for best extents.
+ * The tunables min_to_scan and max_to_scan control the behaviour here.
+ * min_to_scan indicates how long mballoc __must__ look for the best
+ * extent and max_to_scan indicates how long mballoc __can__ look for the
+ * best extent in the found extents. Searching for the blocks starts with
+ * the group specified as the goal value in the allocation context via
+ * ac_g_ex. Each group is first checked based on the criteria of whether it
+ * can be used for allocation. ext4_mb_good_group explains how the groups are
+ * checked.
+ *
+ * Both prealloc spaces are populated as described above. So for the first
+ * request we will hit the buddy cache, which will result in the prealloc
+ * space getting filled. The prealloc space is then used for
+ * subsequent requests.
+ */
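The size-based policy described above can be summarized in a short standalone sketch (a simplification in userspace C, not the kernel code; the constants mirror the documented defaults):

#include <stdio.h>

#define MB_STREAM_REQ          16    /* default s_mb_stream_request, in blocks */
#define MB_GROUP_PREALLOC     512    /* default s_mb_group_prealloc, in blocks */

/* pick the preallocation type for a request, as described above:
 * size is max(current file size, file size after this allocation) */
static const char *pick_prealloc(unsigned long size_in_blocks,
                                 unsigned long *normalized_len)
{
        if (size_in_blocks < MB_STREAM_REQ) {
                /* group preallocation: normalized to a fixed request size */
                *normalized_len = MB_GROUP_PREALLOC;
                return "locality group preallocation";
        }
        /* inode preallocation: normalization is heuristic, by file size */
        *normalized_len = size_in_blocks;
        return "inode preallocation";
}

int main(void)
{
        unsigned long len;

        printf("8-block file:    %s\n", pick_prealloc(8, &len));
        printf("1024-block file: %s\n", pick_prealloc(1024, &len));
        return 0;
}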
+
+/*
+ * mballoc operates on the following data:
+ *  - on-disk bitmap
+ *  - in-core buddy (actually includes buddy and bitmap)
+ *  - preallocation descriptors (PAs)
+ *
+ * there are two types of preallocations:
+ *  - inode
+ *    assigned to a specific inode and can be used for this inode only.
+ *    it describes part of the inode's space preallocated to specific
+ *    physical blocks. any block from that preallocation can be used
+ *    independently. the descriptor just tracks the number of blocks left
+ *    unused. so, before taking some block from the descriptor, one must
+ *    make sure the corresponding logical block isn't allocated yet. this
+ *    also means that freeing any block within the descriptor's range
+ *    must discard all preallocated blocks.
+ *  - locality group
+ *    assigned to a specific locality group, which does not translate to
+ *    a permanent set of inodes: an inode can join and leave the group. space
+ *    from this type of preallocation can be used for any inode. thus
+ *    it's consumed from the beginning to the end.
+ *
+ * relation between them can be expressed as:
+ *    in-core buddy = on-disk bitmap + preallocation descriptors
+ *
+ * this means the blocks mballoc considers used are:
+ *  - allocated blocks (persistent)
+ *  - preallocated blocks (non-persistent)
+ *
+ * consistency in mballoc world means that at any time a block is either
+ * free or used in ALL structures. notice: "any time" should not be read
+ * literally -- time is discrete and delimited by locks.
+ *
+ *  to keep it simple, we don't use block numbers, instead we count number of
+ *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
+ *
+ * all operations can be expressed as:
+ *  - init buddy:                      buddy = on-disk + PAs
+ *  - new PA:                          buddy += N; PA = N
+ *  - use inode PA:                    on-disk += N; PA -= N
+ *  - discard inode PA                 buddy -= on-disk - PA; PA = 0
+ *  - use locality group PA            on-disk += N; PA -= N
+ *  - discard locality group PA                buddy -= PA; PA = 0
+ *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
+ *        is used in real operation because we can't know actual used
+ *        bits from PA, only from on-disk bitmap
+ *
+ * if we follow this strict logic, then all operations above should be atomic.
+ * given some of them can block, we'd have to use something like semaphores
+ * killing performance on high-end SMP hardware. let's try to relax it using
+ * the following knowledge:
+ *  1) if buddy is referenced, it's already initialized
+ *  2) while block is used in buddy and the buddy is referenced,
+ *     nobody can re-allocate that block
+ *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
+ *     a bit set and a PA claims the same block, it's OK. IOW, one can set a
+ *     bit in the on-disk bitmap if the buddy has the same bit set and/or a
+ *     PA covers the corresponding block
+ *
+ * so, now we're building a concurrency table:
+ *  - init buddy vs.
+ *    - new PA
+ *      blocks for PA are allocated in the buddy, buddy must be referenced
+ *      until PA is linked to allocation group to avoid concurrent buddy init
+ *    - use inode PA
+ *      we need to make sure that either on-disk bitmap or PA has uptodate data
+ *      given (3) we care that PA-=N operation doesn't interfere with init
+ *    - discard inode PA
+ *      the simplest way would be to have buddy initialized by the discard
+ *    - use locality group PA
+ *      again PA-=N must be serialized with init
+ *    - discard locality group PA
+ *      the simplest way would be to have buddy initialized by the discard
+ *  - new PA vs.
+ *    - use inode PA
+ *      i_data_sem serializes them
+ *    - discard inode PA
+ *      discard process must wait until PA isn't used by another process
+ *    - use locality group PA
+ *      some mutex should serialize them
+ *    - discard locality group PA
+ *      discard process must wait until PA isn't used by another process
+ *  - use inode PA
+ *    - use inode PA
+ *      i_data_sem or another mutex should serializes them
+ *    - discard inode PA
+ *      discard process must wait until PA isn't used by another process
+ *    - use locality group PA
+ *      nothing wrong here -- they're different PAs covering different blocks
+ *    - discard locality group PA
+ *      discard process must wait until PA isn't used by another process
+ *
+ * now we're ready to draw a few consequences:
+ *  - while a PA is referenced, no discard is possible
+ *  - a PA is referenced until its blocks are marked in the on-disk bitmap
+ *  - a PA changes only after the on-disk bitmap
+ *  - discard must not compete with init. either init is done before
+ *    any discard or they're serialized somehow
+ *  - buddy init as sum of on-disk bitmap and PAs is done atomically
+ *
+ * a special case is when we've used a PA down to emptiness. no need to modify
+ * the buddy in this case, but we should care about concurrent init
+ *
+ */
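The accounting identity above ("in-core buddy = on-disk bitmap + preallocation descriptors") can be checked with a toy single-threaded model (counts of used blocks only; all locking deliberately omitted):

#include <assert.h>
#include <stdio.h>

/* blocks marked used in each structure, as counts */
static unsigned ondisk, pa, buddy;

static void init_buddy(void)   { buddy = ondisk + pa; }           /* init */
static void new_pa(unsigned n) { buddy += n; pa += n; }           /* new PA */
static void use_pa(unsigned n) { assert(n <= pa); ondisk += n; pa -= n; }
static void discard_pa(void)   { buddy -= pa; pa = 0; }           /* drop unused */
static void check(void)        { assert(buddy == ondisk + pa); }

int main(void)
{
        ondisk = 100;           /* some blocks already allocated on disk */
        init_buddy();  check();
        new_pa(8);     check(); /* preallocate 8 blocks in the buddy */
        use_pa(5);     check(); /* consume 5 of them to the on-disk bitmap */
        discard_pa();  check(); /* return the remaining 3 to the buddy */
        printf("ondisk=%u pa=%u buddy=%u\n", ondisk, pa, buddy);
        return 0;
}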
+
+ /*
+ * Logic in a few words:
+ *
+ *  - allocation:
+ *    load group
+ *    find blocks
+ *    mark bits in on-disk bitmap
+ *    release group
+ *
+ *  - use preallocation:
+ *    find proper PA (per-inode or group)
+ *    load group
+ *    mark bits in on-disk bitmap
+ *    release group
+ *    release PA
+ *
+ *  - free:
+ *    load group
+ *    mark bits in on-disk bitmap
+ *    release group
+ *
+ *  - discard preallocations in group:
+ *    mark PAs deleted
+ *    move them onto local list
+ *    load on-disk bitmap
+ *    load group
+ *    remove PA from object (inode or locality group)
+ *    mark free blocks in-core
+ *
+ *  - discard inode's preallocations:
+ */
+
+/*
+ * Locking rules
+ *
+ * Locks:
+ *  - bitlock on a group       (group)
+ *  - object (inode/locality)  (object)
+ *  - per-pa lock              (pa)
+ *
+ * Paths:
+ *  - new pa
+ *    object
+ *    group
+ *
+ *  - find and use pa:
+ *    pa
+ *
+ *  - release consumed pa:
+ *    pa
+ *    group
+ *    object
+ *
+ *  - generate in-core bitmap:
+ *    group
+ *        pa
+ *
+ *  - discard all for given object (inode, locality group):
+ *    object
+ *        pa
+ *    group
+ *
+ *  - discard all for given group:
+ *    group
+ *        pa
+ *    group
+ *        object
+ *
+ */
+
+/*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ */
+#define DOUBLE_CHECK__
+
+/*
+ * with MB_DEBUG defined mballoc prints debugging messages via mb_debug()
+ */
+#define MB_DEBUG__
+#ifdef MB_DEBUG
+#define mb_debug(fmt, a...)    printk(fmt, ##a)
+#else
+#define mb_debug(fmt, a...)
+#endif
+
+/*
+ * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
+ * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
+ */
+#define EXT4_MB_HISTORY
+#define EXT4_MB_HISTORY_ALLOC          1       /* allocation */
+#define EXT4_MB_HISTORY_PREALLOC       2       /* preallocated blocks used */
+#define EXT4_MB_HISTORY_DISCARD                4       /* preallocation discarded */
+#define EXT4_MB_HISTORY_FREE           8       /* free */
+
+#define EXT4_MB_HISTORY_DEFAULT                (EXT4_MB_HISTORY_ALLOC | \
+                                        EXT4_MB_HISTORY_PREALLOC)
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define MB_DEFAULT_MAX_TO_SCAN         200
+
+/*
+ * How long mballoc must look for a best extent
+ */
+#define MB_DEFAULT_MIN_TO_SCAN         10
+
+/*
+ * How many groups mballoc will scan looking for the best chunk
+ */
+#define MB_DEFAULT_MAX_GROUPS_TO_SCAN  5
+
+/*
+ * with 'ext4_mb_stats' allocator will collect stats that will be
+ * shown at umount. The collecting costs though!
+ */
+#define MB_DEFAULT_STATS               1
+
+/*
+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
+ * by the stream allocator, whose purpose is to pack requests
+ * as close to each other as possible to produce smooth I/O traffic.
+ * We use locality group prealloc space for stream requests.
+ * We can tune the same via /proc/fs/ext4/<partition>/stream_req
+ */
+#define MB_DEFAULT_STREAM_THRESHOLD    16      /* 64K */
+
+/*
+ * for which requests use 2^N search using buddies
+ */
+#define MB_DEFAULT_ORDER2_REQS         2
+
+/*
+ * default group prealloc size 512 blocks
+ */
+#define MB_DEFAULT_GROUP_PREALLOC      512
+
+static struct kmem_cache *ext4_pspace_cachep;
+
+#ifdef EXT4_BB_MAX_BLOCKS
+#undef EXT4_BB_MAX_BLOCKS
+#endif
+#define EXT4_BB_MAX_BLOCKS     30
+
+struct ext4_free_metadata {
+       ext4_group_t group;
+       unsigned short num;
+       ext4_grpblk_t  blocks[EXT4_BB_MAX_BLOCKS];
+       struct list_head list;
+};
+
+struct ext4_group_info {
+       unsigned long   bb_state;
+       unsigned long   bb_tid;
+       struct ext4_free_metadata *bb_md_cur;
+       unsigned short  bb_first_free;
+       unsigned short  bb_free;
+       unsigned short  bb_fragments;
+       struct          list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+       void            *bb_bitmap;
+#endif
+       unsigned short  bb_counters[];
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT  0
+#define EXT4_GROUP_INFO_LOCKED_BIT     1
+
+#define EXT4_MB_GRP_NEED_INIT(grp)     \
+       (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
+
+struct ext4_prealloc_space {
+       struct list_head        pa_inode_list;
+       struct list_head        pa_group_list;
+       union {
+               struct list_head pa_tmp_list;
+               struct rcu_head pa_rcu;
+       } u;
+       spinlock_t              pa_lock;
+       atomic_t                pa_count;
+       unsigned                pa_deleted;
+       ext4_fsblk_t            pa_pstart;      /* phys. block */
+       ext4_lblk_t             pa_lstart;      /* log. block */
+       unsigned short          pa_len;         /* len of preallocated chunk */
+       unsigned short          pa_free;        /* how many blocks are free */
+       unsigned short          pa_linear;      /* consumed in one direction
+                                                * strictly, for grp prealloc */
+       spinlock_t              *pa_obj_lock;
+       struct inode            *pa_inode;      /* hack, for history only */
+};
+
+
+struct ext4_free_extent {
+       ext4_lblk_t fe_logical;
+       ext4_grpblk_t fe_start;
+       ext4_group_t fe_group;
+       int fe_len;
+};
+
+/*
+ * Locality group:
+ *   we try to group all related changes together
+ *   so that writeback can flush/allocate them together as well
+ */
+struct ext4_locality_group {
+       /* for allocator */
+       struct mutex            lg_mutex;       /* to serialize allocates */
+       struct list_head        lg_prealloc_list;/* list of preallocations */
+       spinlock_t              lg_prealloc_lock;
+};
+
+struct ext4_allocation_context {
+       struct inode *ac_inode;
+       struct super_block *ac_sb;
+
+       /* original request */
+       struct ext4_free_extent ac_o_ex;
+
+       /* goal request (after normalization) */
+       struct ext4_free_extent ac_g_ex;
+
+       /* the best found extent */
+       struct ext4_free_extent ac_b_ex;
+
+       /* copy of the best found extent taken before preallocation efforts */
+       struct ext4_free_extent ac_f_ex;
+
+       /* number of iterations done. we have to track to limit searching */
+       unsigned long ac_ex_scanned;
+       __u16 ac_groups_scanned;
+       __u16 ac_found;
+       __u16 ac_tail;
+       __u16 ac_buddy;
+       __u16 ac_flags;         /* allocation hints */
+       __u8 ac_status;
+       __u8 ac_criteria;
+       __u8 ac_repeats;
+       __u8 ac_2order;         /* if request is to allocate 2^N blocks and
+                                * N > 0, the field stores N, otherwise 0 */
+       __u8 ac_op;             /* operation, for history only */
+       struct page *ac_bitmap_page;
+       struct page *ac_buddy_page;
+       struct ext4_prealloc_space *ac_pa;
+       struct ext4_locality_group *ac_lg;
+};
+
+#define AC_STATUS_CONTINUE     1
+#define AC_STATUS_FOUND                2
+#define AC_STATUS_BREAK                3
+
+struct ext4_mb_history {
+       struct ext4_free_extent orig;   /* orig allocation */
+       struct ext4_free_extent goal;   /* goal allocation */
+       struct ext4_free_extent result; /* result allocation */
+       unsigned pid;
+       unsigned ino;
+       __u16 found;    /* how many extents have been found */
+       __u16 groups;   /* how many groups have been scanned */
+       __u16 tail;     /* what tail broke some buddy */
+       __u16 buddy;    /* buddy the tail ^^^ broke */
+       __u16 flags;
+       __u8 cr:3;      /* which phase the result extent was found at */
+       __u8 op:4;
+       __u8 merged:1;
+};
+
+struct ext4_buddy {
+       struct page *bd_buddy_page;
+       void *bd_buddy;
+       struct page *bd_bitmap_page;
+       void *bd_bitmap;
+       struct ext4_group_info *bd_info;
+       struct super_block *bd_sb;
+       __u16 bd_blkbits;
+       ext4_group_t bd_group;
+};
+#define EXT4_MB_BITMAP(e4b)    ((e4b)->bd_bitmap)
+#define EXT4_MB_BUDDY(e4b)     ((e4b)->bd_buddy)
+
+#ifndef EXT4_MB_HISTORY
+static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+       return;
+}
+#else
+static void ext4_mb_store_history(struct ext4_allocation_context *ac);
+#endif
+
+#define in_range(b, first, len)        ((b) >= (first) && (b) <= (first) + (len) - 1)
+
+static struct proc_dir_entry *proc_root_ext4;
+struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
+ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+                       ext4_fsblk_t goal, unsigned long *count, int *errp);
+
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                       ext4_group_t group);
+static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
+static void ext4_mb_free_committed_blocks(struct super_block *);
+static void ext4_mb_return_to_preallocation(struct inode *inode,
+                                       struct ext4_buddy *e4b, sector_t block,
+                                       int count);
+static void ext4_mb_put_pa(struct ext4_allocation_context *,
+                       struct super_block *, struct ext4_prealloc_space *pa);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+       bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+                                       ext4_group_t group)
+{
+       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+       bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline int ext4_is_group_locked(struct super_block *sb,
+                                       ext4_group_t group)
+{
+       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+       return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
+                                               &(grinfo->bb_state));
+}
+
+static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+                                       struct ext4_free_extent *fex)
+{
+       ext4_fsblk_t block;
+
+       block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
+                       + fex->fe_start
+                       + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+       return block;
+}
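The mapping above is plain arithmetic; as a worked example in userspace C (illustrative numbers: 4 KiB blocks, so EXT4_BLOCKS_PER_GROUP is 32768, and s_first_data_block is 0):

#include <stdio.h>

/* model of ext4_grp_offs_to_block:
 * absolute block = group * blocks_per_group + offset + first_data_block */
int main(void)
{
        unsigned long long blocks_per_group = 32768; /* 4 KiB blocksize */
        unsigned long long first_data_block = 0;     /* 1 on a 1 KiB-block fs */
        unsigned long long group = 3, start = 100;

        printf("absolute block = %llu\n",
               group * blocks_per_group + start + first_data_block);
        return 0;
}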
+
+#if BITS_PER_LONG == 64
+#define mb_correct_addr_and_bit(bit, addr)             \
+{                                                      \
+       bit += ((unsigned long) addr & 7UL) << 3;       \
+       addr = (void *) ((unsigned long) addr & ~7UL);  \
+}
+#elif BITS_PER_LONG == 32
+#define mb_correct_addr_and_bit(bit, addr)             \
+{                                                      \
+       bit += ((unsigned long) addr & 3UL) << 3;       \
+       addr = (void *) ((unsigned long) addr & ~3UL);  \
+}
+#else
+#error "how many bits you are?!"
+#endif
+
+static inline int mb_test_bit(int bit, void *addr)
+{
+       /*
+        * ext4_test_bit on architectures like powerpc
+        * needs an unsigned long aligned address
+        */
+       mb_correct_addr_and_bit(bit, addr);
+       return ext4_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+       mb_correct_addr_and_bit(bit, addr);
+       ext4_set_bit(bit, addr);
+}
+
+static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
+{
+       mb_correct_addr_and_bit(bit, addr);
+       ext4_set_bit_atomic(lock, bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+       mb_correct_addr_and_bit(bit, addr);
+       ext4_clear_bit(bit, addr);
+}
+
+static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
+{
+       mb_correct_addr_and_bit(bit, addr);
+       ext4_clear_bit_atomic(lock, bit, addr);
+}
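What mb_correct_addr_and_bit does can be seen in a small userspace model of the 64-bit case (uintptr_t stands in for the kernel's unsigned long casts): folding the misaligned low address bytes into the bit index leaves the (address, bit) pair denoting the same bit.

#include <stdio.h>
#include <stdint.h>

/* 64-bit variant: move the low 3 address bits into the bit index,
 * so the bit operation always sees an unsigned-long-aligned address */
static void correct_addr_and_bit(int *bit, void **addr)
{
        *bit += ((uintptr_t)*addr & 7UL) << 3;
        *addr = (void *)((uintptr_t)*addr & ~7UL);
}

int main(void)
{
        unsigned long word = 0;
        void *addr = (char *)&word + 3;  /* misaligned by 3 bytes */
        int bit = 2;

        correct_addr_and_bit(&bit, &addr);
        /* 3 bytes of misalignment become 3 * 8 extra bits: 2 + 24 = 26 */
        printf("aligned=%d bit=%d\n", addr == (void *)&word, bit);
        return 0;
}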
+
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+{
+       char *bb;
+
+       /* FIXME!! is this needed */
+       BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+       BUG_ON(max == NULL);
+
+       if (order > e4b->bd_blkbits + 1) {
+               *max = 0;
+               return NULL;
+       }
+
+       /* at order 0 we see each particular block */
+       *max = 1 << (e4b->bd_blkbits + 3);
+       if (order == 0)
+               return EXT4_MB_BITMAP(e4b);
+
+       bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+       *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
+
+       return bb;
+}
+
+#ifdef DOUBLE_CHECK
+static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
+                          int first, int count)
+{
+       int i;
+       struct super_block *sb = e4b->bd_sb;
+
+       if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+               return;
+       BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+       for (i = 0; i < count; i++) {
+               if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
+                       ext4_fsblk_t blocknr;
+                       blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+                       blocknr += first + i;
+                       blocknr +=
+                           le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+
+                       ext4_error(sb, __FUNCTION__, "double-free of inode"
+                                  " %lu's block %llu(bit %u in group %lu)\n",
+                                  inode ? inode->i_ino : 0, blocknr,
+                                  first + i, e4b->bd_group);
+               }
+               mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
+       }
+}
+
+static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
+{
+       int i;
+
+       if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+               return;
+       BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+       for (i = 0; i < count; i++) {
+               BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
+               mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
+       }
+}
+
+static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+       if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
+               unsigned char *b1, *b2;
+               int i;
+               b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
+               b2 = (unsigned char *) bitmap;
+               for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
+                       if (b1[i] != b2[i]) {
+                               printk("corruption in group %lu at byte %u(%u):"
+                                      " %x in copy != %x on disk/prealloc\n",
+                                       e4b->bd_group, i, i * 8, b1[i], b2[i]);
+                               BUG();
+                       }
+               }
+       }
+}
+
+#else
+static inline void mb_free_blocks_double(struct inode *inode,
+                               struct ext4_buddy *e4b, int first, int count)
+{
+       return;
+}
+static inline void mb_mark_used_double(struct ext4_buddy *e4b,
+                                               int first, int count)
+{
+       return;
+}
+static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+       return;
+}
+#endif
+
+#ifdef AGGRESSIVE_CHECK
+
+#define MB_CHECK_ASSERT(assert)                                                \
+do {                                                                   \
+       if (!(assert)) {                                                \
+               printk(KERN_EMERG                                       \
+                       "Assertion failure in %s() at %s:%d: \"%s\"\n", \
+                       function, file, line, # assert);                \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+                               const char *function, int line)
+{
+       struct super_block *sb = e4b->bd_sb;
+       int order = e4b->bd_blkbits + 1;
+       int max;
+       int max2;
+       int i;
+       int j;
+       int k;
+       int count;
+       struct ext4_group_info *grp;
+       int fragments = 0;
+       int fstart;
+       struct list_head *cur;
+       void *buddy;
+       void *buddy2;
+
+       if (!test_opt(sb, MBALLOC))
+               return 0;
+
+       {
+               static int mb_check_counter;
+               if (mb_check_counter++ % 100 != 0)
+                       return 0;
+       }
+
+       while (order > 1) {
+               buddy = mb_find_buddy(e4b, order, &max);
+               MB_CHECK_ASSERT(buddy);
+               buddy2 = mb_find_buddy(e4b, order - 1, &max2);
+               MB_CHECK_ASSERT(buddy2);
+               MB_CHECK_ASSERT(buddy != buddy2);
+               MB_CHECK_ASSERT(max * 2 == max2);
+
+               count = 0;
+               for (i = 0; i < max; i++) {
+
+                       if (mb_test_bit(i, buddy)) {
+                               /* only single bit in buddy2 may be 1 */
+                               if (!mb_test_bit(i << 1, buddy2)) {
+                                       MB_CHECK_ASSERT(
+                                               mb_test_bit((i<<1)+1, buddy2));
+                               } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
+                                       MB_CHECK_ASSERT(
+                                               mb_test_bit(i << 1, buddy2));
+                               }
+                               continue;
+                       }
+
+                       /* both bits in buddy2 must be 0 */
+                       MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
+                       MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
+
+                       for (j = 0; j < (1 << order); j++) {
+                               k = (i * (1 << order)) + j;
+                               MB_CHECK_ASSERT(
+                                       !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+                       }
+                       count++;
+               }
+               MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
+               order--;
+       }
+
+       fstart = -1;
+       buddy = mb_find_buddy(e4b, 0, &max);
+       for (i = 0; i < max; i++) {
+               if (!mb_test_bit(i, buddy)) {
+                       MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
+                       if (fstart == -1) {
+                               fragments++;
+                               fstart = i;
+                       }
+                       continue;
+               }
+               fstart = -1;
+               /* check used bits only */
+               for (j = 0; j < e4b->bd_blkbits + 1; j++) {
+                       buddy2 = mb_find_buddy(e4b, j, &max2);
+                       k = i >> j;
+                       MB_CHECK_ASSERT(k < max2);
+                       MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
+               }
+       }
+       MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
+       MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
+
+       grp = ext4_get_group_info(sb, e4b->bd_group);
+       buddy = mb_find_buddy(e4b, 0, &max);
+       list_for_each(cur, &grp->bb_prealloc_list) {
+               ext4_group_t groupnr;
+               struct ext4_prealloc_space *pa;
+               pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+               ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
+               MB_CHECK_ASSERT(groupnr == e4b->bd_group);
+               for (i = 0; i < pa->pa_len; i++)
+                       MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
+       }
+       return 0;
+}
+#undef MB_CHECK_ASSERT
+#define mb_check_buddy(e4b) __mb_check_buddy(e4b,      \
+                                       __FILE__, __FUNCTION__, __LINE__)
+#else
+#define mb_check_buddy(e4b)
+#endif
+
+/* FIXME!! need more doc */
+static void ext4_mb_mark_free_simple(struct super_block *sb,
+                               void *buddy, unsigned first, int len,
+                                       struct ext4_group_info *grp)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       unsigned short min;
+       unsigned short max;
+       unsigned short chunk;
+       unsigned short border;
+
+       BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb));
+
+       border = 2 << sb->s_blocksize_bits;
+
+       while (len > 0) {
+               /* find how many blocks can be covered since this position */
+               max = ffs(first | border) - 1;
+
+               /* find how many blocks of power 2 we need to mark */
+               min = fls(len) - 1;
+
+               if (max < min)
+                       min = max;
+               chunk = 1 << min;
+
+               /* mark multiblock chunks only */
+               grp->bb_counters[min]++;
+               if (min > 0)
+                       mb_clear_bit(first >> min,
+                                    buddy + sbi->s_mb_offsets[min]);
+
+               len -= chunk;
+               first += chunk;
+       }
+}
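The ffs/fls walk above splits a free range into size-aligned power-of-two chunks, one per buddy order. A standalone userspace model of just that decomposition (GCC builtins replace the kernel's ffs/fls; the border value assumes 4 KiB blocks):

#include <stdio.h>

/* split [first, first + len) into size-aligned power-of-two chunks,
 * mirroring the ffs/fls loop in ext4_mb_mark_free_simple above.
 * border caps the order, as 2 << s_blocksize_bits does in the kernel. */
static void mark_free_chunks(unsigned first, int len, unsigned border)
{
        while (len > 0) {
                /* largest order the current alignment (or the cap) allows */
                int max = __builtin_ffs(first | border) - 1;
                /* largest order that still fits in the remaining length */
                int min = 31 - __builtin_clz((unsigned)len); /* fls(len) - 1 */

                if (max < min)
                        min = max;
                printf("order %d chunk at block %u\n", min, first);
                first += 1u << min;
                len -= 1 << min;
        }
}

int main(void)
{
        /* a 20-block free extent at block 5 splits as 1 + 2 + 8 + 8 + 1 */
        mark_free_chunks(5, 20, 8192);
        return 0;
}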
+
+static void ext4_mb_generate_buddy(struct super_block *sb,
+                               void *buddy, void *bitmap, ext4_group_t group)
+{
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
+       unsigned short i = 0;
+       unsigned short first;
+       unsigned short len;
+       unsigned free = 0;
+       unsigned fragments = 0;
+       unsigned long long period = get_cycles();
+
+       /* initialize buddy from bitmap which is aggregation
+        * of on-disk bitmap and preallocations */
+       i = ext4_find_next_zero_bit(bitmap, max, 0);
+       grp->bb_first_free = i;
+       while (i < max) {
+               fragments++;
+               first = i;
+               i = ext4_find_next_bit(bitmap, max, i);
+               len = i - first;
+               free += len;
+               if (len > 1)
+                       ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
+               else
+                       grp->bb_counters[0]++;
+               if (i < max)
+                       i = ext4_find_next_zero_bit(bitmap, max, i);
+       }
+       grp->bb_fragments = fragments;
+
+       if (free != grp->bb_free) {
+               printk(KERN_DEBUG
+                       "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+                       group, free, grp->bb_free);
+               grp->bb_free = free;
+       }
+
+       clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+
+       period = get_cycles() - period;
+       spin_lock(&EXT4_SB(sb)->s_bal_lock);
+       EXT4_SB(sb)->s_mb_buddies_generated++;
+       EXT4_SB(sb)->s_mb_generation_time += period;
+       spin_unlock(&EXT4_SB(sb)->s_bal_lock);
+}
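+/*
+ * Illustrative walk (assuming a bitmap with blocks 8-13 free and the
+ * rest in use): the loop above finds i = 8, then the next set bit at
+ * 14, so first = 8 and len = 6; ext4_mb_mark_free_simple() records the
+ * run as an order-2 plus an order-1 chunk, and the group ends up with
+ * bb_first_free = 8, bb_fragments = 1 and bb_free = 6.
+ */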
+
+/* The buddy information is attached to the buddy cache inode
+ * for convenience. The information regarding each group is loaded
+ * via ext4_mb_load_buddy and consists of the group's block bitmap
+ * and its buddy data, stored in the inode as
+ *
+ * {                        page                        }
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
+ *
+ *
+ * one block each for the bitmap and the buddy information,
+ * so each group takes up 2 blocks. A page can contain
+ * blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks,
+ * so it holds information for groups_per_page groups,
+ * which is blocks_per_page/2
+ */
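+/*
+ * A worked example (illustrative): with 4k blocks and 4k pages,
+ * blocks_per_page = 1, so groups_per_page is clamped to 1 and group
+ * G's bitmap block lives at page index 2*G with its buddy block at
+ * page 2*G + 1.  With 1k blocks and 4k pages, blocks_per_page = 4 and
+ * one page holds two groups: page 0 = [bitmap 0][buddy 0][bitmap 1]
+ * [buddy 1], and so on.
+ */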
+
+static int ext4_mb_init_cache(struct page *page, char *incore)
+{
+       int blocksize;
+       int blocks_per_page;
+       int groups_per_page;
+       int err = 0;
+       int i;
+       ext4_group_t first_group;
+       int first_block;
+       struct super_block *sb;
+       struct buffer_head *bhs = NULL; /* so cleanup at "out" is safe */
+       struct buffer_head **bh;
+       struct inode *inode;
+       char *data;
+       char *bitmap;
+
+       mb_debug("init page %lu\n", page->index);
+
+       inode = page->mapping->host;
+       sb = inode->i_sb;
+       blocksize = 1 << inode->i_blkbits;
+       blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+
+       groups_per_page = blocks_per_page >> 1;
+       if (groups_per_page == 0)
+               groups_per_page = 1;
+
+       /* allocate buffer_heads to read bitmaps */
+       if (groups_per_page > 1) {
+               err = -ENOMEM;
+               i = sizeof(struct buffer_head *) * groups_per_page;
+               bh = kzalloc(i, GFP_NOFS);
+               if (bh == NULL)
+                       goto out;
+       } else
+               bh = &bhs;
+
+       first_group = page->index * blocks_per_page / 2;
+
+       /* read all groups the page covers into the cache */
+       for (i = 0; i < groups_per_page; i++) {
+               struct ext4_group_desc *desc;
+
+               if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+                       break;
+
+               err = -EIO;
+               desc = ext4_get_group_desc(sb, first_group + i, NULL);
+               if (desc == NULL)
+                       goto out;
+
+               err = -ENOMEM;
+               bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
+               if (bh[i] == NULL)
+                       goto out;
+
+               if (bh_uptodate_or_lock(bh[i]))
+                       continue;
+
+               if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+                       ext4_init_block_bitmap(sb, bh[i],
+                                               first_group + i, desc);
+                       set_buffer_uptodate(bh[i]);
+                       unlock_buffer(bh[i]);
+                       continue;
+               }
+               get_bh(bh[i]);
+               bh[i]->b_end_io = end_buffer_read_sync;
+               submit_bh(READ, bh[i]);
+               mb_debug("read bitmap for group %lu\n", first_group + i);
+       }
+
+       /* wait for I/O completion */
+       for (i = 0; i < groups_per_page && bh[i]; i++)
+               wait_on_buffer(bh[i]);
+
+       err = -EIO;
+       for (i = 0; i < groups_per_page && bh[i]; i++)
+               if (!buffer_uptodate(bh[i]))
+                       goto out;
+
+       err = 0;
+       first_block = page->index * blocks_per_page;
+       for (i = 0; i < blocks_per_page; i++) {
+               int group;
+               struct ext4_group_info *grinfo;
+
+               group = (first_block + i) >> 1;
+               if (group >= EXT4_SB(sb)->s_groups_count)
+                       break;
+
+               /*
+                * data carries the information regarding this
+                * particular group in the format specified
+                * above
+                */
+               data = page_address(page) + (i * blocksize);
+               bitmap = bh[group - first_group]->b_data;
+
+               /*
+                * We place the buddy block and bitmap block
+                * close together
+                */
+               if ((first_block + i) & 1) {
+                       /* this is the buddy block */
+                       BUG_ON(incore == NULL);
+                       mb_debug("put buddy for group %u in page %lu/%x\n",
+                               group, page->index, i * blocksize);
+                       memset(data, 0xff, blocksize);
+                       grinfo = ext4_get_group_info(sb, group);
+                       grinfo->bb_fragments = 0;
+                       memset(grinfo->bb_counters, 0,
+                              sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+                       /*
+                        * incore got set to the group block bitmap below
+                        */
+                       ext4_mb_generate_buddy(sb, data, incore, group);
+                       incore = NULL;
+               } else {
+                       /* this is the bitmap block */
+                       BUG_ON(incore != NULL);
+                       mb_debug("put bitmap for group %u in page %lu/%x\n",
+                               group, page->index, i * blocksize);
+
+                       /* see comments in ext4_mb_put_pa() */
+                       ext4_lock_group(sb, group);
+                       memcpy(data, bitmap, blocksize);
+
+                       /* mark all preallocated blks used in in-core bitmap */
+                       ext4_mb_generate_from_pa(sb, data, group);
+                       ext4_unlock_group(sb, group);
+
+                       /* set incore so that the buddy information can be
+                        * generated using this
+                        */
+                       incore = data;
+               }
+       }
+       SetPageUptodate(page);
+
+out:
+       if (bh) {
+               for (i = 0; i < groups_per_page && bh[i]; i++)
+                       brelse(bh[i]);
+               if (bh != &bhs)
+                       kfree(bh);
+       }
+       return err;
+}
+
+static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+               struct ext4_buddy *e4b)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct inode *inode = sbi->s_buddy_cache;
+       int blocks_per_page;
+       int block;
+       int pnum;
+       int poff;
+       struct page *page;
+
+       mb_debug("load group %lu\n", group);
+
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+
+       e4b->bd_blkbits = sb->s_blocksize_bits;
+       e4b->bd_info = ext4_get_group_info(sb, group);
+       e4b->bd_sb = sb;
+       e4b->bd_group = group;
+       e4b->bd_buddy_page = NULL;
+       e4b->bd_bitmap_page = NULL;
+
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       poff = block % blocks_per_page;
+
+       /* we could use find_or_create_page(), but it locks the page,
+        * which we'd like to avoid in the fast path ... */
+       page = find_get_page(inode->i_mapping, pnum);
+       if (page == NULL || !PageUptodate(page)) {
+               if (page)
+                       page_cache_release(page);
+               page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+               if (page) {
+                       BUG_ON(page->mapping != inode->i_mapping);
+                       if (!PageUptodate(page)) {
+                               ext4_mb_init_cache(page, NULL);
+                               mb_cmp_bitmaps(e4b, page_address(page) +
+                                              (poff * sb->s_blocksize));
+                       }
+                       unlock_page(page);
+               }
+       }
+       if (page == NULL || !PageUptodate(page))
+               goto err;
+       e4b->bd_bitmap_page = page;
+       e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+       mark_page_accessed(page);
+
+       block++;
+       pnum = block / blocks_per_page;
+       poff = block % blocks_per_page;
+
+       page = find_get_page(inode->i_mapping, pnum);
+       if (page == NULL || !PageUptodate(page)) {
+               if (page)
+                       page_cache_release(page);
+               page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+               if (page) {
+                       BUG_ON(page->mapping != inode->i_mapping);
+                       if (!PageUptodate(page))
+                               ext4_mb_init_cache(page, e4b->bd_bitmap);
+
+                       unlock_page(page);
+               }
+       }
+       if (page == NULL || !PageUptodate(page))
+               goto err;
+       e4b->bd_buddy_page = page;
+       e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+       mark_page_accessed(page);
+
+       BUG_ON(e4b->bd_bitmap_page == NULL);
+       BUG_ON(e4b->bd_buddy_page == NULL);
+
+       return 0;
+
+err:
+       if (e4b->bd_bitmap_page)
+               page_cache_release(e4b->bd_bitmap_page);
+       if (e4b->bd_buddy_page)
+               page_cache_release(e4b->bd_buddy_page);
+       e4b->bd_buddy = NULL;
+       e4b->bd_bitmap = NULL;
+       return -EIO;
+}
+
+static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+{
+       if (e4b->bd_bitmap_page)
+               page_cache_release(e4b->bd_bitmap_page);
+       if (e4b->bd_buddy_page)
+               page_cache_release(e4b->bd_buddy_page);
+}
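+/*
+ * Typical usage of the pair above (an illustrative sketch; compare
+ * ext4_mb_try_best_found() below):
+ *
+ *	struct ext4_buddy e4b;
+ *
+ *	if (ext4_mb_load_buddy(sb, group, &e4b) == 0) {
+ *		ext4_lock_group(sb, group);
+ *		... use e4b.bd_bitmap / e4b.bd_buddy ...
+ *		ext4_unlock_group(sb, group);
+ *		ext4_mb_release_desc(&e4b);
+ *	}
+ */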
+
+
+static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
+{
+       int order = 1;
+       void *bb;
+
+       BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+       BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
+
+       bb = EXT4_MB_BUDDY(e4b);
+       while (order <= e4b->bd_blkbits + 1) {
+               block = block >> 1;
+               if (!mb_test_bit(block, bb)) {
+                       /* this block is part of buddy of order 'order' */
+                       return order;
+               }
+               bb += 1 << (e4b->bd_blkbits - order);
+               order++;
+       }
+       return 0;
+}
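+/*
+ * Example (illustrative, 4k blocks): the order-1 map starts at offset
+ * 0 of the buddy area and each higher order occupies half the space
+ * of the previous one.  For block 12 the loop tests bit 6 in the
+ * order-1 map, then bit 3 in the order-2 map, and so on, returning
+ * the first order whose bit is clear, i.e. the order of the free
+ * chunk that contains the block (or 0 if none is found).
+ */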
+
+static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+{
+       __u32 *addr;
+
+       len = cur + len;
+       while (cur < len) {
+               if ((cur & 31) == 0 && (len - cur) >= 32) {
+                       /* fast path: clear whole word at once */
+                       addr = bm + (cur >> 3);
+                       *addr = 0;
+                       cur += 32;
+                       continue;
+               }
+               mb_clear_bit_atomic(lock, cur, bm);
+               cur++;
+       }
+}
+
+static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+{
+       __u32 *addr;
+
+       len = cur + len;
+       while (cur < len) {
+               if ((cur & 31) == 0 && (len - cur) >= 32) {
+                       /* fast path: set whole word at once */
+                       addr = bm + (cur >> 3);
+                       *addr = 0xffffffff;
+                       cur += 32;
+                       continue;
+               }
+               mb_set_bit_atomic(lock, cur, bm);
+               cur++;
+       }
+}
+
+static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+                         int first, int count)
+{
+       int block = 0;
+       int max = 0;
+       int order;
+       void *buddy;
+       void *buddy2;
+       struct super_block *sb = e4b->bd_sb;
+
+       BUG_ON(first + count > (sb->s_blocksize << 3));
+       BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+       mb_check_buddy(e4b);
+       mb_free_blocks_double(inode, e4b, first, count);
+
+       e4b->bd_info->bb_free += count;
+       if (first < e4b->bd_info->bb_first_free)
+               e4b->bd_info->bb_first_free = first;
+
+       /* let's maintain fragments counter */
+       if (first != 0)
+               block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
+       if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
+               max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
+       if (block && max)
+               e4b->bd_info->bb_fragments--;
+       else if (!block && !max)
+               e4b->bd_info->bb_fragments++;
+
+       /* let's maintain buddy itself */
+       while (count-- > 0) {
+               block = first++;
+               order = 0;
+
+               if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
+                       ext4_fsblk_t blocknr;
+                       blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+                       blocknr += block;
+                       blocknr +=
+                           le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+
+                       ext4_error(sb, __FUNCTION__, "double-free of inode"
+                                  " %lu's block %llu (bit %u in group %lu)\n",
+                                  inode ? inode->i_ino : 0, blocknr, block,
+                                  e4b->bd_group);
+               }
+               mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
+               e4b->bd_info->bb_counters[order]++;
+
+               /* start of the buddy */
+               buddy = mb_find_buddy(e4b, order, &max);
+
+               do {
+                       block &= ~1UL;
+                       if (mb_test_bit(block, buddy) ||
+                                       mb_test_bit(block + 1, buddy))
+                               break;
+
+                       /* both the buddies are free, try to coalesce them */
+                       buddy2 = mb_find_buddy(e4b, order + 1, &max);
+
+                       if (!buddy2)
+                               break;
+
+                       if (order > 0) {
+                               /* for special purposes, we don't set
+                                * free bits in bitmap */
+                               mb_set_bit(block, buddy);
+                               mb_set_bit(block + 1, buddy);
+                       }
+                       e4b->bd_info->bb_counters[order]--;
+                       e4b->bd_info->bb_counters[order]--;
+
+                       block = block >> 1;
+                       order++;
+                       e4b->bd_info->bb_counters[order]++;
+
+                       mb_clear_bit(block, buddy2);
+                       buddy = buddy2;
+               } while (1);
+       }
+       mb_check_buddy(e4b);
+
+       return 0;
+}
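+/*
+ * Coalescing sketch (illustrative): freeing block 5 while block 4 is
+ * already free clears bit 5 in the bitmap; the loop then sees both
+ * order-0 buddies (4, 5) free and clears bit 2 in the order-1 map to
+ * record the free chunk 4-5.  At orders above 0 the two buddy bits
+ * are set again ("used" at that order) before the merged chunk is
+ * recorded one order up, so only the largest free chunk stays marked.
+ */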
+
+static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+                               int needed, struct ext4_free_extent *ex)
+{
+       int next = block;
+       int max;
+       int ord;
+       void *buddy;
+
+       BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+       BUG_ON(ex == NULL);
+
+       buddy = mb_find_buddy(e4b, order, &max);
+       BUG_ON(buddy == NULL);
+       BUG_ON(block >= max);
+       if (mb_test_bit(block, buddy)) {
+               ex->fe_len = 0;
+               ex->fe_start = 0;
+               ex->fe_group = 0;
+               return 0;
+       }
+
+       /* FIXME: drop order completely? */
+       if (likely(order == 0)) {
+               /* find actual order */
+               order = mb_find_order_for_block(e4b, block);
+               block = block >> order;
+       }
+
+       ex->fe_len = 1 << order;
+       ex->fe_start = block << order;
+       ex->fe_group = e4b->bd_group;
+
+       /* calc difference from given start */
+       next = next - ex->fe_start;
+       ex->fe_len -= next;
+       ex->fe_start += next;
+
+       while (needed > ex->fe_len &&
+              (buddy = mb_find_buddy(e4b, order, &max))) {
+
+               if (block + 1 >= max)
+                       break;
+
+               next = (block + 1) * (1 << order);
+               if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+                       break;
+
+               ord = mb_find_order_for_block(e4b, next);
+
+               order = ord;
+               block = next >> order;
+               ex->fe_len += 1 << order;
+       }
+
+       BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
+       return ex->fe_len;
+}
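+/*
+ * Example (illustrative): with blocks 8-13 free, mb_find_extent(e4b,
+ * 0, 10, 4, &ex) locates the order-2 chunk 8-11 that contains block
+ * 10, trims the lead-in so the extent starts at 10 (len 2), then
+ * extends across the neighbouring free order-1 chunk 12-13 and
+ * returns fe_len = 4.
+ */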
+
+static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
+{
+       int ord;
+       int mlen = 0;
+       int max = 0;
+       int cur;
+       int start = ex->fe_start;
+       int len = ex->fe_len;
+       unsigned ret = 0;
+       int len0 = len;
+       void *buddy;
+
+       BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
+       BUG_ON(e4b->bd_group != ex->fe_group);
+       BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+       mb_check_buddy(e4b);
+       mb_mark_used_double(e4b, start, len);
+
+       e4b->bd_info->bb_free -= len;
+       if (e4b->bd_info->bb_first_free == start)
+               e4b->bd_info->bb_first_free += len;
+
+       /* let's maintain fragments counter */
+       if (start != 0)
+               mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+       if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
+               max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+       if (mlen && max)
+               e4b->bd_info->bb_fragments++;
+       else if (!mlen && !max)
+               e4b->bd_info->bb_fragments--;
+
+       /* let's maintain buddy itself */
+       while (len) {
+               ord = mb_find_order_for_block(e4b, start);
+
+               if (((start >> ord) << ord) == start && len >= (1 << ord)) {
+                       /* the whole chunk may be allocated at once! */
+                       mlen = 1 << ord;
+                       buddy = mb_find_buddy(e4b, ord, &max);
+                       BUG_ON((start >> ord) >= max);
+                       mb_set_bit(start >> ord, buddy);
+                       e4b->bd_info->bb_counters[ord]--;
+                       start += mlen;
+                       len -= mlen;
+                       BUG_ON(len < 0);
+                       continue;
+               }
+
+               /* store for history */
+               if (ret == 0)
+                       ret = len | (ord << 16);
+
+               /* we have to split large buddy */
+               BUG_ON(ord <= 0);
+               buddy = mb_find_buddy(e4b, ord, &max);
+               mb_set_bit(start >> ord, buddy);
+               e4b->bd_info->bb_counters[ord]--;
+
+               ord--;
+               cur = (start >> ord) & ~1U;
+               buddy = mb_find_buddy(e4b, ord, &max);
+               mb_clear_bit(cur, buddy);
+               mb_clear_bit(cur + 1, buddy);
+               e4b->bd_info->bb_counters[ord]++;
+               e4b->bd_info->bb_counters[ord]++;
+       }
+
+       mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
+                       EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+       mb_check_buddy(e4b);
+
+       return ret;
+}
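+/*
+ * Splitting sketch (illustrative): marking blocks 10-11 used out of
+ * the free order-2 chunk 8-11 sets bit 2 in the order-2 map, clears
+ * bits 4 and 5 in the order-1 map (splitting the chunk into 8-9 and
+ * 10-11), then sets bit 5 to mark 10-11 used, leaving 8-9 free at
+ * order 1.
+ */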
+
+/*
+ * Must be called under group lock!
+ */
+static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
+                                       struct ext4_buddy *e4b)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       int ret;
+
+       BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
+       BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+       ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
+       ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
+       ret = mb_mark_used(e4b, &ac->ac_b_ex);
+
+       /* preallocation can change ac_b_ex, thus we store actually
+        * allocated blocks for history */
+       ac->ac_f_ex = ac->ac_b_ex;
+
+       ac->ac_status = AC_STATUS_FOUND;
+       ac->ac_tail = ret & 0xffff;
+       ac->ac_buddy = ret >> 16;
+
+       /* XXXXXXX: SUCH A HORRIBLE **CK */
+       /*FIXME!! Why ? */
+       ac->ac_bitmap_page = e4b->bd_bitmap_page;
+       get_page(ac->ac_bitmap_page);
+       ac->ac_buddy_page = e4b->bd_buddy_page;
+       get_page(ac->ac_buddy_page);
+
+       /* store last allocated for subsequent stream allocation */
+       if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+               spin_lock(&sbi->s_md_lock);
+               sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
+               sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
+               spin_unlock(&sbi->s_md_lock);
+       }
+}
+
+/*
+ * regular allocator, for general purposes allocation
+ */
+
+static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
+                                       struct ext4_buddy *e4b,
+                                       int finish_group)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       struct ext4_free_extent *bex = &ac->ac_b_ex;
+       struct ext4_free_extent *gex = &ac->ac_g_ex;
+       struct ext4_free_extent ex;
+       int max;
+
+       /*
+        * We don't want to scan for a whole year
+        */
+       if (ac->ac_found > sbi->s_mb_max_to_scan &&
+                       !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+               ac->ac_status = AC_STATUS_BREAK;
+               return;
+       }
+
+       /*
+        * Haven't found a good chunk so far, let's continue
+        */
+       if (bex->fe_len < gex->fe_len)
+               return;
+
+       if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
+                       && bex->fe_group == e4b->bd_group) {
+               /* recheck chunk's availability - we don't know
+                * when it was found (within this lock-unlock
+                * period or not) */
+               max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
+               if (max >= gex->fe_len) {
+                       ext4_mb_use_best_found(ac, e4b);
+                       return;
+               }
+       }
+}
+
+/*
+ * The routine checks whether the found extent is good enough. If it is,
+ * the extent gets marked used and a flag is set in the context to stop
+ * scanning. Otherwise, the extent is compared with the previously found
+ * extent and, if the new one is better, it is stored in the context.
+ * Later, the best found extent will be used if mballoc can't find a
+ * good enough extent.
+ *
+ * FIXME: real allocation policy is to be designed yet!
+ */
+static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
+                                       struct ext4_free_extent *ex,
+                                       struct ext4_buddy *e4b)
+{
+       struct ext4_free_extent *bex = &ac->ac_b_ex;
+       struct ext4_free_extent *gex = &ac->ac_g_ex;
+
+       BUG_ON(ex->fe_len <= 0);
+       BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+       BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+       BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
+
+       ac->ac_found++;
+
+       /*
+        * The special case - take what you catch first
+        */
+       if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+               *bex = *ex;
+               ext4_mb_use_best_found(ac, e4b);
+               return;
+       }
+
+       /*
+        * Let's check whether the chunk is good enough
+        */
+       if (ex->fe_len == gex->fe_len) {
+               *bex = *ex;
+               ext4_mb_use_best_found(ac, e4b);
+               return;
+       }
+
+       /*
+        * If this is the first found extent, just store it in the context
+        */
+       if (bex->fe_len == 0) {
+               *bex = *ex;
+               return;
+       }
+
+       /*
+        * If the newly found extent is better, store it in the context
+        */
+       if (bex->fe_len < gex->fe_len) {
+               /* if the request isn't satisfied, any found extent
+                * larger than the previous best one is better */
+               if (ex->fe_len > bex->fe_len)
+                       *bex = *ex;
+       } else if (ex->fe_len > gex->fe_len) {
+               /* if the request is satisfied, then we try to find
+                * an extent that still satisfies the request, but is
+                * smaller than the previous one */
+               if (ex->fe_len < bex->fe_len)
+                       *bex = *ex;
+       }
+
+       ext4_mb_check_limits(ac, e4b, 0);
+}
+
+static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+                                       struct ext4_buddy *e4b)
+{
+       struct ext4_free_extent ex = ac->ac_b_ex;
+       ext4_group_t group = ex.fe_group;
+       int max;
+       int err;
+
+       BUG_ON(ex.fe_len <= 0);
+       err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
+       if (err)
+               return err;
+
+       ext4_lock_group(ac->ac_sb, group);
+       max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+
+       if (max > 0) {
+               ac->ac_b_ex = ex;
+               ext4_mb_use_best_found(ac, e4b);
+       }
+
+       ext4_unlock_group(ac->ac_sb, group);
+       ext4_mb_release_desc(e4b);
+
+       return 0;
+}
+
+static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+                               struct ext4_buddy *e4b)
+{
+       ext4_group_t group = ac->ac_g_ex.fe_group;
+       int max;
+       int err;
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       struct ext4_super_block *es = sbi->s_es;
+       struct ext4_free_extent ex;
+
+       if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+               return 0;
+
+       err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
+       if (err)
+               return err;
+
+       ext4_lock_group(ac->ac_sb, group);
+       max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+                            ac->ac_g_ex.fe_len, &ex);
+
+       if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+               ext4_fsblk_t start;
+
+               start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
+                       ex.fe_start + le32_to_cpu(es->s_first_data_block);
+               /* use do_div to get remainder (would be 64-bit modulo) */
+               if (do_div(start, sbi->s_stripe) == 0) {
+                       ac->ac_found++;
+                       ac->ac_b_ex = ex;
+                       ext4_mb_use_best_found(ac, e4b);
+               }
+       } else if (max >= ac->ac_g_ex.fe_len) {
+               BUG_ON(ex.fe_len <= 0);
+               BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
+               BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
+               ac->ac_found++;
+               ac->ac_b_ex = ex;
+               ext4_mb_use_best_found(ac, e4b);
+       } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
+               /* Sometimes, the caller may want to merge even a small
+                * number of blocks to an existing extent */
+               BUG_ON(ex.fe_len <= 0);
+               BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
+               BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
+               ac->ac_found++;
+               ac->ac_b_ex = ex;
+               ext4_mb_use_best_found(ac, e4b);
+       }
+       ext4_unlock_group(ac->ac_sb, group);
+       ext4_mb_release_desc(e4b);
+
+       return 0;
+}
+
+/*
+ * The routine scans buddy structures (not the bitmap!) from the given
+ * order up to the max order and tries to find a big enough chunk to
+ * satisfy the request
+ */
+static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+                                       struct ext4_buddy *e4b)
+{
+       struct super_block *sb = ac->ac_sb;
+       struct ext4_group_info *grp = e4b->bd_info;
+       void *buddy;
+       int i;
+       int k;
+       int max;
+
+       BUG_ON(ac->ac_2order <= 0);
+       for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
+               if (grp->bb_counters[i] == 0)
+                       continue;
+
+               buddy = mb_find_buddy(e4b, i, &max);
+               BUG_ON(buddy == NULL);
+
+               k = ext4_find_next_zero_bit(buddy, max, 0);
+               BUG_ON(k >= max);
+
+               ac->ac_found++;
+
+               ac->ac_b_ex.fe_len = 1 << i;
+               ac->ac_b_ex.fe_start = k << i;
+               ac->ac_b_ex.fe_group = e4b->bd_group;
+
+               ext4_mb_use_best_found(ac, e4b);
+
+               BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+
+               if (EXT4_SB(sb)->s_mb_stats)
+                       atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
+
+               break;
+       }
+}
+
+/*
+ * The routine scans the group and measures all found extents.
+ * To optimize scanning, the caller must pass the number of free
+ * blocks in the group, so the routine can know the upper limit.
+ */
+static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+                                       struct ext4_buddy *e4b)
+{
+       struct super_block *sb = ac->ac_sb;
+       void *bitmap = EXT4_MB_BITMAP(e4b);
+       struct ext4_free_extent ex;
+       int i;
+       int free;
+
+       free = e4b->bd_info->bb_free;
+       BUG_ON(free <= 0);
+
+       i = e4b->bd_info->bb_first_free;
+
+       while (free && ac->ac_status == AC_STATUS_CONTINUE) {
+               i = ext4_find_next_zero_bit(bitmap,
+                                               EXT4_BLOCKS_PER_GROUP(sb), i);
+               if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
+                       BUG_ON(free != 0);
+                       break;
+               }
+
+               mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+               BUG_ON(ex.fe_len <= 0);
+               BUG_ON(free < ex.fe_len);
+
+               ext4_mb_measure_extent(ac, &ex, e4b);
+
+               i += ex.fe_len;
+               free -= ex.fe_len;
+       }
+
+       ext4_mb_check_limits(ac, e4b, 1);
+}
+
+/*
+ * This is a special case for storage like RAID5: we try to find
+ * stripe-aligned chunks for stripe-sized requests.
+ * XXX should do so at least for multiples of the stripe size as well
+ */
+static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+                                struct ext4_buddy *e4b)
+{
+       struct super_block *sb = ac->ac_sb;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       void *bitmap = EXT4_MB_BITMAP(e4b);
+       struct ext4_free_extent ex;
+       ext4_fsblk_t first_group_block;
+       ext4_fsblk_t a;
+       ext4_grpblk_t i;
+       int max;
+
+       BUG_ON(sbi->s_stripe == 0);
+
+       /* find first stripe-aligned block in group */
+       first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
+               + le32_to_cpu(sbi->s_es->s_first_data_block);
+       a = first_group_block + sbi->s_stripe - 1;
+       do_div(a, sbi->s_stripe);
+       i = (a * sbi->s_stripe) - first_group_block;
+
+       while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
+               if (!mb_test_bit(i, bitmap)) {
+                       max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+                       if (max >= sbi->s_stripe) {
+                               ac->ac_found++;
+                               ac->ac_b_ex = ex;
+                               ext4_mb_use_best_found(ac, e4b);
+                               break;
+                       }
+               }
+               i += sbi->s_stripe;
+       }
+}
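+/*
+ * The alignment arithmetic above, worked through (illustrative): with
+ * s_stripe = 16 and first_group_block = 32770, do_div() yields
+ * a = (32770 + 15) / 16 = 2049, so i = 2049 * 16 - 32770 = 14 is the
+ * first in-group offset whose absolute block number is a multiple of
+ * the stripe size; the scan then advances in whole stripes.
+ */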
+
+static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+                               ext4_group_t group, int cr)
+{
+       unsigned free, fragments;
+       unsigned i, bits;
+       struct ext4_group_desc *desc;
+       struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+
+       BUG_ON(cr < 0 || cr >= 4);
+       BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+       free = grp->bb_free;
+       fragments = grp->bb_fragments;
+       if (free == 0)
+               return 0;
+       if (fragments == 0)
+               return 0;
+
+       switch (cr) {
+       case 0:
+               BUG_ON(ac->ac_2order == 0);
+               /* If this group is uninitialized, skip it initially */
+               desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
+               if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
+                       return 0;
+
+               bits = ac->ac_sb->s_blocksize_bits + 1;
+               for (i = ac->ac_2order; i <= bits; i++)
+                       if (grp->bb_counters[i] > 0)
+                               return 1;
+               break;
+       case 1:
+               if ((free / fragments) >= ac->ac_g_ex.fe_len)
+                       return 1;
+               break;
+       case 2:
+               if (free >= ac->ac_g_ex.fe_len)
+                       return 1;
+               break;
+       case 3:
+               return 1;
+       default:
+               BUG();
+       }
+
+       return 0;
+}
+
+static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+{
+       ext4_group_t group;
+       ext4_group_t i;
+       int cr;
+       int err = 0;
+       int bsbits;
+       struct ext4_sb_info *sbi;
+       struct super_block *sb;
+       struct ext4_buddy e4b;
+       loff_t size, isize;
+
+       sb = ac->ac_sb;
+       sbi = EXT4_SB(sb);
+       BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+       /* first, try the goal */
+       err = ext4_mb_find_by_goal(ac, &e4b);
+       if (err || ac->ac_status == AC_STATUS_FOUND)
+               goto out;
+
+       if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+               goto out;
+
+       /*
+        * ac->ac_2order is set only if the fe_len is a power of 2;
+        * if ac_2order is set we also set the criteria to 0 so that
+        * we try an exact allocation using the buddy.
+        */
+       i = fls(ac->ac_g_ex.fe_len);
+       ac->ac_2order = 0;
+       /*
+        * We search using buddy data only if the order of the request
+        * is greater than or equal to sbi->s_mb_order2_reqs.
+        * You can tune it via /proc/fs/ext4/<partition>/order2_req
+        */
+       if (i >= sbi->s_mb_order2_reqs) {
+               /*
+                * This should tell if fe_len is exactly a power of 2
+                */
+               if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+                       ac->ac_2order = i - 1;
+       }
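+       /*
+        * e.g. (illustrative) fe_len = 64: i = fls(64) = 7 and
+        * 64 & ~(1 << 6) == 0, so ac_2order becomes 6; fe_len = 96
+        * leaves ac_2order at 0 because bit 5 survives the mask.
+        */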
+
+       bsbits = ac->ac_sb->s_blocksize_bits;
+       /* if stream allocation is enabled, use global goal */
+       size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+       isize = i_size_read(ac->ac_inode) >> bsbits;
+       if (size < isize)
+               size = isize;
+
+       if (size < sbi->s_mb_stream_request &&
+                       (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+               /* TBD: may be hot point */
+               spin_lock(&sbi->s_md_lock);
+               ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
+               ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
+               spin_unlock(&sbi->s_md_lock);
+       }
+
+       /* start searching for the right group from the specified goal value */
+       group = ac->ac_g_ex.fe_group;
+
+       /* Let's just scan groups to find more-or-less suitable blocks */
+       cr = ac->ac_2order ? 0 : 1;
+       /*
+        * cr == 0: try to get an exact (power-of-two) allocation,
+        * cr == 1: accept groups whose average fragment size can
+        *          satisfy the request,
+        * cr == 2: accept any group with enough free blocks,
+        * cr == 3: take anything
+        */
+repeat:
+       for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
+               ac->ac_criteria = cr;
+               for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+                       struct ext4_group_info *grp;
+                       struct ext4_group_desc *desc;
+
+                       if (group == EXT4_SB(sb)->s_groups_count)
+                               group = 0;
+
+                       /* quick check to skip empty groups */
+                       grp = ext4_get_group_info(ac->ac_sb, group);
+                       if (grp->bb_free == 0)
+                               continue;
+
+                       /*
+                        * if the group is already initialized, we check
+                        * whether it is a good group first, so we don't
+                        * load the buddy needlessly
+                        */
+                       if (EXT4_MB_GRP_NEED_INIT(grp)) {
+                               /*
+                                * we need full data about the group
+                                * to make a good selection
+                                */
+                               err = ext4_mb_load_buddy(sb, group, &e4b);
+                               if (err)
+                                       goto out;
+                               ext4_mb_release_desc(&e4b);
+                       }
+
+                       /*
+                        * If the particular group doesn't satisfy our
+                        * criteria we continue with the next group
+                        */
+                       if (!ext4_mb_good_group(ac, group, cr))
+                               continue;
+
+                       err = ext4_mb_load_buddy(sb, group, &e4b);
+                       if (err)
+                               goto out;
+
+                       ext4_lock_group(sb, group);
+                       if (!ext4_mb_good_group(ac, group, cr)) {
+                               /* someone did allocation from this group */
+                               ext4_unlock_group(sb, group);
+                               ext4_mb_release_desc(&e4b);
+                               continue;
+                       }
+
+                       ac->ac_groups_scanned++;
+                       desc = ext4_get_group_desc(sb, group, NULL);
+                       if (cr == 0 || (desc->bg_flags &
+                                       cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
+                                       ac->ac_2order != 0))
+                               ext4_mb_simple_scan_group(ac, &e4b);
+                       else if (cr == 1 &&
+                                       ac->ac_g_ex.fe_len == sbi->s_stripe)
+                               ext4_mb_scan_aligned(ac, &e4b);
+                       else
+                               ext4_mb_complex_scan_group(ac, &e4b);
+
+                       ext4_unlock_group(sb, group);
+                       ext4_mb_release_desc(&e4b);
+
+                       if (ac->ac_status != AC_STATUS_CONTINUE)
+                               break;
+               }
+       }
+
+       if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
+           !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+               /*
+                * We've been searching too long. Let's try to allocate
+                * the best chunk we've found so far
+                */
+
+               ext4_mb_try_best_found(ac, &e4b);
+               if (ac->ac_status != AC_STATUS_FOUND) {
+                       /*
+                        * Someone luckier has already allocated it.
+                        * The only thing we can do is just take the
+                        * first found block(s).
+                       printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
+                        */
+                       ac->ac_b_ex.fe_group = 0;
+                       ac->ac_b_ex.fe_start = 0;
+                       ac->ac_b_ex.fe_len = 0;
+                       ac->ac_status = AC_STATUS_CONTINUE;
+                       ac->ac_flags |= EXT4_MB_HINT_FIRST;
+                       cr = 3;
+                       atomic_inc(&sbi->s_mb_lost_chunks);
+                       goto repeat;
+               }
+       }
+out:
+       return err;
+}
+
+#ifdef EXT4_MB_HISTORY
+struct ext4_mb_proc_session {
+       struct ext4_mb_history *history;
+       struct super_block *sb;
+       int start;
+       int max;
+};
+
+static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
+                                       struct ext4_mb_history *hs,
+                                       int first)
+{
+       if (hs == s->history + s->max)
+               hs = s->history;
+       if (!first && hs == s->history + s->start)
+               return NULL;
+       while (hs->orig.fe_len == 0) {
+               hs++;
+               if (hs == s->history + s->max)
+                       hs = s->history;
+               if (hs == s->history + s->start)
+                       return NULL;
+       }
+       return hs;
+}
+
+static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
+{
+       struct ext4_mb_proc_session *s = seq->private;
+       struct ext4_mb_history *hs;
+       int l = *pos;
+
+       if (l == 0)
+               return SEQ_START_TOKEN;
+       hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
+       if (!hs)
+               return NULL;
+       while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
+       return hs;
+}
+
+static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
+                                     loff_t *pos)
+{
+       struct ext4_mb_proc_session *s = seq->private;
+       struct ext4_mb_history *hs = v;
+
+       ++*pos;
+       if (v == SEQ_START_TOKEN)
+               return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
+       else
+               return ext4_mb_history_skip_empty(s, ++hs, 0);
+}
+
+static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
+{
+       char buf[25], buf2[25], buf3[25], *fmt;
+       struct ext4_mb_history *hs = v;
+
+       if (v == SEQ_START_TOKEN) {
+               seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
+                               "%-5s %-2s %-5s %-5s %-5s %-6s\n",
+                         "pid", "inode", "original", "goal", "result", "found",
+                          "grps", "cr", "flags", "merge", "tail", "broken");
+               return 0;
+       }
+
+       if (hs->op == EXT4_MB_HISTORY_ALLOC) {
+               fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
+                       "%-5u %-5s %-5u %-6u\n";
+               sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+                       hs->result.fe_start, hs->result.fe_len,
+                       hs->result.fe_logical);
+               sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+                       hs->orig.fe_start, hs->orig.fe_len,
+                       hs->orig.fe_logical);
+               sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
+                       hs->goal.fe_start, hs->goal.fe_len,
+                       hs->goal.fe_logical);
+               seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
+                               hs->found, hs->groups, hs->cr, hs->flags,
+                               hs->merged ? "M" : "", hs->tail,
+                               hs->buddy ? 1 << hs->buddy : 0);
+       } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
+               fmt = "%-5u %-8u %-23s %-23s %-23s\n";
+               sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+                       hs->result.fe_start, hs->result.fe_len,
+                       hs->result.fe_logical);
+               sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+                       hs->orig.fe_start, hs->orig.fe_len,
+                       hs->orig.fe_logical);
+               seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
+       } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
+               sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+                       hs->result.fe_start, hs->result.fe_len);
+               seq_printf(seq, "%-5u %-8u %-23s discard\n",
+                               hs->pid, hs->ino, buf2);
+       } else if (hs->op == EXT4_MB_HISTORY_FREE) {
+               sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+                       hs->result.fe_start, hs->result.fe_len);
+               seq_printf(seq, "%-5u %-8u %-23s free\n",
+                               hs->pid, hs->ino, buf2);
+       }
+       return 0;
+}
+
+static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations ext4_mb_seq_history_ops = {
+       .start  = ext4_mb_seq_history_start,
+       .next   = ext4_mb_seq_history_next,
+       .stop   = ext4_mb_seq_history_stop,
+       .show   = ext4_mb_seq_history_show,
+};
+
+static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
+{
+       struct super_block *sb = PDE(inode)->data;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_mb_proc_session *s;
+       int rc;
+       int size;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (s == NULL)
+               return -ENOMEM;
+       s->sb = sb;
+       size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
+       s->history = kmalloc(size, GFP_KERNEL);
+       if (s->history == NULL) {
+               kfree(s);
+               return -ENOMEM;
+       }
+
+       spin_lock(&sbi->s_mb_history_lock);
+       memcpy(s->history, sbi->s_mb_history, size);
+       s->max = sbi->s_mb_history_max;
+       s->start = sbi->s_mb_history_cur % s->max;
+       spin_unlock(&sbi->s_mb_history_lock);
+
+       rc = seq_open(file, &ext4_mb_seq_history_ops);
+       if (rc == 0) {
+               struct seq_file *m = (struct seq_file *)file->private_data;
+               m->private = s;
+       } else {
+               kfree(s->history);
+               kfree(s);
+       }
+       return rc;
+}
+
+static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = (struct seq_file *)file->private_data;
+       struct ext4_mb_proc_session *s = seq->private;
+       kfree(s->history);
+       kfree(s);
+       return seq_release(inode, file);
+}
+
+static ssize_t ext4_mb_seq_history_write(struct file *file,
+                               const char __user *buffer,
+                               size_t count, loff_t *ppos)
+{
+       struct seq_file *seq = (struct seq_file *)file->private_data;
+       struct ext4_mb_proc_session *s = seq->private;
+       struct super_block *sb = s->sb;
+       char str[32];
+       int value;
+
+       if (count >= sizeof(str)) {
+               printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
+                               "mb_history", (int)sizeof(str));
+               return -EOVERFLOW;
+       }
+
+       if (copy_from_user(str, buffer, count))
+               return -EFAULT;
+
+       value = simple_strtol(str, NULL, 0);
+       if (value < 0)
+               return -ERANGE;
+       EXT4_SB(sb)->s_mb_history_filter = value;
+
+       return count;
+}
+
+static struct file_operations ext4_mb_seq_history_fops = {
+       .owner          = THIS_MODULE,
+       .open           = ext4_mb_seq_history_open,
+       .read           = seq_read,
+       .write          = ext4_mb_seq_history_write,
+       .llseek         = seq_lseek,
+       .release        = ext4_mb_seq_history_release,
+};
+
+static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+{
+       struct super_block *sb = seq->private;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       ext4_group_t group;
+
+       if (*pos < 0 || *pos >= sbi->s_groups_count)
+               return NULL;
+
+       group = *pos + 1;
+       return (void *) group;
+}
+
+static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct super_block *sb = seq->private;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       ext4_group_t group;
+
+       ++*pos;
+       if (*pos < 0 || *pos >= sbi->s_groups_count)
+               return NULL;
+       group = *pos + 1;
+       return (void *) group;
+}
+
+static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+{
+       struct super_block *sb = seq->private;
+       long group = (long) v;
+       int i;
+       int err;
+       struct ext4_buddy e4b;
+       struct sg {
+               struct ext4_group_info info;
+               unsigned short counters[16];
+       } sg;
+
+       group--;
+       if (group == 0)
+               seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
+                               "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
+                                 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+                          "group", "free", "frags", "first",
+                          "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
+                          "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+
+       i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+               sizeof(struct ext4_group_info);
+       err = ext4_mb_load_buddy(sb, group, &e4b);
+       if (err) {
+               seq_printf(seq, "#%-5lu: I/O error\n", group);
+               return 0;
+       }
+       ext4_lock_group(sb, group);
+       memcpy(&sg, ext4_get_group_info(sb, group), i);
+       ext4_unlock_group(sb, group);
+       ext4_mb_release_desc(&e4b);
+
+       seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+                       sg.info.bb_fragments, sg.info.bb_first_free);
+       for (i = 0; i <= 13; i++)
+               seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
+                               sg.info.bb_counters[i] : 0);
+       seq_printf(seq, " ]\n");
+
+       return 0;
+}
+
+static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations ext4_mb_seq_groups_ops = {
+       .start  = ext4_mb_seq_groups_start,
+       .next   = ext4_mb_seq_groups_next,
+       .stop   = ext4_mb_seq_groups_stop,
+       .show   = ext4_mb_seq_groups_show,
+};
+
+static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
+{
+       struct super_block *sb = PDE(inode)->data;
+       int rc;
+
+       rc = seq_open(file, &ext4_mb_seq_groups_ops);
+       if (rc == 0) {
+               struct seq_file *m = (struct seq_file *)file->private_data;
+               m->private = sb;
+       }
+       return rc;
+}
+
+static struct file_operations ext4_mb_seq_groups_fops = {
+       .owner          = THIS_MODULE,
+       .open           = ext4_mb_seq_groups_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
+static void ext4_mb_history_release(struct super_block *sb)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       remove_proc_entry("mb_groups", sbi->s_mb_proc);
+       remove_proc_entry("mb_history", sbi->s_mb_proc);
+
+       kfree(sbi->s_mb_history);
+}
+
+static void ext4_mb_history_init(struct super_block *sb)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       int i;
+
+       if (sbi->s_mb_proc != NULL) {
+               struct proc_dir_entry *p;
+               p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
+               if (p) {
+                       p->proc_fops = &ext4_mb_seq_history_fops;
+                       p->data = sb;
+               }
+               p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc);
+               if (p) {
+                       p->proc_fops = &ext4_mb_seq_groups_fops;
+                       p->data = sb;
+               }
+       }
+
+       sbi->s_mb_history_max = 1000;
+       sbi->s_mb_history_cur = 0;
+       spin_lock_init(&sbi->s_mb_history_lock);
+       i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
+       sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+       /* if we can't allocate history, then we simply won't use it */
+}
+
+static void ext4_mb_store_history(struct ext4_allocation_context *ac)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       struct ext4_mb_history h;
+
+       if (unlikely(sbi->s_mb_history == NULL))
+               return;
+
+       if (!(ac->ac_op & sbi->s_mb_history_filter))
+               return;
+
+       h.op = ac->ac_op;
+       h.pid = current->pid;
+       h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
+       h.orig = ac->ac_o_ex;
+       h.result = ac->ac_b_ex;
+       h.flags = ac->ac_flags;
+       h.found = ac->ac_found;
+       h.groups = ac->ac_groups_scanned;
+       h.cr = ac->ac_criteria;
+       h.tail = ac->ac_tail;
+       h.buddy = ac->ac_buddy;
+       h.merged = 0;
+       if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
+               if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+                               ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+                       h.merged = 1;
+               h.goal = ac->ac_g_ex;
+               h.result = ac->ac_f_ex;
+       }
+
+       spin_lock(&sbi->s_mb_history_lock);
+       memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
+       if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
+               sbi->s_mb_history_cur = 0;
+       spin_unlock(&sbi->s_mb_history_lock);
+}
+
+#else
+#define ext4_mb_history_release(sb)
+#define ext4_mb_history_init(sb)
+#endif
+
+static int ext4_mb_init_backend(struct super_block *sb)
+{
+       ext4_group_t i;
+       int j, len, metalen;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       int num_meta_group_infos =
+               (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+                       EXT4_DESC_PER_BLOCK_BITS(sb);
+       struct ext4_group_info **meta_group_info;
+
+       /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
+        * kmalloc. A 128KB malloc should suffice for a 256TB filesystem.
+        * So a two-level scheme suffices for now. */
+       sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
+                                   num_meta_group_infos, GFP_KERNEL);
+       if (sbi->s_group_info == NULL) {
+               printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
+               return -ENOMEM;
+       }
+       sbi->s_buddy_cache = new_inode(sb);
+       if (sbi->s_buddy_cache == NULL) {
+               printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+               goto err_freesgi;
+       }
+       EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+
+       metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
+       for (i = 0; i < num_meta_group_infos; i++) {
+               if ((i + 1) == num_meta_group_infos)
+                       metalen = sizeof(*meta_group_info) *
+                               (sbi->s_groups_count -
+                                       (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
+               meta_group_info = kmalloc(metalen, GFP_KERNEL);
+               if (meta_group_info == NULL) {
+                       printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+                              "buddy group\n");
+                       goto err_freemeta;
+               }
+               sbi->s_group_info[i] = meta_group_info;
+       }
+
+       /*
+        * calculate the needed size; if the bb_counters size changes,
+        * don't forget to update ext4_mb_generate_buddy()
+        */
+       len = sizeof(struct ext4_group_info);
+       len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
+       for (i = 0; i < sbi->s_groups_count; i++) {
+               struct ext4_group_desc *desc;
+
+               meta_group_info =
+                       sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+               j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+               meta_group_info[j] = kzalloc(len, GFP_KERNEL);
+               if (meta_group_info[j] == NULL) {
+                       printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+                       goto err_freebuddy;
+               }
+               desc = ext4_get_group_desc(sb, i, NULL);
+               if (desc == NULL) {
+                       printk(KERN_ERR
+                               "EXT4-fs: can't read descriptor %lu\n", i);
+                       kfree(meta_group_info[j]);
+                       goto err_freebuddy;
+               }
+               /* kzalloc already zeroed the group info, no memset needed */
+               set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+                       &(meta_group_info[j]->bb_state));
+
+               /*
+                * initialize bb_free to be able to skip
+                * empty groups without initialization
+                */
+               if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+                       meta_group_info[j]->bb_free =
+                               ext4_free_blocks_after_init(sb, i, desc);
+               } else {
+                       meta_group_info[j]->bb_free =
+                               le16_to_cpu(desc->bg_free_blocks_count);
+               }
+
+               INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+               {
+                       struct buffer_head *bh;
+                       meta_group_info[j]->bb_bitmap =
+                               kmalloc(sb->s_blocksize, GFP_KERNEL);
+                       BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
+                       bh = read_block_bitmap(sb, i);
+                       BUG_ON(bh == NULL);
+                       memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
+                                       sb->s_blocksize);
+                       put_bh(bh);
+               }
+#endif
+
+       }
+
+       return 0;
+
+err_freebuddy:
+       /* i is unsigned (ext4_group_t), so "i >= 0" would never terminate */
+       while (i-- > 0)
+               kfree(ext4_get_group_info(sb, i));
+       i = num_meta_group_infos;
+err_freemeta:
+       while (i-- > 0)
+               kfree(sbi->s_group_info[i]);
+       iput(sbi->s_buddy_cache);
+err_freesgi:
+       kfree(sbi->s_group_info);
+       return -ENOMEM;
+}
+
+int ext4_mb_init(struct super_block *sb, int needs_recovery)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       unsigned i;
+       unsigned offset;
+       unsigned max;
+
+       if (!test_opt(sb, MBALLOC))
+               return 0;
+
+       i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+
+       sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
+       if (sbi->s_mb_offsets == NULL) {
+               clear_opt(sbi->s_mount_opt, MBALLOC);
+               return -ENOMEM;
+       }
+       sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
+       if (sbi->s_mb_maxs == NULL) {
+               clear_opt(sbi->s_mount_opt, MBALLOC);
+               kfree(sbi->s_mb_offsets);
+               return -ENOMEM;
+       }
+
+       /* order 0 is regular bitmap */
+       sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
+       sbi->s_mb_offsets[0] = 0;
+
+       i = 1;
+       offset = 0;
+       max = sb->s_blocksize << 2;
+       do {
+               sbi->s_mb_offsets[i] = offset;
+               sbi->s_mb_maxs[i] = max;
+               offset += 1 << (sb->s_blocksize_bits - i);
+               max = max >> 1;
+               i++;
+       } while (i <= sb->s_blocksize_bits + 1);
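+       /* e.g. for 4KB blocks (s_blocksize_bits == 12): the order-1 bitmap
+        * gets 16384 bit slots at byte offset 0, order 2 gets 8192 at
+        * offset 2048, order 3 gets 4096 at offset 3072, and so on -- each
+        * level half the previous, all packed into one buddy block */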
+
+       /* init file for buddy data */
+       i = ext4_mb_init_backend(sb);
+       if (i) {
+               clear_opt(sbi->s_mount_opt, MBALLOC);
+               kfree(sbi->s_mb_offsets);
+               kfree(sbi->s_mb_maxs);
+               return i;
+       }
+
+       spin_lock_init(&sbi->s_md_lock);
+       INIT_LIST_HEAD(&sbi->s_active_transaction);
+       INIT_LIST_HEAD(&sbi->s_closed_transaction);
+       INIT_LIST_HEAD(&sbi->s_committed_transaction);
+       spin_lock_init(&sbi->s_bal_lock);
+
+       sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+       sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+       sbi->s_mb_stats = MB_DEFAULT_STATS;
+       sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+       sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+       sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
+       sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+
+       i = sizeof(struct ext4_locality_group) * NR_CPUS;
+       sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+       if (sbi->s_locality_groups == NULL) {
+               clear_opt(sbi->s_mount_opt, MBALLOC);
+               kfree(sbi->s_mb_offsets);
+               kfree(sbi->s_mb_maxs);
+               return -ENOMEM;
+       }
+       for (i = 0; i < NR_CPUS; i++) {
+               struct ext4_locality_group *lg;
+               lg = &sbi->s_locality_groups[i];
+               mutex_init(&lg->lg_mutex);
+               INIT_LIST_HEAD(&lg->lg_prealloc_list);
+               spin_lock_init(&lg->lg_prealloc_lock);
+       }
+
+       ext4_mb_init_per_dev_proc(sb);
+       ext4_mb_history_init(sb);
+
+       printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
+       return 0;
+}
+
+/* needs to be called with the ext4 group lock held (ext4_lock_group) */
+static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
+{
+       struct ext4_prealloc_space *pa;
+       struct list_head *cur, *tmp;
+       int count = 0;
+
+       list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
+               pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+               list_del(&pa->pa_group_list);
+               count++;
+               kfree(pa);
+       }
+       if (count)
+               mb_debug("mballoc: %u PAs left\n", count);
+}
+
+int ext4_mb_release(struct super_block *sb)
+{
+       ext4_group_t i;
+       int num_meta_group_infos;
+       struct ext4_group_info *grinfo;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (!test_opt(sb, MBALLOC))
+               return 0;
+
+       /* release freed, non-committed blocks */
+       spin_lock(&sbi->s_md_lock);
+       list_splice_init(&sbi->s_closed_transaction,
+                       &sbi->s_committed_transaction);
+       list_splice_init(&sbi->s_active_transaction,
+                       &sbi->s_committed_transaction);
+       spin_unlock(&sbi->s_md_lock);
+       ext4_mb_free_committed_blocks(sb);
+
+       if (sbi->s_group_info) {
+               for (i = 0; i < sbi->s_groups_count; i++) {
+                       grinfo = ext4_get_group_info(sb, i);
+#ifdef DOUBLE_CHECK
+                       kfree(grinfo->bb_bitmap);
+#endif
+                       ext4_lock_group(sb, i);
+                       ext4_mb_cleanup_pa(grinfo);
+                       ext4_unlock_group(sb, i);
+                       kfree(grinfo);
+               }
+               num_meta_group_infos = (sbi->s_groups_count +
+                               EXT4_DESC_PER_BLOCK(sb) - 1) >>
+                       EXT4_DESC_PER_BLOCK_BITS(sb);
+               for (i = 0; i < num_meta_group_infos; i++)
+                       kfree(sbi->s_group_info[i]);
+               kfree(sbi->s_group_info);
+       }
+       kfree(sbi->s_mb_offsets);
+       kfree(sbi->s_mb_maxs);
+       if (sbi->s_buddy_cache)
+               iput(sbi->s_buddy_cache);
+       if (sbi->s_mb_stats) {
+               printk(KERN_INFO
+                      "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
+                               atomic_read(&sbi->s_bal_allocated),
+                               atomic_read(&sbi->s_bal_reqs),
+                               atomic_read(&sbi->s_bal_success));
+               printk(KERN_INFO
+                     "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
+                               "%u 2^N hits, %u breaks, %u lost\n",
+                               atomic_read(&sbi->s_bal_ex_scanned),
+                               atomic_read(&sbi->s_bal_goals),
+                               atomic_read(&sbi->s_bal_2orders),
+                               atomic_read(&sbi->s_bal_breaks),
+                               atomic_read(&sbi->s_mb_lost_chunks));
+               printk(KERN_INFO
+                      "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
+                               sbi->s_mb_buddies_generated,
+                               sbi->s_mb_generation_time);
+               printk(KERN_INFO
+                      "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
+                               atomic_read(&sbi->s_mb_preallocated),
+                               atomic_read(&sbi->s_mb_discarded));
+       }
+
+       kfree(sbi->s_locality_groups);
+
+       ext4_mb_history_release(sb);
+       ext4_mb_destroy_per_dev_proc(sb);
+
+       return 0;
+}
+
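+/*
+ * blocks freed by a transaction stay on the per-sb transaction lists and
+ * are returned to the buddy only once that transaction has committed;
+ * freeing them earlier could let a reallocation expose stale data after
+ * a crash
+ */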
+static void ext4_mb_free_committed_blocks(struct super_block *sb)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       int err;
+       int i;
+       int count = 0;
+       int count2 = 0;
+       struct ext4_free_metadata *md;
+       struct ext4_buddy e4b;
+
+       if (list_empty(&sbi->s_committed_transaction))
+               return;
+
+       /* there are committed blocks still to be freed */
+       do {
+               /* get next array of blocks */
+               md = NULL;
+               spin_lock(&sbi->s_md_lock);
+               if (!list_empty(&sbi->s_committed_transaction)) {
+                       md = list_entry(sbi->s_committed_transaction.next,
+                                       struct ext4_free_metadata, list);
+                       list_del(&md->list);
+               }
+               spin_unlock(&sbi->s_md_lock);
+
+               if (md == NULL)
+                       break;
+
+               mb_debug("gonna free %u blocks in group %lu (0x%p):",
+                               md->num, md->group, md);
+
+               err = ext4_mb_load_buddy(sb, md->group, &e4b);
+               /* we expect to find existing buddy because it's pinned */
+               BUG_ON(err != 0);
+
+               /* there are blocks to put in buddy to make them really free */
+               count += md->num;
+               count2++;
+               ext4_lock_group(sb, md->group);
+               for (i = 0; i < md->num; i++) {
+                       mb_debug(" %u", md->blocks[i]);
+                       err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+                       BUG_ON(err != 0);
+               }
+               mb_debug("\n");
+               ext4_unlock_group(sb, md->group);
+
+               /* balance refcounts from ext4_mb_free_metadata() */
+               page_cache_release(e4b.bd_buddy_page);
+               page_cache_release(e4b.bd_bitmap_page);
+
+               kfree(md);
+               ext4_mb_release_desc(&e4b);
+
+       } while (md);
+
+       mb_debug("freed %u blocks in %u structures\n", count, count2);
+}
+
+#define EXT4_ROOT                      "ext4"
+#define EXT4_MB_STATS_NAME             "stats"
+#define EXT4_MB_MAX_TO_SCAN_NAME       "max_to_scan"
+#define EXT4_MB_MIN_TO_SCAN_NAME       "min_to_scan"
+#define EXT4_MB_ORDER2_REQ             "order2_req"
+#define EXT4_MB_STREAM_REQ             "stream_req"
+#define EXT4_MB_GROUP_PREALLOC         "group_prealloc"
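+
+/*
+ * exposed under /proc/fs/ext4/<dev>/, e.g. (illustrative):
+ *     echo 256 > /proc/fs/ext4/<dev>/max_to_scan
+ * bounds how many free extents the allocator scans per request
+ */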
+
+#define MB_PROC_VALUE_READ(name)                               \
+static int ext4_mb_read_##name(char *page, char **start,       \
+               off_t off, int count, int *eof, void *data)     \
+{                                                              \
+       struct ext4_sb_info *sbi = data;                        \
+       int len;                                                \
+       *eof = 1;                                               \
+       if (off != 0)                                           \
+               return 0;                                       \
+       len = sprintf(page, "%ld\n", sbi->s_mb_##name);         \
+       *start = page;                                          \
+       return len;                                             \
+}
+
+#define MB_PROC_VALUE_WRITE(name)                              \
+static int ext4_mb_write_##name(struct file *file,             \
+               const char __user *buf, unsigned long cnt, void *data)  \
+{                                                              \
+       struct ext4_sb_info *sbi = data;                        \
+       char str[32];                                           \
+       long value;                                             \
+       if (cnt >= sizeof(str))                                 \
+               return -EINVAL;                                 \
+       if (copy_from_user(str, buf, cnt))                      \
+               return -EFAULT;                                 \
+       str[cnt] = '\0';        /* NUL-terminate user input */  \
+       value = simple_strtol(str, NULL, 0);                    \
+       if (value <= 0)                                         \
+               return -ERANGE;                                 \
+       sbi->s_mb_##name = value;                               \
+       return cnt;                                             \
+}
+
+MB_PROC_VALUE_READ(stats);
+MB_PROC_VALUE_WRITE(stats);
+MB_PROC_VALUE_READ(max_to_scan);
+MB_PROC_VALUE_WRITE(max_to_scan);
+MB_PROC_VALUE_READ(min_to_scan);
+MB_PROC_VALUE_WRITE(min_to_scan);
+MB_PROC_VALUE_READ(order2_reqs);
+MB_PROC_VALUE_WRITE(order2_reqs);
+MB_PROC_VALUE_READ(stream_request);
+MB_PROC_VALUE_WRITE(stream_request);
+MB_PROC_VALUE_READ(group_prealloc);
+MB_PROC_VALUE_WRITE(group_prealloc);
+
+#define        MB_PROC_HANDLER(name, var)                                      \
+do {                                                                   \
+       proc = create_proc_entry(name, mode, sbi->s_mb_proc);           \
+       if (proc == NULL) {                                             \
+               printk(KERN_ERR "EXT4-fs: can't create %s\n", name);    \
+               goto err_out;                                           \
+       }                                                               \
+       proc->data = sbi;                                               \
+       proc->read_proc  = ext4_mb_read_##var ;                         \
+       proc->write_proc = ext4_mb_write_##var;                         \
+} while (0)
+
+static int ext4_mb_init_per_dev_proc(struct super_block *sb)
+{
+       mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct proc_dir_entry *proc;
+       char devname[64];
+
+       snprintf(devname, sizeof(devname) - 1, "%s",
+               bdevname(sb->s_bdev, devname));
+       sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
+       if (sbi->s_mb_proc == NULL)
+               return -ENOMEM;
+
+       MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
+       MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
+       MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
+       MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
+       MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
+       MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
+
+       return 0;
+
+err_out:
+       printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
+       remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
+       remove_proc_entry(devname, proc_root_ext4);
+       sbi->s_mb_proc = NULL;
+
+       return -ENOMEM;
+}
+
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       char devname[64];
+
+       if (sbi->s_mb_proc == NULL)
+               return -EINVAL;
+
+       snprintf(devname, sizeof(devname) - 1, "%s",
+               bdevname(sb->s_bdev, devname));
+       remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
+       remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
+       remove_proc_entry(devname, proc_root_ext4);
+
+       return 0;
+}
+
+int __init init_ext4_mballoc(void)
+{
+       ext4_pspace_cachep =
+               kmem_cache_create("ext4_prealloc_space",
+                                    sizeof(struct ext4_prealloc_space),
+                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       if (ext4_pspace_cachep == NULL)
+               return -ENOMEM;
+
+#ifdef CONFIG_PROC_FS
+       proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs);
+       if (proc_root_ext4 == NULL)
+               printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT);
+#endif
+
+       return 0;
+}
+
+void exit_ext4_mballoc(void)
+{
+       /* XXX: synchronize_rcu(); */
+       kmem_cache_destroy(ext4_pspace_cachep);
+#ifdef CONFIG_PROC_FS
+       remove_proc_entry(EXT4_ROOT, proc_root_fs);
+#endif
+}
+
+/*
+ * Check quota and mark the chosen space (ac->ac_b_ex) non-free in the bitmaps
+ * Returns 0 on success, or an error code
+ */
+static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+                               handle_t *handle)
+{
+       struct buffer_head *bitmap_bh = NULL;
+       struct ext4_super_block *es;
+       struct ext4_group_desc *gdp;
+       struct buffer_head *gdp_bh;
+       struct ext4_sb_info *sbi;
+       struct super_block *sb;
+       ext4_fsblk_t block;
+       int err;
+
+       BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+       BUG_ON(ac->ac_b_ex.fe_len <= 0);
+
+       sb = ac->ac_sb;
+       sbi = EXT4_SB(sb);
+       es = sbi->s_es;
+
+       ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+                       gdp->bg_free_blocks_count);
+
+       err = -EIO;
+       bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+       if (!bitmap_bh)
+               goto out_err;
+
+       err = ext4_journal_get_write_access(handle, bitmap_bh);
+       if (err)
+               goto out_err;
+
+       err = -EIO;
+       gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+       if (!gdp)
+               goto out_err;
+
+       err = ext4_journal_get_write_access(handle, gdp_bh);
+       if (err)
+               goto out_err;
+
+       block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
+               + ac->ac_b_ex.fe_start
+               + le32_to_cpu(es->s_first_data_block);
+
+       if (block == ext4_block_bitmap(sb, gdp) ||
+                       block == ext4_inode_bitmap(sb, gdp) ||
+                       in_range(block, ext4_inode_table(sb, gdp),
+                               EXT4_SB(sb)->s_itb_per_group)) {
+
+               ext4_error(sb, __FUNCTION__,
+                          "Allocating block in system zone - block = %llu",
+                          block);
+       }
+#ifdef AGGRESSIVE_CHECK
+       {
+               int i;
+               for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
+                       BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
+                                               bitmap_bh->b_data));
+               }
+       }
+#endif
+       mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
+                               ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
+
+       spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+       if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+               gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+               gdp->bg_free_blocks_count =
+                       cpu_to_le16(ext4_free_blocks_after_init(sb,
+                                               ac->ac_b_ex.fe_group,
+                                               gdp));
+       }
+       gdp->bg_free_blocks_count =
+               cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
+                               - ac->ac_b_ex.fe_len);
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+       spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+       percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+       err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+       if (err)
+               goto out_err;
+       err = ext4_journal_dirty_metadata(handle, gdp_bh);
+
+out_err:
+       sb->s_dirt = 1;
+       brelse(bitmap_bh);
+       return err;
+}
+
+/*
+ * here we normalize the request for a locality group
+ * Group requests are normalized to the s_stripe size if it was set via the
+ * mount option. Otherwise we use s_mb_group_prealloc, which can be tuned via
+ * /proc/fs/ext4/<partition>/group_prealloc
+ *
+ * XXX: should we try to preallocate more than the group has now?
+ */
+static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
+{
+       struct super_block *sb = ac->ac_sb;
+       struct ext4_locality_group *lg = ac->ac_lg;
+
+       BUG_ON(lg == NULL);
+       if (EXT4_SB(sb)->s_stripe)
+               ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
+       else
+               ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
+       mb_debug("#%u: goal %lu blocks for locality group\n",
+               current->pid, ac->ac_g_ex.fe_len);
+}
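+
+/* e.g. (illustrative) with "-o stripe=16" every locality-group request is
+ * normalized to a 16-block goal, keeping group preallocation RAID-aligned */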
+
+/*
+ * Normalization means making request better in terms of
+ * size and alignment
+ */
+static void ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+                               struct ext4_allocation_request *ar)
+{
+       int bsbits, max;
+       ext4_lblk_t end;
+       struct list_head *cur;
+       loff_t size, orig_size, start_off;
+       ext4_lblk_t start, orig_start;
+       struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+
+       /* only normalize data requests; metadata requests
+          do not need preallocation */
+       if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+               return;
+
+       /* sometimes the caller may want exactly the requested blocks */
+       if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+               return;
+
+       /* caller may indicate that preallocation isn't
+        * required (it's a tail, for example) */
+       if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
+               return;
+
+       if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
+               ext4_mb_normalize_group_request(ac);
+               return;
+       }
+
+       bsbits = ac->ac_sb->s_blocksize_bits;
+
+       /* first, let's learn the actual file size
+        * assuming the current request is allocated */
+       size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+       size = size << bsbits;
+       if (size < i_size_read(ac->ac_inode))
+               size = i_size_read(ac->ac_inode);
+
+       /* max available blocks in a free group: everything minus the
+        * block bitmap, the inode bitmap and the inode table */
+       max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -
+                               EXT4_SB(ac->ac_sb)->s_itb_per_group;
+
+#define NRL_CHECK_SIZE(req, size, max, bits)   \
+               (req <= (size) || max <= ((size) >> bits))
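+       /* i.e. pick this size class if the predicted size fits in it, or if
+        * even this class (in blocks) would no longer fit in a free group */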
+
+       /* first, try to predict filesize */
+       /* XXX: should this table be tunable? */
+       start_off = 0;
+       if (size <= 16 * 1024) {
+               size = 16 * 1024;
+       } else if (size <= 32 * 1024) {
+               size = 32 * 1024;
+       } else if (size <= 64 * 1024) {
+               size = 64 * 1024;
+       } else if (size <= 128 * 1024) {
+               size = 128 * 1024;
+       } else if (size <= 256 * 1024) {
+               size = 256 * 1024;
+       } else if (size <= 512 * 1024) {
+               size = 512 * 1024;
+       } else if (size <= 1024 * 1024) {
+               size = 1024 * 1024;
+       } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
+               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+                                               (20 - bsbits)) << 20;
+               size = 1024 * 1024;
+       } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
+               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+                                                       (22 - bsbits)) << 22;
+               size = 4 * 1024 * 1024;
+       } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+                                       (8<<20)>>bsbits, max, bsbits)) {
+               start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+                                                       (23 - bsbits)) << 23;
+               size = 8 * 1024 * 1024;
+       } else {
+               start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+               size      = ac->ac_o_ex.fe_len << bsbits;
+       }
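+       /* e.g. a request that grows the file to ~100KB falls into the
+        * 128KB class above: start_off stays 0 and, with 4KB blocks, the
+        * goal below becomes 32 blocks */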
+       orig_size = size = size >> bsbits;
+       orig_start = start = start_off >> bsbits;
+
+       /* don't cover already allocated blocks in selected range */
+       if (ar->pleft && start <= ar->lleft) {
+               size -= ar->lleft + 1 - start;
+               start = ar->lleft + 1;
+       }
+       if (ar->pright && start + size - 1 >= ar->lright)
+               size -= start + size - ar->lright;
+
+       end = start + size;
+
+       /* check we don't cross already preallocated blocks */
+       rcu_read_lock();
+       list_for_each_rcu(cur, &ei->i_prealloc_list) {
+               struct ext4_prealloc_space *pa;
+               unsigned long pa_end;
+
+               pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+
+               if (pa->pa_deleted)
+                       continue;
+               spin_lock(&pa->pa_lock);
+               if (pa->pa_deleted) {
+                       spin_unlock(&pa->pa_lock);
+                       continue;
+               }
+
+               pa_end = pa->pa_lstart + pa->pa_len;
+
+               /* PA must not overlap original request */
+               BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
+                       ac->ac_o_ex.fe_logical < pa->pa_lstart));
+
+               /* skip PAs that the normalized request doesn't overlap */
+               if (pa->pa_lstart >= end) {
+                       spin_unlock(&pa->pa_lock);
+                       continue;
+               }
+               if (pa_end <= start) {
+                       spin_unlock(&pa->pa_lock);
+                       continue;
+               }
+               BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+
+               if (pa_end <= ac->ac_o_ex.fe_logical) {
+                       BUG_ON(pa_end < start);
+                       start = pa_end;
+               }
+
+               if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+                       BUG_ON(pa->pa_lstart > end);
+                       end = pa->pa_lstart;
+               }
+               spin_unlock(&pa->pa_lock);
+       }
+       rcu_read_unlock();
+       size = end - start;
+
+       /* XXX: extra loop to check we really don't overlap preallocations */
+       rcu_read_lock();
+       list_for_each_rcu(cur, &ei->i_prealloc_list) {
+               struct ext4_prealloc_space *pa;
+               unsigned long pa_end;
+               pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+               spin_lock(&pa->pa_lock);
+               if (pa->pa_deleted == 0) {
+                       pa_end = pa->pa_lstart + pa->pa_len;
+                       BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
+               }
+               spin_unlock(&pa->pa_lock);
+       }
+       rcu_read_unlock();
+
+       /* the normalized window must still contain the original block;
+        * with "&&" the two conditions can never hold together, hence "||" */
+       if (start + size <= ac->ac_o_ex.fe_logical ||
+                       start > ac->ac_o_ex.fe_logical) {
+               printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
+                       (unsigned long) start, (unsigned long) size,
+                       (unsigned long) ac->ac_o_ex.fe_logical);
+       }
+       BUG_ON(start + size <= ac->ac_o_ex.fe_logical ||
+                       start > ac->ac_o_ex.fe_logical);
+       BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+
+       /* now prepare goal request */
+
+       /* XXX: is it better to align blocks with respect to logical
+        * placement, or to satisfy a big request as is */
+       ac->ac_g_ex.fe_logical = start;
+       ac->ac_g_ex.fe_len = size;
+
+       /* define goal start in order to merge */
+       if (ar->pright && (ar->lright == (start + size))) {
+               /* merge to the right */
+               ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
+                                               &ac->ac_g_ex.fe_group,
+                                               &ac->ac_g_ex.fe_start);
+               ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+       }
+       if (ar->pleft && (ar->lleft + 1 == start)) {
+               /* merge to the left */
+               ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
+                                               &ac->ac_g_ex.fe_group,
+                                               &ac->ac_g_ex.fe_start);
+               ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+       }
+
+       mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+               (unsigned) orig_size, (unsigned) start);
+}
+
+static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+       if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
+               atomic_inc(&sbi->s_bal_reqs);
+               atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
+               if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+                       atomic_inc(&sbi->s_bal_success);
+               atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+               if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
+                               ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
+                       atomic_inc(&sbi->s_bal_goals);
+               if (ac->ac_found > sbi->s_mb_max_to_scan)
+                       atomic_inc(&sbi->s_bal_breaks);
+       }
+
+       ext4_mb_store_history(ac);
+}
+
+/*
+ * use blocks preallocated to inode
+ */
+static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
+                               struct ext4_prealloc_space *pa)
+{
+       ext4_fsblk_t start;
+       ext4_fsblk_t end;
+       int len;
+
+       /* found preallocated blocks, use them */
+       start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+       end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
+       len = end - start;
+       ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
+                                       &ac->ac_b_ex.fe_start);
+       ac->ac_b_ex.fe_len = len;
+       ac->ac_status = AC_STATUS_FOUND;
+       ac->ac_pa = pa;
+
+       BUG_ON(start < pa->pa_pstart);
+       BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
+       BUG_ON(pa->pa_free < len);
+       pa->pa_free -= len;
+
+       mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa);
+}
+
+/*
+ * use blocks preallocated to locality group
+ */
+static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
+                               struct ext4_prealloc_space *pa)
+{
+       unsigned len = ac->ac_o_ex.fe_len;
+
+       ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
+                                       &ac->ac_b_ex.fe_group,
+                                       &ac->ac_b_ex.fe_start);
+       ac->ac_b_ex.fe_len = len;
+       ac->ac_status = AC_STATUS_FOUND;
+       ac->ac_pa = pa;
+
+       /* we don't correct pa_pstart or pa_len here to avoid a
+        * possible race when the group is being loaded concurrently;
+        * instead we correct the pa later, after blocks are marked
+        * in the on-disk bitmap -- see ext4_mb_release_context() */
+       /*
+        * FIXME!! but other CPUs can look at this particular
+        * pa and think that it has enough free blocks if we
+        * don't update pa_free here, right?
+        */
+       mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+}
+
+/*
+ * search goal blocks in preallocated space
+ */
+static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
+{
+       struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+       struct ext4_locality_group *lg;
+       struct ext4_prealloc_space *pa;
+       struct list_head *cur;
+
+       /* only data can be preallocated */
+       if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+               return 0;
+
+       /* first, try per-file preallocation */
+       rcu_read_lock();
+       list_for_each_rcu(cur, &ei->i_prealloc_list) {
+               pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+
+               /* all fields in this condition don't change,
+                * so we can skip locking for them */
+               if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
+                       ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+                       continue;
+
+               /* found preallocated blocks, use them */
+               spin_lock(&pa->pa_lock);
+               if (pa->pa_deleted == 0 && pa->pa_free) {
+                       atomic_inc(&pa->pa_count);
+                       ext4_mb_use_inode_pa(ac, pa);
+                       spin_unlock(&pa->pa_lock);
+                       ac->ac_criteria = 10;
+                       rcu_read_unlock();
+                       return 1;
+               }
+               spin_unlock(&pa->pa_lock);
+       }
+       rcu_read_unlock();
+
+       /* can we use group allocation? */
+       if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
+               return 0;
+
+       /* inode may have no locality group for some reason */
+       lg = ac->ac_lg;
+       if (lg == NULL)
+               return 0;
+
+       rcu_read_lock();
+       list_for_each_rcu(cur, &lg->lg_prealloc_list) {
+               pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list);
+               spin_lock(&pa->pa_lock);
+               if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
+                       atomic_inc(&pa->pa_count);
+                       ext4_mb_use_group_pa(ac, pa);
+                       spin_unlock(&pa->pa_lock);
+                       ac->ac_criteria = 20;
+                       rcu_read_unlock();
+                       return 1;
+               }
+               spin_unlock(&pa->pa_lock);
+       }
+       rcu_read_unlock();
+
+       return 0;
+}
+
+/*
+ * the function goes through all preallocations in this group and marks them
+ * used in the in-core bitmap. the buddy must be generated from this bitmap.
+ * Needs to be called with the ext4 group lock held (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+                                       ext4_group_t group)
+{
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       struct ext4_prealloc_space *pa;
+       struct list_head *cur;
+       ext4_group_t groupnr;
+       ext4_grpblk_t start;
+       int preallocated = 0;
+       int count = 0;
+       int len;
+
+       /* all forms of preallocation discard first load the group,
+        * so the only competing code is preallocation use.
+        * we don't need any locking here.
+        * notice that we do NOT ignore preallocations with pa_deleted set;
+        * otherwise we could leave used blocks available for
+        * allocation in the buddy when a concurrent ext4_mb_put_pa()
+        * is dropping the preallocation
+        */
+       list_for_each(cur, &grp->bb_prealloc_list) {
+               pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+               spin_lock(&pa->pa_lock);
+               ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+                                            &groupnr, &start);
+               len = pa->pa_len;
+               spin_unlock(&pa->pa_lock);
+               if (unlikely(len == 0))
+                       continue;
+               BUG_ON(groupnr != group);
+               mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+                                               bitmap, start, len);
+               preallocated += len;
+               count++;
+       }
+       mb_debug("prellocated %u for group %lu\n", preallocated, group);
+}
+
+static void ext4_mb_pa_callback(struct rcu_head *head)
+{
+       struct ext4_prealloc_space *pa;
+       pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+       kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
+/*
+ * drops a reference to preallocated space descriptor
+ * if this was the last reference and the space is consumed
+ */
+static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
+                       struct super_block *sb, struct ext4_prealloc_space *pa)
+{
+       unsigned long grp;
+
+       if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
+               return;
+
+       /* in this short window concurrent discard can set pa_deleted */
+       spin_lock(&pa->pa_lock);
+       if (pa->pa_deleted == 1) {
+               spin_unlock(&pa->pa_lock);
+               return;
+       }
+
+       pa->pa_deleted = 1;
+       spin_unlock(&pa->pa_lock);
+
+       /* -1 is to protect from crossing allocation group */
+       ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
+
+       /*
+        * possible race:
+        *
+        *  P1 (buddy init)                     P2 (regular allocation)
+        *                                      find block B in PA
+        *  copy on-disk bitmap to buddy
+        *                                      mark B in on-disk bitmap
+        *                                      drop PA from group
+        *  mark all PAs in buddy
+        *
+        * thus, P1 initializes buddy with B available. to prevent this
+        * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
+        * against that pair
+        */
+       ext4_lock_group(sb, grp);
+       list_del(&pa->pa_group_list);
+       ext4_unlock_group(sb, grp);
+
+       spin_lock(pa->pa_obj_lock);
+       list_del_rcu(&pa->pa_inode_list);
+       spin_unlock(pa->pa_obj_lock);
+
+       call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+}
+
+/*
+ * creates new preallocated space for given inode
+ */
+static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
+{
+       struct super_block *sb = ac->ac_sb;
+       struct ext4_prealloc_space *pa;
+       struct ext4_group_info *grp;
+       struct ext4_inode_info *ei;
+
+       /* preallocate only when the found space is larger than requested */
+       BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+       BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+       BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+
+       pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+       if (pa == NULL)
+               return -ENOMEM;
+
+       if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
+               int winl;
+               int wins;
+               int win;
+               int offs;
+
+               /* we can't allocate as much as the normalizer wants,
+                * so the found space must get a proper lstart
+                * to cover the original request */
+               BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
+               BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
+
+               /* we're limited by the original request in that the
+                * logical block must be covered anyway;
+                * winl is the window we can move our chunk within */
+               winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+
+               /* also, we should cover whole original request */
+               wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+
+               /* the smallest one defines real window */
+               win = min(winl, wins);
+
+               offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+               if (offs && offs < win)
+                       win = offs;
+
+               ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+               BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
+               BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+       }
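+       /* illustrative: orig request 1 block at logical 10, goal 16 blocks
+        * at logical 0, only 8 blocks found: winl = 10, wins = 7,
+        * offs = 10 % 8 = 2, so win = 2 and fe_logical becomes 8 -- the PA
+        * covers logical blocks 8..15 and still contains block 10 */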
+
+       /* preallocation can change ac_b_ex, thus we store actually
+        * allocated blocks for history */
+       ac->ac_f_ex = ac->ac_b_ex;
+
+       pa->pa_lstart = ac->ac_b_ex.fe_logical;
+       pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+       pa->pa_len = ac->ac_b_ex.fe_len;
+       pa->pa_free = pa->pa_len;
+       atomic_set(&pa->pa_count, 1);
+       spin_lock_init(&pa->pa_lock);
+       pa->pa_deleted = 0;
+       pa->pa_linear = 0;
+
+       mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+                       pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+
+       ext4_mb_use_inode_pa(ac, pa);
+       atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+
+       ei = EXT4_I(ac->ac_inode);
+       grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+
+       pa->pa_obj_lock = &ei->i_prealloc_lock;
+       pa->pa_inode = ac->ac_inode;
+
+       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+       list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+       spin_lock(pa->pa_obj_lock);
+       list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
+       spin_unlock(pa->pa_obj_lock);
+
+       return 0;
+}
+
+/*
+ * creates new preallocated space for the locality group the inode belongs to
+ */
+static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
+{
+       struct super_block *sb = ac->ac_sb;
+       struct ext4_locality_group *lg;
+       struct ext4_prealloc_space *pa;
+       struct ext4_group_info *grp;
+
+       /* preallocate only when the found space is larger than requested */
+       BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+       BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+       BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+
+       BUG_ON(ext4_pspace_cachep == NULL);
+       pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+       if (pa == NULL)
+               return -ENOMEM;
+
+       /* preallocation can change ac_b_ex, thus we store actually
+        * allocated blocks for history */
+       ac->ac_f_ex = ac->ac_b_ex;
+
+       pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+       pa->pa_lstart = pa->pa_pstart;
+       pa->pa_len = ac->ac_b_ex.fe_len;
+       pa->pa_free = pa->pa_len;
+       atomic_set(&pa->pa_count, 1);
+       spin_lock_init(&pa->pa_lock);
+       pa->pa_deleted = 0;
+       pa->pa_linear = 1;
+
+       mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+                       pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+
+       ext4_mb_use_group_pa(ac, pa);
+       atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+
+       grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+       lg = ac->ac_lg;
+       BUG_ON(lg == NULL);
+
+       pa->pa_obj_lock = &lg->lg_prealloc_lock;
+       pa->pa_inode = NULL;
+
+       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+       list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+       spin_lock(pa->pa_obj_lock);
+       list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
+       spin_unlock(pa->pa_obj_lock);
+
+       return 0;
+}
+
+static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+{
+       int err;
+
+       if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+               err = ext4_mb_new_group_pa(ac);
+       else
+               err = ext4_mb_new_inode_pa(ac);
+       return err;
+}
+
+/*
+ * finds all unused blocks in on-disk bitmap, frees them in
+ * in-core bitmap and buddy.
+ * @pa must be unlinked from inode and group lists, so that
+ * nobody else can find/use it.
+ * the caller MUST hold group/inode locks.
+ * TODO: optimize the case when there are no in-core structures yet
+ */
+static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
+                               struct buffer_head *bitmap_bh,
+                               struct ext4_prealloc_space *pa)
+{
+       struct ext4_allocation_context ac;
+       struct super_block *sb = e4b->bd_sb;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       unsigned long end;
+       unsigned long next;
+       ext4_group_t group;
+       ext4_grpblk_t bit;
+       sector_t start;
+       int err = 0;
+       int free = 0;
+
+       BUG_ON(pa->pa_deleted == 0);
+       ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+       BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+       end = bit + pa->pa_len;
+
+       ac.ac_sb = sb;
+       ac.ac_inode = pa->pa_inode;
+       ac.ac_op = EXT4_MB_HISTORY_DISCARD;
+
+       while (bit < end) {
+               bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
+               if (bit >= end)
+                       break;
+               next = ext4_find_next_bit(bitmap_bh->b_data, end, bit);
+               if (next > end)
+                       next = end;
+               start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
+                               le32_to_cpu(sbi->s_es->s_first_data_block);
+               mb_debug("    free preallocated %u/%u in group %u\n",
+                               (unsigned) start, (unsigned) next - bit,
+                               (unsigned) group);
+               free += next - bit;
+
+               ac.ac_b_ex.fe_group = group;
+               ac.ac_b_ex.fe_start = bit;
+               ac.ac_b_ex.fe_len = next - bit;
+               ac.ac_b_ex.fe_logical = 0;
+               ext4_mb_store_history(&ac);
+
+               mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
+               bit = next + 1;
+       }
+       if (free != pa->pa_free) {
+               printk(KERN_ERR "pa %p: logic %lu, phys. %lu, len %lu\n",
+                       pa, (unsigned long) pa->pa_lstart,
+                       (unsigned long) pa->pa_pstart,
+                       (unsigned long) pa->pa_len);
+               printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free);
+       }
+       BUG_ON(free != pa->pa_free);
+       atomic_add(free, &sbi->s_mb_discarded);
+
+       return err;
+}
+
+static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
+                               struct ext4_prealloc_space *pa)
+{
+       struct ext4_allocation_context ac;
+       struct super_block *sb = e4b->bd_sb;
+       ext4_group_t group;
+       ext4_grpblk_t bit;
+
+       ac.ac_op = EXT4_MB_HISTORY_DISCARD;
+
+       BUG_ON(pa->pa_deleted == 0);
+       ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+       BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+       mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
+       atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
+
+       ac.ac_sb = sb;
+       ac.ac_inode = NULL;
+       ac.ac_b_ex.fe_group = group;
+       ac.ac_b_ex.fe_start = bit;
+       ac.ac_b_ex.fe_len = pa->pa_len;
+       ac.ac_b_ex.fe_logical = 0;
+       ext4_mb_store_history(&ac);
+
+       return 0;
+}
+
+/*
+ * releases all preallocations in given group
+ *
+ * first, we need to decide discard policy:
+ * - when do we discard
+ *   1) ENOSPC
+ * - how many do we discard
+ *   1) how many requested
+ */
+static int ext4_mb_discard_group_preallocations(struct super_block *sb,
+                                       ext4_group_t group, int needed)
+{
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       struct buffer_head *bitmap_bh = NULL;
+       struct ext4_prealloc_space *pa, *tmp;
+       struct list_head list;
+       struct ext4_buddy e4b;
+       int err;
+       int busy = 0;
+       int free = 0;
+
+       mb_debug("discard preallocation for group %lu\n", group);
+
+       if (list_empty(&grp->bb_prealloc_list))
+               return 0;
+
+       bitmap_bh = read_block_bitmap(sb, group);
+       /* error handling here: the buddy is not loaded yet at this point,
+        * so there is no descriptor to release on failure */
+       BUG_ON(bitmap_bh == NULL);
+
+       err = ext4_mb_load_buddy(sb, group, &e4b);
+       BUG_ON(err != 0); /* error handling here */
+
+       if (needed == 0)
+               needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+
+       INIT_LIST_HEAD(&list);
+
+repeat:
+       ext4_lock_group(sb, group);
+       list_for_each_entry_safe(pa, tmp,
+                               &grp->bb_prealloc_list, pa_group_list) {
+               spin_lock(&pa->pa_lock);
+               if (atomic_read(&pa->pa_count)) {
+                       spin_unlock(&pa->pa_lock);
+                       busy = 1;
+                       continue;
+               }
+               if (pa->pa_deleted) {
+                       spin_unlock(&pa->pa_lock);
+                       continue;
+               }
+
+               /* seems this one can be freed ... */
+               pa->pa_deleted = 1;
+
+               /* we can trust pa_free ... */
+               free += pa->pa_free;
+
+               spin_unlock(&pa->pa_lock);
+
+               list_del(&pa->pa_group_list);
+               list_add(&pa->u.pa_tmp_list, &list);
+       }
+
+       /* if we still need more blocks and some PAs were used, try again */
+       if (free < needed && busy) {
+               busy = 0;
+               ext4_unlock_group(sb, group);
+               /*
+                * Yield the CPU here so that we don't get soft lockup
+                * in non preempt case.
+                */
+               yield();
+               goto repeat;
+       }
+
+       /* found anything to free? */
+       if (list_empty(&list)) {
+               BUG_ON(free != 0);
+               goto out;
+       }
+
+       /* now free all selected PAs */
+       list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+
+               /* remove from object (inode or locality group) */
+               spin_lock(pa->pa_obj_lock);
+               list_del_rcu(&pa->pa_inode_list);
+               spin_unlock(pa->pa_obj_lock);
+
+               if (pa->pa_linear)
+                       ext4_mb_release_group_pa(&e4b, pa);
+               else
+                       ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+
+               list_del(&pa->u.pa_tmp_list);
+               call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+       }
+
+out:
+       ext4_unlock_group(sb, group);
+       ext4_mb_release_desc(&e4b);
+       put_bh(bitmap_bh);
+       return free;
+}
+
+/*
+ * releases all non-used preallocated blocks for given inode
+ *
+ * It's important to discard preallocations under i_data_sem
+ * We don't want another block to be served from the prealloc
+ * space when we are discarding the inode prealloc space.
+ *
+ * FIXME!! Make sure it is valid at all the call sites
+ */
+void ext4_mb_discard_inode_preallocations(struct inode *inode)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct super_block *sb = inode->i_sb;
+       struct buffer_head *bitmap_bh = NULL;
+       struct ext4_prealloc_space *pa, *tmp;
+       ext4_group_t group = 0;
+       struct list_head list;
+       struct ext4_buddy e4b;
+       int err;
+
+       if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
+               /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
+               return;
+       }
+
+       mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+
+       INIT_LIST_HEAD(&list);
+
+repeat:
+       /* first, collect all pa's in the inode */
+       spin_lock(&ei->i_prealloc_lock);
+       while (!list_empty(&ei->i_prealloc_list)) {
+               pa = list_entry(ei->i_prealloc_list.next,
+                               struct ext4_prealloc_space, pa_inode_list);
+               BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+               spin_lock(&pa->pa_lock);
+               if (atomic_read(&pa->pa_count)) {
+                       /* this shouldn't happen often - nobody should
+                        * use preallocation while we're discarding it */
+                       spin_unlock(&pa->pa_lock);
+                       spin_unlock(&ei->i_prealloc_lock);
+                       printk(KERN_ERR "uh-oh! used pa while discarding\n");
+                       WARN_ON(1);
+                       schedule_timeout_uninterruptible(HZ);
+                       goto repeat;
+               }
+               if (pa->pa_deleted == 0) {
+                       pa->pa_deleted = 1;
+                       spin_unlock(&pa->pa_lock);
+                       list_del_rcu(&pa->pa_inode_list);
+                       list_add(&pa->u.pa_tmp_list, &list);
+                       continue;
+               }
+
+               /* someone is deleting pa right now */
+               spin_unlock(&pa->pa_lock);
+               spin_unlock(&ei->i_prealloc_lock);
+
+               /* we have to wait here because pa_deleted
+                * doesn't mean pa is already unlinked from
+                * the list. as we might be called from
+                * ->clear_inode() the inode will get freed,
+                * and a concurrent thread unlinking the pa
+                * from the inode's list could access already
+                * freed memory -- bad, bad, bad */
+
+               /* XXX: if this happens too often, we can
+                * add a flag to force wait only in case
+                * of ->clear_inode(), but not in case of
+                * regular truncate */
+               schedule_timeout_uninterruptible(HZ);
+               goto repeat;
+       }
+       spin_unlock(&ei->i_prealloc_lock);
+
+       list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+               BUG_ON(pa->pa_linear != 0);
+               ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+
+               err = ext4_mb_load_buddy(sb, group, &e4b);
+               BUG_ON(err != 0); /* error handling here */
+
+               bitmap_bh = read_block_bitmap(sb, group);
+               if (bitmap_bh == NULL) {
+                       /* error handling here */
+                       ext4_mb_release_desc(&e4b);
+                       BUG_ON(bitmap_bh == NULL);
+               }
+
+               ext4_lock_group(sb, group);
+               list_del(&pa->pa_group_list);
+               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+               ext4_unlock_group(sb, group);
+
+               ext4_mb_release_desc(&e4b);
+               put_bh(bitmap_bh);
+
+               list_del(&pa->u.pa_tmp_list);
+               call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+       }
+}
+
+/*
+ * finds all preallocated spaces and returns blocks being freed to them;
+ * if a preallocated space becomes full (no block from the space is used)
+ * then the function frees the space in the buddy
+ * XXX: at the moment, truncate (which is the only way to free blocks)
+ * discards all preallocations
+ */
+static void ext4_mb_return_to_preallocation(struct inode *inode,
+                                       struct ext4_buddy *e4b,
+                                       sector_t block, int count)
+{
+       BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
+}
+
+#ifdef MB_DEBUG
+static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+       struct super_block *sb = ac->ac_sb;
+       ext4_group_t i;
+
+       printk(KERN_ERR "EXT4-fs: Can't allocate:"
+                       " Allocation context details:\n");
+       printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
+                       ac->ac_status, ac->ac_flags);
+       printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
+                       "best %lu/%lu/%lu@%lu cr %d\n",
+                       (unsigned long)ac->ac_o_ex.fe_group,
+                       (unsigned long)ac->ac_o_ex.fe_start,
+                       (unsigned long)ac->ac_o_ex.fe_len,
+                       (unsigned long)ac->ac_o_ex.fe_logical,
+                       (unsigned long)ac->ac_g_ex.fe_group,
+                       (unsigned long)ac->ac_g_ex.fe_start,
+                       (unsigned long)ac->ac_g_ex.fe_len,
+                       (unsigned long)ac->ac_g_ex.fe_logical,
+                       (unsigned long)ac->ac_b_ex.fe_group,
+                       (unsigned long)ac->ac_b_ex.fe_start,
+                       (unsigned long)ac->ac_b_ex.fe_len,
+                       (unsigned long)ac->ac_b_ex.fe_logical,
+                       (int)ac->ac_criteria);
+       printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
+               ac->ac_found);
+       printk(KERN_ERR "EXT4-fs: groups: \n");
+       for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+               struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+               struct ext4_prealloc_space *pa;
+               ext4_grpblk_t start;
+               struct list_head *cur;
+               ext4_lock_group(sb, i);
+               list_for_each(cur, &grp->bb_prealloc_list) {
+                       pa = list_entry(cur, struct ext4_prealloc_space,
+                                       pa_group_list);
+                       spin_lock(&pa->pa_lock);
+                       ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+                                                    NULL, &start);
+                       spin_unlock(&pa->pa_lock);
+                       printk(KERN_ERR "PA:%lu:%d:%u \n", i,
+                                                       start, pa->pa_len);
+               }
+               ext4_unlock_group(sb, i);
+
+               if (grp->bb_free == 0)
+                       continue;
+               printk(KERN_ERR "%lu: %d/%d \n",
+                      i, grp->bb_free, grp->bb_fragments);
+       }
+       printk(KERN_ERR "\n");
+}
+#else
+static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+       return;
+}
+#endif
+
+/*
+ * We use locality group preallocation for small files. The size of the
+ * file is taken to be the larger of the current size and the resulting
+ * size after this allocation.
+ *
+ * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ */
+static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+       int bsbits = ac->ac_sb->s_blocksize_bits;
+       loff_t size, isize;
+
+       if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+               return;
+
+       size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+       isize = i_size_read(ac->ac_inode) >> bsbits;
+       size = max(size, isize);
+
+       /* don't use group allocation for large files */
+       if (size >= sbi->s_mb_stream_request)
+               return;
+
+       if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+               return;
+
+       BUG_ON(ac->ac_lg != NULL);
+       /*
+        * locality group prealloc space is per-CPU. The reason for having
+        * per-CPU locality groups is to reduce contention between block
+        * requests from multiple CPUs.
+        */
+       ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
+       put_cpu();
+
+       /* we're going to use group allocation */
+       ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
+
+       /* serialize all allocations in the group */
+       mutex_lock(&ac->ac_lg->lg_mutex);
+}
+
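+/*
+ * Fill the allocation context from the caller's request: clamp the
+ * length to fit within one group, validate the goal block, and record
+ * the original and goal extents before the search starts.
+ */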
+static int ext4_mb_initialize_context(struct ext4_allocation_context *ac,
+                               struct ext4_allocation_request *ar)
+{
+       struct super_block *sb = ar->inode->i_sb;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_super_block *es = sbi->s_es;
+       ext4_group_t group;
+       unsigned long len;
+       ext4_fsblk_t goal;
+       ext4_grpblk_t block;
+
+       /* we can't allocate > group size */
+       len = ar->len;
+
+       /* just a dirty hack to filter too big requests  */
+       if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
+               len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+
+       /* start searching from the goal */
+       goal = ar->goal;
+       if (goal < le32_to_cpu(es->s_first_data_block) ||
+                       goal >= ext4_blocks_count(es))
+               goal = le32_to_cpu(es->s_first_data_block);
+       ext4_get_group_no_and_offset(sb, goal, &group, &block);
+
+       /* set up allocation goals */
+       ac->ac_b_ex.fe_logical = ar->logical;
+       ac->ac_b_ex.fe_group = 0;
+       ac->ac_b_ex.fe_start = 0;
+       ac->ac_b_ex.fe_len = 0;
+       ac->ac_status = AC_STATUS_CONTINUE;
+       ac->ac_groups_scanned = 0;
+       ac->ac_ex_scanned = 0;
+       ac->ac_found = 0;
+       ac->ac_sb = sb;
+       ac->ac_inode = ar->inode;
+       ac->ac_o_ex.fe_logical = ar->logical;
+       ac->ac_o_ex.fe_group = group;
+       ac->ac_o_ex.fe_start = block;
+       ac->ac_o_ex.fe_len = len;
+       ac->ac_g_ex.fe_logical = ar->logical;
+       ac->ac_g_ex.fe_group = group;
+       ac->ac_g_ex.fe_start = block;
+       ac->ac_g_ex.fe_len = len;
+       ac->ac_f_ex.fe_len = 0;
+       ac->ac_flags = ar->flags;
+       ac->ac_2order = 0;
+       ac->ac_criteria = 0;
+       ac->ac_pa = NULL;
+       ac->ac_bitmap_page = NULL;
+       ac->ac_buddy_page = NULL;
+       ac->ac_lg = NULL;
+
+       /* we have to decide on the context: will we work with a file or
+        * a locality group. this is a policy, actually */
+       ext4_mb_group_or_file(ac);
+
+       mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+                       "left: %u/%u, right %u/%u to %swritable\n",
+                       (unsigned) ar->len, (unsigned) ar->logical,
+                       (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
+                       (unsigned) ar->lleft, (unsigned) ar->pleft,
+                       (unsigned) ar->lright, (unsigned) ar->pright,
+                       atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+       return 0;
+}
+
+/*
+ * release all resources we used in the allocation
+ */
+static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+{
+       if (ac->ac_pa) {
+               if (ac->ac_pa->pa_linear) {
+                       /* see comment in ext4_mb_use_group_pa() */
+                       spin_lock(&ac->ac_pa->pa_lock);
+                       ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
+                       ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
+                       ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
+                       ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
+                       spin_unlock(&ac->ac_pa->pa_lock);
+               }
+               ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
+       }
+       if (ac->ac_bitmap_page)
+               page_cache_release(ac->ac_bitmap_page);
+       if (ac->ac_buddy_page)
+               page_cache_release(ac->ac_buddy_page);
+       if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+               mutex_unlock(&ac->ac_lg->lg_mutex);
+       ext4_mb_collect_stats(ac);
+       return 0;
+}
+
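+/*
+ * Walk the block groups and discard their group preallocations until
+ * at least @needed blocks have been freed; returns the number of
+ * blocks actually freed.
+ */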
+static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
+{
+       ext4_group_t i;
+       int ret;
+       int freed = 0;
+
+       for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
+               ret = ext4_mb_discard_group_preallocations(sb, i, needed);
+               freed += ret;
+               needed -= ret;
+       }
+
+       return freed;
+}
+
+/*
+ * Main entry point into mballoc to allocate blocks
+ * it tries to use preallocation first, then falls back
+ * to usual allocation
+ */
+ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+                                struct ext4_allocation_request *ar, int *errp)
+{
+       struct ext4_allocation_context ac;
+       struct ext4_sb_info *sbi;
+       struct super_block *sb;
+       ext4_fsblk_t block = 0;
+       int freed;
+       int inquota;
+
+       sb = ar->inode->i_sb;
+       sbi = EXT4_SB(sb);
+
+       if (!test_opt(sb, MBALLOC)) {
+               block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+                                           &(ar->len), errp);
+               return block;
+       }
+
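+       /* charge quota up front; if the full length cannot be charged,
+        * shrink the request one block at a time, disabling
+        * preallocation, until quota allows it or nothing is left */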
+       while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
+               ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+               ar->len--;
+       }
+       if (ar->len == 0) {
+               *errp = -EDQUOT;
+               return 0;
+       }
+       inquota = ar->len;
+
+       ext4_mb_poll_new_transaction(sb, handle);
+
+       *errp = ext4_mb_initialize_context(&ac, ar);
+       if (*errp) {
+               ar->len = 0;
+               goto out;
+       }
+
+       ac.ac_op = EXT4_MB_HISTORY_PREALLOC;
+       if (!ext4_mb_use_preallocated(&ac)) {
+
+               ac.ac_op = EXT4_MB_HISTORY_ALLOC;
+               ext4_mb_normalize_request(&ac, ar);
+
+repeat:
+               /* allocate space in core */
+               ext4_mb_regular_allocator(&ac);
+
+               /* as we've just preallocated more space than
+                * the user originally requested, we store the allocated
+                * space in a special descriptor */
+               if (ac.ac_status == AC_STATUS_FOUND &&
+                               ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len)
+                       ext4_mb_new_preallocation(&ac);
+       }
+
+       if (likely(ac.ac_status == AC_STATUS_FOUND)) {
+               ext4_mb_mark_diskspace_used(&ac, handle);
+               *errp = 0;
+               block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex);
+               ar->len = ac.ac_b_ex.fe_len;
+       } else {
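+               /* out of space: discard existing preallocations to
+                * reclaim blocks and retry the search */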
+               freed  = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len);
+               if (freed)
+                       goto repeat;
+               *errp = -ENOSPC;
+               ac.ac_b_ex.fe_len = 0;
+               ar->len = 0;
+               ext4_mb_show_ac(&ac);
+       }
+
+       ext4_mb_release_context(&ac);
+
+out:
+       if (ar->len < inquota)
+               DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+
+       return block;
+}
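+
+/*
+ * Detect the start of a new journal transaction; when one is seen,
+ * rotate the active/closed/committed metadata-free lists so that
+ * blocks freed two transactions ago can be released to the buddy
+ * cache by ext4_mb_free_committed_blocks().
+ */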
+static void ext4_mb_poll_new_transaction(struct super_block *sb,
+                                               handle_t *handle)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (sbi->s_last_transaction == handle->h_transaction->t_tid)
+               return;
+
+       /* new transaction! time to close the last one and free blocks of
+        * the committed transaction. we know that only one transaction can
+        * be active at a time, so the previous transaction may still be
+        * being logged and the transaction before the previous one is
+        * known to be already logged. this means we may now free blocks
+        * freed in all transactions before the previous one. hope I'm
+        * clear enough ... */
+
+       spin_lock(&sbi->s_md_lock);
+       if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
+               mb_debug("new transaction %lu, old %lu\n",
+                               (unsigned long) handle->h_transaction->t_tid,
+                               (unsigned long) sbi->s_last_transaction);
+               list_splice_init(&sbi->s_closed_transaction,
+                               &sbi->s_committed_transaction);
+               list_splice_init(&sbi->s_active_transaction,
+                               &sbi->s_closed_transaction);
+               sbi->s_last_transaction = handle->h_transaction->t_tid;
+       }
+       spin_unlock(&sbi->s_md_lock);
+
+       ext4_mb_free_committed_blocks(sb);
+}
+
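+/*
+ * Stash freed metadata blocks in a per-group container tied to the
+ * current transaction; they are returned to the buddy cache only after
+ * that transaction commits, so the blocks cannot be reused before the
+ * free has reached the disk.
+ */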
+static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
+                         ext4_group_t group, ext4_grpblk_t block, int count)
+{
+       struct ext4_group_info *db = e4b->bd_info;
+       struct super_block *sb = e4b->bd_sb;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_free_metadata *md;
+       int i;
+
+       BUG_ON(e4b->bd_bitmap_page == NULL);
+       BUG_ON(e4b->bd_buddy_page == NULL);
+
+       ext4_lock_group(sb, group);
+       for (i = 0; i < count; i++) {
+               md = db->bb_md_cur;
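+               /* a container from an older transaction must not be
+                * extended; drop it and start a fresh one below */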
+               if (md && db->bb_tid != handle->h_transaction->t_tid) {
+                       db->bb_md_cur = NULL;
+                       md = NULL;
+               }
+
+               if (md == NULL) {
+                       ext4_unlock_group(sb, group);
+                       md = kmalloc(sizeof(*md), GFP_NOFS);
+                       if (md == NULL)
+                               return -ENOMEM;
+                       md->num = 0;
+                       md->group = group;
+
+                       ext4_lock_group(sb, group);
+                       if (db->bb_md_cur == NULL) {
+                               spin_lock(&sbi->s_md_lock);
+                               list_add(&md->list, &sbi->s_active_transaction);
+                               spin_unlock(&sbi->s_md_lock);
+                               /* protect buddy cache from being freed,
+                                * otherwise we'll refresh it from
+                                * on-disk bitmap and lose not-yet-available
+                                * blocks */
+                               page_cache_get(e4b->bd_buddy_page);
+                               page_cache_get(e4b->bd_bitmap_page);
+                               db->bb_md_cur = md;
+                               db->bb_tid = handle->h_transaction->t_tid;
+                               mb_debug("new md 0x%p for group %lu\n",
+                                               md, md->group);
+                       } else {
+                               kfree(md);
+                               md = db->bb_md_cur;
+                       }
+               }
+
+               BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
+               md->blocks[md->num] = block + i;
+               md->num++;
+               if (md->num == EXT4_BB_MAX_BLOCKS) {
+                       /* container is full; stop using it so that a
+                        * fresh one is allocated for further frees */
+                       db->bb_md_cur = NULL;
+               }
+       }
+       ext4_unlock_group(sb, group);
+       return 0;
+}
+
+/*
+ * Main entry point into mballoc to free blocks
+ */
+void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
+                       unsigned long block, unsigned long count,
+                       int metadata, unsigned long *freed)
+{
+       struct buffer_head *bitmap_bh = NULL;
+       struct super_block *sb = inode->i_sb;
+       struct ext4_allocation_context ac;
+       struct ext4_group_desc *gdp;
+       struct ext4_super_block *es;
+       unsigned long overflow;
+       ext4_grpblk_t bit;
+       struct buffer_head *gd_bh;
+       ext4_group_t block_group;
+       struct ext4_sb_info *sbi;
+       struct ext4_buddy e4b;
+       int err = 0;
+       int ret;
+
+       *freed = 0;
+
+       ext4_mb_poll_new_transaction(sb, handle);
+
+       sbi = EXT4_SB(sb);
+       es = EXT4_SB(sb)->s_es;
+       if (block < le32_to_cpu(es->s_first_data_block) ||
+           block + count < block ||
+           block + count > ext4_blocks_count(es)) {
+               ext4_error(sb, __FUNCTION__,
+                           "Freeing blocks not in datazone - "
+                           "block = %lu, count = %lu", block, count);
+               goto error_return;
+       }
+
+       ext4_debug("freeing block %lu\n", block);
+
+       ac.ac_op = EXT4_MB_HISTORY_FREE;
+       ac.ac_inode = inode;
+       ac.ac_sb = sb;
+
+do_more:
+       overflow = 0;
+       ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+
+       /*
+        * Check to see if we are freeing blocks across a group
+        * boundary.
+        */
+       if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+               overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+               count -= overflow;
+       }
+       bitmap_bh = read_block_bitmap(sb, block_group);
+       if (!bitmap_bh)
+               goto error_return;
+       gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
+       if (!gdp)
+               goto error_return;
+
+       if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
+           in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
+           in_range(block, ext4_inode_table(sb, gdp),
+                     EXT4_SB(sb)->s_itb_per_group) ||
+           in_range(block + count - 1, ext4_inode_table(sb, gdp),
+                     EXT4_SB(sb)->s_itb_per_group)) {
+
+               ext4_error(sb, __FUNCTION__,
+                          "Freeing blocks in system zone - "
+                          "Block = %lu, count = %lu", block, count);
+       }
+
+       BUFFER_TRACE(bitmap_bh, "getting write access");
+       err = ext4_journal_get_write_access(handle, bitmap_bh);
+       if (err)
+               goto error_return;
+
+       /*
+        * We are about to modify some metadata.  Call the journal APIs
+        * to unshare ->b_data if a currently-committing transaction is
+        * using it
+        */
+       BUFFER_TRACE(gd_bh, "get_write_access");
+       err = ext4_journal_get_write_access(handle, gd_bh);
+       if (err)
+               goto error_return;
+
+       err = ext4_mb_load_buddy(sb, block_group, &e4b);
+       if (err)
+               goto error_return;
+
+#ifdef AGGRESSIVE_CHECK
+       {
+               int i;
+               for (i = 0; i < count; i++)
+                       BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
+       }
+#endif
+       mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+                       bit, count);
+
+       /* We dirtied the bitmap block */
+       BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+       err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+
+       ac.ac_b_ex.fe_group = block_group;
+       ac.ac_b_ex.fe_start = bit;
+       ac.ac_b_ex.fe_len = count;
+       ext4_mb_store_history(&ac);
+
+       if (metadata) {
+               /* blocks being freed are metadata. these blocks shouldn't
+                * be used until this transaction is committed */
+               ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+       } else {
+               ext4_lock_group(sb, block_group);
+               err = mb_free_blocks(inode, &e4b, bit, count);
+               ext4_mb_return_to_preallocation(inode, &e4b, block, count);
+               ext4_unlock_group(sb, block_group);
+               BUG_ON(err != 0);
+       }
+
+       spin_lock(sb_bgl_lock(sbi, block_group));
+       gdp->bg_free_blocks_count =
+               cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+       spin_unlock(sb_bgl_lock(sbi, block_group));
+       percpu_counter_add(&sbi->s_freeblocks_counter, count);
+
+       ext4_mb_release_desc(&e4b);
+
+       *freed += count;
+
+       /* And the group descriptor block */
+       BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+       ret = ext4_journal_dirty_metadata(handle, gd_bh);
+       if (!err)
+               err = ret;
+
+       if (overflow && !err) {
+               block += count;
+               count = overflow;
+               put_bh(bitmap_bh);
+               goto do_more;
+       }
+       sb->s_dirt = 1;
+error_return:
+       brelse(bitmap_bh);
+       ext4_std_error(sb, err);
+       return;
+}
index ec7cb567a7dacf04d3eef7cee49384ae1a8c3ea0..3ebc2332f52ec072da353ef1466dfaaea3cd2baf 100644 (file)
@@ -236,10 +236,10 @@ static int free_dind_blocks(handle_t *handle,
        for (i = 0; i < max_entries; i++) {
                if (tmp_idata[i])
                        ext4_free_blocks(handle, inode,
-                                       le32_to_cpu(tmp_idata[i]), 1);
+                                       le32_to_cpu(tmp_idata[i]), 1, 1);
        }
        put_bh(bh);
-       ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1);
+       ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
        return 0;
 }
 
@@ -267,7 +267,7 @@ static int free_tind_blocks(handle_t *handle,
                }
        }
        put_bh(bh);
-       ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1);
+       ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
        return 0;
 }
 
@@ -278,7 +278,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode)
 
        if (ei->i_data[EXT4_IND_BLOCK])
                ext4_free_blocks(handle, inode,
-                               le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1);
+                               le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
 
        if (ei->i_data[EXT4_DIND_BLOCK]) {
                retval = free_dind_blocks(handle, inode,
@@ -365,7 +365,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
                }
        }
        put_bh(bh);
-       ext4_free_blocks(handle, inode, block, 1);
+       ext4_free_blocks(handle, inode, block, 1, 1);
        return retval;
 }
 
index 64fc7f1117344e66dabe080fc25ae0b8cfd7b6e1..3a51ffc477907f33448e7951b733e643962ab63c 100644 (file)
@@ -503,6 +503,7 @@ static void ext4_put_super (struct super_block * sb)
        struct ext4_super_block *es = sbi->s_es;
        int i;
 
+       ext4_mb_release(sb);
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
        jbd2_journal_destroy(sbi->s_journal);
@@ -569,6 +570,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_block_alloc_info = NULL;
        ei->vfs_inode.i_version = 1;
        memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+       INIT_LIST_HEAD(&ei->i_prealloc_list);
+       spin_lock_init(&ei->i_prealloc_lock);
        return &ei->vfs_inode;
 }
 
@@ -881,6 +884,7 @@ enum {
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
        Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+       Opt_mballoc, Opt_nomballoc, Opt_stripe,
 };
 
 static match_table_t tokens = {
@@ -935,6 +939,9 @@ static match_table_t tokens = {
        {Opt_extents, "extents"},
        {Opt_noextents, "noextents"},
        {Opt_i_version, "i_version"},
        {Opt_extents, "extents"},
        {Opt_noextents, "noextents"},
        {Opt_i_version, "i_version"},
+       {Opt_mballoc, "mballoc"},
+       {Opt_nomballoc, "nomballoc"},
+       {Opt_stripe, "stripe=%u"},
        {Opt_err, NULL},
        {Opt_resize, "resize"},
 };
@@ -1284,6 +1291,19 @@ clear_qf_name:
                        set_opt(sbi->s_mount_opt, I_VERSION);
                        sb->s_flags |= MS_I_VERSION;
                        break;
+               case Opt_mballoc:
+                       set_opt(sbi->s_mount_opt, MBALLOC);
+                       break;
+               case Opt_nomballoc:
+                       clear_opt(sbi->s_mount_opt, MBALLOC);
+                       break;
+               case Opt_stripe:
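+                       /* stripe is given in filesystem blocks and
+                        * must parse as a non-negative integer */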
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       if (option < 0)
+                               return 0;
+                       sbi->s_stripe = option;
+                       break;
                default:
                        printk (KERN_ERR
                                "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1742,6 +1762,34 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
        return (has_super + ext4_group_first_block_no(sb, bg));
 }
 
+/**
+ * ext4_get_stripe_size: Get the stripe size.
+ * @sbi: In memory super block info
+ *
+ * If the stripe size was specified via the mount option, use that
+ * value, provided it does not exceed blocks per group; otherwise fall
+ * back to the superblock stripe width, then to the RAID stride, under
+ * the same constraint. If none fits, return 0: the allocator needs
+ * the stripe size not to exceed blocks per group.
+ *
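+ * Illustrative example: a RAID5 array with 4 data disks and a 64KiB
+ * chunk on a 4KiB-block filesystem gives
+ * stripe = 4 * (64KiB / 4KiB) = 64 blocks.
+ *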
+ */
+static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
+{
+       unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
+       unsigned long stripe_width =
+                       le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+
+       if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
+               return sbi->s_stripe;
+
+       if (stripe_width <= sbi->s_blocks_per_group)
+               return stripe_width;
+
+       if (stride <= sbi->s_blocks_per_group)
+               return stride;
+
+       return 0;
+}
 
 static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                                __releases(kernel_sem)
@@ -2091,6 +2139,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
        sbi->s_rsv_window_head.rsv_goal_size = 0;
        ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
 
+       sbi->s_stripe = ext4_get_stripe_size(sbi);
+
        /*
         * set up enough so that it can read an inode
         */
@@ -2250,6 +2300,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
                "writeback");
 
        ext4_ext_init(sb);
                "writeback");
 
        ext4_ext_init(sb);
+       ext4_mb_init(sb, needs_recovery);
 
        lock_kernel();
        return 0;
@@ -3232,9 +3283,15 @@ static struct file_system_type ext4dev_fs_type = {
 
 static int __init init_ext4_fs(void)
 {
-       int err = init_ext4_xattr();
+       int err;
+
+       err = init_ext4_mballoc();
        if (err)
                return err;
+
+       err = init_ext4_xattr();
+       if (err)
+               goto out2;
        err = init_inodecache();
        if (err)
                goto out1;
@@ -3246,6 +3303,8 @@ out:
        destroy_inodecache();
 out1:
        exit_ext4_xattr();
+out2:
+       exit_ext4_mballoc();
        return err;
 }
 
@@ -3254,6 +3313,7 @@ static void __exit exit_ext4_fs(void)
        unregister_filesystem(&ext4dev_fs_type);
        destroy_inodecache();
        exit_ext4_xattr();
+       exit_ext4_mballoc();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
index 86387302c2a9b45591e71c8e6900fdde92c09eaf..d7962139c0108a329386b86d61088cc4dedb3eb1 100644 (file)
@@ -480,7 +480,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                ea_bdebug(bh, "refcount now=0; freeing");
                if (ce)
                        mb_cache_entry_free(ce);
-               ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
+               ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
                get_bh(bh);
                ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
        } else {
@@ -821,7 +821,7 @@ inserted:
                        new_bh = sb_getblk(sb, block);
                        if (!new_bh) {
 getblk_failed:
-                               ext4_free_blocks(handle, inode, block, 1);
+                               ext4_free_blocks(handle, inode, block, 1, 1);
                                error = -EIO;
                                goto cleanup;
                        }
index d0b7ca99b91fd99721eed47aa5282d1d44cc5c66..1852313fc7c73605887a1447ae33d18645c298c1 100644 (file)
@@ -20,6 +20,8 @@
 #include <linux/blkdev.h>
 #include <linux/magic.h>
 
+#include <linux/ext4_fs_i.h>
+
 /*
  * The second extended filesystem constants/structures
  */
 #define ext4_debug(f, a...)    do {} while (0)
 #endif
 
+#define EXT4_MULTIBLOCK_ALLOCATOR      1
+
+/* prefer goal again. length */
+#define EXT4_MB_HINT_MERGE             1
+/* blocks already reserved */
+#define EXT4_MB_HINT_RESERVED          2
+/* metadata is being allocated */
+#define EXT4_MB_HINT_METADATA          4
+/* first blocks in the file */
+#define EXT4_MB_HINT_FIRST             8
+/* search for the best chunk */
+#define EXT4_MB_HINT_BEST              16
+/* data is being allocated */
+#define EXT4_MB_HINT_DATA              32
+/* don't preallocate (for tails) */
+#define EXT4_MB_HINT_NOPREALLOC                64
+/* allocate for locality group */
+#define EXT4_MB_HINT_GROUP_ALLOC       128
+/* allocate goal blocks or none */
+#define EXT4_MB_HINT_GOAL_ONLY         256
+/* goal is meaningful */
+#define EXT4_MB_HINT_TRY_GOAL          512
+
+struct ext4_allocation_request {
+       /* target inode for block we're allocating */
+       struct inode *inode;
+       /* logical block in target inode */
+       ext4_lblk_t logical;
+       /* phys. target (a hint) */
+       ext4_fsblk_t goal;
+       /* the closest logical allocated block to the left */
+       ext4_lblk_t lleft;
+       /* phys. block for ^^^ */
+       ext4_fsblk_t pleft;
+       /* the closest logical allocated block to the right */
+       ext4_lblk_t lright;
+       /* phys. block for ^^^ */
+       ext4_fsblk_t pright;
+       /* how many blocks we want to allocate */
+       unsigned long len;
+       /* flags. see above EXT4_MB_HINT_* */
+       unsigned long flags;
+};
+
 /*
  * Special inodes numbers
  */
@@ -474,6 +520,7 @@ do {                                                                               \
 #define EXT4_MOUNT_JOURNAL_CHECKSUM    0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
+#define EXT4_MOUNT_MBALLOC             0x4000000 /* Buddy allocation support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)              o &= ~EXT4_MOUNT_##opt
@@ -912,7 +959,7 @@ extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
 extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
                        ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t block, unsigned long count);
+                       ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
                                 ext4_fsblk_t block, unsigned long count,
                                unsigned long *pdquot_freed_blocks);
@@ -950,6 +997,20 @@ extern unsigned long ext4_count_dirs (struct super_block *);
 extern void ext4_check_inodes_bitmap (struct super_block *);
 extern unsigned long ext4_count_free (struct buffer_head *, unsigned);
 
+/* mballoc.c */
+extern long ext4_mb_stats;
+extern long ext4_mb_max_to_scan;
+extern int ext4_mb_init(struct super_block *, int);
+extern int ext4_mb_release(struct super_block *);
+extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+                               struct ext4_allocation_request *, int *);
+extern int ext4_mb_reserve_blocks(struct super_block *, int);
+extern void ext4_mb_discard_inode_preallocations(struct inode *);
+extern int __init init_ext4_mballoc(void);
+extern void exit_ext4_mballoc(void);
+extern void ext4_mb_free_blocks(handle_t *, struct inode *,
+               unsigned long, unsigned long, int, unsigned long *);
+
 
 /* inode.c */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 
 /* inode.c */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
@@ -1080,6 +1141,19 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
        raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
 }
 
        raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
 }
 
+static inline
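+/*
+ * s_group_info is a two-level array: the top level holds one pointer
+ * per descriptor block, the second level one ext4_group_info per
+ * group within that block; this avoids a single huge allocation.
+ */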
+struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+                                                       ext4_group_t group)
+{
+       struct ext4_group_info ***grp_info;
+       long indexv, indexh;
+       grp_info = EXT4_SB(sb)->s_group_info;
+       indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+       indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+       return grp_info[indexv][indexh];
+}
+
 #define ext4_std_error(sb, errno)                              \
 do {                                                           \
        if ((errno))                                            \
index 4377d249d3789700b114189413f053833d70cb26..d5508d3cf29096db958efa7be8719d69fb155ac0 100644 (file)
@@ -158,6 +158,10 @@ struct ext4_inode_info {
         * struct timespec i_{a,c,m}time in the generic inode.
         */
        struct timespec i_crtime;
+
+       /* mballoc */
+       struct list_head i_prealloc_list;
+       spinlock_t i_prealloc_lock;
 };
 
 #endif /* _LINUX_EXT4_FS_I */
index 38a47ec06df9507431263c315d98b6d0a94e1d23..abaae2c8cccf3d7bad74769998816f0b081e7d5d 100644 (file)
@@ -91,6 +91,58 @@ struct ext4_sb_info {
        unsigned long s_ext_blocks;
        unsigned long s_ext_extents;
 #endif
+
+       /* for buddy allocator */
+       struct ext4_group_info ***s_group_info;
+       struct inode *s_buddy_cache;
+       long s_blocks_reserved;
+       spinlock_t s_reserve_lock;
+       struct list_head s_active_transaction;
+       struct list_head s_closed_transaction;
+       struct list_head s_committed_transaction;
+       spinlock_t s_md_lock;
+       tid_t s_last_transaction;
+       unsigned short *s_mb_offsets, *s_mb_maxs;
+
+       /* tunables */
+       unsigned long s_stripe;
+       unsigned long s_mb_stream_request;
+       unsigned long s_mb_max_to_scan;
+       unsigned long s_mb_min_to_scan;
+       unsigned long s_mb_stats;
+       unsigned long s_mb_order2_reqs;
+       unsigned long s_mb_group_prealloc;
+       /* where last allocation was done - for stream allocation */
+       unsigned long s_mb_last_group;
+       unsigned long s_mb_last_start;
+
+       /* history to debug policy */
+       struct ext4_mb_history *s_mb_history;
+       int s_mb_history_cur;
+       int s_mb_history_max;
+       int s_mb_history_num;
+       struct proc_dir_entry *s_mb_proc;
+       spinlock_t s_mb_history_lock;
+       int s_mb_history_filter;
+
+       /* stats for buddy allocator */
+       spinlock_t s_mb_pa_lock;
+       atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
+       atomic_t s_bal_success; /* we found long enough chunks */
+       atomic_t s_bal_allocated;       /* in blocks */
+       atomic_t s_bal_ex_scanned;      /* total extents scanned */
+       atomic_t s_bal_goals;   /* goal hits */
+       atomic_t s_bal_breaks;  /* too long searches */
+       atomic_t s_bal_2orders; /* 2^order hits */
+       spinlock_t s_bal_lock;
+       unsigned long s_mb_buddies_generated;
+       unsigned long long s_mb_generation_time;
+       atomic_t s_mb_lost_chunks;
+       atomic_t s_mb_preallocated;
+       atomic_t s_mb_discarded;
+
+       /* locality groups */
+       struct ext4_locality_group *s_locality_groups;
 };
 
 #endif /* _LINUX_EXT4_FS_SB */