]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 31 Dec 2008 01:48:25 +0000 (17:48 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 31 Dec 2008 01:48:25 +0000 (17:48 -0800)
* 'for-linus' of git://oss.sgi.com/xfs/xfs: (184 commits)
  [XFS] Fix race in xfs_write() between direct and buffered I/O with DMAPI
  [XFS] handle unaligned data in xfs_bmbt_disk_get_all
  [XFS] avoid memory allocations in xfs_fs_vcmn_err
  [XFS] Fix speculative allocation beyond eof
  [XFS] Remove XFS_BUF_SHUT() and friends
  [XFS] Use the incore inode size in xfs_file_readdir()
  [XFS] set b_error from bio error in xfs_buf_bio_end_io
  [XFS] use inode_change_ok for setattr permission checking
  [XFS] add a FMODE flag to make XFS invisible I/O less hacky
  [XFS] resync headers with libxfs
  [XFS] simplify projid check in xfs_rename
  [XFS] replace b_fspriv with b_mount
  [XFS] Remove unused tracing code
  [XFS] Remove unnecessary assertion
  [XFS] Remove unused variable in ktrace_free()
  [XFS] Check return value of xfs_buf_get_noaddr()
  [XFS] Fix hang after disallowed rename across directory quota domains
  [XFS] Fix compile with CONFIG_COMPAT enabled
  move inode tracing out of xfs_vnode.
  move vn_iowait / vn_iowake into xfs_aops.c
  ...

112 files changed:
Documentation/filesystems/xfs.txt
fs/inode.c
fs/xfs/Makefile
fs/xfs/linux-2.6/sv.h
fs/xfs/linux-2.6/xfs_aops.c
fs/xfs/linux-2.6/xfs_aops.h
fs/xfs/linux-2.6/xfs_buf.c
fs/xfs/linux-2.6/xfs_buf.h
fs/xfs/linux-2.6/xfs_cred.h
fs/xfs/linux-2.6/xfs_export.c
fs/xfs/linux-2.6/xfs_file.c
fs/xfs/linux-2.6/xfs_fs_subr.c
fs/xfs/linux-2.6/xfs_globals.c
fs/xfs/linux-2.6/xfs_globals.h
fs/xfs/linux-2.6/xfs_ioctl.c
fs/xfs/linux-2.6/xfs_ioctl.h [new file with mode: 0644]
fs/xfs/linux-2.6/xfs_ioctl32.c
fs/xfs/linux-2.6/xfs_ioctl32.h
fs/xfs/linux-2.6/xfs_iops.c
fs/xfs/linux-2.6/xfs_iops.h
fs/xfs/linux-2.6/xfs_linux.h
fs/xfs/linux-2.6/xfs_lrw.c
fs/xfs/linux-2.6/xfs_stats.c
fs/xfs/linux-2.6/xfs_stats.h
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/linux-2.6/xfs_super.h
fs/xfs/linux-2.6/xfs_sync.c [new file with mode: 0644]
fs/xfs/linux-2.6/xfs_sync.h [new file with mode: 0644]
fs/xfs/linux-2.6/xfs_sysctl.c
fs/xfs/linux-2.6/xfs_sysctl.h
fs/xfs/linux-2.6/xfs_vfs.h [deleted file]
fs/xfs/linux-2.6/xfs_vnode.c [deleted file]
fs/xfs/linux-2.6/xfs_vnode.h
fs/xfs/quota/xfs_dquot.c
fs/xfs/quota/xfs_dquot.h
fs/xfs/quota/xfs_dquot_item.c
fs/xfs/quota/xfs_qm.c
fs/xfs/quota/xfs_qm.h
fs/xfs/quota/xfs_qm_bhv.c
fs/xfs/quota/xfs_qm_syscalls.c
fs/xfs/support/debug.c
fs/xfs/support/debug.h
fs/xfs/support/ktrace.c
fs/xfs/xfs.h
fs/xfs/xfs_acl.c
fs/xfs/xfs_ag.h
fs/xfs/xfs_alloc.c
fs/xfs/xfs_alloc.h
fs/xfs/xfs_alloc_btree.c
fs/xfs/xfs_alloc_btree.h
fs/xfs/xfs_arch.h
fs/xfs/xfs_bit.h
fs/xfs/xfs_bmap.c
fs/xfs/xfs_bmap.h
fs/xfs/xfs_bmap_btree.c
fs/xfs/xfs_bmap_btree.h
fs/xfs/xfs_btree.c
fs/xfs/xfs_btree.h
fs/xfs/xfs_btree_trace.c [new file with mode: 0644]
fs/xfs/xfs_btree_trace.h [new file with mode: 0644]
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_clnt.h [deleted file]
fs/xfs/xfs_da_btree.h
fs/xfs/xfs_dfrag.c
fs/xfs/xfs_dfrag.h
fs/xfs/xfs_dinode.h
fs/xfs/xfs_dir2_sf.h
fs/xfs/xfs_dmops.c
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_fs.h
fs/xfs/xfs_fsops.c
fs/xfs/xfs_ialloc.c
fs/xfs/xfs_ialloc.h
fs/xfs/xfs_ialloc_btree.c
fs/xfs/xfs_ialloc_btree.h
fs/xfs/xfs_iget.c
fs/xfs/xfs_imap.h [deleted file]
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_inode_item.h
fs/xfs/xfs_iomap.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_itable.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_qmops.c
fs/xfs/xfs_quota.h
fs/xfs/xfs_rename.c
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_rw.c
fs/xfs/xfs_sb.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_trans_inode.c
fs/xfs/xfs_trans_item.c
fs/xfs/xfs_trans_priv.h
fs/xfs/xfs_utils.c
fs/xfs/xfs_vfsops.c [deleted file]
fs/xfs/xfs_vfsops.h [deleted file]
fs/xfs/xfs_vnodeops.c
fs/xfs/xfs_vnodeops.h
include/linux/fs.h
kernel/sysctl_check.c

index 0a1668ba26008d56a5e888a1a876f072a5800966..9878f50d6ed6d37b3241eb558a58d08681898927 100644 (file)
@@ -229,10 +229,6 @@ The following sysctls are available for the XFS filesystem:
        ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
        is set.
 
-  fs.xfs.restrict_chown                (Min: 0  Default: 1  Max: 1)
-       Controls whether unprivileged users can use chown to "give away"
-       a file to another user.
-
   fs.xfs.inherit_sync          (Min: 0  Default: 1  Max: 1)
        Setting this to "1" will cause the "sync" flag set
        by the xfs_io(8) chattr command on a directory to be
index 0487ddba139780a4e0bfabc8f4ef2103358d9713..098a2443196f20cacd3c1a4c38964646a14c6e74 100644 (file)
@@ -108,84 +108,100 @@ static void wake_up_inode(struct inode *inode)
        wake_up_bit(&inode->i_state, __I_LOCK);
 }
 
-static struct inode *alloc_inode(struct super_block *sb)
+/**
+ * inode_init_always - perform inode structure intialisation
+ * @sb         - superblock inode belongs to.
+ * @inode      - inode to initialise
+ *
+ * These are initializations that need to be done on every inode
+ * allocation as the fields are not initialised by slab allocation.
+ */
+struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 {
        static const struct address_space_operations empty_aops;
        static struct inode_operations empty_iops;
        static const struct file_operations empty_fops;
-       struct inode *inode;
-
-       if (sb->s_op->alloc_inode)
-               inode = sb->s_op->alloc_inode(sb);
-       else
-               inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
-       if (inode) {
-               struct address_space * const mapping = &inode->i_data;
-
-               inode->i_sb = sb;
-               inode->i_blkbits = sb->s_blocksize_bits;
-               inode->i_flags = 0;
-               atomic_set(&inode->i_count, 1);
-               inode->i_op = &empty_iops;
-               inode->i_fop = &empty_fops;
-               inode->i_nlink = 1;
-               atomic_set(&inode->i_writecount, 0);
-               inode->i_size = 0;
-               inode->i_blocks = 0;
-               inode->i_bytes = 0;
-               inode->i_generation = 0;
+       struct address_space * const mapping = &inode->i_data;
+
+       inode->i_sb = sb;
+       inode->i_blkbits = sb->s_blocksize_bits;
+       inode->i_flags = 0;
+       atomic_set(&inode->i_count, 1);
+       inode->i_op = &empty_iops;
+       inode->i_fop = &empty_fops;
+       inode->i_nlink = 1;
+       atomic_set(&inode->i_writecount, 0);
+       inode->i_size = 0;
+       inode->i_blocks = 0;
+       inode->i_bytes = 0;
+       inode->i_generation = 0;
 #ifdef CONFIG_QUOTA
-               memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+       memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
 #endif
-               inode->i_pipe = NULL;
-               inode->i_bdev = NULL;
-               inode->i_cdev = NULL;
-               inode->i_rdev = 0;
-               inode->dirtied_when = 0;
-               if (security_inode_alloc(inode)) {
-                       if (inode->i_sb->s_op->destroy_inode)
-                               inode->i_sb->s_op->destroy_inode(inode);
-                       else
-                               kmem_cache_free(inode_cachep, (inode));
-                       return NULL;
-               }
+       inode->i_pipe = NULL;
+       inode->i_bdev = NULL;
+       inode->i_cdev = NULL;
+       inode->i_rdev = 0;
+       inode->dirtied_when = 0;
+       if (security_inode_alloc(inode)) {
+               if (inode->i_sb->s_op->destroy_inode)
+                       inode->i_sb->s_op->destroy_inode(inode);
+               else
+                       kmem_cache_free(inode_cachep, (inode));
+               return NULL;
+       }
 
-               spin_lock_init(&inode->i_lock);
-               lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
+       spin_lock_init(&inode->i_lock);
+       lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
-               mutex_init(&inode->i_mutex);
-               lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+       mutex_init(&inode->i_mutex);
+       lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
 
-               init_rwsem(&inode->i_alloc_sem);
-               lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
+       init_rwsem(&inode->i_alloc_sem);
+       lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
 
-               mapping->a_ops = &empty_aops;
-               mapping->host = inode;
-               mapping->flags = 0;
-               mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
-               mapping->assoc_mapping = NULL;
-               mapping->backing_dev_info = &default_backing_dev_info;
-               mapping->writeback_index = 0;
+       mapping->a_ops = &empty_aops;
+       mapping->host = inode;
+       mapping->flags = 0;
+       mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+       mapping->assoc_mapping = NULL;
+       mapping->backing_dev_info = &default_backing_dev_info;
+       mapping->writeback_index = 0;
 
-               /*
-                * If the block_device provides a backing_dev_info for client
-                * inodes then use that.  Otherwise the inode share the bdev's
-                * backing_dev_info.
-                */
-               if (sb->s_bdev) {
-                       struct backing_dev_info *bdi;
+       /*
+        * If the block_device provides a backing_dev_info for client
+        * inodes then use that.  Otherwise the inode share the bdev's
+        * backing_dev_info.
+        */
+       if (sb->s_bdev) {
+               struct backing_dev_info *bdi;
 
-                       bdi = sb->s_bdev->bd_inode_backing_dev_info;
-                       if (!bdi)
-                               bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
-                       mapping->backing_dev_info = bdi;
-               }
-               inode->i_private = NULL;
-               inode->i_mapping = mapping;
+               bdi = sb->s_bdev->bd_inode_backing_dev_info;
+               if (!bdi)
+                       bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+               mapping->backing_dev_info = bdi;
        }
+       inode->i_private = NULL;
+       inode->i_mapping = mapping;
+
        return inode;
 }
+EXPORT_SYMBOL(inode_init_always);
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+       struct inode *inode;
+
+       if (sb->s_op->alloc_inode)
+               inode = sb->s_op->alloc_inode(sb);
+       else
+               inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
+
+       if (inode)
+               return inode_init_always(sb, inode);
+       return NULL;
+}
 
 void destroy_inode(struct inode *inode) 
 {
@@ -196,6 +212,7 @@ void destroy_inode(struct inode *inode)
        else
                kmem_cache_free(inode_cachep, (inode));
 }
+EXPORT_SYMBOL(destroy_inode);
 
 
 /*
@@ -534,6 +551,49 @@ repeat:
        return node ? inode : NULL;
 }
 
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+       unsigned long tmp;
+
+       tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+                       L1_CACHE_BYTES;
+       tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+       return tmp & I_HASHMASK;
+}
+
+static inline void
+__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
+                       struct inode *inode)
+{
+       inodes_stat.nr_inodes++;
+       list_add(&inode->i_list, &inode_in_use);
+       list_add(&inode->i_sb_list, &sb->s_inodes);
+       if (head)
+               hlist_add_head(&inode->i_hash, head);
+}
+
+/**
+ * inode_add_to_lists - add a new inode to relevant lists
+ * @sb         - superblock inode belongs to.
+ * @inode      - inode to mark in use
+ *
+ * When an inode is allocated it needs to be accounted for, added to the in use
+ * list, the owning superblock and the inode hash. This needs to be done under
+ * the inode_lock, so export a function to do this rather than the inode lock
+ * itself. We calculate the hash list to add to here so it is all internal
+ * which requires the caller to have already set up the inode number in the
+ * inode to add.
+ */
+void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+{
+       struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+
+       spin_lock(&inode_lock);
+       __inode_add_to_lists(sb, head, inode);
+       spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_add_to_lists);
+
 /**
  *     new_inode       - obtain an inode
  *     @sb: superblock
@@ -561,9 +621,7 @@ struct inode *new_inode(struct super_block *sb)
        inode = alloc_inode(sb);
        if (inode) {
                spin_lock(&inode_lock);
-               inodes_stat.nr_inodes++;
-               list_add(&inode->i_list, &inode_in_use);
-               list_add(&inode->i_sb_list, &sb->s_inodes);
+               __inode_add_to_lists(sb, NULL, inode);
                inode->i_ino = ++last_ino;
                inode->i_state = 0;
                spin_unlock(&inode_lock);
@@ -622,10 +680,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h
                        if (set(inode, data))
                                goto set_failed;
 
-                       inodes_stat.nr_inodes++;
-                       list_add(&inode->i_list, &inode_in_use);
-                       list_add(&inode->i_sb_list, &sb->s_inodes);
-                       hlist_add_head(&inode->i_hash, head);
+                       __inode_add_to_lists(sb, head, inode);
                        inode->i_state = I_LOCK|I_NEW;
                        spin_unlock(&inode_lock);
 
@@ -671,10 +726,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        inode->i_ino = ino;
-                       inodes_stat.nr_inodes++;
-                       list_add(&inode->i_list, &inode_in_use);
-                       list_add(&inode->i_sb_list, &sb->s_inodes);
-                       hlist_add_head(&inode->i_hash, head);
+                       __inode_add_to_lists(sb, head, inode);
                        inode->i_state = I_LOCK|I_NEW;
                        spin_unlock(&inode_lock);
 
@@ -698,16 +750,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
        return inode;
 }
 
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-       unsigned long tmp;
-
-       tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-                       L1_CACHE_BYTES;
-       tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-       return tmp & I_HASHMASK;
-}
-
 /**
  *     iunique - get a unique inode number
  *     @sb: superblock
@@ -1292,6 +1334,7 @@ int inode_wait(void *word)
        schedule();
        return 0;
 }
+EXPORT_SYMBOL(inode_wait);
 
 /*
  * If we try to find an inode in the inode hash while it is being
index 737c9a425361df67f3f7ef9d0ff674e26caf6105..c3dc491fff89c468806fca873a34eedc5843cd91 100644 (file)
@@ -85,13 +85,13 @@ xfs-y                               += xfs_alloc.o \
                                   xfs_trans_inode.o \
                                   xfs_trans_item.o \
                                   xfs_utils.o \
-                                  xfs_vfsops.o \
                                   xfs_vnodeops.o \
                                   xfs_rw.o \
                                   xfs_dmops.o \
                                   xfs_qmops.o
 
-xfs-$(CONFIG_XFS_TRACE)                += xfs_dir2_trace.o
+xfs-$(CONFIG_XFS_TRACE)                += xfs_btree_trace.o \
+                                  xfs_dir2_trace.o
 
 # Objects in linux/
 xfs-y                          += $(addprefix $(XFS_LINUX)/, \
@@ -106,7 +106,7 @@ xfs-y                               += $(addprefix $(XFS_LINUX)/, \
                                   xfs_iops.o \
                                   xfs_lrw.o \
                                   xfs_super.o \
-                                  xfs_vnode.o \
+                                  xfs_sync.o \
                                   xfs_xattr.o)
 
 # Objects in support/
index 351a8f454bd1adfdcffb91e8299324da77a8878c..4dfc7c370819dd502de2e682f322e4bbd18ab4f4 100644 (file)
@@ -32,23 +32,15 @@ typedef struct sv_s {
        wait_queue_head_t waiters;
 } sv_t;
 
-#define SV_FIFO                0x0             /* sv_t is FIFO type */
-#define SV_LIFO                0x2             /* sv_t is LIFO type */
-#define SV_PRIO                0x4             /* sv_t is PRIO type */
-#define SV_KEYED       0x6             /* sv_t is KEYED type */
-#define SV_DEFAULT      SV_FIFO
-
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
-                            unsigned long timeout)
+static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
 {
        DECLARE_WAITQUEUE(wait, current);
 
        add_wait_queue_exclusive(&sv->waiters, &wait);
-       __set_current_state(state);
+       __set_current_state(TASK_UNINTERRUPTIBLE);
        spin_unlock(lock);
 
-       schedule_timeout(timeout);
+       schedule();
 
        remove_wait_queue(&sv->waiters, &wait);
 }
@@ -58,13 +50,7 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
 #define sv_destroy(sv) \
        /*NOTHING*/
 #define sv_wait(sv, pri, lock, s) \
-       _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_wait_sig(sv, pri, lock, s)   \
-       _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
-       _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
-       _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
+       _sv_wait(sv, lock)
 #define sv_signal(sv) \
        wake_up(&(sv)->waiters)
 #define sv_broadcast(sv) \
index a44d68eb50b5302b8482000e523661b14c0854d8..de3a198f771e20627769b837d6f2460ec18077fc 100644 (file)
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
+
+/*
+ * Prime number of hash buckets since address is used as the key.
+ */
+#define NVSYNC         37
+#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
+static wait_queue_head_t xfs_ioend_wq[NVSYNC];
+
+void __init
+xfs_ioend_init(void)
+{
+       int i;
+
+       for (i = 0; i < NVSYNC; i++)
+               init_waitqueue_head(&xfs_ioend_wq[i]);
+}
+
+void
+xfs_ioend_wait(
+       xfs_inode_t     *ip)
+{
+       wait_queue_head_t *wq = to_ioend_wq(ip);
+
+       wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
+}
+
+STATIC void
+xfs_ioend_wake(
+       xfs_inode_t     *ip)
+{
+       if (atomic_dec_and_test(&ip->i_iocount))
+               wake_up(to_ioend_wq(ip));
+}
+
 STATIC void
 xfs_count_page_state(
        struct page             *page,
@@ -146,16 +180,25 @@ xfs_destroy_ioend(
        xfs_ioend_t             *ioend)
 {
        struct buffer_head      *bh, *next;
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 
        for (bh = ioend->io_buffer_head; bh; bh = next) {
                next = bh->b_private;
                bh->b_end_io(bh, !ioend->io_error);
        }
-       if (unlikely(ioend->io_error)) {
-               vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error,
-                               __FILE__,__LINE__);
+
+       /*
+        * Volume managers supporting multiple paths can send back ENODEV
+        * when the final path disappears.  In this case continuing to fill
+        * the page cache with dirty data which cannot be written out is
+        * evil, so prevent that.
+        */
+       if (unlikely(ioend->io_error == -ENODEV)) {
+               xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
+                                     __FILE__, __LINE__);
        }
-       vn_iowake(XFS_I(ioend->io_inode));
+
+       xfs_ioend_wake(ip);
        mempool_free(ioend, xfs_ioend_pool);
 }
 
@@ -191,7 +234,7 @@ xfs_setfilesize(
                ip->i_d.di_size = isize;
                ip->i_update_core = 1;
                ip->i_update_size = 1;
-               mark_inode_dirty_sync(ioend->io_inode);
+               xfs_mark_inode_dirty_sync(ip);
        }
 
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -317,14 +360,9 @@ xfs_map_blocks(
        xfs_iomap_t             *mapp,
        int                     flags)
 {
-       xfs_inode_t             *ip = XFS_I(inode);
-       int                     error, nmaps = 1;
-
-       error = xfs_iomap(ip, offset, count,
-                               flags, mapp, &nmaps);
-       if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
-               xfs_iflags_set(ip, XFS_IMODIFIED);
-       return -error;
+       int                     nmaps = 1;
+
+       return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
 }
 
 STATIC_INLINE int
@@ -512,7 +550,7 @@ xfs_cancel_ioend(
                        unlock_buffer(bh);
                } while ((bh = next_bh) != NULL);
 
-               vn_iowake(XFS_I(ioend->io_inode));
+               xfs_ioend_wake(XFS_I(ioend->io_inode));
                mempool_free(ioend, xfs_ioend_pool);
        } while ((ioend = next) != NULL);
 }
index 3ba0631a38182fe04640d39c86551a84f653e05e..7b26f5ff969230140e363a082e72c476f35ea51e 100644 (file)
@@ -43,4 +43,7 @@ typedef struct xfs_ioend {
 extern const struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 
+extern void xfs_ioend_init(void);
+extern void xfs_ioend_wait(struct xfs_inode *);
+
 #endif /* __XFS_AOPS_H__ */
index 36d5fcd3f593d8694926e70181b1bade9a1df8b6..cb329edc925b915ae5e590edeb4840810f811326 100644 (file)
@@ -630,6 +630,29 @@ xfs_buf_get_flags(
        return NULL;
 }
 
+STATIC int
+_xfs_buf_read(
+       xfs_buf_t               *bp,
+       xfs_buf_flags_t         flags)
+{
+       int                     status;
+
+       XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
+
+       ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
+       ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+
+       bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
+                       XBF_READ_AHEAD | _XBF_RUN_QUEUES);
+       bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
+                       XBF_READ_AHEAD | _XBF_RUN_QUEUES);
+
+       status = xfs_buf_iorequest(bp);
+       if (!status && !(flags & XBF_ASYNC))
+               status = xfs_buf_iowait(bp);
+       return status;
+}
+
 xfs_buf_t *
 xfs_buf_read_flags(
        xfs_buftarg_t           *target,
@@ -646,7 +669,7 @@ xfs_buf_read_flags(
                if (!XFS_BUF_ISDONE(bp)) {
                        XB_TRACE(bp, "read", (unsigned long)flags);
                        XFS_STATS_INC(xb_get_read);
-                       xfs_buf_iostart(bp, flags);
+                       _xfs_buf_read(bp, flags);
                } else if (flags & XBF_ASYNC) {
                        XB_TRACE(bp, "read_async", (unsigned long)flags);
                        /*
@@ -1048,50 +1071,39 @@ xfs_buf_ioerror(
        XB_TRACE(bp, "ioerror", (unsigned long)error);
 }
 
-/*
- *     Initiate I/O on a buffer, based on the flags supplied.
- *     The b_iodone routine in the buffer supplied will only be called
- *     when all of the subsidiary I/O requests, if any, have been completed.
- */
 int
-xfs_buf_iostart(
-       xfs_buf_t               *bp,
-       xfs_buf_flags_t         flags)
+xfs_bawrite(
+       void                    *mp,
+       struct xfs_buf          *bp)
 {
-       int                     status = 0;
+       XB_TRACE(bp, "bawrite", 0);
 
-       XB_TRACE(bp, "iostart", (unsigned long)flags);
+       ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
 
-       if (flags & XBF_DELWRI) {
-               bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
-               bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
-               xfs_buf_delwri_queue(bp, 1);
-               return 0;
-       }
+       xfs_buf_delwri_dequeue(bp);
 
-       bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
-                       XBF_READ_AHEAD | _XBF_RUN_QUEUES);
-       bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \
-                       XBF_READ_AHEAD | _XBF_RUN_QUEUES);
+       bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
+       bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
+
+       bp->b_mount = mp;
+       bp->b_strat = xfs_bdstrat_cb;
+       return xfs_bdstrat_cb(bp);
+}
 
-       BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL);
+void
+xfs_bdwrite(
+       void                    *mp,
+       struct xfs_buf          *bp)
+{
+       XB_TRACE(bp, "bdwrite", 0);
 
-       /* For writes allow an alternate strategy routine to precede
-        * the actual I/O request (which may not be issued at all in
-        * a shutdown situation, for example).
-        */
-       status = (flags & XBF_WRITE) ?
-               xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
+       bp->b_strat = xfs_bdstrat_cb;
+       bp->b_mount = mp;
 
-       /* Wait for I/O if we are not an async request.
-        * Note: async I/O request completion will release the buffer,
-        * and that can already be done by this point.  So using the
-        * buffer pointer from here on, after async I/O, is invalid.
-        */
-       if (!status && !(flags & XBF_ASYNC))
-               status = xfs_buf_iowait(bp);
+       bp->b_flags &= ~XBF_READ;
+       bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
 
-       return status;
+       xfs_buf_delwri_queue(bp, 1);
 }
 
 STATIC_INLINE void
@@ -1114,8 +1126,7 @@ xfs_buf_bio_end_io(
        unsigned int            blocksize = bp->b_target->bt_bsize;
        struct bio_vec          *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 
-       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-               bp->b_error = EIO;
+       xfs_buf_ioerror(bp, -error);
 
        do {
                struct page     *page = bvec->bv_page;
index 456519a088c7bfb5ba472f0a8125818a46a5ac5b..288ae7c4c800a5040c7c89ce8dd2a54622e2b7f2 100644 (file)
@@ -168,7 +168,7 @@ typedef struct xfs_buf {
        struct completion       b_iowait;       /* queue for I/O waiters */
        void                    *b_fspriv;
        void                    *b_fspriv2;
-       void                    *b_fspriv3;
+       struct xfs_mount        *b_mount;
        unsigned short          b_error;        /* error code on I/O */
        unsigned int            b_page_count;   /* size of page array */
        unsigned int            b_offset;       /* page offset in first page */
@@ -214,9 +214,10 @@ extern void xfs_buf_lock(xfs_buf_t *);
 extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
+extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
+extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
 extern void xfs_buf_ioend(xfs_buf_t *, int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
-extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
 extern int xfs_buf_iorequest(xfs_buf_t *);
 extern int xfs_buf_iowait(xfs_buf_t *);
 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
@@ -311,10 +312,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
 #define XFS_BUF_UNORDERED(bp)  ((bp)->b_flags &= ~XBF_ORDERED)
 #define XFS_BUF_ISORDERED(bp)  ((bp)->b_flags & XBF_ORDERED)
 
-#define XFS_BUF_SHUT(bp)       do { } while (0)
-#define XFS_BUF_UNSHUT(bp)     do { } while (0)
-#define XFS_BUF_ISSHUT(bp)     (0)
-
 #define XFS_BUF_HOLD(bp)       xfs_buf_hold(bp)
 #define XFS_BUF_READ(bp)       ((bp)->b_flags |= XBF_READ)
 #define XFS_BUF_UNREAD(bp)     ((bp)->b_flags &= ~XBF_READ)
@@ -334,8 +331,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
 #define XFS_BUF_SET_FSPRIVATE(bp, val)         ((bp)->b_fspriv = (void*)(val))
 #define XFS_BUF_FSPRIVATE2(bp, type)           ((type)(bp)->b_fspriv2)
 #define XFS_BUF_SET_FSPRIVATE2(bp, val)                ((bp)->b_fspriv2 = (void*)(val))
-#define XFS_BUF_FSPRIVATE3(bp, type)           ((type)(bp)->b_fspriv3)
-#define XFS_BUF_SET_FSPRIVATE3(bp, val)                ((bp)->b_fspriv3 = (void*)(val))
 #define XFS_BUF_SET_START(bp)                  do { } while (0)
 #define XFS_BUF_SET_BRELSE_FUNC(bp, func)      ((bp)->b_relse = (func))
 
@@ -366,14 +361,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
 #define XFS_BUF_TARGET(bp)             ((bp)->b_target)
 #define XFS_BUFTARG_NAME(target)       xfs_buf_target_name(target)
 
-static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
-{
-       bp->b_fspriv3 = mp;
-       bp->b_strat = xfs_bdstrat_cb;
-       xfs_buf_delwri_dequeue(bp);
-       return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
-}
-
 static inline void xfs_buf_relse(xfs_buf_t *bp)
 {
        if (!bp->b_relse)
@@ -414,17 +401,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
        return error;
 }
 
-/*
- * No error can be returned from xfs_buf_iostart for delwri
- * buffers as they are queued and no I/O is issued.
- */
-static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
-{
-       bp->b_strat = xfs_bdstrat_cb;
-       bp->b_fspriv3 = mp;
-       (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
-}
-
 #define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
 
 #define xfs_iowait(bp) xfs_buf_iowait(bp)
index 8c022cd0ad672dc9f68687c54cb305dc5f846a19..55bddf3b60912022267801cd883bb1c21e93f832 100644 (file)
  */
 typedef const struct cred cred_t;
 
-extern cred_t *sys_cred;
-
-/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static inline int capable_cred(cred_t *cr, int cid)
-{
-       return (cr == sys_cred) ? 1 : capable(cid);
-}
-
 #endif  /* __XFS_CRED_H__ */
index 7f7abec25e141302458af4e72a0bf3ecce87fbab..595751f783506540ab08ea88819682dc8a73c647 100644 (file)
@@ -29,7 +29,6 @@
 #include "xfs_vnodeops.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
-#include "xfs_vfsops.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
index 3fee790f138b0acc2358244e0385bd81919ae554..e14c4e3aea0c331f67ba529946e201c6d814e8ce 100644 (file)
 #include "xfs_inode.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_ioctl32.h"
 #include "xfs_vnodeops.h"
+#include "xfs_da_btree.h"
+#include "xfs_ioctl.h"
 
 #include <linux/dcache.h>
 #include <linux/smp_lock.h>
 
 static struct vm_operations_struct xfs_file_vm_ops;
 
-STATIC_INLINE ssize_t
-__xfs_file_read(
+STATIC ssize_t
+xfs_file_aio_read(
        struct kiocb            *iocb,
        const struct iovec      *iov,
        unsigned long           nr_segs,
-       int                     ioflags,
        loff_t                  pos)
 {
        struct file             *file = iocb->ki_filp;
+       int                     ioflags = IO_ISAIO;
 
        BUG_ON(iocb->ki_pos != pos);
        if (unlikely(file->f_flags & O_DIRECT))
                ioflags |= IO_ISDIRECT;
+       if (file->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
        return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
                                nr_segs, &iocb->ki_pos, ioflags);
 }
 
 STATIC ssize_t
-xfs_file_aio_read(
-       struct kiocb            *iocb,
-       const struct iovec      *iov,
-       unsigned long           nr_segs,
-       loff_t                  pos)
-{
-       return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
-}
-
-STATIC ssize_t
-xfs_file_aio_read_invis(
-       struct kiocb            *iocb,
-       const struct iovec      *iov,
-       unsigned long           nr_segs,
-       loff_t                  pos)
-{
-       return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
-}
-
-STATIC_INLINE ssize_t
-__xfs_file_write(
+xfs_file_aio_write(
        struct kiocb            *iocb,
        const struct iovec      *iov,
        unsigned long           nr_segs,
-       int                     ioflags,
        loff_t                  pos)
 {
-       struct file     *file = iocb->ki_filp;
+       struct file             *file = iocb->ki_filp;
+       int                     ioflags = IO_ISAIO;
 
        BUG_ON(iocb->ki_pos != pos);
        if (unlikely(file->f_flags & O_DIRECT))
                ioflags |= IO_ISDIRECT;
+       if (file->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
        return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
                                &iocb->ki_pos, ioflags);
 }
 
-STATIC ssize_t
-xfs_file_aio_write(
-       struct kiocb            *iocb,
-       const struct iovec      *iov,
-       unsigned long           nr_segs,
-       loff_t                  pos)
-{
-       return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
-}
-
-STATIC ssize_t
-xfs_file_aio_write_invis(
-       struct kiocb            *iocb,
-       const struct iovec      *iov,
-       unsigned long           nr_segs,
-       loff_t                  pos)
-{
-       return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
-}
-
 STATIC ssize_t
 xfs_file_splice_read(
        struct file             *infilp,
@@ -126,20 +91,13 @@ xfs_file_splice_read(
        size_t                  len,
        unsigned int            flags)
 {
-       return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
-                                  infilp, ppos, pipe, len, flags, 0);
-}
+       int                     ioflags = 0;
+
+       if (infilp->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
 
-STATIC ssize_t
-xfs_file_splice_read_invis(
-       struct file             *infilp,
-       loff_t                  *ppos,
-       struct pipe_inode_info  *pipe,
-       size_t                  len,
-       unsigned int            flags)
-{
        return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
-                                  infilp, ppos, pipe, len, flags, IO_INVIS);
+                                  infilp, ppos, pipe, len, flags, ioflags);
 }
 
 STATIC ssize_t
@@ -150,30 +108,49 @@ xfs_file_splice_write(
        size_t                  len,
        unsigned int            flags)
 {
-       return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
-                                   pipe, outfilp, ppos, len, flags, 0);
-}
+       int                     ioflags = 0;
+
+       if (outfilp->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
 
-STATIC ssize_t
-xfs_file_splice_write_invis(
-       struct pipe_inode_info  *pipe,
-       struct file             *outfilp,
-       loff_t                  *ppos,
-       size_t                  len,
-       unsigned int            flags)
-{
        return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
-                                   pipe, outfilp, ppos, len, flags, IO_INVIS);
+                                   pipe, outfilp, ppos, len, flags, ioflags);
 }
 
 STATIC int
 xfs_file_open(
        struct inode    *inode,
-       struct file     *filp)
+       struct file     *file)
 {
-       if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+       if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
                return -EFBIG;
-       return -xfs_open(XFS_I(inode));
+       if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+               return -EIO;
+       return 0;
+}
+
+STATIC int
+xfs_dir_open(
+       struct inode    *inode,
+       struct file     *file)
+{
+       struct xfs_inode *ip = XFS_I(inode);
+       int             mode;
+       int             error;
+
+       error = xfs_file_open(inode, file);
+       if (error)
+               return error;
+
+       /*
+        * If there are any blocks, read-ahead block 0 as we're almost
+        * certain to have the next operation be a read there.
+        */
+       mode = xfs_ilock_map_shared(ip);
+       if (ip->i_d.di_nextents > 0)
+               xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+       xfs_iunlock(ip, mode);
+       return 0;
 }
 
 STATIC int
@@ -227,7 +204,7 @@ xfs_file_readdir(
         * point we can change the ->readdir prototype to include the
         * buffer size.
         */
-       bufsize = (size_t)min_t(loff_t, PAGE_SIZE, inode->i_size);
+       bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size);
 
        error = xfs_readdir(ip, dirent, bufsize,
                                (xfs_off_t *)&filp->f_pos, filldir);
@@ -248,48 +225,6 @@ xfs_file_mmap(
        return 0;
 }
 
-STATIC long
-xfs_file_ioctl(
-       struct file     *filp,
-       unsigned int    cmd,
-       unsigned long   p)
-{
-       int             error;
-       struct inode    *inode = filp->f_path.dentry->d_inode;
-
-       error = xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p);
-       xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
-
-       /* NOTE:  some of the ioctl's return positive #'s as a
-        *        byte count indicating success, such as
-        *        readlink_by_handle.  So we don't "sign flip"
-        *        like most other routines.  This means true
-        *        errors need to be returned as a negative value.
-        */
-       return error;
-}
-
-STATIC long
-xfs_file_ioctl_invis(
-       struct file     *filp,
-       unsigned int    cmd,
-       unsigned long   p)
-{
-       int             error;
-       struct inode    *inode = filp->f_path.dentry->d_inode;
-
-       error = xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p);
-       xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
-
-       /* NOTE:  some of the ioctl's return positive #'s as a
-        *        byte count indicating success, such as
-        *        readlink_by_handle.  So we don't "sign flip"
-        *        like most other routines.  This means true
-        *        errors need to be returned as a negative value.
-        */
-       return error;
-}
-
 /*
  * mmap()d file has taken write protection fault and is being made
  * writable. We can set the page state up correctly for a writable
@@ -325,26 +260,8 @@ const struct file_operations xfs_file_operations = {
 #endif
 };
 
-const struct file_operations xfs_invis_file_operations = {
-       .llseek         = generic_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = xfs_file_aio_read_invis,
-       .aio_write      = xfs_file_aio_write_invis,
-       .splice_read    = xfs_file_splice_read_invis,
-       .splice_write   = xfs_file_splice_write_invis,
-       .unlocked_ioctl = xfs_file_ioctl_invis,
-#ifdef CONFIG_COMPAT
-       .compat_ioctl   = xfs_file_compat_invis_ioctl,
-#endif
-       .mmap           = xfs_file_mmap,
-       .open           = xfs_file_open,
-       .release        = xfs_file_release,
-       .fsync          = xfs_file_fsync,
-};
-
-
 const struct file_operations xfs_dir_file_operations = {
+       .open           = xfs_dir_open,
        .read           = generic_read_dir,
        .readdir        = xfs_file_readdir,
        .llseek         = generic_file_llseek,
index 36caa6d957dfd035687e51511622f07366b4c259..5aeb777769613152297ebacb0d5a9ade99f7f26c 100644 (file)
@@ -24,6 +24,10 @@ int  fs_noerr(void) { return 0; }
 int  fs_nosys(void) { return ENOSYS; }
 void fs_noval(void) { return; }
 
+/*
+ * note: all filemap functions return negative error codes. These
+ * need to be inverted before returning to the xfs core functions.
+ */
 void
 xfs_tosspages(
        xfs_inode_t     *ip,
@@ -53,7 +57,7 @@ xfs_flushinval_pages(
                if (!ret)
                        truncate_inode_pages(mapping, first);
        }
-       return ret;
+       return -ret;
 }
 
 int
@@ -72,10 +76,23 @@ xfs_flush_pages(
                xfs_iflags_clear(ip, XFS_ITRUNCATED);
                ret = filemap_fdatawrite(mapping);
                if (flags & XFS_B_ASYNC)
-                       return ret;
+                       return -ret;
                ret2 = filemap_fdatawait(mapping);
                if (!ret)
                        ret = ret2;
        }
-       return ret;
+       return -ret;
+}
+
+int
+xfs_wait_on_pages(
+       xfs_inode_t     *ip,
+       xfs_off_t       first,
+       xfs_off_t       last)
+{
+       struct address_space *mapping = VFS_I(ip)->i_mapping;
+
+       if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+               return -filemap_fdatawait(mapping);
+       return 0;
 }
index ef90e64641e6a662b0478d74fc0918246c9a2e95..2ae8b1ccb02e1f86ad1f8219985130eaf132dc81 100644 (file)
@@ -26,7 +26,6 @@
  */
 xfs_param_t xfs_params = {
                          /*    MIN             DFLT            MAX     */
-       .restrict_chown = {     0,              1,              1       },
        .sgid_inherit   = {     0,              0,              1       },
        .symlink_mode   = {     0,              0,              1       },
        .panic_mask     = {     0,              0,              255     },
@@ -43,10 +42,3 @@ xfs_param_t xfs_params = {
        .inherit_nodfrg = {     0,              1,              1       },
        .fstrm_timer    = {     1,              30*100,         3600*100},
 };
-
-/*
- * Global system credential structure.
- */
-static cred_t sys_cred_val;
-cred_t *sys_cred = &sys_cred_val;
-
index 6eda8a3eb6f18f87bb2b87fa94933fc42a4f9c49..69f71caf061cf037075f015af0425af294343409 100644 (file)
@@ -19,6 +19,5 @@
 #define __XFS_GLOBALS_H__
 
 extern uint64_t        xfs_panic_mask;         /* set to cause more panics */
-extern cred_t *sys_cred;
 
 #endif /* __XFS_GLOBALS_H__ */
index 281cbd5a25cfd1550645370cf339e0e251d1f721..67205f6198ba58019eb549ffc6440e56b84af8b4 100644 (file)
  * XFS_IOC_PATH_TO_HANDLE
  *    returns full handle for a path
  */
-STATIC int
+int
 xfs_find_handle(
        unsigned int            cmd,
-       void                    __user *arg)
+       xfs_fsop_handlereq_t    *hreq)
 {
        int                     hsize;
        xfs_handle_t            handle;
-       xfs_fsop_handlereq_t    hreq;
        struct inode            *inode;
 
-       if (copy_from_user(&hreq, arg, sizeof(hreq)))
-               return -XFS_ERROR(EFAULT);
-
        memset((char *)&handle, 0, sizeof(handle));
 
        switch (cmd) {
        case XFS_IOC_PATH_TO_FSHANDLE:
        case XFS_IOC_PATH_TO_HANDLE: {
                struct path path;
-               int error = user_lpath((const char __user *)hreq.path, &path);
+               int error = user_lpath((const char __user *)hreq->path, &path);
                if (error)
                        return error;
 
@@ -101,7 +97,7 @@ xfs_find_handle(
        case XFS_IOC_FD_TO_HANDLE: {
                struct file     *file;
 
-               file = fget(hreq.fd);
+               file = fget(hreq->fd);
                if (!file)
                    return -EBADF;
 
@@ -158,8 +154,8 @@ xfs_find_handle(
        }
 
        /* now copy our handle into the user buffer & write out the size */
-       if (copy_to_user(hreq.ohandle, &handle, hsize) ||
-           copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
+       if (copy_to_user(hreq->ohandle, &handle, hsize) ||
+           copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
                iput(inode);
                return -XFS_ERROR(EFAULT);
        }
@@ -249,10 +245,10 @@ xfs_vget_fsop_handlereq(
        return 0;
 }
 
-STATIC int
+int
 xfs_open_by_handle(
        xfs_mount_t             *mp,
-       void                    __user *arg,
+       xfs_fsop_handlereq_t    *hreq,
        struct file             *parfilp,
        struct inode            *parinode)
 {
@@ -263,14 +259,11 @@ xfs_open_by_handle(
        struct file             *filp;
        struct inode            *inode;
        struct dentry           *dentry;
-       xfs_fsop_handlereq_t    hreq;
 
        if (!capable(CAP_SYS_ADMIN))
                return -XFS_ERROR(EPERM);
-       if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
 
-       error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode);
+       error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
        if (error)
                return -error;
 
@@ -281,10 +274,10 @@ xfs_open_by_handle(
        }
 
 #if BITS_PER_LONG != 32
-       hreq.oflags |= O_LARGEFILE;
+       hreq->oflags |= O_LARGEFILE;
 #endif
        /* Put open permission in namei format. */
-       permflag = hreq.oflags;
+       permflag = hreq->oflags;
        if ((permflag+1) & O_ACCMODE)
                permflag++;
        if (permflag & O_TRUNC)
@@ -322,15 +315,16 @@ xfs_open_by_handle(
        mntget(parfilp->f_path.mnt);
 
        /* Create file pointer. */
-       filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags, cred);
+       filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
        if (IS_ERR(filp)) {
                put_unused_fd(new_fd);
                return -XFS_ERROR(-PTR_ERR(filp));
        }
+
        if (inode->i_mode & S_IFREG) {
                /* invisible operation should not change atime */
                filp->f_flags |= O_NOATIME;
-               filp->f_op = &xfs_invis_file_operations;
+               filp->f_mode |= FMODE_NOCMTIME;
        }
 
        fd_install(new_fd, filp);
@@ -363,24 +357,21 @@ do_readlink(
 }
 
 
-STATIC int
+int
 xfs_readlink_by_handle(
        xfs_mount_t             *mp,
-       void                    __user *arg,
+       xfs_fsop_handlereq_t    *hreq,
        struct inode            *parinode)
 {
        struct inode            *inode;
-       xfs_fsop_handlereq_t    hreq;
        __u32                   olen;
        void                    *link;
        int                     error;
 
        if (!capable(CAP_SYS_ADMIN))
                return -XFS_ERROR(EPERM);
-       if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
 
-       error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode);
+       error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
        if (error)
                return -error;
 
@@ -390,7 +381,7 @@ xfs_readlink_by_handle(
                goto out_iput;
        }
 
-       if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) {
+       if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
                error = -XFS_ERROR(EFAULT);
                goto out_iput;
        }
@@ -402,7 +393,7 @@ xfs_readlink_by_handle(
        error = -xfs_readlink(XFS_I(inode), link);
        if (error)
                goto out_kfree;
-       error = do_readlink(hreq.ohandle, olen, link);
+       error = do_readlink(hreq->ohandle, olen, link);
        if (error)
                goto out_kfree;
 
@@ -501,7 +492,7 @@ xfs_attrlist_by_handle(
        return -error;
 }
 
-STATIC int
+int
 xfs_attrmulti_attr_get(
        struct inode            *inode,
        char                    *name,
@@ -530,7 +521,7 @@ xfs_attrmulti_attr_get(
        return error;
 }
 
-STATIC int
+int
 xfs_attrmulti_attr_set(
        struct inode            *inode,
        char                    *name,
@@ -560,7 +551,7 @@ xfs_attrmulti_attr_set(
        return error;
 }
 
-STATIC int
+int
 xfs_attrmulti_attr_remove(
        struct inode            *inode,
        char                    *name,
@@ -662,19 +653,26 @@ xfs_attrmulti_by_handle(
        return -error;
 }
 
-STATIC int
+int
 xfs_ioc_space(
        struct xfs_inode        *ip,
        struct inode            *inode,
        struct file             *filp,
        int                     ioflags,
        unsigned int            cmd,
-       void                    __user *arg)
+       xfs_flock64_t           *bf)
 {
-       xfs_flock64_t           bf;
        int                     attr_flags = 0;
        int                     error;
 
+       /*
+        * Only allow the sys admin to reserve space unless
+        * unwritten extents are enabled.
+        */
+       if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
+           !capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+
        if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
                return -XFS_ERROR(EPERM);
 
@@ -684,16 +682,12 @@ xfs_ioc_space(
        if (!S_ISREG(inode->i_mode))
                return -XFS_ERROR(EINVAL);
 
-       if (copy_from_user(&bf, arg, sizeof(bf)))
-               return -XFS_ERROR(EFAULT);
-
        if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
                attr_flags |= XFS_ATTR_NONBLOCK;
        if (ioflags & IO_INVIS)
                attr_flags |= XFS_ATTR_DMI;
 
-       error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos,
-                                             NULL, attr_flags);
+       error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
        return -error;
 }
 
@@ -1105,10 +1099,6 @@ xfs_ioctl_setattr(
 
        /*
         * Change file ownership.  Must be the owner or privileged.
-        * If the system was configured with the "restricted_chown"
-        * option, the owner is not permitted to give away the file,
-        * and can change the group id only to a group of which he
-        * or she is a member.
         */
        if (mask & FSX_PROJID) {
                /*
@@ -1137,7 +1127,7 @@ xfs_ioctl_setattr(
                         * the superblock version number since projids didn't
                         * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
                         */
-                       if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
+                       if (ip->i_d.di_version == 1)
                                xfs_bump_ino_vers2(tp, ip);
                }
 
@@ -1255,6 +1245,19 @@ xfs_ioc_setxflags(
        return -xfs_ioctl_setattr(ip, &fa, mask);
 }
 
+STATIC int
+xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
+{
+       struct getbmap __user   *base = *ap;
+
+       /* copy only getbmap portion (not getbmapx) */
+       if (copy_to_user(base, bmv, sizeof(struct getbmap)))
+               return XFS_ERROR(EFAULT);
+
+       *ap += sizeof(struct getbmap);
+       return 0;
+}
+
 STATIC int
 xfs_ioc_getbmap(
        struct xfs_inode        *ip,
@@ -1262,37 +1265,48 @@ xfs_ioc_getbmap(
        unsigned int            cmd,
        void                    __user *arg)
 {
-       struct getbmap          bm;
-       int                     iflags;
+       struct getbmapx         bmx;
        int                     error;
 
-       if (copy_from_user(&bm, arg, sizeof(bm)))
+       if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
                return -XFS_ERROR(EFAULT);
 
-       if (bm.bmv_count < 2)
+       if (bmx.bmv_count < 2)
                return -XFS_ERROR(EINVAL);
 
-       iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
+       bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
        if (ioflags & IO_INVIS)
-               iflags |= BMV_IF_NO_DMAPI_READ;
+               bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
 
-       error = xfs_getbmap(ip, &bm, (struct getbmap __user *)arg+1, iflags);
+       error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+                           (struct getbmap *)arg+1);
        if (error)
                return -error;
 
-       if (copy_to_user(arg, &bm, sizeof(bm)))
+       /* copy back header - only size of getbmap */
+       if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
                return -XFS_ERROR(EFAULT);
        return 0;
 }
 
+STATIC int
+xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
+{
+       struct getbmapx __user  *base = *ap;
+
+       if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
+               return XFS_ERROR(EFAULT);
+
+       *ap += sizeof(struct getbmapx);
+       return 0;
+}
+
 STATIC int
 xfs_ioc_getbmapx(
        struct xfs_inode        *ip,
        void                    __user *arg)
 {
        struct getbmapx         bmx;
-       struct getbmap          bm;
-       int                     iflags;
        int                     error;
 
        if (copy_from_user(&bmx, arg, sizeof(bmx)))
@@ -1301,46 +1315,46 @@ xfs_ioc_getbmapx(
        if (bmx.bmv_count < 2)
                return -XFS_ERROR(EINVAL);
 
-       /*
-        * Map input getbmapx structure to a getbmap
-        * structure for xfs_getbmap.
-        */
-       GETBMAP_CONVERT(bmx, bm);
-
-       iflags = bmx.bmv_iflags;
-
-       if (iflags & (~BMV_IF_VALID))
+       if (bmx.bmv_iflags & (~BMV_IF_VALID))
                return -XFS_ERROR(EINVAL);
 
-       iflags |= BMV_IF_EXTENDED;
-
-       error = xfs_getbmap(ip, &bm, (struct getbmapx __user *)arg+1, iflags);
+       error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
+                           (struct getbmapx *)arg+1);
        if (error)
                return -error;
 
-       GETBMAP_CONVERT(bm, bmx);
-
-       if (copy_to_user(arg, &bmx, sizeof(bmx)))
+       /* copy back header */
+       if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
                return -XFS_ERROR(EFAULT);
 
        return 0;
 }
 
-int
-xfs_ioctl(
-       xfs_inode_t             *ip,
+/*
+ * Note: some of the ioctl's return positive numbers as a
+ * byte count indicating success, such as readlink_by_handle.
+ * So we don't "sign flip" like most other routines.  This means
+ * true errors need to be returned as a negative value.
+ */
+long
+xfs_file_ioctl(
        struct file             *filp,
-       int                     ioflags,
        unsigned int            cmd,
-       void                    __user *arg)
+       unsigned long           p)
 {
        struct inode            *inode = filp->f_path.dentry->d_inode;
-       xfs_mount_t             *mp = ip->i_mount;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       void                    __user *arg = (void __user *)p;
+       int                     ioflags = 0;
        int                     error;
 
-       xfs_itrace_entry(XFS_I(inode));
-       switch (cmd) {
+       if (filp->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
 
+       xfs_itrace_entry(ip);
+
+       switch (cmd) {
        case XFS_IOC_ALLOCSP:
        case XFS_IOC_FREESP:
        case XFS_IOC_RESVSP:
@@ -1348,17 +1362,13 @@ xfs_ioctl(
        case XFS_IOC_ALLOCSP64:
        case XFS_IOC_FREESP64:
        case XFS_IOC_RESVSP64:
-       case XFS_IOC_UNRESVSP64:
-               /*
-                * Only allow the sys admin to reserve space unless
-                * unwritten extents are enabled.
-                */
-               if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
-                   !capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
-               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
+       case XFS_IOC_UNRESVSP64: {
+               xfs_flock64_t           bf;
 
+               if (copy_from_user(&bf, arg, sizeof(bf)))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+       }
        case XFS_IOC_DIOINFO: {
                struct dioattr  da;
                xfs_buftarg_t   *target =
@@ -1418,18 +1428,30 @@ xfs_ioctl(
 
        case XFS_IOC_FD_TO_HANDLE:
        case XFS_IOC_PATH_TO_HANDLE:
-       case XFS_IOC_PATH_TO_FSHANDLE:
-               return xfs_find_handle(cmd, arg);
+       case XFS_IOC_PATH_TO_FSHANDLE: {
+               xfs_fsop_handlereq_t    hreq;
 
-       case XFS_IOC_OPEN_BY_HANDLE:
-               return xfs_open_by_handle(mp, arg, filp, inode);
+               if (copy_from_user(&hreq, arg, sizeof(hreq)))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_find_handle(cmd, &hreq);
+       }
+       case XFS_IOC_OPEN_BY_HANDLE: {
+               xfs_fsop_handlereq_t    hreq;
 
+               if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_open_by_handle(mp, &hreq, filp, inode);
+       }
        case XFS_IOC_FSSETDM_BY_HANDLE:
                return xfs_fssetdm_by_handle(mp, arg, inode);
 
-       case XFS_IOC_READLINK_BY_HANDLE:
-               return xfs_readlink_by_handle(mp, arg, inode);
+       case XFS_IOC_READLINK_BY_HANDLE: {
+               xfs_fsop_handlereq_t    hreq;
 
+               if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_readlink_by_handle(mp, &hreq, inode);
+       }
        case XFS_IOC_ATTRLIST_BY_HANDLE:
                return xfs_attrlist_by_handle(mp, arg, inode);
 
@@ -1437,7 +1459,11 @@ xfs_ioctl(
                return xfs_attrmulti_by_handle(mp, arg, filp, inode);
 
        case XFS_IOC_SWAPEXT: {
-               error = xfs_swapext((struct xfs_swapext __user *)arg);
+               struct xfs_swapext      sxp;
+
+               if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
+                       return -XFS_ERROR(EFAULT);
+               error = xfs_swapext(&sxp);
                return -error;
        }
 
@@ -1493,9 +1519,6 @@ xfs_ioctl(
        case XFS_IOC_FSGROWFSDATA: {
                xfs_growfs_data_t in;
 
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
                if (copy_from_user(&in, arg, sizeof(in)))
                        return -XFS_ERROR(EFAULT);
 
@@ -1506,9 +1529,6 @@ xfs_ioctl(
        case XFS_IOC_FSGROWFSLOG: {
                xfs_growfs_log_t in;
 
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
                if (copy_from_user(&in, arg, sizeof(in)))
                        return -XFS_ERROR(EFAULT);
 
@@ -1519,9 +1539,6 @@ xfs_ioctl(
        case XFS_IOC_FSGROWFSRT: {
                xfs_growfs_rt_t in;
 
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
                if (copy_from_user(&in, arg, sizeof(in)))
                        return -XFS_ERROR(EFAULT);
 
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
new file mode 100644 (file)
index 0000000..8c16bf2
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IOCTL_H__
+#define __XFS_IOCTL_H__
+
+extern int
+xfs_ioc_space(
+       struct xfs_inode        *ip,
+       struct inode            *inode,
+       struct file             *filp,
+       int                     ioflags,
+       unsigned int            cmd,
+       xfs_flock64_t           *bf);
+
+extern int
+xfs_find_handle(
+       unsigned int            cmd,
+       xfs_fsop_handlereq_t    *hreq);
+
+extern int
+xfs_open_by_handle(
+       xfs_mount_t             *mp,
+       xfs_fsop_handlereq_t    *hreq,
+       struct file             *parfilp,
+       struct inode            *parinode);
+
+extern int
+xfs_readlink_by_handle(
+       xfs_mount_t             *mp,
+       xfs_fsop_handlereq_t    *hreq,
+       struct inode            *parinode);
+
+extern int
+xfs_attrmulti_attr_get(
+       struct inode            *inode,
+       char                    *name,
+       char                    __user *ubuf,
+       __uint32_t              *len,
+       __uint32_t              flags);
+
+extern int
+       xfs_attrmulti_attr_set(
+       struct inode            *inode,
+       char                    *name,
+       const char              __user *ubuf,
+       __uint32_t              len,
+       __uint32_t              flags);
+
+extern int
+xfs_attrmulti_attr_remove(
+       struct inode            *inode,
+       char                    *name,
+       __uint32_t              flags);
+
+extern long
+xfs_file_ioctl(
+       struct file             *filp,
+       unsigned int            cmd,
+       unsigned long           p);
+
+extern long
+xfs_file_compat_ioctl(
+       struct file             *file,
+       unsigned int            cmd,
+       unsigned long           arg);
+
+#endif
index a4b254eb43b2ec74151ada8ffb1e9fc2dd41cfab..0504cece9f6668768fb4c91b019d6f55438f6ba3 100644 (file)
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include <linux/compat.h>
-#include <linux/init.h>
 #include <linux/ioctl.h>
-#include <linux/syscalls.h>
-#include <linux/types.h>
-#include <linux/fs.h>
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -36,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dir2_sf.h"
-#include "xfs_vfs.h"
 #include "xfs_vnode.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_error.h"
 #include "xfs_dfrag.h"
 #include "xfs_vnodeops.h"
+#include "xfs_fsops.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_attr.h"
+#include "xfs_ioctl.h"
 #include "xfs_ioctl32.h"
 
 #define  _NATIVE_IOC(cmd, type) \
          _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
 
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
-#define BROKEN_X86_ALIGNMENT
-#define _PACKED __attribute__((packed))
-/* on ia32 l_start is on a 32-bit boundary */
-typedef struct xfs_flock64_32 {
-       __s16           l_type;
-       __s16           l_whence;
-       __s64           l_start __attribute__((packed));
-                       /* len == 0 means until end of file */
-       __s64           l_len __attribute__((packed));
-       __s32           l_sysid;
-       __u32           l_pid;
-       __s32           l_pad[4];       /* reserve area */
-} xfs_flock64_32_t;
-
-#define XFS_IOC_ALLOCSP_32     _IOW ('X', 10, struct xfs_flock64_32)
-#define XFS_IOC_FREESP_32      _IOW ('X', 11, struct xfs_flock64_32)
-#define XFS_IOC_ALLOCSP64_32   _IOW ('X', 36, struct xfs_flock64_32)
-#define XFS_IOC_FREESP64_32    _IOW ('X', 37, struct xfs_flock64_32)
-#define XFS_IOC_RESVSP_32      _IOW ('X', 40, struct xfs_flock64_32)
-#define XFS_IOC_UNRESVSP_32    _IOW ('X', 41, struct xfs_flock64_32)
-#define XFS_IOC_RESVSP64_32    _IOW ('X', 42, struct xfs_flock64_32)
-#define XFS_IOC_UNRESVSP64_32  _IOW ('X', 43, struct xfs_flock64_32)
-
-/* just account for different alignment */
-STATIC unsigned long
-xfs_ioctl32_flock(
-       unsigned long           arg)
+#ifdef BROKEN_X86_ALIGNMENT
+STATIC int
+xfs_compat_flock64_copyin(
+       xfs_flock64_t           *bf,
+       compat_xfs_flock64_t    __user *arg32)
 {
-       xfs_flock64_32_t        __user *p32 = (void __user *)arg;
-       xfs_flock64_t           __user *p = compat_alloc_user_space(sizeof(*p));
-
-       if (copy_in_user(&p->l_type,    &p32->l_type,   sizeof(s16)) ||
-           copy_in_user(&p->l_whence,  &p32->l_whence, sizeof(s16)) ||
-           copy_in_user(&p->l_start,   &p32->l_start,  sizeof(s64)) ||
-           copy_in_user(&p->l_len,     &p32->l_len,    sizeof(s64)) ||
-           copy_in_user(&p->l_sysid,   &p32->l_sysid,  sizeof(s32)) ||
-           copy_in_user(&p->l_pid,     &p32->l_pid,    sizeof(u32)) ||
-           copy_in_user(&p->l_pad,     &p32->l_pad,    4*sizeof(u32)))
-               return -EFAULT;
-
-       return (unsigned long)p;
+       if (get_user(bf->l_type,        &arg32->l_type) ||
+           get_user(bf->l_whence,      &arg32->l_whence) ||
+           get_user(bf->l_start,       &arg32->l_start) ||
+           get_user(bf->l_len,         &arg32->l_len) ||
+           get_user(bf->l_sysid,       &arg32->l_sysid) ||
+           get_user(bf->l_pid,         &arg32->l_pid) ||
+           copy_from_user(bf->l_pad,   &arg32->l_pad,  4*sizeof(u32)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
 }
 
-typedef struct compat_xfs_fsop_geom_v1 {
-       __u32           blocksize;      /* filesystem (data) block size */
-       __u32           rtextsize;      /* realtime extent size         */
-       __u32           agblocks;       /* fsblocks in an AG            */
-       __u32           agcount;        /* number of allocation groups  */
-       __u32           logblocks;      /* fsblocks in the log          */
-       __u32           sectsize;       /* (data) sector size, bytes    */
-       __u32           inodesize;      /* inode size in bytes          */
-       __u32           imaxpct;        /* max allowed inode space(%)   */
-       __u64           datablocks;     /* fsblocks in data subvolume   */
-       __u64           rtblocks;       /* fsblocks in realtime subvol  */
-       __u64           rtextents;      /* rt extents in realtime subvol*/
-       __u64           logstart;       /* starting fsblock of the log  */
-       unsigned char   uuid[16];       /* unique id of the filesystem  */
-       __u32           sunit;          /* stripe unit, fsblocks        */
-       __u32           swidth;         /* stripe width, fsblocks       */
-       __s32           version;        /* structure version            */
-       __u32           flags;          /* superblock version flags     */
-       __u32           logsectsize;    /* log sector size, bytes       */
-       __u32           rtsectsize;     /* realtime sector size, bytes  */
-       __u32           dirblocksize;   /* directory block size, bytes  */
-} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
-
-#define XFS_IOC_FSGEOMETRY_V1_32  \
-       _IOR ('X', 100, struct compat_xfs_fsop_geom_v1)
-
-STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg)
+STATIC int
+xfs_compat_ioc_fsgeometry_v1(
+       struct xfs_mount          *mp,
+       compat_xfs_fsop_geom_v1_t __user *arg32)
 {
-       compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg;
-       xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p));
+       xfs_fsop_geom_t           fsgeo;
+       int                       error;
 
-       if (copy_in_user(p, p32, sizeof(*p32)))
-               return -EFAULT;
-       return (unsigned long)p;
+       error = xfs_fs_geometry(mp, &fsgeo, 3);
+       if (error)
+               return -error;
+       /* The 32-bit variant simply has some padding at the end */
+       if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
 }
 
-typedef struct compat_xfs_inogrp {
-       __u64           xi_startino;    /* starting inode number        */
-       __s32           xi_alloccount;  /* # bits set in allocmask      */
-       __u64           xi_allocmask;   /* mask of allocated inodes     */
-} __attribute__((packed)) compat_xfs_inogrp_t;
-
-STATIC int xfs_inumbers_fmt_compat(
-       void __user *ubuffer,
-       const xfs_inogrp_t *buffer,
-       long count,
-       long *written)
+STATIC int
+xfs_compat_growfs_data_copyin(
+       struct xfs_growfs_data   *in,
+       compat_xfs_growfs_data_t __user *arg32)
 {
-       compat_xfs_inogrp_t __user *p32 = ubuffer;
-       long i;
+       if (get_user(in->newblocks, &arg32->newblocks) ||
+           get_user(in->imaxpct,   &arg32->imaxpct))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+STATIC int
+xfs_compat_growfs_rt_copyin(
+       struct xfs_growfs_rt     *in,
+       compat_xfs_growfs_rt_t  __user *arg32)
+{
+       if (get_user(in->newblocks, &arg32->newblocks) ||
+           get_user(in->extsize,   &arg32->extsize))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+STATIC int
+xfs_inumbers_fmt_compat(
+       void                    __user *ubuffer,
+       const xfs_inogrp_t      *buffer,
+       long                    count,
+       long                    *written)
+{
+       compat_xfs_inogrp_t     __user *p32 = ubuffer;
+       long                    i;
 
        for (i = 0; i < count; i++) {
                if (put_user(buffer[i].xi_startino,   &p32[i].xi_startino) ||
                    put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
                    put_user(buffer[i].xi_allocmask,  &p32[i].xi_allocmask))
-                       return -EFAULT;
+                       return -XFS_ERROR(EFAULT);
        }
        *written = count * sizeof(*p32);
        return 0;
 }
 
 #else
-
 #define xfs_inumbers_fmt_compat xfs_inumbers_fmt
-#define _PACKED
+#endif /* BROKEN_X86_ALIGNMENT */
 
-#endif
+STATIC int
+xfs_ioctl32_bstime_copyin(
+       xfs_bstime_t            *bstime,
+       compat_xfs_bstime_t     __user *bstime32)
+{
+       compat_time_t           sec32;  /* tv_sec differs on 64 vs. 32 */
 
-/* XFS_IOC_FSBULKSTAT and friends */
+       if (get_user(sec32,             &bstime32->tv_sec)      ||
+           get_user(bstime->tv_nsec,   &bstime32->tv_nsec))
+               return -XFS_ERROR(EFAULT);
+       bstime->tv_sec = sec32;
+       return 0;
+}
+
+/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
+STATIC int
+xfs_ioctl32_bstat_copyin(
+       xfs_bstat_t             *bstat,
+       compat_xfs_bstat_t      __user *bstat32)
+{
+       if (get_user(bstat->bs_ino,     &bstat32->bs_ino)       ||
+           get_user(bstat->bs_mode,    &bstat32->bs_mode)      ||
+           get_user(bstat->bs_nlink,   &bstat32->bs_nlink)     ||
+           get_user(bstat->bs_uid,     &bstat32->bs_uid)       ||
+           get_user(bstat->bs_gid,     &bstat32->bs_gid)       ||
+           get_user(bstat->bs_rdev,    &bstat32->bs_rdev)      ||
+           get_user(bstat->bs_blksize, &bstat32->bs_blksize)   ||
+           get_user(bstat->bs_size,    &bstat32->bs_size)      ||
+           xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
+           xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
+           xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
+           get_user(bstat->bs_blocks,  &bstat32->bs_size)      ||
+           get_user(bstat->bs_xflags,  &bstat32->bs_size)      ||
+           get_user(bstat->bs_extsize, &bstat32->bs_extsize)   ||
+           get_user(bstat->bs_extents, &bstat32->bs_extents)   ||
+           get_user(bstat->bs_gen,     &bstat32->bs_gen)       ||
+           get_user(bstat->bs_projid,  &bstat32->bs_projid)    ||
+           get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
+           get_user(bstat->bs_dmstate, &bstat32->bs_dmstate)   ||
+           get_user(bstat->bs_aextents, &bstat32->bs_aextents))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
 
-typedef struct compat_xfs_bstime {
-       __s32           tv_sec;         /* seconds              */
-       __s32           tv_nsec;        /* and nanoseconds      */
-} compat_xfs_bstime_t;
+/* XFS_IOC_FSBULKSTAT and friends */
 
-STATIC int xfs_bstime_store_compat(
-       compat_xfs_bstime_t __user *p32,
-       const xfs_bstime_t *p)
+STATIC int
+xfs_bstime_store_compat(
+       compat_xfs_bstime_t     __user *p32,
+       const xfs_bstime_t      *p)
 {
-       __s32 sec32;
+       __s32                   sec32;
 
        sec32 = p->tv_sec;
        if (put_user(sec32, &p32->tv_sec) ||
            put_user(p->tv_nsec, &p32->tv_nsec))
-               return -EFAULT;
+               return -XFS_ERROR(EFAULT);
        return 0;
 }
 
-typedef struct compat_xfs_bstat {
-       __u64           bs_ino;         /* inode number                 */
-       __u16           bs_mode;        /* type and mode                */
-       __u16           bs_nlink;       /* number of links              */
-       __u32           bs_uid;         /* user id                      */
-       __u32           bs_gid;         /* group id                     */
-       __u32           bs_rdev;        /* device value                 */
-       __s32           bs_blksize;     /* block size                   */
-       __s64           bs_size;        /* file size                    */
-       compat_xfs_bstime_t bs_atime;   /* access time                  */
-       compat_xfs_bstime_t bs_mtime;   /* modify time                  */
-       compat_xfs_bstime_t bs_ctime;   /* inode change time            */
-       int64_t         bs_blocks;      /* number of blocks             */
-       __u32           bs_xflags;      /* extended flags               */
-       __s32           bs_extsize;     /* extent size                  */
-       __s32           bs_extents;     /* number of extents            */
-       __u32           bs_gen;         /* generation count             */
-       __u16           bs_projid;      /* project id                   */
-       unsigned char   bs_pad[14];     /* pad space, unused            */
-       __u32           bs_dmevmask;    /* DMIG event mask              */
-       __u16           bs_dmstate;     /* DMIG state info              */
-       __u16           bs_aextents;    /* attribute number of extents  */
-} _PACKED compat_xfs_bstat_t;
-
-STATIC int xfs_bulkstat_one_fmt_compat(
+/* Return 0 on success or positive error (to xfs_bulkstat()) */
+STATIC int
+xfs_bulkstat_one_fmt_compat(
        void                    __user *ubuffer,
+       int                     ubsize,
+       int                     *ubused,
        const xfs_bstat_t       *buffer)
 {
-       compat_xfs_bstat_t __user *p32 = ubuffer;
-
-       if (put_user(buffer->bs_ino, &p32->bs_ino) ||
-           put_user(buffer->bs_mode, &p32->bs_mode) ||
-           put_user(buffer->bs_nlink, &p32->bs_nlink) ||
-           put_user(buffer->bs_uid, &p32->bs_uid) ||
-           put_user(buffer->bs_gid, &p32->bs_gid) ||
-           put_user(buffer->bs_rdev, &p32->bs_rdev) ||
-           put_user(buffer->bs_blksize, &p32->bs_blksize) ||
-           put_user(buffer->bs_size, &p32->bs_size) ||
+       compat_xfs_bstat_t      __user *p32 = ubuffer;
+
+       if (ubsize < sizeof(*p32))
+               return XFS_ERROR(ENOMEM);
+
+       if (put_user(buffer->bs_ino,      &p32->bs_ino)         ||
+           put_user(buffer->bs_mode,     &p32->bs_mode)        ||
+           put_user(buffer->bs_nlink,    &p32->bs_nlink)       ||
+           put_user(buffer->bs_uid,      &p32->bs_uid)         ||
+           put_user(buffer->bs_gid,      &p32->bs_gid)         ||
+           put_user(buffer->bs_rdev,     &p32->bs_rdev)        ||
+           put_user(buffer->bs_blksize,  &p32->bs_blksize)     ||
+           put_user(buffer->bs_size,     &p32->bs_size)        ||
            xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
            xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
            xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
-           put_user(buffer->bs_blocks, &p32->bs_blocks) ||
-           put_user(buffer->bs_xflags, &p32->bs_xflags) ||
-           put_user(buffer->bs_extsize, &p32->bs_extsize) ||
-           put_user(buffer->bs_extents, &p32->bs_extents) ||
-           put_user(buffer->bs_gen, &p32->bs_gen) ||
-           put_user(buffer->bs_projid, &p32->bs_projid) ||
-           put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
-           put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
+           put_user(buffer->bs_blocks,   &p32->bs_blocks)      ||
+           put_user(buffer->bs_xflags,   &p32->bs_xflags)      ||
+           put_user(buffer->bs_extsize,  &p32->bs_extsize)     ||
+           put_user(buffer->bs_extents,  &p32->bs_extents)     ||
+           put_user(buffer->bs_gen,      &p32->bs_gen)         ||
+           put_user(buffer->bs_projid,   &p32->bs_projid)      ||
+           put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)    ||
+           put_user(buffer->bs_dmstate,  &p32->bs_dmstate)     ||
            put_user(buffer->bs_aextents, &p32->bs_aextents))
-               return -EFAULT;
-       return sizeof(*p32);
+               return XFS_ERROR(EFAULT);
+       if (ubused)
+               *ubused = sizeof(*p32);
+       return 0;
 }
 
-
-
-typedef struct compat_xfs_fsop_bulkreq {
-       compat_uptr_t   lastip;         /* last inode # pointer         */
-       __s32           icount;         /* count of entries in buffer   */
-       compat_uptr_t   ubuffer;        /* user buffer for inode desc.  */
-       compat_uptr_t   ocount;         /* output count pointer         */
-} compat_xfs_fsop_bulkreq_t;
-
-#define XFS_IOC_FSBULKSTAT_32 \
-       _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
-#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
-       _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
-#define XFS_IOC_FSINUMBERS_32 \
-       _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
+STATIC int
+xfs_bulkstat_one_compat(
+       xfs_mount_t     *mp,            /* mount point for filesystem */
+       xfs_ino_t       ino,            /* inode number to get data for */
+       void            __user *buffer, /* buffer to place output in */
+       int             ubsize,         /* size of buffer */
+       void            *private_data,  /* my private data */
+       xfs_daddr_t     bno,            /* starting bno of inode cluster */
+       int             *ubused,        /* bytes used by me */
+       void            *dibuff,        /* on-disk inode buffer */
+       int             *stat)          /* BULKSTAT_RV_... */
+{
+       return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+                                   xfs_bulkstat_one_fmt_compat, bno,
+                                   ubused, dibuff, stat);
+}
 
 /* copied from xfs_ioctl.c */
 STATIC int
-xfs_ioc_bulkstat_compat(
-       xfs_mount_t             *mp,
-       unsigned int            cmd,
-       void                    __user *arg)
+xfs_compat_ioc_bulkstat(
+       xfs_mount_t               *mp,
+       unsigned int              cmd,
+       compat_xfs_fsop_bulkreq_t __user *p32)
 {
-       compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg;
        u32                     addr;
        xfs_fsop_bulkreq_t      bulkreq;
        int                     count;  /* # of records returned */
@@ -270,20 +263,20 @@ xfs_ioc_bulkstat_compat(
        /* should be called again (unused here, but used in dmapi) */
 
        if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
+               return -XFS_ERROR(EPERM);
 
        if (XFS_FORCED_SHUTDOWN(mp))
                return -XFS_ERROR(EIO);
 
        if (get_user(addr, &p32->lastip))
-               return -EFAULT;
+               return -XFS_ERROR(EFAULT);
        bulkreq.lastip = compat_ptr(addr);
        if (get_user(bulkreq.icount, &p32->icount) ||
            get_user(addr, &p32->ubuffer))
-               return -EFAULT;
+               return -XFS_ERROR(EFAULT);
        bulkreq.ubuffer = compat_ptr(addr);
        if (get_user(addr, &p32->ocount))
-               return -EFAULT;
+               return -XFS_ERROR(EFAULT);
        bulkreq.ocount = compat_ptr(addr);
 
        if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
@@ -295,17 +288,22 @@ xfs_ioc_bulkstat_compat(
        if (bulkreq.ubuffer == NULL)
                return -XFS_ERROR(EINVAL);
 
-       if (cmd == XFS_IOC_FSINUMBERS)
+       if (cmd == XFS_IOC_FSINUMBERS_32) {
                error = xfs_inumbers(mp, &inlast, &count,
                                bulkreq.ubuffer, xfs_inumbers_fmt_compat);
-       else {
-               /* declare a var to get a warning in case the type changes */
-               bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat;
+       } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
+               int res;
+
+               error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
+                               sizeof(compat_xfs_bstat_t),
+                               NULL, 0, NULL, NULL, &res);
+       } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
                error = xfs_bulkstat(mp, &inlast, &count,
-                       xfs_bulkstat_one, formatter,
+                       xfs_bulkstat_one_compat, NULL,
                        sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
                        BULKSTAT_FG_QUICK, &done);
-       }
+       } else
+               error = XFS_ERROR(EINVAL);
        if (error)
                return -error;
 
@@ -321,63 +319,306 @@ xfs_ioc_bulkstat_compat(
        return 0;
 }
 
+STATIC int
+xfs_compat_handlereq_copyin(
+       xfs_fsop_handlereq_t            *hreq,
+       compat_xfs_fsop_handlereq_t     __user *arg32)
+{
+       compat_xfs_fsop_handlereq_t     hreq32;
+
+       if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       hreq->fd = hreq32.fd;
+       hreq->path = compat_ptr(hreq32.path);
+       hreq->oflags = hreq32.oflags;
+       hreq->ihandle = compat_ptr(hreq32.ihandle);
+       hreq->ihandlen = hreq32.ihandlen;
+       hreq->ohandle = compat_ptr(hreq32.ohandle);
+       hreq->ohandlen = compat_ptr(hreq32.ohandlen);
 
+       return 0;
+}
 
-typedef struct compat_xfs_fsop_handlereq {
-       __u32           fd;             /* fd for FD_TO_HANDLE          */
-       compat_uptr_t   path;           /* user pathname                */
-       __u32           oflags;         /* open flags                   */
-       compat_uptr_t   ihandle;        /* user supplied handle         */
-       __u32           ihandlen;       /* user supplied length         */
-       compat_uptr_t   ohandle;        /* user buffer for handle       */
-       compat_uptr_t   ohandlen;       /* user buffer length           */
-} compat_xfs_fsop_handlereq_t;
-
-#define XFS_IOC_PATH_TO_FSHANDLE_32 \
-       _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_PATH_TO_HANDLE_32 \
-       _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_FD_TO_HANDLE_32 \
-       _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_OPEN_BY_HANDLE_32 \
-       _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_READLINK_BY_HANDLE_32 \
-       _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
-
-STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg)
+/*
+ * Convert userspace handle data into inode.
+ *
+ * We use the fact that all the fsop_handlereq ioctl calls have a data
+ * structure argument whose first component is always a xfs_fsop_handlereq_t,
+ * so we can pass that sub structure into this handy, shared routine.
+ *
+ * If no error, caller must always iput the returned inode.
+ */
+STATIC int
+xfs_vget_fsop_handlereq_compat(
+       xfs_mount_t             *mp,
+       struct inode            *parinode,      /* parent inode pointer    */
+       compat_xfs_fsop_handlereq_t     *hreq,
+       struct inode            **inode)
 {
-       compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg;
-       xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p));
-       u32 addr;
-
-       if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) ||
-           get_user(addr, &p32->path) ||
-           put_user(compat_ptr(addr), &p->path) ||
-           copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) ||
-           get_user(addr, &p32->ihandle) ||
-           put_user(compat_ptr(addr), &p->ihandle) ||
-           copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) ||
-           get_user(addr, &p32->ohandle) ||
-           put_user(compat_ptr(addr), &p->ohandle) ||
-           get_user(addr, &p32->ohandlen) ||
-           put_user(compat_ptr(addr), &p->ohandlen))
-               return -EFAULT;
-
-       return (unsigned long)p;
+       void                    __user *hanp;
+       size_t                  hlen;
+       xfs_fid_t               *xfid;
+       xfs_handle_t            *handlep;
+       xfs_handle_t            handle;
+       xfs_inode_t             *ip;
+       xfs_ino_t               ino;
+       __u32                   igen;
+       int                     error;
+
+       /*
+        * Only allow handle opens under a directory.
+        */
+       if (!S_ISDIR(parinode->i_mode))
+               return XFS_ERROR(ENOTDIR);
+
+       hanp = compat_ptr(hreq->ihandle);
+       hlen = hreq->ihandlen;
+       handlep = &handle;
+
+       if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
+               return XFS_ERROR(EINVAL);
+       if (copy_from_user(handlep, hanp, hlen))
+               return XFS_ERROR(EFAULT);
+       if (hlen < sizeof(*handlep))
+               memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
+       if (hlen > sizeof(handlep->ha_fsid)) {
+               if (handlep->ha_fid.fid_len !=
+                   (hlen - sizeof(handlep->ha_fsid) -
+                           sizeof(handlep->ha_fid.fid_len)) ||
+                   handlep->ha_fid.fid_pad)
+                       return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * Crack the handle, obtain the inode # & generation #
+        */
+       xfid = (struct xfs_fid *)&handlep->ha_fid;
+       if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
+               ino  = xfid->fid_ino;
+               igen = xfid->fid_gen;
+       } else {
+               return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * Get the XFS inode, building a Linux inode to go with it.
+        */
+       error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
+       if (error)
+               return error;
+       if (ip == NULL)
+               return XFS_ERROR(EIO);
+       if (ip->i_d.di_gen != igen) {
+               xfs_iput_new(ip, XFS_ILOCK_SHARED);
+               return XFS_ERROR(ENOENT);
+       }
+
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+       *inode = VFS_I(ip);
+       return 0;
 }
 
+STATIC int
+xfs_compat_attrlist_by_handle(
+       xfs_mount_t             *mp,
+       void                    __user *arg,
+       struct inode            *parinode)
+{
+       int                     error;
+       attrlist_cursor_kern_t  *cursor;
+       compat_xfs_fsop_attrlist_handlereq_t al_hreq;
+       struct inode            *inode;
+       char                    *kbuf;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&al_hreq, arg,
+                          sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+       if (al_hreq.buflen > XATTR_LIST_MAX)
+               return -XFS_ERROR(EINVAL);
+
+       /*
+        * Reject flags, only allow namespaces.
+        */
+       if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+               return -XFS_ERROR(EINVAL);
+
+       error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
+                                              &inode);
+       if (error)
+               goto out;
+
+       kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+       if (!kbuf)
+               goto out_vn_rele;
+
+       cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+       error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
+                                       al_hreq.flags, cursor);
+       if (error)
+               goto out_kfree;
+
+       if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
+               error = -EFAULT;
+
+ out_kfree:
+       kfree(kbuf);
+ out_vn_rele:
+       iput(inode);
+ out:
+       return -error;
+}
 
-STATIC long
-xfs_compat_ioctl(
-       int             mode,
-       struct file     *file,
-       unsigned        cmd,
-       unsigned long   arg)
+STATIC int
+xfs_compat_attrmulti_by_handle(
+       xfs_mount_t                             *mp,
+       void                                    __user *arg,
+       struct inode                            *parinode)
+{
+       int                                     error;
+       compat_xfs_attr_multiop_t               *ops;
+       compat_xfs_fsop_attrmulti_handlereq_t   am_hreq;
+       struct inode                            *inode;
+       unsigned int                            i, size;
+       char                                    *attr_name;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&am_hreq, arg,
+                          sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
+                                              &inode);
+       if (error)
+               goto out;
+
+       error = E2BIG;
+       size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
+       if (!size || size > 16 * PAGE_SIZE)
+               goto out_vn_rele;
+
+       error = ENOMEM;
+       ops = kmalloc(size, GFP_KERNEL);
+       if (!ops)
+               goto out_vn_rele;
+
+       error = EFAULT;
+       if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
+               goto out_kfree_ops;
+
+       attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
+       if (!attr_name)
+               goto out_kfree_ops;
+
+
+       error = 0;
+       for (i = 0; i < am_hreq.opcount; i++) {
+               ops[i].am_error = strncpy_from_user(attr_name,
+                               compat_ptr(ops[i].am_attrname),
+                               MAXNAMELEN);
+               if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
+                       error = -ERANGE;
+               if (ops[i].am_error < 0)
+                       break;
+
+               switch (ops[i].am_opcode) {
+               case ATTR_OP_GET:
+                       ops[i].am_error = xfs_attrmulti_attr_get(inode,
+                                       attr_name,
+                                       compat_ptr(ops[i].am_attrvalue),
+                                       &ops[i].am_length, ops[i].am_flags);
+                       break;
+               case ATTR_OP_SET:
+                       ops[i].am_error = xfs_attrmulti_attr_set(inode,
+                                       attr_name,
+                                       compat_ptr(ops[i].am_attrvalue),
+                                       ops[i].am_length, ops[i].am_flags);
+                       break;
+               case ATTR_OP_REMOVE:
+                       ops[i].am_error = xfs_attrmulti_attr_remove(inode,
+                                       attr_name, ops[i].am_flags);
+                       break;
+               default:
+                       ops[i].am_error = EINVAL;
+               }
+       }
+
+       if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
+               error = XFS_ERROR(EFAULT);
+
+       kfree(attr_name);
+ out_kfree_ops:
+       kfree(ops);
+ out_vn_rele:
+       iput(inode);
+ out:
+       return -error;
+}
+
+STATIC int
+xfs_compat_fssetdm_by_handle(
+       xfs_mount_t             *mp,
+       void                    __user *arg,
+       struct inode            *parinode)
+{
+       int                     error;
+       struct fsdmidata        fsd;
+       compat_xfs_fsop_setdm_handlereq_t dmhreq;
+       struct inode            *inode;
+
+       if (!capable(CAP_MKNOD))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&dmhreq, arg,
+                          sizeof(compat_xfs_fsop_setdm_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
+                                              &inode);
+       if (error)
+               return -error;
+
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
+               error = -XFS_ERROR(EPERM);
+               goto out;
+       }
+
+       if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
+               error = -XFS_ERROR(EFAULT);
+               goto out;
+       }
+
+       error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
+                                fsd.fsd_dmstate);
+
+out:
+       iput(inode);
+       return error;
+}
+
+long
+xfs_file_compat_ioctl(
+       struct file             *filp,
+       unsigned                cmd,
+       unsigned long           p)
 {
-       struct inode    *inode = file->f_path.dentry->d_inode;
-       int             error;
+       struct inode            *inode = filp->f_path.dentry->d_inode;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       void                    __user *arg = (void __user *)p;
+       int                     ioflags = 0;
+       int                     error;
+
+       if (filp->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
+
+       xfs_itrace_entry(ip);
 
        switch (cmd) {
+       /* No size or alignment issues on any arch */
        case XFS_IOC_DIOINFO:
        case XFS_IOC_FSGEOMETRY:
        case XFS_IOC_FSGETXATTR:
@@ -387,48 +628,18 @@ xfs_compat_ioctl(
        case XFS_IOC_GETBMAP:
        case XFS_IOC_GETBMAPA:
        case XFS_IOC_GETBMAPX:
-/* not handled
-       case XFS_IOC_FSSETDM_BY_HANDLE:
-       case XFS_IOC_ATTRLIST_BY_HANDLE:
-       case XFS_IOC_ATTRMULTI_BY_HANDLE:
-*/
        case XFS_IOC_FSCOUNTS:
        case XFS_IOC_SET_RESBLKS:
        case XFS_IOC_GET_RESBLKS:
-       case XFS_IOC_FSGROWFSDATA:
        case XFS_IOC_FSGROWFSLOG:
-       case XFS_IOC_FSGROWFSRT:
        case XFS_IOC_FREEZE:
        case XFS_IOC_THAW:
        case XFS_IOC_GOINGDOWN:
        case XFS_IOC_ERROR_INJECTION:
        case XFS_IOC_ERROR_CLEARALL:
-               break;
-
-       case XFS_IOC32_GETXFLAGS:
-       case XFS_IOC32_SETXFLAGS:
-       case XFS_IOC32_GETVERSION:
-               cmd = _NATIVE_IOC(cmd, long);
-               break;
-#ifdef BROKEN_X86_ALIGNMENT
-       /* xfs_flock_t has wrong u32 vs u64 alignment */
-       case XFS_IOC_ALLOCSP_32:
-       case XFS_IOC_FREESP_32:
-       case XFS_IOC_ALLOCSP64_32:
-       case XFS_IOC_FREESP64_32:
-       case XFS_IOC_RESVSP_32:
-       case XFS_IOC_UNRESVSP_32:
-       case XFS_IOC_RESVSP64_32:
-       case XFS_IOC_UNRESVSP64_32:
-               arg = xfs_ioctl32_flock(arg);
-               cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
-               break;
-       case XFS_IOC_FSGEOMETRY_V1_32:
-               arg = xfs_ioctl32_geom_v1(arg);
-               cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1);
-               break;
-
-#else /* These are handled fine if no alignment issues */
+               return xfs_file_ioctl(filp, cmd, p);
+#ifndef BROKEN_X86_ALIGNMENT
+       /* These are handled fine if no alignment issues */
        case XFS_IOC_ALLOCSP:
        case XFS_IOC_FREESP:
        case XFS_IOC_RESVSP:
@@ -438,51 +649,97 @@ xfs_compat_ioctl(
        case XFS_IOC_RESVSP64:
        case XFS_IOC_UNRESVSP64:
        case XFS_IOC_FSGEOMETRY_V1:
-               break;
+       case XFS_IOC_FSGROWFSDATA:
+       case XFS_IOC_FSGROWFSRT:
+               return xfs_file_ioctl(filp, cmd, p);
+#else
+       case XFS_IOC_ALLOCSP_32:
+       case XFS_IOC_FREESP_32:
+       case XFS_IOC_ALLOCSP64_32:
+       case XFS_IOC_FREESP64_32:
+       case XFS_IOC_RESVSP_32:
+       case XFS_IOC_UNRESVSP_32:
+       case XFS_IOC_RESVSP64_32:
+       case XFS_IOC_UNRESVSP64_32: {
+               struct xfs_flock64      bf;
 
-       /* xfs_bstat_t still has wrong u32 vs u64 alignment */
-       case XFS_IOC_SWAPEXT:
-               break;
+               if (xfs_compat_flock64_copyin(&bf, arg))
+                       return -XFS_ERROR(EFAULT);
+               cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
+               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+       }
+       case XFS_IOC_FSGEOMETRY_V1_32:
+               return xfs_compat_ioc_fsgeometry_v1(mp, arg);
+       case XFS_IOC_FSGROWFSDATA_32: {
+               struct xfs_growfs_data  in;
+
+               if (xfs_compat_growfs_data_copyin(&in, arg))
+                       return -XFS_ERROR(EFAULT);
+               error = xfs_growfs_data(mp, &in);
+               return -error;
+       }
+       case XFS_IOC_FSGROWFSRT_32: {
+               struct xfs_growfs_rt    in;
 
+               if (xfs_compat_growfs_rt_copyin(&in, arg))
+                       return -XFS_ERROR(EFAULT);
+               error = xfs_growfs_rt(mp, &in);
+               return -error;
+       }
 #endif
+       /* long changes size, but xfs only copiese out 32 bits */
+       case XFS_IOC_GETXFLAGS_32:
+       case XFS_IOC_SETXFLAGS_32:
+       case XFS_IOC_GETVERSION_32:
+               cmd = _NATIVE_IOC(cmd, long);
+               return xfs_file_ioctl(filp, cmd, p);
+       case XFS_IOC_SWAPEXT: {
+               struct xfs_swapext        sxp;
+               struct compat_xfs_swapext __user *sxu = arg;
+
+               /* Bulk copy in up to the sx_stat field, then copy bstat */
+               if (copy_from_user(&sxp, sxu,
+                                  offsetof(struct xfs_swapext, sx_stat)) ||
+                   xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
+                       return -XFS_ERROR(EFAULT);
+               error = xfs_swapext(&sxp);
+               return -error;
+       }
        case XFS_IOC_FSBULKSTAT_32:
        case XFS_IOC_FSBULKSTAT_SINGLE_32:
        case XFS_IOC_FSINUMBERS_32:
-               cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq);
-               return xfs_ioc_bulkstat_compat(XFS_I(inode)->i_mount,
-                               cmd, (void __user*)arg);
+               return xfs_compat_ioc_bulkstat(mp, cmd, arg);
        case XFS_IOC_FD_TO_HANDLE_32:
        case XFS_IOC_PATH_TO_HANDLE_32:
-       case XFS_IOC_PATH_TO_FSHANDLE_32:
-       case XFS_IOC_OPEN_BY_HANDLE_32:
-       case XFS_IOC_READLINK_BY_HANDLE_32:
-               arg = xfs_ioctl32_fshandle(arg);
+       case XFS_IOC_PATH_TO_FSHANDLE_32: {
+               struct xfs_fsop_handlereq       hreq;
+
+               if (xfs_compat_handlereq_copyin(&hreq, arg))
+                       return -XFS_ERROR(EFAULT);
                cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
-               break;
-       default:
-               return -ENOIOCTLCMD;
+               return xfs_find_handle(cmd, &hreq);
        }
+       case XFS_IOC_OPEN_BY_HANDLE_32: {
+               struct xfs_fsop_handlereq       hreq;
 
-       error = xfs_ioctl(XFS_I(inode), file, mode, cmd, (void __user *)arg);
-       xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
-
-       return error;
-}
-
-long
-xfs_file_compat_ioctl(
-       struct file             *file,
-       unsigned                cmd,
-       unsigned long           arg)
-{
-       return xfs_compat_ioctl(0, file, cmd, arg);
-}
+               if (xfs_compat_handlereq_copyin(&hreq, arg))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_open_by_handle(mp, &hreq, filp, inode);
+       }
+       case XFS_IOC_READLINK_BY_HANDLE_32: {
+               struct xfs_fsop_handlereq       hreq;
 
-long
-xfs_file_compat_invis_ioctl(
-       struct file             *file,
-       unsigned                cmd,
-       unsigned long           arg)
-{
-       return xfs_compat_ioctl(IO_INVIS, file, cmd, arg);
+               if (xfs_compat_handlereq_copyin(&hreq, arg))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_readlink_by_handle(mp, &hreq, inode);
+       }
+       case XFS_IOC_ATTRLIST_BY_HANDLE_32:
+               return xfs_compat_attrlist_by_handle(mp, arg, inode);
+       case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
+               return xfs_compat_attrmulti_by_handle(mp, arg, inode);
+       case XFS_IOC_FSSETDM_BY_HANDLE_32:
+               return xfs_compat_fssetdm_by_handle(mp, arg, inode);
+       default:
+               return -XFS_ERROR(ENOIOCTLCMD);
+       }
 }
index 02de6e62ee370aaef2ef9aca30d16b1f5e3af47f..1024c4f8ba0d4501d296d3076c9d6ada00dda9d8 100644 (file)
 #ifndef __XFS_IOCTL32_H__
 #define __XFS_IOCTL32_H__
 
-extern long xfs_file_compat_ioctl(struct file *, unsigned, unsigned long);
-extern long xfs_file_compat_invis_ioctl(struct file *, unsigned, unsigned long);
+#include <linux/compat.h>
+
+/*
+ * on 32-bit arches, ioctl argument structures may have different sizes
+ * and/or alignment.  We define compat structures which match the
+ * 32-bit sizes/alignments here, and their associated ioctl numbers.
+ *
+ * xfs_ioctl32.c contains routines to copy these structures in and out.
+ */
+
+/* stock kernel-level ioctls we support */
+#define XFS_IOC_GETXFLAGS_32   FS_IOC32_GETFLAGS
+#define XFS_IOC_SETXFLAGS_32   FS_IOC32_SETFLAGS
+#define XFS_IOC_GETVERSION_32  FS_IOC32_GETVERSION
+
+/*
+ * On intel, even if sizes match, alignment and/or padding may differ.
+ */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#define BROKEN_X86_ALIGNMENT
+#define __compat_packed __attribute__((packed))
+#else
+#define __compat_packed
+#endif
+
+typedef struct compat_xfs_bstime {
+       compat_time_t   tv_sec;         /* seconds              */
+       __s32           tv_nsec;        /* and nanoseconds      */
+} compat_xfs_bstime_t;
+
+typedef struct compat_xfs_bstat {
+       __u64           bs_ino;         /* inode number                 */
+       __u16           bs_mode;        /* type and mode                */
+       __u16           bs_nlink;       /* number of links              */
+       __u32           bs_uid;         /* user id                      */
+       __u32           bs_gid;         /* group id                     */
+       __u32           bs_rdev;        /* device value                 */
+       __s32           bs_blksize;     /* block size                   */
+       __s64           bs_size;        /* file size                    */
+       compat_xfs_bstime_t bs_atime;   /* access time                  */
+       compat_xfs_bstime_t bs_mtime;   /* modify time                  */
+       compat_xfs_bstime_t bs_ctime;   /* inode change time            */
+       int64_t         bs_blocks;      /* number of blocks             */
+       __u32           bs_xflags;      /* extended flags               */
+       __s32           bs_extsize;     /* extent size                  */
+       __s32           bs_extents;     /* number of extents            */
+       __u32           bs_gen;         /* generation count             */
+       __u16           bs_projid;      /* project id                   */
+       unsigned char   bs_pad[14];     /* pad space, unused            */
+       __u32           bs_dmevmask;    /* DMIG event mask              */
+       __u16           bs_dmstate;     /* DMIG state info              */
+       __u16           bs_aextents;    /* attribute number of extents  */
+} __compat_packed compat_xfs_bstat_t;
+
+typedef struct compat_xfs_fsop_bulkreq {
+       compat_uptr_t   lastip;         /* last inode # pointer         */
+       __s32           icount;         /* count of entries in buffer   */
+       compat_uptr_t   ubuffer;        /* user buffer for inode desc.  */
+       compat_uptr_t   ocount;         /* output count pointer         */
+} compat_xfs_fsop_bulkreq_t;
+
+#define XFS_IOC_FSBULKSTAT_32 \
+       _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
+       _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS_32 \
+       _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
+
+typedef struct compat_xfs_fsop_handlereq {
+       __u32           fd;             /* fd for FD_TO_HANDLE          */
+       compat_uptr_t   path;           /* user pathname                */
+       __u32           oflags;         /* open flags                   */
+       compat_uptr_t   ihandle;        /* user supplied handle         */
+       __u32           ihandlen;       /* user supplied length         */
+       compat_uptr_t   ohandle;        /* user buffer for handle       */
+       compat_uptr_t   ohandlen;       /* user buffer length           */
+} compat_xfs_fsop_handlereq_t;
+
+#define XFS_IOC_PATH_TO_FSHANDLE_32 \
+       _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE_32 \
+       _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE_32 \
+       _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE_32 \
+       _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE_32 \
+       _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
+
+/* The bstat field in the swapext struct needs translation */
+typedef struct compat_xfs_swapext {
+       __int64_t               sx_version;     /* version */
+       __int64_t               sx_fdtarget;    /* fd of target file */
+       __int64_t               sx_fdtmp;       /* fd of tmp file */
+       xfs_off_t               sx_offset;      /* offset into file */
+       xfs_off_t               sx_length;      /* leng from offset */
+       char                    sx_pad[16];     /* pad space, unused */
+       compat_xfs_bstat_t      sx_stat;        /* stat of target b4 copy */
+} __compat_packed compat_xfs_swapext_t;
+
+#define XFS_IOC_SWAPEXT_32     _IOWR('X', 109, struct compat_xfs_swapext)
+
+typedef struct compat_xfs_fsop_attrlist_handlereq {
+       struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+       struct xfs_attrlist_cursor      pos; /* opaque cookie, list offset */
+       __u32                           flags;  /* which namespace to use */
+       __u32                           buflen; /* length of buffer supplied */
+       compat_uptr_t                   buffer; /* returned names */
+} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
+
+/* Note: actually this is read/write */
+#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
+       _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
+
+/* am_opcodes defined in xfs_fs.h */
+typedef struct compat_xfs_attr_multiop {
+       __u32           am_opcode;
+       __s32           am_error;
+       compat_uptr_t   am_attrname;
+       compat_uptr_t   am_attrvalue;
+       __u32           am_length;
+       __u32           am_flags;
+} compat_xfs_attr_multiop_t;
+
+typedef struct compat_xfs_fsop_attrmulti_handlereq {
+       struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+       __u32                           opcount;/* count of following multiop */
+       /* ptr to compat_xfs_attr_multiop */
+       compat_uptr_t                   ops; /* attr_multi data */
+} compat_xfs_fsop_attrmulti_handlereq_t;
+
+#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
+       _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
+
+typedef struct compat_xfs_fsop_setdm_handlereq {
+       struct compat_xfs_fsop_handlereq hreq;  /* handle information   */
+       /* ptr to struct fsdmidata */
+       compat_uptr_t                   data;   /* DMAPI data   */
+} compat_xfs_fsop_setdm_handlereq_t;
+
+#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
+       _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
+
+#ifdef BROKEN_X86_ALIGNMENT
+/* on ia32 l_start is on a 32-bit boundary */
+typedef struct compat_xfs_flock64 {
+       __s16           l_type;
+       __s16           l_whence;
+       __s64           l_start __attribute__((packed));
+                       /* len == 0 means until end of file */
+       __s64           l_len __attribute__((packed));
+       __s32           l_sysid;
+       __u32           l_pid;
+       __s32           l_pad[4];       /* reserve area */
+} compat_xfs_flock64_t;
+
+#define XFS_IOC_ALLOCSP_32     _IOW('X', 10, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP_32      _IOW('X', 11, struct compat_xfs_flock64)
+#define XFS_IOC_ALLOCSP64_32   _IOW('X', 36, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP64_32    _IOW('X', 37, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP_32      _IOW('X', 40, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP_32    _IOW('X', 41, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP64_32    _IOW('X', 42, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP64_32  _IOW('X', 43, struct compat_xfs_flock64)
+
+typedef struct compat_xfs_fsop_geom_v1 {
+       __u32           blocksize;      /* filesystem (data) block size */
+       __u32           rtextsize;      /* realtime extent size         */
+       __u32           agblocks;       /* fsblocks in an AG            */
+       __u32           agcount;        /* number of allocation groups  */
+       __u32           logblocks;      /* fsblocks in the log          */
+       __u32           sectsize;       /* (data) sector size, bytes    */
+       __u32           inodesize;      /* inode size in bytes          */
+       __u32           imaxpct;        /* max allowed inode space(%)   */
+       __u64           datablocks;     /* fsblocks in data subvolume   */
+       __u64           rtblocks;       /* fsblocks in realtime subvol  */
+       __u64           rtextents;      /* rt extents in realtime subvol*/
+       __u64           logstart;       /* starting fsblock of the log  */
+       unsigned char   uuid[16];       /* unique id of the filesystem  */
+       __u32           sunit;          /* stripe unit, fsblocks        */
+       __u32           swidth;         /* stripe width, fsblocks       */
+       __s32           version;        /* structure version            */
+       __u32           flags;          /* superblock version flags     */
+       __u32           logsectsize;    /* log sector size, bytes       */
+       __u32           rtsectsize;     /* realtime sector size, bytes  */
+       __u32           dirblocksize;   /* directory block size, bytes  */
+} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
+
+#define XFS_IOC_FSGEOMETRY_V1_32  \
+       _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
+
+typedef struct compat_xfs_inogrp {
+       __u64           xi_startino;    /* starting inode number        */
+       __s32           xi_alloccount;  /* # bits set in allocmask      */
+       __u64           xi_allocmask;   /* mask of allocated inodes     */
+} __attribute__((packed)) compat_xfs_inogrp_t;
+
+/* These growfs input structures have padding on the end, so must translate */
+typedef struct compat_xfs_growfs_data {
+       __u64           newblocks;      /* new data subvol size, fsblocks */
+       __u32           imaxpct;        /* new inode space percentage limit */
+} __attribute__((packed)) compat_xfs_growfs_data_t;
+
+typedef struct compat_xfs_growfs_rt {
+       __u64           newblocks;      /* new realtime size, fsblocks */
+       __u32           extsize;        /* new realtime extent size, fsblocks */
+} __attribute__((packed)) compat_xfs_growfs_rt_t;
+
+#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
+#define XFS_IOC_FSGROWFSRT_32   _IOW('X', 112, struct compat_xfs_growfs_rt)
+
+#endif /* BROKEN_X86_ALIGNMENT */
 
 #endif /* __XFS_IOCTL32_H__ */
index 095d271f34349e01175eecd3e4c362dbcc1dccce..7aa53fefc67fafe14bb54db40398f79945f1baae 100644 (file)
@@ -53,6 +53,7 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/falloc.h>
+#include <linux/fiemap.h>
 
 /*
  * Bring the atime in the XFS inode uptodate.
@@ -64,14 +65,14 @@ xfs_synchronize_atime(
 {
        struct inode    *inode = VFS_I(ip);
 
-       if (inode) {
+       if (!(inode->i_state & I_CLEAR)) {
                ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
                ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
        }
 }
 
 /*
- * If the linux inode exists, mark it dirty.
+ * If the linux inode is valid, mark it dirty.
  * Used when commiting a dirty inode into a transaction so that
  * the inode will get written back by the linux code
  */
@@ -81,7 +82,7 @@ xfs_mark_inode_dirty_sync(
 {
        struct inode    *inode = VFS_I(ip);
 
-       if (inode)
+       if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
                mark_inode_dirty_sync(inode);
 }
 
@@ -128,7 +129,7 @@ xfs_ichgtime(
        if (sync_it) {
                SYNCHRONIZE();
                ip->i_update_core = 1;
-               mark_inode_dirty_sync(inode);
+               xfs_mark_inode_dirty_sync(ip);
        }
 }
 
@@ -158,8 +159,6 @@ xfs_init_security(
        }
 
        error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
-       if (!error)
-               xfs_iflags_set(ip, XFS_IMODIFIED);
 
        kfree(name);
        kfree(value);
@@ -260,7 +259,6 @@ xfs_vn_mknod(
                error = _ACL_INHERIT(inode, mode, default_acl);
                if (unlikely(error))
                        goto out_cleanup_inode;
-               xfs_iflags_set(ip, XFS_IMODIFIED);
                _ACL_FREE(default_acl);
        }
 
@@ -366,21 +364,17 @@ xfs_vn_link(
        struct inode    *dir,
        struct dentry   *dentry)
 {
-       struct inode    *inode; /* inode of guy being linked to */
+       struct inode    *inode = old_dentry->d_inode;
        struct xfs_name name;
        int             error;
 
-       inode = old_dentry->d_inode;
        xfs_dentry_to_name(&name, dentry);
 
-       igrab(inode);
        error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
-       if (unlikely(error)) {
-               iput(inode);
+       if (unlikely(error))
                return -error;
-       }
 
-       xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
+       atomic_inc(&inode->i_count);
        d_instantiate(dentry, inode);
        return 0;
 }
@@ -601,7 +595,7 @@ xfs_vn_setattr(
        struct dentry   *dentry,
        struct iattr    *iattr)
 {
-       return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL);
+       return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
 }
 
 /*
@@ -642,7 +636,7 @@ xfs_vn_fallocate(
 
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
        error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
-                                     0, NULL, XFS_ATTR_NOLOCK);
+                                     0, XFS_ATTR_NOLOCK);
        if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
            offset + len > i_size_read(inode))
                new_size = offset + len;
@@ -653,7 +647,7 @@ xfs_vn_fallocate(
 
                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = new_size;
-               error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL);
+               error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
        }
 
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -661,6 +655,88 @@ out_error:
        return error;
 }
 
+#define XFS_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+/*
+ * Call fiemap helper to fill in user data.
+ * Returns positive errors to xfs_getbmap.
+ */
+STATIC int
+xfs_fiemap_format(
+       void                    **arg,
+       struct getbmapx         *bmv,
+       int                     *full)
+{
+       int                     error;
+       struct fiemap_extent_info *fieinfo = *arg;
+       u32                     fiemap_flags = 0;
+       u64                     logical, physical, length;
+
+       /* Do nothing for a hole */
+       if (bmv->bmv_block == -1LL)
+               return 0;
+
+       logical = BBTOB(bmv->bmv_offset);
+       physical = BBTOB(bmv->bmv_block);
+       length = BBTOB(bmv->bmv_length);
+
+       if (bmv->bmv_oflags & BMV_OF_PREALLOC)
+               fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
+       else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
+               fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+               physical = 0;   /* no block yet */
+       }
+       if (bmv->bmv_oflags & BMV_OF_LAST)
+               fiemap_flags |= FIEMAP_EXTENT_LAST;
+
+       error = fiemap_fill_next_extent(fieinfo, logical, physical,
+                                       length, fiemap_flags);
+       if (error > 0) {
+               error = 0;
+               *full = 1;      /* user array now full */
+       }
+
+       return -error;
+}
+
+STATIC int
+xfs_vn_fiemap(
+       struct inode            *inode,
+       struct fiemap_extent_info *fieinfo,
+       u64                     start,
+       u64                     length)
+{
+       xfs_inode_t             *ip = XFS_I(inode);
+       struct getbmapx         bm;
+       int                     error;
+
+       error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
+       if (error)
+               return error;
+
+       /* Set up bmap header for xfs internal routine */
+       bm.bmv_offset = BTOBB(start);
+       /* Special case for whole file */
+       if (length == FIEMAP_MAX_OFFSET)
+               bm.bmv_length = -1LL;
+       else
+               bm.bmv_length = BTOBB(length);
+
+       /* our formatter will tell xfs_getbmap when to stop. */
+       bm.bmv_count = MAXEXTNUM;
+       bm.bmv_iflags = BMV_IF_PREALLOC;
+       if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
+               bm.bmv_iflags |= BMV_IF_ATTRFORK;
+       if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
+               bm.bmv_iflags |= BMV_IF_DELALLOC;
+
+       error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
+       if (error)
+               return -error;
+
+       return 0;
+}
+
 static const struct inode_operations xfs_inode_operations = {
        .permission             = xfs_vn_permission,
        .truncate               = xfs_vn_truncate,
@@ -671,6 +747,7 @@ static const struct inode_operations xfs_inode_operations = {
        .removexattr            = generic_removexattr,
        .listxattr              = xfs_vn_listxattr,
        .fallocate              = xfs_vn_fallocate,
+       .fiemap                 = xfs_vn_fiemap,
 };
 
 static const struct inode_operations xfs_dir_inode_operations = {
@@ -766,12 +843,20 @@ xfs_diflags_to_iflags(
  * When reading existing inodes from disk this is called directly
  * from xfs_iget, when creating a new inode it is called from
  * xfs_ialloc after setting up the inode.
+ *
+ * We are always called with an uninitialised linux inode here.
+ * We need to initialise the necessary fields and take a reference
+ * on it.
  */
 void
 xfs_setup_inode(
        struct xfs_inode        *ip)
 {
-       struct inode            *inode = ip->i_vnode;
+       struct inode            *inode = &ip->i_vnode;
+
+       inode->i_ino = ip->i_ino;
+       inode->i_state = I_NEW|I_LOCK;
+       inode_add_to_lists(ip->i_mount->m_super, inode);
 
        inode->i_mode   = ip->i_d.di_mode;
        inode->i_nlink  = ip->i_d.di_nlink;
@@ -799,7 +884,6 @@ xfs_setup_inode(
        inode->i_ctime.tv_sec   = ip->i_d.di_ctime.t_sec;
        inode->i_ctime.tv_nsec  = ip->i_d.di_ctime.t_nsec;
        xfs_diflags_to_iflags(inode, ip);
-       xfs_iflags_clear(ip, XFS_IMODIFIED);
 
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
index 8b1a1e31dc21f8c3414ee96c8285c2f54e81cc9c..ef41c92ce66e9e8159a2c3b1319774e589877ce6 100644 (file)
@@ -22,7 +22,6 @@ struct xfs_inode;
 
 extern const struct file_operations xfs_file_operations;
 extern const struct file_operations xfs_dir_file_operations;
-extern const struct file_operations xfs_invis_file_operations;
 
 extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
 
index cc0f7b3a97957687114dd961d9543cd5c807aeb3..507492d6dccd0d9de4e8591ba2ca9dfdb831b44d 100644 (file)
 #include <linux/types.h>
 
 /*
- * Some types are conditional depending on the target system.
  * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
- * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well
- * as requiring XFS_BIG_BLKNOS to be set.
+ * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
  */
 #if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
 # define XFS_BIG_BLKNOS        1
-# if BITS_PER_LONG == 64
-#  define XFS_BIG_INUMS        1
-# else
-#  define XFS_BIG_INUMS        0
-# endif
+# define XFS_BIG_INUMS 1
 #else
 # define XFS_BIG_BLKNOS        0
 # define XFS_BIG_INUMS 0
@@ -77,6 +71,7 @@
 #include <linux/spinlock.h>
 #include <linux/random.h>
 #include <linux/ctype.h>
+#include <linux/writeback.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
@@ -85,7 +80,6 @@
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 
-#include <xfs_vfs.h>
 #include <xfs_cred.h>
 #include <xfs_vnode.h>
 #include <xfs_stats.h>
 #undef  HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
 #endif
 
-#define restricted_chown       xfs_params.restrict_chown.val
 #define irix_sgid_inherit      xfs_params.sgid_inherit.val
 #define irix_symlink_mode      xfs_params.symlink_mode.val
 #define xfs_panic_mask         xfs_params.panic_mask.val
index 1957e5357d047b067b25719c34099bbb006a9cb6..7e90daa0d1d1bfbc6435b3d161fbd73baf36bc85 100644 (file)
@@ -51,7 +51,6 @@
 #include "xfs_vnodeops.h"
 
 #include <linux/capability.h>
-#include <linux/mount.h>
 #include <linux/writeback.h>
 
 
@@ -243,7 +242,7 @@ xfs_read(
 
        if (unlikely(ioflags & IO_ISDIRECT)) {
                if (inode->i_mapping->nrpages)
-                       ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
+                       ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
                                                    -1, FI_REMAPF_LOCKED);
                mutex_unlock(&inode->i_mutex);
                if (ret) {
@@ -668,15 +667,8 @@ start:
        if (new_size > xip->i_size)
                xip->i_new_size = new_size;
 
-       /*
-        * We're not supposed to change timestamps in readonly-mounted
-        * filesystems.  Throw it away if anyone asks us.
-        */
-       if (likely(!(ioflags & IO_INVIS) &&
-                  !mnt_want_write(file->f_path.mnt))) {
+       if (likely(!(ioflags & IO_INVIS)))
                xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-               mnt_drop_write(file->f_path.mnt);
-       }
 
        /*
         * If the offset is beyond the size of the file, we have a couple
@@ -715,7 +707,6 @@ start:
                }
        }
 
-retry:
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
 
@@ -771,6 +762,17 @@ retry:
        if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
                ret = wait_on_sync_kiocb(iocb);
 
+       isize = i_size_read(inode);
+       if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
+               *offset = isize;
+
+       if (*offset > xip->i_size) {
+               xfs_ilock(xip, XFS_ILOCK_EXCL);
+               if (*offset > xip->i_size)
+                       xip->i_size = *offset;
+               xfs_iunlock(xip, XFS_ILOCK_EXCL);
+       }
+
        if (ret == -ENOSPC &&
            DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
                xfs_iunlock(xip, iolock);
@@ -784,20 +786,7 @@ retry:
                xfs_ilock(xip, iolock);
                if (error)
                        goto out_unlock_internal;
-               pos = xip->i_size;
-               ret = 0;
-               goto retry;
-       }
-
-       isize = i_size_read(inode);
-       if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
-               *offset = isize;
-
-       if (*offset > xip->i_size) {
-               xfs_ilock(xip, XFS_ILOCK_EXCL);
-               if (*offset > xip->i_size)
-                       xip->i_size = *offset;
-               xfs_iunlock(xip, XFS_ILOCK_EXCL);
+               goto start;
        }
 
        error = -ret;
@@ -855,13 +844,7 @@ retry:
 int
 xfs_bdstrat_cb(struct xfs_buf *bp)
 {
-       xfs_mount_t     *mp;
-
-       mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
-       if (!XFS_FORCED_SHUTDOWN(mp)) {
-               xfs_buf_iorequest(bp);
-               return 0;
-       } else {
+       if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
                xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
                /*
                 * Metadata write that didn't get logged but
@@ -874,6 +857,9 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
                else
                        return (xfs_bioerror(bp));
        }
+
+       xfs_buf_iorequest(bp);
+       return 0;
 }
 
 /*
index 3d5b67c075c7ab5b927884313f35196fd9249e80..c3526d445f6a281c095371d4f5d7bf83658f63d5 100644 (file)
@@ -53,11 +53,15 @@ xfs_read_xfsstats(
                { "icluster",           XFSSTAT_END_INODE_CLUSTER       },
                { "vnodes",             XFSSTAT_END_VNODE_OPS           },
                { "buf",                XFSSTAT_END_BUF                 },
+               { "abtb2",              XFSSTAT_END_ABTB_V2             },
+               { "abtc2",              XFSSTAT_END_ABTC_V2             },
+               { "bmbt2",              XFSSTAT_END_BMBT_V2             },
+               { "ibt2",               XFSSTAT_END_IBT_V2              },
        };
 
        /* Loop over all stats groups */
        for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
-               len += sprintf(buffer + len, xstats[i].desc);
+               len += sprintf(buffer + len, "%s", xstats[i].desc);
                /* inner loop does each group */
                while (j < xstats[i].endpoint) {
                        val = 0;
index e83820febc9feacc8b642f3555d147017048c1c8..736854b1ca1a0c6d1d92488f6cc79891241b8a6d 100644 (file)
@@ -118,6 +118,71 @@ struct xfsstats {
        __uint32_t              xb_page_retries;
        __uint32_t              xb_page_found;
        __uint32_t              xb_get_read;
+/* Version 2 btree counters */
+#define XFSSTAT_END_ABTB_V2            (XFSSTAT_END_BUF+15)
+       __uint32_t              xs_abtb_2_lookup;
+       __uint32_t              xs_abtb_2_compare;
+       __uint32_t              xs_abtb_2_insrec;
+       __uint32_t              xs_abtb_2_delrec;
+       __uint32_t              xs_abtb_2_newroot;
+       __uint32_t              xs_abtb_2_killroot;
+       __uint32_t              xs_abtb_2_increment;
+       __uint32_t              xs_abtb_2_decrement;
+       __uint32_t              xs_abtb_2_lshift;
+       __uint32_t              xs_abtb_2_rshift;
+       __uint32_t              xs_abtb_2_split;
+       __uint32_t              xs_abtb_2_join;
+       __uint32_t              xs_abtb_2_alloc;
+       __uint32_t              xs_abtb_2_free;
+       __uint32_t              xs_abtb_2_moves;
+#define XFSSTAT_END_ABTC_V2            (XFSSTAT_END_ABTB_V2+15)
+       __uint32_t              xs_abtc_2_lookup;
+       __uint32_t              xs_abtc_2_compare;
+       __uint32_t              xs_abtc_2_insrec;
+       __uint32_t              xs_abtc_2_delrec;
+       __uint32_t              xs_abtc_2_newroot;
+       __uint32_t              xs_abtc_2_killroot;
+       __uint32_t              xs_abtc_2_increment;
+       __uint32_t              xs_abtc_2_decrement;
+       __uint32_t              xs_abtc_2_lshift;
+       __uint32_t              xs_abtc_2_rshift;
+       __uint32_t              xs_abtc_2_split;
+       __uint32_t              xs_abtc_2_join;
+       __uint32_t              xs_abtc_2_alloc;
+       __uint32_t              xs_abtc_2_free;
+       __uint32_t              xs_abtc_2_moves;
+#define XFSSTAT_END_BMBT_V2            (XFSSTAT_END_ABTC_V2+15)
+       __uint32_t              xs_bmbt_2_lookup;
+       __uint32_t              xs_bmbt_2_compare;
+       __uint32_t              xs_bmbt_2_insrec;
+       __uint32_t              xs_bmbt_2_delrec;
+       __uint32_t              xs_bmbt_2_newroot;
+       __uint32_t              xs_bmbt_2_killroot;
+       __uint32_t              xs_bmbt_2_increment;
+       __uint32_t              xs_bmbt_2_decrement;
+       __uint32_t              xs_bmbt_2_lshift;
+       __uint32_t              xs_bmbt_2_rshift;
+       __uint32_t              xs_bmbt_2_split;
+       __uint32_t              xs_bmbt_2_join;
+       __uint32_t              xs_bmbt_2_alloc;
+       __uint32_t              xs_bmbt_2_free;
+       __uint32_t              xs_bmbt_2_moves;
+#define XFSSTAT_END_IBT_V2             (XFSSTAT_END_BMBT_V2+15)
+       __uint32_t              xs_ibt_2_lookup;
+       __uint32_t              xs_ibt_2_compare;
+       __uint32_t              xs_ibt_2_insrec;
+       __uint32_t              xs_ibt_2_delrec;
+       __uint32_t              xs_ibt_2_newroot;
+       __uint32_t              xs_ibt_2_killroot;
+       __uint32_t              xs_ibt_2_increment;
+       __uint32_t              xs_ibt_2_decrement;
+       __uint32_t              xs_ibt_2_lshift;
+       __uint32_t              xs_ibt_2_rshift;
+       __uint32_t              xs_ibt_2_split;
+       __uint32_t              xs_ibt_2_join;
+       __uint32_t              xs_ibt_2_alloc;
+       __uint32_t              xs_ibt_2_free;
+       __uint32_t              xs_ibt_2_moves;
 /* Extra precision counters */
        __uint64_t              xs_xstrat_bytes;
        __uint64_t              xs_write_bytes;
index 37ebe36056ebd758af0e62012b60c5dfb13b7039..36f6cc703ef25307f3de486badc7775f299aaad6 100644 (file)
@@ -18,7 +18,6 @@
 #include "xfs.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
-#include "xfs_clnt.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
@@ -36,6 +35,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -48,7 +48,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
 #include "xfs_vnodeops.h"
-#include "xfs_vfsops.h"
 #include "xfs_version.h"
 #include "xfs_log_priv.h"
 #include "xfs_trans_priv.h"
@@ -58,6 +57,7 @@
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
 #include "xfs_inode_item.h"
+#include "xfs_sync.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
 
 static struct quotactl_ops xfs_quotactl_operations;
 static struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_vnode_zone;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
 
-STATIC struct xfs_mount_args *
-xfs_args_allocate(
-       struct super_block      *sb,
-       int                     silent)
-{
-       struct xfs_mount_args   *args;
-
-       args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
-       if (!args)
-               return NULL;
-
-       args->logbufs = args->logbufsize = -1;
-       strncpy(args->fsname, sb->s_id, MAXNAMELEN);
-
-       /* Copy the already-parsed mount(2) flags we're interested in */
-       if (sb->s_flags & MS_DIRSYNC)
-               args->flags |= XFSMNT_DIRSYNC;
-       if (sb->s_flags & MS_SYNCHRONOUS)
-               args->flags |= XFSMNT_WSYNC;
-       if (silent)
-               args->flags |= XFSMNT_QUIET;
-       args->flags |= XFSMNT_32BITINODES;
-
-       return args;
-}
-
 #define MNTOPT_LOGBUFS "logbufs"       /* number of XFS log buffers */
 #define MNTOPT_LOGBSIZE        "logbsize"      /* size of XFS log buffers */
 #define MNTOPT_LOGDEV  "logdev"        /* log device */
@@ -188,26 +161,54 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
        return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
 }
 
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock has _not_ yet been read in.
+ *
+ * Note that this function leaks the various device name allocations on
+ * failure.  The caller takes care of them.
+ */
 STATIC int
 xfs_parseargs(
        struct xfs_mount        *mp,
        char                    *options,
-       struct xfs_mount_args   *args,
-       int                     update)
+       char                    **mtpt)
 {
+       struct super_block      *sb = mp->m_super;
        char                    *this_char, *value, *eov;
-       int                     dsunit, dswidth, vol_dsunit, vol_dswidth;
-       int                     iosize;
+       int                     dsunit = 0;
+       int                     dswidth = 0;
+       int                     iosize = 0;
        int                     dmapi_implies_ikeep = 1;
+       uchar_t                 iosizelog = 0;
+
+       /*
+        * Copy binary VFS mount flags we are interested in.
+        */
+       if (sb->s_flags & MS_RDONLY)
+               mp->m_flags |= XFS_MOUNT_RDONLY;
+       if (sb->s_flags & MS_DIRSYNC)
+               mp->m_flags |= XFS_MOUNT_DIRSYNC;
+       if (sb->s_flags & MS_SYNCHRONOUS)
+               mp->m_flags |= XFS_MOUNT_WSYNC;
+
+       /*
+        * Set some default flags that could be cleared by the mount option
+        * parsing.
+        */
+       mp->m_flags |= XFS_MOUNT_BARRIER;
+       mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+       mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
 
-       args->flags |= XFSMNT_BARRIER;
-       args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
+       /*
+        * These can be overridden by the mount option parsing.
+        */
+       mp->m_logbufs = -1;
+       mp->m_logbsize = -1;
 
        if (!options)
                goto done;
 
-       iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
-
        while ((this_char = strsep(&options, ",")) != NULL) {
                if (!*this_char)
                        continue;
@@ -221,7 +222,7 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       args->logbufs = simple_strtoul(value, &eov, 10);
+                       mp->m_logbufs = simple_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -229,7 +230,7 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       args->logbufsize = suffix_strtoul(value, &eov, 10);
+                       mp->m_logbsize = suffix_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -237,7 +238,9 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       strncpy(args->logname, value, MAXNAMELEN);
+                       mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+                       if (!mp->m_logname)
+                               return ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_MTPT)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -245,7 +248,9 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       strncpy(args->mtpt, value, MAXNAMELEN);
+                       *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+                       if (!*mtpt)
+                               return ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -253,7 +258,9 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       strncpy(args->rtname, value, MAXNAMELEN);
+                       mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+                       if (!mp->m_rtname)
+                               return ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -262,8 +269,7 @@ xfs_parseargs(
                                return EINVAL;
                        }
                        iosize = simple_strtoul(value, &eov, 10);
-                       args->flags |= XFSMNT_IOSIZE;
-                       args->iosizelog = (uint8_t) iosize;
+                       iosizelog = ffs(iosize) - 1;
                } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -272,8 +278,7 @@ xfs_parseargs(
                                return EINVAL;
                        }
                        iosize = suffix_strtoul(value, &eov, 10);
-                       args->flags |= XFSMNT_IOSIZE;
-                       args->iosizelog = ffs(iosize) - 1;
+                       iosizelog = ffs(iosize) - 1;
                } else if (!strcmp(this_char, MNTOPT_GRPID) ||
                           !strcmp(this_char, MNTOPT_BSDGROUPS)) {
                        mp->m_flags |= XFS_MOUNT_GRPID;
@@ -281,23 +286,25 @@ xfs_parseargs(
                           !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
                        mp->m_flags &= ~XFS_MOUNT_GRPID;
                } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
-                       args->flags |= XFSMNT_WSYNC;
+                       mp->m_flags |= XFS_MOUNT_WSYNC;
                } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
-                       args->flags |= XFSMNT_OSYNCISOSYNC;
+                       mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
                } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
-                       args->flags |= XFSMNT_NORECOVERY;
+                       mp->m_flags |= XFS_MOUNT_NORECOVERY;
                } else if (!strcmp(this_char, MNTOPT_INO64)) {
-                       args->flags |= XFSMNT_INO64;
-#if !XFS_BIG_INUMS
+#if XFS_BIG_INUMS
+                       mp->m_flags |= XFS_MOUNT_INO64;
+                       mp->m_inoadd = XFS_INO64_OFFSET;
+#else
                        cmn_err(CE_WARN,
                                "XFS: %s option not allowed on this system",
                                this_char);
                        return EINVAL;
 #endif
                } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
-                       args->flags |= XFSMNT_NOALIGN;
+                       mp->m_flags |= XFS_MOUNT_NOALIGN;
                } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
-                       args->flags |= XFSMNT_SWALLOC;
+                       mp->m_flags |= XFS_MOUNT_SWALLOC;
                } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -315,7 +322,7 @@ xfs_parseargs(
                        }
                        dswidth = simple_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
-                       args->flags &= ~XFSMNT_32BITINODES;
+                       mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
 #if !XFS_BIG_INUMS
                        cmn_err(CE_WARN,
                                "XFS: %s option not allowed on this system",
@@ -323,56 +330,61 @@ xfs_parseargs(
                        return EINVAL;
 #endif
                } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
-                       args->flags |= XFSMNT_NOUUID;
+                       mp->m_flags |= XFS_MOUNT_NOUUID;
                } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
-                       args->flags |= XFSMNT_BARRIER;
+                       mp->m_flags |= XFS_MOUNT_BARRIER;
                } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
-                       args->flags &= ~XFSMNT_BARRIER;
+                       mp->m_flags &= ~XFS_MOUNT_BARRIER;
                } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
-                       args->flags |= XFSMNT_IKEEP;
+                       mp->m_flags |= XFS_MOUNT_IKEEP;
                } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
                        dmapi_implies_ikeep = 0;
-                       args->flags &= ~XFSMNT_IKEEP;
+                       mp->m_flags &= ~XFS_MOUNT_IKEEP;
                } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
-                       args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE;
+                       mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
                } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
-                       args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
+                       mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
                } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
-                       args->flags |= XFSMNT_ATTR2;
+                       mp->m_flags |= XFS_MOUNT_ATTR2;
                } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
-                       args->flags &= ~XFSMNT_ATTR2;
-                       args->flags |= XFSMNT_NOATTR2;
+                       mp->m_flags &= ~XFS_MOUNT_ATTR2;
+                       mp->m_flags |= XFS_MOUNT_NOATTR2;
                } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
-                       args->flags2 |= XFSMNT2_FILESTREAMS;
+                       mp->m_flags |= XFS_MOUNT_FILESTREAMS;
                } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
-                       args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA);
-                       args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA);
+                       mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+                                         XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+                                         XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+                                         XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
                           !strcmp(this_char, MNTOPT_UQUOTA) ||
                           !strcmp(this_char, MNTOPT_USRQUOTA)) {
-                       args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+                       mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+                                        XFS_UQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
                           !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
-                       args->flags |= XFSMNT_UQUOTA;
-                       args->flags &= ~XFSMNT_UQUOTAENF;
+                       mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_UQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
                           !strcmp(this_char, MNTOPT_PRJQUOTA)) {
-                       args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF;
+                       mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+                                        XFS_OQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
-                       args->flags |= XFSMNT_PQUOTA;
-                       args->flags &= ~XFSMNT_PQUOTAENF;
+                       mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
                           !strcmp(this_char, MNTOPT_GRPQUOTA)) {
-                       args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
+                       mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+                                        XFS_OQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
-                       args->flags |= XFSMNT_GQUOTA;
-                       args->flags &= ~XFSMNT_GQUOTAENF;
+                       mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_DMAPI)) {
-                       args->flags |= XFSMNT_DMAPI;
+                       mp->m_flags |= XFS_MOUNT_DMAPI;
                } else if (!strcmp(this_char, MNTOPT_XDSM)) {
-                       args->flags |= XFSMNT_DMAPI;
+                       mp->m_flags |= XFS_MOUNT_DMAPI;
                } else if (!strcmp(this_char, MNTOPT_DMI)) {
-                       args->flags |= XFSMNT_DMAPI;
+                       mp->m_flags |= XFS_MOUNT_DMAPI;
                } else if (!strcmp(this_char, "ihashsize")) {
                        cmn_err(CE_WARN,
        "XFS: ihashsize no longer used, option is deprecated.");
@@ -390,27 +402,29 @@ xfs_parseargs(
                }
        }
 
-       if (args->flags & XFSMNT_NORECOVERY) {
-               if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) {
-                       cmn_err(CE_WARN,
-                               "XFS: no-recovery mounts must be read-only.");
-                       return EINVAL;
-               }
+       /*
+        * no recovery flag requires a read-only mount
+        */
+       if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
+           !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+               cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only.");
+               return EINVAL;
        }
 
-       if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
+       if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
                cmn_err(CE_WARN,
        "XFS: sunit and swidth options incompatible with the noalign option");
                return EINVAL;
        }
 
-       if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) {
+       if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+           (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
                cmn_err(CE_WARN,
                        "XFS: cannot mount with both project and group quota");
                return EINVAL;
        }
 
-       if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') {
+       if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
                printk("XFS: %s option needs the mount point option as well\n",
                        MNTOPT_DMAPI);
                return EINVAL;
@@ -438,27 +452,66 @@ xfs_parseargs(
         * Note that if "ikeep" or "noikeep" mount options are
         * supplied, then they are honored.
         */
-       if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep)
-               args->flags |= XFSMNT_IKEEP;
+       if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
+               mp->m_flags |= XFS_MOUNT_IKEEP;
 
-       if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+done:
+       if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+               /*
+                * At this point the superblock has not been read
+                * in, therefore we do not know the block size.
+                * Before the mount call ends we will convert
+                * these to FSBs.
+                */
                if (dsunit) {
-                       args->sunit = dsunit;
-                       args->flags |= XFSMNT_RETERR;
-               } else {
-                       args->sunit = vol_dsunit;
+                       mp->m_dalign = dsunit;
+                       mp->m_flags |= XFS_MOUNT_RETERR;
                }
-               dswidth ? (args->swidth = dswidth) :
-                         (args->swidth = vol_dswidth);
-       } else {
-               args->sunit = args->swidth = 0;
+
+               if (dswidth)
+                       mp->m_swidth = dswidth;
+       }
+
+       if (mp->m_logbufs != -1 &&
+           mp->m_logbufs != 0 &&
+           (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+            mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+               cmn_err(CE_WARN,
+                       "XFS: invalid logbufs value: %d [not %d-%d]",
+                       mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+               return XFS_ERROR(EINVAL);
+       }
+       if (mp->m_logbsize != -1 &&
+           mp->m_logbsize !=  0 &&
+           (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+            mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+            !is_power_of_2(mp->m_logbsize))) {
+               cmn_err(CE_WARN,
+       "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+                       mp->m_logbsize);
+               return XFS_ERROR(EINVAL);
+       }
+
+       mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
+       if (!mp->m_fsname)
+               return ENOMEM;
+       mp->m_fsname_len = strlen(mp->m_fsname) + 1;
+
+       if (iosizelog) {
+               if (iosizelog > XFS_MAX_IO_LOG ||
+                   iosizelog < XFS_MIN_IO_LOG) {
+                       cmn_err(CE_WARN,
+               "XFS: invalid log iosize: %d [not %d-%d]",
+                               iosizelog, XFS_MIN_IO_LOG,
+                               XFS_MAX_IO_LOG);
+                       return XFS_ERROR(EINVAL);
+               }
+
+               mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+               mp->m_readio_log = iosizelog;
+               mp->m_writeio_log = iosizelog;
        }
 
-done:
-       if (args->flags & XFSMNT_32BITINODES)
-               mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
-       if (args->flags2)
-               args->flags |= XFSMNT_FLAGS2;
        return 0;
 }
 
@@ -704,8 +757,7 @@ xfs_close_devices(
  */
 STATIC int
 xfs_open_devices(
-       struct xfs_mount        *mp,
-       struct xfs_mount_args   *args)
+       struct xfs_mount        *mp)
 {
        struct block_device     *ddev = mp->m_super->s_bdev;
        struct block_device     *logdev = NULL, *rtdev = NULL;
@@ -714,14 +766,14 @@ xfs_open_devices(
        /*
         * Open real time and log devices - order is important.
         */
-       if (args->logname[0]) {
-               error = xfs_blkdev_get(mp, args->logname, &logdev);
+       if (mp->m_logname) {
+               error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
                if (error)
                        goto out;
        }
 
-       if (args->rtname[0]) {
-               error = xfs_blkdev_get(mp, args->rtname, &rtdev);
+       if (mp->m_rtname) {
+               error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
                if (error)
                        goto out_close_logdev;
 
@@ -813,18 +865,18 @@ xfs_setup_devices(
  */
 void
 xfsaild_wakeup(
-       xfs_mount_t             *mp,
+       struct xfs_ail          *ailp,
        xfs_lsn_t               threshold_lsn)
 {
-       mp->m_ail.xa_target = threshold_lsn;
-       wake_up_process(mp->m_ail.xa_task);
+       ailp->xa_target = threshold_lsn;
+       wake_up_process(ailp->xa_task);
 }
 
 int
 xfsaild(
        void    *data)
 {
-       xfs_mount_t     *mp = (xfs_mount_t *)data;
+       struct xfs_ail  *ailp = data;
        xfs_lsn_t       last_pushed_lsn = 0;
        long            tout = 0;
 
@@ -836,11 +888,11 @@ xfsaild(
                /* swsusp */
                try_to_freeze();
 
-               ASSERT(mp->m_log);
-               if (XFS_FORCED_SHUTDOWN(mp))
+               ASSERT(ailp->xa_mount->m_log);
+               if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
                        continue;
 
-               tout = xfsaild_push(mp, &last_pushed_lsn);
+               tout = xfsaild_push(ailp, &last_pushed_lsn);
        }
 
        return 0;
@@ -848,43 +900,82 @@ xfsaild(
 
 int
 xfsaild_start(
-       xfs_mount_t     *mp)
+       struct xfs_ail  *ailp)
 {
-       mp->m_ail.xa_target = 0;
-       mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild");
-       if (IS_ERR(mp->m_ail.xa_task))
-               return -PTR_ERR(mp->m_ail.xa_task);
+       ailp->xa_target = 0;
+       ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
+       if (IS_ERR(ailp->xa_task))
+               return -PTR_ERR(ailp->xa_task);
        return 0;
 }
 
 void
 xfsaild_stop(
-       xfs_mount_t     *mp)
+       struct xfs_ail  *ailp)
 {
-       kthread_stop(mp->m_ail.xa_task);
+       kthread_stop(ailp->xa_task);
 }
 
 
-
+/* Catch misguided souls that try to use this interface on XFS */
 STATIC struct inode *
 xfs_fs_alloc_inode(
        struct super_block      *sb)
 {
-       return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
+       BUG();
+       return NULL;
 }
 
+/*
+ * Now that the generic code is guaranteed not to be accessing
+ * the linux inode, we can reclaim the inode.
+ */
 STATIC void
 xfs_fs_destroy_inode(
-       struct inode            *inode)
+       struct inode    *inode)
 {
-       kmem_zone_free(xfs_vnode_zone, inode);
+       xfs_inode_t             *ip = XFS_I(inode);
+
+       XFS_STATS_INC(vn_reclaim);
+       if (xfs_reclaim(ip))
+               panic("%s: cannot reclaim 0x%p\n", __func__, inode);
 }
 
+/*
+ * Slab object creation initialisation for the XFS inode.
+ * This covers only the idempotent fields in the XFS inode;
+ * all other fields need to be initialised on allocation
+ * from the slab. This avoids the need to repeatedly intialise
+ * fields in the xfs inode that left in the initialise state
+ * when freeing the inode.
+ */
 STATIC void
 xfs_fs_inode_init_once(
-       void                    *vnode)
+       void                    *inode)
 {
-       inode_init_once((struct inode *)vnode);
+       struct xfs_inode        *ip = inode;
+
+       memset(ip, 0, sizeof(struct xfs_inode));
+
+       /* vfs inode */
+       inode_init_once(VFS_I(ip));
+
+       /* xfs inode */
+       atomic_set(&ip->i_iocount, 0);
+       atomic_set(&ip->i_pincount, 0);
+       spin_lock_init(&ip->i_flags_lock);
+       init_waitqueue_head(&ip->i_ipin_wait);
+       /*
+        * Because we want to use a counting completion, complete
+        * the flush completion once to allow a single access to
+        * the flush completion without blocking.
+        */
+       init_completion(&ip->i_flush);
+       complete(&ip->i_flush);
+
+       mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+                    "xfsino", ip->i_ino);
+       mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 }
 
 /*
@@ -898,21 +989,26 @@ xfs_fs_write_inode(
        struct inode            *inode,
        int                     sync)
 {
+       struct xfs_inode        *ip = XFS_I(inode);
        int                     error = 0;
        int                     flags = 0;
 
-       xfs_itrace_entry(XFS_I(inode));
+       xfs_itrace_entry(ip);
        if (sync) {
-               filemap_fdatawait(inode->i_mapping);
+               error = xfs_wait_on_pages(ip, 0, -1);
+               if (error)
+                       goto out_error;
                flags |= FLUSH_SYNC;
        }
-       error = xfs_inode_flush(XFS_I(inode), flags);
+       error = xfs_inode_flush(ip, flags);
+
+out_error:
        /*
         * if we failed to write out the inode then mark
         * it dirty again so we'll try again later.
         */
        if (error)
-               mark_inode_dirty_sync(inode);
+               xfs_mark_inode_dirty_sync(ip);
 
        return -error;
 }
@@ -923,164 +1019,12 @@ xfs_fs_clear_inode(
 {
        xfs_inode_t             *ip = XFS_I(inode);
 
-       /*
-        * ip can be null when xfs_iget_core calls xfs_idestroy if we
-        * find an inode with di_mode == 0 but without IGET_CREATE set.
-        */
-       if (ip) {
-               xfs_itrace_entry(ip);
-               XFS_STATS_INC(vn_rele);
-               XFS_STATS_INC(vn_remove);
-               XFS_STATS_INC(vn_reclaim);
-               XFS_STATS_DEC(vn_active);
-
-               xfs_inactive(ip);
-               xfs_iflags_clear(ip, XFS_IMODIFIED);
-               if (xfs_reclaim(ip))
-                       panic("%s: cannot reclaim 0x%p\n", __func__, inode);
-       }
-
-       ASSERT(XFS_I(inode) == NULL);
-}
+       xfs_itrace_entry(ip);
+       XFS_STATS_INC(vn_rele);
+       XFS_STATS_INC(vn_remove);
+       XFS_STATS_DEC(vn_active);
 
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
-       struct xfs_mount *mp,
-       void            *data,
-       void            (*syncer)(struct xfs_mount *, void *))
-{
-       struct bhv_vfs_sync_work *work;
-
-       work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
-       INIT_LIST_HEAD(&work->w_list);
-       work->w_syncer = syncer;
-       work->w_data = data;
-       work->w_mount = mp;
-       spin_lock(&mp->m_sync_lock);
-       list_add_tail(&work->w_list, &mp->m_sync_list);
-       spin_unlock(&mp->m_sync_lock);
-       wake_up_process(mp->m_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inode_work(
-       struct xfs_mount *mp,
-       void            *arg)
-{
-       struct inode    *inode = arg;
-       filemap_flush(inode->i_mapping);
-       iput(inode);
-}
-
-void
-xfs_flush_inode(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       igrab(inode);
-       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
-       delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
-       struct xfs_mount *mp,
-       void            *arg)
-{
-       struct inode    *inode = arg;
-       sync_blockdev(mp->m_super->s_bdev);
-       iput(inode);
-}
-
-void
-xfs_flush_device(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       igrab(inode);
-       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
-       delay(msecs_to_jiffies(500));
-       xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
-}
-
-STATIC void
-xfs_sync_worker(
-       struct xfs_mount *mp,
-       void            *unused)
-{
-       int             error;
-
-       if (!(mp->m_flags & XFS_MOUNT_RDONLY))
-               error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
-       mp->m_sync_seq++;
-       wake_up(&mp->m_wait_single_sync_task);
-}
-
-STATIC int
-xfssyncd(
-       void                    *arg)
-{
-       struct xfs_mount        *mp = arg;
-       long                    timeleft;
-       bhv_vfs_sync_work_t     *work, *n;
-       LIST_HEAD               (tmp);
-
-       set_freezable();
-       timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
-       for (;;) {
-               timeleft = schedule_timeout_interruptible(timeleft);
-               /* swsusp */
-               try_to_freeze();
-               if (kthread_should_stop() && list_empty(&mp->m_sync_list))
-                       break;
-
-               spin_lock(&mp->m_sync_lock);
-               /*
-                * We can get woken by laptop mode, to do a sync -
-                * that's the (only!) case where the list would be
-                * empty with time remaining.
-                */
-               if (!timeleft || list_empty(&mp->m_sync_list)) {
-                       if (!timeleft)
-                               timeleft = xfs_syncd_centisecs *
-                                                       msecs_to_jiffies(10);
-                       INIT_LIST_HEAD(&mp->m_sync_work.w_list);
-                       list_add_tail(&mp->m_sync_work.w_list,
-                                       &mp->m_sync_list);
-               }
-               list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
-                       list_move(&work->w_list, &tmp);
-               spin_unlock(&mp->m_sync_lock);
-
-               list_for_each_entry_safe(work, n, &tmp, w_list) {
-                       (*work->w_syncer)(mp, work->w_data);
-                       list_del(&work->w_list);
-                       if (work == &mp->m_sync_work)
-                               continue;
-                       kmem_free(work);
-               }
-       }
-
-       return 0;
+       xfs_inactive(ip);
 }
 
 STATIC void
@@ -1099,11 +1043,9 @@ xfs_fs_put_super(
        struct xfs_mount        *mp = XFS_M(sb);
        struct xfs_inode        *rip = mp->m_rootip;
        int                     unmount_event_flags = 0;
-       int                     error;
 
-       kthread_stop(mp->m_sync_task);
-
-       xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI);
+       xfs_syncd_stop(mp);
+       xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
 
 #ifdef HAVE_DMAPI
        if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1128,18 +1070,6 @@ xfs_fs_put_super(
        xfs_filestream_unmount(mp);
 
        XFS_bflush(mp->m_ddev_targp);
-       error = xfs_unmount_flush(mp, 0);
-       WARN_ON(error);
-
-       /*
-        * If we're forcing a shutdown, typically because of a media error,
-        * we want to make sure we invalidate dirty pages that belong to
-        * referenced vnodes as well.
-        */
-       if (XFS_FORCED_SHUTDOWN(mp)) {
-               error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
-               ASSERT(error != EFSCORRUPTED);
-       }
 
        if (mp->m_flags & XFS_MOUNT_DMAPI) {
                XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
@@ -1161,7 +1091,7 @@ xfs_fs_write_super(
        struct super_block      *sb)
 {
        if (!(sb->s_flags & MS_RDONLY))
-               xfs_sync(XFS_M(sb), SYNC_FSDATA);
+               xfs_sync_fsdata(XFS_M(sb), 0);
        sb->s_dirt = 0;
 }
 
@@ -1172,7 +1102,6 @@ xfs_fs_sync_super(
 {
        struct xfs_mount        *mp = XFS_M(sb);
        int                     error;
-       int                     flags;
 
        /*
         * Treat a sync operation like a freeze.  This is to work
@@ -1186,20 +1115,10 @@ xfs_fs_sync_super(
         * dirty the Linux inode until after the transaction I/O
         * completes.
         */
-       if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) {
-               /*
-                * First stage of freeze - no more writers will make progress
-                * now we are here, so we flush delwri and delalloc buffers
-                * here, then wait for all I/O to complete.  Data is frozen at
-                * that point. Metadata is not frozen, transactions can still
-                * occur here so don't bother flushing the buftarg (i.e
-                * SYNC_QUIESCE) because it'll just get dirty again.
-                */
-               flags = SYNC_DATA_QUIESCE;
-       } else
-               flags = SYNC_FSDATA;
-
-       error = xfs_sync(mp, flags);
+       if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
+               error = xfs_quiesce_data(mp);
+       else
+               error = xfs_sync_fsdata(mp, 0);
        sb->s_dirt = 0;
 
        if (unlikely(laptop_mode)) {
@@ -1337,9 +1256,8 @@ xfs_fs_remount(
 
        /* rw -> ro */
        if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
-               xfs_filestream_flush(mp);
-               xfs_sync(mp, SYNC_DATA_QUIESCE);
-               xfs_attr_quiesce(mp);
+               xfs_quiesce_data(mp);
+               xfs_quiesce_attr(mp);
                mp->m_flags |= XFS_MOUNT_RDONLY;
        }
 
@@ -1348,7 +1266,7 @@ xfs_fs_remount(
 
 /*
  * Second stage of a freeze. The data is already frozen so we only
- * need to take care of themetadata. Once that's done write a dummy
+ * need to take care of the metadata. Once that's done write a dummy
  * record to dirty the log in case of a crash while frozen.
  */
 STATIC void
@@ -1357,7 +1275,7 @@ xfs_fs_lockfs(
 {
        struct xfs_mount        *mp = XFS_M(sb);
 
-       xfs_attr_quiesce(mp);
+       xfs_quiesce_attr(mp);
        xfs_fs_log_dummy(mp);
 }
 
@@ -1420,177 +1338,30 @@ xfs_fs_setxquota(
                                   Q_XSETPQLIM), id, (caddr_t)fdq);
 }
 
-/*
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock has _not_ yet been read in.
- */
-STATIC int
-xfs_start_flags(
-       struct xfs_mount_args   *ap,
-       struct xfs_mount        *mp)
-{
-       int                     error;
-
-       /* Values are in BBs */
-       if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
-               /*
-                * At this point the superblock has not been read
-                * in, therefore we do not know the block size.
-                * Before the mount call ends we will convert
-                * these to FSBs.
-                */
-               mp->m_dalign = ap->sunit;
-               mp->m_swidth = ap->swidth;
-       }
-
-       if (ap->logbufs != -1 &&
-           ap->logbufs != 0 &&
-           (ap->logbufs < XLOG_MIN_ICLOGS ||
-            ap->logbufs > XLOG_MAX_ICLOGS)) {
-               cmn_err(CE_WARN,
-                       "XFS: invalid logbufs value: %d [not %d-%d]",
-                       ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
-               return XFS_ERROR(EINVAL);
-       }
-       mp->m_logbufs = ap->logbufs;
-       if (ap->logbufsize != -1 &&
-           ap->logbufsize !=  0 &&
-           (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
-            ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
-            !is_power_of_2(ap->logbufsize))) {
-               cmn_err(CE_WARN,
-       "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
-                       ap->logbufsize);
-               return XFS_ERROR(EINVAL);
-       }
-
-       error = ENOMEM;
-
-       mp->m_logbsize = ap->logbufsize;
-       mp->m_fsname_len = strlen(ap->fsname) + 1;
-
-       mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
-       if (!mp->m_fsname)
-               goto out;
-
-       if (ap->rtname[0]) {
-               mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
-               if (!mp->m_rtname)
-                       goto out_free_fsname;
-
-       }
-
-       if (ap->logname[0]) {
-               mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
-               if (!mp->m_logname)
-                       goto out_free_rtname;
-       }
-
-       if (ap->flags & XFSMNT_WSYNC)
-               mp->m_flags |= XFS_MOUNT_WSYNC;
-#if XFS_BIG_INUMS
-       if (ap->flags & XFSMNT_INO64) {
-               mp->m_flags |= XFS_MOUNT_INO64;
-               mp->m_inoadd = XFS_INO64_OFFSET;
-       }
-#endif
-       if (ap->flags & XFSMNT_RETERR)
-               mp->m_flags |= XFS_MOUNT_RETERR;
-       if (ap->flags & XFSMNT_NOALIGN)
-               mp->m_flags |= XFS_MOUNT_NOALIGN;
-       if (ap->flags & XFSMNT_SWALLOC)
-               mp->m_flags |= XFS_MOUNT_SWALLOC;
-       if (ap->flags & XFSMNT_OSYNCISOSYNC)
-               mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
-       if (ap->flags & XFSMNT_32BITINODES)
-               mp->m_flags |= XFS_MOUNT_32BITINODES;
-
-       if (ap->flags & XFSMNT_IOSIZE) {
-               if (ap->iosizelog > XFS_MAX_IO_LOG ||
-                   ap->iosizelog < XFS_MIN_IO_LOG) {
-                       cmn_err(CE_WARN,
-               "XFS: invalid log iosize: %d [not %d-%d]",
-                               ap->iosizelog, XFS_MIN_IO_LOG,
-                               XFS_MAX_IO_LOG);
-                       return XFS_ERROR(EINVAL);
-               }
-
-               mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
-               mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
-       }
-
-       if (ap->flags & XFSMNT_IKEEP)
-               mp->m_flags |= XFS_MOUNT_IKEEP;
-       if (ap->flags & XFSMNT_DIRSYNC)
-               mp->m_flags |= XFS_MOUNT_DIRSYNC;
-       if (ap->flags & XFSMNT_ATTR2)
-               mp->m_flags |= XFS_MOUNT_ATTR2;
-       if (ap->flags & XFSMNT_NOATTR2)
-               mp->m_flags |= XFS_MOUNT_NOATTR2;
-
-       if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
-               mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-
-       /*
-        * no recovery flag requires a read-only mount
-        */
-       if (ap->flags & XFSMNT_NORECOVERY) {
-               if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                       cmn_err(CE_WARN,
-       "XFS: tried to mount a FS read-write without recovery!");
-                       return XFS_ERROR(EINVAL);
-               }
-               mp->m_flags |= XFS_MOUNT_NORECOVERY;
-       }
-
-       if (ap->flags & XFSMNT_NOUUID)
-               mp->m_flags |= XFS_MOUNT_NOUUID;
-       if (ap->flags & XFSMNT_BARRIER)
-               mp->m_flags |= XFS_MOUNT_BARRIER;
-       else
-               mp->m_flags &= ~XFS_MOUNT_BARRIER;
-
-       if (ap->flags2 & XFSMNT2_FILESTREAMS)
-               mp->m_flags |= XFS_MOUNT_FILESTREAMS;
-
-       if (ap->flags & XFSMNT_DMAPI)
-               mp->m_flags |= XFS_MOUNT_DMAPI;
-       return 0;
-
-
- out_free_rtname:
-       kfree(mp->m_rtname);
- out_free_fsname:
-       kfree(mp->m_fsname);
- out:
-       return error;
-}
-
 /*
  * This function fills in xfs_mount_t fields based on mount args.
  * Note: the superblock _has_ now been read in.
  */
 STATIC int
 xfs_finish_flags(
-       struct xfs_mount_args   *ap,
        struct xfs_mount        *mp)
 {
        int                     ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
 
        /* Fail a mount where the logbuf is smaller then the log stripe */
        if (xfs_sb_version_haslogv2(&mp->m_sb)) {
-               if ((ap->logbufsize <= 0) &&
-                   (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
+               if (mp->m_logbsize <= 0 &&
+                   mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
                        mp->m_logbsize = mp->m_sb.sb_logsunit;
-               } else if (ap->logbufsize > 0 &&
-                          ap->logbufsize < mp->m_sb.sb_logsunit) {
+               } else if (mp->m_logbsize > 0 &&
+                          mp->m_logbsize < mp->m_sb.sb_logsunit) {
                        cmn_err(CE_WARN,
        "XFS: logbuf size must be greater than or equal to log stripe size");
                        return XFS_ERROR(EINVAL);
                }
        } else {
                /* Fail a mount if the logbuf is larger than 32K */
-               if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
+               if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
                        cmn_err(CE_WARN,
        "XFS: logbuf size for version 1 logs must be 16K or 32K");
                        return XFS_ERROR(EINVAL);
@@ -1602,7 +1373,7 @@ xfs_finish_flags(
         * told by noattr2 to turn it off
         */
        if (xfs_sb_version_hasattr2(&mp->m_sb) &&
-           !(ap->flags & XFSMNT_NOATTR2))
+           !(mp->m_flags & XFS_MOUNT_NOATTR2))
                mp->m_flags |= XFS_MOUNT_ATTR2;
 
        /*
@@ -1614,48 +1385,6 @@ xfs_finish_flags(
                return XFS_ERROR(EROFS);
        }
 
-       /*
-        * check for shared mount.
-        */
-       if (ap->flags & XFSMNT_SHARED) {
-               if (!xfs_sb_version_hasshared(&mp->m_sb))
-                       return XFS_ERROR(EINVAL);
-
-               /*
-                * For IRIX 6.5, shared mounts must have the shared
-                * version bit set, have the persistent readonly
-                * field set, must be version 0 and can only be mounted
-                * read-only.
-                */
-               if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
-                    (mp->m_sb.sb_shared_vn != 0))
-                       return XFS_ERROR(EINVAL);
-
-               mp->m_flags |= XFS_MOUNT_SHARED;
-
-               /*
-                * Shared XFS V0 can't deal with DMI.  Return EINVAL.
-                */
-               if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
-                       return XFS_ERROR(EINVAL);
-       }
-
-       if (ap->flags & XFSMNT_UQUOTA) {
-               mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
-               if (ap->flags & XFSMNT_UQUOTAENF)
-                       mp->m_qflags |= XFS_UQUOTA_ENFD;
-       }
-
-       if (ap->flags & XFSMNT_GQUOTA) {
-               mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
-               if (ap->flags & XFSMNT_GQUOTAENF)
-                       mp->m_qflags |= XFS_OQUOTA_ENFD;
-       } else if (ap->flags & XFSMNT_PQUOTA) {
-               mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
-               if (ap->flags & XFSMNT_PQUOTAENF)
-                       mp->m_qflags |= XFS_OQUOTA_ENFD;
-       }
-
        return 0;
 }
 
@@ -1667,19 +1396,14 @@ xfs_fs_fill_super(
 {
        struct inode            *root;
        struct xfs_mount        *mp = NULL;
-       struct xfs_mount_args   *args;
        int                     flags = 0, error = ENOMEM;
-
-       args = xfs_args_allocate(sb, silent);
-       if (!args)
-               return -ENOMEM;
+       char                    *mtpt = NULL;
 
        mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
        if (!mp)
-               goto out_free_args;
+               goto out;
 
        spin_lock_init(&mp->m_sb_lock);
-       mutex_init(&mp->m_ilock);
        mutex_init(&mp->m_growlock);
        atomic_set(&mp->m_active_trans, 0);
        INIT_LIST_HEAD(&mp->m_sync_list);
@@ -1689,12 +1413,9 @@ xfs_fs_fill_super(
        mp->m_super = sb;
        sb->s_fs_info = mp;
 
-       if (sb->s_flags & MS_RDONLY)
-               mp->m_flags |= XFS_MOUNT_RDONLY;
-
-       error = xfs_parseargs(mp, (char *)data, args, 0);
+       error = xfs_parseargs(mp, (char *)data, &mtpt);
        if (error)
-               goto out_free_mp;
+               goto out_free_fsname;
 
        sb_min_blocksize(sb, BBSIZE);
        sb->s_xattr = xfs_xattr_handlers;
@@ -1702,33 +1423,28 @@ xfs_fs_fill_super(
        sb->s_qcop = &xfs_quotactl_operations;
        sb->s_op = &xfs_super_operations;
 
-       error = xfs_dmops_get(mp, args);
+       error = xfs_dmops_get(mp);
        if (error)
-               goto out_free_mp;
-       error = xfs_qmops_get(mp, args);
+               goto out_free_fsname;
+       error = xfs_qmops_get(mp);
        if (error)
                goto out_put_dmops;
 
-       if (args->flags & XFSMNT_QUIET)
+       if (silent)
                flags |= XFS_MFSI_QUIET;
 
-       error = xfs_open_devices(mp, args);
+       error = xfs_open_devices(mp);
        if (error)
                goto out_put_qmops;
 
        if (xfs_icsb_init_counters(mp))
                mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
 
-       /*
-        * Setup flags based on mount(2) options and then the superblock
-        */
-       error = xfs_start_flags(args, mp);
-       if (error)
-               goto out_free_fsname;
        error = xfs_readsb(mp, flags);
        if (error)
-               goto out_free_fsname;
-       error = xfs_finish_flags(args, mp);
+               goto out_destroy_counters;
+
+       error = xfs_finish_flags(mp);
        if (error)
                goto out_free_sb;
 
@@ -1747,7 +1463,7 @@ xfs_fs_fill_super(
        if (error)
                goto out_filestream_unmount;
 
-       XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname);
+       XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
 
        sb->s_dirt = 1;
        sb->s_magic = XFS_SB_MAGIC;
@@ -1772,35 +1488,31 @@ xfs_fs_fill_super(
                goto fail_vnrele;
        }
 
-       mp->m_sync_work.w_syncer = xfs_sync_worker;
-       mp->m_sync_work.w_mount = mp;
-       mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
-       if (IS_ERR(mp->m_sync_task)) {
-               error = -PTR_ERR(mp->m_sync_task);
+       error = xfs_syncd_init(mp);
+       if (error)
                goto fail_vnrele;
-       }
 
-       xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
+       kfree(mtpt);
 
-       kfree(args);
+       xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
        return 0;
 
  out_filestream_unmount:
        xfs_filestream_unmount(mp);
  out_free_sb:
        xfs_freesb(mp);
- out_free_fsname:
-       xfs_free_fsname(mp);
+ out_destroy_counters:
        xfs_icsb_destroy_counters(mp);
        xfs_close_devices(mp);
  out_put_qmops:
        xfs_qmops_put(mp);
  out_put_dmops:
        xfs_dmops_put(mp);
- out_free_mp:
+ out_free_fsname:
+       xfs_free_fsname(mp);
+       kfree(mtpt);
        kfree(mp);
- out_free_args:
-       kfree(args);
+ out:
        return -error;
 
  fail_vnrele:
@@ -1820,8 +1532,6 @@ xfs_fs_fill_super(
        xfs_filestream_unmount(mp);
 
        XFS_bflush(mp->m_ddev_targp);
-       error = xfs_unmount_flush(mp, 0);
-       WARN_ON(error);
 
        xfs_unmountfs(mp);
        goto out_free_sb;
@@ -1882,10 +1592,19 @@ xfs_alloc_trace_bufs(void)
        if (!xfs_bmap_trace_buf)
                goto out_free_alloc_trace;
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
+       xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
+                                            KM_MAYFAIL);
+       if (!xfs_allocbt_trace_buf)
+               goto out_free_bmap_trace;
+
+       xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
+       if (!xfs_inobt_trace_buf)
+               goto out_free_allocbt_trace;
+
        xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
        if (!xfs_bmbt_trace_buf)
-               goto out_free_bmap_trace;
+               goto out_free_inobt_trace;
 #endif
 #ifdef XFS_ATTR_TRACE
        xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
@@ -1907,8 +1626,12 @@ xfs_alloc_trace_bufs(void)
        ktrace_free(xfs_attr_trace_buf);
  out_free_bmbt_trace:
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
        ktrace_free(xfs_bmbt_trace_buf);
+ out_free_inobt_trace:
+       ktrace_free(xfs_inobt_trace_buf);
+ out_free_allocbt_trace:
+       ktrace_free(xfs_allocbt_trace_buf);
  out_free_bmap_trace:
 #endif
 #ifdef XFS_BMAP_TRACE
@@ -1931,8 +1654,10 @@ xfs_free_trace_bufs(void)
 #ifdef XFS_ATTR_TRACE
        ktrace_free(xfs_attr_trace_buf);
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
        ktrace_free(xfs_bmbt_trace_buf);
+       ktrace_free(xfs_inobt_trace_buf);
+       ktrace_free(xfs_allocbt_trace_buf);
 #endif
 #ifdef XFS_BMAP_TRACE
        ktrace_free(xfs_bmap_trace_buf);
@@ -1945,16 +1670,10 @@ xfs_free_trace_bufs(void)
 STATIC int __init
 xfs_init_zones(void)
 {
-       xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
-                                       KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
-                                       KM_ZONE_SPREAD,
-                                       xfs_fs_inode_init_once);
-       if (!xfs_vnode_zone)
-               goto out;
 
        xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
        if (!xfs_ioend_zone)
-               goto out_destroy_vnode_zone;
+               goto out;
 
        xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
                                                  xfs_ioend_zone);
@@ -1970,6 +1689,7 @@ xfs_init_zones(void)
                                                "xfs_bmap_free_item");
        if (!xfs_bmap_free_item_zone)
                goto out_destroy_log_ticket_zone;
+
        xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
                                                "xfs_btree_cur");
        if (!xfs_btree_cur_zone)
@@ -2017,8 +1737,8 @@ xfs_init_zones(void)
 
        xfs_inode_zone =
                kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
-                                       KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
-                                       KM_ZONE_SPREAD, NULL);
+                       KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
+                       xfs_fs_inode_init_once);
        if (!xfs_inode_zone)
                goto out_destroy_efi_zone;
 
@@ -2066,8 +1786,6 @@ xfs_init_zones(void)
        mempool_destroy(xfs_ioend_pool);
  out_destroy_ioend_zone:
        kmem_zone_destroy(xfs_ioend_zone);
- out_destroy_vnode_zone:
-       kmem_zone_destroy(xfs_vnode_zone);
  out:
        return -ENOMEM;
 }
@@ -2092,7 +1810,6 @@ xfs_destroy_zones(void)
        kmem_zone_destroy(xfs_log_ticket_zone);
        mempool_destroy(xfs_ioend_pool);
        kmem_zone_destroy(xfs_ioend_zone);
-       kmem_zone_destroy(xfs_vnode_zone);
 
 }
 
@@ -2100,13 +1817,12 @@ STATIC int __init
 init_xfs_fs(void)
 {
        int                     error;
-       static char             message[] __initdata = KERN_INFO \
-               XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
 
-       printk(message);
+       printk(KERN_INFO XFS_VERSION_STRING " with "
+                        XFS_BUILD_OPTIONS " enabled\n");
 
        ktrace_init(64);
-       vn_init();
+       xfs_ioend_init();
        xfs_dir_startup();
 
        error = xfs_init_zones();
index fe2ef4e6a0f9e50fdb7a69ea8c60deb0e1892b7f..d5d776d4cd6780fe92c389a763cc5295c076ee43 100644 (file)
 
 #include <linux/exportfs.h>
 
-#ifdef CONFIG_XFS_DMAPI
-# define vfs_insertdmapi(vfs)  vfs_insertops(vfsp, &xfs_dmops)
-# define vfs_initdmapi()       dmapi_init()
-# define vfs_exitdmapi()       dmapi_uninit()
-#else
-# define vfs_insertdmapi(vfs)  do { } while (0)
-# define vfs_initdmapi()       do { } while (0)
-# define vfs_exitdmapi()       do { } while (0)
-#endif
-
 #ifdef CONFIG_XFS_QUOTA
-# define vfs_insertquota(vfs)  vfs_insertops(vfsp, &xfs_qmops)
 extern void xfs_qm_init(void);
 extern void xfs_qm_exit(void);
 # define vfs_initquota()       xfs_qm_init()
 # define vfs_exitquota()       xfs_qm_exit()
 #else
-# define vfs_insertquota(vfs)  do { } while (0)
 # define vfs_initquota()       do { } while (0)
 # define vfs_exitquota()       do { } while (0)
 #endif
@@ -101,9 +89,6 @@ struct block_device;
 
 extern __uint64_t xfs_max_file_offset(unsigned int);
 
-extern void xfs_flush_inode(struct xfs_inode *);
-extern void xfs_flush_device(struct xfs_inode *);
-
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
new file mode 100644 (file)
index 0000000..2ed0353
--- /dev/null
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_mru_cache.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_utils.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
+#include "xfs_rw.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/*
+ * Sync all the inodes in the given AG according to the
+ * direction given by the flags.
+ */
+STATIC int
+xfs_sync_inodes_ag(
+       xfs_mount_t     *mp,
+       int             ag,
+       int             flags)
+{
+       xfs_perag_t     *pag = &mp->m_perag[ag];
+       int             nr_found;
+       uint32_t        first_index = 0;
+       int             error = 0;
+       int             last_error = 0;
+       int             fflag = XFS_B_ASYNC;
+
+       if (flags & SYNC_DELWRI)
+               fflag = XFS_B_DELWRI;
+       if (flags & SYNC_WAIT)
+               fflag = 0;              /* synchronous overrides all */
+
+       do {
+               struct inode    *inode;
+               xfs_inode_t     *ip = NULL;
+               int             lock_flags = XFS_ILOCK_SHARED;
+
+               /*
+                * use a gang lookup to find the next inode in the tree
+                * as the tree is sparse and a gang lookup walks to find
+                * the number of objects requested.
+                */
+               read_lock(&pag->pag_ici_lock);
+               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                               (void**)&ip, first_index, 1);
+
+               if (!nr_found) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /*
+                * Update the index for the next lookup. Catch overflows
+                * into the next AG range which can occur if we have inodes
+                * in the last block of the AG and we are currently
+                * pointing to the last inode.
+                */
+               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /* nothing to sync during shutdown */
+               if (XFS_FORCED_SHUTDOWN(mp)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       return 0;
+               }
+
+               /*
+                * If we can't get a reference on the inode, it must be
+                * in reclaim. Leave it for the reclaim code to flush.
+                */
+               inode = VFS_I(ip);
+               if (!igrab(inode)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       continue;
+               }
+               read_unlock(&pag->pag_ici_lock);
+
+               /* avoid new or bad inodes */
+               if (is_bad_inode(inode) ||
+                   xfs_iflags_test(ip, XFS_INEW)) {
+                       IRELE(ip);
+                       continue;
+               }
+
+               /*
+                * If we have to flush data or wait for I/O completion
+                * we need to hold the iolock.
+                */
+               if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
+                       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+                       lock_flags |= XFS_IOLOCK_SHARED;
+                       error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
+                       if (flags & SYNC_IOWAIT)
+                               xfs_ioend_wait(ip);
+               }
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+               if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
+                       if (flags & SYNC_WAIT) {
+                               xfs_iflock(ip);
+                               if (!xfs_inode_clean(ip))
+                                       error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+                               else
+                                       xfs_ifunlock(ip);
+                       } else if (xfs_iflock_nowait(ip)) {
+                               if (!xfs_inode_clean(ip))
+                                       error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
+                               else
+                                       xfs_ifunlock(ip);
+                       }
+               }
+               xfs_iput(ip, lock_flags);
+
+               if (error)
+                       last_error = error;
+               /*
+                * bail out if the filesystem is corrupted.
+                */
+               if (error == EFSCORRUPTED)
+                       return XFS_ERROR(error);
+
+       } while (nr_found);
+
+       return last_error;
+}
+
+int
+xfs_sync_inodes(
+       xfs_mount_t     *mp,
+       int             flags)
+{
+       int             error;
+       int             last_error;
+       int             i;
+       int             lflags = XFS_LOG_FORCE;
+
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               return 0;
+       error = 0;
+       last_error = 0;
+
+       if (flags & SYNC_WAIT)
+               lflags |= XFS_LOG_SYNC;
+
+       for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+               if (!mp->m_perag[i].pag_ici_init)
+                       continue;
+               error = xfs_sync_inodes_ag(mp, i, flags);
+               if (error)
+                       last_error = error;
+               if (error == EFSCORRUPTED)
+                       break;
+       }
+       if (flags & SYNC_DELWRI)
+               xfs_log_force(mp, 0, lflags);
+
+       return XFS_ERROR(last_error);
+}
+
+STATIC int
+xfs_commit_dummy_trans(
+       struct xfs_mount        *mp,
+       uint                    log_flags)
+{
+       struct xfs_inode        *ip = mp->m_rootip;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       /*
+        * Put a dummy transaction in the log to tell recovery
+        * that all others are OK.
+        */
+       tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+       xfs_trans_ihold(tp, ip);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       /* XXX(hch): ignoring the error here.. */
+       error = xfs_trans_commit(tp, 0);
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       xfs_log_force(mp, 0, log_flags);
+       return 0;
+}
+
+int
+xfs_sync_fsdata(
+       struct xfs_mount        *mp,
+       int                     flags)
+{
+       struct xfs_buf          *bp;
+       struct xfs_buf_log_item *bip;
+       int                     error = 0;
+
+       /*
+        * If this is xfssyncd() then only sync the superblock if we can
+        * lock it without sleeping and it is not pinned.
+        */
+       if (flags & SYNC_BDFLUSH) {
+               ASSERT(!(flags & SYNC_WAIT));
+
+               bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+               if (!bp)
+                       goto out;
+
+               bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
+               if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
+                       goto out_brelse;
+       } else {
+               bp = xfs_getsb(mp, 0);
+
+               /*
+                * If the buffer is pinned then push on the log so we won't
+                * get stuck waiting in the write for someone, maybe
+                * ourselves, to flush the log.
+                *
+                * Even though we just pushed the log above, we did not have
+                * the superblock buffer locked at that point so it can
+                * become pinned in between there and here.
+                */
+               if (XFS_BUF_ISPINNED(bp))
+                       xfs_log_force(mp, 0, XFS_LOG_FORCE);
+       }
+
+
+       if (flags & SYNC_WAIT)
+               XFS_BUF_UNASYNC(bp);
+       else
+               XFS_BUF_ASYNC(bp);
+
+       return xfs_bwrite(mp, bp);
+
+ out_brelse:
+       xfs_buf_relse(bp);
+ out:
+       return error;
+}
+
+/*
+ * When remounting a filesystem read-only or freezing the filesystem, we have
+ * two phases to execute. This first phase is syncing the data before we
+ * quiesce the filesystem, and the second is flushing all the inodes out after
+ * we've waited for all the transactions created by the first phase to
+ * complete. The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ */
+/*
+ * First stage of freeze - no writers will make progress now we are here,
+ * so we flush delwri and delalloc buffers here, then wait for all I/O to
+ * complete.  Data is frozen at that point. Metadata is not frozen,
+ * transactions can still occur here so don't bother flushing the buftarg
+ * because it'll just get dirty again.
+ */
+int
+xfs_quiesce_data(
+       struct xfs_mount        *mp)
+{
+       int error;
+
+       /* push non-blocking */
+       xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
+       XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+       xfs_filestream_flush(mp);
+
+       /* push and block */
+       xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
+       XFS_QM_DQSYNC(mp, SYNC_WAIT);
+
+       /* write superblock and hoover up shutdown errors */
+       error = xfs_sync_fsdata(mp, 0);
+
+       /* flush data-only devices */
+       if (mp->m_rtdev_targp)
+               XFS_bflush(mp->m_rtdev_targp);
+
+       return error;
+}
+
+STATIC void
+xfs_quiesce_fs(
+       struct xfs_mount        *mp)
+{
+       int     count = 0, pincount;
+
+       xfs_flush_buftarg(mp->m_ddev_targp, 0);
+       xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+
+       /*
+        * This loop must run at least twice.  The first instance of the loop
+        * will flush most meta data but that will generate more meta data
+        * (typically directory updates).  Which then must be flushed and
+        * logged before we can write the unmount record.
+        */
+       do {
+               xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
+               pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+               if (!pincount) {
+                       delay(50);
+                       count++;
+               }
+       } while (count < 2);
+}
+
+/*
+ * Second stage of a quiesce. The data is already synced, now we have to take
+ * care of the metadata. New transactions are already blocked, so we need to
+ * wait for any remaining transactions to drain out before proceding.
+ */
+void
+xfs_quiesce_attr(
+       struct xfs_mount        *mp)
+{
+       int     error = 0;
+
+       /* wait for all modifications to complete */
+       while (atomic_read(&mp->m_active_trans) > 0)
+               delay(100);
+
+       /* flush inodes and push all remaining buffers out to disk */
+       xfs_quiesce_fs(mp);
+
+       ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
+
+       /* Push the superblock and write an unmount record */
+       error = xfs_log_sbcount(mp, 1);
+       if (error)
+               xfs_fs_cmn_err(CE_WARN, mp,
+                               "xfs_attr_quiesce: failed to log sb changes. "
+                               "Frozen image may not be consistent.");
+       xfs_log_unmount_write(mp);
+       xfs_unmountfs_writesb(mp);
+}
+
+/*
+ * Enqueue a work item to be picked up by the vfs xfssyncd thread.
+ * Doing this has two advantages:
+ * - It saves on stack space, which is tight in certain situations
+ * - It can be used (with care) as a mechanism to avoid deadlocks.
+ * Flushing while allocating in a full filesystem requires both.
+ */
+STATIC void
+xfs_syncd_queue_work(
+       struct xfs_mount *mp,
+       void            *data,
+       void            (*syncer)(struct xfs_mount *, void *))
+{
+       struct bhv_vfs_sync_work *work;
+
+       work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+       INIT_LIST_HEAD(&work->w_list);
+       work->w_syncer = syncer;
+       work->w_data = data;
+       work->w_mount = mp;
+       spin_lock(&mp->m_sync_lock);
+       list_add_tail(&work->w_list, &mp->m_sync_list);
+       spin_unlock(&mp->m_sync_lock);
+       wake_up_process(mp->m_sync_task);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations.  At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room...
+ */
+STATIC void
+xfs_flush_inode_work(
+       struct xfs_mount *mp,
+       void            *arg)
+{
+       struct inode    *inode = arg;
+       filemap_flush(inode->i_mapping);
+       iput(inode);
+}
+
+void
+xfs_flush_inode(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
+
+       igrab(inode);
+       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
+       delay(msecs_to_jiffies(500));
+}
+
+/*
+ * This is the "bigger hammer" version of xfs_flush_inode_work...
+ * (IOW, "If at first you don't succeed, use a Bigger Hammer").
+ */
+STATIC void
+xfs_flush_device_work(
+       struct xfs_mount *mp,
+       void            *arg)
+{
+       struct inode    *inode = arg;
+       sync_blockdev(mp->m_super->s_bdev);
+       iput(inode);
+}
+
+void
+xfs_flush_device(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
+
+       igrab(inode);
+       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
+       delay(msecs_to_jiffies(500));
+       xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+}
+
+/*
+ * Every sync period we need to unpin all items, reclaim inodes, sync
+ * quota and write out the superblock. We might need to cover the log
+ * to indicate it is idle.
+ */
+STATIC void
+xfs_sync_worker(
+       struct xfs_mount *mp,
+       void            *unused)
+{
+       int             error;
+
+       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+               xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+               xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+               /* dgc: errors ignored here */
+               error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+               error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
+               if (xfs_log_need_covered(mp))
+                       error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
+       }
+       mp->m_sync_seq++;
+       wake_up(&mp->m_wait_single_sync_task);
+}
+
+STATIC int
+xfssyncd(
+       void                    *arg)
+{
+       struct xfs_mount        *mp = arg;
+       long                    timeleft;
+       bhv_vfs_sync_work_t     *work, *n;
+       LIST_HEAD               (tmp);
+
+       set_freezable();
+       timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
+       for (;;) {
+               timeleft = schedule_timeout_interruptible(timeleft);
+               /* swsusp */
+               try_to_freeze();
+               if (kthread_should_stop() && list_empty(&mp->m_sync_list))
+                       break;
+
+               spin_lock(&mp->m_sync_lock);
+               /*
+                * We can get woken by laptop mode, to do a sync -
+                * that's the (only!) case where the list would be
+                * empty with time remaining.
+                */
+               if (!timeleft || list_empty(&mp->m_sync_list)) {
+                       if (!timeleft)
+                               timeleft = xfs_syncd_centisecs *
+                                                       msecs_to_jiffies(10);
+                       INIT_LIST_HEAD(&mp->m_sync_work.w_list);
+                       list_add_tail(&mp->m_sync_work.w_list,
+                                       &mp->m_sync_list);
+               }
+               list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
+                       list_move(&work->w_list, &tmp);
+               spin_unlock(&mp->m_sync_lock);
+
+               list_for_each_entry_safe(work, n, &tmp, w_list) {
+                       (*work->w_syncer)(mp, work->w_data);
+                       list_del(&work->w_list);
+                       if (work == &mp->m_sync_work)
+                               continue;
+                       kmem_free(work);
+               }
+       }
+
+       return 0;
+}
+
+int
+xfs_syncd_init(
+       struct xfs_mount        *mp)
+{
+       mp->m_sync_work.w_syncer = xfs_sync_worker;
+       mp->m_sync_work.w_mount = mp;
+       mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
+       if (IS_ERR(mp->m_sync_task))
+               return -PTR_ERR(mp->m_sync_task);
+       return 0;
+}
+
+void
+xfs_syncd_stop(
+       struct xfs_mount        *mp)
+{
+       kthread_stop(mp->m_sync_task);
+}
+
+int
+xfs_reclaim_inode(
+       xfs_inode_t     *ip,
+       int             locked,
+       int             sync_mode)
+{
+       xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
+
+       /* The hash lock here protects a thread in xfs_iget_core from
+        * racing with us on linking the inode back with a vnode.
+        * Once we have the XFS_IRECLAIM flag set it will not touch
+        * us.
+        */
+       write_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
+           !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+               spin_unlock(&ip->i_flags_lock);
+               write_unlock(&pag->pag_ici_lock);
+               if (locked) {
+                       xfs_ifunlock(ip);
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               }
+               return 1;
+       }
+       __xfs_iflags_set(ip, XFS_IRECLAIM);
+       spin_unlock(&ip->i_flags_lock);
+       write_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(ip->i_mount, pag);
+
+       /*
+        * If the inode is still dirty, then flush it out.  If the inode
+        * is not in the AIL, then it will be OK to flush it delwri as
+        * long as xfs_iflush() does not keep any references to the inode.
+        * We leave that decision up to xfs_iflush() since it has the
+        * knowledge of whether it's OK to simply do a delwri flush of
+        * the inode or whether we need to wait until the inode is
+        * pulled from the AIL.
+        * We get the flush lock regardless, though, just to make sure
+        * we don't free it while it is being flushed.
+        */
+       if (!locked) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_iflock(ip);
+       }
+
+       /*
+        * In the case of a forced shutdown we rely on xfs_iflush() to
+        * wait for the inode to be unpinned before returning an error.
+        */
+       if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+               /* synchronize with xfs_iflush_done */
+               xfs_iflock(ip);
+               xfs_ifunlock(ip);
+       }
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       xfs_ireclaim(ip);
+       return 0;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
+
+       read_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       radix_tree_tag_set(&pag->pag_ici_root,
+                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+       spin_unlock(&ip->i_flags_lock);
+       read_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
+}
+
+void
+__xfs_inode_clear_reclaim_tag(
+       xfs_mount_t     *mp,
+       xfs_perag_t     *pag,
+       xfs_inode_t     *ip)
+{
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+}
+
+void
+xfs_inode_clear_reclaim_tag(
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
+
+       read_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+       spin_unlock(&ip->i_flags_lock);
+       read_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
+}
+
+
+STATIC void
+xfs_reclaim_inodes_ag(
+       xfs_mount_t     *mp,
+       int             ag,
+       int             noblock,
+       int             mode)
+{
+       xfs_inode_t     *ip = NULL;
+       xfs_perag_t     *pag = &mp->m_perag[ag];
+       int             nr_found;
+       uint32_t        first_index;
+       int             skipped;
+
+restart:
+       first_index = 0;
+       skipped = 0;
+       do {
+               /*
+                * use a gang lookup to find the next inode in the tree
+                * as the tree is sparse and a gang lookup walks to find
+                * the number of objects requested.
+                */
+               read_lock(&pag->pag_ici_lock);
+               nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+                                       (void**)&ip, first_index, 1,
+                                       XFS_ICI_RECLAIM_TAG);
+
+               if (!nr_found) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /*
+                * Update the index for the next lookup. Catch overflows
+                * into the next AG range which can occur if we have inodes
+                * in the last block of the AG and we are currently
+                * pointing to the last inode.
+                */
+               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /* ignore if already under reclaim */
+               if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       continue;
+               }
+
+               if (noblock) {
+                       if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+                               read_unlock(&pag->pag_ici_lock);
+                               continue;
+                       }
+                       if (xfs_ipincount(ip) ||
+                           !xfs_iflock_nowait(ip)) {
+                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                               read_unlock(&pag->pag_ici_lock);
+                               continue;
+                       }
+               }
+               read_unlock(&pag->pag_ici_lock);
+
+               /*
+                * hmmm - this is an inode already in reclaim. Do
+                * we even bother catching it here?
+                */
+               if (xfs_reclaim_inode(ip, noblock, mode))
+                       skipped++;
+       } while (nr_found);
+
+       if (skipped) {
+               delay(1);
+               goto restart;
+       }
+       return;
+
+}
+
+int
+xfs_reclaim_inodes(
+       xfs_mount_t     *mp,
+       int              noblock,
+       int             mode)
+{
+       int             i;
+
+       for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+               if (!mp->m_perag[i].pag_ici_init)
+                       continue;
+               xfs_reclaim_inodes_ag(mp, i, noblock, mode);
+       }
+       return 0;
+}
+
+
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
new file mode 100644 (file)
index 0000000..5f6de1e
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+
+typedef struct bhv_vfs_sync_work {
+       struct list_head        w_list;
+       struct xfs_mount        *w_mount;
+       void                    *w_data;        /* syncer routine argument */
+       void                    (*w_syncer)(struct xfs_mount *, void *);
+} bhv_vfs_sync_work_t;
+
+#define SYNC_ATTR              0x0001  /* sync attributes */
+#define SYNC_DELWRI            0x0002  /* look at delayed writes */
+#define SYNC_WAIT              0x0004  /* wait for i/o to complete */
+#define SYNC_BDFLUSH           0x0008  /* BDFLUSH is calling -- don't block */
+#define SYNC_IOWAIT            0x0010  /* wait for all I/O to complete */
+
+int xfs_syncd_init(struct xfs_mount *mp);
+void xfs_syncd_stop(struct xfs_mount *mp);
+
+int xfs_sync_inodes(struct xfs_mount *mp, int flags);
+int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
+
+int xfs_quiesce_data(struct xfs_mount *mp);
+void xfs_quiesce_attr(struct xfs_mount *mp);
+
+void xfs_flush_inode(struct xfs_inode *ip);
+void xfs_flush_device(struct xfs_inode *ip);
+
+int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
+int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
+                               struct xfs_inode *ip);
+#endif
index 7dacb5bbde3f30123da154cc1fb3563046b694af..916c0ffb6083de2be56fdf56656ee199e248df73 100644 (file)
@@ -55,17 +55,6 @@ xfs_stats_clear_proc_handler(
 #endif /* CONFIG_PROC_FS */
 
 static ctl_table xfs_table[] = {
-       {
-               .ctl_name       = XFS_RESTRICT_CHOWN,
-               .procname       = "restrict_chown",
-               .data           = &xfs_params.restrict_chown.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_minmax,
-               .strategy       = &sysctl_intvec,
-               .extra1         = &xfs_params.restrict_chown.min,
-               .extra2         = &xfs_params.restrict_chown.max
-       },
        {
                .ctl_name       = XFS_SGID_INHERIT,
                .procname       = "irix_sgid_inherit",
index 4aadb8056c373c95b0a867e40397234e505d10ae..b9937d450f8e6d72e01e186c09f446c791028ef2 100644 (file)
@@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val {
 } xfs_sysctl_val_t;
 
 typedef struct xfs_param {
-       xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
        xfs_sysctl_val_t sgid_inherit;  /* Inherit S_ISGID if process' GID is
                                         * not a member of parent dir GID. */
        xfs_sysctl_val_t symlink_mode;  /* Link creat mode affected by umask */
@@ -68,7 +67,7 @@ typedef struct xfs_param {
 enum {
        /* XFS_REFCACHE_SIZE = 1 */
        /* XFS_REFCACHE_PURGE = 2 */
-       XFS_RESTRICT_CHOWN = 3,
+       /* XFS_RESTRICT_CHOWN = 3 */
        XFS_SGID_INHERIT = 4,
        XFS_SYMLINK_MODE = 5,
        XFS_PANIC_MASK = 6,
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
deleted file mode 100644 (file)
index 7e60c77..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_VFS_H__
-#define __XFS_VFS_H__
-
-#include <linux/vfs.h>
-#include "xfs_fs.h"
-
-struct inode;
-
-struct fid;
-struct cred;
-struct seq_file;
-struct super_block;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_mount_args;
-
-typedef struct kstatfs bhv_statvfs_t;
-
-typedef struct bhv_vfs_sync_work {
-       struct list_head        w_list;
-       struct xfs_mount        *w_mount;
-       void                    *w_data;        /* syncer routine argument */
-       void                    (*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
-
-#define SYNC_ATTR              0x0001  /* sync attributes */
-#define SYNC_CLOSE             0x0002  /* close file system down */
-#define SYNC_DELWRI            0x0004  /* look at delayed writes */
-#define SYNC_WAIT              0x0008  /* wait for i/o to complete */
-#define SYNC_BDFLUSH           0x0010  /* BDFLUSH is calling -- don't block */
-#define SYNC_FSDATA            0x0020  /* flush fs data (e.g. superblocks) */
-#define SYNC_REFCACHE          0x0040  /* prune some of the nfs ref cache */
-#define SYNC_REMOUNT           0x0080  /* remount readonly, no dummy LRs */
-#define SYNC_IOWAIT            0x0100  /* wait for all I/O to complete */
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem,
- * we have two phases to execute. This first phase is syncing the data
- * before we quiesce the fielsystem, and the second is flushing all the
- * inodes out after we've waited for all the transactions created by
- * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE
- * to ensure that the inodes are written to their location on disk
- * rather than just existing in transactions in the log. This means
- * after a quiesce there is no log replay required to write the inodes
- * to disk (this is the main difference between a sync and a quiesce).
- */
-#define SYNC_DATA_QUIESCE      (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT)
-#define SYNC_INODE_QUIESCE     (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
-
-#define SHUTDOWN_META_IO_ERROR 0x0001  /* write attempt to metadata failed */
-#define SHUTDOWN_LOG_IO_ERROR  0x0002  /* write attempt to the log failed */
-#define SHUTDOWN_FORCE_UMOUNT  0x0004  /* shutdown from a forced unmount */
-#define SHUTDOWN_CORRUPT_INCORE        0x0008  /* corrupt in-memory data structures */
-#define SHUTDOWN_REMOTE_REQ    0x0010  /* shutdown came from remote cell */
-#define SHUTDOWN_DEVICE_REQ    0x0020  /* failed all paths to the device */
-
-#define xfs_test_for_freeze(mp)                ((mp)->m_super->s_frozen)
-#define xfs_wait_for_freeze(mp,l)      vfs_check_frozen((mp)->m_super, (l))
-
-#endif /* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
deleted file mode 100644 (file)
index b52528b..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-
-/*
- * And this gunk is needed for xfs_mount.h"
- */
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dmapi.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-
-
-/*
- * Dedicated vnode inactive/reclaim sync wait queues.
- * Prime number of hash buckets since address is used as the key.
- */
-#define NVSYNC                  37
-#define vptosync(v)             (&vsync[((unsigned long)v) % NVSYNC])
-static wait_queue_head_t vsync[NVSYNC];
-
-void __init
-vn_init(void)
-{
-       int i;
-
-       for (i = 0; i < NVSYNC; i++)
-               init_waitqueue_head(&vsync[i]);
-}
-
-void
-vn_iowait(
-       xfs_inode_t     *ip)
-{
-       wait_queue_head_t *wq = vptosync(ip);
-
-       wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
-}
-
-void
-vn_iowake(
-       xfs_inode_t     *ip)
-{
-       if (atomic_dec_and_test(&ip->i_iocount))
-               wake_up(vptosync(ip));
-}
-
-/*
- * Volume managers supporting multiple paths can send back ENODEV when the
- * final path disappears.  In this case continuing to fill the page cache
- * with dirty data which cannot be written out is evil, so prevent that.
- */
-void
-vn_ioerror(
-       xfs_inode_t     *ip,
-       int             error,
-       char            *f,
-       int             l)
-{
-       if (unlikely(error == -ENODEV))
-               xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
-}
-
-#ifdef XFS_INODE_TRACE
-
-/*
- * Reference count of Linux inode if present, -1 if the xfs_inode
- * has no associated Linux inode.
- */
-static inline int xfs_icount(struct xfs_inode *ip)
-{
-       struct inode *vp = VFS_I(ip);
-
-       if (vp)
-               return vn_count(vp);
-       return -1;
-}
-
-#define KTRACE_ENTER(ip, vk, s, line, ra)                      \
-       ktrace_enter(   (ip)->i_trace,                          \
-/*  0 */               (void *)(__psint_t)(vk),                \
-/*  1 */               (void *)(s),                            \
-/*  2 */               (void *)(__psint_t) line,               \
-/*  3 */               (void *)(__psint_t)xfs_icount(ip),      \
-/*  4 */               (void *)(ra),                           \
-/*  5 */               NULL,                                   \
-/*  6 */               (void *)(__psint_t)current_cpu(),       \
-/*  7 */               (void *)(__psint_t)current_pid(),       \
-/*  8 */               (void *)__return_address,               \
-/*  9 */               NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-
-/*
- * Vnode tracing code.
- */
-void
-_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
-{
-       KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
-}
-
-void
-_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
-{
-       KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
-}
-
-void
-xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
-       KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
-}
-
-void
-_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
-       KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
-}
-
-void
-xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
-       KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
-}
-#endif /* XFS_INODE_TRACE */
index 683ce16210ffefe8ee69dd54c3c85e46b29331ae..f65983a230d3610b94c192d880dac5a302200c9e 100644 (file)
 #ifndef __XFS_VNODE_H__
 #define __XFS_VNODE_H__
 
+#include "xfs_fs.h"
+
 struct file;
+struct xfs_inode;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
 
@@ -51,40 +54,6 @@ struct attrlist_cursor_kern;
                                           Prevent VM access to the pages until
                                           the operation completes. */
 
-
-extern void    vn_init(void);
-
-/*
- * Yeah, these don't take vnode anymore at all, all this should be
- * cleaned up at some point.
- */
-extern void    vn_iowait(struct xfs_inode *ip);
-extern void    vn_iowake(struct xfs_inode *ip);
-extern void    vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
-
-static inline int vn_count(struct inode *vp)
-{
-       return atomic_read(&vp->i_count);
-}
-
-#define IHOLD(ip) \
-do { \
-       ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
-       atomic_inc(&(VFS_I(ip)->i_count)); \
-       xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
-} while (0)
-
-#define IRELE(ip) \
-do { \
-       xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
-       iput(VFS_I(ip)); \
-} while (0)
-
-static inline struct inode *vn_grab(struct inode *vp)
-{
-       return igrab(vp);
-}
-
 /*
  * Dealing with bad inodes
  */
@@ -121,39 +90,4 @@ static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
                                        PAGECACHE_TAG_DIRTY)
 
 
-/*
- * Tracking vnode activity.
- */
-#if defined(XFS_INODE_TRACE)
-
-#define        INODE_TRACE_SIZE        16              /* number of trace entries */
-#define        INODE_KTRACE_ENTRY      1
-#define        INODE_KTRACE_EXIT       2
-#define        INODE_KTRACE_HOLD       3
-#define        INODE_KTRACE_REF        4
-#define        INODE_KTRACE_RELE       5
-
-extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
-extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
-extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
-extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
-extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
-#define xfs_itrace_entry(ip)   \
-       _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
-#define xfs_itrace_exit(ip)    \
-       _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
-#define xfs_itrace_exit_tag(ip, tag)   \
-       _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
-#define xfs_itrace_ref(ip)     \
-       _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
-
-#else
-#define        xfs_itrace_entry(a)
-#define        xfs_itrace_exit(a)
-#define        xfs_itrace_exit_tag(a, b)
-#define        xfs_itrace_hold(a, b, c, d)
-#define        xfs_itrace_ref(a)
-#define        xfs_itrace_rele(a, b, c, d)
-#endif
-
 #endif /* __XFS_VNODE_H__ */
index f2705f2fd43c264674660da0285adb5e5a2c39db..591ca6602bfb6738e4677a43c094aa62d5238254 100644 (file)
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
        if (brandnewdquot) {
                dqp->dq_flnext = dqp->dq_flprev = dqp;
                mutex_init(&dqp->q_qlock);
-               sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
+               init_waitqueue_head(&dqp->q_pinwait);
 
                /*
                 * Because we want to use a counting completion, complete
@@ -131,7 +131,7 @@ xfs_qm_dqinit(
                 dqp->q_res_bcount = 0;
                 dqp->q_res_icount = 0;
                 dqp->q_res_rtbcount = 0;
-                dqp->q_pincount = 0;
+                atomic_set(&dqp->q_pincount, 0);
                 dqp->q_hash = NULL;
                 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
 
@@ -1221,16 +1221,14 @@ xfs_qm_dqflush(
        xfs_dqtrace_entry(dqp, "DQFLUSH");
 
        /*
-        * If not dirty, nada.
+        * If not dirty, or it's pinned and we are not supposed to
+        * block, nada.
         */
-       if (!XFS_DQ_IS_DIRTY(dqp)) {
+       if (!XFS_DQ_IS_DIRTY(dqp) ||
+           (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
                xfs_dqfunlock(dqp);
-               return (0);
+               return 0;
        }
-
-       /*
-        * Cant flush a pinned dquot. Wait for it.
-        */
        xfs_qm_dqunpin_wait(dqp);
 
        /*
@@ -1274,10 +1272,8 @@ xfs_qm_dqflush(
        dqp->dq_flags &= ~(XFS_DQ_DIRTY);
        mp = dqp->q_mount;
 
-       /* lsn is 64 bits */
-       spin_lock(&mp->m_ail_lock);
-       dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
-       spin_unlock(&mp->m_ail_lock);
+       xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
+                                       &dqp->q_logitem.qli_item.li_lsn);
 
        /*
         * Attach an iodone routine so that we can remove this dquot from the
@@ -1323,8 +1319,10 @@ xfs_qm_dqflush_done(
        xfs_dq_logitem_t        *qip)
 {
        xfs_dquot_t             *dqp;
+       struct xfs_ail          *ailp;
 
        dqp = qip->qli_dquot;
+       ailp = qip->qli_item.li_ailp;
 
        /*
         * We only want to pull the item from the AIL if its
@@ -1337,15 +1335,12 @@ xfs_qm_dqflush_done(
        if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
            qip->qli_item.li_lsn == qip->qli_flush_lsn) {
 
-               spin_lock(&dqp->q_mount->m_ail_lock);
-               /*
-                * xfs_trans_delete_ail() drops the AIL lock.
-                */
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               spin_lock(&ailp->xa_lock);
                if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
-                       xfs_trans_delete_ail(dqp->q_mount,
-                                            (xfs_log_item_t*)qip);
+                       xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
                else
-                       spin_unlock(&dqp->q_mount->m_ail_lock);
+                       spin_unlock(&ailp->xa_lock);
        }
 
        /*
@@ -1375,7 +1370,7 @@ xfs_dqunlock(
        mutex_unlock(&(dqp->q_qlock));
        if (dqp->q_logitem.qli_dquot == dqp) {
                /* Once was dqp->q_mount, but might just have been cleared */
-               xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
+               xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
                                        (xfs_log_item_t*)&(dqp->q_logitem));
        }
 }
@@ -1489,7 +1484,7 @@ xfs_qm_dqpurge(
                                "xfs_qm_dqpurge: dquot %p flush failed", dqp);
                xfs_dqflock(dqp);
        }
-       ASSERT(dqp->q_pincount == 0);
+       ASSERT(atomic_read(&dqp->q_pincount) == 0);
        ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
               !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
 
index 8958d0faf8d37606c806a1ce22d9c996036bdf08..7e455337e2ba55038199a8afeb1ac85a70786d9c 100644 (file)
@@ -83,8 +83,8 @@ typedef struct xfs_dquot {
        xfs_qcnt_t       q_res_rtbcount;/* total realtime blks used+reserved */
        mutex_t          q_qlock;       /* quota lock */
        struct completion q_flush;      /* flush completion queue */
-       uint             q_pincount;    /* pin count for this dquot */
-       sv_t             q_pinwait;     /* sync var for pinning */
+       atomic_t          q_pincount;   /* dquot pin count */
+       wait_queue_head_t q_pinwait;    /* dquot pinning wait queue */
 #ifdef XFS_DQUOT_TRACE
        struct ktrace   *q_trace;       /* trace header structure */
 #endif
index f028644caa5eb090e93620aae600bd6d9d87d26d..1728f6a7c4f50d4a37ee22900140b1a02e90c7d8 100644 (file)
@@ -88,25 +88,22 @@ xfs_qm_dquot_logitem_format(
 
 /*
  * Increment the pin count of the given dquot.
- * This value is protected by pinlock spinlock in the xQM structure.
  */
 STATIC void
 xfs_qm_dquot_logitem_pin(
        xfs_dq_logitem_t *logitem)
 {
-       xfs_dquot_t *dqp;
+       xfs_dquot_t *dqp = logitem->qli_dquot;
 
-       dqp = logitem->qli_dquot;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
-       dqp->q_pincount++;
-       spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
+       atomic_inc(&dqp->q_pincount);
 }
 
 /*
  * Decrement the pin count of the given dquot, and wake up
  * anyone in xfs_dqwait_unpin() if the count goes to 0.         The
- * dquot must have been previously pinned with a call to xfs_dqpin().
+ * dquot must have been previously pinned with a call to
+ * xfs_qm_dquot_logitem_pin().
  */
 /* ARGSUSED */
 STATIC void
@@ -114,16 +111,11 @@ xfs_qm_dquot_logitem_unpin(
        xfs_dq_logitem_t *logitem,
        int               stale)
 {
-       xfs_dquot_t *dqp;
+       xfs_dquot_t *dqp = logitem->qli_dquot;
 
-       dqp = logitem->qli_dquot;
-       ASSERT(dqp->q_pincount > 0);
-       spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
-       dqp->q_pincount--;
-       if (dqp->q_pincount == 0) {
-               sv_broadcast(&dqp->q_pinwait);
-       }
-       spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
+       ASSERT(atomic_read(&dqp->q_pincount) > 0);
+       if (atomic_dec_and_test(&dqp->q_pincount))
+               wake_up(&dqp->q_pinwait);
 }
 
 /* ARGSUSED */
@@ -193,21 +185,14 @@ xfs_qm_dqunpin_wait(
        xfs_dquot_t     *dqp)
 {
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       if (dqp->q_pincount == 0) {
+       if (atomic_read(&dqp->q_pincount) == 0)
                return;
-       }
 
        /*
         * Give the log a push so we don't wait here too long.
         */
        xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
-       spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
-       if (dqp->q_pincount == 0) {
-               spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
-               return;
-       }
-       sv_wait(&(dqp->q_pinwait), PINOD,
-               &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
+       wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
 }
 
 /*
@@ -310,7 +295,7 @@ xfs_qm_dquot_logitem_trylock(
        uint                    retval;
 
        dqp = qip->qli_dquot;
-       if (dqp->q_pincount > 0)
+       if (atomic_read(&dqp->q_pincount) > 0)
                return (XFS_ITEM_PINNED);
 
        if (! xfs_qm_dqlock_nowait(dqp))
@@ -568,14 +553,16 @@ xfs_qm_qoffend_logitem_committed(
        xfs_lsn_t lsn)
 {
        xfs_qoff_logitem_t      *qfs;
+       struct xfs_ail          *ailp;
 
        qfs = qfe->qql_start_lip;
-       spin_lock(&qfs->qql_item.li_mountp->m_ail_lock);
+       ailp = qfs->qql_item.li_ailp;
+       spin_lock(&ailp->xa_lock);
        /*
         * Delete the qoff-start logitem from the AIL.
-        * xfs_trans_delete_ail() drops the AIL lock.
+        * xfs_trans_ail_delete() drops the AIL lock.
         */
-       xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs);
+       xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
        kmem_free(qfs);
        kmem_free(qfe);
        return (xfs_lsn_t)-1;
index df0ffef9775ae75d54c647a7b56c4ba248028a01..6b13960cf318cf5c518c6a1c2f366b066a1f1573 100644 (file)
@@ -20,7 +20,6 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_clnt.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
@@ -396,13 +395,10 @@ xfs_qm_mount_quotas(
 /*
  * Called from the vfsops layer.
  */
-int
+void
 xfs_qm_unmount_quotas(
        xfs_mount_t     *mp)
 {
-       xfs_inode_t     *uqp, *gqp;
-       int             error = 0;
-
        /*
         * Release the dquots that root inode, et al might be holding,
         * before we flush quotas and blow away the quotainfo structure.
@@ -415,43 +411,18 @@ xfs_qm_unmount_quotas(
                xfs_qm_dqdetach(mp->m_rsumip);
 
        /*
-        * Flush out the quota inodes.
+        * Release the quota inodes.
         */
-       uqp = gqp = NULL;
        if (mp->m_quotainfo) {
-               if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) {
-                       xfs_ilock(uqp, XFS_ILOCK_EXCL);
-                       xfs_iflock(uqp);
-                       error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
-                       xfs_iunlock(uqp, XFS_ILOCK_EXCL);
-                       if (unlikely(error == EFSCORRUPTED)) {
-                               XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
-                                                XFS_ERRLEVEL_LOW, mp);
-                               goto out;
-                       }
+               if (mp->m_quotainfo->qi_uquotaip) {
+                       IRELE(mp->m_quotainfo->qi_uquotaip);
+                       mp->m_quotainfo->qi_uquotaip = NULL;
                }
-               if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) {
-                       xfs_ilock(gqp, XFS_ILOCK_EXCL);
-                       xfs_iflock(gqp);
-                       error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
-                       xfs_iunlock(gqp, XFS_ILOCK_EXCL);
-                       if (unlikely(error == EFSCORRUPTED)) {
-                               XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
-                                                XFS_ERRLEVEL_LOW, mp);
-                               goto out;
-                       }
+               if (mp->m_quotainfo->qi_gquotaip) {
+                       IRELE(mp->m_quotainfo->qi_gquotaip);
+                       mp->m_quotainfo->qi_gquotaip = NULL;
                }
        }
-       if (uqp) {
-                IRELE(uqp);
-                mp->m_quotainfo->qi_uquotaip = NULL;
-       }
-       if (gqp) {
-               IRELE(gqp);
-               mp->m_quotainfo->qi_gquotaip = NULL;
-       }
-out:
-       return XFS_ERROR(error);
 }
 
 /*
@@ -987,14 +958,10 @@ xfs_qm_dqdetach(
 }
 
 /*
- * This is called by VFS_SYNC and flags arg determines the caller,
- * and its motives, as done in xfs_sync.
- *
- * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
- * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
- * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
+ * This is called to sync quotas. We can be told to use non-blocking
+ * semantics by either the SYNC_BDFLUSH flag or the absence of the
+ * SYNC_WAIT flag.
  */
-
 int
 xfs_qm_sync(
        xfs_mount_t     *mp,
@@ -1137,7 +1104,6 @@ xfs_qm_init_quotainfo(
                return error;
        }
 
-       spin_lock_init(&qinf->qi_pinlock);
        xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
        qinf->qi_dqreclaims = 0;
 
@@ -1234,7 +1200,6 @@ xfs_qm_destroy_quotainfo(
         */
        xfs_qm_rele_quotafs_ref(mp);
 
-       spinlock_destroy(&qi->qi_pinlock);
        xfs_qm_list_destroy(&qi->qi_dqlist);
 
        if (qi->qi_uquotaip) {
index 44f25349e478bb7eb7db859da7c49c24c0cf75d1..ddf09166387c0e89c168329b2c15ea34ca52cefd 100644 (file)
@@ -106,7 +106,6 @@ typedef struct xfs_qm {
 typedef struct xfs_quotainfo {
        xfs_inode_t     *qi_uquotaip;    /* user quota inode */
        xfs_inode_t     *qi_gquotaip;    /* group quota inode */
-       spinlock_t       qi_pinlock;     /* dquot pinning lock */
        xfs_dqlist_t     qi_dqlist;      /* all dquots in filesys */
        int              qi_dqreclaims;  /* a change here indicates
                                            a removal in the dqlist */
@@ -168,7 +167,7 @@ extern void         xfs_qm_destroy_quotainfo(xfs_mount_t *);
 extern void            xfs_qm_mount_quotas(xfs_mount_t *);
 extern int             xfs_qm_quotacheck(xfs_mount_t *);
 extern void            xfs_qm_unmount_quotadestroy(xfs_mount_t *);
-extern int             xfs_qm_unmount_quotas(xfs_mount_t *);
+extern void            xfs_qm_unmount_quotas(xfs_mount_t *);
 extern int             xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
 extern int             xfs_qm_sync(xfs_mount_t *, int);
 
index eea2e60b456b6e962bd30872383e9ce07a0bfe31..bc6c5cca3e1251d141d7e80084cb6757e9d41214 100644 (file)
@@ -20,7 +20,6 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_clnt.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
@@ -51,7 +50,7 @@
 
 STATIC void
 xfs_fill_statvfs_from_dquot(
-       bhv_statvfs_t           *statp,
+       struct kstatfs          *statp,
        xfs_disk_dquot_t        *dp)
 {
        __uint64_t              limit;
@@ -88,7 +87,7 @@ xfs_fill_statvfs_from_dquot(
 STATIC void
 xfs_qm_statvfs(
        xfs_inode_t             *ip,
-       bhv_statvfs_t           *statp)
+       struct kstatfs          *statp)
 {
        xfs_mount_t             *mp = ip->i_mount;
        xfs_dquot_t             *dqp;
index 1a3b803dfa55fbd2ca671e589e60098f6f5a0194..68139b38aedef0f9e11806b3d69bab1537871a2e 100644 (file)
@@ -127,7 +127,7 @@ xfs_qm_quotactl(
                break;
 
        case Q_XQUOTASYNC:
-               return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL));
+               return xfs_sync_inodes(mp, SYNC_DELWRI);
 
        default:
                break;
@@ -1022,101 +1022,104 @@ xfs_qm_export_flags(
 
 
 /*
- * Go thru all the inodes in the file system, releasing their dquots.
- * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff. This also gets called from
- * xfs_rootumount.
+ * Release all the dquots on the inodes in an AG.
  */
-void
-xfs_qm_dqrele_all_inodes(
-       struct xfs_mount *mp,
-       uint             flags)
+STATIC void
+xfs_qm_dqrele_inodes_ag(
+       xfs_mount_t     *mp,
+       int             ag,
+       uint            flags)
 {
-       xfs_inode_t     *ip, *topino;
-       uint            ireclaims;
-       struct inode    *vp;
-       boolean_t       vnode_refd;
+       xfs_inode_t     *ip = NULL;
+       xfs_perag_t     *pag = &mp->m_perag[ag];
+       int             first_index = 0;
+       int             nr_found;
 
-       ASSERT(mp->m_quotainfo);
-
-       XFS_MOUNT_ILOCK(mp);
-again:
-       ip = mp->m_inodes;
-       if (ip == NULL) {
-               XFS_MOUNT_IUNLOCK(mp);
-               return;
-       }
        do {
-               /* Skip markers inserted by xfs_sync */
-               if (ip->i_mount == NULL) {
-                       ip = ip->i_mnext;
-                       continue;
+               /*
+                * use a gang lookup to find the next inode in the tree
+                * as the tree is sparse and a gang lookup walks to find
+                * the number of objects requested.
+                */
+               read_lock(&pag->pag_ici_lock);
+               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                               (void**)&ip, first_index, 1);
+
+               if (!nr_found) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
                }
-               /* Root inode, rbmip and rsumip have associated blocks */
+
+               /*
+                * Update the index for the next lookup. Catch overflows
+                * into the next AG range which can occur if we have inodes
+                * in the last block of the AG and we are currently
+                * pointing to the last inode.
+                */
+               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /* skip quota inodes */
                if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
                        ASSERT(ip->i_udquot == NULL);
                        ASSERT(ip->i_gdquot == NULL);
-                       ip = ip->i_mnext;
+                       read_unlock(&pag->pag_ici_lock);
                        continue;
                }
-               vp = VFS_I(ip);
-               if (!vp) {
-                       ASSERT(ip->i_udquot == NULL);
-                       ASSERT(ip->i_gdquot == NULL);
-                       ip = ip->i_mnext;
+
+               /*
+                * If we can't get a reference on the inode, it must be
+                * in reclaim. Leave it for the reclaim code to flush.
+                */
+               if (!igrab(VFS_I(ip))) {
+                       read_unlock(&pag->pag_ici_lock);
                        continue;
                }
-               vnode_refd = B_FALSE;
-               if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-                       ireclaims = mp->m_ireclaims;
-                       topino = mp->m_inodes;
-                       vp = vn_grab(vp);
-                       if (!vp)
-                               goto again;
-
-                       XFS_MOUNT_IUNLOCK(mp);
-                       /* XXX restart limit ? */
-                       xfs_ilock(ip, XFS_ILOCK_EXCL);
-                       vnode_refd = B_TRUE;
-               } else {
-                       ireclaims = mp->m_ireclaims;
-                       topino = mp->m_inodes;
-                       XFS_MOUNT_IUNLOCK(mp);
+               read_unlock(&pag->pag_ici_lock);
+
+               /* avoid new inodes though we shouldn't find any here */
+               if (xfs_iflags_test(ip, XFS_INEW)) {
+                       IRELE(ip);
+                       continue;
                }
 
-               /*
-                * We don't keep the mountlock across the dqrele() call,
-                * since it can take a while..
-                */
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
                if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
                        xfs_qm_dqrele(ip->i_udquot);
                        ip->i_udquot = NULL;
                }
-               if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+               if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
+                   ip->i_gdquot) {
                        xfs_qm_dqrele(ip->i_gdquot);
                        ip->i_gdquot = NULL;
                }
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               /*
-                * Wait until we've dropped the ilock and mountlock to
-                * do the vn_rele. Or be condemned to an eternity in the
-                * inactive code in hell.
-                */
-               if (vnode_refd)
-                       IRELE(ip);
-               XFS_MOUNT_ILOCK(mp);
-               /*
-                * If an inode was inserted or removed, we gotta
-                * start over again.
-                */
-               if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
-                       /* XXX use a sentinel */
-                       goto again;
-               }
-               ip = ip->i_mnext;
-       } while (ip != mp->m_inodes);
+               xfs_iput(ip, XFS_ILOCK_EXCL);
+
+       } while (nr_found);
+}
+
+/*
+ * Go thru all the inodes in the file system, releasing their dquots.
+ * Note that the mount structure gets modified to indicate that quotas are off
+ * AFTER this, in the case of quotaoff. This also gets called from
+ * xfs_rootumount.
+ */
+void
+xfs_qm_dqrele_all_inodes(
+       struct xfs_mount *mp,
+       uint             flags)
+{
+       int             i;
 
-       XFS_MOUNT_IUNLOCK(mp);
+       ASSERT(mp->m_quotainfo);
+       for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+               if (!mp->m_perag[i].pag_ici_init)
+                       continue;
+               xfs_qm_dqrele_inodes_ag(mp, i, flags);
+       }
 }
 
 /*------------------------------------------------------------------------*/
index c27abef7b84f9498821dfb9704e21b15bb86e781..ae5482965424d9ae4dddbed76bbff558bb86bb5d 100644 (file)
 #include <xfs.h>
 #include "debug.h"
 
+/* xfs_mount.h drags a lot of crap in, sorry.. */
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+
 static char            message[1024];  /* keep it off the stack */
 static DEFINE_SPINLOCK(xfs_err_lock);
 
@@ -55,22 +62,42 @@ cmn_err(register int level, char *fmt, ...)
 }
 
 void
-icmn_err(register int level, char *fmt, va_list ap)
+xfs_fs_vcmn_err(
+       int                     level,
+       struct xfs_mount        *mp,
+       char                    *fmt,
+       va_list                 ap)
 {
-       ulong   flags;
-       int     len;
+       unsigned long           flags;
+       int                     len = 0;
 
        level &= XFS_ERR_MASK;
-       if(level > XFS_MAX_ERR_LEVEL)
+       if (level > XFS_MAX_ERR_LEVEL)
                level = XFS_MAX_ERR_LEVEL;
+
        spin_lock_irqsave(&xfs_err_lock,flags);
-       len = vsnprintf(message, sizeof(message), fmt, ap);
+
+       if (mp) {
+               len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
+
+               /*
+                * Skip the printk if we can't print anything useful
+                * due to an over-long device name.
+                */
+               if (len >= sizeof(message))
+                       goto out;
+       }
+
+       len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
        if (len >= sizeof(message))
                len = sizeof(message) - 1;
        if (message[len-1] == '\n')
                message[len-1] = 0;
+
        printk("%s%s\n", err_level[level], message);
+ out:
        spin_unlock_irqrestore(&xfs_err_lock,flags);
+
        BUG_ON(level == CE_PANIC);
 }
 
@@ -84,5 +111,5 @@ assfail(char *expr, char *file, int line)
 void
 xfs_hex_dump(void *p, int length)
 {
-       print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
+       print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
 }
index 75845f950814ca8beac0bf2941eea886d8446271..6f4fd37c67af41e684d8371353bb2915a6f62294 100644 (file)
@@ -27,8 +27,6 @@
 #define CE_ALERT        1               /* alert        */
 #define CE_PANIC        0               /* panic        */
 
-extern void icmn_err(int, char *, va_list)
-       __attribute__ ((format (printf, 2, 0)));
 extern void cmn_err(int, char *, ...)
        __attribute__ ((format (printf, 2, 3)));
 extern void assfail(char *expr, char *f, int l);
index a34ef05489b1c323369c7b22cf40400febaa28dd..2d494c26717f47e14aa1fb90aeb65d75ec5f6abc 100644 (file)
@@ -113,21 +113,16 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 void
 ktrace_free(ktrace_t *ktp)
 {
-       int     entries_size;
-
        if (ktp == (ktrace_t *)NULL)
                return;
 
        /*
         * Special treatment for the Vnode trace buffer.
         */
-       if (ktp->kt_nentries == ktrace_zentries) {
+       if (ktp->kt_nentries == ktrace_zentries)
                kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
-       } else {
-               entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
-
+       else
                kmem_free(ktp->kt_entries);
-       }
 
        kmem_zone_free(ktrace_hdr_zone, ktp);
 }
index 540e4c98982512a365eeccc4bd7f5d16b1d9db63..17254b529c54de506e8969826e1f905051763b80 100644 (file)
@@ -30,7 +30,7 @@
 #define XFS_ATTR_TRACE 1
 #define XFS_BLI_TRACE 1
 #define XFS_BMAP_TRACE 1
-#define XFS_BMBT_TRACE 1
+#define XFS_BTREE_TRACE 1
 #define XFS_DIR2_TRACE 1
 #define XFS_DQUOT_TRACE 1
 #define XFS_ILOCK_TRACE 1
index 91d69338d3b214c9ff8339217cf4834c842d6045..a8cdd73999a405080cf5748d8814306cf333170e 100644 (file)
@@ -758,7 +758,7 @@ xfs_acl_setmode(
        if (gap && nomask)
                iattr.ia_mode |= gap->ae_perm << 3;
 
-       return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred);
+       return xfs_setattr(XFS_I(vp), &iattr, 0);
 }
 
 /*
index 61b292a9fb414f2486ce92a1175b8a11b2cc6e7d..f2e21817a226dac5d12fcdf3d82396cd7e5ee26b 100644 (file)
@@ -91,6 +91,8 @@ typedef struct xfs_agf {
 #define        XFS_AGF_BLOCK(mp)       XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
 #define        XFS_BUF_TO_AGF(bp)      ((xfs_agf_t *)XFS_BUF_PTR(bp))
 
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+                       xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 
 /*
  * Size of the unlinked inode hash table in the agi.
@@ -142,6 +144,9 @@ typedef struct xfs_agi {
 #define        XFS_AGI_BLOCK(mp)       XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
 #define        XFS_BUF_TO_AGI(bp)      ((xfs_agi_t *)XFS_BUF_PTR(bp))
 
+extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+                               xfs_agnumber_t agno, struct xfs_buf **bpp);
+
 /*
  * The third a.g. block contains the a.g. freelist, an array
  * of block pointers to blocks owned by the allocation btree code.
@@ -192,17 +197,23 @@ typedef struct xfs_perag
        xfs_agino_t     pagi_freecount; /* number of free inodes */
        xfs_agino_t     pagi_count;     /* number of allocated inodes */
        int             pagb_count;     /* pagb slots in use */
+       xfs_perag_busy_t *pagb_list;    /* unstable blocks */
 #ifdef __KERNEL__
        spinlock_t      pagb_lock;      /* lock for pagb_list */
-#endif
-       xfs_perag_busy_t *pagb_list;    /* unstable blocks */
+
        atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
 
        int             pag_ici_init;   /* incore inode cache initialised */
        rwlock_t        pag_ici_lock;   /* incore inode lock */
        struct radix_tree_root pag_ici_root;    /* incore inode cache root */
+#endif
 } xfs_perag_t;
 
+/*
+ * tags for inode radix tree
+ */
+#define XFS_ICI_RECLAIM_TAG    0       /* inode is to be reclaimed */
+
 #define        XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
 #define        XFS_MIN_FREELIST_RAW(bl,cl,mp)  \
        (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
index 1956f83489f1b8378f4459844e422547d421e0ec..028e44e58ea986946f2e4615f2b2c0088a6845c8 100644 (file)
@@ -89,6 +89,92 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
  * Internal functions.
  */
 
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_alloc_lookup_eq(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_alloc_lookup_ge(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_alloc_lookup_le(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int                             /* error */
+xfs_alloc_update(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len)    /* length of extent */
+{
+       union xfs_btree_rec     rec;
+
+       rec.alloc.ar_startblock = cpu_to_be32(bno);
+       rec.alloc.ar_blockcount = cpu_to_be32(len);
+       return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+STATIC int                             /* error */
+xfs_alloc_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           *bno,   /* output: starting block of extent */
+       xfs_extlen_t            *len,   /* output: length of extent */
+       int                     *stat)  /* output: success/failure */
+{
+       union xfs_btree_rec     *rec;
+       int                     error;
+
+       error = xfs_btree_get_rec(cur, &rec, stat);
+       if (!error && *stat == 1) {
+               *bno = be32_to_cpu(rec->alloc.ar_startblock);
+               *len = be32_to_cpu(rec->alloc.ar_blockcount);
+       }
+       return error;
+}
+
 /*
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
@@ -294,21 +380,20 @@ xfs_alloc_fixup_trees(
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        }
+
 #ifdef DEBUG
-       {
-               xfs_alloc_block_t       *bnoblock;
-               xfs_alloc_block_t       *cntblock;
-
-               if (bno_cur->bc_nlevels == 1 &&
-                   cnt_cur->bc_nlevels == 1) {
-                       bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
-                       cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
-                       XFS_WANT_CORRUPTED_RETURN(
-                               be16_to_cpu(bnoblock->bb_numrecs) ==
-                               be16_to_cpu(cntblock->bb_numrecs));
-               }
+       if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+               struct xfs_btree_block  *bnoblock;
+               struct xfs_btree_block  *cntblock;
+
+               bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+               cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+               XFS_WANT_CORRUPTED_RETURN(
+                       bnoblock->bb_numrecs == cntblock->bb_numrecs);
        }
 #endif
+
        /*
         * Deal with all four cases: the allocated record is contained
         * within the freespace record, so we can have new freespace
@@ -333,7 +418,7 @@ xfs_alloc_fixup_trees(
        /*
         * Delete the entry from the by-size btree.
         */
-       if ((error = xfs_alloc_delete(cnt_cur, &i)))
+       if ((error = xfs_btree_delete(cnt_cur, &i)))
                return error;
        XFS_WANT_CORRUPTED_RETURN(i == 1);
        /*
@@ -343,7 +428,7 @@ xfs_alloc_fixup_trees(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_alloc_insert(cnt_cur, &i)))
+               if ((error = xfs_btree_insert(cnt_cur, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        }
@@ -351,7 +436,7 @@ xfs_alloc_fixup_trees(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_alloc_insert(cnt_cur, &i)))
+               if ((error = xfs_btree_insert(cnt_cur, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        }
@@ -362,7 +447,7 @@ xfs_alloc_fixup_trees(
                /*
                 * No remaining freespace, just delete the by-block tree entry.
                 */
-               if ((error = xfs_alloc_delete(bno_cur, &i)))
+               if ((error = xfs_btree_delete(bno_cur, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        } else {
@@ -379,7 +464,7 @@ xfs_alloc_fixup_trees(
                if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_alloc_insert(bno_cur, &i)))
+               if ((error = xfs_btree_insert(bno_cur, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        }
@@ -640,8 +725,8 @@ xfs_alloc_ag_vextent_exact(
        /*
         * Allocate/initialize a cursor for the by-number freespace btree.
         */
-       bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO, NULL, 0);
+       bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_BNO);
        /*
         * Lookup bno and minlen in the btree (minlen is irrelevant, really).
         * Look for the closest free block <= bno, it must contain bno
@@ -696,8 +781,8 @@ xfs_alloc_ag_vextent_exact(
         * We are allocating agbno for rlen [agbno .. end]
         * Allocate/initialize a cursor for the by-size btree.
         */
-       cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT, NULL, 0);
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
        ASSERT(args->agbno + args->len <=
                be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
@@ -759,8 +844,8 @@ xfs_alloc_ag_vextent_near(
        /*
         * Get a cursor for the by-size btree.
         */
-       cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT, NULL, 0);
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
        ltlen = 0;
        bno_cur_lt = bno_cur_gt = NULL;
        /*
@@ -818,7 +903,7 @@ xfs_alloc_ag_vextent_near(
                                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                                if (ltlen >= args->minlen)
                                        break;
-                               if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
                                        goto error0;
                        } while (i);
                        ASSERT(ltlen >= args->minlen);
@@ -828,7 +913,7 @@ xfs_alloc_ag_vextent_near(
                i = cnt_cur->bc_ptrs[0];
                for (j = 1, blen = 0, bdiff = 0;
                     !error && j && (blen < args->maxlen || bdiff > 0);
-                    error = xfs_alloc_increment(cnt_cur, 0, &j)) {
+                    error = xfs_btree_increment(cnt_cur, 0, &j)) {
                        /*
                         * For each entry, decide if it's better than
                         * the previous best entry.
@@ -886,8 +971,8 @@ xfs_alloc_ag_vextent_near(
                /*
                 * Set up a cursor for the by-bno tree.
                 */
-               bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
-                       args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0);
+               bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+                       args->agbp, args->agno, XFS_BTNUM_BNO);
                /*
                 * Fix up the btree entries.
                 */
@@ -914,8 +999,8 @@ xfs_alloc_ag_vextent_near(
        /*
         * Allocate and initialize the cursor for the leftward search.
         */
-       bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO, NULL, 0);
+       bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_BNO);
        /*
         * Lookup <= bno to find the leftward search's starting point.
         */
@@ -938,7 +1023,7 @@ xfs_alloc_ag_vextent_near(
         * Increment the cursor, so we will point at the entry just right
         * of the leftward entry if any, or to the leftmost entry.
         */
-       if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+       if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
                goto error0;
        if (!i) {
                /*
@@ -961,7 +1046,7 @@ xfs_alloc_ag_vextent_near(
                                        args->minlen, &ltbnoa, &ltlena);
                        if (ltlena >= args->minlen)
                                break;
-                       if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
+                       if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
                                goto error0;
                        if (!i) {
                                xfs_btree_del_cursor(bno_cur_lt,
@@ -977,7 +1062,7 @@ xfs_alloc_ag_vextent_near(
                                        args->minlen, &gtbnoa, &gtlena);
                        if (gtlena >= args->minlen)
                                break;
-                       if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+                       if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
                                goto error0;
                        if (!i) {
                                xfs_btree_del_cursor(bno_cur_gt,
@@ -1066,7 +1151,7 @@ xfs_alloc_ag_vextent_near(
                                        /*
                                         * Fell off the right end.
                                         */
-                                       if ((error = xfs_alloc_increment(
+                                       if ((error = xfs_btree_increment(
                                                        bno_cur_gt, 0, &i)))
                                                goto error0;
                                        if (!i) {
@@ -1162,7 +1247,7 @@ xfs_alloc_ag_vextent_near(
                                        /*
                                         * Fell off the left end.
                                         */
-                                       if ((error = xfs_alloc_decrement(
+                                       if ((error = xfs_btree_decrement(
                                                        bno_cur_lt, 0, &i)))
                                                goto error0;
                                        if (!i) {
@@ -1267,8 +1352,8 @@ xfs_alloc_ag_vextent_size(
        /*
         * Allocate and initialize a cursor for the by-size btree.
         */
-       cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT, NULL, 0);
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
        bno_cur = NULL;
        /*
         * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1321,7 +1406,7 @@ xfs_alloc_ag_vextent_size(
                bestflen = flen;
                bestfbno = fbno;
                for (;;) {
-                       if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
                                goto error0;
                        if (i == 0)
                                break;
@@ -1372,8 +1457,8 @@ xfs_alloc_ag_vextent_size(
        /*
         * Allocate and initialize a cursor for the by-block tree.
         */
-       bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO, NULL, 0);
+       bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_BNO);
        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
                        rbno, rlen, XFSA_FIXUP_CNT_OK)))
                goto error0;
@@ -1416,7 +1501,7 @@ xfs_alloc_ag_vextent_small(
        xfs_extlen_t    flen;
        int             i;
 
-       if ((error = xfs_alloc_decrement(ccur, 0, &i)))
+       if ((error = xfs_btree_decrement(ccur, 0, &i)))
                goto error0;
        if (i) {
                if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1515,8 +1600,7 @@ xfs_free_ag_extent(
        /*
         * Allocate and initialize a cursor for the by-block btree.
         */
-       bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL,
-               0);
+       bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
        cnt_cur = NULL;
        /*
         * Look for a neighboring block on the left (lower block numbers)
@@ -1549,7 +1633,7 @@ xfs_free_ag_extent(
         * Look for a neighboring block on the right (higher block numbers)
         * that is contiguous with this space.
         */
-       if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
+       if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
                goto error0;
        if (haveright) {
                /*
@@ -1575,8 +1659,7 @@ xfs_free_ag_extent(
        /*
         * Now allocate and initialize a cursor for the by-size tree.
         */
-       cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL,
-               0);
+       cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
        /*
         * Have both left and right contiguous neighbors.
         * Merge all three into a single free block.
@@ -1588,7 +1671,7 @@ xfs_free_ag_extent(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_delete(cnt_cur, &i)))
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
@@ -1597,19 +1680,19 @@ xfs_free_ag_extent(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_delete(cnt_cur, &i)))
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
                 * Delete the old by-block entry for the right block.
                 */
-               if ((error = xfs_alloc_delete(bno_cur, &i)))
+               if ((error = xfs_btree_delete(bno_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
                 * Move the by-block cursor back to the left neighbor.
                 */
-               if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+               if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 #ifdef DEBUG
@@ -1648,14 +1731,14 @@ xfs_free_ag_extent(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_delete(cnt_cur, &i)))
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
                 * Back up the by-block cursor to the left neighbor, and
                 * update its length.
                 */
-               if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+               if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                nbno = ltbno;
@@ -1674,7 +1757,7 @@ xfs_free_ag_extent(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_delete(cnt_cur, &i)))
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
@@ -1693,7 +1776,7 @@ xfs_free_ag_extent(
        else {
                nbno = bno;
                nlen = len;
-               if ((error = xfs_alloc_insert(bno_cur, &i)))
+               if ((error = xfs_btree_insert(bno_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        }
@@ -1705,7 +1788,7 @@ xfs_free_ag_extent(
        if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
-       if ((error = xfs_alloc_insert(cnt_cur, &i)))
+       if ((error = xfs_btree_insert(cnt_cur, &i)))
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -2150,51 +2233,83 @@ xfs_alloc_put_freelist(
  * Read in the allocation group header (free/alloc section).
  */
 int                                    /* error */
-xfs_alloc_read_agf(
-       xfs_mount_t     *mp,            /* mount point structure */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_agnumber_t  agno,           /* allocation group number */
-       int             flags,          /* XFS_ALLOC_FLAG_... */
-       xfs_buf_t       **bpp)          /* buffer for the ag freelist header */
+xfs_read_agf(
+       struct xfs_mount        *mp,    /* mount point structure */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       int                     flags,  /* XFS_BUF_ */
+       struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
 {
-       xfs_agf_t       *agf;           /* ag freelist header */
+       struct xfs_agf  *agf;           /* ag freelist header */
        int             agf_ok;         /* set if agf is consistent */
-       xfs_buf_t       *bp;            /* return value */
-       xfs_perag_t     *pag;           /* per allocation group data */
        int             error;
 
        ASSERT(agno != NULLAGNUMBER);
        error = xfs_trans_read_buf(
                        mp, tp, mp->m_ddev_targp,
                        XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-                       XFS_FSS_TO_BB(mp, 1),
-                       (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
-                       &bp);
+                       XFS_FSS_TO_BB(mp, 1), flags, bpp);
        if (error)
                return error;
-       ASSERT(!bp || !XFS_BUF_GETERROR(bp));
-       if (!bp) {
-               *bpp = NULL;
+       if (!*bpp)
                return 0;
-       }
+
+       ASSERT(!XFS_BUF_GETERROR(*bpp));
+       agf = XFS_BUF_TO_AGF(*bpp);
+
        /*
         * Validate the magic number of the agf block.
         */
-       agf = XFS_BUF_TO_AGF(bp);
        agf_ok =
                be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
                XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
                be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
                be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
                be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
-               be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+               be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
+               be32_to_cpu(agf->agf_seqno) == agno;
+       if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+               agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+                                               be32_to_cpu(agf->agf_length);
        if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
                        XFS_RANDOM_ALLOC_READ_AGF))) {
                XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
                                     XFS_ERRLEVEL_LOW, mp, agf);
-               xfs_trans_brelse(tp, bp);
+               xfs_trans_brelse(tp, *bpp);
                return XFS_ERROR(EFSCORRUPTED);
        }
+
+       XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
+       return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int                                    /* error */
+xfs_alloc_read_agf(
+       struct xfs_mount        *mp,    /* mount point structure */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       int                     flags,  /* XFS_ALLOC_FLAG_... */
+       struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
+{
+       struct xfs_agf          *agf;           /* ag freelist header */
+       struct xfs_perag        *pag;           /* per allocation group data */
+       int                     error;
+
+       ASSERT(agno != NULLAGNUMBER);
+
+       error = xfs_read_agf(mp, tp, agno,
+                       (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
+                       bpp);
+       if (error)
+               return error;
+       if (!*bpp)
+               return 0;
+       ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+       agf = XFS_BUF_TO_AGF(*bpp);
        pag = &mp->m_perag[agno];
        if (!pag->pagf_init) {
                pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
@@ -2213,6 +2328,7 @@ xfs_alloc_read_agf(
 #ifdef DEBUG
        else if (!XFS_FORCED_SHUTDOWN(mp)) {
                ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+               ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
                ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
                ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
                ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
@@ -2221,8 +2337,6 @@ xfs_alloc_read_agf(
                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
        }
 #endif
-       XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
-       *bpp = bp;
        return 0;
 }
 
index 5aec15d0651e836191d794edecddebc797cdf32c..588172796f7b92c61f696591bb5241f1d8f42ccc 100644 (file)
@@ -121,6 +121,19 @@ extern ktrace_t *xfs_alloc_trace_buf;
 #define        XFS_ALLOC_KTRACE_BUSYSEARCH     6
 #endif
 
+void
+xfs_alloc_mark_busy(xfs_trans_t *tp,
+               xfs_agnumber_t agno,
+               xfs_agblock_t bno,
+               xfs_extlen_t len);
+
+void
+xfs_alloc_clear_busy(xfs_trans_t *tp,
+               xfs_agnumber_t ag,
+               int idx);
+
+#endif /* __KERNEL__ */
+
 /*
  * Compute and fill in value of m_ag_maxlevels.
  */
@@ -196,18 +209,4 @@ xfs_free_extent(
        xfs_fsblock_t   bno,    /* starting block number of extent */
        xfs_extlen_t    len);   /* length of extent */
 
-void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
-               xfs_agnumber_t agno,
-               xfs_agblock_t bno,
-               xfs_extlen_t len);
-
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-               xfs_agnumber_t ag,
-               int idx);
-
-
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_ALLOC_H__ */
index 3ce2645508ae70ea826cc3b77cb2aeeaa1830785..733cb75a8c5d956b96743f4b67c6a654850a366a 100644 (file)
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 
-/*
- * Prototypes for internal functions.
- */
-
-STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
-               xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
-
-/*
- * Internal functions.
- */
-
-/*
- * Single level of the xfs_alloc_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int                             /* error */
-xfs_alloc_delrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level removing record from */
-       int                     *stat)  /* fail/done/go-on */
-{
-       xfs_agf_t               *agf;   /* allocation group freelist header */
-       xfs_alloc_block_t       *block; /* btree block record/key lives in */
-       xfs_agblock_t           bno;    /* btree block number */
-       xfs_buf_t               *bp;    /* buffer for block */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_alloc_key_t         key;    /* kp points here if block is level 0 */
-       xfs_agblock_t           lbno;   /* left block's block number */
-       xfs_buf_t               *lbp;   /* left block's buffer pointer */
-       xfs_alloc_block_t       *left;  /* left btree block */
-       xfs_alloc_key_t         *lkp=NULL;      /* left block key pointer */
-       xfs_alloc_ptr_t         *lpp=NULL;      /* left block address pointer */
-       int                     lrecs=0;        /* number of records in left block */
-       xfs_alloc_rec_t         *lrp;   /* left block record pointer */
-       xfs_mount_t             *mp;    /* mount structure */
-       int                     ptr;    /* index in btree block for this rec */
-       xfs_agblock_t           rbno;   /* right block's block number */
-       xfs_buf_t               *rbp;   /* right block's buffer pointer */
-       xfs_alloc_block_t       *right; /* right btree block */
-       xfs_alloc_key_t         *rkp;   /* right block key pointer */
-       xfs_alloc_ptr_t         *rpp;   /* right block address pointer */
-       int                     rrecs=0;        /* number of records in right block */
-       int                     numrecs;
-       xfs_alloc_rec_t         *rrp;   /* right block record pointer */
-       xfs_btree_cur_t         *tcur;  /* temporary btree cursor */
-
-       /*
-        * Get the index of the entry being deleted, check for nothing there.
-        */
-       ptr = cur->bc_ptrs[level];
-       if (ptr == 0) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Get the buffer & block containing the record or key/ptr.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-#endif
-       /*
-        * Fail if we're off the end of the block.
-        */
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (ptr > numrecs) {
-               *stat = 0;
-               return 0;
-       }
-       XFS_STATS_INC(xs_abt_delrec);
-       /*
-        * It's a nonleaf.  Excise the key and ptr being deleted, by
-        * sliding the entries past them down one.
-        * Log the changed areas of the block.
-        */
-       if (level > 0) {
-               lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-               lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = ptr; i < numrecs; i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
-                               return error;
-               }
-#endif
-               if (ptr < numrecs) {
-                       memmove(&lkp[ptr - 1], &lkp[ptr],
-                               (numrecs - ptr) * sizeof(*lkp));
-                       memmove(&lpp[ptr - 1], &lpp[ptr],
-                               (numrecs - ptr) * sizeof(*lpp));
-                       xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
-                       xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
-               }
-       }
-       /*
-        * It's a leaf.  Excise the record being deleted, by sliding the
-        * entries past it down one.  Log the changed areas of the block.
-        */
-       else {
-               lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-               if (ptr < numrecs) {
-                       memmove(&lrp[ptr - 1], &lrp[ptr],
-                               (numrecs - ptr) * sizeof(*lrp));
-                       xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
-               }
-               /*
-                * If it's the first record in the block, we'll need a key
-                * structure to pass up to the next level (updkey).
-                */
-               if (ptr == 1) {
-                       key.ar_startblock = lrp->ar_startblock;
-                       key.ar_blockcount = lrp->ar_blockcount;
-                       lkp = &key;
-               }
-       }
-       /*
-        * Decrement and log the number of entries in the block.
-        */
-       numrecs--;
-       block->bb_numrecs = cpu_to_be16(numrecs);
-       xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-       /*
-        * See if the longest free extent in the allocation group was
-        * changed by this operation.  True if it's the by-size btree, and
-        * this is the leaf level, and there is no right sibling block,
-        * and this was the last record.
-        */
-       agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-       mp = cur->bc_mp;
-
-       if (level == 0 &&
-           cur->bc_btnum == XFS_BTNUM_CNT &&
-           be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-           ptr > numrecs) {
-               ASSERT(ptr == numrecs + 1);
-               /*
-                * There are still records in the block.  Grab the size
-                * from the last one.
-                */
-               if (numrecs) {
-                       rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
-                       agf->agf_longest = rrp->ar_blockcount;
-               }
-               /*
-                * No free extents left.
-                */
-               else
-                       agf->agf_longest = 0;
-               mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
-                       be32_to_cpu(agf->agf_longest);
-               xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                       XFS_AGF_LONGEST);
-       }
-       /*
-        * Is this the root level?  If so, we're almost done.
-        */
-       if (level == cur->bc_nlevels - 1) {
-               /*
-                * If this is the root level,
-                * and there's only one entry left,
-                * and it's NOT the leaf level,
-                * then we can get rid of this level.
-                */
-               if (numrecs == 1 && level > 0) {
-                       /*
-                        * lpp is still set to the first pointer in the block.
-                        * Make it the new root of the btree.
-                        */
-                       bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
-                       agf->agf_roots[cur->bc_btnum] = *lpp;
-                       be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1);
-                       mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
-                       /*
-                        * Put this buffer/block on the ag's freelist.
-                        */
-                       error = xfs_alloc_put_freelist(cur->bc_tp,
-                                       cur->bc_private.a.agbp, NULL, bno, 1);
-                       if (error)
-                               return error;
-                       /*
-                        * Since blocks move to the free list without the
-                        * coordination used in xfs_bmap_finish, we can't allow
-                        * block to be available for reallocation and
-                        * non-transaction writing (user data) until we know
-                        * that the transaction that moved it to the free list
-                        * is permanently on disk. We track the blocks by
-                        * declaring these blocks as "busy"; the busy list is
-                        * maintained on a per-ag basis and each transaction
-                        * records which entries should be removed when the
-                        * iclog commits to disk. If a busy block is
-                        * allocated, the iclog is pushed up to the LSN
-                        * that freed the block.
-                        */
-                       xfs_alloc_mark_busy(cur->bc_tp,
-                               be32_to_cpu(agf->agf_seqno), bno, 1);
-
-                       xfs_trans_agbtree_delta(cur->bc_tp, -1);
-                       xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                               XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-                       /*
-                        * Update the cursor so there's one fewer level.
-                        */
-                       xfs_btree_setbuf(cur, level, NULL);
-                       cur->bc_nlevels--;
-               } else if (level > 0 &&
-                          (error = xfs_alloc_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * If we deleted the leftmost entry in the block, update the
-        * key values above us in the tree.
-        */
-       if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
-               return error;
-       /*
-        * If the number of records remaining in the block is at least
-        * the minimum, we're done.
-        */
-       if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-               if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * Otherwise, we have to move some records around to keep the
-        * tree balanced.  Look at the left and right sibling blocks to
-        * see if we can re-balance by moving only one record.
-        */
-       rbno = be32_to_cpu(block->bb_rightsib);
-       lbno = be32_to_cpu(block->bb_leftsib);
-       bno = NULLAGBLOCK;
-       ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
-       /*
-        * Duplicate the cursor so our btree manipulations here won't
-        * disrupt the next level up.
-        */
-       if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-               return error;
-       /*
-        * If there's a right sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (rbno != NULLAGBLOCK) {
-               /*
-                * Move the temp cursor to the last entry in the next block.
-                * Actually any entry but the first would suffice.
-                */
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_increment(tcur, level, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Grab a pointer to the block.
-                */
-               rbp = tcur->bc_bufs[level];
-               right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-                       goto error0;
-#endif
-               /*
-                * Grab the current block number, for future use.
-                */
-               bno = be32_to_cpu(right->bb_leftsib);
-               /*
-                * If right block is full enough so that removing one entry
-                * won't make it too empty, and left-shifting an entry out
-                * of right to us works, we're done.
-                */
-               if (be16_to_cpu(right->bb_numrecs) - 1 >=
-                    XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-                       if ((error = xfs_alloc_lshift(tcur, level, &i)))
-                               goto error0;
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_ALLOC_BLOCK_MINRECS(level, cur));
-                               xfs_btree_del_cursor(tcur,
-                                                    XFS_BTREE_NOERROR);
-                               if (level > 0 &&
-                                   (error = xfs_alloc_decrement(cur, level,
-                                           &i)))
-                                       return error;
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference, and fix up the temp cursor to point
-                * to our block again (last record).
-                */
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if (lbno != NULLAGBLOCK) {
-                       i = xfs_btree_firstrec(tcur, level);
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                       if ((error = xfs_alloc_decrement(tcur, level, &i)))
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               }
-       }
-       /*
-        * If there's a left sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (lbno != NULLAGBLOCK) {
-               /*
-                * Move the temp cursor to the first entry in the
-                * previous block.
-                */
-               i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_decrement(tcur, level, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               xfs_btree_firstrec(tcur, level);
-               /*
-                * Grab a pointer to the block.
-                */
-               lbp = tcur->bc_bufs[level];
-               left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-                       goto error0;
-#endif
-               /*
-                * Grab the current block number, for future use.
-                */
-               bno = be32_to_cpu(left->bb_rightsib);
-               /*
-                * If left block is full enough so that removing one entry
-                * won't make it too empty, and right-shifting an entry out
-                * of left to us works, we're done.
-                */
-               if (be16_to_cpu(left->bb_numrecs) - 1 >=
-                    XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-                       if ((error = xfs_alloc_rshift(tcur, level, &i)))
-                               goto error0;
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_ALLOC_BLOCK_MINRECS(level, cur));
-                               xfs_btree_del_cursor(tcur,
-                                                    XFS_BTREE_NOERROR);
-                               if (level == 0)
-                                       cur->bc_ptrs[0]++;
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference.
-                */
-               lrecs = be16_to_cpu(left->bb_numrecs);
-       }
-       /*
-        * Delete the temp cursor, we're done with it.
-        */
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       /*
-        * If here, we need to do a join to keep the tree balanced.
-        */
-       ASSERT(bno != NULLAGBLOCK);
-       /*
-        * See if we can join with the left neighbor block.
-        */
-       if (lbno != NULLAGBLOCK &&
-           lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * Set "right" to be the starting block,
-                * "left" to be the left neighbor.
-                */
-               rbno = bno;
-               right = block;
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               rbp = bp;
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, lbno, 0, &lbp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-               lrecs = be16_to_cpu(left->bb_numrecs);
-               if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-                       return error;
-       }
-       /*
-        * If that won't work, see if we can join with the right neighbor block.
-        */
-       else if (rbno != NULLAGBLOCK &&
-                rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * Set "left" to be the starting block,
-                * "right" to be the right neighbor.
-                */
-               lbno = bno;
-               left = block;
-               lrecs = be16_to_cpu(left->bb_numrecs);
-               lbp = bp;
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, rbno, 0, &rbp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-                       return error;
-       }
-       /*
-        * Otherwise, we can't fix the imbalance.
-        * Just return.  This is probably a logic error, but it's not fatal.
-        */
-       else {
-               if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * We're now going to join "left" and "right" by moving all the stuff
-        * in "right" to "left" and deleting "right".
-        */
-       if (level > 0) {
-               /*
-                * It's a non-leaf.  Move keys and pointers.
-                */
-               lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
-               lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
-               rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < rrecs; i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-                               return error;
-               }
-#endif
-               memcpy(lkp, rkp, rrecs * sizeof(*lkp));
-               memcpy(lpp, rpp, rrecs * sizeof(*lpp));
-               xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
-               xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       } else {
-               /*
-                * It's a leaf.  Move records.
-                */
-               lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
-               rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-               memcpy(lrp, rrp, rrecs * sizeof(*lrp));
-               xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       }
-       /*
-        * If we joined with the left neighbor, set the buffer in the
-        * cursor to the left block, and fix up the index.
-        */
-       if (bp != lbp) {
-               xfs_btree_setbuf(cur, level, lbp);
-               cur->bc_ptrs[level] += lrecs;
-       }
-       /*
-        * If we joined with the right neighbor and there's a level above
-        * us, increment the cursor at that level.
-        */
-       else if (level + 1 < cur->bc_nlevels &&
-                (error = xfs_alloc_increment(cur, level + 1, &i)))
-               return error;
-       /*
-        * Fix up the number of records in the surviving block.
-        */
-       lrecs += rrecs;
-       left->bb_numrecs = cpu_to_be16(lrecs);
-       /*
-        * Fix up the right block pointer in the surviving block, and log it.
-        */
-       left->bb_rightsib = right->bb_rightsib;
-       xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       /*
-        * If there is a right sibling now, make it point to the
-        * remaining block.
-        */
-       if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
-               xfs_alloc_block_t       *rrblock;
-               xfs_buf_t               *rrbp;
-
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
-                               &rrbp, XFS_ALLOC_BTREE_REF)))
-                       return error;
-               rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
-               if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-                       return error;
-               rrblock->bb_leftsib = cpu_to_be32(lbno);
-               xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * Free the deleting block by putting it on the freelist.
-        */
-       error = xfs_alloc_put_freelist(cur->bc_tp,
-                                        cur->bc_private.a.agbp, NULL, rbno, 1);
-       if (error)
-               return error;
-       /*
-        * Since blocks move to the free list without the coordination
-        * used in xfs_bmap_finish, we can't allow block to be available
-        * for reallocation and non-transaction writing (user data)
-        * until we know that the transaction that moved it to the free
-        * list is permanently on disk. We track the blocks by declaring
-        * these blocks as "busy"; the busy list is maintained on a
-        * per-ag basis and each transaction records which entries
-        * should be removed when the iclog commits to disk. If a
-        * busy block is allocated, the iclog is pushed up to the
-        * LSN that freed the block.
-        */
-       xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
-       xfs_trans_agbtree_delta(cur->bc_tp, -1);
-
-       /*
-        * Adjust the current level's cursor so that we're left referring
-        * to the right node, after we're done.
-        * If this leaves the ptr value 0 our caller will fix it up.
-        */
-       if (level > 0)
-               cur->bc_ptrs[level]--;
-       /*
-        * Return value means the next level up has something to do.
-        */
-       *stat = 2;
-       return 0;
-
-error0:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int                             /* error */
-xfs_alloc_insrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to insert record at */
-       xfs_agblock_t           *bnop,  /* i/o: block number inserted */
-       xfs_alloc_rec_t         *recp,  /* i/o: record data inserted */
-       xfs_btree_cur_t         **curp, /* output: new cursor replacing cur */
-       int                     *stat)  /* output: success/failure */
-{
-       xfs_agf_t               *agf;   /* allocation group freelist header */
-       xfs_alloc_block_t       *block; /* btree block record/key lives in */
-       xfs_buf_t               *bp;    /* buffer for block */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_alloc_key_t         key;    /* key value being inserted */
-       xfs_alloc_key_t         *kp;    /* pointer to btree keys */
-       xfs_agblock_t           nbno;   /* block number of allocated block */
-       xfs_btree_cur_t         *ncur;  /* new cursor to be used at next lvl */
-       xfs_alloc_key_t         nkey;   /* new key value, from split */
-       xfs_alloc_rec_t         nrec;   /* new record value, for caller */
-       int                     numrecs;
-       int                     optr;   /* old ptr value */
-       xfs_alloc_ptr_t         *pp;    /* pointer to btree addresses */
-       int                     ptr;    /* index in btree block for this rec */
-       xfs_alloc_rec_t         *rp;    /* pointer to btree records */
-
-       ASSERT(be32_to_cpu(recp->ar_blockcount) > 0);
-
-       /*
-        * GCC doesn't understand the (arguably complex) control flow in
-        * this function and complains about uninitialized structure fields
-        * without this.
-        */
-       memset(&nrec, 0, sizeof(nrec));
-
-       /*
-        * If we made it to the root level, allocate a new root block
-        * and we're done.
-        */
-       if (level >= cur->bc_nlevels) {
-               XFS_STATS_INC(xs_abt_insrec);
-               if ((error = xfs_alloc_newroot(cur, &i)))
-                       return error;
-               *bnop = NULLAGBLOCK;
-               *stat = i;
-               return 0;
-       }
-       /*
-        * Make a key out of the record data to be inserted, and save it.
-        */
-       key.ar_startblock = recp->ar_startblock;
-       key.ar_blockcount = recp->ar_blockcount;
-       optr = ptr = cur->bc_ptrs[level];
-       /*
-        * If we're off the left edge, return failure.
-        */
-       if (ptr == 0) {
-               *stat = 0;
-               return 0;
-       }
-       XFS_STATS_INC(xs_abt_insrec);
-       /*
-        * Get pointers to the btree buffer and block.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-       numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-       /*
-        * Check that the new entry is being inserted in the right place.
-        */
-       if (ptr <= numrecs) {
-               if (level == 0) {
-                       rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-                       xfs_btree_check_rec(cur->bc_btnum, recp, rp);
-               } else {
-                       kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
-                       xfs_btree_check_key(cur->bc_btnum, &key, kp);
-               }
-       }
-#endif
-       nbno = NULLAGBLOCK;
-       ncur = NULL;
-       /*
-        * If the block is full, we can't insert the new entry until we
-        * make the block un-full.
-        */
-       if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * First, try shifting an entry to the right neighbor.
-                */
-               if ((error = xfs_alloc_rshift(cur, level, &i)))
-                       return error;
-               if (i) {
-                       /* nothing */
-               }
-               /*
-                * Next, try shifting an entry to the left neighbor.
-                */
-               else {
-                       if ((error = xfs_alloc_lshift(cur, level, &i)))
-                               return error;
-                       if (i)
-                               optr = ptr = cur->bc_ptrs[level];
-                       else {
-                               /*
-                                * Next, try splitting the current block in
-                                * half. If this works we have to re-set our
-                                * variables because we could be in a
-                                * different block now.
-                                */
-                               if ((error = xfs_alloc_split(cur, level, &nbno,
-                                               &nkey, &ncur, &i)))
-                                       return error;
-                               if (i) {
-                                       bp = cur->bc_bufs[level];
-                                       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-                                       if ((error =
-                                               xfs_btree_check_sblock(cur,
-                                                       block, level, bp)))
-                                               return error;
-#endif
-                                       ptr = cur->bc_ptrs[level];
-                                       nrec.ar_startblock = nkey.ar_startblock;
-                                       nrec.ar_blockcount = nkey.ar_blockcount;
-                               }
-                               /*
-                                * Otherwise the insert fails.
-                                */
-                               else {
-                                       *stat = 0;
-                                       return 0;
-                               }
-                       }
-               }
-       }
-       /*
-        * At this point we know there's room for our new entry in the block
-        * we're pointing at.
-        */
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (level > 0) {
-               /*
-                * It's a non-leaf entry.  Make a hole for the new data
-                * in the key and ptr regions of the block.
-                */
-               kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-               pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = numrecs; i >= ptr; i--) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
-                               return error;
-               }
-#endif
-               memmove(&kp[ptr], &kp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*kp));
-               memmove(&pp[ptr], &pp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
-                       return error;
-#endif
-               /*
-                * Now stuff the new data in, bump numrecs and log the new data.
-                */
-               kp[ptr - 1] = key;
-               pp[ptr - 1] = cpu_to_be32(*bnop);
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_alloc_log_keys(cur, bp, ptr, numrecs);
-               xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
-#ifdef DEBUG
-               if (ptr < numrecs)
-                       xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
-                               kp + ptr);
-#endif
-       } else {
-               /*
-                * It's a leaf entry.  Make a hole for the new record.
-                */
-               rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-               memmove(&rp[ptr], &rp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*rp));
-               /*
-                * Now stuff the new record in, bump numrecs
-                * and log the new data.
-                */
-               rp[ptr - 1] = *recp;
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_alloc_log_recs(cur, bp, ptr, numrecs);
-#ifdef DEBUG
-               if (ptr < numrecs)
-                       xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
-                               rp + ptr);
-#endif
-       }
-       /*
-        * Log the new number of records in the btree header.
-        */
-       xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-       /*
-        * If we inserted at the start of a block, update the parents' keys.
-        */
-       if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
-               return error;
-       /*
-        * Look to see if the longest extent in the allocation group
-        * needs to be updated.
-        */
 
-       agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-       if (level == 0 &&
-           cur->bc_btnum == XFS_BTNUM_CNT &&
-           be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-           be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
-               /*
-                * If this is a leaf in the by-size btree and there
-                * is no right sibling block and this block is bigger
-                * than the previous longest block, update it.
-                */
-               agf->agf_longest = recp->ar_blockcount;
-               cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
-                       = be32_to_cpu(recp->ar_blockcount);
-               xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                       XFS_AGF_LONGEST);
-       }
-       /*
-        * Return the new block number, if any.
-        * If there is one, give back a record value and a cursor too.
-        */
-       *bnop = nbno;
-       if (nbno != NULLAGBLOCK) {
-               *recp = nrec;
-               *curp = ncur;
-       }
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_alloc_log_block(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     fields) /* mask of fields: XFS_BB_... */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+       struct xfs_btree_cur    *cur)
 {
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       static const short      offsets[] = {   /* table of offsets */
-               offsetof(xfs_alloc_block_t, bb_magic),
-               offsetof(xfs_alloc_block_t, bb_level),
-               offsetof(xfs_alloc_block_t, bb_numrecs),
-               offsetof(xfs_alloc_block_t, bb_leftsib),
-               offsetof(xfs_alloc_block_t, bb_rightsib),
-               sizeof(xfs_alloc_block_t)
-       };
-
-       xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
-       xfs_trans_log_buf(tp, bp, first, last);
+       return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.a.agbp, cur->bc_private.a.agno,
+                       cur->bc_btnum);
 }
-
-/*
- * Log keys from a btree block (nonleaf).
- */
-STATIC void
-xfs_alloc_log_keys(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     kfirst, /* index of first key to log */
-       int                     klast)  /* index of last key to log */
-{
-       xfs_alloc_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       xfs_alloc_key_t         *kp;    /* key pointer in btree block */
-       int                     last;   /* last byte offset logged */
-
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-       kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
-}
-
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
-STATIC void
-xfs_alloc_log_ptrs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     pfirst, /* index of first pointer to log */
-       int                     plast)  /* index of last pointer to log */
-{
-       xfs_alloc_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       xfs_alloc_ptr_t         *pp;    /* block-pointer pointer in btree blk */
-
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-       pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
-}
-
-/*
- * Log records from a btree block (leaf).
- */
-STATIC void
-xfs_alloc_log_recs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     rfirst, /* index of first record to log */
-       int                     rlast)  /* index of last record to log */
-{
-       xfs_alloc_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       xfs_alloc_rec_t         *rp;    /* record pointer for btree block */
-
-
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-       rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-#ifdef DEBUG
-       {
-               xfs_agf_t       *agf;
-               xfs_alloc_rec_t *p;
-
-               agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-               for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
-                       ASSERT(be32_to_cpu(p->ar_startblock) +
-                              be32_to_cpu(p->ar_blockcount) <=
-                              be32_to_cpu(agf->agf_length));
-       }
-#endif
-       first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
-}
-
-/*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
- */
-STATIC int                             /* error */
-xfs_alloc_lookup(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_lookup_t            dir,    /* <=, ==, or >= */
-       int                     *stat)  /* success/failure */
-{
-       xfs_agblock_t           agbno;  /* a.g. relative btree block number */
-       xfs_agnumber_t          agno;   /* allocation group number */
-       xfs_alloc_block_t       *block=NULL;    /* current btree block */
-       int                     diff;   /* difference for the current key */
-       int                     error;  /* error return value */
-       int                     keyno=0;        /* current key number */
-       int                     level;  /* level in the btree */
-       xfs_mount_t             *mp;    /* file system mount point */
-
-       XFS_STATS_INC(xs_abt_lookup);
-       /*
-        * Get the allocation group header, and the root block number.
-        */
-       mp = cur->bc_mp;
-
-       {
-               xfs_agf_t       *agf;   /* a.g. freespace header */
-
-               agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-               agno = be32_to_cpu(agf->agf_seqno);
-               agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
-       }
-       /*
-        * Iterate over each level in the btree, starting at the root.
-        * For each level above the leaves, find the key we need, based
-        * on the lookup record, then follow the corresponding block
-        * pointer down to the next level.
-        */
-       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-               xfs_buf_t       *bp;    /* buffer pointer for btree block */
-               xfs_daddr_t     d;      /* disk address of btree block */
-
-               /*
-                * Get the disk address we're looking for.
-                */
-               d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-               /*
-                * If the old buffer at this level is for a different block,
-                * throw it away, otherwise just use it.
-                */
-               bp = cur->bc_bufs[level];
-               if (bp && XFS_BUF_ADDR(bp) != d)
-                       bp = NULL;
-               if (!bp) {
-                       /*
-                        * Need to get a new buffer.  Read it, then
-                        * set it in the cursor, releasing the old one.
-                        */
-                       if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
-                                       agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
-                               return error;
-                       xfs_btree_setbuf(cur, level, bp);
-                       /*
-                        * Point to the btree block, now that we have the buffer
-                        */
-                       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-                       if ((error = xfs_btree_check_sblock(cur, block, level,
-                                       bp)))
-                               return error;
-               } else
-                       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-               /*
-                * If we already had a key match at a higher level, we know
-                * we need to use the first entry in this block.
-                */
-               if (diff == 0)
-                       keyno = 1;
-               /*
-                * Otherwise we need to search this block.  Do a binary search.
-                */
-               else {
-                       int             high;   /* high entry number */
-                       xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
-                       xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
-                       int             low;    /* low entry number */
-
-                       /*
-                        * Get a pointer to keys or records.
-                        */
-                       if (level > 0)
-                               kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-                       else
-                               krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
-                       /*
-                        * Set low and high entry numbers, 1-based.
-                        */
-                       low = 1;
-                       if (!(high = be16_to_cpu(block->bb_numrecs))) {
-                               /*
-                                * If the block is empty, the tree must
-                                * be an empty leaf.
-                                */
-                               ASSERT(level == 0 && cur->bc_nlevels == 1);
-                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-                               *stat = 0;
-                               return 0;
-                       }
-                       /*
-                        * Binary search the block.
-                        */
-                       while (low <= high) {
-                               xfs_extlen_t    blockcount;     /* key value */
-                               xfs_agblock_t   startblock;     /* key value */
-
-                               XFS_STATS_INC(xs_abt_compare);
-                               /*
-                                * keyno is average of low and high.
-                                */
-                               keyno = (low + high) >> 1;
-                               /*
-                                * Get startblock & blockcount.
-                                */
-                               if (level > 0) {
-                                       xfs_alloc_key_t *kkp;
-
-                                       kkp = kkbase + keyno - 1;
-                                       startblock = be32_to_cpu(kkp->ar_startblock);
-                                       blockcount = be32_to_cpu(kkp->ar_blockcount);
-                               } else {
-                                       xfs_alloc_rec_t *krp;
-
-                                       krp = krbase + keyno - 1;
-                                       startblock = be32_to_cpu(krp->ar_startblock);
-                                       blockcount = be32_to_cpu(krp->ar_blockcount);
-                               }
-                               /*
-                                * Compute difference to get next direction.
-                                */
-                               if (cur->bc_btnum == XFS_BTNUM_BNO)
-                                       diff = (int)startblock -
-                                              (int)cur->bc_rec.a.ar_startblock;
-                               else if (!(diff = (int)blockcount -
-                                           (int)cur->bc_rec.a.ar_blockcount))
-                                       diff = (int)startblock -
-                                           (int)cur->bc_rec.a.ar_startblock;
-                               /*
-                                * Less than, move right.
-                                */
-                               if (diff < 0)
-                                       low = keyno + 1;
-                               /*
-                                * Greater than, move left.
-                                */
-                               else if (diff > 0)
-                                       high = keyno - 1;
-                               /*
-                                * Equal, we're done.
-                                */
-                               else
-                                       break;
-                       }
-               }
-               /*
-                * If there are more levels, set up for the next level
-                * by getting the block number and filling in the cursor.
-                */
-               if (level > 0) {
-                       /*
-                        * If we moved left, need the previous key number,
-                        * unless there isn't one.
-                        */
-                       if (diff > 0 && --keyno < 1)
-                               keyno = 1;
-                       agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
-                       if ((error = xfs_btree_check_sptr(cur, agbno, level)))
-                               return error;
-#endif
-                       cur->bc_ptrs[level] = keyno;
-               }
-       }
-       /*
-        * Done with the search.
-        * See if we need to adjust the results.
-        */
-       if (dir != XFS_LOOKUP_LE && diff < 0) {
-               keyno++;
-               /*
-                * If ge search and we went off the end of the block, but it's
-                * not the last block, we're in the wrong block.
-                */
-               if (dir == XFS_LOOKUP_GE &&
-                   keyno > be16_to_cpu(block->bb_numrecs) &&
-                   be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
-                       int     i;
-
-                       cur->bc_ptrs[0] = keyno;
-                       if ((error = xfs_alloc_increment(cur, 0, &i)))
-                               return error;
-                       XFS_WANT_CORRUPTED_RETURN(i == 1);
-                       *stat = 1;
-                       return 0;
-               }
-       }
-       else if (dir == XFS_LOOKUP_LE && diff > 0)
-               keyno--;
-       cur->bc_ptrs[0] = keyno;
-       /*
-        * Return if we succeeded or not.
-        */
-       if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
-               *stat = 0;
-       else
-               *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
-       return 0;
-}
-
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                             /* error */
-xfs_alloc_lshift(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to shift record on */
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-#ifdef DEBUG
-       int                     i;      /* loop index */
-#endif
-       xfs_alloc_key_t         key;    /* key value for leaf level upward */
-       xfs_buf_t               *lbp;   /* buffer for left neighbor block */
-       xfs_alloc_block_t       *left;  /* left neighbor btree block */
-       int                     nrec;   /* new number of left block entries */
-       xfs_buf_t               *rbp;   /* buffer for right (current) block */
-       xfs_alloc_block_t       *right; /* right (current) btree block */
-       xfs_alloc_key_t         *rkp=NULL;      /* key pointer for right block */
-       xfs_alloc_ptr_t         *rpp=NULL;      /* address pointer for right block */
-       xfs_alloc_rec_t         *rrp=NULL;      /* record pointer for right block */
-
-       /*
-        * Set up variables for this block as "right".
-        */
-       rbp = cur->bc_bufs[level];
-       right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-               return error;
-#endif
-       /*
-        * If we've got no left sibling then we can't shift an entry left.
-        */
-       if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] <= 1) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Set up the left neighbor as "left".
-        */
-       if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
-                       0, &lbp, XFS_ALLOC_BTREE_REF)))
-               return error;
-       left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-       /*
-        * If it's full, it can't take another entry.
-        */
-       if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-               *stat = 0;
-               return 0;
-       }
-       nrec = be16_to_cpu(left->bb_numrecs) + 1;
-       /*
-        * If non-leaf, copy a key and a ptr to the left block.
-        */
-       if (level > 0) {
-               xfs_alloc_key_t *lkp;   /* key pointer for left block */
-               xfs_alloc_ptr_t *lpp;   /* address pointer for left block */
-
-               lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
-               rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               *lkp = *rkp;
-               xfs_alloc_log_keys(cur, lbp, nrec, nrec);
-               lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
-               rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
-                       return error;
-#endif
-               *lpp = *rpp;
-               xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
-               xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
-       }
-       /*
-        * If leaf, copy a record to the left block.
-        */
-       else {
-               xfs_alloc_rec_t *lrp;   /* record pointer for left block */
-
-               lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
-               rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-               *lrp = *rrp;
-               xfs_alloc_log_recs(cur, lbp, nrec, nrec);
-               xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
-       }
-       /*
-        * Bump and log left's numrecs, decrement and log right's numrecs.
-        */
-       be16_add_cpu(&left->bb_numrecs, 1);
-       xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-       be16_add_cpu(&right->bb_numrecs, -1);
-       xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-       /*
-        * Slide the contents of right down one entry.
-        */
-       if (level > 0) {
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
-                                       level)))
-                               return error;
-               }
-#endif
-               memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-       } else {
-               memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               key.ar_startblock = rrp->ar_startblock;
-               key.ar_blockcount = rrp->ar_blockcount;
-               rkp = &key;
-       }
-       /*
-        * Update the parent key values of right.
-        */
-       if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
-               return error;
-       /*
-        * Slide the cursor value left one.
-        */
-       cur->bc_ptrs[level]--;
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int                             /* error */
-xfs_alloc_newroot(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-       xfs_agblock_t           lbno;   /* left block number */
-       xfs_buf_t               *lbp;   /* left btree buffer */
-       xfs_alloc_block_t       *left;  /* left btree block */
-       xfs_mount_t             *mp;    /* mount structure */
-       xfs_agblock_t           nbno;   /* new block number */
-       xfs_buf_t               *nbp;   /* new (root) buffer */
-       xfs_alloc_block_t       *new;   /* new (root) btree block */
-       int                     nptr;   /* new value for key index, 1 or 2 */
-       xfs_agblock_t           rbno;   /* right block number */
-       xfs_buf_t               *rbp;   /* right btree buffer */
-       xfs_alloc_block_t       *right; /* right btree block */
-
-       mp = cur->bc_mp;
-
-       ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
-       /*
-        * Get a buffer from the freelist blocks, for the new root.
-        */
-       error = xfs_alloc_get_freelist(cur->bc_tp,
-                                       cur->bc_private.a.agbp, &nbno, 1);
-       if (error)
-               return error;
-       /*
-        * None available, we fail.
-        */
-       if (nbno == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
-       nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
-               0);
-       new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
-       /*
-        * Set the root data in the a.g. freespace structure.
-        */
-       {
-               xfs_agf_t       *agf;   /* a.g. freespace header */
-               xfs_agnumber_t  seqno;
-
-               agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-               agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
-               be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1);
-               seqno = be32_to_cpu(agf->agf_seqno);
-               mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
-               xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                       XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-       }
-       /*
-        * At the previous root level there are now two blocks: the old
-        * root, and the new block generated when it was split.
-        * We don't know which one the cursor is pointing at, so we
-        * set up variables "left" and "right" for each case.
-        */
-       lbp = cur->bc_bufs[cur->bc_nlevels - 1];
-       left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
-               return error;
-#endif
-       if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
-               /*
-                * Our block is left, pick up the right block.
-                */
-               lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
-               rbno = be32_to_cpu(left->bb_rightsib);
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, rbno, 0, &rbp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-               if ((error = xfs_btree_check_sblock(cur, right,
-                               cur->bc_nlevels - 1, rbp)))
-                       return error;
-               nptr = 1;
-       } else {
-               /*
-                * Our block is right, pick up the left block.
-                */
-               rbp = lbp;
-               right = left;
-               rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
-               lbno = be32_to_cpu(right->bb_leftsib);
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, lbno, 0, &lbp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-               if ((error = xfs_btree_check_sblock(cur, left,
-                               cur->bc_nlevels - 1, lbp)))
-                       return error;
-               nptr = 2;
-       }
-       /*
-        * Fill in the new block's btree header and log it.
-        */
-       new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-       new->bb_level = cpu_to_be16(cur->bc_nlevels);
-       new->bb_numrecs = cpu_to_be16(2);
-       new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-       new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-       xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
-       ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
-       /*
-        * Fill in the key data in the new root.
-        */
-       {
-               xfs_alloc_key_t         *kp;    /* btree key pointer */
-
-               kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
-               if (be16_to_cpu(left->bb_level) > 0) {
-                       kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
-                       kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               } else {
-                       xfs_alloc_rec_t *rp;    /* btree record pointer */
-
-                       rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
-                       kp[0].ar_startblock = rp->ar_startblock;
-                       kp[0].ar_blockcount = rp->ar_blockcount;
-                       rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-                       kp[1].ar_startblock = rp->ar_startblock;
-                       kp[1].ar_blockcount = rp->ar_blockcount;
-               }
-       }
-       xfs_alloc_log_keys(cur, nbp, 1, 2);
-       /*
-        * Fill in the pointer data in the new root.
-        */
-       {
-               xfs_alloc_ptr_t         *pp;    /* btree address pointer */
-
-               pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
-               pp[0] = cpu_to_be32(lbno);
-               pp[1] = cpu_to_be32(rbno);
-       }
-       xfs_alloc_log_ptrs(cur, nbp, 1, 2);
-       /*
-        * Fix up the cursor.
-        */
-       xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
-       cur->bc_ptrs[cur->bc_nlevels] = nptr;
-       cur->bc_nlevels++;
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                             /* error */
-xfs_alloc_rshift(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to shift record on */
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_alloc_key_t         key;    /* key value for leaf level upward */
-       xfs_buf_t               *lbp;   /* buffer for left (current) block */
-       xfs_alloc_block_t       *left;  /* left (current) btree block */
-       xfs_buf_t               *rbp;   /* buffer for right neighbor block */
-       xfs_alloc_block_t       *right; /* right neighbor btree block */
-       xfs_alloc_key_t         *rkp;   /* key pointer for right block */
-       xfs_btree_cur_t         *tcur;  /* temporary cursor */
-
-       /*
-        * Set up variables for this block as "left".
-        */
-       lbp = cur->bc_bufs[level];
-       left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-#endif
-       /*
-        * If we've got no right sibling then we can't shift an entry right.
-        */
-       if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Set up the right neighbor as "right".
-        */
-       if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
-                       0, &rbp, XFS_ALLOC_BTREE_REF)))
-               return error;
-       right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-       if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+
+STATIC void
+xfs_allocbt_set_root(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     inc)
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
+       int                     btnum = cur->bc_btnum;
+
+       ASSERT(ptr->s != 0);
+
+       agf->agf_roots[btnum] = ptr->s;
+       be32_add_cpu(&agf->agf_levels[btnum], inc);
+       cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
+
+       xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     length,
+       int                     *stat)
+{
+       int                     error;
+       xfs_agblock_t           bno;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       /* Allocate the new block from the freelist. If we can't, give up.  */
+       error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+                                      &bno, 1);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
                return error;
-       /*
-        * If it's full, it can't take another entry.
-        */
-       if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+       }
+
+       if (bno == NULLAGBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
                *stat = 0;
                return 0;
        }
-       /*
-        * Make a hole at the start of the right neighbor block, then
-        * copy the last left block entry to the hole.
-        */
-       if (level > 0) {
-               xfs_alloc_key_t *lkp;   /* key pointer for left block */
-               xfs_alloc_ptr_t *lpp;   /* address pointer for left block */
-               xfs_alloc_ptr_t *rpp;   /* address pointer for right block */
 
-               lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-                               return error;
-               }
-#endif
-               memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
-                       return error;
-#endif
-               *rkp = *lkp;
-               *rpp = *lpp;
-               xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
-       } else {
-               xfs_alloc_rec_t *lrp;   /* record pointer for left block */
-               xfs_alloc_rec_t *rrp;   /* record pointer for right block */
+       xfs_trans_agbtree_delta(cur->bc_tp, 1);
+       new->s = cpu_to_be32(bno);
 
-               lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-               memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               *rrp = *lrp;
-               xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               key.ar_startblock = rrp->ar_startblock;
-               key.ar_blockcount = rrp->ar_blockcount;
-               rkp = &key;
-               xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
-       }
-       /*
-        * Decrement and log left's numrecs, bump and log right's numrecs.
-        */
-       be16_add_cpu(&left->bb_numrecs, -1);
-       xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-       be16_add_cpu(&right->bb_numrecs, 1);
-       xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-       /*
-        * Using a temporary cursor, update the parent key values of the
-        * block on the right.
-        */
-       if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-               return error;
-       i = xfs_btree_lastrec(tcur, level);
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       if ((error = xfs_alloc_increment(tcur, level, &i)) ||
-           (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
-               goto error0;
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
        *stat = 1;
        return 0;
-error0:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
 }
 
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int                             /* error */
-xfs_alloc_split(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to split */
-       xfs_agblock_t           *bnop,  /* output: block number allocated */
-       xfs_alloc_key_t         *keyp,  /* output: first key of new block */
-       xfs_btree_cur_t         **curp, /* output: new cursor */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_allocbt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
 {
-       int                     error;  /* error return value */
-       int                     i;      /* loop index/record number */
-       xfs_agblock_t           lbno;   /* left (current) block number */
-       xfs_buf_t               *lbp;   /* buffer for left block */
-       xfs_alloc_block_t       *left;  /* left (current) btree block */
-       xfs_agblock_t           rbno;   /* right (new) block number */
-       xfs_buf_t               *rbp;   /* buffer for right block */
-       xfs_alloc_block_t       *right; /* right (new) btree block */
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agblock_t           bno;
+       int                     error;
 
-       /*
-        * Allocate the new block from the freelist.
-        * If we can't do it, we're toast.  Give up.
-        */
-       error = xfs_alloc_get_freelist(cur->bc_tp,
-                                        cur->bc_private.a.agbp, &rbno, 1);
+       bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
+       error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
        if (error)
                return error;
-       if (rbno == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
-       rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
-               rbno, 0);
-       /*
-        * Set up the new block as "right".
-        */
-       right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-       /*
-        * "Left" is the current (according to the cursor) block.
-        */
-       lbp = cur->bc_bufs[level];
-       left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-#endif
-       /*
-        * Fill in the btree header for the new block.
-        */
-       right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-       right->bb_level = left->bb_level;
-       right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
-       /*
-        * Make sure that if there's an odd number of entries now, that
-        * each new block will have the same number of entries.
-        */
-       if ((be16_to_cpu(left->bb_numrecs) & 1) &&
-           cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-               be16_add_cpu(&right->bb_numrecs, 1);
-       i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
-       /*
-        * For non-leaf blocks, copy keys and addresses over to the new block.
-        */
-       if (level > 0) {
-               xfs_alloc_key_t *lkp;   /* left btree key pointer */
-               xfs_alloc_ptr_t *lpp;   /* left btree address pointer */
-               xfs_alloc_key_t *rkp;   /* right btree key pointer */
-               xfs_alloc_ptr_t *rpp;   /* right btree address pointer */
-
-               lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
-               lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
-               rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
-                               return error;
-               }
-#endif
-               memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               *keyp = *rkp;
-       }
-       /*
-        * For leaf blocks, copy records over to the new block.
-        */
-       else {
-               xfs_alloc_rec_t *lrp;   /* left btree record pointer */
-               xfs_alloc_rec_t *rrp;   /* right btree record pointer */
-
-               lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
-               rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-               memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               keyp->ar_startblock = rrp->ar_startblock;
-               keyp->ar_blockcount = rrp->ar_blockcount;
-       }
-       /*
-        * Find the left block number by looking in the buffer.
-        * Adjust numrecs, sibling pointers.
-        */
-       lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
-       be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
-       right->bb_rightsib = left->bb_rightsib;
-       left->bb_rightsib = cpu_to_be32(rbno);
-       right->bb_leftsib = cpu_to_be32(lbno);
-       xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
-       xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       /*
-        * If there's a block to the new block's right, make that block
-        * point back to right instead of to left.
-        */
-       if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
-               xfs_alloc_block_t       *rrblock;       /* rr btree block */
-               xfs_buf_t               *rrbp;          /* buffer for rrblock */
 
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0,
-                               &rrbp, XFS_ALLOC_BTREE_REF)))
-                       return error;
-               rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
-               if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-                       return error;
-               rrblock->bb_leftsib = cpu_to_be32(rbno);
-               xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * If the cursor is really in the right block, move it there.
-        * If it's just pointing past the last entry in left, then we'll
-        * insert there, so don't change anything in that case.
-        */
-       if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
-               xfs_btree_setbuf(cur, level, rbp);
-               cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
-       }
        /*
-        * If there are more levels, we'll need another cursor which refers to
-        * the right block, no matter where this cursor was.
+        * Since blocks move to the free list without the coordination used in
+        * xfs_bmap_finish, we can't allow block to be available for
+        * reallocation and non-transaction writing (user data) until we know
+        * that the transaction that moved it to the free list is permanently
+        * on disk. We track the blocks by declaring these blocks as "busy";
+        * the busy list is maintained on a per-ag basis and each transaction
+        * records which entries should be removed when the iclog commits to
+        * disk. If a busy block is allocated, the iclog is pushed up to the
+        * LSN that freed the block.
         */
-       if (level + 1 < cur->bc_nlevels) {
-               if ((error = xfs_btree_dup_cursor(cur, curp)))
-                       return error;
-               (*curp)->bc_ptrs[level + 1]++;
-       }
-       *bnop = rbno;
-       *stat = 1;
+       xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+       xfs_trans_agbtree_delta(cur->bc_tp, -1);
        return 0;
 }
 
 /*
- * Update keys at all levels from here to the root along the cursor's path.
+ * Update the longest extent in the AGF
  */
-STATIC int                             /* error */
-xfs_alloc_updkey(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_alloc_key_t         *keyp,  /* new key value to update to */
-       int                     level)  /* starting level for update */
+STATIC void
+xfs_allocbt_update_lastrec(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_rec     *rec,
+       int                     ptr,
+       int                     reason)
 {
-       int                     ptr;    /* index of key in block */
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
+       __be32                  len;
+       int                     numrecs;
 
-       /*
-        * Go up the tree from this level toward the root.
-        * At each level, update the key value to the value input.
-        * Stop when we reach a level where the cursor isn't pointing
-        * at the first entry in the block.
-        */
-       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-               xfs_alloc_block_t       *block; /* btree block */
-               xfs_buf_t               *bp;    /* buffer for block */
-#ifdef DEBUG
-               int                     error;  /* error return value */
-#endif
-               xfs_alloc_key_t         *kp;    /* ptr to btree block keys */
+       ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+       switch (reason) {
+       case LASTREC_UPDATE:
+               /*
+                * If this is the last leaf block and it's the last record,
+                * then update the size of the longest extent in the AG.
+                */
+               if (ptr != xfs_btree_get_numrecs(block))
+                       return;
+               len = rec->alloc.ar_blockcount;
+               break;
+       case LASTREC_INSREC:
+               if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+                   be32_to_cpu(agf->agf_longest))
+                       return;
+               len = rec->alloc.ar_blockcount;
+               break;
+       case LASTREC_DELREC:
+               numrecs = xfs_btree_get_numrecs(block);
+               if (ptr <= numrecs)
+                       return;
+               ASSERT(ptr == numrecs + 1);
 
-               bp = cur->bc_bufs[level];
-               block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-                       return error;
-#endif
-               ptr = cur->bc_ptrs[level];
-               kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
-               *kp = *keyp;
-               xfs_alloc_log_keys(cur, bp, ptr, ptr);
+               if (numrecs) {
+                       xfs_alloc_rec_t *rrp;
+
+                       rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+                       len = rrp->ar_blockcount;
+               } else {
+                       len = 0;
+               }
+
+               break;
+       default:
+               ASSERT(0);
+               return;
        }
-       return 0;
+
+       agf->agf_longest = len;
+       cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
+       xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
 }
 
-/*
- * Externally visible routines.
- */
+STATIC int
+xfs_allocbt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       return cur->bc_mp->m_alloc_mnr[level != 0];
+}
 
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                    /* error */
-xfs_alloc_decrement(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level in btree, 0 is leaf */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_allocbt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
 {
-       xfs_alloc_block_t       *block; /* btree block */
-       int                     error;  /* error return value */
-       int                     lev;    /* btree level */
+       return cur->bc_mp->m_alloc_mxr[level != 0];
+}
 
-       ASSERT(level < cur->bc_nlevels);
-       /*
-        * Read-ahead to the left at this level.
-        */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-       /*
-        * Decrement the ptr at this level.  If we're still in the block
-        * then we're done.
-        */
-       if (--cur->bc_ptrs[level] > 0) {
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * Get a pointer to the btree block.
-        */
-       block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level,
-                       cur->bc_bufs[level])))
-               return error;
-#endif
-       /*
-        * If we just went off the left edge of the tree, return failure.
-        */
-       if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * March up the tree decrementing pointers.
-        * Stop when we don't go off the left edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               if (--cur->bc_ptrs[lev] > 0)
-                       break;
-               /*
-                * Read-ahead the left block, we're going to read it
-                * in the next loop.
-                */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-       }
-       /*
-        * If we went off the root then we are seriously confused.
-        */
-       ASSERT(lev < cur->bc_nlevels);
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
-               xfs_agblock_t   agbno;  /* block number of btree block */
-               xfs_buf_t       *bp;    /* buffer pointer for block */
+STATIC void
+xfs_allocbt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       ASSERT(rec->alloc.ar_startblock != 0);
 
-               agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, agbno, 0, &bp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-               cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
-       }
-       *stat = 1;
-       return 0;
+       key->alloc.ar_startblock = rec->alloc.ar_startblock;
+       key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
 }
 
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int                                    /* error */
-xfs_alloc_delete(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_allocbt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
 {
-       int             error;          /* error return value */
-       int             i;              /* result code */
-       int             level;          /* btree level */
+       ASSERT(key->alloc.ar_startblock != 0);
 
-       /*
-        * Go up the tree, starting at leaf level.
-        * If 2 is returned then a join was done; go to the next level.
-        * Otherwise we are done.
-        */
-       for (level = 0, i = 2; i == 2; level++) {
-               if ((error = xfs_alloc_delrec(cur, level, &i)))
-                       return error;
-       }
-       if (i == 0) {
-               for (level = 1; level < cur->bc_nlevels; level++) {
-                       if (cur->bc_ptrs[level] == 0) {
-                               if ((error = xfs_alloc_decrement(cur, level, &i)))
-                                       return error;
-                               break;
-                       }
-               }
-       }
-       *stat = i;
-       return 0;
+       rec->alloc.ar_startblock = key->alloc.ar_startblock;
+       rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
 }
 
-/*
- * Get the data from the pointed-to record.
- */
-int                                    /* error */
-xfs_alloc_get_rec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agblock_t           *bno,   /* output: starting block of extent */
-       xfs_extlen_t            *len,   /* output: length of extent */
-       int                     *stat)  /* output: success/failure */
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
 {
-       xfs_alloc_block_t       *block; /* btree block */
-#ifdef DEBUG
-       int                     error;  /* error return value */
-#endif
-       int                     ptr;    /* record number */
+       ASSERT(cur->bc_rec.a.ar_startblock != 0);
 
-       ptr = cur->bc_ptrs[0];
-       block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
-               return error;
-#endif
-       /*
-        * Off the right end or left end, return failure.
-        */
-       if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Point to the record and extract its data.
-        */
-       {
-               xfs_alloc_rec_t         *rec;   /* record data */
+       rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+       rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+       ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
+
+       ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC __int64_t
+xfs_allocbt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       xfs_alloc_rec_incore_t  *rec = &cur->bc_rec.a;
+       xfs_alloc_key_t         *kp = &key->alloc;
+       __int64_t               diff;
 
-               rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-               *bno = be32_to_cpu(rec->ar_startblock);
-               *len = be32_to_cpu(rec->ar_blockcount);
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+                               rec->ar_startblock;
        }
-       *stat = 1;
-       return 0;
+
+       diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+       if (diff)
+               return diff;
+
+       return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                    /* error */
-xfs_alloc_increment(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level in btree, 0 is leaf */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_allocbt_kill_root(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     level,
+       union xfs_btree_ptr     *newroot)
 {
-       xfs_alloc_block_t       *block; /* btree block */
-       xfs_buf_t               *bp;    /* tree block buffer */
-       int                     error;  /* error return value */
-       int                     lev;    /* btree level */
+       int                     error;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, killroot);
 
-       ASSERT(level < cur->bc_nlevels);
-       /*
-        * Read-ahead to the right at this level.
-        */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
        /*
-        * Get a pointer to the btree block.
+        * Update the root pointer, decreasing the level by 1 and then
+        * free the old root.
         */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+       xfs_allocbt_set_root(cur, newroot, -1);
+       error = xfs_allocbt_free_block(cur, bp);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
                return error;
-#endif
-       /*
-        * Increment the ptr at this level.  If we're still in the block
-        * then we're done.
-        */
-       if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * If we just went off the right edge of the tree, return failure.
-        */
-       if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
        }
-       /*
-        * March up the tree incrementing pointers.
-        * Stop when we don't go off the right edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               bp = cur->bc_bufs[lev];
-               block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+
+       XFS_BTREE_STATS_INC(cur, free);
+
+       xfs_btree_setbuf(cur, level, NULL);
+       cur->bc_nlevels--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
 #ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-#endif
-               if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
-                       break;
-               /*
-                * Read-ahead the right block, we're going to read it
-                * in the next loop.
-                */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+STATIC int
+xfs_allocbt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return be32_to_cpu(k1->alloc.ar_startblock) <
+                      be32_to_cpu(k2->alloc.ar_startblock);
+       } else {
+               return be32_to_cpu(k1->alloc.ar_blockcount) <
+                       be32_to_cpu(k2->alloc.ar_blockcount) ||
+                       (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+                        be32_to_cpu(k1->alloc.ar_startblock) <
+                        be32_to_cpu(k2->alloc.ar_startblock));
        }
-       /*
-        * If we went off the root then we are seriously confused.
-        */
-       ASSERT(lev < cur->bc_nlevels);
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-            lev > level; ) {
-               xfs_agblock_t   agbno;  /* block number of btree block */
+}
 
-               agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, agbno, 0, &bp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-               cur->bc_ptrs[lev] = 1;
+STATIC int
+xfs_allocbt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
+{
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return be32_to_cpu(r1->alloc.ar_startblock) +
+                       be32_to_cpu(r1->alloc.ar_blockcount) <=
+                       be32_to_cpu(r2->alloc.ar_startblock);
+       } else {
+               return be32_to_cpu(r1->alloc.ar_blockcount) <
+                       be32_to_cpu(r2->alloc.ar_blockcount) ||
+                       (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+                        be32_to_cpu(r1->alloc.ar_startblock) <
+                        be32_to_cpu(r2->alloc.ar_startblock));
        }
-       *stat = 1;
-       return 0;
 }
+#endif /* DEBUG */
 
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-int                                    /* error */
-xfs_alloc_insert(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             *stat)          /* success/failure */
-{
-       int             error;          /* error return value */
-       int             i;              /* result value, 0 for failure */
-       int             level;          /* current level number in btree */
-       xfs_agblock_t   nbno;           /* new block number (split result) */
-       xfs_btree_cur_t *ncur;          /* new cursor (split result) */
-       xfs_alloc_rec_t nrec;           /* record being inserted this level */
-       xfs_btree_cur_t *pcur;          /* previous level's cursor */
+#ifdef XFS_BTREE_TRACE
+ktrace_t       *xfs_allocbt_trace_buf;
 
-       level = 0;
-       nbno = NULLAGBLOCK;
-       nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
-       nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
-       ncur = NULL;
-       pcur = cur;
-       /*
-        * Loop going up the tree, starting at the leaf level.
-        * Stop when we don't get a split block, that must mean that
-        * the insert is finished with this level.
-        */
-       do {
-               /*
-                * Insert nrec/nbno into this level of the tree.
-                * Note if we fail, nbno will be null.
-                */
-               if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
-                               &i))) {
-                       if (pcur != cur)
-                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-                       return error;
-               }
-               /*
-                * See if the cursor we just used is trash.
-                * Can't trash the caller's cursor, but otherwise we should
-                * if ncur is a new cursor or we're about to be done.
-                */
-               if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
-                       cur->bc_nlevels = pcur->bc_nlevels;
-                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-               }
-               /*
-                * If we got a new cursor, switch to it.
-                */
-               if (ncur) {
-                       pcur = ncur;
-                       ncur = NULL;
-               }
-       } while (nbno != NULLAGBLOCK);
-       *stat = i;
-       return 0;
+STATIC void
+xfs_allocbt_trace_enter(
+       struct xfs_btree_cur    *cur,
+       const char              *func,
+       char                    *s,
+       int                     type,
+       int                     line,
+       __psunsigned_t          a0,
+       __psunsigned_t          a1,
+       __psunsigned_t          a2,
+       __psunsigned_t          a3,
+       __psunsigned_t          a4,
+       __psunsigned_t          a5,
+       __psunsigned_t          a6,
+       __psunsigned_t          a7,
+       __psunsigned_t          a8,
+       __psunsigned_t          a9,
+       __psunsigned_t          a10)
+{
+       ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type,
+               (void *)func, (void *)s, NULL, (void *)cur,
+               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+               (void *)a8, (void *)a9, (void *)a10);
 }
 
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-int                                    /* error */
-xfs_alloc_lookup_eq(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agblock_t   bno,            /* starting block of extent */
-       xfs_extlen_t    len,            /* length of extent */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_allocbt_trace_cursor(
+       struct xfs_btree_cur    *cur,
+       __uint32_t              *s0,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
 {
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
+       *s0 = cur->bc_private.a.agno;
+       *l0 = cur->bc_rec.a.ar_startblock;
+       *l1 = cur->bc_rec.a.ar_blockcount;
 }
 
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_alloc_lookup_ge(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agblock_t   bno,            /* starting block of extent */
-       xfs_extlen_t    len,            /* length of extent */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_allocbt_trace_key(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
 {
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
+       *l0 = be32_to_cpu(key->alloc.ar_startblock);
+       *l1 = be32_to_cpu(key->alloc.ar_blockcount);
 }
 
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_alloc_lookup_le(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agblock_t   bno,            /* starting block of extent */
-       xfs_extlen_t    len,            /* length of extent */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_allocbt_trace_record(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       __uint64_t              *l0,
+       __uint64_t              *l1,
+       __uint64_t              *l2)
 {
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
+       *l0 = be32_to_cpu(rec->alloc.ar_startblock);
+       *l1 = be32_to_cpu(rec->alloc.ar_blockcount);
+       *l2 = 0;
 }
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+       .rec_len                = sizeof(xfs_alloc_rec_t),
+       .key_len                = sizeof(xfs_alloc_key_t),
+
+       .dup_cursor             = xfs_allocbt_dup_cursor,
+       .set_root               = xfs_allocbt_set_root,
+       .kill_root              = xfs_allocbt_kill_root,
+       .alloc_block            = xfs_allocbt_alloc_block,
+       .free_block             = xfs_allocbt_free_block,
+       .update_lastrec         = xfs_allocbt_update_lastrec,
+       .get_minrecs            = xfs_allocbt_get_minrecs,
+       .get_maxrecs            = xfs_allocbt_get_maxrecs,
+       .init_key_from_rec      = xfs_allocbt_init_key_from_rec,
+       .init_rec_from_key      = xfs_allocbt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
+       .key_diff               = xfs_allocbt_key_diff,
+
+#ifdef DEBUG
+       .keys_inorder           = xfs_allocbt_keys_inorder,
+       .recs_inorder           = xfs_allocbt_recs_inorder,
+#endif
+
+#ifdef XFS_BTREE_TRACE
+       .trace_enter            = xfs_allocbt_trace_enter,
+       .trace_cursor           = xfs_allocbt_trace_cursor,
+       .trace_key              = xfs_allocbt_trace_key,
+       .trace_record           = xfs_allocbt_trace_record,
+#endif
+};
 
 /*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Allocate a new allocation btree cursor.
  */
-int                                    /* error */
-xfs_alloc_update(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len)    /* length of extent */
+struct xfs_btree_cur *                 /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_buf          *agbp,          /* buffer for agf structure */
+       xfs_agnumber_t          agno,           /* allocation group number */
+       xfs_btnum_t             btnum)          /* btree identifier */
 {
-       xfs_alloc_block_t       *block; /* btree block to update */
-       int                     error;  /* error return value */
-       int                     ptr;    /* current record number (updating) */
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       struct xfs_btree_cur    *cur;
 
-       ASSERT(len > 0);
-       /*
-        * Pick up the a.g. freelist struct and the current block.
-        */
-       block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
-               return error;
-#endif
-       /*
-        * Get the address of the rec to be updated.
-        */
-       ptr = cur->bc_ptrs[0];
-       {
-               xfs_alloc_rec_t         *rp;    /* pointer to updated record */
+       ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
 
-               rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-               /*
-                * Fill in the new contents and log them.
-                */
-               rp->ar_startblock = cpu_to_be32(bno);
-               rp->ar_blockcount = cpu_to_be32(len);
-               xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
-       }
-       /*
-        * If it's the by-size btree and it's the last leaf block and
-        * it's the last record... then update the size of the longest
-        * extent in the a.g., which we cache in the a.g. freelist header.
-        */
-       if (cur->bc_btnum == XFS_BTNUM_CNT &&
-           be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-           ptr == be16_to_cpu(block->bb_numrecs)) {
-               xfs_agf_t       *agf;   /* a.g. freespace header */
-               xfs_agnumber_t  seqno;
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
 
-               agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-               seqno = be32_to_cpu(agf->agf_seqno);
-               cur->bc_mp->m_perag[seqno].pagf_longest = len;
-               agf->agf_longest = cpu_to_be32(len);
-               xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                       XFS_AGF_LONGEST);
-       }
-       /*
-        * Updating first record in leaf. Pass new key value up to our parent.
-        */
-       if (ptr == 1) {
-               xfs_alloc_key_t key;    /* key containing [bno, len] */
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]);
+       cur->bc_btnum = btnum;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
 
-               key.ar_startblock = cpu_to_be32(bno);
-               key.ar_blockcount = cpu_to_be32(len);
-               if ((error = xfs_alloc_updkey(cur, &key, 1)))
-                       return error;
-       }
-       return 0;
+       cur->bc_ops = &xfs_allocbt_ops;
+       if (btnum == XFS_BTNUM_CNT)
+               cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+
+       cur->bc_private.a.agbp = agbp;
+       cur->bc_private.a.agno = agno;
+
+       return cur;
+}
+
+/*
+ * Calculate number of records in an alloc btree block.
+ */
+int
+xfs_allocbt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_alloc_rec_t);
+       return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
 }
index 5bd1a2c8bd0712a4ba05828802b8d7fb763727e2..a6caa0022c9bba4f937abaf032ede0761e930a22 100644 (file)
@@ -24,7 +24,6 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
@@ -50,16 +49,6 @@ typedef struct xfs_alloc_rec_incore {
 
 /* btree pointer type */
 typedef __be32 xfs_alloc_ptr_t;
-/* btree block header type */
-typedef        struct xfs_btree_sblock xfs_alloc_block_t;
-
-#define        XFS_BUF_TO_ALLOC_BLOCK(bp)      ((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
-
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define        XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
-#define        XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
 
 /*
  * Minimum and maximum blocksize and sectorsize.
@@ -83,73 +72,39 @@ typedef     struct xfs_btree_sblock xfs_alloc_block_t;
 #define        XFS_CNT_BLOCK(mp)       ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
 
 /*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define        XFS_ALLOC_REC_ADDR(bb,i,cur)    \
-       XFS_BTREE_REC_ADDR(xfs_alloc, bb, i)
-
-#define        XFS_ALLOC_KEY_ADDR(bb,i,cur)    \
-       XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i)
-
-#define        XFS_ALLOC_PTR_ADDR(bb,i,cur)    \
-       XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur,        xfs_agblock_t *bno,
-                               xfs_extlen_t *len, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-                               xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-                               xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
  */
-extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-                               xfs_extlen_t len, int *stat);
+#define XFS_ALLOC_BLOCK_LEN(mp)        XFS_BTREE_SBLOCK_LEN
 
 /*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-                               xfs_extlen_t len);
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+       ((xfs_alloc_rec_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+       ((xfs_alloc_key_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_alloc_ptr_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_alloc_key_t) + \
+                ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_buf *,
+               xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
 #endif /* __XFS_ALLOC_BTREE_H__ */
index 0b3b5efe848cbbf883373a971cf5b34d1c9951e7..53d5e70d13609a720127319da3d43d6e4c42dc0c 100644 (file)
 #endif
 
 #ifdef XFS_NATIVE_HOST
-#define cpu_to_be16(val)       ((__be16)(val))
-#define cpu_to_be32(val)       ((__be32)(val))
-#define cpu_to_be64(val)       ((__be64)(val))
-#define be16_to_cpu(val)       ((__uint16_t)(val))
-#define be32_to_cpu(val)       ((__uint32_t)(val))
-#define be64_to_cpu(val)       ((__uint64_t)(val))
+#define cpu_to_be16(val)       ((__force __be16)(__u16)(val))
+#define cpu_to_be32(val)       ((__force __be32)(__u32)(val))
+#define cpu_to_be64(val)       ((__force __be64)(__u64)(val))
+#define be16_to_cpu(val)       ((__force __u16)(__be16)(val))
+#define be32_to_cpu(val)       ((__force __u32)(__be32)(val))
+#define be64_to_cpu(val)       ((__force __u64)(__be64)(val))
 #else
-#define cpu_to_be16(val)       (__swab16((__uint16_t)(val)))
-#define cpu_to_be32(val)       (__swab32((__uint32_t)(val)))
-#define cpu_to_be64(val)       (__swab64((__uint64_t)(val)))
-#define be16_to_cpu(val)       (__swab16((__be16)(val)))
-#define be32_to_cpu(val)       (__swab32((__be32)(val)))
-#define be64_to_cpu(val)       (__swab64((__be64)(val)))
+#define cpu_to_be16(val)       ((__force __be16)__swab16((__u16)(val)))
+#define cpu_to_be32(val)       ((__force __be32)__swab32((__u32)(val)))
+#define cpu_to_be64(val)       ((__force __be64)__swab64((__u64)(val)))
+#define be16_to_cpu(val)       (__swab16((__force __u16)(__be16)(val)))
+#define be32_to_cpu(val)       (__swab32((__force __u32)(__be32)(val)))
+#define be64_to_cpu(val)       (__swab64((__force __u64)(__be64)(val)))
 #endif
 
+static inline void be16_add_cpu(__be16 *a, __s16 b)
+{
+       *a = cpu_to_be16(be16_to_cpu(*a) + b);
+}
+
+static inline void be32_add_cpu(__be32 *a, __s32 b)
+{
+       *a = cpu_to_be32(be32_to_cpu(*a) + b);
+}
+
+static inline void be64_add_cpu(__be64 *a, __s64 b)
+{
+       *a = cpu_to_be64(be64_to_cpu(*a) + b);
+}
+
 #endif /* __KERNEL__ */
 
 /* do we need conversion? */
index 8e0e463dae2d7c0f55268c46683110a01a42405b..bca7b243c31979ff488de535c38f262fa5739ccb 100644 (file)
@@ -61,8 +61,7 @@ static inline int xfs_highbit64(__uint64_t v)
 /* Get low bit set out of 32-bit argument, -1 if none set */
 static inline int xfs_lowbit32(__uint32_t v)
 {
-       unsigned long   t = v;
-       return (v) ? find_first_bit(&t, 32) : -1;
+       return ffs(v) - 1;
 }
 
 /* Get low bit set out of 64-bit argument, -1 if none set */
index a1aab9275d5ac727700a00070b89c88e1acb8e42..138308e70d14c1943540e98cb6d133be6cc5dadb 100644 (file)
@@ -393,8 +393,8 @@ xfs_bmap_count_leaves(
 
 STATIC void
 xfs_bmap_disk_count_leaves(
-       xfs_extnum_t            idx,
-       xfs_bmbt_block_t        *block,
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *block,
        int                     numrecs,
        int                     *count);
 
@@ -402,6 +402,53 @@ xfs_bmap_disk_count_leaves(
  * Bmap internal routines.
  */
 
+STATIC int                             /* error */
+xfs_bmbt_lookup_eq(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.b.br_startoff = off;
+       cur->bc_rec.b.br_startblock = bno;
+       cur->bc_rec.b.br_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int                             /* error */
+xfs_bmbt_lookup_ge(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.b.br_startoff = off;
+       cur->bc_rec.b.br_startblock = bno;
+       cur->bc_rec.b.br_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+* Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       xfs_exntst_t            state)
+{
+       union xfs_btree_rec     rec;
+
+       xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+       return xfs_btree_update(cur, &rec);
+}
+
 /*
  * Called from xfs_bmap_add_attrfork to handle btree format files.
  */
@@ -422,15 +469,14 @@ xfs_bmap_add_attrfork_btree(
        if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
                *flags |= XFS_ILOG_DBROOT;
        else {
-               cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-                       XFS_DATA_FORK);
+               cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
                cur->bc_private.b.flist = flist;
                cur->bc_private.b.firstblock = *firstblock;
                if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
                        goto error0;
                /* must be at least one entry */
                XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
-               if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
+               if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
                        goto error0;
                if (stat == 0) {
                        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -818,10 +864,10 @@ xfs_bmap_add_extent_delay_real(
                                        RIGHT.br_blockcount, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -931,7 +977,7 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1007,7 +1053,7 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1097,7 +1143,7 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1152,7 +1198,7 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1379,16 +1425,16 @@ xfs_bmap_add_extent_unwritten_real(
                                        RIGHT.br_blockcount, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1428,10 +1474,10 @@ xfs_bmap_add_extent_unwritten_real(
                                        &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1471,10 +1517,10 @@ xfs_bmap_add_extent_unwritten_real(
                                        RIGHT.br_blockcount, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
@@ -1557,7 +1603,7 @@ xfs_bmap_add_extent_unwritten_real(
                                PREV.br_blockcount - new->br_blockcount,
                                oldext)))
                                goto done;
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        if (xfs_bmbt_update(cur, LEFT.br_startoff,
                                LEFT.br_startblock,
@@ -1605,7 +1651,7 @@ xfs_bmap_add_extent_unwritten_real(
                                oldext)))
                                goto done;
                        cur->bc_rec.b = *new;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1647,7 +1693,7 @@ xfs_bmap_add_extent_unwritten_real(
                                PREV.br_blockcount - new->br_blockcount,
                                oldext)))
                                goto done;
-                       if ((error = xfs_bmbt_increment(cur, 0, &i)))
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
                                goto done;
                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
                                new->br_startblock,
@@ -1695,7 +1741,7 @@ xfs_bmap_add_extent_unwritten_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1743,7 +1789,7 @@ xfs_bmap_add_extent_unwritten_real(
                        cur->bc_rec.b = PREV;
                        cur->bc_rec.b.br_blockcount =
                                new->br_startoff - PREV.br_startoff;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        /*
@@ -1758,7 +1804,7 @@ xfs_bmap_add_extent_unwritten_real(
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        /* new middle extent - newext */
                        cur->bc_rec.b.br_state = new->br_state;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -2106,10 +2152,10 @@ xfs_bmap_add_extent_hole_real(
                                        right.br_blockcount, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, left.br_startoff,
@@ -2218,7 +2264,7 @@ xfs_bmap_add_extent_hole_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = new->br_state;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -2996,24 +3042,24 @@ xfs_bmap_btree_to_extents(
        int                     whichfork)  /* data or attr fork */
 {
        /* REFERENCED */
-       xfs_bmbt_block_t        *cblock;/* child btree block */
+       struct xfs_btree_block  *cblock;/* child btree block */
        xfs_fsblock_t           cbno;   /* child block number */
        xfs_buf_t               *cbp;   /* child block's buffer */
        int                     error;  /* error return value */
        xfs_ifork_t             *ifp;   /* inode fork data */
        xfs_mount_t             *mp;    /* mount point structure */
        __be64                  *pp;    /* ptr to block address */
-       xfs_bmbt_block_t        *rblock;/* root btree block */
+       struct xfs_btree_block  *rblock;/* root btree block */
 
+       mp = ip->i_mount;
        ifp = XFS_IFORK_PTR(ip, whichfork);
        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
        rblock = ifp->if_broot;
        ASSERT(be16_to_cpu(rblock->bb_level) == 1);
        ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-       ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
-       mp = ip->i_mount;
-       pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
+       ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
        cbno = be64_to_cpu(*pp);
        *logflagsp = 0;
 #ifdef DEBUG
@@ -3023,8 +3069,8 @@ xfs_bmap_btree_to_extents(
        if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
                        XFS_BMAP_BTREE_REF)))
                return error;
-       cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
-       if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
+       cblock = XFS_BUF_TO_BLOCK(cbp);
+       if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
                return error;
        xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
        ip->i_d.di_nblocks--;
@@ -3170,7 +3216,7 @@ xfs_bmap_del_extent(
                        flags |= XFS_ILOG_FEXT(whichfork);
                        break;
                }
-               if ((error = xfs_bmbt_delete(cur, &i)))
+               if ((error = xfs_btree_delete(cur, &i)))
                        goto done;
                XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                break;
@@ -3254,10 +3300,10 @@ xfs_bmap_del_extent(
                                                got.br_startblock, temp,
                                                got.br_state)))
                                        goto done;
-                               if ((error = xfs_bmbt_increment(cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
                                        goto done;
                                cur->bc_rec.b = new;
-                               error = xfs_bmbt_insert(cur, &i);
+                               error = xfs_btree_insert(cur, &i);
                                if (error && error != ENOSPC)
                                        goto done;
                                /*
@@ -3404,11 +3450,11 @@ xfs_bmap_extents_to_btree(
        int                     *logflagsp,     /* inode logging flags */
        int                     whichfork)      /* data or attr fork */
 {
-       xfs_bmbt_block_t        *ablock;        /* allocated (child) bt block */
+       struct xfs_btree_block  *ablock;        /* allocated (child) bt block */
        xfs_buf_t               *abp;           /* buffer for ablock */
        xfs_alloc_arg_t         args;           /* allocation arguments */
        xfs_bmbt_rec_t          *arp;           /* child record pointer */
-       xfs_bmbt_block_t        *block;         /* btree root block */
+       struct xfs_btree_block  *block;         /* btree root block */
        xfs_btree_cur_t         *cur;           /* bmap btree cursor */
        xfs_bmbt_rec_host_t     *ep;            /* extent record pointer */
        int                     error;          /* error return value */
@@ -3428,6 +3474,7 @@ xfs_bmap_extents_to_btree(
         */
        xfs_iroot_realloc(ip, 1, whichfork);
        ifp->if_flags |= XFS_IFBROOT;
+
        /*
         * Fill in the root.
         */
@@ -3435,14 +3482,14 @@ xfs_bmap_extents_to_btree(
        block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
        block->bb_level = cpu_to_be16(1);
        block->bb_numrecs = cpu_to_be16(1);
-       block->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-       block->bb_rightsib = cpu_to_be64(NULLDFSBNO);
+       block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+       block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+
        /*
         * Need a cursor.  Can't allocate until bb_level is filled in.
         */
        mp = ip->i_mount;
-       cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-               whichfork);
+       cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
        cur->bc_private.b.firstblock = *firstblock;
        cur->bc_private.b.flist = flist;
        cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
@@ -3489,12 +3536,12 @@ xfs_bmap_extents_to_btree(
        /*
         * Fill in the child block.
         */
-       ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
+       ablock = XFS_BUF_TO_BLOCK(abp);
        ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
        ablock->bb_level = 0;
-       ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-       ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
-       arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+       ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+       ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+       arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        for (cnt = i = 0; i < nextents; i++) {
                ep = xfs_iext_get_ext(ifp, i);
@@ -3505,21 +3552,24 @@ xfs_bmap_extents_to_btree(
                }
        }
        ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
-       ablock->bb_numrecs = cpu_to_be16(cnt);
+       xfs_btree_set_numrecs(ablock, cnt);
+
        /*
         * Fill in the root key and pointer.
         */
-       kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-       arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+       kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+       arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
        kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-       pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+       pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+                                               be16_to_cpu(block->bb_level)));
        *pp = cpu_to_be64(args.fsbno);
+
        /*
         * Do all this logging at the end so that
         * the root is at the right level.
         */
-       xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
-       xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+       xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+       xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
        ASSERT(*curp == NULL);
        *curp = cur;
        *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
@@ -4176,7 +4226,7 @@ xfs_bmap_compute_maxlevels(
                maxleafents = MAXAEXTNUM;
                sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
        }
-       maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
+       maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
        minleafrecs = mp->m_bmap_dmnr[0];
        minnoderecs = mp->m_bmap_dmnr[1];
        maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -4242,9 +4292,15 @@ xfs_bmap_finish(
         * We have a new transaction, so we should return committed=1,
         * even though we're returning an error.
         */
-       if (error) {
+       if (error)
                return error;
-       }
+
+       /*
+        * transaction commit worked ok so we can drop the extra ticket
+        * reference that we gained in xfs_trans_dup()
+        */
+       xfs_log_ticket_put(ntp->t_ticket);
+
        if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
                        logcount)))
                return error;
@@ -4474,6 +4530,22 @@ xfs_bmap_one_block(
        return rval;
 }
 
+STATIC int
+xfs_bmap_sanity_check(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp,
+       int                     level)
+{
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+
+       if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC ||
+           be16_to_cpu(block->bb_level) != level ||
+           be16_to_cpu(block->bb_numrecs) == 0 ||
+           be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+               return 0;
+       return 1;
+}
+
 /*
  * Read in the extents to if_extents.
  * All inode fields are set up by caller, we just traverse the btree
@@ -4486,7 +4558,7 @@ xfs_bmap_read_extents(
        xfs_inode_t             *ip,    /* incore inode */
        int                     whichfork) /* data or attr fork */
 {
-       xfs_bmbt_block_t        *block; /* current btree block */
+       struct xfs_btree_block  *block; /* current btree block */
        xfs_fsblock_t           bno;    /* block # of "block" */
        xfs_buf_t               *bp;    /* buffer for "block" */
        int                     error;  /* error return value */
@@ -4510,7 +4582,7 @@ xfs_bmap_read_extents(
         */
        level = be16_to_cpu(block->bb_level);
        ASSERT(level > 0);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
        bno = be64_to_cpu(*pp);
        ASSERT(bno != NULLDFSBNO);
        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -4523,13 +4595,13 @@ xfs_bmap_read_extents(
                if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
                                XFS_BMAP_BTREE_REF)))
                        return error;
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                XFS_WANT_CORRUPTED_GOTO(
-                       XFS_BMAP_SANITY_CHECK(mp, block, level),
+                       xfs_bmap_sanity_check(mp, bp, level),
                        error0);
                if (level == 0)
                        break;
-               pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
                bno = be64_to_cpu(*pp);
                XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
                xfs_trans_brelse(tp, bp);
@@ -4549,7 +4621,7 @@ xfs_bmap_read_extents(
                xfs_extnum_t    start;
 
 
-               num_recs = be16_to_cpu(block->bb_numrecs);
+               num_recs = xfs_btree_get_numrecs(block);
                if (unlikely(i + num_recs > room)) {
                        ASSERT(i + num_recs <= room);
                        xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
@@ -4561,18 +4633,18 @@ xfs_bmap_read_extents(
                        goto error0;
                }
                XFS_WANT_CORRUPTED_GOTO(
-                       XFS_BMAP_SANITY_CHECK(mp, block, 0),
+                       xfs_bmap_sanity_check(mp, bp, 0),
                        error0);
                /*
                 * Read-ahead the next leaf block, if any.
                 */
-               nextbno = be64_to_cpu(block->bb_rightsib);
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                if (nextbno != NULLFSBLOCK)
                        xfs_btree_reada_bufl(mp, nextbno, 1);
                /*
                 * Copy records into the extent records.
                 */
-               frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+               frp = XFS_BMBT_REC_ADDR(mp, block, 1);
                start = i;
                for (j = 0; j < num_recs; j++, i++, frp++) {
                        xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
@@ -4603,7 +4675,7 @@ xfs_bmap_read_extents(
                if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
                                XFS_BMAP_BTREE_REF)))
                        return error;
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
        }
        ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
        ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
@@ -5029,8 +5101,7 @@ xfs_bmapi(
                                if (abno == NULLFSBLOCK)
                                        break;
                                if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-                                       cur = xfs_btree_init_cursor(mp,
-                                               tp, NULL, 0, XFS_BTNUM_BMAP,
+                                       cur = xfs_bmbt_init_cursor(mp, tp,
                                                ip, whichfork);
                                        cur->bc_private.b.firstblock =
                                                *firstblock;
@@ -5147,9 +5218,8 @@ xfs_bmapi(
                         */
                        ASSERT(mval->br_blockcount <= len);
                        if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-                               cur = xfs_btree_init_cursor(mp,
-                                       tp, NULL, 0, XFS_BTNUM_BMAP,
-                                       ip, whichfork);
+                               cur = xfs_bmbt_init_cursor(mp,
+                                       tp, ip, whichfork);
                                cur->bc_private.b.firstblock =
                                        *firstblock;
                                cur->bc_private.b.flist = flist;
@@ -5440,8 +5510,7 @@ xfs_bunmapi(
        logflags = 0;
        if (ifp->if_flags & XFS_IFBROOT) {
                ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-               cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-                       whichfork);
+               cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
                cur->bc_private.b.firstblock = *firstblock;
                cur->bc_private.b.flist = flist;
                cur->bc_private.b.flags = 0;
@@ -5742,14 +5811,17 @@ error0:
 STATIC int
 xfs_getbmapx_fix_eof_hole(
        xfs_inode_t             *ip,            /* xfs incore inode pointer */
-       struct getbmap          *out,           /* output structure */
+       struct getbmapx         *out,           /* output structure */
        int                     prealloced,     /* this is a file with
-                                               * preallocated data space */
+                                                * preallocated data space */
        __int64_t               end,            /* last block requested */
        xfs_fsblock_t           startblock)
 {
        __int64_t               fixlen;
        xfs_mount_t             *mp;            /* file system mount point */
+       xfs_ifork_t             *ifp;           /* inode fork pointer */
+       xfs_extnum_t            lastx;          /* last extent pointer */
+       xfs_fileoff_t           fileblock;
 
        if (startblock == HOLESTARTBLOCK) {
                mp = ip->i_mount;
@@ -5763,21 +5835,33 @@ xfs_getbmapx_fix_eof_hole(
                        out->bmv_length = fixlen;
                }
        } else {
-               out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+               if (startblock == DELAYSTARTBLOCK)
+                       out->bmv_block = -2;
+               else
+                       out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+               fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+               ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+               if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+                  (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+                       out->bmv_oflags |= BMV_OF_LAST;
        }
 
        return 1;
 }
 
 /*
- * Fcntl interface to xfs_bmapi.
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
  */
 int                                            /* error code */
 xfs_getbmap(
        xfs_inode_t             *ip,
-       struct getbmap          *bmv,           /* user bmap structure */
-       void                    __user *ap,     /* pointer to user's array */
-       int                     interface)      /* interface flags */
+       struct getbmapx         *bmv,           /* user bmap structure */
+       xfs_bmap_format_t       formatter,      /* format to user */
+       void                    *arg)           /* formatter arg */
 {
        __int64_t               bmvend;         /* last block requested */
        int                     error;          /* return value */
@@ -5790,19 +5874,17 @@ xfs_getbmap(
        int                     nexleft;        /* # of user extents left */
        int                     subnex;         /* # of bmapi's can do */
        int                     nmap;           /* number of map entries */
-       struct getbmap          out;            /* output structure */
+       struct getbmapx         out;            /* output structure */
        int                     whichfork;      /* data or attr fork */
        int                     prealloced;     /* this is a file with
                                                 * preallocated data space */
-       int                     sh_unwritten;   /* true, if unwritten */
-                                               /* extents listed separately */
+       int                     iflags;         /* interface flags */
        int                     bmapi_flags;    /* flags for xfs_bmapi */
-       __int32_t               oflags;         /* getbmapx bmv_oflags field */
 
        mp = ip->i_mount;
+       iflags = bmv->bmv_iflags;
 
-       whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
-       sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
+       whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
        /*      If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
         *      generate a DMAPI read event.  Otherwise, if the DM_EVENT_READ
@@ -5817,7 +5899,7 @@ xfs_getbmap(
         *      could misinterpret holes in a DMAPI file as true holes,
         *      when in fact they may represent offline user data.
         */
-       if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
+       if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
            DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
            whichfork == XFS_DATA_FORK) {
                error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
@@ -5873,8 +5955,9 @@ xfs_getbmap(
 
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-       if (whichfork == XFS_DATA_FORK &&
-               (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
+       if (((iflags & BMV_IF_DELALLOC) == 0) &&
+           (whichfork == XFS_DATA_FORK) &&
+           (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
                /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
                error = xfs_flush_pages(ip, (xfs_off_t)0,
                                               -1, 0, FI_REMAPF);
@@ -5884,7 +5967,8 @@ xfs_getbmap(
                }
        }
 
-       ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
+       ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
+              ip->i_delayed_blks == 0);
 
        lock = xfs_ilock_map_shared(ip);
 
@@ -5896,7 +5980,7 @@ xfs_getbmap(
                nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
 
        bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
-                       ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE);
+                       ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
 
        /*
         * Allocate enough space to handle "subnex" maps at a time.
@@ -5906,9 +5990,12 @@ xfs_getbmap(
 
        bmv->bmv_entries = 0;
 
-       if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) {
-               error = 0;
-               goto unlock_and_return;
+       if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
+               if (((iflags & BMV_IF_DELALLOC) == 0) ||
+                   whichfork == XFS_ATTR_FORK) {
+                       error = 0;
+                       goto unlock_and_return;
+               }
        }
 
        nexleft = nex;
@@ -5924,52 +6011,40 @@ xfs_getbmap(
                ASSERT(nmap <= subnex);
 
                for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
-                       nexleft--;
-                       oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ?
-                                       BMV_OF_PREALLOC : 0;
+                       out.bmv_oflags = 0;
+                       if (map[i].br_state == XFS_EXT_UNWRITTEN)
+                               out.bmv_oflags |= BMV_OF_PREALLOC;
+                       else if (map[i].br_startblock == DELAYSTARTBLOCK)
+                               out.bmv_oflags |= BMV_OF_DELALLOC;
                        out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
                        out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
-                       ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+                       out.bmv_unused1 = out.bmv_unused2 = 0;
+                       ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
+                             (map[i].br_startblock != DELAYSTARTBLOCK));
                         if (map[i].br_startblock == HOLESTARTBLOCK &&
                            whichfork == XFS_ATTR_FORK) {
                                /* came to the end of attribute fork */
+                               out.bmv_oflags |= BMV_OF_LAST;
                                goto unlock_and_return;
                        } else {
+                               int full = 0;   /* user array is full */
+
                                if (!xfs_getbmapx_fix_eof_hole(ip, &out,
                                                        prealloced, bmvend,
                                                        map[i].br_startblock)) {
                                        goto unlock_and_return;
                                }
 
-                               /* return either getbmap/getbmapx structure. */
-                               if (interface & BMV_IF_EXTENDED) {
-                                       struct  getbmapx        outx;
-
-                                       GETBMAP_CONVERT(out,outx);
-                                       outx.bmv_oflags = oflags;
-                                       outx.bmv_unused1 = outx.bmv_unused2 = 0;
-                                       if (copy_to_user(ap, &outx,
-                                                       sizeof(outx))) {
-                                               error = XFS_ERROR(EFAULT);
-                                               goto unlock_and_return;
-                                       }
-                               } else {
-                                       if (copy_to_user(ap, &out,
-                                                       sizeof(out))) {
-                                               error = XFS_ERROR(EFAULT);
-                                               goto unlock_and_return;
-                                       }
-                               }
+                               /* format results & advance arg */
+                               error = formatter(&arg, &out, &full);
+                               if (error || full)
+                                       goto unlock_and_return;
+                               nexleft--;
                                bmv->bmv_offset =
                                        out.bmv_offset + out.bmv_length;
                                bmv->bmv_length = MAX((__int64_t)0,
                                        (__int64_t)(bmvend - bmv->bmv_offset));
                                bmv->bmv_entries++;
-                               ap = (interface & BMV_IF_EXTENDED) ?
-                                               (void __user *)
-                                       ((struct getbmapx __user *)ap + 1) :
-                                               (void __user *)
-                                       ((struct getbmap __user *)ap + 1);
                        }
                }
        } while (nmap && nexleft && bmv->bmv_length);
@@ -6131,7 +6206,7 @@ xfs_bmap_get_bp(
 
 void
 xfs_check_block(
-       xfs_bmbt_block_t        *block,
+       struct xfs_btree_block  *block,
        xfs_mount_t             *mp,
        int                     root,
        short                   sz)
@@ -6143,36 +6218,29 @@ xfs_check_block(
        ASSERT(be16_to_cpu(block->bb_level) > 0);
 
        prevp = NULL;
-       for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) {
+       for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
                dmxr = mp->m_bmap_dmxr[0];
-
-               if (root) {
-                       keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
-               } else {
-                       keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i);
-               }
+               keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
 
                if (prevp) {
-                       xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp);
+                       ASSERT(be64_to_cpu(prevp->br_startoff) <
+                              be64_to_cpu(keyp->br_startoff));
                }
                prevp = keyp;
 
                /*
                 * Compare the block numbers to see if there are dups.
                 */
+               if (root)
+                       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+               else
+                       pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
 
-               if (root) {
-                       pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
-               } else {
-                       pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr);
-               }
                for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
-                       if (root) {
-                               thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz);
-                       } else {
-                               thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j,
-                                                           dmxr);
-                       }
+                       if (root)
+                               thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+                       else
+                               thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
                        if (*thispa == *pp) {
                                cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
                                        __func__, j, i,
@@ -6195,7 +6263,7 @@ xfs_bmap_check_leaf_extents(
        xfs_inode_t             *ip,            /* incore inode pointer */
        int                     whichfork)      /* data or attr fork */
 {
-       xfs_bmbt_block_t        *block; /* current btree block */
+       struct xfs_btree_block  *block; /* current btree block */
        xfs_fsblock_t           bno;    /* block # of "block" */
        xfs_buf_t               *bp;    /* buffer for "block" */
        int                     error;  /* error return value */
@@ -6223,7 +6291,7 @@ xfs_bmap_check_leaf_extents(
        level = be16_to_cpu(block->bb_level);
        ASSERT(level > 0);
        xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
        bno = be64_to_cpu(*pp);
 
        ASSERT(bno != NULLDFSBNO);
@@ -6245,9 +6313,9 @@ xfs_bmap_check_leaf_extents(
                if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
                                XFS_BMAP_BTREE_REF)))
                        goto error_norelse;
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                XFS_WANT_CORRUPTED_GOTO(
-                       XFS_BMAP_SANITY_CHECK(mp, block, level),
+                       xfs_bmap_sanity_check(mp, bp, level),
                        error0);
                if (level == 0)
                        break;
@@ -6258,7 +6326,7 @@ xfs_bmap_check_leaf_extents(
                 */
 
                xfs_check_block(block, mp, 0, 0);
-               pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
                bno = be64_to_cpu(*pp);
                XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
                if (bp_release) {
@@ -6280,13 +6348,13 @@ xfs_bmap_check_leaf_extents(
                xfs_extnum_t    num_recs;
 
 
-               num_recs = be16_to_cpu(block->bb_numrecs);
+               num_recs = xfs_btree_get_numrecs(block);
 
                /*
                 * Read-ahead the next leaf block, if any.
                 */
 
-               nextbno = be64_to_cpu(block->bb_rightsib);
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
                /*
                 * Check all the extents to make sure they are OK.
@@ -6294,13 +6362,17 @@ xfs_bmap_check_leaf_extents(
                 * conform with the first entry in this one.
                 */
 
-               ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+               ep = XFS_BMBT_REC_ADDR(mp, block, 1);
                if (i) {
-                       xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep);
+                       ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+                              xfs_bmbt_disk_get_blockcount(&last) <=
+                              xfs_bmbt_disk_get_startoff(ep));
                }
                for (j = 1; j < num_recs; j++) {
-                       nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1);
-                       xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp);
+                       nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+                       ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+                              xfs_bmbt_disk_get_blockcount(ep) <=
+                              xfs_bmbt_disk_get_startoff(nextp));
                        ep = nextp;
                }
 
@@ -6326,7 +6398,7 @@ xfs_bmap_check_leaf_extents(
                if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
                                XFS_BMAP_BTREE_REF)))
                        goto error_norelse;
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
        }
        if (bp_release) {
                bp_release = 0;
@@ -6356,7 +6428,7 @@ xfs_bmap_count_blocks(
        int                     whichfork,      /* data or attr fork */
        int                     *count)         /* out: count of blocks */
 {
-       xfs_bmbt_block_t        *block; /* current btree block */
+       struct xfs_btree_block  *block; /* current btree block */
        xfs_fsblock_t           bno;    /* block # of "block" */
        xfs_ifork_t             *ifp;   /* fork structure */
        int                     level;  /* btree level, for checking */
@@ -6379,7 +6451,7 @@ xfs_bmap_count_blocks(
        block = ifp->if_broot;
        level = be16_to_cpu(block->bb_level);
        ASSERT(level > 0);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
        bno = be64_to_cpu(*pp);
        ASSERT(bno != NULLDFSBNO);
        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -6413,29 +6485,29 @@ xfs_bmap_count_tree(
        __be64                  *pp;
        xfs_fsblock_t           bno = blockno;
        xfs_fsblock_t           nextbno;
-       xfs_bmbt_block_t        *block, *nextblock;
+       struct xfs_btree_block  *block, *nextblock;
        int                     numrecs;
 
        if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
                return error;
        *count += 1;
-       block = XFS_BUF_TO_BMBT_BLOCK(bp);
+       block = XFS_BUF_TO_BLOCK(bp);
 
        if (--level) {
                /* Not at node above leafs, count this level of nodes */
-               nextbno = be64_to_cpu(block->bb_rightsib);
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                while (nextbno != NULLFSBLOCK) {
                        if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
                                0, &nbp, XFS_BMAP_BTREE_REF)))
                                return error;
                        *count += 1;
-                       nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
-                       nextbno = be64_to_cpu(nextblock->bb_rightsib);
+                       nextblock = XFS_BUF_TO_BLOCK(nbp);
+                       nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
                        xfs_trans_brelse(tp, nbp);
                }
 
                /* Dive to the next level */
-               pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
                bno = be64_to_cpu(*pp);
                if (unlikely((error =
                     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
@@ -6448,9 +6520,9 @@ xfs_bmap_count_tree(
        } else {
                /* count all level 1 nodes and their leaves */
                for (;;) {
-                       nextbno = be64_to_cpu(block->bb_rightsib);
+                       nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                        numrecs = be16_to_cpu(block->bb_numrecs);
-                       xfs_bmap_disk_count_leaves(0, block, numrecs, count);
+                       xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
                        xfs_trans_brelse(tp, bp);
                        if (nextbno == NULLFSBLOCK)
                                break;
@@ -6459,7 +6531,7 @@ xfs_bmap_count_tree(
                                XFS_BMAP_BTREE_REF)))
                                return error;
                        *count += 1;
-                       block = XFS_BUF_TO_BMBT_BLOCK(bp);
+                       block = XFS_BUF_TO_BLOCK(bp);
                }
        }
        return 0;
@@ -6489,8 +6561,8 @@ xfs_bmap_count_leaves(
  */
 STATIC void
 xfs_bmap_disk_count_leaves(
-       xfs_extnum_t            idx,
-       xfs_bmbt_block_t        *block,
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *block,
        int                     numrecs,
        int                     *count)
 {
@@ -6498,7 +6570,7 @@ xfs_bmap_disk_count_leaves(
        xfs_bmbt_rec_t  *frp;
 
        for (b = 1; b <= numrecs; b++) {
-               frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b);
+               frp = XFS_BMBT_REC_ADDR(mp, block, b);
                *count += xfs_bmbt_disk_get_blockcount(frp);
        }
 }
index 9f3e3a836d153b6bbf09d222588b84ae06c1be34..284571c05ed0c7d7295fd14eb68e8f430f66c26d 100644 (file)
@@ -137,9 +137,7 @@ typedef struct xfs_bmalloca {
        char                    conv;   /* overwriting unwritten extents */
 } xfs_bmalloca_t;
 
-#ifdef __KERNEL__
-
-#if defined(XFS_BMAP_TRACE)
+#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
 /*
  * Trace operations for bmap extent tracing
  */
@@ -163,9 +161,12 @@ xfs_bmap_trace_exlist(
        int                     whichfork);     /* data or attr fork */
 #define        XFS_BMAP_TRACE_EXLIST(ip,c,w)   \
        xfs_bmap_trace_exlist(__func__,ip,c,w)
-#else
+
+#else  /* __KERNEL__ && XFS_BMAP_TRACE */
+
 #define        XFS_BMAP_TRACE_EXLIST(ip,c,w)
-#endif
+
+#endif /* __KERNEL__ && XFS_BMAP_TRACE */
 
 /*
  * Convert inode from non-attributed to attributed.
@@ -205,20 +206,6 @@ xfs_bmap_compute_maxlevels(
        struct xfs_mount        *mp,    /* file system mount structure */
        int                     whichfork);     /* data or attr fork */
 
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller.  Frees all the extents that need freeing, which must be done
- * last due to locking considerations.
- *
- * Return 1 if the given transaction was committed and a new one allocated,
- * and 0 otherwise.
- */
-int                                            /* error */
-xfs_bmap_finish(
-       struct xfs_trans        **tp,           /* transaction pointer addr */
-       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
-       int                     *committed);    /* xact committed or not */
-
 /*
  * Returns the file-relative block number of the first unused block in the file.
  * This is the lowest-address hole if the file has holes, else the first block
@@ -344,14 +331,43 @@ xfs_bunmapi(
        int                     *done);         /* set if not done yet */
 
 /*
- * Fcntl interface to xfs_bmapi.
+ * Check an extent list, which has just been read, for
+ * any bit in the extent flag field.
+ */
+int
+xfs_check_nostate_extents(
+       struct xfs_ifork        *ifp,
+       xfs_extnum_t            idx,
+       xfs_extnum_t            num);
+
+#ifdef __KERNEL__
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.
+ *
+ * Return 1 if the given transaction was committed and a new one allocated,
+ * and 0 otherwise.
+ */
+int                                            /* error */
+xfs_bmap_finish(
+       struct xfs_trans        **tp,           /* transaction pointer addr */
+       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+       int                     *committed);    /* xact committed or not */
+
+/* bmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
  */
 int                                            /* error code */
 xfs_getbmap(
        xfs_inode_t             *ip,
-       struct getbmap          *bmv,           /* user bmap structure */
-       void                    __user *ap,     /* pointer to user's array */
-       int                     iflags);        /* interface flags */
+       struct getbmapx         *bmv,           /* user bmap structure */
+       xfs_bmap_format_t       formatter,      /* format to user */
+       void                    *arg);          /* formatter arg */
 
 /*
  * Check if the endoff is outside the last extent. If so the caller will grow
@@ -374,16 +390,6 @@ xfs_bmap_count_blocks(
        int                     whichfork,
        int                     *count);
 
-/*
- * Check an extent list, which has just been read, for
- * any bit in the extent flag field.
- */
-int
-xfs_check_nostate_extents(
-       struct xfs_ifork        *ifp,
-       xfs_extnum_t            idx,
-       xfs_extnum_t            num);
-
 /*
  * Search the extent records for the entry containing block bno.
  * If bno lies in a hole, point to the next entry.  If bno lies
index 23efad29a5cd680b1229ae32a3e56fa9b78d50a8..8f1ec73725d386ca433a44beea72f25ead80d220 100644 (file)
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
 
-#if defined(XFS_BMBT_TRACE)
-ktrace_t       *xfs_bmbt_trace_buf;
-#endif
-
-/*
- * Prototypes for internal btree functions.
- */
-
-
-STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
-STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
-               __uint64_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
-
-
-#if defined(XFS_BMBT_TRACE)
-
-static char    ARGS[] = "args";
-static char    ENTRY[] = "entry";
-static char    ERROR[] = "error";
-#undef EXIT
-static char    EXIT[] = "exit";
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-STATIC void
-xfs_bmbt_trace_enter(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       char            *s,
-       int             type,
-       int             line,
-       __psunsigned_t  a0,
-       __psunsigned_t  a1,
-       __psunsigned_t  a2,
-       __psunsigned_t  a3,
-       __psunsigned_t  a4,
-       __psunsigned_t  a5,
-       __psunsigned_t  a6,
-       __psunsigned_t  a7,
-       __psunsigned_t  a8,
-       __psunsigned_t  a9,
-       __psunsigned_t  a10)
-{
-       xfs_inode_t     *ip;
-       int             whichfork;
-
-       ip = cur->bc_private.b.ip;
-       whichfork = cur->bc_private.b.whichfork;
-       ktrace_enter(xfs_bmbt_trace_buf,
-               (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
-               (void *)func, (void *)s, (void *)ip, (void *)cur,
-               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
-               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
-               (void *)a8, (void *)a9, (void *)a10);
-       ASSERT(ip->i_btrace);
-       ktrace_enter(ip->i_btrace,
-               (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
-               (void *)func, (void *)s, (void *)ip, (void *)cur,
-               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
-               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
-               (void *)a8, (void *)a9, (void *)a10);
-}
-/*
- * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argbi(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       xfs_buf_t       *b,
-       int             i,
-       int             line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
-               (__psunsigned_t)b, i, 0, 0,
-               0, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
- */
-STATIC void
-xfs_bmbt_trace_argbii(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       xfs_buf_t       *b,
-       int             i0,
-       int             i1,
-       int             line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
-               (__psunsigned_t)b, i0, i1, 0,
-               0, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for 3 block-length args
- * and an integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argfffi(
-       const char              *func,
-       xfs_btree_cur_t         *cur,
-       xfs_dfiloff_t           o,
-       xfs_dfsbno_t            b,
-       xfs_dfilblks_t          i,
-       int                     j,
-       int                     line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
-               o >> 32, (int)o, b >> 32, (int)b,
-               i >> 32, (int)i, (int)j, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for one integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argi(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       int             i,
-       int             line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
-               i, 0, 0, 0,
-               0, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, fsblock, key.
- */
-STATIC void
-xfs_bmbt_trace_argifk(
-       const char              *func,
-       xfs_btree_cur_t         *cur,
-       int                     i,
-       xfs_fsblock_t           f,
-       xfs_dfiloff_t           o,
-       int                     line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
-               i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
-               (int)o, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, fsblock, rec.
- */
-STATIC void
-xfs_bmbt_trace_argifr(
-       const char              *func,
-       xfs_btree_cur_t         *cur,
-       int                     i,
-       xfs_fsblock_t           f,
-       xfs_bmbt_rec_t          *r,
-       int                     line)
-{
-       xfs_dfsbno_t            b;
-       xfs_dfilblks_t          c;
-       xfs_dfsbno_t            d;
-       xfs_dfiloff_t           o;
-       xfs_bmbt_irec_t         s;
-
-       d = (xfs_dfsbno_t)f;
-       xfs_bmbt_disk_get_all(r, &s);
-       o = (xfs_dfiloff_t)s.br_startoff;
-       b = (xfs_dfsbno_t)s.br_startblock;
-       c = s.br_blockcount;
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
-               i, d >> 32, (int)d, o >> 32,
-               (int)o, b >> 32, (int)b, c >> 32,
-               (int)c, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, key.
- */
-STATIC void
-xfs_bmbt_trace_argik(
-       const char              *func,
-       xfs_btree_cur_t         *cur,
-       int                     i,
-       xfs_bmbt_key_t          *k,
-       int                     line)
-{
-       xfs_dfiloff_t           o;
-
-       o = be64_to_cpu(k->br_startoff);
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
-               i, o >> 32, (int)o, 0,
-               0, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for the cursor/operation.
- */
-STATIC void
-xfs_bmbt_trace_cursor(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       char            *s,
-       int             line)
-{
-       xfs_bmbt_rec_host_t     r;
-
-       xfs_bmbt_set_all(&r, &cur->bc_rec.b);
-       xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
-               (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
-               cur->bc_private.b.allocated,
-               r.l0 >> 32, (int)r.l0,
-               r.l1 >> 32, (int)r.l1,
-               (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
-               (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
-               (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
-               (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
-}
-
-#define        XFS_BMBT_TRACE_ARGBI(c,b,i)     \
-       xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
-#define        XFS_BMBT_TRACE_ARGBII(c,b,i,j)  \
-       xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
-#define        XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)       \
-       xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
-#define        XFS_BMBT_TRACE_ARGI(c,i)        \
-       xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
-#define        XFS_BMBT_TRACE_ARGIFK(c,i,f,s)  \
-       xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
-#define        XFS_BMBT_TRACE_ARGIFR(c,i,f,r)  \
-       xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
-#define        XFS_BMBT_TRACE_ARGIK(c,i,k)     \
-       xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
-#define        XFS_BMBT_TRACE_CURSOR(c,s)      \
-       xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
-#else
-#define        XFS_BMBT_TRACE_ARGBI(c,b,i)
-#define        XFS_BMBT_TRACE_ARGBII(c,b,i,j)
-#define        XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
-#define        XFS_BMBT_TRACE_ARGI(c,i)
-#define        XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
-#define        XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
-#define        XFS_BMBT_TRACE_ARGIK(c,i,k)
-#define        XFS_BMBT_TRACE_CURSOR(c,s)
-#endif /* XFS_BMBT_TRACE */
-
-
-/*
- * Internal functions.
- */
-
-/*
- * Delete record pointed to by cur/level.
- */
-STATIC int                                     /* error */
-xfs_bmbt_delrec(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       xfs_bmbt_block_t        *block;         /* bmap btree block */
-       xfs_fsblock_t           bno;            /* fs-relative block number */
-       xfs_buf_t               *bp;            /* buffer for block */
-       int                     error;          /* error return value */
-       int                     i;              /* loop counter */
-       int                     j;              /* temp state */
-       xfs_bmbt_key_t          key;            /* bmap btree key */
-       xfs_bmbt_key_t          *kp=NULL;       /* pointer to bmap btree key */
-       xfs_fsblock_t           lbno;           /* left sibling block number */
-       xfs_buf_t               *lbp;           /* left buffer pointer */
-       xfs_bmbt_block_t        *left;          /* left btree block */
-       xfs_bmbt_key_t          *lkp;           /* left btree key */
-       xfs_bmbt_ptr_t          *lpp;           /* left address pointer */
-       int                     lrecs=0;        /* left record count */
-       xfs_bmbt_rec_t          *lrp;           /* left record pointer */
-       xfs_mount_t             *mp;            /* file system mount point */
-       xfs_bmbt_ptr_t          *pp;            /* pointer to bmap block addr */
-       int                     ptr;            /* key/record index */
-       xfs_fsblock_t           rbno;           /* right sibling block number */
-       xfs_buf_t               *rbp;           /* right buffer pointer */
-       xfs_bmbt_block_t        *right;         /* right btree block */
-       xfs_bmbt_key_t          *rkp;           /* right btree key */
-       xfs_bmbt_rec_t          *rp;            /* pointer to bmap btree rec */
-       xfs_bmbt_ptr_t          *rpp;           /* right address pointer */
-       xfs_bmbt_block_t        *rrblock;       /* right-right btree block */
-       xfs_buf_t               *rrbp;          /* right-right buffer pointer */
-       int                     rrecs=0;        /* right record count */
-       xfs_bmbt_rec_t          *rrp;           /* right record pointer */
-       xfs_btree_cur_t         *tcur;          /* temporary btree cursor */
-       int                     numrecs;        /* temporary numrec count */
-       int                     numlrecs, numrrecs;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       ptr = cur->bc_ptrs[level];
-       tcur = NULL;
-       if (ptr == 0) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       block = xfs_bmbt_get_block(cur, level, &bp);
-       numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               goto error0;
-       }
-#endif
-       if (ptr > numrecs) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       XFS_STATS_INC(xs_bmbt_delrec);
-       if (level > 0) {
-               kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-               pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = ptr; i < numrecs; i++) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-               }
-#endif
-               if (ptr < numrecs) {
-                       memmove(&kp[ptr - 1], &kp[ptr],
-                               (numrecs - ptr) * sizeof(*kp));
-                       memmove(&pp[ptr - 1], &pp[ptr],
-                               (numrecs - ptr) * sizeof(*pp));
-                       xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
-                       xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
-               }
-       } else {
-               rp = XFS_BMAP_REC_IADDR(block, 1, cur);
-               if (ptr < numrecs) {
-                       memmove(&rp[ptr - 1], &rp[ptr],
-                               (numrecs - ptr) * sizeof(*rp));
-                       xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
-               }
-               if (ptr == 1) {
-                       key.br_startoff =
-                               cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
-                       kp = &key;
-               }
-       }
-       numrecs--;
-       block->bb_numrecs = cpu_to_be16(numrecs);
-       xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
-       /*
-        * We're at the root level.
-        * First, shrink the root block in-memory.
-        * Try to get rid of the next level down.
-        * If we can't then there's nothing left to do.
-        */
-       if (level == cur->bc_nlevels - 1) {
-               xfs_iroot_realloc(cur->bc_private.b.ip, -1,
-                       cur->bc_private.b.whichfork);
-               if ((error = xfs_bmbt_killroot(cur))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               goto error0;
-       }
-       if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
-               if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       rbno = be64_to_cpu(block->bb_rightsib);
-       lbno = be64_to_cpu(block->bb_leftsib);
-       /*
-        * One child of root, need to get a chance to copy its contents
-        * into the root and delete it. Can't go up to next level,
-        * there's nothing to delete there.
-        */
-       if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
-           level == cur->bc_nlevels - 2) {
-               if ((error = xfs_bmbt_killroot(cur))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
-       if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               goto error0;
-       }
-       bno = NULLFSBLOCK;
-       if (rbno != NULLFSBLOCK) {
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_bmbt_increment(tcur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               rbp = tcur->bc_bufs[level];
-               right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-#endif
-               bno = be64_to_cpu(right->bb_leftsib);
-               if (be16_to_cpu(right->bb_numrecs) - 1 >=
-                   XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
-                       if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_BMAP_BLOCK_IMINRECS(level, tcur));
-                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-                               tcur = NULL;
-                               if (level > 0) {
-                                       if ((error = xfs_bmbt_decrement(cur,
-                                                       level, &i))) {
-                                               XFS_BMBT_TRACE_CURSOR(cur,
-                                                       ERROR);
-                                               goto error0;
-                                       }
-                               }
-                               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if (lbno != NULLFSBLOCK) {
-                       i = xfs_btree_firstrec(tcur, level);
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                       if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               }
-       }
-       if (lbno != NULLFSBLOCK) {
-               i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * decrement to last in block
-                */
-               if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               lbp = tcur->bc_bufs[level];
-               left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-#endif
-               bno = be64_to_cpu(left->bb_rightsib);
-               if (be16_to_cpu(left->bb_numrecs) - 1 >=
-                   XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
-                       if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_BMAP_BLOCK_IMINRECS(level, tcur));
-                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-                               tcur = NULL;
-                               if (level == 0)
-                                       cur->bc_ptrs[0]++;
-                               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               lrecs = be16_to_cpu(left->bb_numrecs);
-       }
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       tcur = NULL;
-       mp = cur->bc_mp;
-       ASSERT(bno != NULLFSBLOCK);
-       if (lbno != NULLFSBLOCK &&
-           lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               rbno = bno;
-               right = block;
-               rbp = bp;
-               if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-               if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-       } else if (rbno != NULLFSBLOCK &&
-                  rrecs + be16_to_cpu(block->bb_numrecs) <=
-                  XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               lbno = bno;
-               left = block;
-               lbp = bp;
-               if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-               if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               lrecs = be16_to_cpu(left->bb_numrecs);
-       } else {
-               if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       numlrecs = be16_to_cpu(left->bb_numrecs);
-       numrrecs = be16_to_cpu(right->bb_numrecs);
-       if (level > 0) {
-               lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
-               lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
-               rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
-               rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < numrrecs; i++) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-               }
-#endif
-               memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
-               memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
-               xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
-               xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
-       } else {
-               lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
-               rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
-               memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
-               xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
-       }
-       be16_add_cpu(&left->bb_numrecs, numrrecs);
-       left->bb_rightsib = right->bb_rightsib;
-       xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
-       if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
-               if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
-                               be64_to_cpu(left->bb_rightsib),
-                               0, &rrbp, XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
-               if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               rrblock->bb_leftsib = cpu_to_be64(lbno);
-               xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
-       }
-       xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
-               cur->bc_private.b.flist, mp);
-       cur->bc_private.b.ip->i_d.di_nblocks--;
-       xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-       XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
-                       XFS_TRANS_DQ_BCOUNT, -1L);
-       xfs_trans_binval(cur->bc_tp, rbp);
-       if (bp != lbp) {
-               cur->bc_bufs[level] = lbp;
-               cur->bc_ptrs[level] += lrecs;
-               cur->bc_ra[level] = 0;
-       } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               goto error0;
-       }
-       if (level > 0)
-               cur->bc_ptrs[level]--;
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 2;
-       return 0;
-
-error0:
-       if (tcur)
-               xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int                                     /* error */
-xfs_bmbt_insrec(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       xfs_fsblock_t           *bnop,
-       xfs_bmbt_rec_t          *recp,
-       xfs_btree_cur_t         **curp,
-       int                     *stat)          /* no-go/done/continue */
-{
-       xfs_bmbt_block_t        *block;         /* bmap btree block */
-       xfs_buf_t               *bp;            /* buffer for block */
-       int                     error;          /* error return value */
-       int                     i;              /* loop index */
-       xfs_bmbt_key_t          key;            /* bmap btree key */
-       xfs_bmbt_key_t          *kp=NULL;       /* pointer to bmap btree key */
-       int                     logflags;       /* inode logging flags */
-       xfs_fsblock_t           nbno;           /* new block number */
-       struct xfs_btree_cur    *ncur;          /* new btree cursor */
-       __uint64_t              startoff;       /* new btree key value */
-       xfs_bmbt_rec_t          nrec;           /* new record count */
-       int                     optr;           /* old key/record index */
-       xfs_bmbt_ptr_t          *pp;            /* pointer to bmap block addr */
-       int                     ptr;            /* key/record index */
-       xfs_bmbt_rec_t          *rp=NULL;       /* pointer to bmap btree rec */
-       int                     numrecs;
-
-       ASSERT(level < cur->bc_nlevels);
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
-       ncur = NULL;
-       key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
-       optr = ptr = cur->bc_ptrs[level];
-       if (ptr == 0) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       XFS_STATS_INC(xs_bmbt_insrec);
-       block = xfs_bmbt_get_block(cur, level, &bp);
-       numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (ptr <= numrecs) {
-               if (level == 0) {
-                       rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
-                       xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
-               } else {
-                       kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
-                       xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
-               }
-       }
-#endif
-       nbno = NULLFSBLOCK;
-       if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
-                       /*
-                        * A root block, that can be made bigger.
-                        */
-                       xfs_iroot_realloc(cur->bc_private.b.ip, 1,
-                               cur->bc_private.b.whichfork);
-                       block = xfs_bmbt_get_block(cur, level, &bp);
-               } else if (level == cur->bc_nlevels - 1) {
-                       if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
-                           *stat == 0) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-                       xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
-                               logflags);
-                       block = xfs_bmbt_get_block(cur, level, &bp);
-               } else {
-                       if ((error = xfs_bmbt_rshift(cur, level, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-                       if (i) {
-                               /* nothing */
-                       } else {
-                               if ((error = xfs_bmbt_lshift(cur, level, &i))) {
-                                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                                       return error;
-                               }
-                               if (i) {
-                                       optr = ptr = cur->bc_ptrs[level];
-                               } else {
-                                       if ((error = xfs_bmbt_split(cur, level,
-                                                       &nbno, &startoff, &ncur,
-                                                       &i))) {
-                                               XFS_BMBT_TRACE_CURSOR(cur,
-                                                       ERROR);
-                                               return error;
-                                       }
-                                       if (i) {
-                                               block = xfs_bmbt_get_block(
-                                                           cur, level, &bp);
-#ifdef DEBUG
-                                               if ((error =
-                                                   xfs_btree_check_lblock(cur,
-                                                           block, level, bp))) {
-                                                       XFS_BMBT_TRACE_CURSOR(
-                                                               cur, ERROR);
-                                                       return error;
-                                               }
-#endif
-                                               ptr = cur->bc_ptrs[level];
-                                               xfs_bmbt_disk_set_allf(&nrec,
-                                                       startoff, 0, 0,
-                                                       XFS_EXT_NORM);
-                                       } else {
-                                               XFS_BMBT_TRACE_CURSOR(cur,
-                                                       EXIT);
-                                               *stat = 0;
-                                               return 0;
-                                       }
-                               }
-                       }
-               }
-       }
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (level > 0) {
-               kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-               pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = numrecs; i >= ptr; i--) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
-                                       level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-               }
-#endif
-               memmove(&kp[ptr], &kp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*kp));
-               memmove(&pp[ptr], &pp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               kp[ptr - 1] = key;
-               pp[ptr - 1] = cpu_to_be64(*bnop);
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
-               xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
-       } else {
-               rp = XFS_BMAP_REC_IADDR(block, 1, cur);
-               memmove(&rp[ptr], &rp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*rp));
-               rp[ptr - 1] = *recp;
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
-       }
-       xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
-#ifdef DEBUG
-       if (ptr < numrecs) {
-               if (level == 0)
-                       xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
-                               rp + ptr);
-               else
-                       xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
-                               kp + ptr);
-       }
-#endif
-       if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       *bnop = nbno;
-       if (nbno != NULLFSBLOCK) {
-               *recp = nrec;
-               *curp = ncur;
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-STATIC int
-xfs_bmbt_killroot(
-       xfs_btree_cur_t         *cur)
-{
-       xfs_bmbt_block_t        *block;
-       xfs_bmbt_block_t        *cblock;
-       xfs_buf_t               *cbp;
-       xfs_bmbt_key_t          *ckp;
-       xfs_bmbt_ptr_t          *cpp;
-#ifdef DEBUG
-       int                     error;
-#endif
-       int                     i;
-       xfs_bmbt_key_t          *kp;
-       xfs_inode_t             *ip;
-       xfs_ifork_t             *ifp;
-       int                     level;
-       xfs_bmbt_ptr_t          *pp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       level = cur->bc_nlevels - 1;
-       ASSERT(level >= 1);
-       /*
-        * Don't deal with the root block needs to be a leaf case.
-        * We're just going to turn the thing back into extents anyway.
-        */
-       if (level == 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               return 0;
-       }
-       block = xfs_bmbt_get_block(cur, level, &cbp);
-       /*
-        * Give up if the root has multiple children.
-        */
-       if (be16_to_cpu(block->bb_numrecs) != 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               return 0;
-       }
-       /*
-        * Only do this if the next level will fit.
-        * Then the data must be copied up to the inode,
-        * instead of freeing the root you free the next level.
-        */
-       cbp = cur->bc_bufs[level - 1];
-       cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
-       if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               return 0;
-       }
-       ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO);
-       ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO);
-       ip = cur->bc_private.b.ip;
-       ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
-       ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
-              XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
-       i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
-       if (i) {
-               xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
-               block = ifp->if_broot;
-       }
-       be16_add_cpu(&block->bb_numrecs, i);
-       ASSERT(block->bb_numrecs == cblock->bb_numrecs);
-       kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-       ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
-       memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp));
-       pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-       cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
-#ifdef DEBUG
-       for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-               if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-       }
-#endif
-       memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp));
-       xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
-                       cur->bc_private.b.flist, cur->bc_mp);
-       ip->i_d.di_nblocks--;
-       XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
-                       XFS_TRANS_DQ_BCOUNT, -1L);
-       xfs_trans_binval(cur->bc_tp, cbp);
-       cur->bc_bufs[level - 1] = NULL;
-       be16_add_cpu(&block->bb_level, -1);
-       xfs_trans_log_inode(cur->bc_tp, ip,
-               XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
-       cur->bc_nlevels--;
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       return 0;
-}
-
-/*
- * Log key values from the btree block.
- */
-STATIC void
-xfs_bmbt_log_keys(
-       xfs_btree_cur_t *cur,
-       xfs_buf_t       *bp,
-       int             kfirst,
-       int             klast)
-{
-       xfs_trans_t     *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
-       tp = cur->bc_tp;
-       if (bp) {
-               xfs_bmbt_block_t        *block;
-               int                     first;
-               xfs_bmbt_key_t          *kp;
-               int                     last;
-
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
-               first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
-               last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
-               xfs_trans_log_buf(tp, bp, first, last);
-       } else {
-               xfs_inode_t              *ip;
-
-               ip = cur->bc_private.b.ip;
-               xfs_trans_log_inode(tp, ip,
-                       XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Log pointer values from the btree block.
- */
-STATIC void
-xfs_bmbt_log_ptrs(
-       xfs_btree_cur_t *cur,
-       xfs_buf_t       *bp,
-       int             pfirst,
-       int             plast)
-{
-       xfs_trans_t     *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
-       tp = cur->bc_tp;
-       if (bp) {
-               xfs_bmbt_block_t        *block;
-               int                     first;
-               int                     last;
-               xfs_bmbt_ptr_t          *pp;
-
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
-               first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
-               last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
-               xfs_trans_log_buf(tp, bp, first, last);
-       } else {
-               xfs_inode_t             *ip;
-
-               ip = cur->bc_private.b.ip;
-               xfs_trans_log_inode(tp, ip,
-                       XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- */
-STATIC int                             /* error */
-xfs_bmbt_lookup(
-       xfs_btree_cur_t         *cur,
-       xfs_lookup_t            dir,
-       int                     *stat)          /* success/failure */
-{
-       xfs_bmbt_block_t        *block=NULL;
-       xfs_buf_t               *bp;
-       xfs_daddr_t             d;
-       xfs_sfiloff_t           diff;
-       int                     error;          /* error return value */
-       xfs_fsblock_t           fsbno=0;
-       int                     high;
-       int                     i;
-       int                     keyno=0;
-       xfs_bmbt_key_t          *kkbase=NULL;
-       xfs_bmbt_key_t          *kkp;
-       xfs_bmbt_rec_t          *krbase=NULL;
-       xfs_bmbt_rec_t          *krp;
-       int                     level;
-       int                     low;
-       xfs_mount_t             *mp;
-       xfs_bmbt_ptr_t          *pp;
-       xfs_bmbt_irec_t         *rp;
-       xfs_fileoff_t           startoff;
-       xfs_trans_t             *tp;
-
-       XFS_STATS_INC(xs_bmbt_lookup);
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, (int)dir);
-       tp = cur->bc_tp;
-       mp = cur->bc_mp;
-       rp = &cur->bc_rec.b;
-       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-               if (level < cur->bc_nlevels - 1) {
-                       d = XFS_FSB_TO_DADDR(mp, fsbno);
-                       bp = cur->bc_bufs[level];
-                       if (bp && XFS_BUF_ADDR(bp) != d)
-                               bp = NULL;
-                       if (!bp) {
-                               if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
-                                               0, &bp, XFS_BMAP_BTREE_REF))) {
-                                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                                       return error;
-                               }
-                               xfs_btree_setbuf(cur, level, bp);
-                               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-                               if ((error = xfs_btree_check_lblock(cur, block,
-                                               level, bp))) {
-                                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                                       return error;
-                               }
-                       } else
-                               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               } else
-                       block = xfs_bmbt_get_block(cur, level, &bp);
-               if (diff == 0)
-                       keyno = 1;
-               else {
-                       if (level > 0)
-                               kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
-                       else
-                               krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
-                       low = 1;
-                       if (!(high = be16_to_cpu(block->bb_numrecs))) {
-                               ASSERT(level == 0);
-                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-                               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-                               *stat = 0;
-                               return 0;
-                       }
-                       while (low <= high) {
-                               XFS_STATS_INC(xs_bmbt_compare);
-                               keyno = (low + high) >> 1;
-                               if (level > 0) {
-                                       kkp = kkbase + keyno - 1;
-                                       startoff = be64_to_cpu(kkp->br_startoff);
-                               } else {
-                                       krp = krbase + keyno - 1;
-                                       startoff = xfs_bmbt_disk_get_startoff(krp);
-                               }
-                               diff = (xfs_sfiloff_t)
-                                               (startoff - rp->br_startoff);
-                               if (diff < 0)
-                                       low = keyno + 1;
-                               else if (diff > 0)
-                                       high = keyno - 1;
-                               else
-                                       break;
-                       }
-               }
-               if (level > 0) {
-                       if (diff > 0 && --keyno < 1)
-                               keyno = 1;
-                       pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
-                       fsbno = be64_to_cpu(*pp);
-#ifdef DEBUG
-                       if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-#endif
-                       cur->bc_ptrs[level] = keyno;
-               }
-       }
-       if (dir != XFS_LOOKUP_LE && diff < 0) {
-               keyno++;
-               /*
-                * If ge search and we went off the end of the block, but it's
-                * not the last block, we're in the wrong block.
-                */
-               if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) &&
-                   be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) {
-                       cur->bc_ptrs[0] = keyno;
-                       if ((error = xfs_bmbt_increment(cur, 0, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-                       XFS_WANT_CORRUPTED_RETURN(i == 1);
-                       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-                       *stat = 1;
-                       return 0;
-               }
-       }
-       else if (dir == XFS_LOOKUP_LE && diff > 0)
-               keyno--;
-       cur->bc_ptrs[0] = keyno;
-       if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-       } else {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
-       }
-       return 0;
-}
-
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                                     /* error */
-xfs_bmbt_lshift(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       int                     error;          /* error return value */
-#ifdef DEBUG
-       int                     i;              /* loop counter */
-#endif
-       xfs_bmbt_key_t          key;            /* bmap btree key */
-       xfs_buf_t               *lbp;           /* left buffer pointer */
-       xfs_bmbt_block_t        *left;          /* left btree block */
-       xfs_bmbt_key_t          *lkp=NULL;      /* left btree key */
-       xfs_bmbt_ptr_t          *lpp;           /* left address pointer */
-       int                     lrecs;          /* left record count */
-       xfs_bmbt_rec_t          *lrp=NULL;      /* left record pointer */
-       xfs_mount_t             *mp;            /* file system mount point */
-       xfs_buf_t               *rbp;           /* right buffer pointer */
-       xfs_bmbt_block_t        *right;         /* right btree block */
-       xfs_bmbt_key_t          *rkp=NULL;      /* right btree key */
-       xfs_bmbt_ptr_t          *rpp=NULL;      /* right address pointer */
-       xfs_bmbt_rec_t          *rrp=NULL;      /* right record pointer */
-       int                     rrecs;          /* right record count */
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       if (level == cur->bc_nlevels - 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       rbp = cur->bc_bufs[level];
-       right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       if (cur->bc_ptrs[level] <= 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       mp = cur->bc_mp;
-       if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0,
-                       &lbp, XFS_BMAP_BTREE_REF))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-       if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       lrecs = be16_to_cpu(left->bb_numrecs) + 1;
-       if (level > 0) {
-               lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
-               rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
-               *lkp = *rkp;
-               xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
-               lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
-               rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               *lpp = *rpp;
-               xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
-       } else {
-               lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
-               rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
-               *lrp = *rrp;
-               xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
-       }
-       left->bb_numrecs = cpu_to_be16(lrecs);
-       xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
-#ifdef DEBUG
-       if (level > 0)
-               xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
-       else
-               xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
-#endif
-       rrecs = be16_to_cpu(right->bb_numrecs) - 1;
-       right->bb_numrecs = cpu_to_be16(rrecs);
-       xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
-       if (level > 0) {
-#ifdef DEBUG
-               for (i = 0; i < rrecs; i++) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
-                                       level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-               }
-#endif
-               memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
-               memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
-               xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
-               xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
-       } else {
-               memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
-               xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
-               key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
-               rkp = &key;
-       }
-       if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       cur->bc_ptrs[level]--;
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                                     /* error */
-xfs_bmbt_rshift(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       int                     error;          /* error return value */
-       int                     i;              /* loop counter */
-       xfs_bmbt_key_t          key;            /* bmap btree key */
-       xfs_buf_t               *lbp;           /* left buffer pointer */
-       xfs_bmbt_block_t        *left;          /* left btree block */
-       xfs_bmbt_key_t          *lkp;           /* left btree key */
-       xfs_bmbt_ptr_t          *lpp;           /* left address pointer */
-       xfs_bmbt_rec_t          *lrp;           /* left record pointer */
-       xfs_mount_t             *mp;            /* file system mount point */
-       xfs_buf_t               *rbp;           /* right buffer pointer */
-       xfs_bmbt_block_t        *right;         /* right btree block */
-       xfs_bmbt_key_t          *rkp;           /* right btree key */
-       xfs_bmbt_ptr_t          *rpp;           /* right address pointer */
-       xfs_bmbt_rec_t          *rrp=NULL;      /* right record pointer */
-       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       if (level == cur->bc_nlevels - 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       lbp = cur->bc_bufs[level];
-       left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       mp = cur->bc_mp;
-       if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0,
-                       &rbp, XFS_BMAP_BTREE_REF))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-       if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       if (level > 0) {
-               lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
-               rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-               }
-#endif
-               memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               *rkp = *lkp;
-               *rpp = *lpp;
-               xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-       } else {
-               lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
-               memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               *rrp = *lrp;
-               xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
-               rkp = &key;
-       }
-       be16_add_cpu(&left->bb_numrecs, -1);
-       xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
-       be16_add_cpu(&right->bb_numrecs, 1);
-#ifdef DEBUG
-       if (level > 0)
-               xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
-       else
-               xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
-#endif
-       xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
-       if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       i = xfs_btree_lastrec(tcur, level);
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       if ((error = xfs_bmbt_increment(tcur, level, &i))) {
-               XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
-               goto error1;
-       }
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
-               XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
-               goto error1;
-       }
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-error0:
-       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-error1:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
-}
-
 /*
  * Determine the extent state.
  */
@@ -1453,229 +60,15 @@ xfs_extent_state(
        return XFS_EXT_NORM;
 }
 
-
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int                                     /* error */
-xfs_bmbt_split(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       xfs_fsblock_t           *bnop,
-       __uint64_t              *startoff,
-       xfs_btree_cur_t         **curp,
-       int                     *stat)          /* success/failure */
-{
-       xfs_alloc_arg_t         args;           /* block allocation args */
-       int                     error;          /* error return value */
-       int                     i;              /* loop counter */
-       xfs_fsblock_t           lbno;           /* left sibling block number */
-       xfs_buf_t               *lbp;           /* left buffer pointer */
-       xfs_bmbt_block_t        *left;          /* left btree block */
-       xfs_bmbt_key_t          *lkp;           /* left btree key */
-       xfs_bmbt_ptr_t          *lpp;           /* left address pointer */
-       xfs_bmbt_rec_t          *lrp;           /* left record pointer */
-       xfs_buf_t               *rbp;           /* right buffer pointer */
-       xfs_bmbt_block_t        *right;         /* right btree block */
-       xfs_bmbt_key_t          *rkp;           /* right btree key */
-       xfs_bmbt_ptr_t          *rpp;           /* right address pointer */
-       xfs_bmbt_block_t        *rrblock;       /* right-right btree block */
-       xfs_buf_t               *rrbp;          /* right-right buffer pointer */
-       xfs_bmbt_rec_t          *rrp;           /* right record pointer */
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
-       args.tp = cur->bc_tp;
-       args.mp = cur->bc_mp;
-       lbp = cur->bc_bufs[level];
-       lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
-       left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-       args.fsbno = cur->bc_private.b.firstblock;
-       args.firstblock = args.fsbno;
-       args.minleft = 0;
-       if (args.fsbno == NULLFSBLOCK) {
-               args.fsbno = lbno;
-               args.type = XFS_ALLOCTYPE_START_BNO;
-               /*
-                * Make sure there is sufficient room left in the AG to
-                * complete a full tree split for an extent insert.  If
-                * we are converting the middle part of an extent then
-                * we may need space for two tree splits.
-                *
-                * We are relying on the caller to make the correct block
-                * reservation for this operation to succeed.  If the
-                * reservation amount is insufficient then we may fail a
-                * block allocation here and corrupt the filesystem.
-                */
-               args.minleft = xfs_trans_get_block_res(args.tp);
-       } else if (cur->bc_private.b.flist->xbf_low)
-               args.type = XFS_ALLOCTYPE_START_BNO;
-       else
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       args.mod = args.alignment = args.total = args.isfl =
-               args.userdata = args.minalignslop = 0;
-       args.minlen = args.maxlen = args.prod = 1;
-       args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
-       if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return XFS_ERROR(ENOSPC);
-       }
-       if ((error = xfs_alloc_vextent(&args))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (args.fsbno == NULLFSBLOCK && args.minleft) {
-               /*
-                * Could not find an AG with enough free space to satisfy
-                * a full btree split.  Try again without minleft and if
-                * successful activate the lowspace algorithm.
-                */
-               args.fsbno = 0;
-               args.type = XFS_ALLOCTYPE_FIRST_AG;
-               args.minleft = 0;
-               if ((error = xfs_alloc_vextent(&args))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               cur->bc_private.b.flist->xbf_low = 1;
-       }
-       if (args.fsbno == NULLFSBLOCK) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       cur->bc_private.b.firstblock = args.fsbno;
-       cur->bc_private.b.allocated++;
-       cur->bc_private.b.ip->i_d.di_nblocks++;
-       xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-       XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
-                       XFS_TRANS_DQ_BCOUNT, 1L);
-       rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
-       right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
-       right->bb_level = left->bb_level;
-       right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
-       if ((be16_to_cpu(left->bb_numrecs) & 1) &&
-           cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-               be16_add_cpu(&right->bb_numrecs, 1);
-       i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
-       if (level > 0) {
-               lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
-               lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
-               rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
-               rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-               }
-#endif
-               memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               *startoff = be64_to_cpu(rkp->br_startoff);
-       } else {
-               lrp = XFS_BMAP_REC_IADDR(left, i, cur);
-               rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
-               memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               *startoff = xfs_bmbt_disk_get_startoff(rrp);
-       }
-       be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
-       right->bb_rightsib = left->bb_rightsib;
-       left->bb_rightsib = cpu_to_be64(args.fsbno);
-       right->bb_leftsib = cpu_to_be64(lbno);
-       xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
-       xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) {
-               if ((error = xfs_btree_read_bufl(args.mp, args.tp,
-                               be64_to_cpu(right->bb_rightsib), 0, &rrbp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
-               if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               rrblock->bb_leftsib = cpu_to_be64(args.fsbno);
-               xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
-       }
-       if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
-               xfs_btree_setbuf(cur, level, rbp);
-               cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
-       }
-       if (level + 1 < cur->bc_nlevels) {
-               if ((error = xfs_btree_dup_cursor(cur, curp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               (*curp)->bc_ptrs[level + 1]++;
-       }
-       *bnop = args.fsbno;
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-
-/*
- * Update keys for the record.
- */
-STATIC int
-xfs_bmbt_updkey(
-       xfs_btree_cur_t         *cur,
-       xfs_bmbt_key_t          *keyp,  /* on-disk format */
-       int                     level)
-{
-       xfs_bmbt_block_t        *block;
-       xfs_buf_t               *bp;
-#ifdef DEBUG
-       int                     error;
-#endif
-       xfs_bmbt_key_t          *kp;
-       int                     ptr;
-
-       ASSERT(level >= 1);
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
-       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-               block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               ptr = cur->bc_ptrs[level];
-               kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
-               *kp = *keyp;
-               xfs_bmbt_log_keys(cur, bp, ptr, ptr);
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       return 0;
-}
-
 /*
  * Convert on-disk form of btree root to in-memory form.
  */
 void
 xfs_bmdr_to_bmbt(
+       struct xfs_mount        *mp,
        xfs_bmdr_block_t        *dblock,
        int                     dblocklen,
-       xfs_bmbt_block_t        *rblock,
+       struct xfs_btree_block  *rblock,
        int                     rblocklen)
 {
        int                     dmxr;
@@ -1688,128 +81,18 @@ xfs_bmdr_to_bmbt(
        rblock->bb_level = dblock->bb_level;
        ASSERT(be16_to_cpu(rblock->bb_level) > 0);
        rblock->bb_numrecs = dblock->bb_numrecs;
-       rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-       rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
-       dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
-       fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
-       tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
-       fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
-       tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+       rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+       rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+       dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+       fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+       tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+       fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+       tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
        dmxr = be16_to_cpu(dblock->bb_numrecs);
        memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
        memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
 }
 
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                            /* error */
-xfs_bmbt_decrement(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       xfs_bmbt_block_t        *block;
-       xfs_buf_t               *bp;
-       int                     error;          /* error return value */
-       xfs_fsblock_t           fsbno;
-       int                     lev;
-       xfs_mount_t             *mp;
-       xfs_trans_t             *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       ASSERT(level < cur->bc_nlevels);
-       if (level < cur->bc_nlevels - 1)
-               xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-       if (--cur->bc_ptrs[level] > 0) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               if (--cur->bc_ptrs[lev] > 0)
-                       break;
-               if (lev < cur->bc_nlevels - 1)
-                       xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-       }
-       if (lev == cur->bc_nlevels) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       tp = cur->bc_tp;
-       mp = cur->bc_mp;
-       for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
-               fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Delete the record pointed to by cur.
- */
-int                                    /* error */
-xfs_bmbt_delete(
-       xfs_btree_cur_t *cur,
-       int             *stat)          /* success/failure */
-{
-       int             error;          /* error return value */
-       int             i;
-       int             level;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       for (level = 0, i = 2; i == 2; level++) {
-               if ((error = xfs_bmbt_delrec(cur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-       }
-       if (i == 0) {
-               for (level = 1; level < cur->bc_nlevels; level++) {
-                       if (cur->bc_ptrs[level] == 0) {
-                               if ((error = xfs_bmbt_decrement(cur, level,
-                                               &i))) {
-                                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                                       return error;
-                               }
-                               break;
-                       }
-               }
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = i;
-       return 0;
-}
-
 /*
  * Convert a compressed bmap extent record to an uncompressed form.
  * This code must be in sync with the routines xfs_bmbt_get_startoff,
@@ -1863,31 +146,6 @@ xfs_bmbt_get_all(
        __xfs_bmbt_get_all(r->l0, r->l1, s);
 }
 
-/*
- * Get the block pointer for the given level of the cursor.
- * Fill in the buffer pointer, if applicable.
- */
-xfs_bmbt_block_t *
-xfs_bmbt_get_block(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       xfs_buf_t               **bpp)
-{
-       xfs_ifork_t             *ifp;
-       xfs_bmbt_block_t        *rval;
-
-       if (level < cur->bc_nlevels - 1) {
-               *bpp = cur->bc_bufs[level];
-               rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
-       } else {
-               *bpp = NULL;
-               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
-                       cur->bc_private.b.whichfork);
-               rval = ifp->if_broot;
-       }
-       return rval;
-}
-
 /*
  * Extract the blockcount field from an in memory bmap extent record.
  */
@@ -1949,374 +207,33 @@ void
 xfs_bmbt_disk_get_all(
        xfs_bmbt_rec_t  *r,
        xfs_bmbt_irec_t *s)
-{
-       __xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s);
-}
-
-/*
- * Extract the blockcount field from an on disk bmap extent record.
- */
-xfs_filblks_t
-xfs_bmbt_disk_get_blockcount(
-       xfs_bmbt_rec_t  *r)
-{
-       return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21));
-}
-
-/*
- * Extract the startoff field from a disk format bmap extent record.
- */
-xfs_fileoff_t
-xfs_bmbt_disk_get_startoff(
-       xfs_bmbt_rec_t  *r)
-{
-       return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
-                XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                            /* error */
-xfs_bmbt_increment(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       xfs_bmbt_block_t        *block;
-       xfs_buf_t               *bp;
-       int                     error;          /* error return value */
-       xfs_fsblock_t           fsbno;
-       int                     lev;
-       xfs_mount_t             *mp;
-       xfs_trans_t             *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       ASSERT(level < cur->bc_nlevels);
-       if (level < cur->bc_nlevels - 1)
-               xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-       block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               block = xfs_bmbt_get_block(cur, lev, &bp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
-                       break;
-               if (lev < cur->bc_nlevels - 1)
-                       xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
-       }
-       if (lev == cur->bc_nlevels) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       tp = cur->bc_tp;
-       mp = cur->bc_mp;
-       for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
-               fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               cur->bc_ptrs[lev] = 1;
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Insert the current record at the point referenced by cur.
- *
- * A multi-level split of the tree on insert will invalidate the original
- * cursor.  All callers of this function should assume that the cursor is
- * no longer valid and revalidate it.
- */
-int                                    /* error */
-xfs_bmbt_insert(
-       xfs_btree_cur_t *cur,
-       int             *stat)          /* success/failure */
-{
-       int             error;          /* error return value */
-       int             i;
-       int             level;
-       xfs_fsblock_t   nbno;
-       xfs_btree_cur_t *ncur;
-       xfs_bmbt_rec_t  nrec;
-       xfs_btree_cur_t *pcur;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       level = 0;
-       nbno = NULLFSBLOCK;
-       xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
-       ncur = NULL;
-       pcur = cur;
-       do {
-               if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
-                               &i))) {
-                       if (pcur != cur)
-                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
-                       cur->bc_nlevels = pcur->bc_nlevels;
-                       cur->bc_private.b.allocated +=
-                               pcur->bc_private.b.allocated;
-                       pcur->bc_private.b.allocated = 0;
-                       ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
-                              XFS_IS_REALTIME_INODE(cur->bc_private.b.ip));
-                       cur->bc_private.b.firstblock =
-                               pcur->bc_private.b.firstblock;
-                       ASSERT(cur->bc_private.b.flist ==
-                              pcur->bc_private.b.flist);
-                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-               }
-               if (ncur) {
-                       pcur = ncur;
-                       ncur = NULL;
-               }
-       } while (nbno != NULLFSBLOCK);
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = i;
-       return 0;
-error0:
-       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-       return error;
-}
-
-/*
- * Log fields from the btree block header.
- */
-void
-xfs_bmbt_log_block(
-       xfs_btree_cur_t         *cur,
-       xfs_buf_t               *bp,
-       int                     fields)
-{
-       int                     first;
-       int                     last;
-       xfs_trans_t             *tp;
-       static const short      offsets[] = {
-               offsetof(xfs_bmbt_block_t, bb_magic),
-               offsetof(xfs_bmbt_block_t, bb_level),
-               offsetof(xfs_bmbt_block_t, bb_numrecs),
-               offsetof(xfs_bmbt_block_t, bb_leftsib),
-               offsetof(xfs_bmbt_block_t, bb_rightsib),
-               sizeof(xfs_bmbt_block_t)
-       };
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
-       tp = cur->bc_tp;
-       if (bp) {
-               xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
-                                 &last);
-               xfs_trans_log_buf(tp, bp, first, last);
-       } else
-               xfs_trans_log_inode(tp, cur->bc_private.b.ip,
-                       XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Log record values from the btree block.
- */
-void
-xfs_bmbt_log_recs(
-       xfs_btree_cur_t         *cur,
-       xfs_buf_t               *bp,
-       int                     rfirst,
-       int                     rlast)
-{
-       xfs_bmbt_block_t        *block;
-       int                     first;
-       int                     last;
-       xfs_bmbt_rec_t          *rp;
-       xfs_trans_t             *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
-       ASSERT(bp);
-       tp = cur->bc_tp;
-       block = XFS_BUF_TO_BMBT_BLOCK(bp);
-       rp = XFS_BMAP_REC_DADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(tp, bp, first, last);
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-int                                    /* error */
-xfs_bmbt_lookup_eq(
-       xfs_btree_cur_t *cur,
-       xfs_fileoff_t   off,
-       xfs_fsblock_t   bno,
-       xfs_filblks_t   len,
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.b.br_startoff = off;
-       cur->bc_rec.b.br_startblock = bno;
-       cur->bc_rec.b.br_blockcount = len;
-       return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-int                                    /* error */
-xfs_bmbt_lookup_ge(
-       xfs_btree_cur_t *cur,
-       xfs_fileoff_t   off,
-       xfs_fsblock_t   bno,
-       xfs_filblks_t   len,
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.b.br_startoff = off;
-       cur->bc_rec.b.br_startblock = bno;
-       cur->bc_rec.b.br_blockcount = len;
-       return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Give the bmap btree a new root block.  Copy the old broot contents
- * down into a real block and make the broot point to it.
- */
-int                                            /* error */
-xfs_bmbt_newroot(
-       xfs_btree_cur_t         *cur,           /* btree cursor */
-       int                     *logflags,      /* logging flags for inode */
-       int                     *stat)          /* return status - 0 fail */
-{
-       xfs_alloc_arg_t         args;           /* allocation arguments */
-       xfs_bmbt_block_t        *block;         /* bmap btree block */
-       xfs_buf_t               *bp;            /* buffer for block */
-       xfs_bmbt_block_t        *cblock;        /* child btree block */
-       xfs_bmbt_key_t          *ckp;           /* child key pointer */
-       xfs_bmbt_ptr_t          *cpp;           /* child ptr pointer */
-       int                     error;          /* error return code */
-#ifdef DEBUG
-       int                     i;              /* loop counter */
-#endif
-       xfs_bmbt_key_t          *kp;            /* pointer to bmap btree key */
-       int                     level;          /* btree level */
-       xfs_bmbt_ptr_t          *pp;            /* pointer to bmap block addr */
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       level = cur->bc_nlevels - 1;
-       block = xfs_bmbt_get_block(cur, level, &bp);
-       /*
-        * Copy the root into a real block.
-        */
-       args.mp = cur->bc_mp;
-       pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-       args.tp = cur->bc_tp;
-       args.fsbno = cur->bc_private.b.firstblock;
-       args.mod = args.minleft = args.alignment = args.total = args.isfl =
-               args.userdata = args.minalignslop = 0;
-       args.minlen = args.maxlen = args.prod = 1;
-       args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
-       args.firstblock = args.fsbno;
-       if (args.fsbno == NULLFSBLOCK) {
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               args.fsbno = be64_to_cpu(*pp);
-               args.type = XFS_ALLOCTYPE_START_BNO;
-       } else if (cur->bc_private.b.flist->xbf_low)
-               args.type = XFS_ALLOCTYPE_START_BNO;
-       else
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       if ((error = xfs_alloc_vextent(&args))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (args.fsbno == NULLFSBLOCK) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       cur->bc_private.b.firstblock = args.fsbno;
-       cur->bc_private.b.allocated++;
-       cur->bc_private.b.ip->i_d.di_nblocks++;
-       XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
-                         XFS_TRANS_DQ_BCOUNT, 1L);
-       bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
-       cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
-       *cblock = *block;
-       be16_add_cpu(&block->bb_level, 1);
-       block->bb_numrecs = cpu_to_be16(1);
-       cur->bc_nlevels++;
-       cur->bc_ptrs[level + 1] = 1;
-       kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-       ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
-       memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp));
-       cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
-#ifdef DEBUG
-       for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-               if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-       }
-#endif
-       memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       *pp = cpu_to_be64(args.fsbno);
-       xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
-               cur->bc_private.b.whichfork);
-       xfs_btree_setbuf(cur, level, bp);
-       /*
-        * Do all this logging at the end so that
-        * the root is at the right level.
-        */
-       xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
-       xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
-       xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *logflags |=
-               XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
-       *stat = 1;
-       return 0;
+{
+       __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+                               get_unaligned_be64(&r->l1), s);
+}
+
+/*
+ * Extract the blockcount field from an on disk bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+       xfs_bmbt_rec_t  *r)
+{
+       return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21));
+}
+
+/*
+ * Extract the startoff field from a disk format bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+       xfs_bmbt_rec_t  *r)
+{
+       return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+                XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 }
 
+
 /*
  * Set all the fields in a bmap extent record from the arguments.
  */
@@ -2512,7 +429,8 @@ xfs_bmbt_set_state(
  */
 void
 xfs_bmbt_to_bmdr(
-       xfs_bmbt_block_t        *rblock,
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *rblock,
        int                     rblocklen,
        xfs_bmdr_block_t        *dblock,
        int                     dblocklen)
@@ -2524,66 +442,21 @@ xfs_bmbt_to_bmdr(
        __be64                  *tpp;
 
        ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
-       ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO);
-       ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO);
+       ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO);
+       ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO);
        ASSERT(be16_to_cpu(rblock->bb_level) > 0);
        dblock->bb_level = rblock->bb_level;
        dblock->bb_numrecs = rblock->bb_numrecs;
-       dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
-       fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
-       tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
-       fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
-       tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
+       dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+       fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+       tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+       fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+       tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
        dmxr = be16_to_cpu(dblock->bb_numrecs);
        memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
        memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
 }
 
-/*
- * Update the record to the passed values.
- */
-int
-xfs_bmbt_update(
-       xfs_btree_cur_t         *cur,
-       xfs_fileoff_t           off,
-       xfs_fsblock_t           bno,
-       xfs_filblks_t           len,
-       xfs_exntst_t            state)
-{
-       xfs_bmbt_block_t        *block;
-       xfs_buf_t               *bp;
-       int                     error;
-       xfs_bmbt_key_t          key;
-       int                     ptr;
-       xfs_bmbt_rec_t          *rp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
-               (xfs_dfilblks_t)len, (int)state);
-       block = xfs_bmbt_get_block(cur, 0, &bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       ptr = cur->bc_ptrs[0];
-       rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
-       xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
-       xfs_bmbt_log_recs(cur, bp, ptr, ptr);
-       if (ptr > 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               return 0;
-       }
-       key.br_startoff = cpu_to_be64(off);
-       if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       return 0;
-}
-
 /*
  * Check extent records, which have just been read, for
  * any bit in the extent flag field. ASSERT on debug
@@ -2608,3 +481,451 @@ xfs_check_nostate_extents(
        }
        return 0;
 }
+
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_btree_cur    *new;
+
+       new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+
+       /*
+        * Copy the firstblock, flist, and flags values,
+        * since init cursor doesn't get them.
+        */
+       new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+       new->bc_private.b.flist = cur->bc_private.b.flist;
+       new->bc_private.b.flags = cur->bc_private.b.flags;
+
+       return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+       struct xfs_btree_cur    *src,
+       struct xfs_btree_cur    *dst)
+{
+       ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+              (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+       ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+
+       dst->bc_private.b.allocated += src->bc_private.b.allocated;
+       dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+
+       src->bc_private.b.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     length,
+       int                     *stat)
+{
+       xfs_alloc_arg_t         args;           /* block allocation args */
+       int                     error;          /* error return value */
+
+       memset(&args, 0, sizeof(args));
+       args.tp = cur->bc_tp;
+       args.mp = cur->bc_mp;
+       args.fsbno = cur->bc_private.b.firstblock;
+       args.firstblock = args.fsbno;
+
+       if (args.fsbno == NULLFSBLOCK) {
+               args.fsbno = be64_to_cpu(start->l);
+               args.type = XFS_ALLOCTYPE_START_BNO;
+               /*
+                * Make sure there is sufficient room left in the AG to
+                * complete a full tree split for an extent insert.  If
+                * we are converting the middle part of an extent then
+                * we may need space for two tree splits.
+                *
+                * We are relying on the caller to make the correct block
+                * reservation for this operation to succeed.  If the
+                * reservation amount is insufficient then we may fail a
+                * block allocation here and corrupt the filesystem.
+                */
+               args.minleft = xfs_trans_get_block_res(args.tp);
+       } else if (cur->bc_private.b.flist->xbf_low) {
+               args.type = XFS_ALLOCTYPE_START_BNO;
+       } else {
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+       }
+
+       args.minlen = args.maxlen = args.prod = 1;
+       args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+       if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+               error = XFS_ERROR(ENOSPC);
+               goto error0;
+       }
+       error = xfs_alloc_vextent(&args);
+       if (error)
+               goto error0;
+
+       if (args.fsbno == NULLFSBLOCK && args.minleft) {
+               /*
+                * Could not find an AG with enough free space to satisfy
+                * a full btree split.  Try again without minleft and if
+                * successful activate the lowspace algorithm.
+                */
+               args.fsbno = 0;
+               args.type = XFS_ALLOCTYPE_FIRST_AG;
+               args.minleft = 0;
+               error = xfs_alloc_vextent(&args);
+               if (error)
+                       goto error0;
+               cur->bc_private.b.flist->xbf_low = 1;
+       }
+       if (args.fsbno == NULLFSBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+       ASSERT(args.len == 1);
+       cur->bc_private.b.firstblock = args.fsbno;
+       cur->bc_private.b.allocated++;
+       cur->bc_private.b.ip->i_d.di_nblocks++;
+       xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+       XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
+                       XFS_TRANS_DQ_BCOUNT, 1L);
+
+       new->l = cpu_to_be64(args.fsbno);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+ error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       struct xfs_inode        *ip = cur->bc_private.b.ip;
+       struct xfs_trans        *tp = cur->bc_tp;
+       xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+
+       xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+       ip->i_d.di_nblocks--;
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+       xfs_trans_binval(tp, bp);
+       return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level == cur->bc_nlevels - 1) {
+               struct xfs_ifork        *ifp;
+
+               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+                                   cur->bc_private.b.whichfork);
+
+               return xfs_bmbt_maxrecs(cur->bc_mp,
+                                       ifp->if_broot_bytes, level == 0) / 2;
+       }
+
+       return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level == cur->bc_nlevels - 1) {
+               struct xfs_ifork        *ifp;
+
+               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+                                   cur->bc_private.b.whichfork);
+
+               return xfs_bmbt_maxrecs(cur->bc_mp,
+                                       ifp->if_broot_bytes, level == 0);
+       }
+
+       return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it.  After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level != cur->bc_nlevels - 1)
+               return cur->bc_mp->m_bmap_dmxr[level != 0];
+       return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
+                               level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       key->bmbt.br_startoff =
+               cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       ASSERT(key->bmbt.br_startoff != 0);
+
+       xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
+                              0, 0, XFS_EXT_NORM);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       ptr->l = 0;
+}
+
+STATIC __int64_t
+xfs_bmbt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+                                     cur->bc_rec.b.br_startoff;
+}
+
+#ifdef DEBUG
+STATIC int
+xfs_bmbt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       return be64_to_cpu(k1->bmbt.br_startoff) <
+               be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
+{
+       return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+               xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+               xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+#endif /* DEBUG */
+
+#ifdef XFS_BTREE_TRACE
+ktrace_t       *xfs_bmbt_trace_buf;
+
+STATIC void
+xfs_bmbt_trace_enter(
+       struct xfs_btree_cur    *cur,
+       const char              *func,
+       char                    *s,
+       int                     type,
+       int                     line,
+       __psunsigned_t          a0,
+       __psunsigned_t          a1,
+       __psunsigned_t          a2,
+       __psunsigned_t          a3,
+       __psunsigned_t          a4,
+       __psunsigned_t          a5,
+       __psunsigned_t          a6,
+       __psunsigned_t          a7,
+       __psunsigned_t          a8,
+       __psunsigned_t          a9,
+       __psunsigned_t          a10)
+{
+       struct xfs_inode        *ip = cur->bc_private.b.ip;
+       int                     whichfork = cur->bc_private.b.whichfork;
+
+       ktrace_enter(xfs_bmbt_trace_buf,
+               (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+               (void *)func, (void *)s, (void *)ip, (void *)cur,
+               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+               (void *)a8, (void *)a9, (void *)a10);
+       ktrace_enter(ip->i_btrace,
+               (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+               (void *)func, (void *)s, (void *)ip, (void *)cur,
+               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+               (void *)a8, (void *)a9, (void *)a10);
+}
+
+STATIC void
+xfs_bmbt_trace_cursor(
+       struct xfs_btree_cur    *cur,
+       __uint32_t              *s0,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
+{
+       struct xfs_bmbt_rec_host r;
+
+       xfs_bmbt_set_all(&r, &cur->bc_rec.b);
+
+       *s0 = (cur->bc_nlevels << 24) |
+             (cur->bc_private.b.flags << 16) |
+              cur->bc_private.b.allocated;
+       *l0 = r.l0;
+       *l1 = r.l1;
+}
+
+STATIC void
+xfs_bmbt_trace_key(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
+{
+       *l0 = be64_to_cpu(key->bmbt.br_startoff);
+       *l1 = 0;
+}
+
+STATIC void
+xfs_bmbt_trace_record(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       __uint64_t              *l0,
+       __uint64_t              *l1,
+       __uint64_t              *l2)
+{
+       struct xfs_bmbt_irec    irec;
+
+       xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+       *l0 = irec.br_startoff;
+       *l1 = irec.br_startblock;
+       *l2 = irec.br_blockcount;
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+       .rec_len                = sizeof(xfs_bmbt_rec_t),
+       .key_len                = sizeof(xfs_bmbt_key_t),
+
+       .dup_cursor             = xfs_bmbt_dup_cursor,
+       .update_cursor          = xfs_bmbt_update_cursor,
+       .alloc_block            = xfs_bmbt_alloc_block,
+       .free_block             = xfs_bmbt_free_block,
+       .get_maxrecs            = xfs_bmbt_get_maxrecs,
+       .get_minrecs            = xfs_bmbt_get_minrecs,
+       .get_dmaxrecs           = xfs_bmbt_get_dmaxrecs,
+       .init_key_from_rec      = xfs_bmbt_init_key_from_rec,
+       .init_rec_from_key      = xfs_bmbt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_bmbt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_bmbt_init_ptr_from_cur,
+       .key_diff               = xfs_bmbt_key_diff,
+
+#ifdef DEBUG
+       .keys_inorder           = xfs_bmbt_keys_inorder,
+       .recs_inorder           = xfs_bmbt_recs_inorder,
+#endif
+
+#ifdef XFS_BTREE_TRACE
+       .trace_enter            = xfs_bmbt_trace_enter,
+       .trace_cursor           = xfs_bmbt_trace_cursor,
+       .trace_key              = xfs_bmbt_trace_key,
+       .trace_record           = xfs_bmbt_trace_record,
+#endif
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur *                         /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_inode        *ip,            /* inode owning the btree */
+       int                     whichfork)      /* data or attr fork */
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_btree_cur    *cur;
+
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+       cur->bc_btnum = XFS_BTNUM_BMAP;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+       cur->bc_ops = &xfs_bmbt_ops;
+       cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+
+       cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+       cur->bc_private.b.ip = ip;
+       cur->bc_private.b.firstblock = NULLFSBLOCK;
+       cur->bc_private.b.flist = NULL;
+       cur->bc_private.b.allocated = 0;
+       cur->bc_private.b.flags = 0;
+       cur->bc_private.b.whichfork = whichfork;
+
+       return cur;
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_bmbt_rec_t);
+       return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= sizeof(xfs_bmdr_block_t);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_bmdr_rec_t);
+       return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
index cd0d4b4bb81645cb50331cc70a3d254eb80e849f..a4555abb6622a5d2ac6c08ab0f585761d6ff4585 100644 (file)
 #define XFS_BMAP_MAGIC 0x424d4150      /* 'BMAP' */
 
 struct xfs_btree_cur;
-struct xfs_btree_lblock;
+struct xfs_btree_block;
 struct xfs_mount;
 struct xfs_inode;
+struct xfs_trans;
 
 /*
  * Bmap root header, on-disk form only.
@@ -145,71 +146,60 @@ typedef struct xfs_bmbt_key {
 /* btree pointer type */
 typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
 
-/* btree block header type */
-typedef struct xfs_btree_lblock xfs_bmbt_block_t;
-
-#define XFS_BUF_TO_BMBT_BLOCK(bp)      ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp))
-
-#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
-#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
-       ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
-                   (cur)->bc_private.b.whichfork)->if_broot_bytes)
-
-#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
-       (((lev) == (cur)->bc_nlevels - 1 ? \
-               XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
-                       xfs_bmdr, (lev) == 0) : \
-               ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
-       (((lev) == (cur)->bc_nlevels - 1 ? \
-                       XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
-                               xfs_bmbt, (lev) == 0) : \
-                       ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-
-#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
-       (((lev) == (cur)->bc_nlevels - 1 ? \
-                       XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\
-                               xfs_bmdr, (lev) == 0) : \
-                       ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
-       (((lev) == (cur)->bc_nlevels - 1 ? \
-                       XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
-                               xfs_bmbt, (lev) == 0) : \
-                       ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-
-#define XFS_BMAP_REC_DADDR(bb,i,cur)   (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_REC_IADDR(bb,i,cur)   (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_KEY_DADDR(bb,i,cur)   \
-       (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_KEY_IADDR(bb,i,cur)   \
-       (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_PTR_DADDR(bb,i,cur)   \
-       (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS(   \
-                               be16_to_cpu((bb)->bb_level), cur)))
-#define XFS_BMAP_PTR_IADDR(bb,i,cur)   \
-       (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS(   \
-                               be16_to_cpu((bb)->bb_level), cur)))
+/*
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
+
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+       ((xfs_bmbt_rec_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+       ((xfs_bmbt_key_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_bmbt_ptr_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+                ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+
+#define XFS_BMDR_REC_ADDR(block, index) \
+       ((xfs_bmdr_rec_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+
+#define XFS_BMDR_KEY_ADDR(block, index) \
+       ((xfs_bmdr_key_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+       ((xfs_bmdr_ptr_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+                ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
 
 /*
  * These are to be used when we know the size of the block and
  * we don't have a cursor.
  */
-#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
-       (XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i))
-#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
-       (XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i))
-#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
-       (XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
-
-#define XFS_BMAP_BROOT_NUMRECS(bb)     be16_to_cpu((bb)->bb_numrecs)
-#define XFS_BMAP_BROOT_MAXRECS(sz)     XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+       XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
 
 #define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
-       (int)(sizeof(xfs_bmbt_block_t) + \
+       (int)(XFS_BTREE_LBLOCK_LEN + \
               ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
 
 #define XFS_BMAP_BROOT_SPACE(bb) \
@@ -223,42 +213,12 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t;
  */
 #define XFS_BM_MAXLEVELS(mp,w)         ((mp)->m_bm_maxlevels[(w)])
 
-#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
-       (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \
-        be16_to_cpu((bb)->bb_level) == level && \
-        be16_to_cpu((bb)->bb_numrecs) > 0 && \
-        be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0])
-
-
-#ifdef __KERNEL__
-
-#if defined(XFS_BMBT_TRACE)
-/*
- * Trace buffer entry types.
- */
-#define XFS_BMBT_KTRACE_ARGBI  1
-#define XFS_BMBT_KTRACE_ARGBII 2
-#define XFS_BMBT_KTRACE_ARGFFFI 3
-#define XFS_BMBT_KTRACE_ARGI   4
-#define XFS_BMBT_KTRACE_ARGIFK 5
-#define XFS_BMBT_KTRACE_ARGIFR 6
-#define XFS_BMBT_KTRACE_ARGIK  7
-#define XFS_BMBT_KTRACE_CUR    8
-
-#define XFS_BMBT_TRACE_SIZE    4096    /* size of global trace buffer */
-#define XFS_BMBT_KTRACE_SIZE   32      /* size of per-inode trace buffer */
-extern ktrace_t        *xfs_bmbt_trace_buf;
-#endif
-
 /*
  * Prototypes for xfs_bmap.c to call.
  */
-extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int);
-extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *);
-extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *);
+extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+                       struct xfs_btree_block *, int);
 extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur,
-                                               int, struct xfs_buf **bpp);
 extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
 extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
 extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
@@ -268,22 +228,6 @@ extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
 extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
 
-extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *);
-extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *);
-extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
-extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int,
-                               int);
-extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t,
-                               xfs_fsblock_t, xfs_filblks_t, int *);
-extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t,
-                               xfs_fsblock_t, xfs_filblks_t, int *);
-
-/*
- * Give the bmap btree a new root block.  Copy the old broot contents
- * down into a real block and make the broot point to it.
- */
-extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat);
-
 extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
 extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
                        xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
@@ -296,10 +240,15 @@ extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
                        xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
 
-extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int);
-extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t,
-                               xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t);
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+                       xfs_bmdr_block_t *, int);
+
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_inode *, int);
 
-#endif /* __KERNEL__ */
 
 #endif /* __XFS_BMAP_BTREE_H__ */
index cc593a84c34572211cc2ff44997457838bbe4f5d..7ed59267420d8e5a661efac5b10190a85066d2f0 100644 (file)
@@ -34,7 +34,9 @@
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_error.h"
 
@@ -50,135 +52,33 @@ const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
        XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
 };
 
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int                             /* number of records fitting in block */
-xfs_btree_maxrecs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_block_t       *block) /* generic btree block pointer */
-{
-       switch (cur->bc_btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               return (int)XFS_ALLOC_BLOCK_MAXRECS(
-                               be16_to_cpu(block->bb_h.bb_level), cur);
-       case XFS_BTNUM_BMAP:
-               return (int)XFS_BMAP_BLOCK_IMAXRECS(
-                               be16_to_cpu(block->bb_h.bb_level), cur);
-       case XFS_BTNUM_INO:
-               return (int)XFS_INOBT_BLOCK_MAXRECS(
-                               be16_to_cpu(block->bb_h.bb_level), cur);
-       default:
-               ASSERT(0);
-               return 0;
-       }
-}
-
-/*
- * External routines.
- */
-
-#ifdef DEBUG
-/*
- * Debug routine: check that block header is ok.
- */
-void
-xfs_btree_check_block(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_block_t       *block, /* generic btree block pointer */
-       int                     level,  /* level of the btree block */
-       xfs_buf_t               *bp)    /* buffer containing block, if any */
-{
-       if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
-               xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
-                       bp);
-       else
-               xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
-                       bp);
-}
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
-       xfs_btnum_t     btnum,          /* btree identifier */
-       void            *ak1,           /* pointer to left (lower) key */
-       void            *ak2)           /* pointer to right (higher) key */
-{
-       switch (btnum) {
-       case XFS_BTNUM_BNO: {
-               xfs_alloc_key_t *k1;
-               xfs_alloc_key_t *k2;
-
-               k1 = ak1;
-               k2 = ak2;
-               ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock));
-               break;
-           }
-       case XFS_BTNUM_CNT: {
-               xfs_alloc_key_t *k1;
-               xfs_alloc_key_t *k2;
-
-               k1 = ak1;
-               k2 = ak2;
-               ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) ||
-                      (k1->ar_blockcount == k2->ar_blockcount &&
-                       be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)));
-               break;
-           }
-       case XFS_BTNUM_BMAP: {
-               xfs_bmbt_key_t  *k1;
-               xfs_bmbt_key_t  *k2;
-
-               k1 = ak1;
-               k2 = ak2;
-               ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
-               break;
-           }
-       case XFS_BTNUM_INO: {
-               xfs_inobt_key_t *k1;
-               xfs_inobt_key_t *k2;
-
-               k1 = ak1;
-               k2 = ak2;
-               ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
-               break;
-           }
-       default:
-               ASSERT(0);
-       }
-}
-#endif /* DEBUG */
 
-/*
- * Checking routine: check that long form block header is ok.
- */
-/* ARGSUSED */
-int                                    /* error (0 or EFSCORRUPTED) */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_lblock_t      *block, /* btree long form block pointer */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* btree long form block pointer */
        int                     level,  /* level of the btree block */
-       xfs_buf_t               *bp)    /* buffer for block, if any */
+       struct xfs_buf          *bp)    /* buffer for block, if any */
 {
        int                     lblock_ok; /* block passes checks */
-       xfs_mount_t             *mp;    /* file system mount point */
+       struct xfs_mount        *mp;    /* file system mount point */
 
        mp = cur->bc_mp;
        lblock_ok =
                be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
                be16_to_cpu(block->bb_level) == level &&
                be16_to_cpu(block->bb_numrecs) <=
-                       xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
-               block->bb_leftsib &&
-               (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO ||
-                XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) &&
-               block->bb_rightsib &&
-               (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO ||
-                XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib)));
-       if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+                       cur->bc_ops->get_maxrecs(cur, level) &&
+               block->bb_u.l.bb_leftsib &&
+               (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO ||
+                XFS_FSB_SANITY_CHECK(mp,
+                       be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+               block->bb_u.l.bb_rightsib &&
+               (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO ||
+                XFS_FSB_SANITY_CHECK(mp,
+                       be64_to_cpu(block->bb_u.l.bb_rightsib)));
+       if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+                       XFS_ERRTAG_BTREE_CHECK_LBLOCK,
                        XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
                if (bp)
                        xfs_buftrace("LBTREE ERROR", bp);
@@ -189,98 +89,15 @@ xfs_btree_check_lblock(
        return 0;
 }
 
-/*
- * Checking routine: check that (long) pointer is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_dfsbno_t    ptr,            /* btree block disk address */
-       int             level)          /* btree block level */
-{
-       xfs_mount_t     *mp;            /* file system mount point */
-
-       mp = cur->bc_mp;
-       XFS_WANT_CORRUPTED_RETURN(
-               level > 0 &&
-               ptr != NULLDFSBNO &&
-               XFS_FSB_SANITY_CHECK(mp, ptr));
-       return 0;
-}
-
-#ifdef DEBUG
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
-       xfs_btnum_t     btnum,          /* btree identifier */
-       void            *ar1,           /* pointer to left (lower) record */
-       void            *ar2)           /* pointer to right (higher) record */
-{
-       switch (btnum) {
-       case XFS_BTNUM_BNO: {
-               xfs_alloc_rec_t *r1;
-               xfs_alloc_rec_t *r2;
-
-               r1 = ar1;
-               r2 = ar2;
-               ASSERT(be32_to_cpu(r1->ar_startblock) +
-                      be32_to_cpu(r1->ar_blockcount) <=
-                      be32_to_cpu(r2->ar_startblock));
-               break;
-           }
-       case XFS_BTNUM_CNT: {
-               xfs_alloc_rec_t *r1;
-               xfs_alloc_rec_t *r2;
-
-               r1 = ar1;
-               r2 = ar2;
-               ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) ||
-                      (r1->ar_blockcount == r2->ar_blockcount &&
-                       be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock)));
-               break;
-           }
-       case XFS_BTNUM_BMAP: {
-               xfs_bmbt_rec_t  *r1;
-               xfs_bmbt_rec_t  *r2;
-
-               r1 = ar1;
-               r2 = ar2;
-               ASSERT(xfs_bmbt_disk_get_startoff(r1) +
-                      xfs_bmbt_disk_get_blockcount(r1) <=
-                      xfs_bmbt_disk_get_startoff(r2));
-               break;
-           }
-       case XFS_BTNUM_INO: {
-               xfs_inobt_rec_t *r1;
-               xfs_inobt_rec_t *r2;
-
-               r1 = ar1;
-               r2 = ar2;
-               ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
-                      be32_to_cpu(r2->ir_startino));
-               break;
-           }
-       default:
-               ASSERT(0);
-       }
-}
-#endif /* DEBUG */
-
-/*
- * Checking routine: check that block header is ok.
- */
-/* ARGSUSED */
-int                                    /* error (0 or EFSCORRUPTED) */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_sblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_sblock_t      *block, /* btree short form block pointer */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* btree short form block pointer */
        int                     level,  /* level of the btree block */
-       xfs_buf_t               *bp)    /* buffer containing block */
+       struct xfs_buf          *bp)    /* buffer containing block */
 {
-       xfs_buf_t               *agbp;  /* buffer for ag. freespace struct */
-       xfs_agf_t               *agf;   /* ag. freespace structure */
+       struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
+       struct xfs_agf          *agf;   /* ag. freespace structure */
        xfs_agblock_t           agflen; /* native ag. freespace length */
        int                     sblock_ok; /* block passes checks */
 
@@ -291,13 +108,13 @@ xfs_btree_check_sblock(
                be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
                be16_to_cpu(block->bb_level) == level &&
                be16_to_cpu(block->bb_numrecs) <=
-                       xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
-               (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK ||
-                be32_to_cpu(block->bb_leftsib) < agflen) &&
-               block->bb_leftsib &&
-               (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK ||
-                be32_to_cpu(block->bb_rightsib) < agflen) &&
-               block->bb_rightsib;
+                       cur->bc_ops->get_maxrecs(cur, level) &&
+               (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK ||
+                be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+               block->bb_u.s.bb_leftsib &&
+               (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK ||
+                be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+               block->bb_u.s.bb_rightsib;
        if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
                        XFS_ERRTAG_BTREE_CHECK_SBLOCK,
                        XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
@@ -311,26 +128,77 @@ xfs_btree_check_sblock(
 }
 
 /*
- * Checking routine: check that (short) pointer is ok.
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* generic btree block pointer */
+       int                     level,  /* level of the btree block */
+       struct xfs_buf          *bp)    /* buffer containing block, if any */
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return xfs_btree_check_lblock(cur, block, level, bp);
+       else
+               return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
  */
 int                                    /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_dfsbno_t            bno,    /* btree block disk address */
+       int                     level)  /* btree block level */
+{
+       XFS_WANT_CORRUPTED_RETURN(
+               level > 0 &&
+               bno != NULLDFSBNO &&
+               XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+       return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_sptr(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agblock_t   ptr,            /* btree block disk address */
-       int             level)          /* btree block level */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* btree block disk address */
+       int                     level)  /* btree block level */
 {
-       xfs_buf_t       *agbp;          /* buffer for ag. freespace struct */
-       xfs_agf_t       *agf;           /* ag. freespace structure */
+       xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
 
-       agbp = cur->bc_private.a.agbp;
-       agf = XFS_BUF_TO_AGF(agbp);
        XFS_WANT_CORRUPTED_RETURN(
                level > 0 &&
-               ptr != NULLAGBLOCK && ptr != 0 &&
-               ptr < be32_to_cpu(agf->agf_length));
+               bno != NULLAGBLOCK &&
+               bno != 0 &&
+               bno < agblocks);
        return 0;
 }
 
+/*
+ * Check that block ptr is ok.
+ */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       union xfs_btree_ptr     *ptr,   /* btree block disk address */
+       int                     index,  /* offset from ptr to check */
+       int                     level)  /* btree block level */
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               return xfs_btree_check_lptr(cur,
+                               be64_to_cpu((&ptr->l)[index]), level);
+       } else {
+               return xfs_btree_check_sptr(cur,
+                               be32_to_cpu((&ptr->s)[index]), level);
+       }
+}
+#endif
+
 /*
  * Delete the btree cursor.
  */
@@ -387,16 +255,17 @@ xfs_btree_dup_cursor(
 
        tp = cur->bc_tp;
        mp = cur->bc_mp;
+
        /*
         * Allocate a new cursor like the old one.
         */
-       new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
-               cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
-               cur->bc_private.b.whichfork);
+       new = cur->bc_ops->dup_cursor(cur);
+
        /*
         * Copy the record currently in the cursor.
         */
        new->bc_rec = cur->bc_rec;
+
        /*
         * For each level current, re-get the buffer and copy the ptr value.
         */
@@ -416,46 +285,174 @@ xfs_btree_dup_cursor(
                } else
                        new->bc_bufs[i] = NULL;
        }
-       /*
-        * For bmap btrees, copy the firstblock, flist, and flags values,
-        * since init cursor doesn't get them.
-        */
-       if (new->bc_btnum == XFS_BTNUM_BMAP) {
-               new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
-               new->bc_private.b.flist = cur->bc_private.b.flist;
-               new->bc_private.b.flags = cur->bc_private.b.flags;
-       }
        *ncur = new;
        return 0;
 }
 
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * The leaf record start with a header then followed by records containing
+ * the values.  A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf:       | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ *
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf:   | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers.  The record and key structures are defined by the btree instances
+ * and opaque to the btree core.  The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ */
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+       return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+               XFS_BTREE_LBLOCK_LEN :
+               XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+       return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+               sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n)
+{
+       return xfs_btree_block_len(cur) +
+               (n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n)
+{
+       return xfs_btree_block_len(cur) +
+               (n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       int                     level)
+{
+       return xfs_btree_block_len(cur) +
+               cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+               (n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       return (union xfs_btree_rec *)
+               ((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       return (union xfs_btree_key *)
+               ((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       int                     level = xfs_btree_get_level(block);
+
+       ASSERT(block->bb_level != 0);
+
+       return (union xfs_btree_ptr *)
+               ((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+/*
+ * Get a the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_ifork        *ifp;
+
+       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+       return (struct xfs_btree_block *)ifp->if_broot;
+}
+
 /*
  * Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
+ * This may be an inode btree root or from a buffer.
  */
-STATIC xfs_btree_block_t *             /* generic btree block pointer */
+STATIC struct xfs_btree_block *                /* generic btree block pointer */
 xfs_btree_get_block(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
        int                     level,  /* level in btree */
-       xfs_buf_t               **bpp)  /* buffer containing the block */
-{
-       xfs_btree_block_t       *block; /* return value */
-       xfs_buf_t               *bp;    /* return buffer */
-       xfs_ifork_t             *ifp;   /* inode fork pointer */
-       int                     whichfork; /* data or attr fork */
-
-       if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
-               whichfork = cur->bc_private.b.whichfork;
-               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
-               block = (xfs_btree_block_t *)ifp->if_broot;
-               bp = NULL;
-       } else {
-               bp = cur->bc_bufs[level];
-               block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_buf          **bpp)  /* buffer containing the block */
+{
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1)) {
+               *bpp = NULL;
+               return xfs_btree_get_iroot(cur);
        }
-       ASSERT(block != NULL);
-       *bpp = bp;
-       return block;
+
+       *bpp = cur->bc_bufs[level];
+       return XFS_BUF_TO_BLOCK(*bpp);
 }
 
 /*
@@ -504,97 +501,6 @@ xfs_btree_get_bufs(
        return bp;
 }
 
-/*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B) or inodes (I).
- */
-xfs_btree_cur_t *                      /* new btree cursor */
-xfs_btree_init_cursor(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_buf_t       *agbp,          /* (A only) buffer for agf structure */
-                                       /* (I only) buffer for agi structure */
-       xfs_agnumber_t  agno,           /* (AI only) allocation group number */
-       xfs_btnum_t     btnum,          /* btree identifier */
-       xfs_inode_t     *ip,            /* (B only) inode owning the btree */
-       int             whichfork)      /* (B only) data or attr fork */
-{
-       xfs_agf_t       *agf;           /* (A) allocation group freespace */
-       xfs_agi_t       *agi;           /* (I) allocation group inodespace */
-       xfs_btree_cur_t *cur;           /* return value */
-       xfs_ifork_t     *ifp;           /* (I) inode fork pointer */
-       int             nlevels=0;      /* number of levels in the btree */
-
-       ASSERT(xfs_btree_cur_zone != NULL);
-       /*
-        * Allocate a new cursor.
-        */
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-       /*
-        * Deduce the number of btree levels from the arguments.
-        */
-       switch (btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               agf = XFS_BUF_TO_AGF(agbp);
-               nlevels = be32_to_cpu(agf->agf_levels[btnum]);
-               break;
-       case XFS_BTNUM_BMAP:
-               ifp = XFS_IFORK_PTR(ip, whichfork);
-               nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-               break;
-       case XFS_BTNUM_INO:
-               agi = XFS_BUF_TO_AGI(agbp);
-               nlevels = be32_to_cpu(agi->agi_level);
-               break;
-       default:
-               ASSERT(0);
-       }
-       /*
-        * Fill in the common fields.
-        */
-       cur->bc_tp = tp;
-       cur->bc_mp = mp;
-       cur->bc_nlevels = nlevels;
-       cur->bc_btnum = btnum;
-       cur->bc_blocklog = mp->m_sb.sb_blocklog;
-       /*
-        * Fill in private fields.
-        */
-       switch (btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               /*
-                * Allocation btree fields.
-                */
-               cur->bc_private.a.agbp = agbp;
-               cur->bc_private.a.agno = agno;
-               break;
-       case XFS_BTNUM_INO:
-               /*
-                * Inode allocation btree fields.
-                */
-               cur->bc_private.a.agbp = agbp;
-               cur->bc_private.a.agno = agno;
-               break;
-       case XFS_BTNUM_BMAP:
-               /*
-                * Bmap btree fields.
-                */
-               cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
-               cur->bc_private.b.ip = ip;
-               cur->bc_private.b.firstblock = NULLFSBLOCK;
-               cur->bc_private.b.flist = NULL;
-               cur->bc_private.b.allocated = 0;
-               cur->bc_private.b.flags = 0;
-               cur->bc_private.b.whichfork = whichfork;
-               break;
-       default:
-               ASSERT(0);
-       }
-       return cur;
-}
-
 /*
  * Check for the cursor referring to the last block at the given level.
  */
@@ -603,12 +509,12 @@ xfs_btree_islastblock(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     level)  /* level to check */
 {
-       xfs_btree_block_t       *block; /* generic btree block pointer */
+       struct xfs_btree_block  *block; /* generic btree block pointer */
        xfs_buf_t               *bp;    /* buffer containing block */
 
        block = xfs_btree_get_block(cur, level, &bp);
        xfs_btree_check_block(cur, block, level, bp);
-       if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
                return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
        else
                return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
@@ -618,12 +524,12 @@ xfs_btree_islastblock(
  * Change the cursor to point to the first record at the given level.
  * Other levels are unaffected.
  */
-int                                    /* success=1, failure=0 */
+STATIC int                             /* success=1, failure=0 */
 xfs_btree_firstrec(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     level)  /* level to change */
 {
-       xfs_btree_block_t       *block; /* generic btree block pointer */
+       struct xfs_btree_block  *block; /* generic btree block pointer */
        xfs_buf_t               *bp;    /* buffer containing block */
 
        /*
@@ -634,7 +540,7 @@ xfs_btree_firstrec(
        /*
         * It's empty, there is no such record.
         */
-       if (!block->bb_h.bb_numrecs)
+       if (!block->bb_numrecs)
                return 0;
        /*
         * Set the ptr value to 1, that's the first record/key.
@@ -647,12 +553,12 @@ xfs_btree_firstrec(
  * Change the cursor to point to the last record in the current block
  * at the given level.  Other levels are unaffected.
  */
-int                                    /* success=1, failure=0 */
+STATIC int                             /* success=1, failure=0 */
 xfs_btree_lastrec(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     level)  /* level to change */
 {
-       xfs_btree_block_t       *block; /* generic btree block pointer */
+       struct xfs_btree_block  *block; /* generic btree block pointer */
        xfs_buf_t               *bp;    /* buffer containing block */
 
        /*
@@ -663,12 +569,12 @@ xfs_btree_lastrec(
        /*
         * It's empty, there is no such record.
         */
-       if (!block->bb_h.bb_numrecs)
+       if (!block->bb_numrecs)
                return 0;
        /*
         * Set the ptr value to numrecs, that's the last record/key.
         */
-       cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs);
+       cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
        return 1;
 }
 
@@ -817,80 +723,98 @@ xfs_btree_reada_bufs(
        xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 
-/*
- * Read-ahead btree blocks, at the given level.
- * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
- */
-int
-xfs_btree_readahead_core(
-       xfs_btree_cur_t         *cur,           /* btree cursor */
-       int                     lev,            /* level in btree */
-       int                     lr)             /* left/right bits */
+STATIC int
+xfs_btree_readahead_lblock(
+       struct xfs_btree_cur    *cur,
+       int                     lr,
+       struct xfs_btree_block  *block)
 {
-       xfs_alloc_block_t       *a;
-       xfs_bmbt_block_t        *b;
-       xfs_inobt_block_t       *i;
        int                     rval = 0;
+       xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+       xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
-       ASSERT(cur->bc_bufs[lev] != NULL);
-       cur->bc_ra[lev] |= lr;
-       switch (cur->bc_btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
-               if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) {
-                       xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                               be32_to_cpu(a->bb_leftsib), 1);
-                       rval++;
-               }
-               if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) {
-                       xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                               be32_to_cpu(a->bb_rightsib), 1);
-                       rval++;
-               }
-               break;
-       case XFS_BTNUM_BMAP:
-               b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
-               if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) {
-                       xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1);
-                       rval++;
-               }
-               if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) {
-                       xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1);
-                       rval++;
-               }
-               break;
-       case XFS_BTNUM_INO:
-               i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
-               if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
-                       xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                               be32_to_cpu(i->bb_leftsib), 1);
-                       rval++;
-               }
-               if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
-                       xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                               be32_to_cpu(i->bb_rightsib), 1);
-                       rval++;
-               }
-               break;
-       default:
-               ASSERT(0);
+       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
+               xfs_btree_reada_bufl(cur->bc_mp, left, 1);
+               rval++;
+       }
+
+       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
+               xfs_btree_reada_bufl(cur->bc_mp, right, 1);
+               rval++;
        }
+
        return rval;
 }
 
-/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
- */
-void
-xfs_btree_setbuf(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     lev,    /* level in btree */
-       xfs_buf_t               *bp)    /* new buffer to set */
+STATIC int
+xfs_btree_readahead_sblock(
+       struct xfs_btree_cur    *cur,
+       int                     lr,
+       struct xfs_btree_block *block)
 {
-       xfs_btree_block_t       *b;     /* btree block */
-       xfs_buf_t               *obp;   /* old buffer pointer */
+       int                     rval = 0;
+       xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+       xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+
+       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+               xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+                                    left, 1);
+               rval++;
+       }
+
+       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+               xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+                                    right, 1);
+               rval++;
+       }
+
+       return rval;
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+STATIC int
+xfs_btree_readahead(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     lev,            /* level in btree */
+       int                     lr)             /* left/right bits */
+{
+       struct xfs_btree_block  *block;
+
+       /*
+        * No readahead needed if we are at the root level and the
+        * btree root is stored in the inode.
+        */
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (lev == cur->bc_nlevels - 1))
+               return 0;
+
+       if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+               return 0;
+
+       cur->bc_ra[lev] |= lr;
+       block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return xfs_btree_readahead_lblock(cur, lr, block);
+       return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+void
+xfs_btree_setbuf(
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     lev,    /* level in btree */
+       xfs_buf_t               *bp)    /* new buffer to set */
+{
+       struct xfs_btree_block  *b;     /* btree block */
+       xfs_buf_t               *obp;   /* old buffer pointer */
 
        obp = cur->bc_bufs[lev];
        if (obp)
@@ -900,7 +824,7 @@ xfs_btree_setbuf(
        if (!bp)
                return;
        b = XFS_BUF_TO_BLOCK(bp);
-       if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
                if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
                if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
@@ -912,3 +836,2855 @@ xfs_btree_setbuf(
                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
        }
 }
+
+STATIC int
+xfs_btree_ptr_is_null(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return be64_to_cpu(ptr->l) == NULLFSBLOCK;
+       else
+               return be32_to_cpu(ptr->s) == NULLAGBLOCK;
+}
+
+STATIC void
+xfs_btree_set_ptr_null(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               ptr->l = cpu_to_be64(NULLFSBLOCK);
+       else
+               ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+STATIC void
+xfs_btree_get_sibling(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_ptr     *ptr,
+       int                     lr)
+{
+       ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (lr == XFS_BB_RIGHTSIB)
+                       ptr->l = block->bb_u.l.bb_rightsib;
+               else
+                       ptr->l = block->bb_u.l.bb_leftsib;
+       } else {
+               if (lr == XFS_BB_RIGHTSIB)
+                       ptr->s = block->bb_u.s.bb_rightsib;
+               else
+                       ptr->s = block->bb_u.s.bb_leftsib;
+       }
+}
+
+STATIC void
+xfs_btree_set_sibling(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_ptr     *ptr,
+       int                     lr)
+{
+       ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (lr == XFS_BB_RIGHTSIB)
+                       block->bb_u.l.bb_rightsib = ptr->l;
+               else
+                       block->bb_u.l.bb_leftsib = ptr->l;
+       } else {
+               if (lr == XFS_BB_RIGHTSIB)
+                       block->bb_u.s.bb_rightsib = ptr->s;
+               else
+                       block->bb_u.s.bb_leftsib = ptr->s;
+       }
+}
+
+STATIC void
+xfs_btree_init_block(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     numrecs,
+       struct xfs_btree_block  *new)   /* new block */
+{
+       new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
+       new->bb_level = cpu_to_be16(level);
+       new->bb_numrecs = cpu_to_be16(numrecs);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+               new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+       } else {
+               new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+       }
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updateÑ• to this record.  The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       int                     level)
+{
+       union xfs_btree_ptr     ptr;
+
+       if (level > 0)
+               return 0;
+       if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+               return 0;
+
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &ptr))
+               return 0;
+       return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+                                       XFS_BUF_ADDR(bp)));
+       else {
+               ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
+                                       XFS_BUF_ADDR(bp)));
+       }
+}
+
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
+
+               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+       } else {
+               ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+               ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK);
+
+               return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+                                       be32_to_cpu(ptr->s));
+       }
+}
+
+STATIC void
+xfs_btree_set_refs(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       switch (cur->bc_btnum) {
+       case XFS_BTNUM_BNO:
+       case XFS_BTNUM_CNT:
+               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+               break;
+       case XFS_BTNUM_INO:
+               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+               break;
+       case XFS_BTNUM_BMAP:
+               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+               break;
+       default:
+               ASSERT(0);
+       }
+}
+
+STATIC int
+xfs_btree_get_buf_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     flags,
+       struct xfs_btree_block  **block,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       xfs_daddr_t             d;
+
+       /* need to sort out how callers deal with failures first */
+       ASSERT(!(flags & XFS_BUF_TRYLOCK));
+
+       d = xfs_btree_ptr_to_daddr(cur, ptr);
+       *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
+                                mp->m_bsize, flags);
+
+       ASSERT(*bpp);
+       ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+       *block = XFS_BUF_TO_BLOCK(*bpp);
+       return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     level,
+       int                     flags,
+       struct xfs_btree_block  **block,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       xfs_daddr_t             d;
+       int                     error;
+
+       /* need to sort out how callers deal with failures first */
+       ASSERT(!(flags & XFS_BUF_TRYLOCK));
+
+       d = xfs_btree_ptr_to_daddr(cur, ptr);
+       error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+                                  mp->m_bsize, flags, bpp);
+       if (error)
+               return error;
+
+       ASSERT(*bpp != NULL);
+       ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+       xfs_btree_set_refs(cur, *bpp);
+       *block = XFS_BUF_TO_BLOCK(*bpp);
+
+       error = xfs_btree_check_block(cur, *block, level, *bpp);
+       if (error)
+               xfs_trans_brelse(cur->bc_tp, *bpp);
+       return error;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_keys(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *dst_key,
+       union xfs_btree_key     *src_key,
+       int                     numkeys)
+{
+       ASSERT(numkeys >= 0);
+       memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *dst_rec,
+       union xfs_btree_rec     *src_rec,
+       int                     numrecs)
+{
+       ASSERT(numrecs >= 0);
+       memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *dst_ptr,
+       union xfs_btree_ptr     *src_ptr,
+       int                     numptrs)
+{
+       ASSERT(numptrs >= 0);
+       memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       int                     dir,
+       int                     numkeys)
+{
+       char                    *dst_key;
+
+       ASSERT(numkeys >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+       memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       int                     dir,
+       int                     numrecs)
+{
+       char                    *dst_rec;
+
+       ASSERT(numrecs >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+       memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     dir,
+       int                     numptrs)
+{
+       char                    *dst_ptr;
+
+       ASSERT(numptrs >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+       memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     first,
+       int                     last)
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       if (bp) {
+               xfs_trans_log_buf(cur->bc_tp, bp,
+                                 xfs_btree_key_offset(cur, first),
+                                 xfs_btree_key_offset(cur, last + 1) - 1);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                               xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     first,
+       int                     last)
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       xfs_trans_log_buf(cur->bc_tp, bp,
+                         xfs_btree_rec_offset(cur, first),
+                         xfs_btree_rec_offset(cur, last + 1) - 1);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_buf          *bp,    /* buffer containing btree block */
+       int                     first,  /* index of first pointer to log */
+       int                     last)   /* index of last pointer to log */
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       if (bp) {
+               struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+               int                     level = xfs_btree_get_level(block);
+
+               xfs_trans_log_buf(cur->bc_tp, bp,
+                               xfs_btree_ptr_offset(cur, first, level),
+                               xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                       xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_buf          *bp,    /* buffer containing btree block */
+       int                     fields) /* mask of fields: XFS_BB_... */
+{
+       int                     first;  /* first byte offset logged */
+       int                     last;   /* last byte offset logged */
+       static const short      soffsets[] = {  /* table of offsets (short) */
+               offsetof(struct xfs_btree_block, bb_magic),
+               offsetof(struct xfs_btree_block, bb_level),
+               offsetof(struct xfs_btree_block, bb_numrecs),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+               XFS_BTREE_SBLOCK_LEN
+       };
+       static const short      loffsets[] = {  /* table of offsets (long) */
+               offsetof(struct xfs_btree_block, bb_magic),
+               offsetof(struct xfs_btree_block, bb_level),
+               offsetof(struct xfs_btree_block, bb_numrecs),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+               XFS_BTREE_LBLOCK_LEN
+       };
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+       if (bp) {
+               xfs_btree_offsets(fields,
+                                 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+                                       loffsets : soffsets,
+                                 XFS_BB_NUM_BITS, &first, &last);
+               xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                       xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int                                            /* error */
+xfs_btree_increment(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       struct xfs_btree_block  *block;
+       union xfs_btree_ptr     ptr;
+       struct xfs_buf          *bp;
+       int                     error;          /* error return value */
+       int                     lev;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       ASSERT(level < cur->bc_nlevels);
+
+       /* Read-ahead to the right at this level. */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+       /* Get a pointer to the btree block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* We're done if we remain in the block after the increment. */
+       if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+               goto out1;
+
+       /* Fail if we just went off the right edge of the tree. */
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &ptr))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, increment);
+
+       /*
+        * March up the tree incrementing pointers.
+        * Stop when we don't go off the right edge of a block.
+        */
+       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+               block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, block, lev, bp);
+               if (error)
+                       goto error0;
+#endif
+
+               if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+                       break;
+
+               /* Read-ahead the right block for the next loop. */
+               xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+       }
+
+       /*
+        * If we went off the root then we are either seriously
+        * confused or have the tree root in an inode.
+        */
+       if (lev == cur->bc_nlevels) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+                       goto out0;
+               ASSERT(0);
+               error = EFSCORRUPTED;
+               goto error0;
+       }
+       ASSERT(lev < cur->bc_nlevels);
+
+       /*
+        * Now walk back down the tree, fixing up the cursor's buffer
+        * pointers and key numbers.
+        */
+       for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+               union xfs_btree_ptr     *ptrp;
+
+               ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+               error = xfs_btree_read_buf_block(cur, ptrp, --lev,
+                                                       0, &block, &bp);
+               if (error)
+                       goto error0;
+
+               xfs_btree_setbuf(cur, lev, bp);
+               cur->bc_ptrs[lev] = 1;
+       }
+out1:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int                                            /* error */
+xfs_btree_decrement(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       struct xfs_btree_block  *block;
+       xfs_buf_t               *bp;
+       int                     error;          /* error return value */
+       int                     lev;
+       union xfs_btree_ptr     ptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       ASSERT(level < cur->bc_nlevels);
+
+       /* Read-ahead to the left at this level. */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+       /* We're done if we remain in the block after the decrement. */
+       if (--cur->bc_ptrs[level] > 0)
+               goto out1;
+
+       /* Get a pointer to the btree block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* Fail if we just went off the left edge of the tree. */
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+       if (xfs_btree_ptr_is_null(cur, &ptr))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, decrement);
+
+       /*
+        * March up the tree decrementing pointers.
+        * Stop when we don't go off the left edge of a block.
+        */
+       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+               if (--cur->bc_ptrs[lev] > 0)
+                       break;
+               /* Read-ahead the left block for the next loop. */
+               xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+       }
+
+       /*
+        * If we went off the root then we are seriously confused.
+        * or the root of the tree is in an inode.
+        */
+       if (lev == cur->bc_nlevels) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+                       goto out0;
+               ASSERT(0);
+               error = EFSCORRUPTED;
+               goto error0;
+       }
+       ASSERT(lev < cur->bc_nlevels);
+
+       /*
+        * Now walk back down the tree, fixing up the cursor's buffer
+        * pointers and key numbers.
+        */
+       for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+               union xfs_btree_ptr     *ptrp;
+
+               ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+               error = xfs_btree_read_buf_block(cur, ptrp, --lev,
+                                                       0, &block, &bp);
+               if (error)
+                       goto error0;
+               xfs_btree_setbuf(cur, lev, bp);
+               cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+       }
+out1:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+STATIC int
+xfs_btree_lookup_get_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* level in the btree */
+       union xfs_btree_ptr     *pp,    /* ptr to btree block */
+       struct xfs_btree_block  **blkp) /* return btree block */
+{
+       struct xfs_buf          *bp;    /* buffer pointer for btree block */
+       int                     error = 0;
+
+       /* special case the root block if in an inode */
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1)) {
+               *blkp = xfs_btree_get_iroot(cur);
+               return 0;
+       }
+
+       /*
+        * If the old buffer at this level for the disk address we are
+        * looking for re-use it.
+        *
+        * Otherwise throw it away and get a new one.
+        */
+       bp = cur->bc_bufs[level];
+       if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+               *blkp = XFS_BUF_TO_BLOCK(bp);
+               return 0;
+       }
+
+       error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
+       if (error)
+               return error;
+
+       xfs_btree_setbuf(cur, level, bp);
+       return 0;
+}
+
+/*
+ * Get current search key.  For level 0 we don't actually have a key
+ * structure so we make one up from the record.  For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     keyno,
+       struct xfs_btree_block  *block,
+       union xfs_btree_key     *kp)
+{
+       if (level == 0) {
+               cur->bc_ops->init_key_from_rec(kp,
+                               xfs_btree_rec_addr(cur, keyno, block));
+               return kp;
+       }
+
+       return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * Return 0 if can't find any such record, 1 for success.
+ */
+int                                    /* error */
+xfs_btree_lookup(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_lookup_t            dir,    /* <=, ==, or >= */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* current btree block */
+       __int64_t               diff;   /* difference for the current key */
+       int                     error;  /* error return value */
+       int                     keyno;  /* current key number */
+       int                     level;  /* level in the btree */
+       union xfs_btree_ptr     *pp;    /* ptr to btree block */
+       union xfs_btree_ptr     ptr;    /* ptr to btree block */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, dir);
+
+       XFS_BTREE_STATS_INC(cur, lookup);
+
+       block = NULL;
+       keyno = 0;
+
+       /* initialise start pointer from cursor */
+       cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+       pp = &ptr;
+
+       /*
+        * Iterate over each level in the btree, starting at the root.
+        * For each level above the leaves, find the key we need, based
+        * on the lookup record, then follow the corresponding block
+        * pointer down to the next level.
+        */
+       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+               /* Get the block we need to do the lookup on. */
+               error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+               if (error)
+                       goto error0;
+
+               if (diff == 0) {
+                       /*
+                        * If we already had a key match at a higher level, we
+                        * know we need to use the first entry in this block.
+                        */
+                       keyno = 1;
+               } else {
+                       /* Otherwise search this block. Do a binary search. */
+
+                       int     high;   /* high entry number */
+                       int     low;    /* low entry number */
+
+                       /* Set low and high entry numbers, 1-based. */
+                       low = 1;
+                       high = xfs_btree_get_numrecs(block);
+                       if (!high) {
+                               /* Block is empty, must be an empty leaf. */
+                               ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+                               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                               *stat = 0;
+                               return 0;
+                       }
+
+                       /* Binary search the block. */
+                       while (low <= high) {
+                               union xfs_btree_key     key;
+                               union xfs_btree_key     *kp;
+
+                               XFS_BTREE_STATS_INC(cur, compare);
+
+                               /* keyno is average of low and high. */
+                               keyno = (low + high) >> 1;
+
+                               /* Get current search key */
+                               kp = xfs_lookup_get_search_key(cur, level,
+                                               keyno, block, &key);
+
+                               /*
+                                * Compute difference to get next direction:
+                                *  - less than, move right
+                                *  - greater than, move left
+                                *  - equal, we're done
+                                */
+                               diff = cur->bc_ops->key_diff(cur, kp);
+                               if (diff < 0)
+                                       low = keyno + 1;
+                               else if (diff > 0)
+                                       high = keyno - 1;
+                               else
+                                       break;
+                       }
+               }
+
+               /*
+                * If there are more levels, set up for the next level
+                * by getting the block number and filling in the cursor.
+                */
+               if (level > 0) {
+                       /*
+                        * If we moved left, need the previous key number,
+                        * unless there isn't one.
+                        */
+                       if (diff > 0 && --keyno < 1)
+                               keyno = 1;
+                       pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+                       error = xfs_btree_check_ptr(cur, pp, 0, level);
+                       if (error)
+                               goto error0;
+#endif
+                       cur->bc_ptrs[level] = keyno;
+               }
+       }
+
+       /* Done with the search. See if we need to adjust the results. */
+       if (dir != XFS_LOOKUP_LE && diff < 0) {
+               keyno++;
+               /*
+                * If ge search and we went off the end of the block, but it's
+                * not the last block, we're in the wrong block.
+                */
+               xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+               if (dir == XFS_LOOKUP_GE &&
+                   keyno > xfs_btree_get_numrecs(block) &&
+                   !xfs_btree_ptr_is_null(cur, &ptr)) {
+                       int     i;
+
+                       cur->bc_ptrs[0] = keyno;
+                       error = xfs_btree_increment(cur, 0, &i);
+                       if (error)
+                               goto error0;
+                       XFS_WANT_CORRUPTED_RETURN(i == 1);
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                       *stat = 1;
+                       return 0;
+               }
+       } else if (dir == XFS_LOOKUP_LE && diff > 0)
+               keyno--;
+       cur->bc_ptrs[0] = keyno;
+
+       /* Return if we succeeded or not. */
+       if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+               *stat = 0;
+       else if (dir != XFS_LOOKUP_EQ || diff == 0)
+               *stat = 1;
+       else
+               *stat = 0;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *keyp,
+       int                     level)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       union xfs_btree_key     *kp;
+       int                     ptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+       ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+       /*
+        * Go up the tree from this level toward the root.
+        * At each level, update the key value to the value input.
+        * Stop when we reach a level where the cursor isn't pointing
+        * at the first entry in the block.
+        */
+       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+               int             error;
+#endif
+               block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, block, level, bp);
+               if (error) {
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                       return error;
+               }
+#endif
+               ptr = cur->bc_ptrs[level];
+               kp = xfs_btree_key_addr(cur, ptr, block);
+               xfs_btree_copy_keys(cur, kp, keyp, 1);
+               xfs_btree_log_keys(cur, bp, ptr, ptr);
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       int                     error;
+       int                     ptr;
+       union xfs_btree_rec     *rp;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGR(cur, rec);
+
+       /* Pick up the current block. */
+       block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, 0, bp);
+       if (error)
+               goto error0;
+#endif
+       /* Get the address of the rec to be updated. */
+       ptr = cur->bc_ptrs[0];
+       rp = xfs_btree_rec_addr(cur, ptr, block);
+
+       /* Fill in the new contents and log them. */
+       xfs_btree_copy_recs(cur, rp, rec, 1);
+       xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, 0)) {
+               cur->bc_ops->update_lastrec(cur, block, rec,
+                                           ptr, LASTREC_UPDATE);
+       }
+
+       /* Updating first rec in leaf. Pass new key value up to our parent. */
+       if (ptr == 1) {
+               union xfs_btree_key     key;
+
+               cur->bc_ops->init_key_from_rec(&key, rec);
+               error = xfs_btree_updkey(cur, &key, 1);
+               if (error)
+                       goto error0;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int                                     /* error */
+xfs_btree_lshift(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_key     key;            /* btree key */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       int                     lrecs;          /* left record count */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       int                     rrecs;          /* right record count */
+       union xfs_btree_ptr     lptr;           /* left btree pointer */
+       union xfs_btree_key     *rkp = NULL;    /* right btree key */
+       union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
+       union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
+       int                     error;          /* error return value */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           level == cur->bc_nlevels - 1)
+               goto out0;
+
+       /* Set up variables for this block as "right". */
+       right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, right, level, rbp);
+       if (error)
+               goto error0;
+#endif
+
+       /* If we've got no left sibling then we can't shift an entry left. */
+       xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+       if (xfs_btree_ptr_is_null(cur, &lptr))
+               goto out0;
+
+       /*
+        * If the cursor entry is the one that would be moved, don't
+        * do it... it's too complicated.
+        */
+       if (cur->bc_ptrs[level] <= 1)
+               goto out0;
+
+       /* Set up the left neighbor as "left". */
+       error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
+       if (error)
+               goto error0;
+
+       /* If it's full, it can't take another entry. */
+       lrecs = xfs_btree_get_numrecs(left);
+       if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+               goto out0;
+
+       rrecs = xfs_btree_get_numrecs(right);
+
+       /*
+        * We add one entry to the left side and remove one for the right side.
+        * Accout for it here, the changes will be updated on disk and logged
+        * later.
+        */
+       lrecs++;
+       rrecs--;
+
+       XFS_BTREE_STATS_INC(cur, lshift);
+       XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+       /*
+        * If non-leaf, copy a key and a ptr to the left block.
+        * Log the changes to the left block.
+        */
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+
+               lkp = xfs_btree_key_addr(cur, lrecs, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+
+               lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, rpp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+               xfs_btree_copy_keys(cur, lkp, rkp, 1);
+               xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+               xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+               xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+               ASSERT(cur->bc_ops->keys_inorder(cur,
+                       xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, lrecs, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, lrp, rrp, 1);
+               xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+               ASSERT(cur->bc_ops->recs_inorder(cur,
+                       xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+       }
+
+       xfs_btree_set_numrecs(left, lrecs);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+       xfs_btree_set_numrecs(right, rrecs);
+       xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+       /*
+        * Slide the contents of right down one entry.
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+       if (level > 0) {
+               /* It's a nonleaf. operate on keys and ptrs */
+#ifdef DEBUG
+               int                     i;              /* loop index */
+
+               for (i = 0; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+               xfs_btree_shift_keys(cur,
+                               xfs_btree_key_addr(cur, 2, right),
+                               -1, rrecs);
+               xfs_btree_shift_ptrs(cur,
+                               xfs_btree_ptr_addr(cur, 2, right),
+                               -1, rrecs);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+       } else {
+               /* It's a leaf. operate on records */
+               xfs_btree_shift_recs(cur,
+                       xfs_btree_rec_addr(cur, 2, right),
+                       -1, rrecs);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+               /*
+                * If it's the first record in the block, we'll need a key
+                * structure to pass up to the next level (updkey).
+                */
+               cur->bc_ops->init_key_from_rec(&key,
+                       xfs_btree_rec_addr(cur, 1, right));
+               rkp = &key;
+       }
+
+       /* Update the parent key values of right. */
+       error = xfs_btree_updkey(cur, rkp, level + 1);
+       if (error)
+               goto error0;
+
+       /* Slide the cursor value left one. */
+       cur->bc_ptrs[level]--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int                                     /* error */
+xfs_btree_rshift(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_key     key;            /* btree key */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
+       union xfs_btree_ptr     rptr;           /* right block pointer */
+       union xfs_btree_key     *rkp;           /* right btree key */
+       int                     rrecs;          /* right record count */
+       int                     lrecs;          /* left record count */
+       int                     error;          /* error return value */
+       int                     i;              /* loop counter */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1))
+               goto out0;
+
+       /* Set up variables for this block as "left". */
+       left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, left, level, lbp);
+       if (error)
+               goto error0;
+#endif
+
+       /* If we've got no right sibling then we can't shift an entry right. */
+       xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &rptr))
+               goto out0;
+
+       /*
+        * If the cursor entry is the one that would be moved, don't
+        * do it... it's too complicated.
+        */
+       lrecs = xfs_btree_get_numrecs(left);
+       if (cur->bc_ptrs[level] >= lrecs)
+               goto out0;
+
+       /* Set up the right neighbor as "right". */
+       error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
+       if (error)
+               goto error0;
+
+       /* If it's full, it can't take another entry. */
+       rrecs = xfs_btree_get_numrecs(right);
+       if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, rshift);
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+       /*
+        * Make a hole at the start of the right neighbor block, then
+        * copy the last left block entry to the hole.
+        */
+       if (level > 0) {
+               /* It's a nonleaf. make a hole in the keys and ptrs */
+               union xfs_btree_key     *lkp;
+               union xfs_btree_ptr     *lpp;
+               union xfs_btree_ptr     *rpp;
+
+               lkp = xfs_btree_key_addr(cur, lrecs, left);
+               lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+               for (i = rrecs - 1; i >= 0; i--) {
+                       error = xfs_btree_check_ptr(cur, rpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+               xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, lpp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+
+               /* Now put the new data in, and log it. */
+               xfs_btree_copy_keys(cur, rkp, lkp, 1);
+               xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+               ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+                       xfs_btree_key_addr(cur, 2, right)));
+       } else {
+               /* It's a leaf. make a hole in the records */
+               union xfs_btree_rec     *lrp;
+               union xfs_btree_rec     *rrp;
+
+               lrp = xfs_btree_rec_addr(cur, lrecs, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+               /* Now put the new data in, and log it. */
+               xfs_btree_copy_recs(cur, rrp, lrp, 1);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+
+               cur->bc_ops->init_key_from_rec(&key, rrp);
+               rkp = &key;
+
+               ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+                       xfs_btree_rec_addr(cur, 2, right)));
+       }
+
+       /*
+        * Decrement and log left's numrecs, bump and log right's numrecs.
+        */
+       xfs_btree_set_numrecs(left, --lrecs);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+       xfs_btree_set_numrecs(right, ++rrecs);
+       xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+       /*
+        * Using a temporary cursor, update the parent key values of the
+        * block on the right.
+        */
+       error = xfs_btree_dup_cursor(cur, &tcur);
+       if (error)
+               goto error0;
+       i = xfs_btree_lastrec(tcur, level);
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+       error = xfs_btree_increment(tcur, level, &i);
+       if (error)
+               goto error1;
+
+       error = xfs_btree_updkey(tcur, rkp, level + 1);
+       if (error)
+               goto error1;
+
+       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+
+error1:
+       XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+       return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int                                     /* error */
+xfs_btree_split(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       union xfs_btree_ptr     *ptrp,
+       union xfs_btree_key     *key,
+       struct xfs_btree_cur    **curp,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_ptr     lptr;           /* left sibling block ptr */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       union xfs_btree_ptr     rptr;           /* right sibling block ptr */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
+       struct xfs_buf          *rrbp;          /* right-right buffer pointer */
+       struct xfs_btree_block  *rrblock;       /* right-right btree block */
+       int                     lrecs;
+       int                     rrecs;
+       int                     src_index;
+       int                     error;          /* error return value */
+#ifdef DEBUG
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+
+       XFS_BTREE_STATS_INC(cur, split);
+
+       /* Set up left block (current one). */
+       left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, left, level, lbp);
+       if (error)
+               goto error0;
+#endif
+
+       xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0)
+               goto out0;
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Set up the new block as "right". */
+       error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+       if (error)
+               goto error0;
+
+       /* Fill in the btree header for the new right block. */
+       xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
+
+       /*
+        * Split the entries between the old and the new block evenly.
+        * Make sure that if there's an odd number of entries now, that
+        * each new block will have the same number of entries.
+        */
+       lrecs = xfs_btree_get_numrecs(left);
+       rrecs = lrecs / 2;
+       if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+               rrecs++;
+       src_index = (lrecs - rrecs + 1);
+
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+       /*
+        * Copy btree block entries from the left block over to the
+        * new block, the right. Update the right block and log the
+        * changes.
+        */
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+               union xfs_btree_key     *rkp;   /* right btree key */
+               union xfs_btree_ptr     *rpp;   /* right address pointer */
+
+               lkp = xfs_btree_key_addr(cur, src_index, left);
+               lpp = xfs_btree_ptr_addr(cur, src_index, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+               for (i = src_index; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, lpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+               xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+               /* Grab the keys to the entries moved to the right block */
+               xfs_btree_copy_keys(cur, key, rkp, 1);
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+               union xfs_btree_rec     *rrp;   /* right record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, src_index, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+               cur->bc_ops->init_key_from_rec(key,
+                       xfs_btree_rec_addr(cur, 1, right));
+       }
+
+
+       /*
+        * Find the left block number by looking in the buffer.
+        * Adjust numrecs, sibling pointers.
+        */
+       xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+       xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+       xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+       xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+       lrecs -= rrecs;
+       xfs_btree_set_numrecs(left, lrecs);
+       xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+       xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+       /*
+        * If there's a block to the new block's right, make that block
+        * point back to right instead of to left.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+               error = xfs_btree_read_buf_block(cur, &rrptr, level,
+                                                       0, &rrblock, &rrbp);
+               if (error)
+                       goto error0;
+               xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+               xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+       }
+       /*
+        * If the cursor is really in the right block, move it there.
+        * If it's just pointing past the last entry in left, then we'll
+        * insert there, so don't change anything in that case.
+        */
+       if (cur->bc_ptrs[level] > lrecs + 1) {
+               xfs_btree_setbuf(cur, level, rbp);
+               cur->bc_ptrs[level] -= lrecs;
+       }
+       /*
+        * If there are more levels, we'll need another cursor which refers
+        * the right block, no matter where this cursor was.
+        */
+       if (level + 1 < cur->bc_nlevels) {
+               error = xfs_btree_dup_cursor(cur, curp);
+               if (error)
+                       goto error0;
+               (*curp)->bc_ptrs[level + 1]++;
+       }
+       *ptrp = rptr;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int                                            /* error */
+xfs_btree_new_iroot(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     *logflags,      /* logging flags for inode */
+       int                     *stat)          /* return status - 0 fail */
+{
+       struct xfs_buf          *cbp;           /* buffer for cblock */
+       struct xfs_btree_block  *block;         /* btree block */
+       struct xfs_btree_block  *cblock;        /* child btree block */
+       union xfs_btree_key     *ckp;           /* child key pointer */
+       union xfs_btree_ptr     *cpp;           /* child ptr pointer */
+       union xfs_btree_key     *kp;            /* pointer to btree key */
+       union xfs_btree_ptr     *pp;            /* pointer to block addr */
+       union xfs_btree_ptr     nptr;           /* new block addr */
+       int                     level;          /* btree level */
+       int                     error;          /* error return code */
+#ifdef DEBUG
+       int                     i;              /* loop counter */
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, newroot);
+
+       ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+       level = cur->bc_nlevels - 1;
+
+       block = xfs_btree_get_iroot(cur);
+       pp = xfs_btree_ptr_addr(cur, 1, block);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               return 0;
+       }
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Copy the root into a real block. */
+       error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+       if (error)
+               goto error0;
+
+       memcpy(cblock, block, xfs_btree_block_len(cur));
+
+       be16_add_cpu(&block->bb_level, 1);
+       xfs_btree_set_numrecs(block, 1);
+       cur->bc_nlevels++;
+       cur->bc_ptrs[level + 1] = 1;
+
+       kp = xfs_btree_key_addr(cur, 1, block);
+       ckp = xfs_btree_key_addr(cur, 1, cblock);
+       xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+       cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+       for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+               error = xfs_btree_check_ptr(cur, pp, i, level);
+               if (error)
+                       goto error0;
+       }
+#endif
+       xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+       error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+       if (error)
+               goto error0;
+#endif
+       xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
+       xfs_iroot_realloc(cur->bc_private.b.ip,
+                         1 - xfs_btree_get_numrecs(cblock),
+                         cur->bc_private.b.whichfork);
+
+       xfs_btree_setbuf(cur, level, cbp);
+
+       /*
+        * Do all this logging at the end so that
+        * the root is at the right level.
+        */
+       xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+       xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+       xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+       *logflags |=
+               XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
+       *stat = 1;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int                             /* error */
+xfs_btree_new_root(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* one half of the old root block */
+       struct xfs_buf          *bp;    /* buffer containing block */
+       int                     error;  /* error return value */
+       struct xfs_buf          *lbp;   /* left buffer pointer */
+       struct xfs_btree_block  *left;  /* left btree block */
+       struct xfs_buf          *nbp;   /* new (root) buffer */
+       struct xfs_btree_block  *new;   /* new (root) btree block */
+       int                     nptr;   /* new value for key index, 1 or 2 */
+       struct xfs_buf          *rbp;   /* right buffer pointer */
+       struct xfs_btree_block  *right; /* right btree block */
+       union xfs_btree_ptr     rptr;
+       union xfs_btree_ptr     lptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, newroot);
+
+       /* initialise our start point from the cursor */
+       cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0)
+               goto out0;
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Set up the new block. */
+       error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+       if (error)
+               goto error0;
+
+       /* Set the root in the holding structure  increasing the level by 1. */
+       cur->bc_ops->set_root(cur, &lptr, 1);
+
+       /*
+        * At the previous root level there are now two blocks: the old root,
+        * and the new block generated when it was split.  We don't know which
+        * one the cursor is pointing at, so we set up variables "left" and
+        * "right" for each case.
+        */
+       block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+       if (error)
+               goto error0;
+#endif
+
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+               /* Our block is left, pick up the right block. */
+               lbp = bp;
+               xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+               left = block;
+               error = xfs_btree_read_buf_block(cur, &rptr,
+                                       cur->bc_nlevels - 1, 0, &right, &rbp);
+               if (error)
+                       goto error0;
+               bp = rbp;
+               nptr = 1;
+       } else {
+               /* Our block is right, pick up the left block. */
+               rbp = bp;
+               xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+               right = block;
+               xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+               error = xfs_btree_read_buf_block(cur, &lptr,
+                                       cur->bc_nlevels - 1, 0, &left, &lbp);
+               if (error)
+                       goto error0;
+               bp = lbp;
+               nptr = 2;
+       }
+       /* Fill in the new block's btree header and log it. */
+       xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
+       xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+       ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+                       !xfs_btree_ptr_is_null(cur, &rptr));
+
+       /* Fill in the key data in the new root. */
+       if (xfs_btree_get_level(left) > 0) {
+               xfs_btree_copy_keys(cur,
+                               xfs_btree_key_addr(cur, 1, new),
+                               xfs_btree_key_addr(cur, 1, left), 1);
+               xfs_btree_copy_keys(cur,
+                               xfs_btree_key_addr(cur, 2, new),
+                               xfs_btree_key_addr(cur, 1, right), 1);
+       } else {
+               cur->bc_ops->init_key_from_rec(
+                               xfs_btree_key_addr(cur, 1, new),
+                               xfs_btree_rec_addr(cur, 1, left));
+               cur->bc_ops->init_key_from_rec(
+                               xfs_btree_key_addr(cur, 2, new),
+                               xfs_btree_rec_addr(cur, 1, right));
+       }
+       xfs_btree_log_keys(cur, nbp, 1, 2);
+
+       /* Fill in the pointer data in the new root. */
+       xfs_btree_copy_ptrs(cur,
+               xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+       xfs_btree_copy_ptrs(cur,
+               xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+       xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+       /* Fix up the cursor. */
+       xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+       cur->bc_ptrs[cur->bc_nlevels] = nptr;
+       cur->bc_nlevels++;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+}
+
+STATIC int
+xfs_btree_make_block_unfull(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* btree level */
+       int                     numrecs,/* # of recs in block */
+       int                     *oindex,/* old tree index */
+       int                     *index, /* new tree index */
+       union xfs_btree_ptr     *nptr,  /* new btree ptr */
+       struct xfs_btree_cur    **ncur, /* new btree cursor */
+       union xfs_btree_rec     *nrec,  /* new record */
+       int                     *stat)
+{
+       union xfs_btree_key     key;    /* new btree key value */
+       int                     error = 0;
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           level == cur->bc_nlevels - 1) {
+               struct xfs_inode *ip = cur->bc_private.b.ip;
+
+               if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+                       /* A root block that can be made bigger. */
+
+                       xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+               } else {
+                       /* A root block that needs replacing */
+                       int     logflags = 0;
+
+                       error = xfs_btree_new_iroot(cur, &logflags, stat);
+                       if (error || *stat == 0)
+                               return error;
+
+                       xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+               }
+
+               return 0;
+       }
+
+       /* First, try shifting an entry to the right neighbor. */
+       error = xfs_btree_rshift(cur, level, stat);
+       if (error || *stat)
+               return error;
+
+       /* Next, try shifting an entry to the left neighbor. */
+       error = xfs_btree_lshift(cur, level, stat);
+       if (error)
+               return error;
+
+       if (*stat) {
+               *oindex = *index = cur->bc_ptrs[level];
+               return 0;
+       }
+
+       /*
+        * Next, try splitting the current block in half.
+        *
+        * If this works we have to re-set our variables because we
+        * could be in a different block now.
+        */
+       error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+       if (error || *stat == 0)
+               return error;
+
+
+       *index = cur->bc_ptrs[level];
+       cur->bc_ops->init_rec_from_key(&key, nrec);
+       return 0;
+}
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* level to insert record at */
+       union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
+       union xfs_btree_rec     *recp,  /* i/o: record data inserted */
+       struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* btree block */
+       struct xfs_buf          *bp;    /* buffer for block */
+       union xfs_btree_key     key;    /* btree key */
+       union xfs_btree_ptr     nptr;   /* new block ptr */
+       struct xfs_btree_cur    *ncur;  /* new btree cursor */
+       union xfs_btree_rec     nrec;   /* new record count */
+       int                     optr;   /* old key/record index */
+       int                     ptr;    /* key/record index */
+       int                     numrecs;/* number of records */
+       int                     error;  /* error return value */
+#ifdef DEBUG
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+
+       ncur = NULL;
+
+       /*
+        * If we have an external root pointer, and we've made it to the
+        * root level, allocate a new root block and we're done.
+        */
+       if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level >= cur->bc_nlevels)) {
+               error = xfs_btree_new_root(cur, stat);
+               xfs_btree_set_ptr_null(cur, ptrp);
+
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               return error;
+       }
+
+       /* If we're off the left edge, return failure. */
+       ptr = cur->bc_ptrs[level];
+       if (ptr == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       /* Make a key out of the record data to be inserted, and save it. */
+       cur->bc_ops->init_key_from_rec(&key, recp);
+
+       optr = ptr;
+
+       XFS_BTREE_STATS_INC(cur, insrec);
+
+       /* Get pointers to the btree buffer and block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+
+       /* Check that the new entry is being inserted in the right place. */
+       if (ptr <= numrecs) {
+               if (level == 0) {
+                       ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+                               xfs_btree_rec_addr(cur, ptr, block)));
+               } else {
+                       ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+                               xfs_btree_key_addr(cur, ptr, block)));
+               }
+       }
+#endif
+
+       /*
+        * If the block is full, we can't insert the new entry until we
+        * make the block un-full.
+        */
+       xfs_btree_set_ptr_null(cur, &nptr);
+       if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+               error = xfs_btree_make_block_unfull(cur, level, numrecs,
+                                       &optr, &ptr, &nptr, &ncur, &nrec, stat);
+               if (error || *stat == 0)
+                       goto error0;
+       }
+
+       /*
+        * The current block may have changed if the block was
+        * previously full and we have just made space in it.
+        */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               return error;
+#endif
+
+       /*
+        * At this point we know there's room for our new entry in the block
+        * we're pointing at.
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+       if (level > 0) {
+               /* It's a nonleaf. make a hole in the keys and ptrs */
+               union xfs_btree_key     *kp;
+               union xfs_btree_ptr     *pp;
+
+               kp = xfs_btree_key_addr(cur, ptr, block);
+               pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+#ifdef DEBUG
+               for (i = numrecs - ptr; i >= 0; i--) {
+                       error = xfs_btree_check_ptr(cur, pp, i, level);
+                       if (error)
+                               return error;
+               }
+#endif
+
+               xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+               xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+
+               /* Now put the new data in, bump numrecs and log it. */
+               xfs_btree_copy_keys(cur, kp, &key, 1);
+               xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+               numrecs++;
+               xfs_btree_set_numrecs(block, numrecs);
+               xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+               xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+               if (ptr < numrecs) {
+                       ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+                               xfs_btree_key_addr(cur, ptr + 1, block)));
+               }
+#endif
+       } else {
+               /* It's a leaf. make a hole in the records */
+               union xfs_btree_rec             *rp;
+
+               rp = xfs_btree_rec_addr(cur, ptr, block);
+
+               xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+               /* Now put the new data in, bump numrecs and log it. */
+               xfs_btree_copy_recs(cur, rp, recp, 1);
+               xfs_btree_set_numrecs(block, ++numrecs);
+               xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+               if (ptr < numrecs) {
+                       ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+                               xfs_btree_rec_addr(cur, ptr + 1, block)));
+               }
+#endif
+       }
+
+       /* Log the new number of records in the btree header. */
+       xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+       /* If we inserted at the start of a block, update the parents' keys. */
+       if (optr == 1) {
+               error = xfs_btree_updkey(cur, &key, level + 1);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, level)) {
+               cur->bc_ops->update_lastrec(cur, block, recp,
+                                           ptr, LASTREC_INSREC);
+       }
+
+       /*
+        * Return the new block number, if any.
+        * If there is one, give back a record value and a cursor too.
+        */
+       *ptrp = nptr;
+       if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+               *recp = nrec;
+               *curp = ncur;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor.  All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+       struct xfs_btree_cur    *cur,
+       int                     *stat)
+{
+       int                     error;  /* error return value */
+       int                     i;      /* result value, 0 for failure */
+       int                     level;  /* current level number in btree */
+       union xfs_btree_ptr     nptr;   /* new block number (split result) */
+       struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
+       struct xfs_btree_cur    *pcur;  /* previous level's cursor */
+       union xfs_btree_rec     rec;    /* record to insert */
+
+       level = 0;
+       ncur = NULL;
+       pcur = cur;
+
+       xfs_btree_set_ptr_null(cur, &nptr);
+       cur->bc_ops->init_rec_from_cur(cur, &rec);
+
+       /*
+        * Loop going up the tree, starting at the leaf level.
+        * Stop when we don't get a split block, that must mean that
+        * the insert is finished with this level.
+        */
+       do {
+               /*
+                * Insert nrec/nptr into this level of the tree.
+                * Note if we fail, nptr will be null.
+                */
+               error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+               if (error) {
+                       if (pcur != cur)
+                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+                       goto error0;
+               }
+
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               level++;
+
+               /*
+                * See if the cursor we just used is trash.
+                * Can't trash the caller's cursor, but otherwise we should
+                * if ncur is a new cursor or we're about to be done.
+                */
+               if (pcur != cur &&
+                   (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+                       /* Save the state from the cursor before we trash it */
+                       if (cur->bc_ops->update_cursor)
+                               cur->bc_ops->update_cursor(pcur, cur);
+                       cur->bc_nlevels = pcur->bc_nlevels;
+                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+               }
+               /* If we got a new cursor, switch to it. */
+               if (ncur) {
+                       pcur = ncur;
+                       ncur = NULL;
+               }
+       } while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = i;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot names comes from the fact that we're effectively
+ * killing the old root block.  But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ */
+int
+xfs_btree_kill_iroot(
+       struct xfs_btree_cur    *cur)
+{
+       int                     whichfork = cur->bc_private.b.whichfork;
+       struct xfs_inode        *ip = cur->bc_private.b.ip;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_btree_block  *block;
+       struct xfs_btree_block  *cblock;
+       union xfs_btree_key     *kp;
+       union xfs_btree_key     *ckp;
+       union xfs_btree_ptr     *pp;
+       union xfs_btree_ptr     *cpp;
+       struct xfs_buf          *cbp;
+       int                     level;
+       int                     index;
+       int                     numrecs;
+#ifdef DEBUG
+       union xfs_btree_ptr     ptr;
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+       ASSERT(cur->bc_nlevels > 1);
+
+       /*
+        * Don't deal with the root block needs to be a leaf case.
+        * We're just going to turn the thing back into extents anyway.
+        */
+       level = cur->bc_nlevels - 1;
+       if (level == 1)
+               goto out0;
+
+       /*
+        * Give up if the root has multiple children.
+        */
+       block = xfs_btree_get_iroot(cur);
+       if (xfs_btree_get_numrecs(block) != 1)
+               goto out0;
+
+       cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+       numrecs = xfs_btree_get_numrecs(cblock);
+
+       /*
+        * Only do this if the next level will fit.
+        * Then the data must be copied up to the inode,
+        * instead of freeing the root you free the next level.
+        */
+       if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+       ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+       index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+       if (index) {
+               xfs_iroot_realloc(cur->bc_private.b.ip, index,
+                                 cur->bc_private.b.whichfork);
+               block = ifp->if_broot;
+       }
+
+       be16_add_cpu(&block->bb_numrecs, index);
+       ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+       kp = xfs_btree_key_addr(cur, 1, block);
+       ckp = xfs_btree_key_addr(cur, 1, cblock);
+       xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+       pp = xfs_btree_ptr_addr(cur, 1, block);
+       cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+       for (i = 0; i < numrecs; i++) {
+               int             error;
+
+               error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+               if (error) {
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                       return error;
+               }
+       }
+#endif
+       xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+       cur->bc_ops->free_block(cur, cbp);
+       XFS_BTREE_STATS_INC(cur, free);
+
+       cur->bc_bufs[level - 1] = NULL;
+       be16_add_cpu(&block->bb_level, -1);
+       xfs_trans_log_inode(cur->bc_tp, ip,
+               XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+       cur->bc_nlevels--;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
+STATIC int
+xfs_btree_dec_cursor(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)
+{
+       int                     error;
+       int                     i;
+
+       if (level > 0) {
+               error = xfs_btree_decrement(cur, level, &i);
+               if (error)
+                       return error;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int                                     /* error */
+xfs_btree_delrec(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     level,          /* level removing record from */
+       int                     *stat)          /* fail/done/go-on */
+{
+       struct xfs_btree_block  *block;         /* btree block */
+       union xfs_btree_ptr     cptr;           /* current block ptr */
+       struct xfs_buf          *bp;            /* buffer for block */
+       int                     error;          /* error return value */
+       int                     i;              /* loop counter */
+       union xfs_btree_key     key;            /* storage for keyp */
+       union xfs_btree_key     *keyp = &key;   /* passed to the next level */
+       union xfs_btree_ptr     lptr;           /* left sibling block ptr */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       int                     lrecs = 0;      /* left record count */
+       int                     ptr;            /* key/record index */
+       union xfs_btree_ptr     rptr;           /* right sibling block ptr */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       struct xfs_btree_block  *rrblock;       /* right-right btree block */
+       struct xfs_buf          *rrbp;          /* right-right buffer pointer */
+       int                     rrecs = 0;      /* right record count */
+       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
+       int                     numrecs;        /* temporary numrec count */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       tcur = NULL;
+
+       /* Get the index of the entry being deleted, check for nothing there. */
+       ptr = cur->bc_ptrs[level];
+       if (ptr == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       /* Get the buffer & block containing the record or key/ptr. */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* Fail if we're off the end of the block. */
+       if (ptr > numrecs) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       XFS_BTREE_STATS_INC(cur, delrec);
+       XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+       /* Excise the entries being deleted. */
+       if (level > 0) {
+               /* It's a nonleaf. operate on keys and ptrs */
+               union xfs_btree_key     *lkp;
+               union xfs_btree_ptr     *lpp;
+
+               lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+               lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+#ifdef DEBUG
+               for (i = 0; i < numrecs - ptr; i++) {
+                       error = xfs_btree_check_ptr(cur, lpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               if (ptr < numrecs) {
+                       xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+                       xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+                       xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+                       xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+               }
+
+               /*
+                * If it's the first record in the block, we'll need to pass a
+                * key up to the next level (updkey).
+                */
+               if (ptr == 1)
+                       keyp = xfs_btree_key_addr(cur, 1, block);
+       } else {
+               /* It's a leaf. operate on records */
+               if (ptr < numrecs) {
+                       xfs_btree_shift_recs(cur,
+                               xfs_btree_rec_addr(cur, ptr + 1, block),
+                               -1, numrecs - ptr);
+                       xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+               }
+
+               /*
+                * If it's the first record in the block, we'll need a key
+                * structure to pass up to the next level (updkey).
+                */
+               if (ptr == 1) {
+                       cur->bc_ops->init_key_from_rec(&key,
+                                       xfs_btree_rec_addr(cur, 1, block));
+                       keyp = &key;
+               }
+       }
+
+       /*
+        * Decrement and log the number of entries in the block.
+        */
+       xfs_btree_set_numrecs(block, --numrecs);
+       xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, level)) {
+               cur->bc_ops->update_lastrec(cur, block, NULL,
+                                           ptr, LASTREC_DELREC);
+       }
+
+       /*
+        * We're at the root level.  First, shrink the root block in-memory.
+        * Try to get rid of the next level down.  If we can't then there's
+        * nothing left to do.
+        */
+       if (level == cur->bc_nlevels - 1) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+                       xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+                                         cur->bc_private.b.whichfork);
+
+                       error = xfs_btree_kill_iroot(cur);
+                       if (error)
+                               goto error0;
+
+                       error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+                       *stat = 1;
+                       return 0;
+               }
+
+               /*
+                * If this is the root level, and there's only one entry left,
+                * and it's NOT the leaf level, then we can get rid of this
+                * level.
+                */
+               if (numrecs == 1 && level > 0) {
+                       union xfs_btree_ptr     *pp;
+                       /*
+                        * pp is still set to the first pointer in the block.
+                        * Make it the new root of the btree.
+                        */
+                       pp = xfs_btree_ptr_addr(cur, 1, block);
+                       error = cur->bc_ops->kill_root(cur, bp, level, pp);
+                       if (error)
+                               goto error0;
+               } else if (level > 0) {
+                       error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+               }
+               *stat = 1;
+               return 0;
+       }
+
+       /*
+        * If we deleted the leftmost entry in the block, update the
+        * key values above us in the tree.
+        */
+       if (ptr == 1) {
+               error = xfs_btree_updkey(cur, keyp, level + 1);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * If the number of records remaining in the block is at least
+        * the minimum, we're done.
+        */
+       if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+               error = xfs_btree_dec_cursor(cur, level, stat);
+               if (error)
+                       goto error0;
+               return 0;
+       }
+
+       /*
+        * Otherwise, we have to move some records around to keep the
+        * tree balanced.  Look at the left and right sibling blocks to
+        * see if we can re-balance by moving only one record.
+        */
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+               /*
+                * One child of root, need to get a chance to copy its contents
+                * into the root and delete it. Can't go up to next level,
+                * there's nothing to delete there.
+                */
+               if (xfs_btree_ptr_is_null(cur, &rptr) &&
+                   xfs_btree_ptr_is_null(cur, &lptr) &&
+                   level == cur->bc_nlevels - 2) {
+                       error = xfs_btree_kill_iroot(cur);
+                       if (!error)
+                               error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+                       return 0;
+               }
+       }
+
+       ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+              !xfs_btree_ptr_is_null(cur, &lptr));
+
+       /*
+        * Duplicate the cursor so our btree manipulations here won't
+        * disrupt the next level up.
+        */
+       error = xfs_btree_dup_cursor(cur, &tcur);
+       if (error)
+               goto error0;
+
+       /*
+        * If there's a right sibling, see if it's ok to shift an entry
+        * out of it.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+               /*
+                * Move the temp cursor to the last entry in the next block.
+                * Actually any entry but the first would suffice.
+                */
+               i = xfs_btree_lastrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               error = xfs_btree_increment(tcur, level, &i);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               i = xfs_btree_lastrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               /* Grab a pointer to the block. */
+               right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(tcur, right, level, rbp);
+               if (error)
+                       goto error0;
+#endif
+               /* Grab the current block number, for future use. */
+               xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+               /*
+                * If right block is full enough so that removing one entry
+                * won't make it too empty, and left-shifting an entry out
+                * of right to us works, we're done.
+                */
+               if (xfs_btree_get_numrecs(right) - 1 >=
+                   cur->bc_ops->get_minrecs(tcur, level)) {
+                       error = xfs_btree_lshift(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       if (i) {
+                               ASSERT(xfs_btree_get_numrecs(block) >=
+                                      cur->bc_ops->get_minrecs(tcur, level));
+
+                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                               tcur = NULL;
+
+                               error = xfs_btree_dec_cursor(cur, level, stat);
+                               if (error)
+                                       goto error0;
+                               return 0;
+                       }
+               }
+
+               /*
+                * Otherwise, grab the number of records in right for
+                * future reference, and fix up the temp cursor to point
+                * to our block again (last record).
+                */
+               rrecs = xfs_btree_get_numrecs(right);
+               if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+                       i = xfs_btree_firstrec(tcur, level);
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+                       error = xfs_btree_decrement(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               }
+       }
+
+       /*
+        * If there's a left sibling, see if it's ok to shift an entry
+        * out of it.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+               /*
+                * Move the temp cursor to the first entry in the
+                * previous block.
+                */
+               i = xfs_btree_firstrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               error = xfs_btree_decrement(tcur, level, &i);
+               if (error)
+                       goto error0;
+               i = xfs_btree_firstrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               /* Grab a pointer to the block. */
+               left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, left, level, lbp);
+               if (error)
+                       goto error0;
+#endif
+               /* Grab the current block number, for future use. */
+               xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+               /*
+                * If left block is full enough so that removing one entry
+                * won't make it too empty, and right-shifting an entry out
+                * of left to us works, we're done.
+                */
+               if (xfs_btree_get_numrecs(left) - 1 >=
+                   cur->bc_ops->get_minrecs(tcur, level)) {
+                       error = xfs_btree_rshift(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       if (i) {
+                               ASSERT(xfs_btree_get_numrecs(block) >=
+                                      cur->bc_ops->get_minrecs(tcur, level));
+                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                               tcur = NULL;
+                               if (level == 0)
+                                       cur->bc_ptrs[0]++;
+                               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                               *stat = 1;
+                               return 0;
+                       }
+               }
+
+               /*
+                * Otherwise, grab the number of records in right for
+                * future reference.
+                */
+               lrecs = xfs_btree_get_numrecs(left);
+       }
+
+       /* Delete the temp cursor, we're done with it. */
+       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+       tcur = NULL;
+
+       /* If here, we need to do a join to keep the tree balanced. */
+       ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+       if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+           lrecs + xfs_btree_get_numrecs(block) <=
+                       cur->bc_ops->get_maxrecs(cur, level)) {
+               /*
+                * Set "right" to be the starting block,
+                * "left" to be the left neighbor.
+                */
+               rptr = cptr;
+               right = block;
+               rbp = bp;
+               error = xfs_btree_read_buf_block(cur, &lptr, level,
+                                                       0, &left, &lbp);
+               if (error)
+                       goto error0;
+
+       /*
+        * If that won't work, see if we can join with the right neighbor block.
+        */
+       } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+                  rrecs + xfs_btree_get_numrecs(block) <=
+                       cur->bc_ops->get_maxrecs(cur, level)) {
+               /*
+                * Set "left" to be the starting block,
+                * "right" to be the right neighbor.
+                */
+               lptr = cptr;
+               left = block;
+               lbp = bp;
+               error = xfs_btree_read_buf_block(cur, &rptr, level,
+                                                       0, &right, &rbp);
+               if (error)
+                       goto error0;
+
+       /*
+        * Otherwise, we can't fix the imbalance.
+        * Just return.  This is probably a logic error, but it's not fatal.
+        */
+       } else {
+               error = xfs_btree_dec_cursor(cur, level, stat);
+               if (error)
+                       goto error0;
+               return 0;
+       }
+
+       rrecs = xfs_btree_get_numrecs(right);
+       lrecs = xfs_btree_get_numrecs(left);
+
+       /*
+        * We're now going to join "left" and "right" by moving all the stuff
+        * in "right" to "left" and deleting "right".
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+               union xfs_btree_key     *rkp;   /* right btree key */
+               union xfs_btree_ptr     *rpp;   /* right address pointer */
+
+               lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+               lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+               for (i = 1; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, rpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+               xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+               xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+               xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+               xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+               union xfs_btree_rec     *rrp;   /* right record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+               xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+       }
+
+       XFS_BTREE_STATS_INC(cur, join);
+
+       /*
+        * Fix up the the number of records and right block pointer in the
+        * surviving block, and log it.
+        */
+       xfs_btree_set_numrecs(left, lrecs + rrecs);
+       xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+       xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+       /* If there is a right sibling, point it to the remaining block. */
+       xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+               error = xfs_btree_read_buf_block(cur, &cptr, level,
+                                                       0, &rrblock, &rrbp);
+               if (error)
+                       goto error0;
+               xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+               xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+       }
+
+       /* Free the deleted block. */
+       error = cur->bc_ops->free_block(cur, rbp);
+       if (error)
+               goto error0;
+       XFS_BTREE_STATS_INC(cur, free);
+
+       /*
+        * If we joined with the left neighbor, set the buffer in the
+        * cursor to the left block, and fix up the index.
+        */
+       if (bp != lbp) {
+               cur->bc_bufs[level] = lbp;
+               cur->bc_ptrs[level] += lrecs;
+               cur->bc_ra[level] = 0;
+       }
+       /*
+        * If we joined with the right neighbor and there's a level above
+        * us, increment the cursor at that level.
+        */
+       else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+                  (level + 1 < cur->bc_nlevels)) {
+               error = xfs_btree_increment(cur, level + 1, &i);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * Readjust the ptr at this level if it's not a leaf, since it's
+        * still pointing at the deletion point, which makes the cursor
+        * inconsistent.  If this makes the ptr 0, the caller fixes it up.
+        * We can't use decrement because it would change the next level up.
+        */
+       if (level > 0)
+               cur->bc_ptrs[level]--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       /* Return value means the next level up has something to do. */
+       *stat = 2;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       if (tcur)
+               xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+       return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int                                    /* error */
+xfs_btree_delete(
+       struct xfs_btree_cur    *cur,
+       int                     *stat)  /* success/failure */
+{
+       int                     error;  /* error return value */
+       int                     level;
+       int                     i;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       /*
+        * Go up the tree, starting at leaf level.
+        *
+        * If 2 is returned then a join was done; go to the next level.
+        * Otherwise we are done.
+        */
+       for (level = 0, i = 2; i == 2; level++) {
+               error = xfs_btree_delrec(cur, level, &i);
+               if (error)
+                       goto error0;
+       }
+
+       if (i == 0) {
+               for (level = 1; level < cur->bc_nlevels; level++) {
+                       if (cur->bc_ptrs[level] == 0) {
+                               error = xfs_btree_decrement(cur, level, &i);
+                               if (error)
+                                       goto error0;
+                               break;
+                       }
+               }
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = i;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int                                    /* error */
+xfs_btree_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       union xfs_btree_rec     **recp, /* output: btree record */
+       int                     *stat)  /* output: success/failure */
+{
+       struct xfs_btree_block  *block; /* btree block */
+       struct xfs_buf          *bp;    /* buffer pointer */
+       int                     ptr;    /* record number */
+#ifdef DEBUG
+       int                     error;  /* error return value */
+#endif
+
+       ptr = cur->bc_ptrs[0];
+       block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, 0, bp);
+       if (error)
+               return error;
+#endif
+
+       /*
+        * Off the right end or left end, return failure.
+        */
+       if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+               *stat = 0;
+               return 0;
+       }
+
+       /*
+        * Point to the record and extract its data.
+        */
+       *recp = xfs_btree_rec_addr(cur, ptr, block);
+       *stat = 1;
+       return 0;
+}
index 1f528a2a37545d858d40577a948e56ee7699d479..789fffdf8b2f557bc1aa3ed3c51acc5081ee941b 100644 (file)
@@ -39,39 +39,19 @@ extern kmem_zone_t  *xfs_btree_cur_zone;
 #define        XFS_BTNUM_INO   ((xfs_btnum_t)XFS_BTNUM_INOi)
 
 /*
- * Short form header: space allocation btrees.
- */
-typedef struct xfs_btree_sblock {
-       __be32          bb_magic;       /* magic number for block type */
-       __be16          bb_level;       /* 0 is a leaf */
-       __be16          bb_numrecs;     /* current # of data records */
-       __be32          bb_leftsib;     /* left sibling block or NULLAGBLOCK */
-       __be32          bb_rightsib;    /* right sibling block or NULLAGBLOCK */
-} xfs_btree_sblock_t;
-
-/*
- * Long form header: bmap btrees.
- */
-typedef struct xfs_btree_lblock {
-       __be32          bb_magic;       /* magic number for block type */
-       __be16          bb_level;       /* 0 is a leaf */
-       __be16          bb_numrecs;     /* current # of data records */
-       __be64          bb_leftsib;     /* left sibling block or NULLDFSBNO */
-       __be64          bb_rightsib;    /* right sibling block or NULLDFSBNO */
-} xfs_btree_lblock_t;
-
-/*
- * Combined header and structure, used by common code.
+ * Generic btree header.
+ *
+ * This is a comination of the actual format used on disk for short and long
+ * format btrees.  The first three fields are shared by both format, but
+ * the pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use
+ * the size macros below.  Never use sizeof(xfs_btree_block).
  */
-typedef struct xfs_btree_hdr
-{
+struct xfs_btree_block {
        __be32          bb_magic;       /* magic number for block type */
        __be16          bb_level;       /* 0 is a leaf */
        __be16          bb_numrecs;     /* current # of data records */
-} xfs_btree_hdr_t;
-
-typedef struct xfs_btree_block {
-       xfs_btree_hdr_t bb_h;           /* header */
        union {
                struct {
                        __be32          bb_leftsib;
@@ -82,7 +62,36 @@ typedef struct xfs_btree_block {
                        __be64          bb_rightsib;
                } l;                    /* long form pointers */
        } bb_u;                         /* rest */
-} xfs_btree_block_t;
+};
+
+#define XFS_BTREE_SBLOCK_LEN   16      /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN   24      /* size of a long form block */
+
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ */
+union xfs_btree_ptr {
+       __be32                  s;      /* short form ptr */
+       __be64                  l;      /* long form ptr */
+};
+
+union xfs_btree_key {
+       xfs_bmbt_key_t          bmbt;
+       xfs_bmdr_key_t          bmbr;   /* bmbt root block */
+       xfs_alloc_key_t         alloc;
+       xfs_inobt_key_t         inobt;
+};
+
+union xfs_btree_rec {
+       xfs_bmbt_rec_t          bmbt;
+       xfs_bmdr_rec_t          bmbr;   /* bmbt root block */
+       xfs_alloc_rec_t         alloc;
+       xfs_inobt_rec_t         inobt;
+};
 
 /*
  * For logging record fields.
@@ -95,47 +104,132 @@ typedef struct xfs_btree_block {
 #define        XFS_BB_NUM_BITS         5
 #define        XFS_BB_ALL_BITS         ((1 << XFS_BB_NUM_BITS) - 1)
 
-/*
- * Boolean to select which form of xfs_btree_block_t.bb_u to use.
- */
-#define        XFS_BTREE_LONG_PTRS(btnum)      ((btnum) == XFS_BTNUM_BMAP)
-
 /*
  * Magic numbers for btree blocks.
  */
 extern const __uint32_t        xfs_magics[];
 
 /*
- * Maximum and minimum records in a btree block.
- * Given block size, type prefix, and leaf flag (0 or 1).
- * The divisor below is equivalent to lf ? (e1) : (e2) but that produces
- * compiler warnings.
- */
-#define        XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf)       \
-       ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \
-        (((lf) * (uint)sizeof(t ## _rec_t)) + \
-         ((1 - (lf)) * \
-          ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t))))))
-#define        XFS_BTREE_BLOCK_MINRECS(bsz,t,lf)       \
-       (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2)
-
-/*
- * Record, key, and pointer address calculation macros.
- * Given block size, type prefix, block pointer, and index of requested entry
- * (first entry numbered 1).
- */
-#define        XFS_BTREE_REC_ADDR(t,bb,i)      \
-       ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \
-        ((i) - 1) * sizeof(t ## _rec_t)))
-#define        XFS_BTREE_KEY_ADDR(t,bb,i)      \
-       ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \
-        ((i) - 1) * sizeof(t ## _key_t)))
-#define        XFS_BTREE_PTR_ADDR(t,bb,i,mxr)  \
-       ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \
-        (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t)))
+ * Generic stats interface
+ */
+#define __XFS_BTREE_STATS_INC(type, stat) \
+       XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat)  \
+do {    \
+       switch (cur->bc_btnum) {  \
+       case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break;   \
+       case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break;   \
+       case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break;  \
+       case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break;    \
+       case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;       \
+       }       \
+} while (0)
+
+#define __XFS_BTREE_STATS_ADD(type, stat, val) \
+       XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define XFS_BTREE_STATS_ADD(cur, stat, val)  \
+do {    \
+       switch (cur->bc_btnum) {  \
+       case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
+       case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
+       case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
+       case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+       case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;       \
+       }       \
+} while (0)
 
 #define        XFS_BTREE_MAXLEVELS     8       /* max of all btrees */
 
+struct xfs_btree_ops {
+       /* size of the key and record structures */
+       size_t  key_len;
+       size_t  rec_len;
+
+       /* cursor operations */
+       struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+       void    (*update_cursor)(struct xfs_btree_cur *src,
+                                struct xfs_btree_cur *dst);
+
+       /* update btree root pointer */
+       void    (*set_root)(struct xfs_btree_cur *cur,
+                               union xfs_btree_ptr *nptr, int level_change);
+       int     (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
+                               int level, union xfs_btree_ptr *newroot);
+
+       /* block allocation / freeing */
+       int     (*alloc_block)(struct xfs_btree_cur *cur,
+                              union xfs_btree_ptr *start_bno,
+                              union xfs_btree_ptr *new_bno,
+                              int length, int *stat);
+       int     (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+       /* update last record information */
+       void    (*update_lastrec)(struct xfs_btree_cur *cur,
+                                 struct xfs_btree_block *block,
+                                 union xfs_btree_rec *rec,
+                                 int ptr, int reason);
+
+       /* records in block/level */
+       int     (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+       int     (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+       /* records on disk.  Matter for the root in inode case. */
+       int     (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+       /* init values of btree structures */
+       void    (*init_key_from_rec)(union xfs_btree_key *key,
+                                    union xfs_btree_rec *rec);
+       void    (*init_rec_from_key)(union xfs_btree_key *key,
+                                    union xfs_btree_rec *rec);
+       void    (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+                                    union xfs_btree_rec *rec);
+       void    (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+                                    union xfs_btree_ptr *ptr);
+
+       /* difference between key value and cursor value */
+       __int64_t (*key_diff)(struct xfs_btree_cur *cur,
+                             union xfs_btree_key *key);
+
+#ifdef DEBUG
+       /* check that k1 is lower than k2 */
+       int     (*keys_inorder)(struct xfs_btree_cur *cur,
+                               union xfs_btree_key *k1,
+                               union xfs_btree_key *k2);
+
+       /* check that r1 is lower than r2 */
+       int     (*recs_inorder)(struct xfs_btree_cur *cur,
+                               union xfs_btree_rec *r1,
+                               union xfs_btree_rec *r2);
+#endif
+
+       /* btree tracing */
+#ifdef XFS_BTREE_TRACE
+       void            (*trace_enter)(struct xfs_btree_cur *, const char *,
+                                      char *, int, int, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t);
+       void            (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *,
+                                       __uint64_t *, __uint64_t *);
+       void            (*trace_key)(struct xfs_btree_cur *,
+                                    union xfs_btree_key *, __uint64_t *,
+                                    __uint64_t *);
+       void            (*trace_record)(struct xfs_btree_cur *,
+                                       union xfs_btree_rec *, __uint64_t *,
+                                       __uint64_t *, __uint64_t *);
+#endif
+};
+
+/*
+ * Reasons for the update_lastrec method to be called.
+ */
+#define LASTREC_UPDATE 0
+#define LASTREC_INSREC 1
+#define LASTREC_DELREC 2
+
+
 /*
  * Btree cursor structure.
  * This collects all information needed by the btree code in one place.
@@ -144,6 +238,8 @@ typedef struct xfs_btree_cur
 {
        struct xfs_trans        *bc_tp; /* transaction we're in, if any */
        struct xfs_mount        *bc_mp; /* file system mount struct */
+       const struct xfs_btree_ops *bc_ops;
+       uint                    bc_flags; /* btree features - below */
        union {
                xfs_alloc_rec_incore_t  a;
                xfs_bmbt_irec_t         b;
@@ -175,94 +271,40 @@ typedef struct xfs_btree_cur
        }               bc_private;     /* per-btree type data */
 } xfs_btree_cur_t;
 
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS            (1<<0)  /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE                (1<<1)  /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE       (1<<2)  /* track last rec externally */
+
+
 #define        XFS_BTREE_NOERROR       0
 #define        XFS_BTREE_ERROR         1
 
 /*
  * Convert from buffer to btree block header.
  */
-#define        XFS_BUF_TO_BLOCK(bp)    ((xfs_btree_block_t *)XFS_BUF_PTR(bp))
-#define        XFS_BUF_TO_LBLOCK(bp)   ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp))
-#define        XFS_BUF_TO_SBLOCK(bp)   ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp))
+#define        XFS_BUF_TO_BLOCK(bp)    ((struct xfs_btree_block *)XFS_BUF_PTR(bp))
 
 
-#ifdef __KERNEL__
-
-#ifdef DEBUG
 /*
- * Debug routine: check that block header is ok.
+ * Check that block header is ok.
  */
-void
+int
 xfs_btree_check_block(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_block_t       *block, /* generic btree block pointer */
-       int                     level,  /* level of the btree block */
-       struct xfs_buf          *bp);   /* buffer containing block, if any */
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
-       xfs_btnum_t             btnum,  /* btree identifier */
-       void                    *ak1,   /* pointer to left (lower) key */
-       void                    *ak2);  /* pointer to right (higher) key */
-
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
-       xfs_btnum_t             btnum,  /* btree identifier */
-       void                    *ar1,   /* pointer to left (lower) record */
-       void                    *ar2);  /* pointer to right (higher) record */
-#else
-#define        xfs_btree_check_block(a,b,c,d)
-#define        xfs_btree_check_key(a,b,c)
-#define        xfs_btree_check_rec(a,b,c)
-#endif /* DEBUG */
-
-/*
- * Checking routine: check that long form block header is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_lblock_t      *block, /* btree long form block pointer */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* generic btree block pointer */
        int                     level,  /* level of the btree block */
        struct xfs_buf          *bp);   /* buffer containing block, if any */
 
 /*
- * Checking routine: check that (long) pointer is ok.
+ * Check that (long) pointer is ok.
  */
 int                                    /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lptr(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_dfsbno_t            ptr,    /* btree block disk address */
        int                     level); /* btree block level */
 
-#define xfs_btree_check_lptr_disk(cur, ptr, level) \
-       xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
-
-/*
- * Checking routine: check that short form block header is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_sblock_t      *block, /* btree short form block pointer */
-       int                     level,  /* level of the btree block */
-       struct xfs_buf          *bp);   /* buffer containing block */
-
-/*
- * Checking routine: check that (short) pointer is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agblock_t           ptr,    /* btree block disk address */
-       int                     level); /* btree block level */
-
 /*
  * Delete the btree cursor.
  */
@@ -280,15 +322,6 @@ xfs_btree_dup_cursor(
        xfs_btree_cur_t         *cur,   /* input cursor */
        xfs_btree_cur_t         **ncur);/* output cursor */
 
-/*
- * Change the cursor to point to the first record in the current block
- * at the given level.  Other levels are unaffected.
- */
-int                                    /* success=1, failure=0 */
-xfs_btree_firstrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level); /* level to change */
-
 /*
  * Get a buffer for the block, return it with no data read.
  * Long-form addressing.
@@ -312,20 +345,6 @@ xfs_btree_get_bufs(
        xfs_agblock_t           agbno,  /* allocation group block number */
        uint                    lock);  /* lock flags for get_buf */
 
-/*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B).
- */
-xfs_btree_cur_t *                      /* new btree cursor */
-xfs_btree_init_cursor(
-       struct xfs_mount        *mp,    /* file system mount point */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       struct xfs_buf          *agbp,  /* (A only) buffer for agf structure */
-       xfs_agnumber_t          agno,   /* (A only) allocation group number */
-       xfs_btnum_t             btnum,  /* btree identifier */
-       struct xfs_inode        *ip,    /* (B only) inode owning the btree */
-       int                     whichfork); /* (B only) data/attr fork */
-
 /*
  * Check for the cursor referring to the last block at the given level.
  */
@@ -334,15 +353,6 @@ xfs_btree_islastblock(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     level); /* level to check */
 
-/*
- * Change the cursor to point to the last record in the current block
- * at the given level.  Other levels are unaffected.
- */
-int                                    /* success=1, failure=0 */
-xfs_btree_lastrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level); /* level to change */
-
 /*
  * Compute first and last byte offsets for the fields given.
  * Interprets the offsets table, which contains struct field offsets.
@@ -404,39 +414,53 @@ xfs_btree_reada_bufs(
        xfs_extlen_t            count); /* count of filesystem blocks */
 
 /*
- * Read-ahead btree blocks, at the given level.
- * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
  */
-int                                    /* readahead block count */
-xfs_btree_readahead_core(
+void
+xfs_btree_setbuf(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     lev,    /* level in btree */
-       int                     lr);    /* left/right bits */
+       struct xfs_buf          *bp);   /* new buffer to set */
 
-static inline int                      /* readahead block count */
-xfs_btree_readahead(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     lev,    /* level in btree */
-       int                     lr)     /* left/right bits */
-{
-       if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
-               return 0;
 
-       return xfs_btree_readahead_core(cur, lev, lr);
-}
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_kill_iroot(struct xfs_btree_cur *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
 
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
 
 /*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
+ * Helpers.
  */
-void
-xfs_btree_setbuf(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     lev,    /* level in btree */
-       struct xfs_buf          *bp);   /* new buffer to set */
+static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+{
+       return be16_to_cpu(block->bb_numrecs);
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+               __uint16_t numrecs)
+{
+       block->bb_numrecs = cpu_to_be16(numrecs);
+}
 
-#endif /* __KERNEL__ */
+static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+{
+       return be16_to_cpu(block->bb_level);
+}
 
 
 /*
diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c
new file mode 100644 (file)
index 0000000..44ff942
--- /dev/null
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_types.h"
+#include "xfs_inum.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+
+STATIC void
+xfs_btree_trace_ptr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     ptr,
+       __psunsigned_t          *high,
+       __psunsigned_t          *low)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               __u64 val = be64_to_cpu(ptr.l);
+               *high = val >> 32;
+               *low = (int)val;
+       } else {
+               *high = 0;
+               *low = be32_to_cpu(ptr.s);
+       }
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
+ */
+void
+xfs_btree_trace_argbi(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *b,
+       int                     i,
+       int                     line)
+{
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI,
+                                line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0,
+                                0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
+ */
+void
+xfs_btree_trace_argbii(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *b,
+       int                     i0,
+       int                     i1,
+       int                     line)
+{
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII,
+                                line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0,
+                                0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for 3 block-length args
+ * and an integer arg.
+ */
+void
+xfs_btree_trace_argfffi(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       xfs_dfiloff_t           o,
+       xfs_dfsbno_t            b,
+       xfs_dfilblks_t          i,
+       int                     j,
+       int                     line)
+{
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI,
+                                line,
+                                o >> 32, (int)o,
+                                b >> 32, (int)b,
+                                i >> 32, (int)i,
+                                (int)j, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for one integer arg.
+ */
+void
+xfs_btree_trace_argi(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     i,
+       int                     line)
+{
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI,
+                                line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, key.
+ */
+void
+xfs_btree_trace_argipk(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     i,
+       union xfs_btree_ptr     ptr,
+       union xfs_btree_key     *key,
+       int                     line)
+{
+       __psunsigned_t          high, low;
+       __uint64_t              l0, l1;
+
+       xfs_btree_trace_ptr(cur, ptr, &high, &low);
+       cur->bc_ops->trace_key(cur, key, &l0, &l1);
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK,
+                                line, i, high, low,
+                                l0 >> 32, (int)l0,
+                                l1 >> 32, (int)l1,
+                                0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, rec.
+ */
+void
+xfs_btree_trace_argipr(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     i,
+       union xfs_btree_ptr     ptr,
+       union xfs_btree_rec     *rec,
+       int                     line)
+{
+       __psunsigned_t          high, low;
+       __uint64_t              l0, l1, l2;
+
+       xfs_btree_trace_ptr(cur, ptr, &high, &low);
+       cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR,
+                             line, i,
+                             high, low,
+                             l0 >> 32, (int)l0,
+                             l1 >> 32, (int)l1,
+                             l2 >> 32, (int)l2,
+                             0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, key.
+ */
+void
+xfs_btree_trace_argik(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     i,
+       union xfs_btree_key     *key,
+       int                     line)
+{
+       __uint64_t              l0, l1;
+
+       cur->bc_ops->trace_key(cur, key, &l0, &l1);
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK,
+                                line, i,
+                                l0 >> 32, (int)l0,
+                                l1 >> 32, (int)l1,
+                                0, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for record.
+ */
+void
+xfs_btree_trace_argr(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       int                     line)
+{
+       __uint64_t              l0, l1, l2;
+
+       cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR,
+                             line,
+                             l0 >> 32, (int)l0,
+                             l1 >> 32, (int)l1,
+                             l2 >> 32, (int)l2,
+                             0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for the cursor/operation.
+ */
+void
+xfs_btree_trace_cursor(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     type,
+       int                     line)
+{
+       __uint32_t              s0;
+       __uint64_t              l0, l1;
+       char                    *s;
+
+       switch (type) {
+       case XBT_ARGS:
+               s = "args";
+               break;
+       case XBT_ENTRY:
+               s = "entry";
+               break;
+       case XBT_ERROR:
+               s = "error";
+               break;
+       case XBT_EXIT:
+               s = "exit";
+               break;
+       default:
+               s = "unknown";
+               break;
+       }
+
+       cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1);
+       cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line,
+                                s0,
+                                l0 >> 32, (int)l0,
+                                l1 >> 32, (int)l1,
+                                (__psunsigned_t)cur->bc_bufs[0],
+                                (__psunsigned_t)cur->bc_bufs[1],
+                                (__psunsigned_t)cur->bc_bufs[2],
+                                (__psunsigned_t)cur->bc_bufs[3],
+                                (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
+                                (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
+}
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
new file mode 100644 (file)
index 0000000..b3f5eb3
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BTREE_TRACE_H__
+#define        __XFS_BTREE_TRACE_H__
+
+struct xfs_btree_cur;
+struct xfs_buf;
+
+
+/*
+ * Trace hooks.
+ * i,j = integer (32 bit)
+ * b = btree block buffer (xfs_buf_t)
+ * p = btree ptr
+ * r = btree record
+ * k = btree key
+ */
+
+#ifdef XFS_BTREE_TRACE
+
+/*
+ * Trace buffer entry types.
+ */
+#define XFS_BTREE_KTRACE_ARGBI   1
+#define XFS_BTREE_KTRACE_ARGBII  2
+#define XFS_BTREE_KTRACE_ARGFFFI 3
+#define XFS_BTREE_KTRACE_ARGI    4
+#define XFS_BTREE_KTRACE_ARGIPK  5
+#define XFS_BTREE_KTRACE_ARGIPR  6
+#define XFS_BTREE_KTRACE_ARGIK   7
+#define XFS_BTREE_KTRACE_ARGR   8
+#define XFS_BTREE_KTRACE_CUR     9
+
+/*
+ * Sub-types for cursor traces.
+ */
+#define XBT_ARGS       0
+#define XBT_ENTRY      1
+#define XBT_ERROR      2
+#define XBT_EXIT       3
+
+void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
+               struct xfs_buf *, int, int);
+void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
+               struct xfs_buf *, int, int, int);
+void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
+               xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
+void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
+void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
+               union xfs_btree_ptr, union xfs_btree_key *, int);
+void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int,
+               union xfs_btree_ptr, union xfs_btree_rec *, int);
+void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int,
+               union xfs_btree_key *, int);
+void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
+               union xfs_btree_rec *, int);
+void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
+
+
+#define XFS_ALLOCBT_TRACE_SIZE 4096    /* size of global trace buffer */
+extern ktrace_t        *xfs_allocbt_trace_buf;
+
+#define XFS_INOBT_TRACE_SIZE   4096    /* size of global trace buffer */
+extern ktrace_t        *xfs_inobt_trace_buf;
+
+#define XFS_BMBT_TRACE_SIZE    4096    /* size of global trace buffer */
+#define XFS_BMBT_KTRACE_SIZE   32      /* size of per-inode trace buffer */
+extern ktrace_t        *xfs_bmbt_trace_buf;
+
+
+#define        XFS_BTREE_TRACE_ARGBI(c, b, i)  \
+       xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
+#define        XFS_BTREE_TRACE_ARGBII(c, b, i, j)      \
+       xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
+#define        XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)  \
+       xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
+#define        XFS_BTREE_TRACE_ARGI(c, i)      \
+       xfs_btree_trace_argi(__func__, c, i, __LINE__)
+#define        XFS_BTREE_TRACE_ARGIPK(c, i, p, k)      \
+       xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__)
+#define        XFS_BTREE_TRACE_ARGIPR(c, i, p, r)      \
+       xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__)
+#define        XFS_BTREE_TRACE_ARGIK(c, i, k)  \
+       xfs_btree_trace_argik(__func__, c, i, k, __LINE__)
+#define XFS_BTREE_TRACE_ARGR(c, r)     \
+       xfs_btree_trace_argr(__func__, c, r, __LINE__)
+#define        XFS_BTREE_TRACE_CURSOR(c, t)    \
+       xfs_btree_trace_cursor(__func__, c, t, __LINE__)
+#else
+#define        XFS_BTREE_TRACE_ARGBI(c, b, i)
+#define        XFS_BTREE_TRACE_ARGBII(c, b, i, j)
+#define        XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
+#define        XFS_BTREE_TRACE_ARGI(c, i)
+#define        XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
+#define        XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
+#define        XFS_BTREE_TRACE_ARGIK(c, i, k)
+#define XFS_BTREE_TRACE_ARGR(c, r)
+#define        XFS_BTREE_TRACE_CURSOR(c, t)
+#endif /* XFS_BTREE_TRACE */
+
+#endif /* __XFS_BTREE_TRACE_H__ */
index 002fc2617c8ee9426bff6203eb5bc6039094bf0a..92af4098c7e848043b74443d04d29d32597fd46b 100644 (file)
@@ -375,7 +375,7 @@ xfs_buf_item_unpin(
        xfs_buf_log_item_t      *bip,
        int                     stale)
 {
-       xfs_mount_t     *mp;
+       struct xfs_ail  *ailp;
        xfs_buf_t       *bp;
        int             freed;
 
@@ -387,7 +387,7 @@ xfs_buf_item_unpin(
        xfs_buftrace("XFS_UNPIN", bp);
 
        freed = atomic_dec_and_test(&bip->bli_refcount);
-       mp = bip->bli_item.li_mountp;
+       ailp = bip->bli_item.li_ailp;
        xfs_bunpin(bp);
        if (freed && stale) {
                ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -399,17 +399,17 @@ xfs_buf_item_unpin(
                xfs_buftrace("XFS_UNPIN STALE", bp);
                /*
                 * If we get called here because of an IO error, we may
-                * or may not have the item on the AIL. xfs_trans_delete_ail()
+                * or may not have the item on the AIL. xfs_trans_ail_delete()
                 * will take care of that situation.
-                * xfs_trans_delete_ail() drops the AIL lock.
+                * xfs_trans_ail_delete() drops the AIL lock.
                 */
                if (bip->bli_flags & XFS_BLI_STALE_INODE) {
                        xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
                } else {
-                       spin_lock(&mp->m_ail_lock);
-                       xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
+                       spin_lock(&ailp->xa_lock);
+                       xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
                        xfs_buf_item_relse(bp);
                        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
                }
@@ -707,8 +707,8 @@ xfs_buf_item_init(
         * the first.  If we do already have one, there is
         * nothing to do here so return.
         */
-       if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp)
-               XFS_BUF_SET_FSPRIVATE3(bp, mp);
+       if (bp->b_mount != mp)
+               bp->b_mount = mp;
        XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
        if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -731,6 +731,7 @@ xfs_buf_item_init(
        bip->bli_item.li_type = XFS_LI_BUF;
        bip->bli_item.li_ops = &xfs_buf_item_ops;
        bip->bli_item.li_mountp = mp;
+       bip->bli_item.li_ailp = mp->m_ail;
        bip->bli_buf = bp;
        xfs_buf_hold(bp);
        bip->bli_format.blf_type = XFS_LI_BUF;
@@ -997,21 +998,7 @@ xfs_buf_iodone_callbacks(
                        xfs_buf_do_callbacks(bp, lip);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
-
-                       /*
-                        * XFS_SHUT flag gets set when we go thru the
-                        * entire buffer cache and deliberately start
-                        * throwing away delayed write buffers.
-                        * Since there's no biowait done on those,
-                        * we should just brelse them.
-                        */
-                       if (XFS_BUF_ISSHUT(bp)) {
-                           XFS_BUF_UNSHUT(bp);
-                               xfs_buf_relse(bp);
-                       } else {
-                               xfs_biodone(bp);
-                       }
-
+                       xfs_biodone(bp);
                        return;
                }
 
@@ -1122,27 +1109,23 @@ xfs_buf_iodone(
        xfs_buf_t               *bp,
        xfs_buf_log_item_t      *bip)
 {
-       struct xfs_mount        *mp;
+       struct xfs_ail          *ailp = bip->bli_item.li_ailp;
 
        ASSERT(bip->bli_buf == bp);
 
        xfs_buf_rele(bp);
-       mp = bip->bli_item.li_mountp;
 
        /*
         * If we are forcibly shutting down, this may well be
         * off the AIL already. That's because we simulate the
         * log-committed callbacks to unpin these buffers. Or we may never
         * have put this item on AIL because of the transaction was
-        * aborted forcibly. xfs_trans_delete_ail() takes care of these.
+        * aborted forcibly. xfs_trans_ail_delete() takes care of these.
         *
         * Either way, AIL is useless if we're forcing a shutdown.
         */
-       spin_lock(&mp->m_ail_lock);
-       /*
-        * xfs_trans_delete_ail() drops the AIL lock.
-        */
-       xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
+       spin_lock(&ailp->xa_lock);
+       xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
        xfs_buf_item_free(bip);
 }
 
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
deleted file mode 100644 (file)
index d2ce5dd..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_CLNT_H__
-#define __XFS_CLNT_H__
-
-/*
- * XFS arguments structure, constructed from the arguments we
- * are passed via the mount system call.
- *
- * NOTE: The mount system call is handled differently between
- * Linux and IRIX.  In IRIX we worked work with a binary data
- * structure coming in across the syscall interface from user
- * space (the mount userspace knows about each filesystem type
- * and the set of valid options for it, and converts the users
- * argument string into a binary structure _before_ making the
- * system call), and the ABI issues that this implies.
- *
- * In Linux, we are passed a comma separated set of options;
- * ie. a NULL terminated string of characters.  Userspace mount
- * code does not have any knowledge of mount options expected by
- * each filesystem type and so each filesystem parses its mount
- * options in kernel space.
- *
- * For the Linux port, we kept this structure pretty much intact
- * and use it internally (because the existing code groks it).
- */
-struct xfs_mount_args {
-       int     flags;          /* flags -> see XFSMNT_... macros below */
-       int     flags2;         /* flags -> see XFSMNT2_... macros below */
-       int     logbufs;        /* Number of log buffers, -1 to default */
-       int     logbufsize;     /* Size of log buffers, -1 to default */
-       char    fsname[MAXNAMELEN+1];   /* data device name */
-       char    rtname[MAXNAMELEN+1];   /* realtime device filename */
-       char    logname[MAXNAMELEN+1];  /* journal device filename */
-       char    mtpt[MAXNAMELEN+1];     /* filesystem mount point */
-       int     sunit;          /* stripe unit (BBs) */
-       int     swidth;         /* stripe width (BBs), multiple of sunit */
-       uchar_t iosizelog;      /* log2 of the preferred I/O size */
-       int     ihashsize;      /* inode hash table size (buckets) */
-};
-
-/*
- * XFS mount option flags -- args->flags1
- */
-#define        XFSMNT_ATTR2            0x00000001      /* allow ATTR2 EA format */
-#define        XFSMNT_WSYNC            0x00000002      /* safe mode nfs mount
-                                                * compatible */
-#define        XFSMNT_INO64            0x00000004      /* move inode numbers up
-                                                * past 2^32 */
-#define XFSMNT_UQUOTA          0x00000008      /* user quota accounting */
-#define XFSMNT_PQUOTA          0x00000010      /* IRIX prj quota accounting */
-#define XFSMNT_UQUOTAENF       0x00000020      /* user quota limit
-                                                * enforcement */
-#define XFSMNT_PQUOTAENF       0x00000040      /* IRIX project quota limit
-                                                * enforcement */
-#define XFSMNT_QUIET           0x00000080      /* don't report mount errors */
-#define XFSMNT_NOALIGN         0x00000200      /* don't allocate at
-                                                * stripe boundaries*/
-#define XFSMNT_RETERR          0x00000400      /* return error to user */
-#define XFSMNT_NORECOVERY      0x00000800      /* no recovery, implies
-                                                * read-only mount */
-#define XFSMNT_SHARED          0x00001000      /* shared XFS mount */
-#define XFSMNT_IOSIZE          0x00002000      /* optimize for I/O size */
-#define XFSMNT_OSYNCISOSYNC    0x00004000      /* o_sync is REALLY o_sync */
-                                               /* (osyncisdsync is default) */
-#define XFSMNT_NOATTR2         0x00008000      /* turn off ATTR2 EA format */
-#define XFSMNT_32BITINODES     0x00200000      /* restrict inodes to 32
-                                                * bits of address space */
-#define XFSMNT_GQUOTA          0x00400000      /* group quota accounting */
-#define XFSMNT_GQUOTAENF       0x00800000      /* group quota limit
-                                                * enforcement */
-#define XFSMNT_NOUUID          0x01000000      /* Ignore fs uuid */
-#define XFSMNT_DMAPI           0x02000000      /* enable dmapi/xdsm */
-#define XFSMNT_BARRIER         0x04000000      /* use write barriers */
-#define XFSMNT_IKEEP           0x08000000      /* inode cluster delete */
-#define XFSMNT_SWALLOC         0x10000000      /* turn on stripe width
-                                                * allocation */
-#define XFSMNT_DIRSYNC         0x40000000      /* sync creat,link,unlink,rename
-                                                * symlink,mkdir,rmdir,mknod */
-#define XFSMNT_FLAGS2          0x80000000      /* more flags set in flags2 */
-
-/*
- * XFS mount option flags -- args->flags2
- */
-#define XFSMNT2_COMPAT_IOSIZE  0x00000001      /* don't report large preferred
-                                                * I/O size in stat(2) */
-#define XFSMNT2_FILESTREAMS    0x00000002      /* enable the filestreams
-                                                * allocator */
-
-#endif /* __XFS_CLNT_H__ */
index 8be0b00ede9aafc83970a23e4ad9c7df2565b54d..70b710c1792d33bfce9c94641b2e62016d48493e 100644 (file)
@@ -72,27 +72,7 @@ typedef struct xfs_da_intnode {
 typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
 typedef struct xfs_da_node_entry xfs_da_node_entry_t;
 
-#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
-
 #define        XFS_LBSIZE(mp)  (mp)->m_sb.sb_blocksize
-#define        XFS_LBLOG(mp)   (mp)->m_sb.sb_blocklog
-
-#define        XFS_DA_MAKE_BNOENTRY(mp,bno,entry)      \
-       (((bno) << (mp)->m_dircook_elog) | (entry))
-#define        XFS_DA_MAKE_COOKIE(mp,bno,entry,hash)   \
-       (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
-#define        XFS_DA_COOKIE_HASH(mp,cookie)           ((xfs_dahash_t)cookie)
-#define        XFS_DA_COOKIE_BNO(mp,cookie)            \
-       ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
-               (xfs_dablk_t)0 : \
-               (xfs_dablk_t)((xfs_off_t)(cookie) >> \
-                               ((mp)->m_dircook_elog + 32))))
-#define        XFS_DA_COOKIE_ENTRY(mp,cookie)          \
-       ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
-               (xfs_dablk_t)0 : \
-               (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
-                               ((1 << (mp)->m_dircook_elog) - 1))))
-
 
 /*========================================================================
  * Btree searching and modification structure definitions.
@@ -226,9 +206,8 @@ struct xfs_nameops {
 };
 
 
-#ifdef __KERNEL__
 /*========================================================================
- * Function prototypes for the kernel.
+ * Function prototypes.
  *========================================================================*/
 
 /*
@@ -289,6 +268,5 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
 
 extern struct kmem_zone *xfs_da_state_zone;
 extern struct kmem_zone *xfs_dabuf_zone;
-#endif /* __KERNEL__ */
 
 #endif /* __XFS_DA_BTREE_H__ */
index 75b0cd4da0ea30140bbdd3068c02ba94c751bd13..b4c1ee713492315f03f097ec66611c6f0b602d31 100644 (file)
@@ -49,9 +49,8 @@
  */
 int
 xfs_swapext(
-       xfs_swapext_t   __user *sxu)
+       xfs_swapext_t   *sxp)
 {
-       xfs_swapext_t   *sxp;
        xfs_inode_t     *ip, *tip;
        struct file     *file, *target_file;
        int             error = 0;
@@ -62,11 +61,6 @@ xfs_swapext(
                goto out;
        }
 
-       if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) {
-               error = XFS_ERROR(EFAULT);
-               goto out_free_sxp;
-       }
-
        /* Pull information for the target fd */
        file = fget((int)sxp->sx_fdtarget);
        if (!file) {
index da178205be689210ba520b9ec8c80d5bd584e04f..4f55a630655889714d437ce0ff69d978a6624846 100644 (file)
@@ -46,7 +46,7 @@ typedef struct xfs_swapext
 /*
  * Syscall interface for xfs_swapext
  */
-int    xfs_swapext(struct xfs_swapext __user *sx);
+int    xfs_swapext(struct xfs_swapext *sx);
 
 int    xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
                struct xfs_swapext *sxp);
index c9065eaf2a4d2a9eceea58fe0563f4377cfe0ad1..162e8726df5e65b9666cee344aadda997909d878 100644 (file)
 #ifndef __XFS_DINODE_H__
 #define        __XFS_DINODE_H__
 
-struct xfs_buf;
-struct xfs_mount;
+#define        XFS_DINODE_MAGIC                0x494e  /* 'IN' */
+#define XFS_DINODE_GOOD_VERSION(v)     (((v) == 1 || (v) == 2))
 
-#define        XFS_DINODE_VERSION_1    1
-#define        XFS_DINODE_VERSION_2    2
-#define XFS_DINODE_GOOD_VERSION(v)     \
-       (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2))
-#define        XFS_DINODE_MAGIC        0x494e  /* 'IN' */
-
-/*
- * Disk inode structure.
- * This is just the header; the inode is expanded to fill a variable size
- * with the last field expanding.  It is split into the core and "other"
- * because we only need the core part in the in-core inode.
- */
 typedef struct xfs_timestamp {
        __be32          t_sec;          /* timestamp seconds */
        __be32          t_nsec;         /* timestamp nanoseconds */
 } xfs_timestamp_t;
 
 /*
- * Note: Coordinate changes to this structure with the XFS_DI_* #defines
- * below, the offsets table in xfs_ialloc_log_di() and struct xfs_icdinode
- * in xfs_inode.h.
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core", the inode is expanded to fill a
+ * variable size the leftover area split into a data and an attribute fork.
+ * The format of the data and attribute fork depends on the format of the
+ * inode as indicated by di_format and di_aformat.  To access the data and
+ * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros
+ * below.
+ *
+ * There is a very similar struct icdinode in xfs_inode which matches the
+ * layout of the first 96 bytes of this structure, but is kept in native
+ * format instead of big endian.
  */
-typedef struct xfs_dinode_core {
+typedef struct xfs_dinode {
        __be16          di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
        __be16          di_mode;        /* mode and type of file */
        __u8            di_version;     /* inode version */
@@ -69,34 +66,12 @@ typedef struct xfs_dinode_core {
        __be16          di_dmstate;     /* DMIG state info */
        __be16          di_flags;       /* random flags, XFS_DIFLAG_... */
        __be32          di_gen;         /* generation number */
-} xfs_dinode_core_t;
 
-#define DI_MAX_FLUSH 0xffff
+       /* di_next_unlinked is the only non-core field in the old dinode */
+       __be32          di_next_unlinked;/* agi unlinked list ptr */
+} __attribute__((packed)) xfs_dinode_t;
 
-typedef struct xfs_dinode
-{
-       xfs_dinode_core_t       di_core;
-       /*
-        * In adding anything between the core and the union, be
-        * sure to update the macros like XFS_LITINO below and
-        * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
-        */
-       __be32                  di_next_unlinked;/* agi unlinked list ptr */
-       union {
-               xfs_bmdr_block_t di_bmbt;       /* btree root block */
-               xfs_bmbt_rec_32_t di_bmx[1];    /* extent list */
-               xfs_dir2_sf_t   di_dir2sf;      /* shortform directory v2 */
-               char            di_c[1];        /* local contents */
-               __be32          di_dev;         /* device for S_IFCHR/S_IFBLK */
-               uuid_t          di_muuid;       /* mount point value */
-               char            di_symlink[1];  /* local symbolic link */
-       }               di_u;
-       union {
-               xfs_bmdr_block_t di_abmbt;      /* btree root block */
-               xfs_bmbt_rec_32_t di_abmx[1];   /* extent list */
-               xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
-       }               di_a;
-} xfs_dinode_t;
+#define DI_MAX_FLUSH 0xffff
 
 /*
  * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
@@ -106,51 +81,15 @@ typedef struct xfs_dinode
 #define        XFS_MAXLINK             ((1U << 31) - 1U)
 #define        XFS_MAXLINK_1           65535U
 
-/*
- * Bit names for logging disk inodes only
- */
-#define        XFS_DI_MAGIC            0x0000001
-#define        XFS_DI_MODE             0x0000002
-#define        XFS_DI_VERSION          0x0000004
-#define        XFS_DI_FORMAT           0x0000008
-#define        XFS_DI_ONLINK           0x0000010
-#define        XFS_DI_UID              0x0000020
-#define        XFS_DI_GID              0x0000040
-#define        XFS_DI_NLINK            0x0000080
-#define        XFS_DI_PROJID           0x0000100
-#define        XFS_DI_PAD              0x0000200
-#define        XFS_DI_ATIME            0x0000400
-#define        XFS_DI_MTIME            0x0000800
-#define        XFS_DI_CTIME            0x0001000
-#define        XFS_DI_SIZE             0x0002000
-#define        XFS_DI_NBLOCKS          0x0004000
-#define        XFS_DI_EXTSIZE          0x0008000
-#define        XFS_DI_NEXTENTS         0x0010000
-#define        XFS_DI_NAEXTENTS        0x0020000
-#define        XFS_DI_FORKOFF          0x0040000
-#define        XFS_DI_AFORMAT          0x0080000
-#define        XFS_DI_DMEVMASK         0x0100000
-#define        XFS_DI_DMSTATE          0x0200000
-#define        XFS_DI_FLAGS            0x0400000
-#define        XFS_DI_GEN              0x0800000
-#define        XFS_DI_NEXT_UNLINKED    0x1000000
-#define        XFS_DI_U                0x2000000
-#define        XFS_DI_A                0x4000000
-#define        XFS_DI_NUM_BITS         27
-#define        XFS_DI_ALL_BITS         ((1 << XFS_DI_NUM_BITS) - 1)
-#define        XFS_DI_CORE_BITS        (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
-
 /*
  * Values for di_format
  */
-typedef enum xfs_dinode_fmt
-{
-       XFS_DINODE_FMT_DEV,             /* CHR, BLK: di_dev */
-       XFS_DINODE_FMT_LOCAL,           /* DIR, REG: di_c */
-                                       /* LNK: di_symlink */
-       XFS_DINODE_FMT_EXTENTS,         /* DIR, REG, LNK: di_bmx */
-       XFS_DINODE_FMT_BTREE,           /* DIR, REG, LNK: di_bmbt */
-       XFS_DINODE_FMT_UUID             /* MNT: di_uuid */
+typedef enum xfs_dinode_fmt {
+       XFS_DINODE_FMT_DEV,             /* xfs_dev_t */
+       XFS_DINODE_FMT_LOCAL,           /* bulk data */
+       XFS_DINODE_FMT_EXTENTS,         /* struct xfs_bmbt_rec */
+       XFS_DINODE_FMT_BTREE,           /* struct xfs_bmdr_block */
+       XFS_DINODE_FMT_UUID             /* uuid_t */
 } xfs_dinode_fmt_t;
 
 /*
@@ -166,13 +105,13 @@ typedef enum xfs_dinode_fmt
  */
 #define        XFS_LITINO(mp)  ((mp)->m_litino)
 #define        XFS_BROOT_SIZE_ADJ      \
-       (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
+       (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
 
 /*
  * Inode data & attribute fork sizes, per inode.
  */
-#define XFS_DFORK_Q(dip)               ((dip)->di_core.di_forkoff != 0)
-#define XFS_DFORK_BOFF(dip)            ((int)((dip)->di_core.di_forkoff << 3))
+#define XFS_DFORK_Q(dip)               ((dip)->di_forkoff != 0)
+#define XFS_DFORK_BOFF(dip)            ((int)((dip)->di_forkoff << 3))
 
 #define XFS_DFORK_DSIZE(dip,mp) \
        (XFS_DFORK_Q(dip) ? \
@@ -187,22 +126,41 @@ typedef enum xfs_dinode_fmt
                XFS_DFORK_DSIZE(dip, mp) : \
                XFS_DFORK_ASIZE(dip, mp))
 
-#define XFS_DFORK_DPTR(dip)                ((dip)->di_u.di_c)
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+       ((char *)(dip) + sizeof(struct xfs_dinode))
 #define XFS_DFORK_APTR(dip)    \
-       ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip))
+       (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
 #define XFS_DFORK_PTR(dip,w)   \
        ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+
 #define XFS_DFORK_FORMAT(dip,w) \
        ((w) == XFS_DATA_FORK ? \
-               (dip)->di_core.di_format : \
-               (dip)->di_core.di_aformat)
+               (dip)->di_format : \
+               (dip)->di_aformat)
 #define XFS_DFORK_NEXTENTS(dip,w) \
        ((w) == XFS_DATA_FORK ? \
-               be32_to_cpu((dip)->di_core.di_nextents) : \
-               be16_to_cpu((dip)->di_core.di_anextents))
+               be32_to_cpu((dip)->di_nextents) : \
+               be16_to_cpu((dip)->di_anextents))
 
 #define        XFS_BUF_TO_DINODE(bp)   ((xfs_dinode_t *)XFS_BUF_PTR(bp))
 
+/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+       return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
+
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+       *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
+
 /*
  * Values for di_flags
  * There should be a one-to-one correspondence between these flags and the
index deecc9d238f874b221086769c7eca5598826ca60..6ac44b550d39d5ff7b0ecd02965e8d8ae8daef04 100644 (file)
@@ -33,13 +33,6 @@ struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
 
-/*
- * Maximum size of a shortform directory.
- */
-#define        XFS_DIR2_SF_MAX_SIZE    \
-       (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
-        (uint)sizeof(xfs_agino_t))
-
 /*
  * Inode number stored as 8 8-bit values.
  */
index a1e55fb9d5dde43c95565c1aece67d4345232391..e71e2581c0c34b7e3febea7b907d41c5b82c979d 100644 (file)
@@ -25,7 +25,6 @@
 #include "xfs_inum.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_clnt.h"
 
 
 static struct xfs_dmops xfs_dmcore_stub = {
@@ -38,9 +37,9 @@ static struct xfs_dmops xfs_dmcore_stub = {
 };
 
 int
-xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args)
+xfs_dmops_get(struct xfs_mount *mp)
 {
-       if (args->flags & XFSMNT_DMAPI) {
+       if (mp->m_flags & XFS_MOUNT_DMAPI) {
                cmn_err(CE_WARN,
                        "XFS: dmapi support not available in this kernel.");
                return EINVAL;
index f227ecd1a2942714e1d062c142c27f14781613a1..92d5cd5bf4f2329a902247fc3dbaf2fa521e16bb 100644 (file)
@@ -153,21 +153,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
 }
 #endif /* DEBUG */
 
-static void
-xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
-{
-       if (mp != NULL) {
-               char    *newfmt;
-               int     len = 16 + mp->m_fsname_len + strlen(fmt);
-
-               newfmt = kmem_alloc(len, KM_SLEEP);
-               sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
-               icmn_err(level, newfmt, ap);
-               kmem_free(newfmt);
-       } else {
-               icmn_err(level, fmt, ap);
-       }
-}
 
 void
 xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
index 11543f10b0c6d10780846dcf1fedd65b3821f192..0c93051c46515efb60a898c9c5ac4af4b826be39 100644 (file)
@@ -159,11 +159,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
 #define                XFS_PTAG_FSBLOCK_ZERO           0x00000080
 
 struct xfs_mount;
-/* PRINTFLIKE4 */
+
+extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
+               char *fmt, va_list ap)
+       __attribute__ ((format (printf, 3, 0)));
 extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
-                       char *fmt, ...);
-/* PRINTFLIKE3 */
-extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...);
+                       char *fmt, ...)
+       __attribute__ ((format (printf, 4, 5)));
+extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
+       __attribute__ ((format (printf, 3, 4)));
 
 extern void xfs_hex_dump(void *p, int length);
 
index 8aa28f751b2a6cb54bf60762772079820f74426c..05a4bdd4be39a09db4cec3f18e7d9200838ec6b4 100644 (file)
@@ -108,19 +108,16 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
 STATIC void
 xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
 {
-       xfs_mount_t     *mp;
+       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
 
-       mp = efip->efi_item.li_mountp;
-       spin_lock(&mp->m_ail_lock);
+       spin_lock(&ailp->xa_lock);
        if (efip->efi_flags & XFS_EFI_CANCELED) {
-               /*
-                * xfs_trans_delete_ail() drops the AIL lock.
-                */
-               xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
                xfs_efi_item_free(efip);
        } else {
                efip->efi_flags |= XFS_EFI_COMMITTED;
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
        }
 }
 
@@ -134,26 +131,23 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
 STATIC void
 xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
 {
-       xfs_mount_t     *mp;
+       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
        xfs_log_item_desc_t     *lidp;
 
-       mp = efip->efi_item.li_mountp;
-       spin_lock(&mp->m_ail_lock);
+       spin_lock(&ailp->xa_lock);
        if (efip->efi_flags & XFS_EFI_CANCELED) {
                /*
                 * free the xaction descriptor pointing to this item
                 */
                lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
                xfs_trans_free_item(tp, lidp);
-               /*
-                * pull the item off the AIL.
-                * xfs_trans_delete_ail() drops the AIL lock.
-                */
-               xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
                xfs_efi_item_free(efip);
        } else {
                efip->efi_flags |= XFS_EFI_COMMITTED;
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
        }
 }
 
@@ -268,6 +262,7 @@ xfs_efi_init(xfs_mount_t    *mp,
        efip->efi_item.li_type = XFS_LI_EFI;
        efip->efi_item.li_ops = &xfs_efi_item_ops;
        efip->efi_item.li_mountp = mp;
+       efip->efi_item.li_ailp = mp->m_ail;
        efip->efi_format.efi_nextents = nextents;
        efip->efi_format.efi_id = (__psint_t)(void*)efip;
 
@@ -345,25 +340,22 @@ void
 xfs_efi_release(xfs_efi_log_item_t     *efip,
                uint                    nextents)
 {
-       xfs_mount_t     *mp;
-       int             extents_left;
+       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
+       int                     extents_left;
 
-       mp = efip->efi_item.li_mountp;
        ASSERT(efip->efi_next_extent > 0);
        ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
 
-       spin_lock(&mp->m_ail_lock);
+       spin_lock(&ailp->xa_lock);
        ASSERT(efip->efi_next_extent >= nextents);
        efip->efi_next_extent -= nextents;
        extents_left = efip->efi_next_extent;
        if (extents_left == 0) {
-               /*
-                * xfs_trans_delete_ail() drops the AIL lock.
-                */
-               xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
                xfs_efi_item_free(efip);
        } else {
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
        }
 }
 
@@ -565,6 +557,7 @@ xfs_efd_init(xfs_mount_t    *mp,
        efdp->efd_item.li_type = XFS_LI_EFD;
        efdp->efd_item.li_ops = &xfs_efd_item_ops;
        efdp->efd_item.li_mountp = mp;
+       efdp->efd_item.li_ailp = mp->m_ail;
        efdp->efd_efip = efip;
        efdp->efd_format.efd_nextents = nextents;
        efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
index 01c0cc88d3f37055bc071dc545ca731509361bc6..589c41c3844638c2ddbc6f07e9c5a4df1d3c9634 100644 (file)
@@ -113,22 +113,14 @@ struct getbmapx {
 #define BMV_IF_ATTRFORK                0x1     /* return attr fork rather than data */
 #define BMV_IF_NO_DMAPI_READ   0x2     /* Do not generate DMAPI read event  */
 #define BMV_IF_PREALLOC                0x4     /* rtn status BMV_OF_PREALLOC if req */
-#define BMV_IF_VALID   (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC)
-#ifdef __KERNEL__
-#define BMV_IF_EXTENDED 0x40000000     /* getpmapx if set */
-#endif
+#define BMV_IF_DELALLOC                0x8     /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_VALID   \
+       (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
 
 /*     bmv_oflags values - returned for for each non-header segment */
 #define BMV_OF_PREALLOC                0x1     /* segment = unwritten pre-allocation */
-
-/*     Convert getbmap <-> getbmapx - move fields from p1 to p2. */
-#define GETBMAP_CONVERT(p1,p2) {       \
-       p2.bmv_offset = p1.bmv_offset;  \
-       p2.bmv_block = p1.bmv_block;    \
-       p2.bmv_length = p1.bmv_length;  \
-       p2.bmv_count = p1.bmv_count;    \
-       p2.bmv_entries = p1.bmv_entries;  }
-
+#define BMV_OF_DELALLOC                0x2     /* segment = delayed allocation */
+#define BMV_OF_LAST            0x4     /* segment is the last in the file */
 
 /*
  * Structure for XFS_IOC_FSSETDM.
@@ -426,10 +418,6 @@ typedef struct xfs_handle {
 #define XFS_IOC_GETXFLAGS      FS_IOC_GETFLAGS
 #define XFS_IOC_SETXFLAGS      FS_IOC_SETFLAGS
 #define XFS_IOC_GETVERSION     FS_IOC_GETVERSION
-/* 32-bit compat counterparts */
-#define XFS_IOC32_GETXFLAGS    FS_IOC32_GETFLAGS
-#define XFS_IOC32_SETXFLAGS    FS_IOC32_SETFLAGS
-#define XFS_IOC32_GETVERSION   FS_IOC32_GETVERSION
 
 /*
  * ioctl commands that replace IRIX fcntl()'s
index 84583cf73db329ac156986cfc9476f210a48fdda..852b6d32e8d02404c2d831bd311ba7c2d85bd45b 100644 (file)
@@ -126,7 +126,7 @@ xfs_growfs_data_private(
        xfs_extlen_t            agsize;
        xfs_extlen_t            tmpsize;
        xfs_alloc_rec_t         *arec;
-       xfs_btree_sblock_t      *block;
+       struct xfs_btree_block  *block;
        xfs_buf_t               *bp;
        int                     bucket;
        int                     dpct;
@@ -251,14 +251,14 @@ xfs_growfs_data_private(
                bp = xfs_buf_get(mp->m_ddev_targp,
                        XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
                        BTOBB(mp->m_sb.sb_blocksize), 0);
-               block = XFS_BUF_TO_SBLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                memset(block, 0, mp->m_sb.sb_blocksize);
                block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
                block->bb_level = 0;
                block->bb_numrecs = cpu_to_be16(1);
-               block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-               arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1);
+               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+               arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
                arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
                arec->ar_blockcount = cpu_to_be32(
                        agsize - be32_to_cpu(arec->ar_startblock));
@@ -272,14 +272,14 @@ xfs_growfs_data_private(
                bp = xfs_buf_get(mp->m_ddev_targp,
                        XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
                        BTOBB(mp->m_sb.sb_blocksize), 0);
-               block = XFS_BUF_TO_SBLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                memset(block, 0, mp->m_sb.sb_blocksize);
                block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
                block->bb_level = 0;
                block->bb_numrecs = cpu_to_be16(1);
-               block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-               arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1);
+               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+               arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
                arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
                arec->ar_blockcount = cpu_to_be32(
                        agsize - be32_to_cpu(arec->ar_startblock));
@@ -294,13 +294,13 @@ xfs_growfs_data_private(
                bp = xfs_buf_get(mp->m_ddev_targp,
                        XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
                        BTOBB(mp->m_sb.sb_blocksize), 0);
-               block = XFS_BUF_TO_SBLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                memset(block, 0, mp->m_sb.sb_blocksize);
                block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
                block->bb_level = 0;
                block->bb_numrecs = 0;
-               block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
                error = xfs_bwrite(mp, bp);
                if (error) {
                        goto error0;
@@ -435,6 +435,9 @@ xfs_growfs_data(
        xfs_growfs_data_t       *in)
 {
        int error;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return XFS_ERROR(EPERM);
        if (!mutex_trylock(&mp->m_growlock))
                return XFS_ERROR(EWOULDBLOCK);
        error = xfs_growfs_data_private(mp, in);
@@ -448,6 +451,9 @@ xfs_growfs_log(
        xfs_growfs_log_t        *in)
 {
        int error;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return XFS_ERROR(EPERM);
        if (!mutex_trylock(&mp->m_growlock))
                return XFS_ERROR(EWOULDBLOCK);
        error = xfs_growfs_log_private(mp, in);
index aad8c5da38afadb158518d7ee5dc51a3309726f2..e6ebbaeb4dc6ac6e62460e8f04bae3a29b3d7adf 100644 (file)
 #include "xfs_error.h"
 #include "xfs_bmap.h"
 
-/*
- * Log specified fields for the inode given by bp and off.
- */
-STATIC void
-xfs_ialloc_log_di(
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_buf_t       *bp,            /* inode buffer */
-       int             off,            /* index of inode in buffer */
-       int             fields)         /* bitmask of fields to log */
-{
-       int                     first;          /* first byte number */
-       int                     ioffset;        /* off in bytes */
-       int                     last;           /* last byte number */
-       xfs_mount_t             *mp;            /* mount point structure */
-       static const short      offsets[] = {   /* field offsets */
-                                               /* keep in sync with bits */
-               offsetof(xfs_dinode_core_t, di_magic),
-               offsetof(xfs_dinode_core_t, di_mode),
-               offsetof(xfs_dinode_core_t, di_version),
-               offsetof(xfs_dinode_core_t, di_format),
-               offsetof(xfs_dinode_core_t, di_onlink),
-               offsetof(xfs_dinode_core_t, di_uid),
-               offsetof(xfs_dinode_core_t, di_gid),
-               offsetof(xfs_dinode_core_t, di_nlink),
-               offsetof(xfs_dinode_core_t, di_projid),
-               offsetof(xfs_dinode_core_t, di_pad),
-               offsetof(xfs_dinode_core_t, di_atime),
-               offsetof(xfs_dinode_core_t, di_mtime),
-               offsetof(xfs_dinode_core_t, di_ctime),
-               offsetof(xfs_dinode_core_t, di_size),
-               offsetof(xfs_dinode_core_t, di_nblocks),
-               offsetof(xfs_dinode_core_t, di_extsize),
-               offsetof(xfs_dinode_core_t, di_nextents),
-               offsetof(xfs_dinode_core_t, di_anextents),
-               offsetof(xfs_dinode_core_t, di_forkoff),
-               offsetof(xfs_dinode_core_t, di_aformat),
-               offsetof(xfs_dinode_core_t, di_dmevmask),
-               offsetof(xfs_dinode_core_t, di_dmstate),
-               offsetof(xfs_dinode_core_t, di_flags),
-               offsetof(xfs_dinode_core_t, di_gen),
-               offsetof(xfs_dinode_t, di_next_unlinked),
-               offsetof(xfs_dinode_t, di_u),
-               offsetof(xfs_dinode_t, di_a),
-               sizeof(xfs_dinode_t)
-       };
-
-
-       ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
-       ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
-       mp = tp->t_mountp;
-       /*
-        * Get the inode-relative first and last bytes for these fields
-        */
-       xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
-       /*
-        * Convert to buffer offsets and log it.
-        */
-       ioffset = off << mp->m_sb.sb_inodelog;
-       first += ioffset;
-       last += ioffset;
-       xfs_trans_log_buf(tp, bp, first, last);
-}
 
 /*
  * Allocation group level functions.
@@ -118,6 +56,102 @@ xfs_ialloc_cluster_alignment(
        return 1;
 }
 
+/*
+ * Lookup the record equal to ino in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_inobt_lookup_eq(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       __int32_t               fcnt,   /* free inode count */
+       xfs_inofree_t           free,   /* free inode mask */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_freecount = fcnt;
+       cur->bc_rec.i.ir_free = free;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int                                    /* error */
+xfs_inobt_lookup_ge(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       __int32_t               fcnt,   /* free inode count */
+       xfs_inofree_t           free,   /* free inode mask */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_freecount = fcnt;
+       cur->bc_rec.i.ir_free = free;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int                                    /* error */
+xfs_inobt_lookup_le(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       __int32_t               fcnt,   /* free inode count */
+       xfs_inofree_t           free,   /* free inode mask */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_freecount = fcnt;
+       cur->bc_rec.i.ir_free = free;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [ino, fcnt, free].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int                             /* error */
+xfs_inobt_update(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       __int32_t               fcnt,   /* free inode count */
+       xfs_inofree_t           free)   /* free inode mask */
+{
+       union xfs_btree_rec     rec;
+
+       rec.inobt.ir_startino = cpu_to_be32(ino);
+       rec.inobt.ir_freecount = cpu_to_be32(fcnt);
+       rec.inobt.ir_free = cpu_to_be64(free);
+       return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int                                    /* error */
+xfs_inobt_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             *ino,   /* output: starting inode of chunk */
+       __int32_t               *fcnt,  /* output: number of free inodes */
+       xfs_inofree_t           *free,  /* output: free inode mask */
+       int                     *stat)  /* output: success/failure */
+{
+       union xfs_btree_rec     *rec;
+       int                     error;
+
+       error = xfs_btree_get_rec(cur, &rec, stat);
+       if (!error && *stat == 1) {
+               *ino = be32_to_cpu(rec->inobt.ir_startino);
+               *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
+               *free = be64_to_cpu(rec->inobt.ir_free);
+       }
+       return error;
+}
+
 /*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
@@ -287,9 +321,9 @@ xfs_ialloc_ag_alloc(
         * able to use the file system.
         */
        if (xfs_sb_version_hasnlink(&args.mp->m_sb))
-               version = XFS_DINODE_VERSION_2;
+               version = 2;
        else
-               version = XFS_DINODE_VERSION_1;
+               version = 1;
 
        /*
         * Seed the new inode cluster with a random generation number. This
@@ -310,18 +344,25 @@ xfs_ialloc_ag_alloc(
                                         XFS_BUF_LOCK);
                ASSERT(fbuf);
                ASSERT(!XFS_BUF_GETERROR(fbuf));
+
                /*
-                * Set initial values for the inodes in this buffer.
+                * Initialize all inodes in this buffer and then log them.
+                *
+                * XXX: It would be much better if we had just one transaction to
+                *      log a whole cluster of inodes instead of all the indivdual
+                *      transactions causing a lot of log traffic.
                 */
                xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
                for (i = 0; i < ninodes; i++) {
+                       int     ioffset = i << args.mp->m_sb.sb_inodelog;
+                       uint    isize = sizeof(struct xfs_dinode);
+
                        free = XFS_MAKE_IPTR(args.mp, fbuf, i);
-                       free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
-                       free->di_core.di_version = version;
-                       free->di_core.di_gen = cpu_to_be32(gen);
+                       free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+                       free->di_version = version;
+                       free->di_gen = cpu_to_be32(gen);
                        free->di_next_unlinked = cpu_to_be32(NULLAGINO);
-                       xfs_ialloc_log_di(tp, fbuf, i,
-                               XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
+                       xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
                }
                xfs_trans_inode_alloc_buf(tp, fbuf);
        }
@@ -335,8 +376,7 @@ xfs_ialloc_ag_alloc(
        /*
         * Insert records describing the new inode chunk into the btree.
         */
-       cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno,
-                       XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+       cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
        for (thisino = newino;
             thisino < newino + newlen;
             thisino += XFS_INODES_PER_CHUNK) {
@@ -346,7 +386,7 @@ xfs_ialloc_ag_alloc(
                        return error;
                }
                ASSERT(i == 0);
-               if ((error = xfs_inobt_insert(cur, &i))) {
+               if ((error = xfs_btree_insert(cur, &i))) {
                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
                        return error;
                }
@@ -676,8 +716,7 @@ nextag:
         */
        agno = tagno;
        *IO_agbp = NULL;
-       cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno),
-                                   XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
        /*
         * If pagino is 0 (this is the root inode allocation) use newino.
         * This must work because we've just allocated some.
@@ -697,7 +736,7 @@ nextag:
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                        freecount += rec.ir_freecount;
-                       if ((error = xfs_inobt_increment(cur, 0, &i)))
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
                                goto error0;
                } while (i == 1);
 
@@ -741,7 +780,7 @@ nextag:
                        /*
                         * Search left with tcur, back up 1 record.
                         */
-                       if ((error = xfs_inobt_decrement(tcur, 0, &i)))
+                       if ((error = xfs_btree_decrement(tcur, 0, &i)))
                                goto error1;
                        doneleft = !i;
                        if (!doneleft) {
@@ -755,7 +794,7 @@ nextag:
                        /*
                         * Search right with cur, go forward 1 record.
                         */
-                       if ((error = xfs_inobt_increment(cur, 0, &i)))
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
                                goto error1;
                        doneright = !i;
                        if (!doneright) {
@@ -817,7 +856,7 @@ nextag:
                                 * further left.
                                 */
                                if (useleft) {
-                                       if ((error = xfs_inobt_decrement(tcur, 0,
+                                       if ((error = xfs_btree_decrement(tcur, 0,
                                                        &i)))
                                                goto error1;
                                        doneleft = !i;
@@ -837,7 +876,7 @@ nextag:
                                 * further right.
                                 */
                                else {
-                                       if ((error = xfs_inobt_increment(cur, 0,
+                                       if ((error = xfs_btree_increment(cur, 0,
                                                        &i)))
                                                goto error1;
                                        doneright = !i;
@@ -892,7 +931,7 @@ nextag:
                                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                                if (rec.ir_freecount > 0)
                                        break;
-                               if ((error = xfs_inobt_increment(cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
                                        goto error0;
                                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                        }
@@ -926,7 +965,7 @@ nextag:
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                        freecount += rec.ir_freecount;
-                       if ((error = xfs_inobt_increment(cur, 0, &i)))
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
                                goto error0;
                } while (i == 1);
                ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
@@ -1022,8 +1061,7 @@ xfs_difree(
        /*
         * Initialize the cursor.
         */
-       cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
-               (xfs_inode_t *)0, 0);
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
 #ifdef DEBUG
        if (cur->bc_nlevels == 1) {
                int freecount = 0;
@@ -1036,7 +1074,7 @@ xfs_difree(
                                goto error0;
                        if (i) {
                                freecount += rec.ir_freecount;
-                               if ((error = xfs_inobt_increment(cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
                                        goto error0;
                        }
                } while (i == 1);
@@ -1098,8 +1136,8 @@ xfs_difree(
                xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
 
-               if ((error = xfs_inobt_delete(cur, &i))) {
-                       cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n",
+               if ((error = xfs_btree_delete(cur, &i))) {
+                       cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
                                error, mp->m_fsname);
                        goto error0;
                }
@@ -1141,7 +1179,7 @@ xfs_difree(
                                goto error0;
                        if (i) {
                                freecount += rec.ir_freecount;
-                               if ((error = xfs_inobt_increment(cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
                                        goto error0;
                        }
                } while (i == 1);
@@ -1158,36 +1196,28 @@ error0:
 }
 
 /*
- * Return the location of the inode in bno/off, for mapping it into a buffer.
+ * Return the location of the inode in imap, for mapping it into a buffer.
  */
-/*ARGSUSED*/
 int
-xfs_dilocate(
-       xfs_mount_t     *mp,    /* file system mount structure */
-       xfs_trans_t     *tp,    /* transaction pointer */
+xfs_imap(
+       xfs_mount_t      *mp,   /* file system mount structure */
+       xfs_trans_t      *tp,   /* transaction pointer */
        xfs_ino_t       ino,    /* inode to locate */
-       xfs_fsblock_t   *bno,   /* output: block containing inode */
-       int             *len,   /* output: num blocks in inode cluster */
-       int             *off,   /* output: index in block of inode */
-       uint            flags)  /* flags concerning inode lookup */
+       struct xfs_imap *imap,  /* location map structure */
+       uint            flags)  /* flags for inode btree lookup */
 {
        xfs_agblock_t   agbno;  /* block number of inode in the alloc group */
-       xfs_buf_t       *agbp;  /* agi buffer */
        xfs_agino_t     agino;  /* inode number within alloc group */
        xfs_agnumber_t  agno;   /* allocation group number */
        int             blks_per_cluster; /* num blocks per inode cluster */
        xfs_agblock_t   chunk_agbno;    /* first block in inode chunk */
-       xfs_agino_t     chunk_agino;    /* first agino in inode chunk */
-       __int32_t       chunk_cnt;      /* count of free inodes in chunk */
-       xfs_inofree_t   chunk_free;     /* mask of free inodes in chunk */
        xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
-       xfs_btree_cur_t *cur;   /* inode btree cursor */
        int             error;  /* error code */
-       int             i;      /* temp state */
        int             offset; /* index of inode in its buffer */
        int             offset_agbno;   /* blks from chunk start to inode */
 
        ASSERT(ino != NULLFSINO);
+
        /*
         * Split up the inode number into its parts.
         */
@@ -1198,24 +1228,24 @@ xfs_dilocate(
            ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 #ifdef DEBUG
                /* no diagnostics for bulkstat, ino comes from userspace */
-               if (flags & XFS_IMAP_BULKSTAT)
+               if (flags & XFS_IGET_BULKSTAT)
                        return XFS_ERROR(EINVAL);
                if (agno >= mp->m_sb.sb_agcount) {
                        xfs_fs_cmn_err(CE_ALERT, mp,
-                                       "xfs_dilocate: agno (%d) >= "
+                                       "xfs_imap: agno (%d) >= "
                                        "mp->m_sb.sb_agcount (%d)",
                                        agno,  mp->m_sb.sb_agcount);
                }
                if (agbno >= mp->m_sb.sb_agblocks) {
                        xfs_fs_cmn_err(CE_ALERT, mp,
-                                       "xfs_dilocate: agbno (0x%llx) >= "
+                                       "xfs_imap: agbno (0x%llx) >= "
                                        "mp->m_sb.sb_agblocks (0x%lx)",
                                        (unsigned long long) agbno,
                                        (unsigned long) mp->m_sb.sb_agblocks);
                }
                if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
                        xfs_fs_cmn_err(CE_ALERT, mp,
-                                       "xfs_dilocate: ino (0x%llx) != "
+                                       "xfs_imap: ino (0x%llx) != "
                                        "XFS_AGINO_TO_INO(mp, agno, agino) "
                                        "(0x%llx)",
                                        ino, XFS_AGINO_TO_INO(mp, agno, agino));
@@ -1224,65 +1254,89 @@ xfs_dilocate(
 #endif /* DEBUG */
                return XFS_ERROR(EINVAL);
        }
-       if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) ||
-           !(flags & XFS_IMAP_LOOKUP)) {
+
+       /*
+        * If the inode cluster size is the same as the blocksize or
+        * smaller we get to the buffer by simple arithmetics.
+        */
+       if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
                offset = XFS_INO_TO_OFFSET(mp, ino);
                ASSERT(offset < mp->m_sb.sb_inopblock);
-               *bno = XFS_AGB_TO_FSB(mp, agno, agbno);
-               *off = offset;
-               *len = 1;
+
+               imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+               imap->im_len = XFS_FSB_TO_BB(mp, 1);
+               imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
                return 0;
        }
+
        blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
-       if (*bno != NULLFSBLOCK) {
+
+       /*
+        * If we get a block number passed from bulkstat we can use it to
+        * find the buffer easily.
+        */
+       if (imap->im_blkno) {
                offset = XFS_INO_TO_OFFSET(mp, ino);
                ASSERT(offset < mp->m_sb.sb_inopblock);
-               cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno);
-               *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
-                       offset;
-               *len = blks_per_cluster;
+
+               cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
+               offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
+
+               imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+               imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
                return 0;
        }
+
+       /*
+        * If the inode chunks are aligned then use simple maths to
+        * find the location. Otherwise we have to do a btree
+        * lookup to find the location.
+        */
        if (mp->m_inoalign_mask) {
                offset_agbno = agbno & mp->m_inoalign_mask;
                chunk_agbno = agbno - offset_agbno;
        } else {
+               xfs_btree_cur_t *cur;   /* inode btree cursor */
+               xfs_agino_t     chunk_agino; /* first agino in inode chunk */
+               __int32_t       chunk_cnt; /* count of free inodes in chunk */
+               xfs_inofree_t   chunk_free; /* mask of free inodes in chunk */
+               xfs_buf_t       *agbp;  /* agi buffer */
+               int             i;      /* temp state */
+
                down_read(&mp->m_peraglock);
                error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
                up_read(&mp->m_peraglock);
                if (error) {
-#ifdef DEBUG
-                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
                                        "xfs_ialloc_read_agi() returned "
                                        "error %d, agno %d",
                                        error, agno);
-#endif /* DEBUG */
                        return error;
                }
-               cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
-                       (xfs_inode_t *)0, 0);
-               if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
-#ifdef DEBUG
-                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+
+               cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+               error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
+               if (error) {
+                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
                                        "xfs_inobt_lookup_le() failed");
-#endif /* DEBUG */
                        goto error0;
                }
-               if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
-                               &chunk_free, &i))) {
-#ifdef DEBUG
-                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+
+               error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
+                               &chunk_free, &i);
+               if (error) {
+                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
                                        "xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
                        goto error0;
                }
                if (i == 0) {
 #ifdef DEBUG
-                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
                                        "xfs_inobt_get_rec() failed");
 #endif /* DEBUG */
                        error = XFS_ERROR(EINVAL);
                }
+ error0:
                xfs_trans_brelse(tp, agbp);
                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
                if (error)
@@ -1290,19 +1344,35 @@ xfs_dilocate(
                chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
                offset_agbno = agbno - chunk_agbno;
        }
+
        ASSERT(agbno >= chunk_agbno);
        cluster_agbno = chunk_agbno +
                ((offset_agbno / blks_per_cluster) * blks_per_cluster);
        offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
                XFS_INO_TO_OFFSET(mp, ino);
-       *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno);
-       *off = offset;
-       *len = blks_per_cluster;
+
+       imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+       imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+       imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+       /*
+        * If the inode number maps to a block outside the bounds
+        * of the file system then return NULL rather than calling
+        * read_buf and panicing when we get an error from the
+        * driver.
+        */
+       if ((imap->im_blkno + imap->im_len) >
+           XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+               xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+                       "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
+                       " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
+                       (unsigned long long) imap->im_blkno,
+                       (unsigned long long) imap->im_len,
+                       XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+               return XFS_ERROR(EINVAL);
+       }
+
        return 0;
-error0:
-       xfs_trans_brelse(tp, agbp);
-       xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
-       return error;
 }
 
 /*
@@ -1370,70 +1440,95 @@ xfs_ialloc_log_agi(
        xfs_trans_log_buf(tp, bp, first, last);
 }
 
+#ifdef DEBUG
+STATIC void
+xfs_check_agi_unlinked(
+       struct xfs_agi          *agi)
+{
+       int                     i;
+
+       for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+               ASSERT(agi->agi_unlinked[i]);
+}
+#else
+#define xfs_check_agi_unlinked(agi)
+#endif
+
 /*
  * Read in the allocation group header (inode allocation section)
  */
 int
-xfs_ialloc_read_agi(
-       xfs_mount_t     *mp,            /* file system mount structure */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_agnumber_t  agno,           /* allocation group number */
-       xfs_buf_t       **bpp)          /* allocation group hdr buf */
+xfs_read_agi(
+       struct xfs_mount        *mp,    /* file system mount structure */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       struct xfs_buf          **bpp)  /* allocation group hdr buf */
 {
-       xfs_agi_t       *agi;           /* allocation group header */
-       int             agi_ok;         /* agi is consistent */
-       xfs_buf_t       *bp;            /* allocation group hdr buf */
-       xfs_perag_t     *pag;           /* per allocation group data */
-       int             error;
+       struct xfs_agi          *agi;   /* allocation group header */
+       int                     agi_ok; /* agi is consistent */
+       int                     error;
 
        ASSERT(agno != NULLAGNUMBER);
-       error = xfs_trans_read_buf(
-                       mp, tp, mp->m_ddev_targp,
+
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                        XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-                       XFS_FSS_TO_BB(mp, 1), 0, &bp);
+                       XFS_FSS_TO_BB(mp, 1), 0, bpp);
        if (error)
                return error;
-       ASSERT(bp && !XFS_BUF_GETERROR(bp));
+
+       ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp));
+       agi = XFS_BUF_TO_AGI(*bpp);
 
        /*
         * Validate the magic number of the agi block.
         */
-       agi = XFS_BUF_TO_AGI(bp);
-       agi_ok =
-               be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
-               XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+       agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
+               XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
+               be32_to_cpu(agi->agi_seqno) == agno;
        if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
                        XFS_RANDOM_IALLOC_READ_AGI))) {
-               XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW,
+               XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
                                     mp, agi);
-               xfs_trans_brelse(tp, bp);
+               xfs_trans_brelse(tp, *bpp);
                return XFS_ERROR(EFSCORRUPTED);
        }
+
+       XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
+
+       xfs_check_agi_unlinked(agi);
+       return 0;
+}
+
+int
+xfs_ialloc_read_agi(
+       struct xfs_mount        *mp,    /* file system mount structure */
+       struct xfs_trans        *tp,    /* transaction pointer */
+       xfs_agnumber_t          agno,   /* allocation group number */
+       struct xfs_buf          **bpp)  /* allocation group hdr buf */
+{
+       struct xfs_agi          *agi;   /* allocation group header */
+       struct xfs_perag        *pag;   /* per allocation group data */
+       int                     error;
+
+       error = xfs_read_agi(mp, tp, agno, bpp);
+       if (error)
+               return error;
+
+       agi = XFS_BUF_TO_AGI(*bpp);
        pag = &mp->m_perag[agno];
+
        if (!pag->pagi_init) {
                pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
                pag->pagi_count = be32_to_cpu(agi->agi_count);
                pag->pagi_init = 1;
-       } else {
-               /*
-                * It's possible for these to be out of sync if
-                * we are in the middle of a forced shutdown.
-                */
-               ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
-                       XFS_FORCED_SHUTDOWN(mp));
        }
 
-#ifdef DEBUG
-       {
-               int     i;
-
-               for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
-                       ASSERT(agi->agi_unlinked[i]);
-       }
-#endif
-
-       XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
-       *bpp = bp;
+       /*
+        * It's possible for these to be out of sync if
+        * we are in the middle of a forced shutdown.
+        */
+       ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+               XFS_FORCED_SHUTDOWN(mp));
        return 0;
 }
 
index 4e30ec1d13bc35b3fd3af4b0b0ad50bde967ad58..50f558a4e0a861d8e1cf9b42018af0dc953d5dcd 100644 (file)
@@ -20,6 +20,7 @@
 
 struct xfs_buf;
 struct xfs_dinode;
+struct xfs_imap;
 struct xfs_mount;
 struct xfs_trans;
 
@@ -56,7 +57,6 @@ static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
 }
 
 
-#ifdef __KERNEL__
 /*
  * Allocate an inode on disk.
  * Mode is used to tell whether the new inode will need space, and whether
@@ -105,17 +105,14 @@ xfs_difree(
        xfs_ino_t       *first_ino);    /* first inode in deleted cluster */
 
 /*
- * Return the location of the inode in bno/len/off,
- * for mapping it into a buffer.
+ * Return the location of the inode in imap, for mapping it into a buffer.
  */
 int
-xfs_dilocate(
+xfs_imap(
        struct xfs_mount *mp,           /* file system mount structure */
        struct xfs_trans *tp,           /* transaction pointer */
        xfs_ino_t       ino,            /* inode to locate */
-       xfs_fsblock_t   *bno,           /* output: block containing inode */
-       int             *len,           /* output: num blocks in cluster*/
-       int             *off,           /* output: index in block of inode */
+       struct xfs_imap *imap,          /* location map structure */
        uint            flags);         /* flags for inode btree lookup */
 
 /*
@@ -154,6 +151,24 @@ xfs_ialloc_pagi_init(
        struct xfs_trans *tp,           /* transaction pointer */
         xfs_agnumber_t  agno);         /* allocation group number */
 
-#endif /* __KERNEL__ */
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
+               __int32_t fcnt, xfs_inofree_t free, int *stat);
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
+               __int32_t fcnt, xfs_inofree_t free, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
+                            __int32_t *fcnt, xfs_inofree_t *free, int *stat);
 
 #endif /* __XFS_IALLOC_H__ */
index 83502f3edef0d99477e6fb539418950ec9019c9b..99f2408e8d8e634ac446914a505982bba3d48deb 100644 (file)
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 
-STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
-               xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
 
-/*
- * Single level of the xfs_inobt_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int                             /* error */
-xfs_inobt_delrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level removing record from */
-       int                     *stat)  /* fail/done/go-on */
+STATIC int
+xfs_inobt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
 {
-       xfs_buf_t               *agbp;  /* buffer for a.g. inode header */
-       xfs_mount_t             *mp;    /* mount structure */
-       xfs_agi_t               *agi;   /* allocation group inode header */
-       xfs_inobt_block_t       *block; /* btree block record/key lives in */
-       xfs_agblock_t           bno;    /* btree block number */
-       xfs_buf_t               *bp;    /* buffer for block */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_inobt_key_t         key;    /* kp points here if block is level 0 */
-       xfs_inobt_key_t         *kp = NULL;     /* pointer to btree keys */
-       xfs_agblock_t           lbno;   /* left block's block number */
-       xfs_buf_t               *lbp;   /* left block's buffer pointer */
-       xfs_inobt_block_t       *left;  /* left btree block */
-       xfs_inobt_key_t         *lkp;   /* left block key pointer */
-       xfs_inobt_ptr_t         *lpp;   /* left block address pointer */
-       int                     lrecs = 0;      /* number of records in left block */
-       xfs_inobt_rec_t         *lrp;   /* left block record pointer */
-       xfs_inobt_ptr_t         *pp = NULL;     /* pointer to btree addresses */
-       int                     ptr;    /* index in btree block for this rec */
-       xfs_agblock_t           rbno;   /* right block's block number */
-       xfs_buf_t               *rbp;   /* right block's buffer pointer */
-       xfs_inobt_block_t       *right; /* right btree block */
-       xfs_inobt_key_t         *rkp;   /* right block key pointer */
-       xfs_inobt_rec_t         *rp;    /* pointer to btree records */
-       xfs_inobt_ptr_t         *rpp;   /* right block address pointer */
-       int                     rrecs = 0;      /* number of records in right block */
-       int                     numrecs;
-       xfs_inobt_rec_t         *rrp;   /* right block record pointer */
-       xfs_btree_cur_t         *tcur;  /* temporary btree cursor */
-
-       mp = cur->bc_mp;
-
-       /*
-        * Get the index of the entry being deleted, check for nothing there.
-        */
-       ptr = cur->bc_ptrs[level];
-       if (ptr == 0) {
-               *stat = 0;
-               return 0;
-       }
-
-       /*
-        * Get the buffer & block containing the record or key/ptr.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-#endif
-       /*
-        * Fail if we're off the end of the block.
-        */
+       return cur->bc_mp->m_inobt_mnr[level != 0];
+}
 
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (ptr > numrecs) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * It's a nonleaf.  Excise the key and ptr being deleted, by
-        * sliding the entries past them down one.
-        * Log the changed areas of the block.
-        */
-       if (level > 0) {
-               kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
-               pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = ptr; i < numrecs; i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level)))
-                               return error;
-               }
-#endif
-               if (ptr < numrecs) {
-                       memmove(&kp[ptr - 1], &kp[ptr],
-                               (numrecs - ptr) * sizeof(*kp));
-                       memmove(&pp[ptr - 1], &pp[ptr],
-                               (numrecs - ptr) * sizeof(*kp));
-                       xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
-                       xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
-               }
-       }
-       /*
-        * It's a leaf.  Excise the record being deleted, by sliding the
-        * entries past it down one.  Log the changed areas of the block.
-        */
-       else {
-               rp = XFS_INOBT_REC_ADDR(block, 1, cur);
-               if (ptr < numrecs) {
-                       memmove(&rp[ptr - 1], &rp[ptr],
-                               (numrecs - ptr) * sizeof(*rp));
-                       xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
-               }
-               /*
-                * If it's the first record in the block, we'll need a key
-                * structure to pass up to the next level (updkey).
-                */
-               if (ptr == 1) {
-                       key.ir_startino = rp->ir_startino;
-                       kp = &key;
-               }
-       }
-       /*
-        * Decrement and log the number of entries in the block.
-        */
-       numrecs--;
-       block->bb_numrecs = cpu_to_be16(numrecs);
-       xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-       /*
-        * Is this the root level?  If so, we're almost done.
-        */
-       if (level == cur->bc_nlevels - 1) {
-               /*
-                * If this is the root level,
-                * and there's only one entry left,
-                * and it's NOT the leaf level,
-                * then we can get rid of this level.
-                */
-               if (numrecs == 1 && level > 0) {
-                       agbp = cur->bc_private.a.agbp;
-                       agi = XFS_BUF_TO_AGI(agbp);
-                       /*
-                        * pp is still set to the first pointer in the block.
-                        * Make it the new root of the btree.
-                        */
-                       bno = be32_to_cpu(agi->agi_root);
-                       agi->agi_root = *pp;
-                       be32_add_cpu(&agi->agi_level, -1);
-                       /*
-                        * Free the block.
-                        */
-                       if ((error = xfs_free_extent(cur->bc_tp,
-                               XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
-                               return error;
-                       xfs_trans_binval(cur->bc_tp, bp);
-                       xfs_ialloc_log_agi(cur->bc_tp, agbp,
-                               XFS_AGI_ROOT | XFS_AGI_LEVEL);
-                       /*
-                        * Update the cursor so there's one fewer level.
-                        */
-                       cur->bc_bufs[level] = NULL;
-                       cur->bc_nlevels--;
-               } else if (level > 0 &&
-                          (error = xfs_inobt_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * If we deleted the leftmost entry in the block, update the
-        * key values above us in the tree.
-        */
-       if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
-               return error;
-       /*
-        * If the number of records remaining in the block is at least
-        * the minimum, we're done.
-        */
-       if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
-               if (level > 0 &&
-                   (error = xfs_inobt_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * Otherwise, we have to move some records around to keep the
-        * tree balanced.  Look at the left and right sibling blocks to
-        * see if we can re-balance by moving only one record.
-        */
-       rbno = be32_to_cpu(block->bb_rightsib);
-       lbno = be32_to_cpu(block->bb_leftsib);
-       bno = NULLAGBLOCK;
-       ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
-       /*
-        * Duplicate the cursor so our btree manipulations here won't
-        * disrupt the next level up.
-        */
-       if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-               return error;
-       /*
-        * If there's a right sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (rbno != NULLAGBLOCK) {
-               /*
-                * Move the temp cursor to the last entry in the next block.
-                * Actually any entry but the first would suffice.
-                */
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_inobt_increment(tcur, level, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Grab a pointer to the block.
-                */
-               rbp = tcur->bc_bufs[level];
-               right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-                       goto error0;
-#endif
-               /*
-                * Grab the current block number, for future use.
-                */
-               bno = be32_to_cpu(right->bb_leftsib);
-               /*
-                * If right block is full enough so that removing one entry
-                * won't make it too empty, and left-shifting an entry out
-                * of right to us works, we're done.
-                */
-               if (be16_to_cpu(right->bb_numrecs) - 1 >=
-                    XFS_INOBT_BLOCK_MINRECS(level, cur)) {
-                       if ((error = xfs_inobt_lshift(tcur, level, &i)))
-                               goto error0;
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_INOBT_BLOCK_MINRECS(level, cur));
-                               xfs_btree_del_cursor(tcur,
-                                                    XFS_BTREE_NOERROR);
-                               if (level > 0 &&
-                                   (error = xfs_inobt_decrement(cur, level,
-                                               &i)))
-                                       return error;
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference, and fix up the temp cursor to point
-                * to our block again (last record).
-                */
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if (lbno != NULLAGBLOCK) {
-                       xfs_btree_firstrec(tcur, level);
-                       if ((error = xfs_inobt_decrement(tcur, level, &i)))
-                               goto error0;
-               }
-       }
-       /*
-        * If there's a left sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (lbno != NULLAGBLOCK) {
-               /*
-                * Move the temp cursor to the first entry in the
-                * previous block.
-                */
-               xfs_btree_firstrec(tcur, level);
-               if ((error = xfs_inobt_decrement(tcur, level, &i)))
-                       goto error0;
-               xfs_btree_firstrec(tcur, level);
-               /*
-                * Grab a pointer to the block.
-                */
-               lbp = tcur->bc_bufs[level];
-               left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-                       goto error0;
-#endif
-               /*
-                * Grab the current block number, for future use.
-                */
-               bno = be32_to_cpu(left->bb_rightsib);
-               /*
-                * If left block is full enough so that removing one entry
-                * won't make it too empty, and right-shifting an entry out
-                * of left to us works, we're done.
-                */
-               if (be16_to_cpu(left->bb_numrecs) - 1 >=
-                    XFS_INOBT_BLOCK_MINRECS(level, cur)) {
-                       if ((error = xfs_inobt_rshift(tcur, level, &i)))
-                               goto error0;
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_INOBT_BLOCK_MINRECS(level, cur));
-                               xfs_btree_del_cursor(tcur,
-                                                    XFS_BTREE_NOERROR);
-                               if (level == 0)
-                                       cur->bc_ptrs[0]++;
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference.
-                */
-               lrecs = be16_to_cpu(left->bb_numrecs);
-       }
-       /*
-        * Delete the temp cursor, we're done with it.
-        */
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       /*
-        * If here, we need to do a join to keep the tree balanced.
-        */
-       ASSERT(bno != NULLAGBLOCK);
-       /*
-        * See if we can join with the left neighbor block.
-        */
-       if (lbno != NULLAGBLOCK &&
-           lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * Set "right" to be the starting block,
-                * "left" to be the left neighbor.
-                */
-               rbno = bno;
-               right = block;
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               rbp = bp;
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, lbno, 0, &lbp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-               lrecs = be16_to_cpu(left->bb_numrecs);
-               if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-                       return error;
-       }
-       /*
-        * If that won't work, see if we can join with the right neighbor block.
-        */
-       else if (rbno != NULLAGBLOCK &&
-                rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * Set "left" to be the starting block,
-                * "right" to be the right neighbor.
-                */
-               lbno = bno;
-               left = block;
-               lrecs = be16_to_cpu(left->bb_numrecs);
-               lbp = bp;
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, rbno, 0, &rbp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-                       return error;
-       }
-       /*
-        * Otherwise, we can't fix the imbalance.
-        * Just return.  This is probably a logic error, but it's not fatal.
-        */
-       else {
-               if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * We're now going to join "left" and "right" by moving all the stuff
-        * in "right" to "left" and deleting "right".
-        */
-       if (level > 0) {
-               /*
-                * It's a non-leaf.  Move keys and pointers.
-                */
-               lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
-               lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
-               rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
-               rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < rrecs; i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-                               return error;
-               }
-#endif
-               memcpy(lkp, rkp, rrecs * sizeof(*lkp));
-               memcpy(lpp, rpp, rrecs * sizeof(*lpp));
-               xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
-               xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       } else {
-               /*
-                * It's a leaf.  Move records.
-                */
-               lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
-               rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               memcpy(lrp, rrp, rrecs * sizeof(*lrp));
-               xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       }
-       /*
-        * If we joined with the left neighbor, set the buffer in the
-        * cursor to the left block, and fix up the index.
-        */
-       if (bp != lbp) {
-               xfs_btree_setbuf(cur, level, lbp);
-               cur->bc_ptrs[level] += lrecs;
-       }
-       /*
-        * If we joined with the right neighbor and there's a level above
-        * us, increment the cursor at that level.
-        */
-       else if (level + 1 < cur->bc_nlevels &&
-                (error = xfs_alloc_increment(cur, level + 1, &i)))
-               return error;
-       /*
-        * Fix up the number of records in the surviving block.
-        */
-       lrecs += rrecs;
-       left->bb_numrecs = cpu_to_be16(lrecs);
-       /*
-        * Fix up the right block pointer in the surviving block, and log it.
-        */
-       left->bb_rightsib = right->bb_rightsib;
-       xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       /*
-        * If there is a right sibling now, make it point to the
-        * remaining block.
-        */
-       if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
-               xfs_inobt_block_t       *rrblock;
-               xfs_buf_t               *rrbp;
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+       struct xfs_btree_cur    *cur)
+{
+       return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.a.agbp, cur->bc_private.a.agno);
+}
 
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
-                               &rrbp, XFS_INO_BTREE_REF)))
-                       return error;
-               rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
-               if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-                       return error;
-               rrblock->bb_leftsib = cpu_to_be32(lbno);
-               xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * Free the deleting block.
-        */
-       if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
-                                    cur->bc_private.a.agno, rbno), 1)))
-               return error;
-       xfs_trans_binval(cur->bc_tp, rbp);
-       /*
-        * Readjust the ptr at this level if it's not a leaf, since it's
-        * still pointing at the deletion point, which makes the cursor
-        * inconsistent.  If this makes the ptr 0, the caller fixes it up.
-        * We can't use decrement because it would change the next level up.
-        */
-       if (level > 0)
-               cur->bc_ptrs[level]--;
-       /*
-        * Return value means the next level up has something to do.
-        */
-       *stat = 2;
-       return 0;
+STATIC void
+xfs_inobt_set_root(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *nptr,
+       int                     inc)    /* level change */
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
 
-error0:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
+       agi->agi_root = nptr->s;
+       be32_add_cpu(&agi->agi_level, inc);
+       xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
 }
 
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int                             /* error */
-xfs_inobt_insrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to insert record at */
-       xfs_agblock_t           *bnop,  /* i/o: block number inserted */
-       xfs_inobt_rec_t         *recp,  /* i/o: record data inserted */
-       xfs_btree_cur_t         **curp, /* output: new cursor replacing cur */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_inobt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     length,
+       int                     *stat)
 {
-       xfs_inobt_block_t       *block; /* btree block record/key lives in */
-       xfs_buf_t               *bp;    /* buffer for block */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_inobt_key_t         key;    /* key value being inserted */
-       xfs_inobt_key_t         *kp=NULL;       /* pointer to btree keys */
-       xfs_agblock_t           nbno;   /* block number of allocated block */
-       xfs_btree_cur_t         *ncur;  /* new cursor to be used at next lvl */
-       xfs_inobt_key_t         nkey;   /* new key value, from split */
-       xfs_inobt_rec_t         nrec;   /* new record value, for caller */
-       int                     numrecs;
-       int                     optr;   /* old ptr value */
-       xfs_inobt_ptr_t         *pp;    /* pointer to btree addresses */
-       int                     ptr;    /* index in btree block for this rec */
-       xfs_inobt_rec_t         *rp=NULL;       /* pointer to btree records */
+       xfs_alloc_arg_t         args;           /* block allocation args */
+       int                     error;          /* error return value */
+       xfs_agblock_t           sbno = be32_to_cpu(start->s);
 
-       /*
-        * GCC doesn't understand the (arguably complex) control flow in
-        * this function and complains about uninitialized structure fields
-        * without this.
-        */
-       memset(&nrec, 0, sizeof(nrec));
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
 
-       /*
-        * If we made it to the root level, allocate a new root block
-        * and we're done.
-        */
-       if (level >= cur->bc_nlevels) {
-               error = xfs_inobt_newroot(cur, &i);
-               *bnop = NULLAGBLOCK;
-               *stat = i;
+       memset(&args, 0, sizeof(args));
+       args.tp = cur->bc_tp;
+       args.mp = cur->bc_mp;
+       args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+       args.minlen = 1;
+       args.maxlen = 1;
+       args.prod = 1;
+       args.type = XFS_ALLOCTYPE_NEAR_BNO;
+
+       error = xfs_alloc_vextent(&args);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
                return error;
        }
-       /*
-        * Make a key out of the record data to be inserted, and save it.
-        */
-       key.ir_startino = recp->ir_startino;
-       optr = ptr = cur->bc_ptrs[level];
-       /*
-        * If we're off the left edge, return failure.
-        */
-       if (ptr == 0) {
+       if (args.fsbno == NULLFSBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
                *stat = 0;
                return 0;
        }
-       /*
-        * Get pointers to the btree buffer and block.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-       numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-       /*
-        * Check that the new entry is being inserted in the right place.
-        */
-       if (ptr <= numrecs) {
-               if (level == 0) {
-                       rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
-                       xfs_btree_check_rec(cur->bc_btnum, recp, rp);
-               } else {
-                       kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
-                       xfs_btree_check_key(cur->bc_btnum, &key, kp);
-               }
-       }
-#endif
-       nbno = NULLAGBLOCK;
-       ncur = NULL;
-       /*
-        * If the block is full, we can't insert the new entry until we
-        * make the block un-full.
-        */
-       if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * First, try shifting an entry to the right neighbor.
-                */
-               if ((error = xfs_inobt_rshift(cur, level, &i)))
-                       return error;
-               if (i) {
-                       /* nothing */
-               }
-               /*
-                * Next, try shifting an entry to the left neighbor.
-                */
-               else {
-                       if ((error = xfs_inobt_lshift(cur, level, &i)))
-                               return error;
-                       if (i) {
-                               optr = ptr = cur->bc_ptrs[level];
-                       } else {
-                               /*
-                                * Next, try splitting the current block
-                                * in half. If this works we have to
-                                * re-set our variables because
-                                * we could be in a different block now.
-                                */
-                               if ((error = xfs_inobt_split(cur, level, &nbno,
-                                               &nkey, &ncur, &i)))
-                                       return error;
-                               if (i) {
-                                       bp = cur->bc_bufs[level];
-                                       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-                                       if ((error = xfs_btree_check_sblock(cur,
-                                                       block, level, bp)))
-                                               return error;
-#endif
-                                       ptr = cur->bc_ptrs[level];
-                                       nrec.ir_startino = nkey.ir_startino;
-                               } else {
-                                       /*
-                                        * Otherwise the insert fails.
-                                        */
-                                       *stat = 0;
-                                       return 0;
-                               }
-                       }
-               }
-       }
-       /*
-        * At this point we know there's room for our new entry in the block
-        * we're pointing at.
-        */
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (level > 0) {
-               /*
-                * It's a non-leaf entry.  Make a hole for the new data
-                * in the key and ptr regions of the block.
-                */
-               kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
-               pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = numrecs; i >= ptr; i--) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
-                               return error;
-               }
-#endif
-               memmove(&kp[ptr], &kp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*kp));
-               memmove(&pp[ptr], &pp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*pp));
-               /*
-                * Now stuff the new data in, bump numrecs and log the new data.
-                */
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
-                       return error;
-#endif
-               kp[ptr - 1] = key;
-               pp[ptr - 1] = cpu_to_be32(*bnop);
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_inobt_log_keys(cur, bp, ptr, numrecs);
-               xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
-       } else {
-               /*
-                * It's a leaf entry.  Make a hole for the new record.
-                */
-               rp = XFS_INOBT_REC_ADDR(block, 1, cur);
-               memmove(&rp[ptr], &rp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*rp));
-               /*
-                * Now stuff the new record in, bump numrecs
-                * and log the new data.
-                */
-               rp[ptr - 1] = *recp;
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_inobt_log_recs(cur, bp, ptr, numrecs);
-       }
-       /*
-        * Log the new number of records in the btree header.
-        */
-       xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-#ifdef DEBUG
-       /*
-        * Check that the key/record is in the right place, now.
-        */
-       if (ptr < numrecs) {
-               if (level == 0)
-                       xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
-                               rp + ptr);
-               else
-                       xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
-                               kp + ptr);
-       }
-#endif
-       /*
-        * If we inserted at the start of a block, update the parents' keys.
-        */
-       if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
-               return error;
-       /*
-        * Return the new block number, if any.
-        * If there is one, give back a record value and a cursor too.
-        */
-       *bnop = nbno;
-       if (nbno != NULLAGBLOCK) {
-               *recp = nrec;
-               *curp = ncur;
-       }
+       ASSERT(args.len == 1);
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
+       new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
        *stat = 1;
        return 0;
 }
 
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_inobt_log_block(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     fields) /* mask of fields: XFS_BB_... */
+STATIC int
+xfs_inobt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
 {
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       static const short      offsets[] = {   /* table of offsets */
-               offsetof(xfs_inobt_block_t, bb_magic),
-               offsetof(xfs_inobt_block_t, bb_level),
-               offsetof(xfs_inobt_block_t, bb_numrecs),
-               offsetof(xfs_inobt_block_t, bb_leftsib),
-               offsetof(xfs_inobt_block_t, bb_rightsib),
-               sizeof(xfs_inobt_block_t)
-       };
+       xfs_fsblock_t           fsbno;
+       int                     error;
 
-       xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
-       xfs_trans_log_buf(tp, bp, first, last);
+       fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+       error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+       if (error)
+               return error;
+
+       xfs_trans_binval(cur->bc_tp, bp);
+       return error;
 }
 
-/*
- * Log keys from a btree block (nonleaf).
- */
-STATIC void
-xfs_inobt_log_keys(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     kfirst, /* index of first key to log */
-       int                     klast)  /* index of last key to log */
+STATIC int
+xfs_inobt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
 {
-       xfs_inobt_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       xfs_inobt_key_t         *kp;    /* key pointer in btree block */
-       int                     last;   /* last byte offset logged */
-
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-       kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+       return cur->bc_mp->m_inobt_mxr[level != 0];
 }
 
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
 STATIC void
-xfs_inobt_log_ptrs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     pfirst, /* index of first pointer to log */
-       int                     plast)  /* index of last pointer to log */
+xfs_inobt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
 {
-       xfs_inobt_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       xfs_inobt_ptr_t         *pp;    /* block-pointer pointer in btree blk */
-
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-       pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+       key->inobt.ir_startino = rec->inobt.ir_startino;
 }
 
-/*
- * Log records from a btree block (leaf).
- */
 STATIC void
-xfs_inobt_log_recs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     rfirst, /* index of first record to log */
-       int                     rlast)  /* index of last record to log */
+xfs_inobt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
 {
-       xfs_inobt_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       xfs_inobt_rec_t         *rp;    /* record pointer for btree block */
+       rec->inobt.ir_startino = key->inobt.ir_startino;
+}
 
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-       rp = XFS_INOBT_REC_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+STATIC void
+xfs_inobt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+       rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+       rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
 }
 
 /*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
+ * intial value of ptr for lookup
  */
-STATIC int                             /* error */
-xfs_inobt_lookup(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_lookup_t            dir,    /* <=, ==, or >= */
-       int                     *stat)  /* success/failure */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
 {
-       xfs_agblock_t           agbno;  /* a.g. relative btree block number */
-       xfs_agnumber_t          agno;   /* allocation group number */
-       xfs_inobt_block_t       *block=NULL;    /* current btree block */
-       __int64_t               diff;   /* difference for the current key */
-       int                     error;  /* error return value */
-       int                     keyno=0;        /* current key number */
-       int                     level;  /* level in the btree */
-       xfs_mount_t             *mp;    /* file system mount point */
-
-       /*
-        * Get the allocation group header, and the root block number.
-        */
-       mp = cur->bc_mp;
-       {
-               xfs_agi_t       *agi;   /* a.g. inode header */
-
-               agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
-               agno = be32_to_cpu(agi->agi_seqno);
-               agbno = be32_to_cpu(agi->agi_root);
-       }
-       /*
-        * Iterate over each level in the btree, starting at the root.
-        * For each level above the leaves, find the key we need, based
-        * on the lookup record, then follow the corresponding block
-        * pointer down to the next level.
-        */
-       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-               xfs_buf_t       *bp;    /* buffer pointer for btree block */
-               xfs_daddr_t     d;      /* disk address of btree block */
-
-               /*
-                * Get the disk address we're looking for.
-                */
-               d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-               /*
-                * If the old buffer at this level is for a different block,
-                * throw it away, otherwise just use it.
-                */
-               bp = cur->bc_bufs[level];
-               if (bp && XFS_BUF_ADDR(bp) != d)
-                       bp = NULL;
-               if (!bp) {
-                       /*
-                        * Need to get a new buffer.  Read it, then
-                        * set it in the cursor, releasing the old one.
-                        */
-                       if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                                       agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
-                               return error;
-                       xfs_btree_setbuf(cur, level, bp);
-                       /*
-                        * Point to the btree block, now that we have the buffer
-                        */
-                       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-                       if ((error = xfs_btree_check_sblock(cur, block, level,
-                                       bp)))
-                               return error;
-               } else
-                       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-               /*
-                * If we already had a key match at a higher level, we know
-                * we need to use the first entry in this block.
-                */
-               if (diff == 0)
-                       keyno = 1;
-               /*
-                * Otherwise we need to search this block.  Do a binary search.
-                */
-               else {
-                       int             high;   /* high entry number */
-                       xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
-                       xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
-                       int             low;    /* low entry number */
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
 
-                       /*
-                        * Get a pointer to keys or records.
-                        */
-                       if (level > 0)
-                               kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
-                       else
-                               krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
-                       /*
-                        * Set low and high entry numbers, 1-based.
-                        */
-                       low = 1;
-                       if (!(high = be16_to_cpu(block->bb_numrecs))) {
-                               /*
-                                * If the block is empty, the tree must
-                                * be an empty leaf.
-                                */
-                               ASSERT(level == 0 && cur->bc_nlevels == 1);
-                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-                               *stat = 0;
-                               return 0;
-                       }
-                       /*
-                        * Binary search the block.
-                        */
-                       while (low <= high) {
-                               xfs_agino_t     startino;       /* key value */
-
-                               /*
-                                * keyno is average of low and high.
-                                */
-                               keyno = (low + high) >> 1;
-                               /*
-                                * Get startino.
-                                */
-                               if (level > 0) {
-                                       xfs_inobt_key_t *kkp;
-
-                                       kkp = kkbase + keyno - 1;
-                                       startino = be32_to_cpu(kkp->ir_startino);
-                               } else {
-                                       xfs_inobt_rec_t *krp;
-
-                                       krp = krbase + keyno - 1;
-                                       startino = be32_to_cpu(krp->ir_startino);
-                               }
-                               /*
-                                * Compute difference to get next direction.
-                                */
-                               diff = (__int64_t)
-                                       startino - cur->bc_rec.i.ir_startino;
-                               /*
-                                * Less than, move right.
-                                */
-                               if (diff < 0)
-                                       low = keyno + 1;
-                               /*
-                                * Greater than, move left.
-                                */
-                               else if (diff > 0)
-                                       high = keyno - 1;
-                               /*
-                                * Equal, we're done.
-                                */
-                               else
-                                       break;
-                       }
-               }
-               /*
-                * If there are more levels, set up for the next level
-                * by getting the block number and filling in the cursor.
-                */
-               if (level > 0) {
-                       /*
-                        * If we moved left, need the previous key number,
-                        * unless there isn't one.
-                        */
-                       if (diff > 0 && --keyno < 1)
-                               keyno = 1;
-                       agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
-                       if ((error = xfs_btree_check_sptr(cur, agbno, level)))
-                               return error;
-#endif
-                       cur->bc_ptrs[level] = keyno;
-               }
-       }
-       /*
-        * Done with the search.
-        * See if we need to adjust the results.
-        */
-       if (dir != XFS_LOOKUP_LE && diff < 0) {
-               keyno++;
-               /*
-                * If ge search and we went off the end of the block, but it's
-                * not the last block, we're in the wrong block.
-                */
-               if (dir == XFS_LOOKUP_GE &&
-                   keyno > be16_to_cpu(block->bb_numrecs) &&
-                   be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
-                       int     i;
+       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
 
-                       cur->bc_ptrs[0] = keyno;
-                       if ((error = xfs_inobt_increment(cur, 0, &i)))
-                               return error;
-                       ASSERT(i == 1);
-                       *stat = 1;
-                       return 0;
-               }
-       }
-       else if (dir == XFS_LOOKUP_LE && diff > 0)
-               keyno--;
-       cur->bc_ptrs[0] = keyno;
-       /*
-        * Return if we succeeded or not.
-        */
-       if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
-               *stat = 0;
-       else
-               *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
-       return 0;
+       ptr->s = agi->agi_root;
 }
 
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                             /* error */
-xfs_inobt_lshift(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to shift record on */
-       int                     *stat)  /* success/failure */
+STATIC __int64_t
+xfs_inobt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
 {
-       int                     error;  /* error return value */
-#ifdef DEBUG
-       int                     i;      /* loop index */
-#endif
-       xfs_inobt_key_t         key;    /* key value for leaf level upward */
-       xfs_buf_t               *lbp;   /* buffer for left neighbor block */
-       xfs_inobt_block_t       *left;  /* left neighbor btree block */
-       xfs_inobt_key_t         *lkp=NULL;      /* key pointer for left block */
-       xfs_inobt_ptr_t         *lpp;   /* address pointer for left block */
-       xfs_inobt_rec_t         *lrp=NULL;      /* record pointer for left block */
-       int                     nrec;   /* new number of left block entries */
-       xfs_buf_t               *rbp;   /* buffer for right (current) block */
-       xfs_inobt_block_t       *right; /* right (current) btree block */
-       xfs_inobt_key_t         *rkp=NULL;      /* key pointer for right block */
-       xfs_inobt_ptr_t         *rpp=NULL;      /* address pointer for right block */
-       xfs_inobt_rec_t         *rrp=NULL;      /* record pointer for right block */
-
-       /*
-        * Set up variables for this block as "right".
-        */
-       rbp = cur->bc_bufs[level];
-       right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-               return error;
-#endif
-       /*
-        * If we've got no left sibling then we can't shift an entry left.
-        */
-       if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] <= 1) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Set up the left neighbor as "left".
-        */
-       if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
-                       0, &lbp, XFS_INO_BTREE_REF)))
-               return error;
-       left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-       /*
-        * If it's full, it can't take another entry.
-        */
-       if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               *stat = 0;
-               return 0;
-       }
-       nrec = be16_to_cpu(left->bb_numrecs) + 1;
-       /*
-        * If non-leaf, copy a key and a ptr to the left block.
-        */
-       if (level > 0) {
-               lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
-               rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
-               *lkp = *rkp;
-               xfs_inobt_log_keys(cur, lbp, nrec, nrec);
-               lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
-               rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
-                       return error;
-#endif
-               *lpp = *rpp;
-               xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
-       }
-       /*
-        * If leaf, copy a record to the left block.
-        */
-       else {
-               lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
-               rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               *lrp = *rrp;
-               xfs_inobt_log_recs(cur, lbp, nrec, nrec);
-       }
-       /*
-        * Bump and log left's numrecs, decrement and log right's numrecs.
-        */
-       be16_add_cpu(&left->bb_numrecs, 1);
-       xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-#ifdef DEBUG
-       if (level > 0)
-               xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
-       else
-               xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
-#endif
-       be16_add_cpu(&right->bb_numrecs, -1);
-       xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-       /*
-        * Slide the contents of right down one entry.
-        */
-       if (level > 0) {
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
-                                       level)))
-                               return error;
-               }
-#endif
-               memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-       } else {
-               memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               key.ir_startino = rrp->ir_startino;
-               rkp = &key;
-       }
-       /*
-        * Update the parent key values of right.
-        */
-       if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
-               return error;
-       /*
-        * Slide the cursor value left one.
-        */
-       cur->bc_ptrs[level]--;
-       *stat = 1;
-       return 0;
+       return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+                         cur->bc_rec.i.ir_startino;
 }
 
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int                             /* error */
-xfs_inobt_newroot(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_inobt_kill_root(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     level,
+       union xfs_btree_ptr     *newroot)
 {
-       xfs_agi_t               *agi;   /* a.g. inode header */
-       xfs_alloc_arg_t         args;   /* allocation argument structure */
-       xfs_inobt_block_t       *block; /* one half of the old root block */
-       xfs_buf_t               *bp;    /* buffer containing block */
-       int                     error;  /* error return value */
-       xfs_inobt_key_t         *kp;    /* btree key pointer */
-       xfs_agblock_t           lbno;   /* left block number */
-       xfs_buf_t               *lbp;   /* left buffer pointer */
-       xfs_inobt_block_t       *left;  /* left btree block */
-       xfs_buf_t               *nbp;   /* new (root) buffer */
-       xfs_inobt_block_t       *new;   /* new (root) btree block */
-       int                     nptr;   /* new value for key index, 1 or 2 */
-       xfs_inobt_ptr_t         *pp;    /* btree address pointer */
-       xfs_agblock_t           rbno;   /* right block number */
-       xfs_buf_t               *rbp;   /* right buffer pointer */
-       xfs_inobt_block_t       *right; /* right btree block */
-       xfs_inobt_rec_t         *rp;    /* btree record pointer */
+       int                     error;
 
-       ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp));
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, killroot);
 
        /*
-        * Get a block & a buffer.
+        * Update the root pointer, decreasing the level by 1 and then
+        * free the old root.
         */
-       agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
-       args.tp = cur->bc_tp;
-       args.mp = cur->bc_mp;
-       args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno,
-               be32_to_cpu(agi->agi_root));
-       args.mod = args.minleft = args.alignment = args.total = args.wasdel =
-               args.isfl = args.userdata = args.minalignslop = 0;
-       args.minlen = args.maxlen = args.prod = 1;
-       args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       if ((error = xfs_alloc_vextent(&args)))
+       xfs_inobt_set_root(cur, newroot, -1);
+       error = xfs_inobt_free_block(cur, bp);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
                return error;
-       /*
-        * None available, we fail.
-        */
-       if (args.fsbno == NULLFSBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
-       new = XFS_BUF_TO_INOBT_BLOCK(nbp);
-       /*
-        * Set the root data in the a.g. inode structure.
-        */
-       agi->agi_root = cpu_to_be32(args.agbno);
-       be32_add_cpu(&agi->agi_level, 1);
-       xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
-               XFS_AGI_ROOT | XFS_AGI_LEVEL);
-       /*
-        * At the previous root level there are now two blocks: the old
-        * root, and the new block generated when it was split.
-        * We don't know which one the cursor is pointing at, so we
-        * set up variables "left" and "right" for each case.
-        */
-       bp = cur->bc_bufs[cur->bc_nlevels - 1];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
-               return error;
-#endif
-       if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
-               /*
-                * Our block is left, pick up the right block.
-                */
-               lbp = bp;
-               lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
-               left = block;
-               rbno = be32_to_cpu(left->bb_rightsib);
-               if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
-                               rbno, 0, &rbp, XFS_INO_BTREE_REF)))
-                       return error;
-               bp = rbp;
-               right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-               if ((error = xfs_btree_check_sblock(cur, right,
-                               cur->bc_nlevels - 1, rbp)))
-                       return error;
-               nptr = 1;
-       } else {
-               /*
-                * Our block is right, pick up the left block.
-                */
-               rbp = bp;
-               rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
-               right = block;
-               lbno = be32_to_cpu(right->bb_leftsib);
-               if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
-                               lbno, 0, &lbp, XFS_INO_BTREE_REF)))
-                       return error;
-               bp = lbp;
-               left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-               if ((error = xfs_btree_check_sblock(cur, left,
-                               cur->bc_nlevels - 1, lbp)))
-                       return error;
-               nptr = 2;
-       }
-       /*
-        * Fill in the new block's btree header and log it.
-        */
-       new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-       new->bb_level = cpu_to_be16(cur->bc_nlevels);
-       new->bb_numrecs = cpu_to_be16(2);
-       new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-       new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-       xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
-       ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
-       /*
-        * Fill in the key data in the new root.
-        */
-       kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
-       if (be16_to_cpu(left->bb_level) > 0) {
-               kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
-               kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
-       } else {
-               rp = XFS_INOBT_REC_ADDR(left, 1, cur);
-               kp[0].ir_startino = rp->ir_startino;
-               rp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               kp[1].ir_startino = rp->ir_startino;
        }
-       xfs_inobt_log_keys(cur, nbp, 1, 2);
-       /*
-        * Fill in the pointer data in the new root.
-        */
-       pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
-       pp[0] = cpu_to_be32(lbno);
-       pp[1] = cpu_to_be32(rbno);
-       xfs_inobt_log_ptrs(cur, nbp, 1, 2);
-       /*
-        * Fix up the cursor.
-        */
-       xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
-       cur->bc_ptrs[cur->bc_nlevels] = nptr;
-       cur->bc_nlevels++;
-       *stat = 1;
-       return 0;
-}
 
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                             /* error */
-xfs_inobt_rshift(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to shift record on */
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_inobt_key_t         key;    /* key value for leaf level upward */
-       xfs_buf_t               *lbp;   /* buffer for left (current) block */
-       xfs_inobt_block_t       *left;  /* left (current) btree block */
-       xfs_inobt_key_t         *lkp;   /* key pointer for left block */
-       xfs_inobt_ptr_t         *lpp;   /* address pointer for left block */
-       xfs_inobt_rec_t         *lrp;   /* record pointer for left block */
-       xfs_buf_t               *rbp;   /* buffer for right neighbor block */
-       xfs_inobt_block_t       *right; /* right neighbor btree block */
-       xfs_inobt_key_t         *rkp;   /* key pointer for right block */
-       xfs_inobt_ptr_t         *rpp;   /* address pointer for right block */
-       xfs_inobt_rec_t         *rrp=NULL;      /* record pointer for right block */
-       xfs_btree_cur_t         *tcur;  /* temporary cursor */
+       XFS_BTREE_STATS_INC(cur, free);
 
-       /*
-        * Set up variables for this block as "left".
-        */
-       lbp = cur->bc_bufs[level];
-       left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-#endif
-       /*
-        * If we've got no right sibling then we can't shift an entry right.
-        */
-       if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Set up the right neighbor as "right".
-        */
-       if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
-                       0, &rbp, XFS_INO_BTREE_REF)))
-               return error;
-       right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-       if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-               return error;
-       /*
-        * If it's full, it can't take another entry.
-        */
-       if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Make a hole at the start of the right neighbor block, then
-        * copy the last left block entry to the hole.
-        */
-       if (level > 0) {
-               lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
-               rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-                               return error;
-               }
-#endif
-               memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
-                       return error;
-#endif
-               *rkp = *lkp;
-               *rpp = *lpp;
-               xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-       } else {
-               lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               *rrp = *lrp;
-               xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               key.ir_startino = rrp->ir_startino;
-               rkp = &key;
-       }
-       /*
-        * Decrement and log left's numrecs, bump and log right's numrecs.
-        */
-       be16_add_cpu(&left->bb_numrecs, -1);
-       xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-       be16_add_cpu(&right->bb_numrecs, 1);
-#ifdef DEBUG
-       if (level > 0)
-               xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
-       else
-               xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
-#endif
-       xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-       /*
-        * Using a temporary cursor, update the parent key values of the
-        * block on the right.
-        */
-       if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-               return error;
-       xfs_btree_lastrec(tcur, level);
-       if ((error = xfs_inobt_increment(tcur, level, &i)) ||
-           (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
-               xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-               return error;
-       }
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       *stat = 1;
+       cur->bc_bufs[level] = NULL;
+       cur->bc_nlevels--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
        return 0;
 }
 
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int                             /* error */
-xfs_inobt_split(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to split */
-       xfs_agblock_t           *bnop,  /* output: block number allocated */
-       xfs_inobt_key_t         *keyp,  /* output: first key of new block */
-       xfs_btree_cur_t         **curp, /* output: new cursor */
-       int                     *stat)  /* success/failure */
-{
-       xfs_alloc_arg_t         args;   /* allocation argument structure */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index/record number */
-       xfs_agblock_t           lbno;   /* left (current) block number */
-       xfs_buf_t               *lbp;   /* buffer for left block */
-       xfs_inobt_block_t       *left;  /* left (current) btree block */
-       xfs_inobt_key_t         *lkp;   /* left btree key pointer */
-       xfs_inobt_ptr_t         *lpp;   /* left btree address pointer */
-       xfs_inobt_rec_t         *lrp;   /* left btree record pointer */
-       xfs_buf_t               *rbp;   /* buffer for right block */
-       xfs_inobt_block_t       *right; /* right (new) btree block */
-       xfs_inobt_key_t         *rkp;   /* right btree key pointer */
-       xfs_inobt_ptr_t         *rpp;   /* right btree address pointer */
-       xfs_inobt_rec_t         *rrp;   /* right btree record pointer */
-
-       /*
-        * Set up left block (current one).
-        */
-       lbp = cur->bc_bufs[level];
-       args.tp = cur->bc_tp;
-       args.mp = cur->bc_mp;
-       lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
-       /*
-        * Allocate the new block.
-        * If we can't do it, we're toast.  Give up.
-        */
-       args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
-       args.mod = args.minleft = args.alignment = args.total = args.wasdel =
-               args.isfl = args.userdata = args.minalignslop = 0;
-       args.minlen = args.maxlen = args.prod = 1;
-       args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       if ((error = xfs_alloc_vextent(&args)))
-               return error;
-       if (args.fsbno == NULLFSBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
-       /*
-        * Set up the new block as "right".
-        */
-       right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-       /*
-        * "Left" is the current (according to the cursor) block.
-        */
-       left = XFS_BUF_TO_INOBT_BLOCK(lbp);
 #ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-#endif
-       /*
-        * Fill in the btree header for the new block.
-        */
-       right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-       right->bb_level = left->bb_level;
-       right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
-       /*
-        * Make sure that if there's an odd number of entries now, that
-        * each new block will have the same number of entries.
-        */
-       if ((be16_to_cpu(left->bb_numrecs) & 1) &&
-           cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-               be16_add_cpu(&right->bb_numrecs, 1);
-       i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
-       /*
-        * For non-leaf blocks, copy keys and addresses over to the new block.
-        */
-       if (level > 0) {
-               lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
-               lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
-               rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
-               rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
-                               return error;
-               }
-#endif
-               memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               *keyp = *rkp;
-       }
-       /*
-        * For leaf blocks, copy records over to the new block.
-        */
-       else {
-               lrp = XFS_INOBT_REC_ADDR(left, i, cur);
-               rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               keyp->ir_startino = rrp->ir_startino;
-       }
-       /*
-        * Find the left block number by looking in the buffer.
-        * Adjust numrecs, sibling pointers.
-        */
-       be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
-       right->bb_rightsib = left->bb_rightsib;
-       left->bb_rightsib = cpu_to_be32(args.agbno);
-       right->bb_leftsib = cpu_to_be32(lbno);
-       xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
-       xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       /*
-        * If there's a block to the new block's right, make that block
-        * point back to right instead of to left.
-        */
-       if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
-               xfs_inobt_block_t       *rrblock;       /* rr btree block */
-               xfs_buf_t               *rrbp;          /* buffer for rrblock */
-
-               if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
-                               be32_to_cpu(right->bb_rightsib), 0, &rrbp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
-               if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-                       return error;
-               rrblock->bb_leftsib = cpu_to_be32(args.agbno);
-               xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * If the cursor is really in the right block, move it there.
-        * If it's just pointing past the last entry in left, then we'll
-        * insert there, so don't change anything in that case.
-        */
-       if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
-               xfs_btree_setbuf(cur, level, rbp);
-               cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
-       }
-       /*
-        * If there are more levels, we'll need another cursor which refers
-        * the right block, no matter where this cursor was.
-        */
-       if (level + 1 < cur->bc_nlevels) {
-               if ((error = xfs_btree_dup_cursor(cur, curp)))
-                       return error;
-               (*curp)->bc_ptrs[level + 1]++;
-       }
-       *bnop = args.agbno;
-       *stat = 1;
-       return 0;
+STATIC int
+xfs_inobt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       return be32_to_cpu(k1->inobt.ir_startino) <
+               be32_to_cpu(k2->inobt.ir_startino);
 }
 
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int                             /* error */
-xfs_inobt_updkey(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_inobt_key_t         *keyp,  /* new key value to update to */
-       int                     level)  /* starting level for update */
+STATIC int
+xfs_inobt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
 {
-       int                     ptr;    /* index of key in block */
-
-       /*
-        * Go up the tree from this level toward the root.
-        * At each level, update the key value to the value input.
-        * Stop when we reach a level where the cursor isn't pointing
-        * at the first entry in the block.
-        */
-       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-               xfs_buf_t               *bp;    /* buffer for block */
-               xfs_inobt_block_t       *block; /* btree block */
-#ifdef DEBUG
-               int                     error;  /* error return value */
-#endif
-               xfs_inobt_key_t         *kp;    /* ptr to btree block keys */
-
-               bp = cur->bc_bufs[level];
-               block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-                       return error;
-#endif
-               ptr = cur->bc_ptrs[level];
-               kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
-               *kp = *keyp;
-               xfs_inobt_log_keys(cur, bp, ptr, ptr);
-       }
-       return 0;
+       return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+               be32_to_cpu(r2->inobt.ir_startino);
 }
+#endif /* DEBUG */
 
-/*
- * Externally visible routines.
- */
+#ifdef XFS_BTREE_TRACE
+ktrace_t       *xfs_inobt_trace_buf;
 
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                    /* error */
-xfs_inobt_decrement(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level in btree, 0 is leaf */
-       int                     *stat)  /* success/failure */
+STATIC void
+xfs_inobt_trace_enter(
+       struct xfs_btree_cur    *cur,
+       const char              *func,
+       char                    *s,
+       int                     type,
+       int                     line,
+       __psunsigned_t          a0,
+       __psunsigned_t          a1,
+       __psunsigned_t          a2,
+       __psunsigned_t          a3,
+       __psunsigned_t          a4,
+       __psunsigned_t          a5,
+       __psunsigned_t          a6,
+       __psunsigned_t          a7,
+       __psunsigned_t          a8,
+       __psunsigned_t          a9,
+       __psunsigned_t          a10)
 {
-       xfs_inobt_block_t       *block; /* btree block */
-       int                     error;
-       int                     lev;    /* btree level */
-
-       ASSERT(level < cur->bc_nlevels);
-       /*
-        * Read-ahead to the left at this level.
-        */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-       /*
-        * Decrement the ptr at this level.  If we're still in the block
-        * then we're done.
-        */
-       if (--cur->bc_ptrs[level] > 0) {
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * Get a pointer to the btree block.
-        */
-       block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level,
-                       cur->bc_bufs[level])))
-               return error;
-#endif
-       /*
-        * If we just went off the left edge of the tree, return failure.
-        */
-       if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * March up the tree decrementing pointers.
-        * Stop when we don't go off the left edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               if (--cur->bc_ptrs[lev] > 0)
-                       break;
-               /*
-                * Read-ahead the left block, we're going to read it
-                * in the next loop.
-                */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-       }
-       /*
-        * If we went off the root then we are seriously confused.
-        */
-       ASSERT(lev < cur->bc_nlevels);
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
-               xfs_agblock_t   agbno;  /* block number of btree block */
-               xfs_buf_t       *bp;    /* buffer containing btree block */
-
-               agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, agbno, 0, &bp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_INOBT_BLOCK(bp);
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-               cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
-       }
-       *stat = 1;
-       return 0;
+       ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type,
+               (void *)func, (void *)s, NULL, (void *)cur,
+               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+               (void *)a8, (void *)a9, (void *)a10);
 }
 
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int                                    /* error */
-xfs_inobt_delete(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_inobt_trace_cursor(
+       struct xfs_btree_cur    *cur,
+       __uint32_t              *s0,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
 {
-       int             error;
-       int             i;              /* result code */
-       int             level;          /* btree level */
-
-       /*
-        * Go up the tree, starting at leaf level.
-        * If 2 is returned then a join was done; go to the next level.
-        * Otherwise we are done.
-        */
-       for (level = 0, i = 2; i == 2; level++) {
-               if ((error = xfs_inobt_delrec(cur, level, &i)))
-                       return error;
-       }
-       if (i == 0) {
-               for (level = 1; level < cur->bc_nlevels; level++) {
-                       if (cur->bc_ptrs[level] == 0) {
-                               if ((error = xfs_inobt_decrement(cur, level, &i)))
-                                       return error;
-                               break;
-                       }
-               }
-       }
-       *stat = i;
-       return 0;
+       *s0 = cur->bc_private.a.agno;
+       *l0 = cur->bc_rec.i.ir_startino;
+       *l1 = cur->bc_rec.i.ir_free;
 }
 
-
-/*
- * Get the data from the pointed-to record.
- */
-int                                    /* error */
-xfs_inobt_get_rec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agino_t             *ino,   /* output: starting inode of chunk */
-       __int32_t               *fcnt,  /* output: number of free inodes */
-       xfs_inofree_t           *free,  /* output: free inode mask */
-       int                     *stat)  /* output: success/failure */
+STATIC void
+xfs_inobt_trace_key(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
 {
-       xfs_inobt_block_t       *block; /* btree block */
-       xfs_buf_t               *bp;    /* buffer containing btree block */
-#ifdef DEBUG
-       int                     error;  /* error return value */
-#endif
-       int                     ptr;    /* record number */
-       xfs_inobt_rec_t         *rec;   /* record data */
-
-       bp = cur->bc_bufs[0];
-       ptr = cur->bc_ptrs[0];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
-               return error;
-#endif
-       /*
-        * Off the right end or left end, return failure.
-        */
-       if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Point to the record and extract its data.
-        */
-       rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
-       *ino = be32_to_cpu(rec->ir_startino);
-       *fcnt = be32_to_cpu(rec->ir_freecount);
-       *free = be64_to_cpu(rec->ir_free);
-       *stat = 1;
-       return 0;
+       *l0 = be32_to_cpu(key->inobt.ir_startino);
+       *l1 = 0;
 }
 
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                    /* error */
-xfs_inobt_increment(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level in btree, 0 is leaf */
-       int                     *stat)  /* success/failure */
+STATIC void
+xfs_inobt_trace_record(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       __uint64_t              *l0,
+       __uint64_t              *l1,
+       __uint64_t              *l2)
 {
-       xfs_inobt_block_t       *block; /* btree block */
-       xfs_buf_t               *bp;    /* buffer containing btree block */
-       int                     error;  /* error return value */
-       int                     lev;    /* btree level */
+       *l0 = be32_to_cpu(rec->inobt.ir_startino);
+       *l1 = be32_to_cpu(rec->inobt.ir_freecount);
+       *l2 = be64_to_cpu(rec->inobt.ir_free);
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+       .rec_len                = sizeof(xfs_inobt_rec_t),
+       .key_len                = sizeof(xfs_inobt_key_t),
+
+       .dup_cursor             = xfs_inobt_dup_cursor,
+       .set_root               = xfs_inobt_set_root,
+       .kill_root              = xfs_inobt_kill_root,
+       .alloc_block            = xfs_inobt_alloc_block,
+       .free_block             = xfs_inobt_free_block,
+       .get_minrecs            = xfs_inobt_get_minrecs,
+       .get_maxrecs            = xfs_inobt_get_maxrecs,
+       .init_key_from_rec      = xfs_inobt_init_key_from_rec,
+       .init_rec_from_key      = xfs_inobt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_inobt_init_ptr_from_cur,
+       .key_diff               = xfs_inobt_key_diff,
 
-       ASSERT(level < cur->bc_nlevels);
-       /*
-        * Read-ahead to the right at this level.
-        */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-       /*
-        * Get a pointer to the btree block.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-#endif
-       /*
-        * Increment the ptr at this level.  If we're still in the block
-        * then we're done.
-        */
-       if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * If we just went off the right edge of the tree, return failure.
-        */
-       if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * March up the tree incrementing pointers.
-        * Stop when we don't go off the right edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               bp = cur->bc_bufs[lev];
-               block = XFS_BUF_TO_INOBT_BLOCK(bp);
 #ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
+       .keys_inorder           = xfs_inobt_keys_inorder,
+       .recs_inorder           = xfs_inobt_recs_inorder,
 #endif
-               if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
-                       break;
-               /*
-                * Read-ahead the right block, we're going to read it
-                * in the next loop.
-                */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
-       }
-       /*
-        * If we went off the root then we are seriously confused.
-        */
-       ASSERT(lev < cur->bc_nlevels);
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
-            lev > level; ) {
-               xfs_agblock_t   agbno;  /* block number of btree block */
 
-               agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, agbno, 0, &bp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_INOBT_BLOCK(bp);
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-               cur->bc_ptrs[lev] = 1;
-       }
-       *stat = 1;
-       return 0;
-}
+#ifdef XFS_BTREE_TRACE
+       .trace_enter            = xfs_inobt_trace_enter,
+       .trace_cursor           = xfs_inobt_trace_cursor,
+       .trace_key              = xfs_inobt_trace_key,
+       .trace_record           = xfs_inobt_trace_record,
+#endif
+};
 
 /*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
+ * Allocate a new inode btree cursor.
  */
-int                                    /* error */
-xfs_inobt_insert(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             *stat)          /* success/failure */
+struct xfs_btree_cur *                         /* new inode btree cursor */
+xfs_inobt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_buf          *agbp,          /* buffer for agi structure */
+       xfs_agnumber_t          agno)           /* allocation group number */
 {
-       int             error;          /* error return value */
-       int             i;              /* result value, 0 for failure */
-       int             level;          /* current level number in btree */
-       xfs_agblock_t   nbno;           /* new block number (split result) */
-       xfs_btree_cur_t *ncur;          /* new cursor (split result) */
-       xfs_inobt_rec_t nrec;           /* record being inserted this level */
-       xfs_btree_cur_t *pcur;          /* previous level's cursor */
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+       struct xfs_btree_cur    *cur;
 
-       level = 0;
-       nbno = NULLAGBLOCK;
-       nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
-       nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
-       nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
-       ncur = NULL;
-       pcur = cur;
-       /*
-        * Loop going up the tree, starting at the leaf level.
-        * Stop when we don't get a split block, that must mean that
-        * the insert is finished with this level.
-        */
-       do {
-               /*
-                * Insert nrec/nbno into this level of the tree.
-                * Note if we fail, nbno will be null.
-                */
-               if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
-                               &i))) {
-                       if (pcur != cur)
-                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-                       return error;
-               }
-               /*
-                * See if the cursor we just used is trash.
-                * Can't trash the caller's cursor, but otherwise we should
-                * if ncur is a new cursor or we're about to be done.
-                */
-               if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
-                       cur->bc_nlevels = pcur->bc_nlevels;
-                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-               }
-               /*
-                * If we got a new cursor, switch to it.
-                */
-               if (ncur) {
-                       pcur = ncur;
-                       ncur = NULL;
-               }
-       } while (nbno != NULLAGBLOCK);
-       *stat = i;
-       return 0;
-}
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
 
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-int                                    /* error */
-xfs_inobt_lookup_eq(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agino_t     ino,            /* starting inode of chunk */
-       __int32_t       fcnt,           /* free inode count */
-       xfs_inofree_t   free,           /* free inode mask */
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.i.ir_startino = ino;
-       cur->bc_rec.i.ir_freecount = fcnt;
-       cur->bc_rec.i.ir_free = free;
-       return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+       cur->bc_btnum = XFS_BTNUM_INO;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
 
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_inobt_lookup_ge(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agino_t     ino,            /* starting inode of chunk */
-       __int32_t       fcnt,           /* free inode count */
-       xfs_inofree_t   free,           /* free inode mask */
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.i.ir_startino = ino;
-       cur->bc_rec.i.ir_freecount = fcnt;
-       cur->bc_rec.i.ir_free = free;
-       return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
-}
+       cur->bc_ops = &xfs_inobt_ops;
 
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_inobt_lookup_le(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agino_t     ino,            /* starting inode of chunk */
-       __int32_t       fcnt,           /* free inode count */
-       xfs_inofree_t   free,           /* free inode mask */
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.i.ir_startino = ino;
-       cur->bc_rec.i.ir_freecount = fcnt;
-       cur->bc_rec.i.ir_free = free;
-       return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
+       cur->bc_private.a.agbp = agbp;
+       cur->bc_private.a.agno = agno;
+
+       return cur;
 }
 
 /*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Calculate number of records in an inobt btree block.
  */
-int                                    /* error */
-xfs_inobt_update(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agino_t             ino,    /* starting inode of chunk */
-       __int32_t               fcnt,   /* free inode count */
-       xfs_inofree_t           free)   /* free inode mask */
+int
+xfs_inobt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
 {
-       xfs_inobt_block_t       *block; /* btree block to update */
-       xfs_buf_t               *bp;    /* buffer containing btree block */
-       int                     error;  /* error return value */
-       int                     ptr;    /* current record number (updating) */
-       xfs_inobt_rec_t         *rp;    /* pointer to updated record */
+       blocklen -= XFS_INOBT_BLOCK_LEN(mp);
 
-       /*
-        * Pick up the current block.
-        */
-       bp = cur->bc_bufs[0];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
-               return error;
-#endif
-       /*
-        * Get the address of the rec to be updated.
-        */
-       ptr = cur->bc_ptrs[0];
-       rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
-       /*
-        * Fill in the new contents and log them.
-        */
-       rp->ir_startino = cpu_to_be32(ino);
-       rp->ir_freecount = cpu_to_be32(fcnt);
-       rp->ir_free = cpu_to_be64(free);
-       xfs_inobt_log_recs(cur, bp, ptr, ptr);
-       /*
-        * Updating first record in leaf. Pass new key value up to our parent.
-        */
-       if (ptr == 1) {
-               xfs_inobt_key_t key;    /* key containing [ino] */
-
-               key.ir_startino = cpu_to_be32(ino);
-               if ((error = xfs_inobt_updkey(cur, &key, 1)))
-                       return error;
-       }
-       return 0;
+       if (leaf)
+               return blocklen / sizeof(xfs_inobt_rec_t);
+       return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
 }
index 8efc4a5b8b9289f30d8d11c069021b6a606e592d..37e5dd01a5779a9da7cd836439d8a1cee0415b8b 100644 (file)
@@ -24,7 +24,6 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
@@ -70,11 +69,6 @@ typedef struct xfs_inobt_key {
 /* btree pointer type */
 typedef __be32 xfs_inobt_ptr_t;
 
-/* btree block header type */
-typedef        struct xfs_btree_sblock xfs_inobt_block_t;
-
-#define        XFS_BUF_TO_INOBT_BLOCK(bp)      ((xfs_inobt_block_t *)XFS_BUF_PTR(bp))
-
 /*
  * Bit manipulations for ir_free.
  */
@@ -84,14 +78,6 @@ typedef      struct xfs_btree_sblock xfs_inobt_block_t;
 #define        XFS_INOBT_SET_FREE(rp,i)        ((rp)->ir_free |= XFS_INOBT_MASK(i))
 #define        XFS_INOBT_CLR_FREE(rp,i)        ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
 
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define        XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0])
-#define        XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0])
-#define        XFS_INOBT_IS_LAST_REC(cur)      \
-       ((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs))
-
 /*
  * Maximum number of inode btree levels.
  */
@@ -104,75 +90,38 @@ typedef    struct xfs_btree_sblock xfs_inobt_block_t;
 #define        XFS_PREALLOC_BLOCKS(mp)         ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
 
 /*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_INOBT_REC_ADDR(bb,i,cur) \
-       (XFS_BTREE_REC_ADDR(xfs_inobt, bb, i))
-
-#define        XFS_INOBT_KEY_ADDR(bb,i,cur) \
-       (XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i))
-
-#define        XFS_INOBT_PTR_ADDR(bb,i,cur) \
-       (XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \
-                               i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
-                            __int32_t *fcnt, xfs_inofree_t *free, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino,
-                               __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
-                               __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
  */
-extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
-                               __int32_t fcnt, xfs_inofree_t free, int *stat);
+#define XFS_INOBT_BLOCK_LEN(mp)        XFS_BTREE_SBLOCK_LEN
 
 /*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino,
-                               __int32_t fcnt, xfs_inofree_t free);
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+       ((xfs_inobt_rec_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+       ((xfs_inobt_key_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_inobt_ptr_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_inobt_key_t) + \
+                ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
 #endif /* __XFS_IALLOC_BTREE_H__ */
index e229e9e001c291fa556c043cf79c9070743675e7..e2fb6210d4c58e7e9609d9e5c584210d5a4058c3 100644 (file)
 #include "xfs_ialloc.h"
 #include "xfs_quota.h"
 #include "xfs_utils.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_btree_trace.h"
+#include "xfs_dir2_trace.h"
+
 
 /*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, attach it to the provided
- * vnode.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and attach the provided vnode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *              for xfs_ilock() for a list of valid values.
- * bno -- the block number starting the buffer containing the inode,
- *       if known (as by bulkstat), else 0.
+ * Allocate and initialise an xfs_inode.
  */
-STATIC int
-xfs_iget_core(
-       struct inode    *inode,
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_ino_t       ino,
-       uint            flags,
-       uint            lock_flags,
-       xfs_inode_t     **ipp,
-       xfs_daddr_t     bno)
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino)
 {
-       struct inode    *old_inode;
-       xfs_inode_t     *ip;
-       xfs_inode_t     *iq;
-       int             error;
-       unsigned long   first_index, mask;
-       xfs_perag_t     *pag;
-       xfs_agino_t     agino;
+       struct xfs_inode        *ip;
 
-       /* the radix tree exists only in inode capable AGs */
-       if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
-               return EINVAL;
+       /*
+        * if this didn't occur in transactions, we could use
+        * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+        * code up to do this anyway.
+        */
+       ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+       if (!ip)
+               return NULL;
 
-       /* get the perag structure and ensure that it's inode capable */
-       pag = xfs_get_perag(mp, ino);
-       if (!pag->pagi_inodeok)
-               return EINVAL;
-       ASSERT(pag->pag_ici_init);
-       agino = XFS_INO_TO_AGINO(mp, ino);
+       ASSERT(atomic_read(&ip->i_iocount) == 0);
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!spin_is_locked(&ip->i_flags_lock));
+       ASSERT(completion_done(&ip->i_flush));
 
-again:
-       read_lock(&pag->pag_ici_lock);
-       ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+       /*
+        * initialise the VFS inode here to get failures
+        * out of the way early.
+        */
+       if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+               kmem_zone_free(xfs_inode_zone, ip);
+               return NULL;
+       }
+
+       /* initialise the xfs inode */
+       ip->i_ino = ino;
+       ip->i_mount = mp;
+       memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+       ip->i_afp = NULL;
+       memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+       ip->i_flags = 0;
+       ip->i_update_core = 0;
+       ip->i_update_size = 0;
+       ip->i_delayed_blks = 0;
+       memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+       ip->i_size = 0;
+       ip->i_new_size = 0;
+
+       /*
+        * Initialize inode's trace buffers.
+        */
+#ifdef XFS_INODE_TRACE
+       ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_BMAP_TRACE
+       ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_BTREE_TRACE
+       ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_RW_TRACE
+       ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_ILOCK_TRACE
+       ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_DIR2_TRACE
+       ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
+#endif
+
+       return ip;
+}
+
+/*
+ * Check the validity of the inode we just found it the cache
+ */
+static int
+xfs_iget_cache_hit(
+       struct xfs_perag        *pag,
+       struct xfs_inode        *ip,
+       int                     flags,
+       int                     lock_flags) __releases(pag->pag_ici_lock)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error = EAGAIN;
+
+       /*
+        * If INEW is set this inode is being set up
+        * If IRECLAIM is set this inode is being torn down
+        * Pause and try again.
+        */
+       if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+               XFS_STATS_INC(xs_ig_frecycle);
+               goto out_error;
+       }
+
+       /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
+       if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
 
-       if (ip != NULL) {
                /*
-                * If INEW is set this inode is being set up
-                * we need to pause and try again.
+                * If lookup is racing with unlink, then we should return an
+                * error immediately so we don't remove it from the reclaim
+                * list and potentially leak the inode.
                 */
-               if (xfs_iflags_test(ip, XFS_INEW)) {
-                       read_unlock(&pag->pag_ici_lock);
-                       delay(1);
-                       XFS_STATS_INC(xs_ig_frecycle);
-
-                       goto again;
+               if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+                       error = ENOENT;
+                       goto out_error;
                }
 
-               old_inode = ip->i_vnode;
-               if (old_inode == NULL) {
-                       /*
-                        * If IRECLAIM is set this inode is
-                        * on its way out of the system,
-                        * we need to pause and try again.
-                        */
-                       if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-                               read_unlock(&pag->pag_ici_lock);
-                               delay(1);
-                               XFS_STATS_INC(xs_ig_frecycle);
-
-                               goto again;
-                       }
-                       ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-
-                       /*
-                        * If lookup is racing with unlink, then we
-                        * should return an error immediately so we
-                        * don't remove it from the reclaim list and
-                        * potentially leak the inode.
-                        */
-                       if ((ip->i_d.di_mode == 0) &&
-                           !(flags & XFS_IGET_CREATE)) {
-                               read_unlock(&pag->pag_ici_lock);
-                               xfs_put_perag(mp, pag);
-                               return ENOENT;
-                       }
-
-                       xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
-
-                       XFS_STATS_INC(xs_ig_found);
-                       xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-                       read_unlock(&pag->pag_ici_lock);
-
-                       XFS_MOUNT_ILOCK(mp);
-                       list_del_init(&ip->i_reclaim);
-                       XFS_MOUNT_IUNLOCK(mp);
-
-                       goto finish_inode;
-
-               } else if (inode != old_inode) {
-                       /* The inode is being torn down, pause and
-                        * try again.
-                        */
-                       if (old_inode->i_state & (I_FREEING | I_CLEAR)) {
-                               read_unlock(&pag->pag_ici_lock);
-                               delay(1);
-                               XFS_STATS_INC(xs_ig_frecycle);
-
-                               goto again;
-                       }
-/* Chances are the other vnode (the one in the inode) is being torn
-* down right now, and we landed on top of it. Question is, what do
-* we do? Unhook the old inode and hook up the new one?
-*/
-                       cmn_err(CE_PANIC,
-               "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
-                                       old_inode, inode);
-               }
+               xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
                /*
-                * Inode cache hit
+                * We need to re-initialise the VFS inode as it has been
+                * 'freed' by the VFS. Do this here so we can deal with
+                * errors cleanly, then tag it so it can be set up correctly
+                * later.
                 */
-               read_unlock(&pag->pag_ici_lock);
-               XFS_STATS_INC(xs_ig_found);
-
-finish_inode:
-               if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-                       xfs_put_perag(mp, pag);
-                       return ENOENT;
+               if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+                       error = ENOMEM;
+                       goto out_error;
                }
 
-               if (lock_flags != 0)
-                       xfs_ilock(ip, lock_flags);
+               /*
+                * We must set the XFS_INEW flag before clearing the
+                * XFS_IRECLAIMABLE flag so that if a racing lookup does
+                * not find the XFS_IRECLAIMABLE above but has the igrab()
+                * below succeed we can safely check XFS_INEW to detect
+                * that this inode is still being initialised.
+                */
+               xfs_iflags_set(ip, XFS_INEW);
+               xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+
+               /* clear the radix tree reclaim flag as well. */
+               __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+       } else if (!igrab(VFS_I(ip))) {
+               /* If the VFS inode is being torn down, pause and try again. */
+               XFS_STATS_INC(xs_ig_frecycle);
+               goto out_error;
+       } else if (xfs_iflags_test(ip, XFS_INEW)) {
+               /*
+                * We are racing with another cache hit that is
+                * currently recycling this inode out of the XFS_IRECLAIMABLE
+                * state. Wait for the initialisation to complete before
+                * continuing.
+                */
+               wait_on_inode(VFS_I(ip));
+       }
 
-               xfs_iflags_clear(ip, XFS_ISTALE);
-               xfs_itrace_exit_tag(ip, "xfs_iget.found");
-               goto return_ip;
+       if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+               error = ENOENT;
+               iput(VFS_I(ip));
+               goto out_error;
        }
 
-       /*
-        * Inode cache miss
-        */
+       /* We've got a live one. */
        read_unlock(&pag->pag_ici_lock);
-       XFS_STATS_INC(xs_ig_missed);
 
-       /*
-        * Read the disk inode attributes into a new inode structure and get
-        * a new vnode for it. This should also initialize i_ino and i_mount.
-        */
-       error = xfs_iread(mp, tp, ino, &ip, bno,
-                         (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
-       if (error) {
-               xfs_put_perag(mp, pag);
-               return error;
-       }
+       if (lock_flags != 0)
+               xfs_ilock(ip, lock_flags);
 
-       xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
+       xfs_iflags_clear(ip, XFS_ISTALE);
+       xfs_itrace_exit_tag(ip, "xfs_iget.found");
+       XFS_STATS_INC(xs_ig_found);
+       return 0;
+
+out_error:
+       read_unlock(&pag->pag_ici_lock);
+       return error;
+}
 
 
-       mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-                    "xfsino", ip->i_ino);
-       mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-       init_waitqueue_head(&ip->i_ipin_wait);
-       atomic_set(&ip->i_pincount, 0);
+static int
+xfs_iget_cache_miss(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag,
+       xfs_trans_t             *tp,
+       xfs_ino_t               ino,
+       struct xfs_inode        **ipp,
+       xfs_daddr_t             bno,
+       int                     flags,
+       int                     lock_flags) __releases(pag->pag_ici_lock)
+{
+       struct xfs_inode        *ip;
+       int                     error;
+       unsigned long           first_index, mask;
+       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
 
-       /*
-        * Because we want to use a counting completion, complete
-        * the flush completion once to allow a single access to
-        * the flush completion without blocking.
-        */
-       init_completion(&ip->i_flush);
-       complete(&ip->i_flush);
+       ip = xfs_inode_alloc(mp, ino);
+       if (!ip)
+               return ENOMEM;
 
-       if (lock_flags)
-               xfs_ilock(ip, lock_flags);
+       error = xfs_iread(mp, tp, ip, bno, flags);
+       if (error)
+               goto out_destroy;
+
+       xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
        if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-               xfs_idestroy(ip);
-               xfs_put_perag(mp, pag);
-               return ENOENT;
+               error = ENOENT;
+               goto out_destroy;
        }
 
+       if (lock_flags)
+               xfs_ilock(ip, lock_flags);
+
        /*
         * Preload the radix tree so we can insert safely under the
-        * write spinlock.
+        * write spinlock. Note that we cannot sleep inside the preload
+        * region.
         */
        if (radix_tree_preload(GFP_KERNEL)) {
-               xfs_idestroy(ip);
-               delay(1);
-               goto again;
+               error = EAGAIN;
+               goto out_unlock;
        }
+
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = agino & mask;
        write_lock(&pag->pag_ici_lock);
-       /*
-        * insert the new inode
-        */
+
+       /* insert the new inode */
        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
        if (unlikely(error)) {
-               BUG_ON(error != -EEXIST);
-               write_unlock(&pag->pag_ici_lock);
-               radix_tree_preload_end();
-               xfs_idestroy(ip);
+               WARN_ON(error != -EEXIST);
                XFS_STATS_INC(xs_ig_dup);
-               goto again;
+               error = EAGAIN;
+               goto out_preload_end;
        }
 
-       /*
-        * These values _must_ be set before releasing the radix tree lock!
-        */
+       /* These values _must_ be set before releasing the radix tree lock! */
        ip->i_udquot = ip->i_gdquot = NULL;
        xfs_iflags_set(ip, XFS_INEW);
 
        write_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
-
-       /*
-        * Link ip to its mount and thread it on the mount's inode list.
-        */
-       XFS_MOUNT_ILOCK(mp);
-       if ((iq = mp->m_inodes)) {
-               ASSERT(iq->i_mprev->i_mnext == iq);
-               ip->i_mprev = iq->i_mprev;
-               iq->i_mprev->i_mnext = ip;
-               iq->i_mprev = ip;
-               ip->i_mnext = iq;
-       } else {
-               ip->i_mnext = ip;
-               ip->i_mprev = ip;
-       }
-       mp->m_inodes = ip;
-
-       XFS_MOUNT_IUNLOCK(mp);
-       xfs_put_perag(mp, pag);
-
- return_ip:
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
-
-       xfs_iflags_set(ip, XFS_IMODIFIED);
        *ipp = ip;
-
-       /*
-        * Set up the Linux with the Linux inode.
-        */
-       ip->i_vnode = inode;
-       inode->i_private = ip;
-
-       /*
-        * If we have a real type for an on-disk inode, we can set ops(&unlock)
-        * now.  If it's a new inode being created, xfs_ialloc will handle it.
-        */
-       if (ip->i_d.di_mode != 0)
-               xfs_setup_inode(ip);
        return 0;
-}
 
+out_preload_end:
+       write_unlock(&pag->pag_ici_lock);
+       radix_tree_preload_end();
+out_unlock:
+       if (lock_flags)
+               xfs_iunlock(ip, lock_flags);
+out_destroy:
+       xfs_destroy_inode(ip);
+       return error;
+}
 
 /*
- * The 'normal' internal xfs_iget, if needed it will
- * 'allocate', or 'get', the vnode.
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *              for xfs_ilock() for a list of valid values.
+ * bno -- the block number starting the buffer containing the inode,
+ *       if known (as by bulkstat), else 0.
  */
 int
 xfs_iget(
@@ -324,61 +326,64 @@ xfs_iget(
        xfs_inode_t     **ipp,
        xfs_daddr_t     bno)
 {
-       struct inode    *inode;
        xfs_inode_t     *ip;
        int             error;
+       xfs_perag_t     *pag;
+       xfs_agino_t     agino;
 
-       XFS_STATS_INC(xs_ig_attempts);
+       /* the radix tree exists only in inode capable AGs */
+       if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+               return EINVAL;
 
-retry:
-       inode = iget_locked(mp->m_super, ino);
-       if (!inode)
-               /* If we got no inode we are out of memory */
-               return ENOMEM;
+       /* get the perag structure and ensure that it's inode capable */
+       pag = xfs_get_perag(mp, ino);
+       if (!pag->pagi_inodeok)
+               return EINVAL;
+       ASSERT(pag->pag_ici_init);
+       agino = XFS_INO_TO_AGINO(mp, ino);
 
-       if (inode->i_state & I_NEW) {
-               XFS_STATS_INC(vn_active);
-               XFS_STATS_INC(vn_alloc);
-
-               error = xfs_iget_core(inode, mp, tp, ino, flags,
-                               lock_flags, ipp, bno);
-               if (error) {
-                       make_bad_inode(inode);
-                       if (inode->i_state & I_NEW)
-                               unlock_new_inode(inode);
-                       iput(inode);
-               }
-               return error;
+again:
+       error = 0;
+       read_lock(&pag->pag_ici_lock);
+       ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+       if (ip) {
+               error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+               if (error)
+                       goto out_error_or_again;
+       } else {
+               read_unlock(&pag->pag_ici_lock);
+               XFS_STATS_INC(xs_ig_missed);
+
+               error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
+                                                       flags, lock_flags);
+               if (error)
+                       goto out_error_or_again;
        }
+       xfs_put_perag(mp, pag);
 
+       *ipp = ip;
+
+       ASSERT(ip->i_df.if_ext_max ==
+              XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
        /*
-        * If the inode is not fully constructed due to
-        * filehandle mismatches wait for the inode to go
-        * away and try again.
-        *
-        * iget_locked will call __wait_on_freeing_inode
-        * to wait for the inode to go away.
+        * If we have a real type for an on-disk inode, we can set ops(&unlock)
+        * now.  If it's a new inode being created, xfs_ialloc will handle it.
         */
-       if (is_bad_inode(inode)) {
-               iput(inode);
-               delay(1);
-               goto retry;
-       }
+       if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+               xfs_setup_inode(ip);
+       return 0;
 
-       ip = XFS_I(inode);
-       if (!ip) {
-               iput(inode);
+out_error_or_again:
+       if (error == EAGAIN) {
                delay(1);
-               goto retry;
+               goto again;
        }
-
-       if (lock_flags != 0)
-               xfs_ilock(ip, lock_flags);
-       XFS_STATS_INC(xs_ig_found);
-       *ipp = ip;
-       return 0;
+       xfs_put_perag(mp, pag);
+       return error;
 }
 
+
 /*
  * Look for the inode corresponding to the given ino in the hash table.
  * If it is there and its i_transp pointer matches tp, return it.
@@ -444,99 +449,109 @@ xfs_iput_new(
        IRELE(ip);
 }
 
-
 /*
- * This routine embodies the part of the reclaim code that pulls
- * the inode from the inode hash table and the mount structure's
- * inode list.
- * This should only be called from xfs_reclaim().
+ * This is called free all the memory associated with an inode.
+ * It must free the inode itself and any buffers allocated for
+ * if_extents/if_data and if_broot.  It must also free the lock
+ * associated with the inode.
+ *
+ * Note: because we don't initialise everything on reallocation out
+ * of the zone, we must ensure we nullify everything correctly before
+ * freeing the structure.
  */
 void
-xfs_ireclaim(xfs_inode_t *ip)
+xfs_ireclaim(
+       struct xfs_inode        *ip)
 {
-       /*
-        * Remove from old hash list and mount list.
-        */
-       XFS_STATS_INC(xs_ig_reclaims);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_perag        *pag;
 
-       xfs_iextract(ip);
-
-       /*
-        * Here we do a spurious inode lock in order to coordinate with
-        * xfs_sync().  This is because xfs_sync() references the inodes
-        * in the mount list without taking references on the corresponding
-        * vnodes.  We make that OK here by ensuring that we wait until
-        * the inode is unlocked in xfs_sync() before we go ahead and
-        * free it.  We get both the regular lock and the io lock because
-        * the xfs_sync() code may need to drop the regular one but will
-        * still hold the io lock.
-        */
-       xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-       /*
-        * Release dquots (and their references) if any. An inode may escape
-        * xfs_inactive and get here via vn_alloc->vn_reclaim path.
-        */
-       XFS_QM_DQDETACH(ip->i_mount, ip);
-
-       /*
-        * Pull our behavior descriptor from the vnode chain.
-        */
-       if (ip->i_vnode) {
-               ip->i_vnode->i_private = NULL;
-               ip->i_vnode = NULL;
-       }
+       XFS_STATS_INC(xs_ig_reclaims);
 
        /*
-        * Free all memory associated with the inode.
+        * Remove the inode from the per-AG radix tree.  It doesn't matter
+        * if it was never added to it because radix_tree_delete can deal
+        * with that case just fine.
         */
-       xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       xfs_idestroy(ip);
-}
-
-/*
- * This routine removes an about-to-be-destroyed inode from
- * all of the lists in which it is located with the exception
- * of the behavior chain.
- */
-void
-xfs_iextract(
-       xfs_inode_t     *ip)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
-       xfs_inode_t     *iq;
-
+       pag = xfs_get_perag(mp, ip->i_ino);
        write_lock(&pag->pag_ici_lock);
        radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
        write_unlock(&pag->pag_ici_lock);
        xfs_put_perag(mp, pag);
 
        /*
-        * Remove from mount's inode list.
+        * Here we do an (almost) spurious inode lock in order to coordinate
+        * with inode cache radix tree lookups.  This is because the lookup
+        * can reference the inodes in the cache without taking references.
+        *
+        * We make that OK here by ensuring that we wait until the inode is
+        * unlocked after the lookup before we go ahead and free it.  We get
+        * both the ilock and the iolock because the code may need to drop the
+        * ilock one but will still hold the iolock.
         */
-       XFS_MOUNT_ILOCK(mp);
-       ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
-       iq = ip->i_mnext;
-       iq->i_mprev = ip->i_mprev;
-       ip->i_mprev->i_mnext = iq;
-
+       xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        /*
-        * Fix up the head pointer if it points to the inode being deleted.
+        * Release dquots (and their references) if any.
         */
-       if (mp->m_inodes == ip) {
-               if (ip == iq) {
-                       mp->m_inodes = NULL;
-               } else {
-                       mp->m_inodes = iq;
-               }
+       XFS_QM_DQDETACH(ip->i_mount, ip);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+       switch (ip->i_d.di_mode & S_IFMT) {
+       case S_IFREG:
+       case S_IFDIR:
+       case S_IFLNK:
+               xfs_idestroy_fork(ip, XFS_DATA_FORK);
+               break;
        }
 
-       /* Deal with the deleted inodes list */
-       list_del_init(&ip->i_reclaim);
+       if (ip->i_afp)
+               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
-       mp->m_ireclaims++;
-       XFS_MOUNT_IUNLOCK(mp);
+#ifdef XFS_INODE_TRACE
+       ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+       ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+       ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+       ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+       ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+       ktrace_free(ip->i_dir_trace);
+#endif
+       if (ip->i_itemp) {
+               /*
+                * Only if we are shutting down the fs will we see an
+                * inode still in the AIL. If it is there, we should remove
+                * it to prevent a use-after-free from occurring.
+                */
+               xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
+               struct xfs_ail  *ailp = lip->li_ailp;
+
+               ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+                                      XFS_FORCED_SHUTDOWN(ip->i_mount));
+               if (lip->li_flags & XFS_LI_IN_AIL) {
+                       spin_lock(&ailp->xa_lock);
+                       if (lip->li_flags & XFS_LI_IN_AIL)
+                               xfs_trans_ail_delete(ailp, lip);
+                       else
+                               spin_unlock(&ailp->xa_lock);
+               }
+               xfs_inode_item_destroy(ip);
+               ip->i_itemp = NULL;
+       }
+       /* asserts to verify all state is correct here */
+       ASSERT(atomic_read(&ip->i_iocount) == 0);
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!spin_is_locked(&ip->i_flags_lock));
+       ASSERT(completion_done(&ip->i_flush));
+       kmem_zone_free(xfs_inode_zone, ip);
 }
 
 /*
@@ -737,7 +752,7 @@ xfs_iunlock(
                 * it is in the AIL and anyone is waiting on it.  Don't do
                 * this if the caller has asked us not to.
                 */
-               xfs_trans_unlocked_item(ip->i_mount,
+               xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
                                        (xfs_log_item_t*)(ip->i_itemp));
        }
        xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
@@ -790,3 +805,51 @@ xfs_isilocked(
 }
 #endif
 
+#ifdef XFS_INODE_TRACE
+
+#define KTRACE_ENTER(ip, vk, s, line, ra)                      \
+       ktrace_enter((ip)->i_trace,                             \
+/*  0 */               (void *)(__psint_t)(vk),                \
+/*  1 */               (void *)(s),                            \
+/*  2 */               (void *)(__psint_t) line,               \
+/*  3 */               (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
+/*  4 */               (void *)(ra),                           \
+/*  5 */               NULL,                                   \
+/*  6 */               (void *)(__psint_t)current_cpu(),       \
+/*  7 */               (void *)(__psint_t)current_pid(),       \
+/*  8 */               (void *)__return_address,               \
+/*  9 */               NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+
+/*
+ * Vnode tracing code.
+ */
+void
+_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
+{
+       KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
+}
+
+void
+_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
+{
+       KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
+}
+
+void
+xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+       KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
+}
+
+void
+_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+       KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
+}
+
+void
+xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+       KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
+}
+#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
deleted file mode 100644 (file)
index d364500..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_IMAP_H__
-#define        __XFS_IMAP_H__
-
-/*
- * This is the structure passed to xfs_imap() to map
- * an inode number to its on disk location.
- */
-typedef struct xfs_imap {
-       xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
-       uint            im_len;         /* length in BBs of inode chunk */
-       xfs_agblock_t   im_agblkno;     /* logical block of inode chunk in ag */
-       ushort          im_ioffset;     /* inode offset in block in "inodes" */
-       ushort          im_boffset;     /* inode offset in block in bytes */
-} xfs_imap_t;
-
-#ifdef __KERNEL__
-struct xfs_mount;
-struct xfs_trans;
-int    xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-                xfs_imap_t *, uint);
-#endif
-
-#endif /* __XFS_IMAP_H__ */
index a391b955df0176969727b3768d08e6162216e96c..5a5e035e5d388a0fa844c9c66a47cf92dd2db1e7 100644 (file)
@@ -23,7 +23,6 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_imap.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
@@ -41,6 +40,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
@@ -133,10 +133,10 @@ STATIC int
 xfs_imap_to_bp(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
-       xfs_imap_t      *imap,
+       struct xfs_imap *imap,
        xfs_buf_t       **bpp,
        uint            buf_flags,
-       uint            imap_flags)
+       uint            iget_flags)
 {
        int             error;
        int             i;
@@ -173,12 +173,12 @@ xfs_imap_to_bp(
 
                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
                                        (i << mp->m_sb.sb_inodelog));
-               di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-                           XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+               di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
+                           XFS_DINODE_GOOD_VERSION(dip->di_version);
                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
                                                XFS_ERRTAG_ITOBP_INOTOBP,
                                                XFS_RANDOM_ITOBP_INOTOBP))) {
-                       if (imap_flags & XFS_IMAP_BULKSTAT) {
+                       if (iget_flags & XFS_IGET_BULKSTAT) {
                                xfs_trans_brelse(tp, bp);
                                return XFS_ERROR(EINVAL);
                        }
@@ -190,7 +190,7 @@ xfs_imap_to_bp(
                                        "daddr %lld #%d (magic=%x)",
                                XFS_BUFTARG_NAME(mp->m_ddev_targp),
                                (unsigned long long)imap->im_blkno, i,
-                               be16_to_cpu(dip->di_core.di_magic));
+                               be16_to_cpu(dip->di_magic));
 #endif
                        xfs_trans_brelse(tp, bp);
                        return XFS_ERROR(EFSCORRUPTED);
@@ -221,25 +221,26 @@ xfs_imap_to_bp(
  * Use xfs_imap() to determine the size and location of the
  * buffer to read from disk.
  */
-STATIC int
+int
 xfs_inotobp(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_ino_t       ino,
        xfs_dinode_t    **dipp,
        xfs_buf_t       **bpp,
-       int             *offset)
+       int             *offset,
+       uint            imap_flags)
 {
-       xfs_imap_t      imap;
+       struct xfs_imap imap;
        xfs_buf_t       *bp;
        int             error;
 
        imap.im_blkno = 0;
-       error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
+       error = xfs_imap(mp, tp, ino, &imap, imap_flags);
        if (error)
                return error;
 
-       error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+       error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
        if (error)
                return error;
 
@@ -260,15 +261,11 @@ xfs_inotobp(
  * If a non-zero error is returned, then the contents of bpp and
  * dipp are undefined.
  *
- * If the inode is new and has not yet been initialized, use xfs_imap()
- * to determine the size and location of the buffer to read from disk.
- * If the inode has already been mapped to its buffer and read in once,
- * then use the mapping information stored in the inode rather than
- * calling xfs_imap().  This allows us to avoid the overhead of looking
- * at the inode btree for small block file systems (see xfs_dilocate()).
- * We can tell whether the inode has been mapped in before by comparing
- * its disk block address to 0.  Only uninitialized inodes will have
- * 0 for the disk block address.
+ * The inode is expected to already been mapped to its buffer and read
+ * in once, thus we can use the mapping information stored in the inode
+ * rather than calling xfs_imap().  This allows us to avoid the overhead
+ * of looking at the inode btree for small block file systems
+ * (see xfs_imap()).
  */
 int
 xfs_itobp(
@@ -277,40 +274,14 @@ xfs_itobp(
        xfs_inode_t     *ip,
        xfs_dinode_t    **dipp,
        xfs_buf_t       **bpp,
-       xfs_daddr_t     bno,
-       uint            imap_flags,
        uint            buf_flags)
 {
-       xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
 
-       if (ip->i_blkno == (xfs_daddr_t)0) {
-               imap.im_blkno = bno;
-               error = xfs_imap(mp, tp, ip->i_ino, &imap,
-                                       XFS_IMAP_LOOKUP | imap_flags);
-               if (error)
-                       return error;
+       ASSERT(ip->i_imap.im_blkno != 0);
 
-               /*
-                * Fill in the fields in the inode that will be used to
-                * map the inode to its buffer from now on.
-                */
-               ip->i_blkno = imap.im_blkno;
-               ip->i_len = imap.im_len;
-               ip->i_boffset = imap.im_boffset;
-       } else {
-               /*
-                * We've already mapped the inode once, so just use the
-                * mapping that we saved the first time.
-                */
-               imap.im_blkno = ip->i_blkno;
-               imap.im_len = ip->i_len;
-               imap.im_boffset = ip->i_boffset;
-       }
-       ASSERT(bno == 0 || bno == imap.im_blkno);
-
-       error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
+       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
        if (error)
                return error;
 
@@ -321,7 +292,7 @@ xfs_itobp(
                return EAGAIN;
        }
 
-       *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+       *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
        *bpp = bp;
        return 0;
 }
@@ -348,26 +319,26 @@ xfs_iformat(
                XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        error = 0;
 
-       if (unlikely(be32_to_cpu(dip->di_core.di_nextents) +
-                    be16_to_cpu(dip->di_core.di_anextents) >
-                    be64_to_cpu(dip->di_core.di_nblocks))) {
+       if (unlikely(be32_to_cpu(dip->di_nextents) +
+                    be16_to_cpu(dip->di_anextents) >
+                    be64_to_cpu(dip->di_nblocks))) {
                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                        "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
                        (unsigned long long)ip->i_ino,
-                       (int)(be32_to_cpu(dip->di_core.di_nextents) +
-                             be16_to_cpu(dip->di_core.di_anextents)),
+                       (int)(be32_to_cpu(dip->di_nextents) +
+                             be16_to_cpu(dip->di_anextents)),
                        (unsigned long long)
-                               be64_to_cpu(dip->di_core.di_nblocks));
+                               be64_to_cpu(dip->di_nblocks));
                XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
        }
 
-       if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+       if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                        "corrupt dinode %Lu, forkoff = 0x%x.",
                        (unsigned long long)ip->i_ino,
-                       dip->di_core.di_forkoff);
+                       dip->di_forkoff);
                XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
                                     ip->i_mount, dip);
                return XFS_ERROR(EFSCORRUPTED);
@@ -378,25 +349,25 @@ xfs_iformat(
        case S_IFCHR:
        case S_IFBLK:
        case S_IFSOCK:
-               if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) {
+               if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
                        XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
                                              ip->i_mount, dip);
                        return XFS_ERROR(EFSCORRUPTED);
                }
                ip->i_d.di_size = 0;
                ip->i_size = 0;
-               ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev);
+               ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
                break;
 
        case S_IFREG:
        case S_IFLNK:
        case S_IFDIR:
-               switch (dip->di_core.di_format) {
+               switch (dip->di_format) {
                case XFS_DINODE_FMT_LOCAL:
                        /*
                         * no local regular files yet
                         */
-                       if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) {
+                       if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
                                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                                        "corrupt inode %Lu "
                                        "(local format for regular file).",
@@ -407,7 +378,7 @@ xfs_iformat(
                                return XFS_ERROR(EFSCORRUPTED);
                        }
 
-                       di_size = be64_to_cpu(dip->di_core.di_size);
+                       di_size = be64_to_cpu(dip->di_size);
                        if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
                                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
                                        "corrupt inode %Lu "
@@ -449,7 +420,7 @@ xfs_iformat(
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
        ip->i_afp->if_ext_max =
                XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
-       switch (dip->di_core.di_aformat) {
+       switch (dip->di_aformat) {
        case XFS_DINODE_FMT_LOCAL:
                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
                size = be16_to_cpu(atp->hdr.totsize);
@@ -621,7 +592,7 @@ xfs_iformat_btree(
        ifp = XFS_IFORK_PTR(ip, whichfork);
        dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
        size = XFS_BMAP_BROOT_SPACE(dfp);
-       nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
+       nrecs = be16_to_cpu(dfp->bb_numrecs);
 
        /*
         * blow out if -- fork has less extents than can fit in
@@ -649,8 +620,9 @@ xfs_iformat_btree(
         * Copy and convert from the on-disk structure
         * to the in-memory structure.
         */
-       xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
-               ifp->if_broot, size);
+       xfs_bmdr_to_bmbt(ip->i_mount, dfp,
+                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+                        ifp->if_broot, size);
        ifp->if_flags &= ~XFS_IFEXTENTS;
        ifp->if_flags |= XFS_IFBROOT;
 
@@ -660,7 +632,7 @@ xfs_iformat_btree(
 void
 xfs_dinode_from_disk(
        xfs_icdinode_t          *to,
-       xfs_dinode_core_t       *from)
+       xfs_dinode_t            *from)
 {
        to->di_magic = be16_to_cpu(from->di_magic);
        to->di_mode = be16_to_cpu(from->di_mode);
@@ -694,7 +666,7 @@ xfs_dinode_from_disk(
 
 void
 xfs_dinode_to_disk(
-       xfs_dinode_core_t       *to,
+       xfs_dinode_t            *to,
        xfs_icdinode_t          *from)
 {
        to->di_magic = cpu_to_be16(from->di_magic);
@@ -781,93 +753,57 @@ uint
 xfs_dic2xflags(
        xfs_dinode_t            *dip)
 {
-       xfs_dinode_core_t       *dic = &dip->di_core;
-
-       return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) |
+       return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
                                (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 }
 
 /*
- * Given a mount structure and an inode number, return a pointer
- * to a newly allocated in-core inode corresponding to the given
- * inode number.
- *
- * Initialize the inode's attributes and extent pointers if it
- * already has them (it will not if the inode has no links).
+ * Read the disk inode attributes into the in-core inode structure.
  */
 int
 xfs_iread(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
-       xfs_ino_t       ino,
-       xfs_inode_t     **ipp,
+       xfs_inode_t     *ip,
        xfs_daddr_t     bno,
-       uint            imap_flags)
+       uint            iget_flags)
 {
        xfs_buf_t       *bp;
        xfs_dinode_t    *dip;
-       xfs_inode_t     *ip;
        int             error;
 
-       ASSERT(xfs_inode_zone != NULL);
-
-       ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
-       ip->i_ino = ino;
-       ip->i_mount = mp;
-       atomic_set(&ip->i_iocount, 0);
-       spin_lock_init(&ip->i_flags_lock);
-
        /*
-        * Get pointer's to the on-disk inode and the buffer containing it.
-        * If the inode number refers to a block outside the file system
-        * then xfs_itobp() will return NULL.  In this case we should
-        * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
-        * know that this is a new incore inode.
+        * Fill in the location information in the in-core inode.
         */
-       error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
-       if (error) {
-               kmem_zone_free(xfs_inode_zone, ip);
+       ip->i_imap.im_blkno = bno;
+       error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+       if (error)
                return error;
-       }
+       ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
 
        /*
-        * Initialize inode's trace buffers.
-        * Do this before xfs_iformat in case it adds entries.
+        * Get pointers to the on-disk inode and the buffer containing it.
         */
-#ifdef XFS_INODE_TRACE
-       ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMAP_TRACE
-       ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMBT_TRACE
-       ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_RW_TRACE
-       ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_ILOCK_TRACE
-       ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_DIR2_TRACE
-       ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
-#endif
+       error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
+                              XFS_BUF_LOCK, iget_flags);
+       if (error)
+               return error;
+       dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
        /*
         * If we got something that isn't an inode it means someone
         * (nfs or dmi) has a stale handle.
         */
-       if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) {
-               kmem_zone_free(xfs_inode_zone, ip);
-               xfs_trans_brelse(tp, bp);
+       if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
 #ifdef DEBUG
                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
-                               "dip->di_core.di_magic (0x%x) != "
+                               "dip->di_magic (0x%x) != "
                                "XFS_DINODE_MAGIC (0x%x)",
-                               be16_to_cpu(dip->di_core.di_magic),
+                               be16_to_cpu(dip->di_magic),
                                XFS_DINODE_MAGIC);
 #endif /* DEBUG */
-               return XFS_ERROR(EINVAL);
+               error = XFS_ERROR(EINVAL);
+               goto out_brelse;
        }
 
        /*
@@ -877,24 +813,22 @@ xfs_iread(
         * specific information.
         * Otherwise, just get the truly permanent information.
         */
-       if (dip->di_core.di_mode) {
-               xfs_dinode_from_disk(&ip->i_d, &dip->di_core);
+       if (dip->di_mode) {
+               xfs_dinode_from_disk(&ip->i_d, dip);
                error = xfs_iformat(ip, dip);
                if (error)  {
-                       kmem_zone_free(xfs_inode_zone, ip);
-                       xfs_trans_brelse(tp, bp);
 #ifdef DEBUG
                        xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
                                        "xfs_iformat() returned error %d",
                                        error);
 #endif /* DEBUG */
-                       return error;
+                       goto out_brelse;
                }
        } else {
-               ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic);
-               ip->i_d.di_version = dip->di_core.di_version;
-               ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen);
-               ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter);
+               ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+               ip->i_d.di_version = dip->di_version;
+               ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+               ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
                /*
                 * Make sure to pull in the mode here as well in
                 * case the inode is released without being used.
@@ -911,8 +845,6 @@ xfs_iread(
                        XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        }
 
-       INIT_LIST_HEAD(&ip->i_reclaim);
-
        /*
         * The inode format changed when we moved the link count and
         * made it 32 bits long.  If this is an old format inode,
@@ -924,7 +856,7 @@ xfs_iread(
         * the new format. We don't change the version number so that we
         * can distinguish this from a real new format inode.
         */
-       if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+       if (ip->i_d.di_version == 1) {
                ip->i_d.di_nlink = ip->i_d.di_onlink;
                ip->i_d.di_onlink = 0;
                ip->i_d.di_projid = 0;
@@ -938,7 +870,7 @@ xfs_iread(
         * around for a while.  This helps to keep recently accessed
         * meta-data in-core longer.
         */
-        XFS_BUF_SET_REF(bp, XFS_INO_REF);
+       XFS_BUF_SET_REF(bp, XFS_INO_REF);
 
        /*
         * Use xfs_trans_brelse() to release the buffer containing the
@@ -953,9 +885,9 @@ xfs_iread(
         * to worry about the inode being changed just because we released
         * the buffer.
         */
+ out_brelse:
        xfs_trans_brelse(tp, bp);
-       *ipp = ip;
-       return 0;
+       return error;
 }
 
 /*
@@ -1049,6 +981,7 @@ xfs_ialloc(
        uint            flags;
        int             error;
        timespec_t      tv;
+       int             filestreams = 0;
 
        /*
         * Call the space management code to pick
@@ -1056,9 +989,8 @@ xfs_ialloc(
         */
        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
                            ialloc_context, call_again, &ino);
-       if (error != 0) {
+       if (error)
                return error;
-       }
        if (*call_again || ino == NULLFSINO) {
                *ipp = NULL;
                return 0;
@@ -1072,9 +1004,8 @@ xfs_ialloc(
         */
        error = xfs_trans_iget(tp->t_mountp, tp, ino,
                                XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
-       if (error != 0) {
+       if (error)
                return error;
-       }
        ASSERT(ip != NULL);
 
        ip->i_d.di_mode = (__uint16_t)mode;
@@ -1093,8 +1024,8 @@ xfs_ialloc(
         * here rather than here and in the flush/logging code.
         */
        if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
-           ip->i_d.di_version == XFS_DINODE_VERSION_1) {
-               ip->i_d.di_version = XFS_DINODE_VERSION_2;
+           ip->i_d.di_version == 1) {
+               ip->i_d.di_version = 2;
                /*
                 * We've already zeroed the old link count, the projid field,
                 * and the pad field.
@@ -1104,7 +1035,7 @@ xfs_ialloc(
        /*
         * Project ids won't be stored on disk if we are using a version 1 inode.
         */
-       if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
+       if ((prid != 0) && (ip->i_d.di_version == 1))
                xfs_bump_ino_vers2(tp, ip);
 
        if (pip && XFS_INHERIT_GID(pip)) {
@@ -1155,13 +1086,12 @@ xfs_ialloc(
                flags |= XFS_ILOG_DEV;
                break;
        case S_IFREG:
-               if (pip && xfs_inode_is_filestream(pip)) {
-                       error = xfs_filestream_associate(pip, ip);
-                       if (error < 0)
-                               return -error;
-                       if (!error)
-                               xfs_iflags_set(ip, XFS_IFILESTREAM);
-               }
+               /*
+                * we can't set up filestreams until after the VFS inode
+                * is set up properly.
+                */
+               if (pip && xfs_inode_is_filestream(pip))
+                       filestreams = 1;
                /* fall through */
        case S_IFDIR:
                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
@@ -1227,6 +1157,15 @@ xfs_ialloc(
        /* now that we have an i_mode we can setup inode ops and unlock */
        xfs_setup_inode(ip);
 
+       /* now we have set up the vfs inode we can associate the filestream */
+       if (filestreams) {
+               error = xfs_filestream_associate(pip, ip);
+               if (error < 0)
+                       return -error;
+               if (!error)
+                       xfs_iflags_set(ip, XFS_IFILESTREAM);
+       }
+
        *ipp = ip;
        return 0;
 }
@@ -1383,8 +1322,8 @@ xfs_itrunc_trace(
  * direct I/O with the truncate operation.  Also, because we hold
  * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
  * started until the truncate completes and drops the lock. Essentially,
- * the vn_iowait() call forms an I/O barrier that provides strict ordering
- * between direct I/Os and the truncate operation.
+ * the xfs_ioend_wait() call forms an I/O barrier that provides strict
+ * ordering between direct I/Os and the truncate operation.
  *
  * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
  * or XFS_ITRUNC_MAYBE.  The XFS_ITRUNC_MAYBE value should be used
@@ -1415,7 +1354,7 @@ xfs_itruncate_start(
 
        /* wait for the completion of any pending DIOs */
        if (new_size == 0 || new_size < ip->i_size)
-               vn_iowait(ip);
+               xfs_ioend_wait(ip);
 
        /*
         * Call toss_pages or flushinval_pages to get rid of pages
@@ -1726,8 +1665,14 @@ xfs_itruncate_finish(
                xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
                xfs_trans_ihold(ntp, ip);
 
-               if (!error)
-                       error = xfs_trans_reserve(ntp, 0,
+               if (error)
+                       return error;
+               /*
+                * transaction commit worked ok so we can drop the extra ticket
+                * reference that we gained in xfs_trans_dup()
+                */
+               xfs_log_ticket_put(ntp->t_ticket);
+               error = xfs_trans_reserve(ntp, 0,
                                        XFS_ITRUNCATE_LOG_RES(mp), 0,
                                        XFS_TRANS_PERM_LOG_RES,
                                        XFS_ITRUNCATE_LOG_COUNT);
@@ -1781,13 +1726,10 @@ xfs_iunlink(
        xfs_dinode_t    *dip;
        xfs_buf_t       *agibp;
        xfs_buf_t       *ibp;
-       xfs_agnumber_t  agno;
-       xfs_daddr_t     agdaddr;
        xfs_agino_t     agino;
        short           bucket_index;
        int             offset;
        int             error;
-       int             agi_ok;
 
        ASSERT(ip->i_d.di_nlink == 0);
        ASSERT(ip->i_d.di_mode != 0);
@@ -1795,31 +1737,15 @@ xfs_iunlink(
 
        mp = tp->t_mountp;
 
-       agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
-       agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
-
        /*
         * Get the agi buffer first.  It ensures lock ordering
         * on the list.
         */
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
-                                  XFS_FSS_TO_BB(mp, 1), 0, &agibp);
+       error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
        if (error)
                return error;
-
-       /*
-        * Validate the magic number of the agi block.
-        */
        agi = XFS_BUF_TO_AGI(agibp);
-       agi_ok =
-               be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
-               XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
-       if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
-                       XFS_RANDOM_IUNLINK))) {
-               XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
-               xfs_trans_brelse(tp, agibp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
+
        /*
         * Get the index into the agi hash table for the
         * list this inode will go on.
@@ -1837,14 +1763,14 @@ xfs_iunlink(
                 * Here we put the head pointer into our next pointer,
                 * and then we fall through to point the head at us.
                 */
-               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
+               error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
                if (error)
                        return error;
 
                ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
                /* both on-disk, don't endian flip twice */
                dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
-               offset = ip->i_boffset +
+               offset = ip->i_imap.im_boffset +
                        offsetof(xfs_dinode_t, di_next_unlinked);
                xfs_trans_inode_buf(tp, ibp);
                xfs_trans_log_buf(tp, ibp, offset,
@@ -1879,7 +1805,6 @@ xfs_iunlink_remove(
        xfs_buf_t       *agibp;
        xfs_buf_t       *ibp;
        xfs_agnumber_t  agno;
-       xfs_daddr_t     agdaddr;
        xfs_agino_t     agino;
        xfs_agino_t     next_agino;
        xfs_buf_t       *last_ibp;
@@ -1887,45 +1812,20 @@ xfs_iunlink_remove(
        short           bucket_index;
        int             offset, last_offset = 0;
        int             error;
-       int             agi_ok;
 
-       /*
-        * First pull the on-disk inode from the AGI unlinked list.
-        */
        mp = tp->t_mountp;
-
        agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
-       agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
 
        /*
         * Get the agi buffer first.  It ensures lock ordering
         * on the list.
         */
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
-                                  XFS_FSS_TO_BB(mp, 1), 0, &agibp);
-       if (error) {
-               cmn_err(CE_WARN,
-                       "xfs_iunlink_remove: xfs_trans_read_buf()  returned an error %d on %s.  Returning error.",
-                       error, mp->m_fsname);
+       error = xfs_read_agi(mp, tp, agno, &agibp);
+       if (error)
                return error;
-       }
-       /*
-        * Validate the magic number of the agi block.
-        */
+
        agi = XFS_BUF_TO_AGI(agibp);
-       agi_ok =
-               be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
-               XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
-       if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
-                       XFS_RANDOM_IUNLINK_REMOVE))) {
-               XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
-                                    mp, agi);
-               xfs_trans_brelse(tp, agibp);
-               cmn_err(CE_WARN,
-                       "xfs_iunlink_remove: XFS_TEST_ERROR()  returned an error on %s.  Returning EFSCORRUPTED.",
-                        mp->m_fsname);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
+
        /*
         * Get the index into the agi hash table for the
         * list this inode will go on.
@@ -1945,7 +1845,7 @@ xfs_iunlink_remove(
                 * of dealing with the buffer when there is no need to
                 * change it.
                 */
-               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
+               error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
                if (error) {
                        cmn_err(CE_WARN,
                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -1956,7 +1856,7 @@ xfs_iunlink_remove(
                ASSERT(next_agino != 0);
                if (next_agino != NULLAGINO) {
                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-                       offset = ip->i_boffset +
+                       offset = ip->i_imap.im_boffset +
                                offsetof(xfs_dinode_t, di_next_unlinked);
                        xfs_trans_inode_buf(tp, ibp);
                        xfs_trans_log_buf(tp, ibp, offset,
@@ -1992,7 +1892,7 @@ xfs_iunlink_remove(
                        }
                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
                        error = xfs_inotobp(mp, tp, next_ino, &last_dip,
-                                           &last_ibp, &last_offset);
+                                           &last_ibp, &last_offset, 0);
                        if (error) {
                                cmn_err(CE_WARN,
                        "xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.  Returning error.",
@@ -2007,7 +1907,7 @@ xfs_iunlink_remove(
                 * Now last_ibp points to the buffer previous to us on
                 * the unlinked list.  Pull us from the list.
                 */
-               error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
+               error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
                if (error) {
                        cmn_err(CE_WARN,
                                "xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2019,7 +1919,7 @@ xfs_iunlink_remove(
                ASSERT(next_agino != agino);
                if (next_agino != NULLAGINO) {
                        dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
-                       offset = ip->i_boffset +
+                       offset = ip->i_imap.im_boffset +
                                offsetof(xfs_dinode_t, di_next_unlinked);
                        xfs_trans_inode_buf(tp, ibp);
                        xfs_trans_log_buf(tp, ibp, offset,
@@ -2160,9 +2060,9 @@ xfs_ifree_cluster(
                                iip = (xfs_inode_log_item_t *)lip;
                                ASSERT(iip->ili_logged == 1);
                                lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
-                               spin_lock(&mp->m_ail_lock);
-                               iip->ili_flush_lsn = iip->ili_item.li_lsn;
-                               spin_unlock(&mp->m_ail_lock);
+                               xfs_trans_ail_copy_lsn(mp->m_ail,
+                                                       &iip->ili_flush_lsn,
+                                                       &iip->ili_item.li_lsn);
                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
                                pre_flushed++;
                        }
@@ -2183,9 +2083,8 @@ xfs_ifree_cluster(
                        iip->ili_last_fields = iip->ili_format.ilf_fields;
                        iip->ili_format.ilf_fields = 0;
                        iip->ili_logged = 1;
-                       spin_lock(&mp->m_ail_lock);
-                       iip->ili_flush_lsn = iip->ili_item.li_lsn;
-                       spin_unlock(&mp->m_ail_lock);
+                       xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+                                               &iip->ili_item.li_lsn);
 
                        xfs_buf_attach_iodone(bp,
                                (void(*)(xfs_buf_t*,xfs_log_item_t*))
@@ -2263,7 +2162,7 @@ xfs_ifree(
 
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-       error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
+       error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
        if (error)
                return error;
 
@@ -2279,7 +2178,7 @@ xfs_ifree(
        * This is a temporary hack that would require a proper fix
        * in the future.
        */
-       dip->di_core.di_mode = 0;
+       dip->di_mode = 0;
 
        if (delete) {
                xfs_ifree_cluster(ip, tp, first_ino);
@@ -2312,9 +2211,10 @@ xfs_iroot_realloc(
        int                     rec_diff,
        int                     whichfork)
 {
+       struct xfs_mount        *mp = ip->i_mount;
        int                     cur_max;
        xfs_ifork_t             *ifp;
-       xfs_bmbt_block_t        *new_broot;
+       struct xfs_btree_block  *new_broot;
        int                     new_max;
        size_t                  new_size;
        char                    *np;
@@ -2335,8 +2235,7 @@ xfs_iroot_realloc(
                 */
                if (ifp->if_broot_bytes == 0) {
                        new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
-                       ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
-                                                                    KM_SLEEP);
+                       ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
                        ifp->if_broot_bytes = (int)new_size;
                        return;
                }
@@ -2347,18 +2246,16 @@ xfs_iroot_realloc(
                 * location.  The records don't change location because
                 * they are kept butted up against the btree block header.
                 */
-               cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+               cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
                new_max = cur_max + rec_diff;
                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
-               ifp->if_broot = (xfs_bmbt_block_t *)
-                 kmem_realloc(ifp->if_broot,
-                               new_size,
+               ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
                                (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
                                KM_SLEEP);
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
-                                                     ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
-                                                     (int)new_size);
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    ifp->if_broot_bytes);
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    (int)new_size);
                ifp->if_broot_bytes = (int)new_size;
                ASSERT(ifp->if_broot_bytes <=
                        XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
@@ -2372,7 +2269,7 @@ xfs_iroot_realloc(
         * records, just get rid of the root and clear the status bit.
         */
        ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
-       cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+       cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
        new_max = cur_max + rec_diff;
        ASSERT(new_max >= 0);
        if (new_max > 0)
@@ -2380,11 +2277,11 @@ xfs_iroot_realloc(
        else
                new_size = 0;
        if (new_size > 0) {
-               new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
+               new_broot = kmem_alloc(new_size, KM_SLEEP);
                /*
                 * First copy over the btree block header.
                 */
-               memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
+               memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
        } else {
                new_broot = NULL;
                ifp->if_flags &= ~XFS_IFBROOT;
@@ -2397,18 +2294,16 @@ xfs_iroot_realloc(
                /*
                 * First copy the records.
                 */
-               op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
-                                                    ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
-                                                    (int)new_size);
+               op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+               np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
                memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
 
                /*
                 * Then copy the pointers.
                 */
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
                                                     ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
                                                     (int)new_size);
                memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
        }
@@ -2511,64 +2406,6 @@ xfs_idata_realloc(
        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
 }
 
-
-
-
-/*
- * Map inode to disk block and offset.
- *
- * mp -- the mount point structure for the current file system
- * tp -- the current transaction
- * ino -- the inode number of the inode to be located
- * imap -- this structure is filled in with the information necessary
- *      to retrieve the given inode from disk
- * flags -- flags to pass to xfs_dilocate indicating whether or not
- *      lookups in the inode btree were OK or not
- */
-int
-xfs_imap(
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_ino_t       ino,
-       xfs_imap_t      *imap,
-       uint            flags)
-{
-       xfs_fsblock_t   fsbno;
-       int             len;
-       int             off;
-       int             error;
-
-       fsbno = imap->im_blkno ?
-               XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
-       error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
-       if (error)
-               return error;
-
-       imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
-       imap->im_len = XFS_FSB_TO_BB(mp, len);
-       imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
-       imap->im_ioffset = (ushort)off;
-       imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
-
-       /*
-        * If the inode number maps to a block outside the bounds
-        * of the file system then return NULL rather than calling
-        * read_buf and panicing when we get an error from the
-        * driver.
-        */
-       if ((imap->im_blkno + imap->im_len) >
-           XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-               xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-                       "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
-                       " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
-                       (unsigned long long) imap->im_blkno,
-                       (unsigned long long) imap->im_len,
-                       XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-               return EINVAL;
-       }
-       return 0;
-}
-
 void
 xfs_idestroy_fork(
        xfs_inode_t     *ip,
@@ -2612,70 +2449,6 @@ xfs_idestroy_fork(
        }
 }
 
-/*
- * This is called free all the memory associated with an inode.
- * It must free the inode itself and any buffers allocated for
- * if_extents/if_data and if_broot.  It must also free the lock
- * associated with the inode.
- */
-void
-xfs_idestroy(
-       xfs_inode_t     *ip)
-{
-       switch (ip->i_d.di_mode & S_IFMT) {
-       case S_IFREG:
-       case S_IFDIR:
-       case S_IFLNK:
-               xfs_idestroy_fork(ip, XFS_DATA_FORK);
-               break;
-       }
-       if (ip->i_afp)
-               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-       mrfree(&ip->i_lock);
-       mrfree(&ip->i_iolock);
-
-#ifdef XFS_INODE_TRACE
-       ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
-       ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BMBT_TRACE
-       ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
-       ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
-       ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
-       ktrace_free(ip->i_dir_trace);
-#endif
-       if (ip->i_itemp) {
-               /*
-                * Only if we are shutting down the fs will we see an
-                * inode still in the AIL. If it is there, we should remove
-                * it to prevent a use-after-free from occurring.
-                */
-               xfs_mount_t     *mp = ip->i_mount;
-               xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
-
-               ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
-                                      XFS_FORCED_SHUTDOWN(ip->i_mount));
-               if (lip->li_flags & XFS_LI_IN_AIL) {
-                       spin_lock(&mp->m_ail_lock);
-                       if (lip->li_flags & XFS_LI_IN_AIL)
-                               xfs_trans_delete_ail(mp, lip);
-                       else
-                               spin_unlock(&mp->m_ail_lock);
-               }
-               xfs_inode_item_destroy(ip);
-       }
-       kmem_zone_free(xfs_inode_zone, ip);
-}
-
-
 /*
  * Increment the pin count of the given buffer.
  * This value is protected by ipinlock spinlock in the mount structure.
@@ -2880,7 +2653,7 @@ xfs_iflush_fork(
                        ASSERT(ifp->if_broot_bytes <=
                               (XFS_IFORK_SIZE(ip, whichfork) +
                                XFS_BROOT_SIZE_ADJ));
-                       xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
+                       xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
                                (xfs_bmdr_block_t *)cp,
                                XFS_DFORK_SIZE(dip, mp, whichfork));
                }
@@ -2889,15 +2662,16 @@ xfs_iflush_fork(
        case XFS_DINODE_FMT_DEV:
                if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
                        ASSERT(whichfork == XFS_DATA_FORK);
-                       dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev);
+                       xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
                }
                break;
 
        case XFS_DINODE_FMT_UUID:
                if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
                        ASSERT(whichfork == XFS_DATA_FORK);
-                       memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
-                               sizeof(uuid_t));
+                       memcpy(XFS_DFORK_DPTR(dip),
+                              &ip->i_df.if_u2.if_uuid,
+                              sizeof(uuid_t));
                }
                break;
 
@@ -3030,7 +2804,6 @@ cluster_corrupt_out:
                        XFS_BUF_CLR_BDSTRAT_FUNC(bp);
                        XFS_BUF_UNDONE(bp);
                        XFS_BUF_STALE(bp);
-                       XFS_BUF_SHUT(bp);
                        XFS_BUF_ERROR(bp,EIO);
                        xfs_biodone(bp);
                } else {
@@ -3172,7 +2945,7 @@ xfs_iflush(
        /*
         * Get the buffer containing the on-disk inode.
         */
-       error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
+       error = xfs_itobp(mp, NULL, ip, &dip, &bp,
                                noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
        if (error || !bp) {
                xfs_ifunlock(ip);
@@ -3253,7 +3026,7 @@ xfs_iflush_int(
        }
 
        /* set *dip = inode's place in the buffer */
-       dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
+       dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
        /*
         * Clear i_update_core before copying out the data.
@@ -3275,11 +3048,11 @@ xfs_iflush_int(
         */
        xfs_synchronize_atime(ip);
 
-       if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC,
+       if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
                               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
                xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
                    "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
-                       ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip);
+                       ip->i_ino, be16_to_cpu(dip->di_magic), dip);
                goto corrupt_out;
        }
        if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
@@ -3342,7 +3115,7 @@ xfs_iflush_int(
         * because if the inode is dirty at all the core must
         * be.
         */
-       xfs_dinode_to_disk(&dip->di_core, &ip->i_d);
+       xfs_dinode_to_disk(dip, &ip->i_d);
 
        /* Wrap, we never let the log put out DI_MAX_FLUSH */
        if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3354,28 +3127,27 @@ xfs_iflush_int(
         * convert back to the old inode format.  If the superblock version
         * has been updated, then make the conversion permanent.
         */
-       ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
-              xfs_sb_version_hasnlink(&mp->m_sb));
-       if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+       ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
+       if (ip->i_d.di_version == 1) {
                if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
                        /*
                         * Convert it back.
                         */
                        ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
-                       dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink);
+                       dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
                } else {
                        /*
                         * The superblock version has already been bumped,
                         * so just make the conversion to the new inode
                         * format permanent.
                         */
-                       ip->i_d.di_version = XFS_DINODE_VERSION_2;
-                       dip->di_core.di_version =  XFS_DINODE_VERSION_2;
+                       ip->i_d.di_version = 2;
+                       dip->di_version = 2;
                        ip->i_d.di_onlink = 0;
-                       dip->di_core.di_onlink = 0;
+                       dip->di_onlink = 0;
                        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
-                       memset(&(dip->di_core.di_pad[0]), 0,
-                             sizeof(dip->di_core.di_pad));
+                       memset(&(dip->di_pad[0]), 0,
+                             sizeof(dip->di_pad));
                        ASSERT(ip->i_d.di_projid == 0);
                }
        }
@@ -3418,10 +3190,8 @@ xfs_iflush_int(
                iip->ili_format.ilf_fields = 0;
                iip->ili_logged = 1;
 
-               ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
-               spin_lock(&mp->m_ail_lock);
-               iip->ili_flush_lsn = iip->ili_item.li_lsn;
-               spin_unlock(&mp->m_ail_lock);
+               xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+                                       &iip->ili_item.li_lsn);
 
                /*
                 * Attach the function xfs_iflush_done to the inode's
@@ -3459,45 +3229,8 @@ corrupt_out:
 }
 
 
-/*
- * Flush all inactive inodes in mp.
- */
-void
-xfs_iflush_all(
-       xfs_mount_t     *mp)
-{
-       xfs_inode_t     *ip;
-
- again:
-       XFS_MOUNT_ILOCK(mp);
-       ip = mp->m_inodes;
-       if (ip == NULL)
-               goto out;
-
-       do {
-               /* Make sure we skip markers inserted by sync */
-               if (ip->i_mount == NULL) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
-
-               if (!VFS_I(ip)) {
-                       XFS_MOUNT_IUNLOCK(mp);
-                       xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
-                       goto again;
-               }
-
-               ASSERT(vn_count(VFS_I(ip)) == 0);
-
-               ip = ip->i_mnext;
-       } while (ip != mp->m_inodes);
- out:
-       XFS_MOUNT_IUNLOCK(mp);
-}
 
 #ifdef XFS_ILOCK_TRACE
-ktrace_t       *xfs_ilock_trace_buf;
-
 void
 xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
 {
index 6be310d41daf105556239a2f338dafaeb940e35f..1f175fa34b225df1da087a6fceadb201205fcbe8 100644 (file)
@@ -19,8 +19,7 @@
 #define        __XFS_INODE_H__
 
 struct xfs_dinode;
-struct xfs_dinode_core;
-
+struct xfs_inode;
 
 /*
  * Fork identifiers.
@@ -63,7 +62,7 @@ typedef struct xfs_ext_irec {
 typedef struct xfs_ifork {
        int                     if_bytes;       /* bytes in if_u1 */
        int                     if_real_bytes;  /* bytes allocated in if_u1 */
-       xfs_bmbt_block_t        *if_broot;      /* file's incore btree root */
+       struct xfs_btree_block  *if_broot;      /* file's incore btree root */
        short                   if_broot_bytes; /* bytes allocated for root */
        unsigned char           if_flags;       /* per-fork flags */
        unsigned char           if_ext_max;     /* max # of extent records */
@@ -84,52 +83,14 @@ typedef struct xfs_ifork {
 } xfs_ifork_t;
 
 /*
- * Flags for xfs_ichgtime().
+ * Inode location information.  Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
  */
-#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
-#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
-
-/*
- * Per-fork incore inode flags.
- */
-#define        XFS_IFINLINE    0x01    /* Inline data is read in */
-#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
-#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
-#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
-
-/*
- * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate().
- */
-#define XFS_IMAP_LOOKUP                0x1
-#define XFS_IMAP_BULKSTAT      0x2
-
-#ifdef __KERNEL__
-struct bhv_desc;
-struct cred;
-struct ktrace;
-struct xfs_buf;
-struct xfs_bmap_free;
-struct xfs_bmbt_irec;
-struct xfs_bmbt_block;
-struct xfs_inode;
-struct xfs_inode_log_item;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_dquot;
-
-#if defined(XFS_ILOCK_TRACE)
-#define XFS_ILOCK_KTRACE_SIZE  32
-extern ktrace_t *xfs_ilock_trace_buf;
-extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
-#else
-#define        xfs_ilock_trace(i,n,f,ra)
-#endif
-
-typedef struct dm_attrs_s {
-       __uint32_t      da_dmevmask;    /* DMIG event mask */
-       __uint16_t      da_dmstate;     /* DMIG state info */
-       __uint16_t      da_pad;         /* DMIG extra padding */
-} dm_attrs_t;
+struct xfs_imap {
+       xfs_daddr_t     im_blkno;       /* starting BB of inode chunk */
+       ushort          im_len;         /* length in BBs of inode chunk */
+       ushort          im_boffset;     /* inode offset in block in bytes */
+};
 
 /*
  * This is the xfs in-core inode structure.
@@ -160,7 +121,7 @@ typedef struct xfs_ictimestamp {
 } xfs_ictimestamp_t;
 
 /*
- * NOTE:  This structure must be kept identical to struct xfs_dinode_core
+ * NOTE:  This structure must be kept identical to struct xfs_dinode
  *       in xfs_dinode.h except for the endianess annotations.
  */
 typedef struct xfs_icdinode {
@@ -191,27 +152,97 @@ typedef struct xfs_icdinode {
        __uint32_t      di_gen;         /* generation number */
 } xfs_icdinode_t;
 
-typedef struct {
-       struct xfs_inode        *ip_mnext;      /* next inode in mount list */
-       struct xfs_inode        *ip_mprev;      /* ptr to prev inode */
-       struct xfs_mount        *ip_mount;      /* fs mount struct ptr */
-} xfs_iptr_t;
+/*
+ * Flags for xfs_ichgtime().
+ */
+#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
+#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define        XFS_IFINLINE    0x01    /* Inline data is read in */
+#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
+#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
+#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
+
+/*
+ * Fork handling.
+ */
+
+#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
+#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
+
+#define XFS_IFORK_PTR(ip,w)            \
+       ((w) == XFS_DATA_FORK ? \
+               &(ip)->i_df : \
+               (ip)->i_afp)
+#define XFS_IFORK_DSIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_IFORK_BOFF(ip) : \
+               XFS_LITINO((ip)->i_mount))
+#define XFS_IFORK_ASIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
+               0)
+#define XFS_IFORK_SIZE(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               XFS_IFORK_DSIZE(ip) : \
+               XFS_IFORK_ASIZE(ip))
+#define XFS_IFORK_FORMAT(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_format : \
+               (ip)->i_d.di_aformat)
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_format = (n)) : \
+               ((ip)->i_d.di_aformat = (n)))
+#define XFS_IFORK_NEXTENTS(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_nextents : \
+               (ip)->i_d.di_anextents)
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_nextents = (n)) : \
+               ((ip)->i_d.di_anextents = (n)))
+
+
+
+#ifdef __KERNEL__
+
+struct bhv_desc;
+struct cred;
+struct ktrace;
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_bmbt_irec;
+struct xfs_inode_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot;
+
+#if defined(XFS_ILOCK_TRACE)
+#define XFS_ILOCK_KTRACE_SIZE  32
+extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
+#else
+#define        xfs_ilock_trace(i,n,f,ra)
+#endif
+
+typedef struct dm_attrs_s {
+       __uint32_t      da_dmevmask;    /* DMIG event mask */
+       __uint16_t      da_dmstate;     /* DMIG state info */
+       __uint16_t      da_pad;         /* DMIG extra padding */
+} dm_attrs_t;
 
 typedef struct xfs_inode {
        /* Inode linking and identification information. */
-       struct xfs_inode        *i_mnext;       /* next inode in mount list */
-       struct xfs_inode        *i_mprev;       /* ptr to prev inode */
        struct xfs_mount        *i_mount;       /* fs mount struct ptr */
-       struct list_head        i_reclaim;      /* reclaim list */
-       struct inode            *i_vnode;       /* vnode backpointer */
        struct xfs_dquot        *i_udquot;      /* user dquot */
        struct xfs_dquot        *i_gdquot;      /* group dquot */
 
        /* Inode location stuff */
        xfs_ino_t               i_ino;          /* inode number (agno/agino)*/
-       xfs_daddr_t             i_blkno;        /* blkno of inode buffer */
-       ushort                  i_len;          /* len of inode buffer */
-       ushort                  i_boffset;      /* off of inode in buffer */
+       struct xfs_imap         i_imap;         /* location for xfs_imap() */
 
        /* Extent information. */
        xfs_ifork_t             *i_afp;         /* attribute fork pointer */
@@ -230,7 +261,6 @@ typedef struct xfs_inode {
        unsigned short          i_flags;        /* see defined flags below */
        unsigned char           i_update_core;  /* timestamps/size is dirty */
        unsigned char           i_update_size;  /* di_size field is dirty */
-       unsigned int            i_gen;          /* generation count */
        unsigned int            i_delayed_blks; /* count of delay alloc blks */
 
        xfs_icdinode_t          i_d;            /* most of ondisk inode */
@@ -238,6 +268,10 @@ typedef struct xfs_inode {
        xfs_fsize_t             i_size;         /* in-memory size */
        xfs_fsize_t             i_new_size;     /* size when write completes */
        atomic_t                i_iocount;      /* outstanding I/O count */
+
+       /* VFS inode */
+       struct inode            i_vnode;        /* embedded VFS inode */
+
        /* Trace buffers per inode. */
 #ifdef XFS_INODE_TRACE
        struct ktrace           *i_trace;       /* general inode trace */
@@ -245,7 +279,7 @@ typedef struct xfs_inode {
 #ifdef XFS_BMAP_TRACE
        struct ktrace           *i_xtrace;      /* inode extent list trace */
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
        struct ktrace           *i_btrace;      /* inode bmap btree trace */
 #endif
 #ifdef XFS_RW_TRACE
@@ -265,13 +299,30 @@ typedef struct xfs_inode {
 /* Convert from vfs inode to xfs inode */
 static inline struct xfs_inode *XFS_I(struct inode *inode)
 {
-       return (struct xfs_inode *)inode->i_private;
+       return container_of(inode, struct xfs_inode, i_vnode);
 }
 
 /* convert from xfs inode to vfs inode */
 static inline struct inode *VFS_I(struct xfs_inode *ip)
 {
-       return (struct inode *)ip->i_vnode;
+       return &ip->i_vnode;
+}
+
+/*
+ * Get rid of a partially initialized inode.
+ *
+ * We have to go through destroy_inode to make sure allocations
+ * from init_inode_always like the security data are undone.
+ *
+ * We mark the inode bad so that it takes the short cut in
+ * the reclaim path instead of going through the flush path
+ * which doesn't make sense for an inode that has never seen the
+ * light of day.
+ */
+static inline void xfs_destroy_inode(struct xfs_inode *ip)
+{
+       make_bad_inode(VFS_I(ip));
+       return destroy_inode(VFS_I(ip));
 }
 
 /*
@@ -327,65 +378,36 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
        spin_unlock(&ip->i_flags_lock);
        return ret;
 }
-#endif /* __KERNEL__ */
-
 
 /*
- * Fork handling.
+ * Manage the i_flush queue embedded in the inode.  This completion
+ * queue synchronizes processes attempting to flush the in-core
+ * inode back to disk.
  */
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+       wait_for_completion(&ip->i_flush);
+}
 
-#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
-#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w)            \
-       ((w) == XFS_DATA_FORK ? \
-               &(ip)->i_df : \
-               (ip)->i_afp)
-#define XFS_IFORK_DSIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_IFORK_BOFF(ip) : \
-               XFS_LITINO((ip)->i_mount))
-#define XFS_IFORK_ASIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
-               0)
-#define XFS_IFORK_SIZE(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               XFS_IFORK_DSIZE(ip) : \
-               XFS_IFORK_ASIZE(ip))
-#define XFS_IFORK_FORMAT(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_format : \
-               (ip)->i_d.di_aformat)
-#define XFS_IFORK_FMT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_format = (n)) : \
-               ((ip)->i_d.di_aformat = (n)))
-#define XFS_IFORK_NEXTENTS(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_nextents : \
-               (ip)->i_d.di_anextents)
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_nextents = (n)) : \
-               ((ip)->i_d.di_anextents = (n)))
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+       return try_wait_for_completion(&ip->i_flush);
+}
 
-#ifdef __KERNEL__
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+       complete(&ip->i_flush);
+}
 
 /*
  * In-core inode flags.
  */
-#define XFS_IGRIO      0x0001  /* inode used for guaranteed rate i/o */
-#define XFS_IUIOSZ     0x0002  /* inode i/o sizes have been explicitly set */
-#define XFS_IQUIESCE    0x0004  /* we have started quiescing for this inode */
-#define XFS_IRECLAIM    0x0008  /* we have started reclaiming this inode    */
-#define XFS_ISTALE     0x0010  /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */
-#define XFS_INEW       0x0040
-#define XFS_IFILESTREAM        0x0080  /* inode is in a filestream directory */
-#define XFS_IMODIFIED  0x0100  /* XFS inode state possibly differs */
-                               /* to the Linux inode state. */
-#define XFS_ITRUNCATED 0x0200  /* truncated down so flush-on-close */
+#define XFS_IRECLAIM    0x0001  /* we have started reclaiming this inode    */
+#define XFS_ISTALE     0x0002  /* inode has been staled */
+#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
+#define XFS_INEW       0x0008  /* inode has just been allocated */
+#define XFS_IFILESTREAM        0x0010  /* inode is in a filestream directory */
+#define XFS_ITRUNCATED 0x0020  /* truncated down so flush-on-close */
 
 /*
  * Flags for inode locking.
@@ -459,17 +481,9 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
        (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
         ((pip)->i_d.di_mode & S_ISGID))
 
-/*
- * Flags for xfs_iget()
- */
-#define XFS_IGET_CREATE                0x1
-#define XFS_IGET_BULKSTAT      0x2
-
 /*
  * xfs_iget.c prototypes.
  */
-void           xfs_ihash_init(struct xfs_mount *);
-void           xfs_ihash_free(struct xfs_mount *);
 xfs_inode_t    *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
                                  struct xfs_trans *);
 int            xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
@@ -484,25 +498,13 @@ int               xfs_isilocked(xfs_inode_t *, uint);
 uint           xfs_ilock_map_shared(xfs_inode_t *);
 void           xfs_iunlock_map_shared(xfs_inode_t *, uint);
 void           xfs_ireclaim(xfs_inode_t *);
-int            xfs_finish_reclaim(xfs_inode_t *, int, int);
-int            xfs_finish_reclaim_all(struct xfs_mount *, int);
 
 /*
  * xfs_inode.c prototypes.
  */
-int            xfs_itobp(struct xfs_mount *, struct xfs_trans *,
-                         xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
-                         xfs_daddr_t, uint, uint);
-int            xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-                         xfs_inode_t **, xfs_daddr_t, uint);
-int            xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
 int            xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
                           xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
                           int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
-void           xfs_dinode_from_disk(struct xfs_icdinode *,
-                                    struct xfs_dinode_core *);
-void           xfs_dinode_to_disk(struct xfs_dinode_core *,
-                                  struct xfs_icdinode *);
 
 uint           xfs_ip2xflags(struct xfs_inode *);
 uint           xfs_dic2xflags(struct xfs_dinode *);
@@ -513,17 +515,10 @@ int               xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
                                     xfs_fsize_t, int, int);
 int            xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 
-void           xfs_idestroy_fork(xfs_inode_t *, int);
-void           xfs_idestroy(xfs_inode_t *);
-void           xfs_idata_realloc(xfs_inode_t *, int, int);
-void           xfs_iextract(xfs_inode_t *);
 void           xfs_iext_realloc(xfs_inode_t *, int, int);
-void           xfs_iroot_realloc(xfs_inode_t *, int, int);
 void           xfs_ipin(xfs_inode_t *);
 void           xfs_iunpin(xfs_inode_t *);
-int            xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
 int            xfs_iflush(xfs_inode_t *, uint);
-void           xfs_iflush_all(struct xfs_mount *);
 void           xfs_ichgtime(xfs_inode_t *, int);
 xfs_fsize_t    xfs_file_last_byte(xfs_inode_t *);
 void           xfs_lock_inodes(xfs_inode_t **, int, uint);
@@ -532,6 +527,77 @@ void               xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 void           xfs_synchronize_atime(xfs_inode_t *);
 void           xfs_mark_inode_dirty_sync(xfs_inode_t *);
 
+#if defined(XFS_INODE_TRACE)
+
+#define        INODE_TRACE_SIZE        16              /* number of trace entries */
+#define        INODE_KTRACE_ENTRY      1
+#define        INODE_KTRACE_EXIT       2
+#define        INODE_KTRACE_HOLD       3
+#define        INODE_KTRACE_REF        4
+#define        INODE_KTRACE_RELE       5
+
+extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
+extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
+extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
+extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
+extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
+#define xfs_itrace_entry(ip)   \
+       _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
+#define xfs_itrace_exit(ip)    \
+       _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
+#define xfs_itrace_exit_tag(ip, tag)   \
+       _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
+#define xfs_itrace_ref(ip)     \
+       _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
+
+#else
+#define        xfs_itrace_entry(a)
+#define        xfs_itrace_exit(a)
+#define        xfs_itrace_exit_tag(a, b)
+#define        xfs_itrace_hold(a, b, c, d)
+#define        xfs_itrace_ref(a)
+#define        xfs_itrace_rele(a, b, c, d)
+#endif
+
+#define IHOLD(ip) \
+do { \
+       ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
+       atomic_inc(&(VFS_I(ip)->i_count)); \
+       xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+} while (0)
+
+#define IRELE(ip) \
+do { \
+       xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+       iput(VFS_I(ip)); \
+} while (0)
+
+#endif /* __KERNEL__ */
+
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE                0x1
+#define XFS_IGET_BULKSTAT      0x2
+
+int            xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
+                           xfs_ino_t, struct xfs_dinode **,
+                           struct xfs_buf **, int *, uint);
+int            xfs_itobp(struct xfs_mount *, struct xfs_trans *,
+                         struct xfs_inode *, struct xfs_dinode **,
+                         struct xfs_buf **, uint);
+int            xfs_iread(struct xfs_mount *, struct xfs_trans *,
+                         struct xfs_inode *, xfs_daddr_t, uint);
+void           xfs_dinode_from_disk(struct xfs_icdinode *,
+                                    struct xfs_dinode *);
+void           xfs_dinode_to_disk(struct xfs_dinode *,
+                                  struct xfs_icdinode *);
+void           xfs_idestroy_fork(struct xfs_inode *, int);
+void           xfs_idata_realloc(struct xfs_inode *, int, int);
+void           xfs_iroot_realloc(struct xfs_inode *, int, int);
+int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int            xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
+
 xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
 void           xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
                                xfs_bmbt_irec_t *);
@@ -561,7 +627,8 @@ void                xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
 #define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
 
 #ifdef DEBUG
-void           xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t);
+void           xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
+                               xfs_fsize_t);
 #else  /* DEBUG */
 #define xfs_isize_check(mp, ip, isize)
 #endif /* DEBUG */
@@ -576,26 +643,4 @@ extern struct kmem_zone    *xfs_ifork_zone;
 extern struct kmem_zone        *xfs_inode_zone;
 extern struct kmem_zone        *xfs_ili_zone;
 
-/*
- * Manage the i_flush queue embedded in the inode.  This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
-       wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
-       return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
-       complete(&ip->i_flush);
-}
-
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_INODE_H__ */
index 97c7452e2620ee56040c03a6b8bb09fdfb4cc854..977c4aec587eeb3ea316408a73f147082c14c5ba 100644 (file)
@@ -281,7 +281,7 @@ xfs_inode_item_format(
        xfs_mark_inode_dirty_sync(ip);
 
        vecp->i_addr = (xfs_caddr_t)&ip->i_d;
-       vecp->i_len  = sizeof(xfs_dinode_core_t);
+       vecp->i_len  = sizeof(struct xfs_icdinode);
        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
        vecp++;
        nvecs++;
@@ -296,9 +296,8 @@ xfs_inode_item_format(
         * has a new version number, then we don't bother converting back.
         */
        mp = ip->i_mount;
-       ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
-              xfs_sb_version_hasnlink(&mp->m_sb));
-       if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+       ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
+       if (ip->i_d.di_version == 1) {
                if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
                        /*
                         * Convert it back.
@@ -311,7 +310,7 @@ xfs_inode_item_format(
                         * so just make the conversion to the new inode
                         * format permanent.
                         */
-                       ip->i_d.di_version = XFS_DINODE_VERSION_2;
+                       ip->i_d.di_version = 2;
                        ip->i_d.di_onlink = 0;
                        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
                }
@@ -932,6 +931,7 @@ xfs_inode_item_init(
        iip->ili_item.li_type = XFS_LI_INODE;
        iip->ili_item.li_ops = &xfs_inode_item_ops;
        iip->ili_item.li_mountp = mp;
+       iip->ili_item.li_ailp = mp->m_ail;
        iip->ili_inode = ip;
 
        /*
@@ -942,9 +942,9 @@ xfs_inode_item_init(
 
        iip->ili_format.ilf_type = XFS_LI_INODE;
        iip->ili_format.ilf_ino = ip->i_ino;
-       iip->ili_format.ilf_blkno = ip->i_blkno;
-       iip->ili_format.ilf_len = ip->i_len;
-       iip->ili_format.ilf_boffset = ip->i_boffset;
+       iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
+       iip->ili_format.ilf_len = ip->i_imap.im_len;
+       iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
 }
 
 /*
@@ -976,9 +976,8 @@ xfs_iflush_done(
        xfs_buf_t               *bp,
        xfs_inode_log_item_t    *iip)
 {
-       xfs_inode_t     *ip;
-
-       ip = iip->ili_inode;
+       xfs_inode_t             *ip = iip->ili_inode;
+       struct xfs_ail          *ailp = iip->ili_item.li_ailp;
 
        /*
         * We only want to pull the item from the AIL if it is
@@ -991,15 +990,12 @@ xfs_iflush_done(
         */
        if (iip->ili_logged &&
            (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
-               spin_lock(&ip->i_mount->m_ail_lock);
+               spin_lock(&ailp->xa_lock);
                if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
-                       /*
-                        * xfs_trans_delete_ail() drops the AIL lock.
-                        */
-                       xfs_trans_delete_ail(ip->i_mount,
-                                            (xfs_log_item_t*)iip);
+                       /* xfs_trans_ail_delete() drops the AIL lock. */
+                       xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
                } else {
-                       spin_unlock(&ip->i_mount->m_ail_lock);
+                       spin_unlock(&ailp->xa_lock);
                }
        }
 
@@ -1031,21 +1027,20 @@ void
 xfs_iflush_abort(
        xfs_inode_t             *ip)
 {
-       xfs_inode_log_item_t    *iip;
+       xfs_inode_log_item_t    *iip = ip->i_itemp;
        xfs_mount_t             *mp;
 
        iip = ip->i_itemp;
        mp = ip->i_mount;
        if (iip) {
+               struct xfs_ail  *ailp = iip->ili_item.li_ailp;
                if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
-                       spin_lock(&mp->m_ail_lock);
+                       spin_lock(&ailp->xa_lock);
                        if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
-                               /*
-                                * xfs_trans_delete_ail() drops the AIL lock.
-                                */
-                               xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip);
+                               /* xfs_trans_ail_delete() drops the AIL lock. */
+                               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
                        } else
-                               spin_unlock(&mp->m_ail_lock);
+                               spin_unlock(&ailp->xa_lock);
                }
                iip->ili_logged = 0;
                /*
index 40513077ab36f71178d50a0df2a0505e56df0853..1ff04cc323ad98c5b5faddd5e760ac4c212aba2e 100644 (file)
@@ -112,6 +112,24 @@ typedef struct xfs_inode_log_format_64 {
 #define        XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
 
 
+#define        XFS_ILOG_FBROOT(w)      xfs_ilog_fbroot(w)
+static inline int xfs_ilog_fbroot(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+#define        XFS_ILOG_FEXT(w)        xfs_ilog_fext(w)
+static inline int xfs_ilog_fext(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+#define        XFS_ILOG_FDATA(w)       xfs_ilog_fdata(w)
+static inline int xfs_ilog_fdata(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
 #ifdef __KERNEL__
 
 struct xfs_buf;
@@ -148,26 +166,6 @@ typedef struct xfs_inode_log_item {
 } xfs_inode_log_item_t;
 
 
-#define        XFS_ILOG_FDATA(w)       xfs_ilog_fdata(w)
-static inline int xfs_ilog_fdata(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
-}
-
-#endif /* __KERNEL__ */
-
-#define        XFS_ILOG_FBROOT(w)      xfs_ilog_fbroot(w)
-static inline int xfs_ilog_fbroot(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
-}
-
-#define        XFS_ILOG_FEXT(w)        xfs_ilog_fext(w)
-static inline int xfs_ilog_fext(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
-}
-
 static inline int xfs_inode_clean(xfs_inode_t *ip)
 {
        return (!ip->i_itemp ||
@@ -175,9 +173,6 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
               !ip->i_update_core;
 }
 
-
-#ifdef __KERNEL__
-
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
 extern void xfs_inode_item_destroy(struct xfs_inode *);
 extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
index 67f22b2b44b3c41c51ce3ac6c1f911f5db47b181..911062cf73a61ae21176bcf8d6c07ab9d48985d8 100644 (file)
@@ -290,7 +290,6 @@ STATIC int
 xfs_iomap_eof_align_last_fsb(
        xfs_mount_t     *mp,
        xfs_inode_t     *ip,
-       xfs_fsize_t     isize,
        xfs_extlen_t    extsize,
        xfs_fileoff_t   *last_fsb)
 {
@@ -306,14 +305,14 @@ xfs_iomap_eof_align_last_fsb(
         * stripe width and we are allocating past the allocation eof.
         */
        else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
-               (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)))
+               (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
                new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
        /*
         * Roundup the allocation request to a stripe unit (m_dalign) boundary
         * if the file size is >= stripe unit size, and we are allocating past
         * the allocation eof.
         */
-       else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)))
+       else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
                new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
 
        /*
@@ -403,7 +402,6 @@ xfs_iomap_write_direct(
        xfs_filblks_t   count_fsb, resaligned;
        xfs_fsblock_t   firstfsb;
        xfs_extlen_t    extsz, temp;
-       xfs_fsize_t     isize;
        int             nimaps;
        int             bmapi_flag;
        int             quota_flag;
@@ -426,15 +424,10 @@ xfs_iomap_write_direct(
        rt = XFS_IS_REALTIME_INODE(ip);
        extsz = xfs_get_extsz_hint(ip);
 
-       isize = ip->i_size;
-       if (ip->i_new_size > isize)
-               isize = ip->i_new_size;
-
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
-       if ((offset + count) > isize) {
-               error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz,
-                                                       &last_fsb);
+       if ((offset + count) > ip->i_size) {
+               error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
                if (error)
                        goto error_out;
        } else {
@@ -559,7 +552,6 @@ STATIC int
 xfs_iomap_eof_want_preallocate(
        xfs_mount_t     *mp,
        xfs_inode_t     *ip,
-       xfs_fsize_t     isize,
        xfs_off_t       offset,
        size_t          count,
        int             ioflag,
@@ -573,7 +565,7 @@ xfs_iomap_eof_want_preallocate(
        int             n, error, imaps;
 
        *prealloc = 0;
-       if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize)
+       if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
                return 0;
 
        /*
@@ -617,7 +609,6 @@ xfs_iomap_write_delay(
        xfs_fileoff_t   ioalign;
        xfs_fsblock_t   firstblock;
        xfs_extlen_t    extsz;
-       xfs_fsize_t     isize;
        int             nimaps;
        xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
        int             prealloc, fsynced = 0;
@@ -637,11 +628,7 @@ xfs_iomap_write_delay(
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
 retry:
-       isize = ip->i_size;
-       if (ip->i_new_size > isize)
-               isize = ip->i_new_size;
-
-       error = xfs_iomap_eof_want_preallocate(mp, ip, isize, offset, count,
+       error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
                                ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
        if (error)
                return error;
@@ -655,8 +642,7 @@ retry:
        }
 
        if (prealloc || extsz) {
-               error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz,
-                                                       &last_fsb);
+               error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
                if (error)
                        return error;
        }
index cf6754a3c5b3e07a1e660c11ead329de21ef314c..e19d0a8d5618d934408d5c37086e435a11a5bdb5 100644 (file)
@@ -69,7 +69,7 @@ xfs_bulkstat_one_iget(
        }
 
        ASSERT(ip != NULL);
-       ASSERT(ip->i_blkno != (xfs_daddr_t)0);
+       ASSERT(ip->i_imap.im_blkno != 0);
 
        dic = &ip->i_d;
 
@@ -125,13 +125,9 @@ STATIC void
 xfs_bulkstat_one_dinode(
        xfs_mount_t     *mp,            /* mount point for filesystem */
        xfs_ino_t       ino,            /* inode number to get data for */
-       xfs_dinode_t    *dip,           /* dinode inode pointer */
+       xfs_dinode_t    *dic,           /* dinode inode pointer */
        xfs_bstat_t     *buf)           /* return buffer */
 {
-       xfs_dinode_core_t *dic;         /* dinode core info pointer */
-
-       dic = &dip->di_core;
-
        /*
         * The inode format changed when we moved the link count and
         * made it 32 bits long.  If this is an old format inode,
@@ -143,7 +139,7 @@ xfs_bulkstat_one_dinode(
         * the new format. We don't change the version number so that we
         * can distinguish this from a real new format inode.
         */
-       if (dic->di_version == XFS_DINODE_VERSION_1) {
+       if (dic->di_version == 1) {
                buf->bs_nlink = be16_to_cpu(dic->di_onlink);
                buf->bs_projid = 0;
        } else {
@@ -162,7 +158,7 @@ xfs_bulkstat_one_dinode(
        buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
        buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
        buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
-       buf->bs_xflags = xfs_dic2xflags(dip);
+       buf->bs_xflags = xfs_dic2xflags(dic);
        buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
        buf->bs_extents = be32_to_cpu(dic->di_nextents);
        buf->bs_gen = be32_to_cpu(dic->di_gen);
@@ -173,7 +169,7 @@ xfs_bulkstat_one_dinode(
 
        switch (dic->di_format) {
        case XFS_DINODE_FMT_DEV:
-               buf->bs_rdev = be32_to_cpu(dip->di_u.di_dev);
+               buf->bs_rdev = xfs_dinode_get_rdev(dic);
                buf->bs_blksize = BLKDEV_IOSIZE;
                buf->bs_blocks = 0;
                break;
@@ -192,27 +188,34 @@ xfs_bulkstat_one_dinode(
        }
 }
 
+/* Return 0 on success or positive error */
 STATIC int
 xfs_bulkstat_one_fmt(
        void                    __user *ubuffer,
+       int                     ubsize,
+       int                     *ubused,
        const xfs_bstat_t       *buffer)
 {
+       if (ubsize < sizeof(*buffer))
+               return XFS_ERROR(ENOMEM);
        if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
-               return -EFAULT;
-       return sizeof(*buffer);
+               return XFS_ERROR(EFAULT);
+       if (ubused)
+               *ubused = sizeof(*buffer);
+       return 0;
 }
 
 /*
  * Return stat information for one inode.
  * Return 0 if ok, else errno.
  */
-int                            /* error status */
-xfs_bulkstat_one(
+int                                    /* error status */
+xfs_bulkstat_one_int(
        xfs_mount_t     *mp,            /* mount point for filesystem */
        xfs_ino_t       ino,            /* inode number to get data for */
        void            __user *buffer, /* buffer to place output in */
        int             ubsize,         /* size of buffer */
-       void            *private_data,  /* my private data */
+       bulkstat_one_fmt_pf formatter,  /* formatter, copy to user */
        xfs_daddr_t     bno,            /* starting bno of inode cluster */
        int             *ubused,        /* bytes used by me */
        void            *dibuff,        /* on-disk inode buffer */
@@ -221,15 +224,12 @@ xfs_bulkstat_one(
        xfs_bstat_t     *buf;           /* return buffer */
        int             error = 0;      /* error value */
        xfs_dinode_t    *dip;           /* dinode inode pointer */
-       bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt;
 
        dip = (xfs_dinode_t *)dibuff;
        *stat = BULKSTAT_RV_NOTHING;
 
        if (!buffer || xfs_internal_inum(mp, ino))
                return XFS_ERROR(EINVAL);
-       if (ubsize < sizeof(*buf))
-               return XFS_ERROR(ENOMEM);
 
        buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
 
@@ -244,21 +244,34 @@ xfs_bulkstat_one(
                xfs_bulkstat_one_dinode(mp, ino, dip, buf);
        }
 
-       error = formatter(buffer, buf);
-       if (error < 0)  {
-               error = EFAULT;
+       error = formatter(buffer, ubsize, ubused, buf);
+       if (error)
                goto out_free;
-       }
 
        *stat = BULKSTAT_RV_DIDONE;
-       if (ubused)
-               *ubused = error;
 
  out_free:
        kmem_free(buf);
        return error;
 }
 
+int
+xfs_bulkstat_one(
+       xfs_mount_t     *mp,            /* mount point for filesystem */
+       xfs_ino_t       ino,            /* inode number to get data for */
+       void            __user *buffer, /* buffer to place output in */
+       int             ubsize,         /* size of buffer */
+       void            *private_data,  /* my private data */
+       xfs_daddr_t     bno,            /* starting bno of inode cluster */
+       int             *ubused,        /* bytes used by me */
+       void            *dibuff,        /* on-disk inode buffer */
+       int             *stat)          /* BULKSTAT_RV_... */
+{
+       return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+                                   xfs_bulkstat_one_fmt, bno,
+                                   ubused, dibuff, stat);
+}
+
 /*
  * Test to see whether we can use the ondisk inode directly, based
  * on the given bulkstat flags, filling in dipp accordingly.
@@ -287,19 +300,19 @@ xfs_bulkstat_use_dinode(
         * to disk yet. This is a temporary hack that would require a proper
         * fix in the future.
         */
-       if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC ||
-           !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) ||
-           !dip->di_core.di_mode)
+       if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
+           !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
+           !dip->di_mode)
                return 0;
        if (flags & BULKSTAT_FG_QUICK) {
                *dipp = dip;
                return 1;
        }
        /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
-       aformat = dip->di_core.di_aformat;
+       aformat = dip->di_aformat;
        if ((XFS_DFORK_Q(dip) == 0) ||
            (aformat == XFS_DINODE_FMT_LOCAL) ||
-           (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) {
+           (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
                *dipp = dip;
                return 1;
        }
@@ -359,7 +372,6 @@ xfs_bulkstat(
        int                     ubused; /* bytes used by formatter */
        xfs_buf_t               *bp;    /* ptr to on-disk inode cluster buf */
        xfs_dinode_t            *dip;   /* ptr into bp for specific inode */
-       xfs_inode_t             *ip;    /* ptr to in-core inode struct */
 
        /*
         * Get the last inode value, see if there's nothing to do.
@@ -416,8 +428,7 @@ xfs_bulkstat(
                /*
                 * Allocate and initialize a btree cursor for ialloc btree.
                 */
-               cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO,
-                                               (xfs_inode_t *)0, 0);
+               cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
                irbp = irbuf;
                irbufend = irbuf + nirbuf;
                end_of_ag = 0;
@@ -472,7 +483,7 @@ xfs_bulkstat(
                         * In any case, increment to the next record.
                         */
                        if (!error)
-                               error = xfs_inobt_increment(cur, 0, &tmp);
+                               error = xfs_btree_increment(cur, 0, &tmp);
                } else {
                        /*
                         * Start of ag.  Lookup the first inode chunk.
@@ -539,7 +550,7 @@ xfs_bulkstat(
                         * Set agino to after this chunk and bump the cursor.
                         */
                        agino = gino + XFS_INODES_PER_CHUNK;
-                       error = xfs_inobt_increment(cur, 0, &tmp);
+                       error = xfs_btree_increment(cur, 0, &tmp);
                        cond_resched();
                }
                /*
@@ -586,6 +597,8 @@ xfs_bulkstat(
 
                                        if (flags & (BULKSTAT_FG_QUICK |
                                                     BULKSTAT_FG_INLINE)) {
+                                               int offset;
+
                                                ino = XFS_AGINO_TO_INO(mp, agno,
                                                                       agino);
                                                bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -594,21 +607,15 @@ xfs_bulkstat(
                                                /*
                                                 * Get the inode cluster buffer
                                                 */
-                                               ASSERT(xfs_inode_zone != NULL);
-                                               ip = kmem_zone_zalloc(xfs_inode_zone,
-                                                                     KM_SLEEP);
-                                               ip->i_ino = ino;
-                                               ip->i_mount = mp;
-                                               spin_lock_init(&ip->i_flags_lock);
                                                if (bp)
                                                        xfs_buf_relse(bp);
-                                               error = xfs_itobp(mp, NULL, ip,
-                                                               &dip, &bp, bno,
-                                                               XFS_IMAP_BULKSTAT,
-                                                               XFS_BUF_LOCK);
+
+                                               error = xfs_inotobp(mp, NULL, ino, &dip,
+                                                                   &bp, &offset,
+                                                                   XFS_IGET_BULKSTAT);
+
                                                if (!error)
-                                                       clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
-                                               kmem_zone_free(xfs_inode_zone, ip);
+                                                       clustidx = offset / mp->m_sb.sb_inodesize;
                                                if (XFS_TEST_ERROR(error != 0,
                                                                   mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
                                                                   XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
@@ -842,8 +849,7 @@ xfs_inumbers(
                                agino = 0;
                                continue;
                        }
-                       cur = xfs_btree_init_cursor(mp, NULL, agbp, agno,
-                               XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+                       cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
                        error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
                        if (error) {
                                xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -887,7 +893,7 @@ xfs_inumbers(
                        bufidx = 0;
                }
                if (left) {
-                       error = xfs_inobt_increment(cur, 0, &tmp);
+                       error = xfs_btree_increment(cur, 0, &tmp);
                        if (error) {
                                xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
                                cur = NULL;
index a1f18fce9b7071083e9ed88bc48c6908a4e65f28..1fb04e7deb61298d13c4a55db385071b92c3b9a5 100644 (file)
@@ -71,8 +71,22 @@ xfs_bulkstat_single(
 
 typedef int (*bulkstat_one_fmt_pf)(  /* used size in bytes or negative error */
        void                    __user *ubuffer, /* buffer to write to */
+       int                     ubsize,          /* remaining user buffer sz */
+       int                     *ubused,         /* bytes used by formatter */
        const xfs_bstat_t       *buffer);        /* buffer to read from */
 
+int
+xfs_bulkstat_one_int(
+       xfs_mount_t             *mp,
+       xfs_ino_t               ino,
+       void                    __user *buffer,
+       int                     ubsize,
+       bulkstat_one_fmt_pf     formatter,
+       xfs_daddr_t             bno,
+       int                     *ubused,
+       void                    *dibuff,
+       int                     *stat);
+
 int
 xfs_bulkstat_one(
        xfs_mount_t             *mp,
index 3608a0f0a5f62a50f1569b948ce8935cc4b6901f..f4726f702a9ea51e84ba08622c7f322dffab3a7c 100644 (file)
@@ -100,12 +100,11 @@ STATIC void xlog_ungrant_log_space(xlog_t  *log,
 
 
 /* local ticket functions */
-STATIC xlog_ticket_t   *xlog_ticket_get(xlog_t *log,
+STATIC xlog_ticket_t   *xlog_ticket_alloc(xlog_t *log,
                                         int    unit_bytes,
                                         int    count,
                                         char   clientid,
                                         uint   flags);
-STATIC void            xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
 
 #if defined(DEBUG)
 STATIC void    xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
@@ -360,7 +359,7 @@ xfs_log_done(xfs_mount_t    *mp,
                 */
                xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
                xlog_ungrant_log_space(log, ticket);
-               xlog_ticket_put(log, ticket);
+               xfs_log_ticket_put(ticket);
        } else {
                xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
                xlog_regrant_reserve_log_space(log, ticket);
@@ -514,7 +513,7 @@ xfs_log_reserve(xfs_mount_t  *mp,
                retval = xlog_regrant_write_log_space(log, internal_ticket);
        } else {
                /* may sleep if need to allocate more tickets */
-               internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
+               internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
                                                  client, flags);
                if (!internal_ticket)
                        return XFS_ERROR(ENOMEM);
@@ -572,12 +571,12 @@ xfs_log_mount(
        /*
         * Initialize the AIL now we have a log.
         */
-       spin_lock_init(&mp->m_ail_lock);
        error = xfs_trans_ail_init(mp);
        if (error) {
                cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
                goto error;
        }
+       mp->m_log->l_ailp = mp->m_ail;
 
        /*
         * skip log recovery on a norecovery mount.  pretend it all
@@ -730,8 +729,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                spin_lock(&log->l_icloglock);
                iclog = log->l_iclog;
                atomic_inc(&iclog->ic_refcnt);
-               spin_unlock(&log->l_icloglock);
                xlog_state_want_sync(log, iclog);
+               spin_unlock(&log->l_icloglock);
                error = xlog_state_release_iclog(log, iclog);
 
                spin_lock(&log->l_icloglock);
@@ -749,7 +748,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                if (tic) {
                        xlog_trace_loggrant(log, tic, "unmount rec");
                        xlog_ungrant_log_space(log, tic);
-                       xlog_ticket_put(log, tic);
+                       xfs_log_ticket_put(tic);
                }
        } else {
                /*
@@ -768,9 +767,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
                spin_lock(&log->l_icloglock);
                iclog = log->l_iclog;
                atomic_inc(&iclog->ic_refcnt);
-               spin_unlock(&log->l_icloglock);
 
                xlog_state_want_sync(log, iclog);
+               spin_unlock(&log->l_icloglock);
                error =  xlog_state_release_iclog(log, iclog);
 
                spin_lock(&log->l_icloglock);
@@ -906,7 +905,7 @@ xfs_log_move_tail(xfs_mount_t       *mp,
 int
 xfs_log_need_covered(xfs_mount_t *mp)
 {
-       int             needed = 0, gen;
+       int             needed = 0;
        xlog_t          *log = mp->m_log;
 
        if (!xfs_fs_writable(mp))
@@ -915,7 +914,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
        spin_lock(&log->l_icloglock);
        if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
                (log->l_covered_state == XLOG_STATE_COVER_NEED2))
-                       && !xfs_trans_first_ail(mp, &gen)
+                       && !xfs_trans_ail_tail(log->l_ailp)
                        && xlog_iclogs_empty(log)) {
                if (log->l_covered_state == XLOG_STATE_COVER_NEED)
                        log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -952,7 +951,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
        xfs_lsn_t tail_lsn;
        xlog_t    *log = mp->m_log;
 
-       tail_lsn = xfs_trans_tail_ail(mp);
+       tail_lsn = xfs_trans_ail_tail(mp->m_ail);
        spin_lock(&log->l_grant_lock);
        if (tail_lsn != 0) {
                log->l_tail_lsn = tail_lsn;
@@ -1030,12 +1029,6 @@ xlog_iodone(xfs_buf_t *bp)
        ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
        XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
        aborted = 0;
-
-       /*
-        * Some versions of cpp barf on the recursive definition of
-        * ic_log -> hic_fields.ic_log and expand ic_log twice when
-        * it is passed through two macros.  Workaround broken cpp.
-        */
        l = iclog->ic_log;
 
        /*
@@ -1302,7 +1295,7 @@ xlog_alloc_log(xfs_mount_t        *mp,
                XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
                XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
                iclog->ic_bp = bp;
-               iclog->hic_data = bp->b_addr;
+               iclog->ic_data = bp->b_addr;
 #ifdef DEBUG
                log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
 #endif
@@ -1322,7 +1315,7 @@ xlog_alloc_log(xfs_mount_t        *mp,
                atomic_set(&iclog->ic_refcnt, 0);
                spin_lock_init(&iclog->ic_callback_lock);
                iclog->ic_callback_tail = &(iclog->ic_callback);
-               iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
+               iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
 
                ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
                ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
@@ -1446,7 +1439,7 @@ xlog_grant_push_ail(xfs_mount_t   *mp,
      */
     if (threshold_lsn &&
        !XLOG_FORCED_SHUTDOWN(log))
-           xfs_trans_push_ail(mp, threshold_lsn);
+           xfs_trans_ail_push(log->l_ailp, threshold_lsn);
 }      /* xlog_grant_push_ail */
 
 
@@ -1991,7 +1984,9 @@ xlog_write(xfs_mount_t *  mp,
                if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
                    xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
                    record_cnt = data_cnt = 0;
+                   spin_lock(&log->l_icloglock);
                    xlog_state_want_sync(log, iclog);
+                   spin_unlock(&log->l_icloglock);
                    if (commit_iclog) {
                        ASSERT(flags & XLOG_COMMIT_TRANS);
                        *commit_iclog = iclog;
@@ -3200,7 +3195,7 @@ try_again:
 STATIC void
 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
 {
-       spin_lock(&log->l_icloglock);
+       ASSERT(spin_is_locked(&log->l_icloglock));
 
        if (iclog->ic_state == XLOG_STATE_ACTIVE) {
                xlog_state_switch_iclogs(log, iclog, 0);
@@ -3208,10 +3203,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
                ASSERT(iclog->ic_state &
                        (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
        }
-
-       spin_unlock(&log->l_icloglock);
-}      /* xlog_state_want_sync */
-
+}
 
 
 /*****************************************************************************
@@ -3222,22 +3214,33 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
  */
 
 /*
- * Free a used ticket.
+ * Free a used ticket when it's refcount falls to zero.
  */
-STATIC void
-xlog_ticket_put(xlog_t         *log,
-               xlog_ticket_t   *ticket)
+void
+xfs_log_ticket_put(
+       xlog_ticket_t   *ticket)
 {
-       sv_destroy(&ticket->t_wait);
-       kmem_zone_free(xfs_log_ticket_zone, ticket);
-}      /* xlog_ticket_put */
+       ASSERT(atomic_read(&ticket->t_ref) > 0);
+       if (atomic_dec_and_test(&ticket->t_ref)) {
+               sv_destroy(&ticket->t_wait);
+               kmem_zone_free(xfs_log_ticket_zone, ticket);
+       }
+}
 
+xlog_ticket_t *
+xfs_log_ticket_get(
+       xlog_ticket_t   *ticket)
+{
+       ASSERT(atomic_read(&ticket->t_ref) > 0);
+       atomic_inc(&ticket->t_ref);
+       return ticket;
+}
 
 /*
  * Allocate and initialise a new log ticket.
  */
 STATIC xlog_ticket_t *
-xlog_ticket_get(xlog_t         *log,
+xlog_ticket_alloc(xlog_t               *log,
                int             unit_bytes,
                int             cnt,
                char            client,
@@ -3308,6 +3311,7 @@ xlog_ticket_get(xlog_t            *log,
                unit_bytes += 2*BBSIZE;
         }
 
+       atomic_set(&tic->t_ref, 1);
        tic->t_unit_res         = unit_bytes;
        tic->t_curr_res         = unit_bytes;
        tic->t_cnt              = cnt;
@@ -3323,7 +3327,7 @@ xlog_ticket_get(xlog_t            *log,
        xlog_tic_reset_res(tic);
 
        return tic;
-}      /* xlog_ticket_get */
+}
 
 
 /******************************************************************************
@@ -3452,7 +3456,7 @@ xlog_verify_iclog(xlog_t   *log,
        ptr = iclog->ic_datap;
        base_ptr = ptr;
        ophead = (xlog_op_header_t *)ptr;
-       xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
+       xhdr = iclog->ic_data;
        for (i = 0; i < len; i++) {
                ophead = (xlog_op_header_t *)ptr;
 
@@ -3558,7 +3562,8 @@ xfs_log_force_umount(
        if (!log ||
            log->l_flags & XLOG_ACTIVE_RECOVERY) {
                mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
-               XFS_BUF_DONE(mp->m_sb_bp);
+               if (mp->m_sb_bp)
+                       XFS_BUF_DONE(mp->m_sb_bp);
                return 0;
        }
 
@@ -3579,7 +3584,9 @@ xfs_log_force_umount(
        spin_lock(&log->l_icloglock);
        spin_lock(&log->l_grant_lock);
        mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
-       XFS_BUF_DONE(mp->m_sb_bp);
+       if (mp->m_sb_bp)
+               XFS_BUF_DONE(mp->m_sb_bp);
+
        /*
         * This flag is sort of redundant because of the mount flag, but
         * it's good to maintain the separation between the log and the rest
index d47b91f10822b98400a6d11961a8a12d4e930d76..8a3e84e900a34e5153a453eae896bb795576a57b 100644 (file)
@@ -134,6 +134,7 @@ typedef struct xfs_log_callback {
 #ifdef __KERNEL__
 /* Log manager interfaces */
 struct xfs_mount;
+struct xlog_ticket;
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
                       xfs_log_ticket_t ticket,
                       void             **iclog,
@@ -177,6 +178,9 @@ int   xfs_log_need_covered(struct xfs_mount *mp);
 
 void     xlog_iodone(struct xfs_buf *);
 
+struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
+void     xfs_log_ticket_put(struct xlog_ticket *ticket);
+
 #endif
 
 
index e7d8f84443fab87ceee83561fb6b1f04685400ff..654167be0efb6c29aa4288f6ee44bd4627753cef 100644 (file)
@@ -245,6 +245,7 @@ typedef struct xlog_ticket {
        struct xlog_ticket *t_next;      /*                              :4|8 */
        struct xlog_ticket *t_prev;      /*                              :4|8 */
        xlog_tid_t         t_tid;        /* transaction identifier       : 4  */
+       atomic_t           t_ref;        /* ticket reference count       : 4  */
        int                t_curr_res;   /* current reservation in bytes : 4  */
        int                t_unit_res;   /* unit reservation in bytes    : 4  */
        char               t_ocnt;       /* original count               : 1  */
@@ -309,6 +310,16 @@ typedef struct xlog_rec_ext_header {
 } xlog_rec_ext_header_t;
 
 #ifdef __KERNEL__
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+       xlog_rec_header_t       hic_header;
+       xlog_rec_ext_header_t   hic_xheader;
+       char                    hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
 /*
  * - A log record header is 512 bytes.  There is plenty of room to grow the
  *     xlog_rec_header_t into the reserved space.
@@ -338,7 +349,7 @@ typedef struct xlog_rec_ext_header {
  * We'll put all the read-only and l_icloglock fields in the first cacheline,
  * and move everything else out to subsequent cachelines.
  */
-typedef struct xlog_iclog_fields {
+typedef struct xlog_in_core {
        sv_t                    ic_force_wait;
        sv_t                    ic_write_wait;
        struct xlog_in_core     *ic_next;
@@ -361,40 +372,10 @@ typedef struct xlog_iclog_fields {
 
        /* reference counts need their own cacheline */
        atomic_t                ic_refcnt ____cacheline_aligned_in_smp;
-} xlog_iclog_fields_t;
-
-typedef union xlog_in_core2 {
-       xlog_rec_header_t       hic_header;
-       xlog_rec_ext_header_t   hic_xheader;
-       char                    hic_sector[XLOG_HEADER_SIZE];
-} xlog_in_core_2_t;
-
-typedef struct xlog_in_core {
-       xlog_iclog_fields_t     hic_fields;
-       xlog_in_core_2_t        *hic_data;
+       xlog_in_core_2_t        *ic_data;
+#define ic_header      ic_data->hic_header
 } xlog_in_core_t;
 
-/*
- * Defines to save our code from this glop.
- */
-#define        ic_force_wait   hic_fields.ic_force_wait
-#define ic_write_wait  hic_fields.ic_write_wait
-#define        ic_next         hic_fields.ic_next
-#define        ic_prev         hic_fields.ic_prev
-#define        ic_bp           hic_fields.ic_bp
-#define        ic_log          hic_fields.ic_log
-#define        ic_callback     hic_fields.ic_callback
-#define        ic_callback_lock hic_fields.ic_callback_lock
-#define        ic_callback_tail hic_fields.ic_callback_tail
-#define        ic_trace        hic_fields.ic_trace
-#define        ic_size         hic_fields.ic_size
-#define        ic_offset       hic_fields.ic_offset
-#define        ic_refcnt       hic_fields.ic_refcnt
-#define        ic_bwritecnt    hic_fields.ic_bwritecnt
-#define        ic_state        hic_fields.ic_state
-#define ic_datap       hic_fields.ic_datap
-#define ic_header      hic_data->hic_header
-
 /*
  * The reservation head lsn is not made up of a cycle number and block number.
  * Instead, it uses a cycle number and byte number.  Logs don't expect to
@@ -404,6 +385,7 @@ typedef struct xlog_in_core {
 typedef struct log {
        /* The following fields don't need locking */
        struct xfs_mount        *l_mp;          /* mount point */
+       struct xfs_ail          *l_ailp;        /* AIL log is working with */
        struct xfs_buf          *l_xbuf;        /* extra buffer for log
                                                 * wrapping */
        struct xfs_buftarg      *l_targ;        /* buftarg of log */
index 70e3ba32e6be5988552f954464b072a2b31cab89..35cca98bd94c90c926dd3db7ae49974661577a4c 100644 (file)
@@ -36,7 +36,6 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_imap.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_log_priv.h"
@@ -54,10 +53,8 @@ STATIC void  xlog_recover_insert_item_backq(xlog_recover_item_t **q,
                                               xlog_recover_item_t *item);
 #if defined(DEBUG)
 STATIC void    xlog_recover_check_summary(xlog_t *);
-STATIC void    xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
 #else
 #define        xlog_recover_check_summary(log)
-#define        xlog_recover_check_ail(mp, lip, gen)
 #endif
 
 
@@ -270,21 +267,16 @@ STATIC void
 xlog_recover_iodone(
        struct xfs_buf  *bp)
 {
-       xfs_mount_t     *mp;
-
-       ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
-
        if (XFS_BUF_GETERROR(bp)) {
                /*
                 * We're not going to bother about retrying
                 * this during recovery. One strike!
                 */
-               mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
                xfs_ioerror_alert("xlog_recover_iodone",
-                                 mp, bp, XFS_BUF_ADDR(bp));
-               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+                                 bp->b_mount, bp, XFS_BUF_ADDR(bp));
+               xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
        }
-       XFS_BUF_SET_FSPRIVATE(bp, NULL);
+       bp->b_mount = NULL;
        XFS_BUF_CLR_IODONE_FUNC(bp);
        xfs_biodone(bp);
 }
@@ -2228,9 +2220,8 @@ xlog_recover_do_buffer_trans(
                XFS_BUF_STALE(bp);
                error = xfs_bwrite(mp, bp);
        } else {
-               ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
-                      XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
-               XFS_BUF_SET_FSPRIVATE(bp, mp);
+               ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+               bp->b_mount = mp;
                XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
                xfs_bdwrite(mp, bp);
        }
@@ -2247,7 +2238,6 @@ xlog_recover_do_inode_trans(
        xfs_inode_log_format_t  *in_f;
        xfs_mount_t             *mp;
        xfs_buf_t               *bp;
-       xfs_imap_t              imap;
        xfs_dinode_t            *dip;
        xfs_ino_t               ino;
        int                     len;
@@ -2275,54 +2265,35 @@ xlog_recover_do_inode_trans(
        }
        ino = in_f->ilf_ino;
        mp = log->l_mp;
-       if (ITEM_TYPE(item) == XFS_LI_INODE) {
-               imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
-               imap.im_len = in_f->ilf_len;
-               imap.im_boffset = in_f->ilf_boffset;
-       } else {
-               /*
-                * It's an old inode format record.  We don't know where
-                * its cluster is located on disk, and we can't allow
-                * xfs_imap() to figure it out because the inode btrees
-                * are not ready to be used.  Therefore do not pass the
-                * XFS_IMAP_LOOKUP flag to xfs_imap().  This will give
-                * us only the single block in which the inode lives
-                * rather than its cluster, so we must make sure to
-                * invalidate the buffer when we write it out below.
-                */
-               imap.im_blkno = 0;
-               error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
-               if (error)
-                       goto error;
-       }
 
        /*
         * Inode buffers can be freed, look out for it,
         * and do not replay the inode.
         */
-       if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) {
+       if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
+                                       in_f->ilf_len, 0)) {
                error = 0;
                goto error;
        }
 
-       bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
-                                                               XFS_BUF_LOCK);
+       bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
+                               in_f->ilf_len, XFS_BUF_LOCK);
        if (XFS_BUF_ISERROR(bp)) {
                xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
-                                 bp, imap.im_blkno);
+                                 bp, in_f->ilf_blkno);
                error = XFS_BUF_GETERROR(bp);
                xfs_buf_relse(bp);
                goto error;
        }
        error = 0;
        ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
-       dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+       dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
 
        /*
         * Make sure the place we're flushing out to really looks
         * like an inode!
         */
-       if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) {
+       if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
                xfs_buf_relse(bp);
                xfs_fs_cmn_err(CE_ALERT, mp,
                        "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
@@ -2345,12 +2316,12 @@ xlog_recover_do_inode_trans(
        }
 
        /* Skip replay when the on disk inode is newer than the log one */
-       if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) {
+       if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
                /*
                 * Deal with the wrap case, DI_MAX_FLUSH is less
                 * than smaller numbers
                 */
-               if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH &&
+               if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
                    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
                        /* do nothing */
                } else {
@@ -2410,7 +2381,7 @@ xlog_recover_do_inode_trans(
                error = EFSCORRUPTED;
                goto error;
        }
-       if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
+       if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
                XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
                                     XFS_ERRLEVEL_LOW, mp, dicp);
                xfs_buf_relse(bp);
@@ -2422,23 +2393,24 @@ xlog_recover_do_inode_trans(
        }
 
        /* The core is in in-core format */
-       xfs_dinode_to_disk(&dip->di_core,
-               (xfs_icdinode_t *)item->ri_buf[1].i_addr);
+       xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
 
        /* the rest is in on-disk format */
-       if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
-               memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t),
-                       item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
-                       item->ri_buf[1].i_len  - sizeof(xfs_dinode_core_t));
+       if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
+               memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
+                       item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
+                       item->ri_buf[1].i_len  - sizeof(struct xfs_icdinode));
        }
 
        fields = in_f->ilf_fields;
        switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
        case XFS_ILOG_DEV:
-               dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev);
+               xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
                break;
        case XFS_ILOG_UUID:
-               dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
+               memcpy(XFS_DFORK_DPTR(dip),
+                      &in_f->ilf_u.ilfu_uuid,
+                      sizeof(uuid_t));
                break;
        }
 
@@ -2454,12 +2426,12 @@ xlog_recover_do_inode_trans(
        switch (fields & XFS_ILOG_DFORK) {
        case XFS_ILOG_DDATA:
        case XFS_ILOG_DEXT:
-               memcpy(&dip->di_u, src, len);
+               memcpy(XFS_DFORK_DPTR(dip), src, len);
                break;
 
        case XFS_ILOG_DBROOT:
-               xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
-                                &(dip->di_u.di_bmbt),
+               xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
+                                (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
                                 XFS_DFORK_DSIZE(dip, mp));
                break;
 
@@ -2496,8 +2468,8 @@ xlog_recover_do_inode_trans(
 
                case XFS_ILOG_ABROOT:
                        dest = XFS_DFORK_APTR(dip);
-                       xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
-                                        (xfs_bmdr_block_t*)dest,
+                       xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
+                                        len, (xfs_bmdr_block_t*)dest,
                                         XFS_DFORK_ASIZE(dip, mp));
                        break;
 
@@ -2512,9 +2484,8 @@ xlog_recover_do_inode_trans(
 
 write_inode_buffer:
        if (ITEM_TYPE(item) == XFS_LI_INODE) {
-               ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
-                      XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
-               XFS_BUF_SET_FSPRIVATE(bp, mp);
+               ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+               bp->b_mount = mp;
                XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
                xfs_bdwrite(mp, bp);
        } else {
@@ -2645,9 +2616,8 @@ xlog_recover_do_dquot_trans(
        memcpy(ddq, recddq, item->ri_buf[1].i_len);
 
        ASSERT(dq_f->qlf_size == 2);
-       ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
-              XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
-       XFS_BUF_SET_FSPRIVATE(bp, mp);
+       ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+       bp->b_mount = mp;
        XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
        xfs_bdwrite(mp, bp);
 
@@ -2689,11 +2659,11 @@ xlog_recover_do_efi_trans(
        efip->efi_next_extent = efi_formatp->efi_nextents;
        efip->efi_flags |= XFS_EFI_COMMITTED;
 
-       spin_lock(&mp->m_ail_lock);
+       spin_lock(&log->l_ailp->xa_lock);
        /*
-        * xfs_trans_update_ail() drops the AIL lock.
+        * xfs_trans_ail_update() drops the AIL lock.
         */
-       xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn);
+       xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
        return 0;
 }
 
@@ -2712,12 +2682,12 @@ xlog_recover_do_efd_trans(
        xlog_recover_item_t     *item,
        int                     pass)
 {
-       xfs_mount_t             *mp;
        xfs_efd_log_format_t    *efd_formatp;
        xfs_efi_log_item_t      *efip = NULL;
        xfs_log_item_t          *lip;
-       int                     gen;
        __uint64_t              efi_id;
+       struct xfs_ail_cursor   cur;
+       struct xfs_ail          *ailp = log->l_ailp;
 
        if (pass == XLOG_RECOVER_PASS1) {
                return;
@@ -2734,25 +2704,26 @@ xlog_recover_do_efd_trans(
         * Search for the efi with the id in the efd format structure
         * in the AIL.
         */
-       mp = log->l_mp;
-       spin_lock(&mp->m_ail_lock);
-       lip = xfs_trans_first_ail(mp, &gen);
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
        while (lip != NULL) {
                if (lip->li_type == XFS_LI_EFI) {
                        efip = (xfs_efi_log_item_t *)lip;
                        if (efip->efi_format.efi_id == efi_id) {
                                /*
-                                * xfs_trans_delete_ail() drops the
+                                * xfs_trans_ail_delete() drops the
                                 * AIL lock.
                                 */
-                               xfs_trans_delete_ail(mp, lip);
+                               xfs_trans_ail_delete(ailp, lip);
                                xfs_efi_item_free(efip);
-                               return;
+                               spin_lock(&ailp->xa_lock);
+                               break;
                        }
                }
-               lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+               lip = xfs_trans_ail_cursor_next(ailp, &cur);
        }
-       spin_unlock(&mp->m_ail_lock);
+       xfs_trans_ail_cursor_done(ailp, &cur);
+       spin_unlock(&ailp->xa_lock);
 }
 
 /*
@@ -3035,33 +3006,6 @@ abort_error:
        return error;
 }
 
-/*
- * Verify that once we've encountered something other than an EFI
- * in the AIL that there are no more EFIs in the AIL.
- */
-#if defined(DEBUG)
-STATIC void
-xlog_recover_check_ail(
-       xfs_mount_t             *mp,
-       xfs_log_item_t          *lip,
-       int                     gen)
-{
-       int                     orig_gen = gen;
-
-       do {
-               ASSERT(lip->li_type != XFS_LI_EFI);
-               lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
-               /*
-                * The check will be bogus if we restart from the
-                * beginning of the AIL, so ASSERT that we don't.
-                * We never should since we're holding the AIL lock
-                * the entire time.
-                */
-               ASSERT(gen == orig_gen);
-       } while (lip != NULL);
-}
-#endif /* DEBUG */
-
 /*
  * When this is called, all of the EFIs which did not have
  * corresponding EFDs should be in the AIL.  What we do now
@@ -3086,20 +3030,23 @@ xlog_recover_process_efis(
 {
        xfs_log_item_t          *lip;
        xfs_efi_log_item_t      *efip;
-       int                     gen;
-       xfs_mount_t             *mp;
        int                     error = 0;
+       struct xfs_ail_cursor   cur;
+       struct xfs_ail          *ailp;
 
-       mp = log->l_mp;
-       spin_lock(&mp->m_ail_lock);
-
-       lip = xfs_trans_first_ail(mp, &gen);
+       ailp = log->l_ailp;
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
        while (lip != NULL) {
                /*
                 * We're done when we see something other than an EFI.
+                * There should be no EFIs left in the AIL now.
                 */
                if (lip->li_type != XFS_LI_EFI) {
-                       xlog_recover_check_ail(mp, lip, gen);
+#ifdef DEBUG
+                       for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+                               ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
                        break;
                }
 
@@ -3108,18 +3055,20 @@ xlog_recover_process_efis(
                 */
                efip = (xfs_efi_log_item_t *)lip;
                if (efip->efi_flags & XFS_EFI_RECOVERED) {
-                       lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+                       lip = xfs_trans_ail_cursor_next(ailp, &cur);
                        continue;
                }
 
-               spin_unlock(&mp->m_ail_lock);
-               error = xlog_recover_process_efi(mp, efip);
+               spin_unlock(&ailp->xa_lock);
+               error = xlog_recover_process_efi(log->l_mp, efip);
+               spin_lock(&ailp->xa_lock);
                if (error)
-                       return error;
-               spin_lock(&mp->m_ail_lock);
-               lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+                       goto out;
+               lip = xfs_trans_ail_cursor_next(ailp, &cur);
        }
-       spin_unlock(&mp->m_ail_lock);
+out:
+       xfs_trans_ail_cursor_done(ailp, &cur);
+       spin_unlock(&ailp->xa_lock);
        return error;
 }
 
@@ -3140,19 +3089,16 @@ xlog_recover_clear_agi_bucket(
        int             error;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
-       error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
-       if (!error)
-               error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-                                  XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-                                  XFS_FSS_TO_BB(mp, 1), 0, &agibp);
+       error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
+                                 0, 0, 0);
        if (error)
                goto out_abort;
 
-       error = EINVAL;
-       agi = XFS_BUF_TO_AGI(agibp);
-       if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
+       error = xfs_read_agi(mp, tp, agno, &agibp);
+       if (error)
                goto out_abort;
 
+       agi = XFS_BUF_TO_AGI(agibp);
        agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
        offset = offsetof(xfs_agi_t, agi_unlinked) +
                 (sizeof(xfs_agino_t) * bucket);
@@ -3172,6 +3118,62 @@ out_error:
        return;
 }
 
+STATIC xfs_agino_t
+xlog_recover_process_one_iunlink(
+       struct xfs_mount                *mp,
+       xfs_agnumber_t                  agno,
+       xfs_agino_t                     agino,
+       int                             bucket)
+{
+       struct xfs_buf                  *ibp;
+       struct xfs_dinode               *dip;
+       struct xfs_inode                *ip;
+       xfs_ino_t                       ino;
+       int                             error;
+
+       ino = XFS_AGINO_TO_INO(mp, agno, agino);
+       error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
+       if (error)
+               goto fail;
+
+       /*
+        * Get the on disk inode to find the next inode in the bucket.
+        */
+       error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
+       if (error)
+               goto fail_iput;
+
+       ASSERT(ip->i_d.di_nlink == 0);
+       ASSERT(ip->i_d.di_mode != 0);
+
+       /* setup for the next pass */
+       agino = be32_to_cpu(dip->di_next_unlinked);
+       xfs_buf_relse(ibp);
+
+       /*
+        * Prevent any DMAPI event from being sent when the reference on
+        * the inode is dropped.
+        */
+       ip->i_d.di_dmevmask = 0;
+
+       IRELE(ip);
+       return agino;
+
+ fail_iput:
+       IRELE(ip);
+ fail:
+       /*
+        * We can't read in the inode this bucket points to, or this inode
+        * is messed up.  Just ditch this bucket of inodes.  We will lose
+        * some inodes and space, but at least we won't hang.
+        *
+        * Call xlog_recover_clear_agi_bucket() to perform a transaction to
+        * clear the inode pointer in the bucket.
+        */
+       xlog_recover_clear_agi_bucket(mp, agno, bucket);
+       return NULLAGINO;
+}
+
 /*
  * xlog_iunlink_recover
  *
@@ -3192,11 +3194,7 @@ xlog_recover_process_iunlinks(
        xfs_agnumber_t  agno;
        xfs_agi_t       *agi;
        xfs_buf_t       *agibp;
-       xfs_buf_t       *ibp;
-       xfs_dinode_t    *dip;
-       xfs_inode_t     *ip;
        xfs_agino_t     agino;
-       xfs_ino_t       ino;
        int             bucket;
        int             error;
        uint            mp_dmevmask;
@@ -3213,22 +3211,21 @@ xlog_recover_process_iunlinks(
                /*
                 * Find the agi for this ag.
                 */
-               agibp = xfs_buf_read(mp->m_ddev_targp,
-                               XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-                               XFS_FSS_TO_BB(mp, 1), 0);
-               if (XFS_BUF_ISERROR(agibp)) {
-                       xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)",
-                               log->l_mp, agibp,
-                               XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)));
+               error = xfs_read_agi(mp, NULL, agno, &agibp);
+               if (error) {
+                       /*
+                        * AGI is b0rked. Don't process it.
+                        *
+                        * We should probably mark the filesystem as corrupt
+                        * after we've recovered all the ag's we can....
+                        */
+                       continue;
                }
                agi = XFS_BUF_TO_AGI(agibp);
-               ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
 
                for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
-
                        agino = be32_to_cpu(agi->agi_unlinked[bucket]);
                        while (agino != NULLAGINO) {
-
                                /*
                                 * Release the agi buffer so that it can
                                 * be acquired in the normal course of the
@@ -3236,87 +3233,17 @@ xlog_recover_process_iunlinks(
                                 */
                                xfs_buf_relse(agibp);
 
-                               ino = XFS_AGINO_TO_INO(mp, agno, agino);
-                               error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
-                               ASSERT(error || (ip != NULL));
-
-                               if (!error) {
-                                       /*
-                                        * Get the on disk inode to find the
-                                        * next inode in the bucket.
-                                        */
-                                       error = xfs_itobp(mp, NULL, ip, &dip,
-                                                       &ibp, 0, 0,
-                                                       XFS_BUF_LOCK);
-                                       ASSERT(error || (dip != NULL));
-                               }
-
-                               if (!error) {
-                                       ASSERT(ip->i_d.di_nlink == 0);
-
-                                       /* setup for the next pass */
-                                       agino = be32_to_cpu(
-                                                       dip->di_next_unlinked);
-                                       xfs_buf_relse(ibp);
-                                       /*
-                                        * Prevent any DMAPI event from
-                                        * being sent when the
-                                        * reference on the inode is
-                                        * dropped.
-                                        */
-                                       ip->i_d.di_dmevmask = 0;
-
-                                       /*
-                                        * If this is a new inode, handle
-                                        * it specially.  Otherwise,
-                                        * just drop our reference to the
-                                        * inode.  If there are no
-                                        * other references, this will
-                                        * send the inode to
-                                        * xfs_inactive() which will
-                                        * truncate the file and free
-                                        * the inode.
-                                        */
-                                       if (ip->i_d.di_mode == 0)
-                                               xfs_iput_new(ip, 0);
-                                       else
-                                               IRELE(ip);
-                               } else {
-                                       /*
-                                        * We can't read in the inode
-                                        * this bucket points to, or
-                                        * this inode is messed up.  Just
-                                        * ditch this bucket of inodes.  We
-                                        * will lose some inodes and space,
-                                        * but at least we won't hang.  Call
-                                        * xlog_recover_clear_agi_bucket()
-                                        * to perform a transaction to clear
-                                        * the inode pointer in the bucket.
-                                        */
-                                       xlog_recover_clear_agi_bucket(mp, agno,
-                                                       bucket);
-
-                                       agino = NULLAGINO;
-                               }
+                               agino = xlog_recover_process_one_iunlink(mp,
+                                                       agno, agino, bucket);
 
                                /*
                                 * Reacquire the agibuffer and continue around
-                                * the loop.
+                                * the loop. This should never fail as we know
+                                * the buffer was good earlier on.
                                 */
-                               agibp = xfs_buf_read(mp->m_ddev_targp,
-                                               XFS_AG_DADDR(mp, agno,
-                                                       XFS_AGI_DADDR(mp)),
-                                               XFS_FSS_TO_BB(mp, 1), 0);
-                               if (XFS_BUF_ISERROR(agibp)) {
-                                       xfs_ioerror_alert(
-                               "xlog_recover_process_iunlinks(#2)",
-                                               log->l_mp, agibp,
-                                               XFS_AG_DADDR(mp, agno,
-                                                       XFS_AGI_DADDR(mp)));
-                               }
+                               error = xfs_read_agi(mp, NULL, agno, &agibp);
+                               ASSERT(error == 0);
                                agi = XFS_BUF_TO_AGI(agibp);
-                               ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
-                                       agi->agi_magicnum));
                        }
                }
 
@@ -3367,7 +3294,6 @@ xlog_pack_data(
        int                     size = iclog->ic_offset + roundoff;
        __be32                  cycle_lsn;
        xfs_caddr_t             dp;
-       xlog_in_core_2_t        *xhdr;
 
        xlog_pack_data_checksum(log, iclog, size);
 
@@ -3382,7 +3308,8 @@ xlog_pack_data(
        }
 
        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-               xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
+               xlog_in_core_2_t *xhdr = iclog->ic_data;
+
                for ( ; i < BTOBB(size); i++) {
                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                        k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3440,7 +3367,6 @@ xlog_unpack_data(
        xlog_t                  *log)
 {
        int                     i, j, k;
-       xlog_in_core_2_t        *xhdr;
 
        for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
                  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3449,7 +3375,7 @@ xlog_unpack_data(
        }
 
        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-               xhdr = (xlog_in_core_2_t *)rhead;
+               xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
                for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                        k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -4003,11 +3929,8 @@ xlog_recover_check_summary(
 {
        xfs_mount_t     *mp;
        xfs_agf_t       *agfp;
-       xfs_agi_t       *agip;
        xfs_buf_t       *agfbp;
        xfs_buf_t       *agibp;
-       xfs_daddr_t     agfdaddr;
-       xfs_daddr_t     agidaddr;
        xfs_buf_t       *sbbp;
 #ifdef XFS_LOUD_RECOVERY
        xfs_sb_t        *sbp;
@@ -4016,6 +3939,7 @@ xlog_recover_check_summary(
        __uint64_t      freeblks;
        __uint64_t      itotal;
        __uint64_t      ifree;
+       int             error;
 
        mp = log->l_mp;
 
@@ -4023,37 +3947,27 @@ xlog_recover_check_summary(
        itotal = 0LL;
        ifree = 0LL;
        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
-               agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp));
-               agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr,
-                               XFS_FSS_TO_BB(mp, 1), 0);
-               if (XFS_BUF_ISERROR(agfbp)) {
-                       xfs_ioerror_alert("xlog_recover_check_summary(agf)",
-                                               mp, agfbp, agfdaddr);
-               }
-               agfp = XFS_BUF_TO_AGF(agfbp);
-               ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum));
-               ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum)));
-               ASSERT(be32_to_cpu(agfp->agf_seqno) == agno);
-
-               freeblks += be32_to_cpu(agfp->agf_freeblks) +
-                           be32_to_cpu(agfp->agf_flcount);
-               xfs_buf_relse(agfbp);
-
-               agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
-               agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
-                               XFS_FSS_TO_BB(mp, 1), 0);
-               if (XFS_BUF_ISERROR(agibp)) {
-                       xfs_ioerror_alert("xlog_recover_check_summary(agi)",
-                                         mp, agibp, agidaddr);
+               error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
+               if (error) {
+                       xfs_fs_cmn_err(CE_ALERT, mp,
+                                       "xlog_recover_check_summary(agf)"
+                                       "agf read failed agno %d error %d",
+                                                       agno, error);
+               } else {
+                       agfp = XFS_BUF_TO_AGF(agfbp);
+                       freeblks += be32_to_cpu(agfp->agf_freeblks) +
+                                   be32_to_cpu(agfp->agf_flcount);
+                       xfs_buf_relse(agfbp);
                }
-               agip = XFS_BUF_TO_AGI(agibp);
-               ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
-               ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
-               ASSERT(be32_to_cpu(agip->agi_seqno) == agno);
 
-               itotal += be32_to_cpu(agip->agi_count);
-               ifree += be32_to_cpu(agip->agi_freecount);
-               xfs_buf_relse(agibp);
+               error = xfs_read_agi(mp, NULL, agno, &agibp);
+               if (!error) {
+                       struct xfs_agi  *agi = XFS_BUF_TO_AGI(agibp);
+
+                       itotal += be32_to_cpu(agi->agi_count);
+                       ifree += be32_to_cpu(agi->agi_freecount);
+                       xfs_buf_relse(agibp);
+               }
        }
 
        sbbp = xfs_getsb(mp, 0);
index 15f5dd22fbb242ae0c52c457c65943da177813f2..3c97c6463a4e418af83145d350d7aac3e49cc617 100644 (file)
@@ -567,8 +567,6 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 STATIC void
 xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
 {
-       int     i;
-
        mp->m_agfrotor = mp->m_agirotor = 0;
        spin_lock_init(&mp->m_agirotor_lock);
        mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -577,12 +575,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
        mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
        mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
        mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-       mp->m_litino = sbp->sb_inodesize -
-               ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
+       mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
        mp->m_blockmask = sbp->sb_blocksize - 1;
        mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
        mp->m_blockwmask = mp->m_blockwsize - 1;
-       INIT_LIST_HEAD(&mp->m_del_inodes);
 
        /*
         * Setup for attributes, in case they get created.
@@ -605,24 +601,20 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
        }
        ASSERT(mp->m_attroffset < XFS_LITINO(mp));
 
-       for (i = 0; i < 2; i++) {
-               mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
-                       xfs_alloc, i == 0);
-               mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
-                       xfs_alloc, i == 0);
-       }
-       for (i = 0; i < 2; i++) {
-               mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
-                       xfs_bmbt, i == 0);
-               mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
-                       xfs_bmbt, i == 0);
-       }
-       for (i = 0; i < 2; i++) {
-               mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
-                       xfs_inobt, i == 0);
-               mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
-                       xfs_inobt, i == 0);
-       }
+       mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+       mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+       mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+       mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+       mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+       mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
 
        mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
        mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
@@ -1228,6 +1220,16 @@ xfs_unmountfs(
        __uint64_t              resblks;
        int                     error;
 
+       /*
+        * Release dquot that rootinode, rbmino and rsumino might be holding,
+        * and release the quota inodes.
+        */
+       XFS_QM_UNMOUNT(mp);
+
+       if (mp->m_rbmip)
+               IRELE(mp->m_rbmip);
+       if (mp->m_rsumip)
+               IRELE(mp->m_rsumip);
        IRELE(mp->m_rootip);
 
        /*
@@ -1241,7 +1243,7 @@ xfs_unmountfs(
         * need to force the log first.
         */
        xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-       xfs_iflush_all(mp);
+       xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
 
        XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
 
@@ -1288,11 +1290,6 @@ xfs_unmountfs(
        xfs_unmountfs_wait(mp);                 /* wait for async bufs */
        xfs_log_unmount(mp);                    /* Done! No more fs ops. */
 
-       /*
-        * All inodes from this mount point should be freed.
-        */
-       ASSERT(mp->m_inodes == NULL);
-
        if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
                uuid_table_remove(&mp->m_sb.sb_uuid);
 
@@ -1365,24 +1362,6 @@ xfs_log_sbcount(
        return error;
 }
 
-STATIC void
-xfs_mark_shared_ro(
-       xfs_mount_t     *mp,
-       xfs_buf_t       *bp)
-{
-       xfs_dsb_t       *sb = XFS_BUF_TO_SBP(bp);
-       __uint16_t      version;
-
-       if (!(sb->sb_flags & XFS_SBF_READONLY))
-               sb->sb_flags |= XFS_SBF_READONLY;
-
-       version = be16_to_cpu(sb->sb_versionnum);
-       if ((version & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4 ||
-           !(version & XFS_SB_VERSION_SHAREDBIT))
-               version |= XFS_SB_VERSION_SHAREDBIT;
-       sb->sb_versionnum = cpu_to_be16(version);
-}
-
 int
 xfs_unmountfs_writesb(xfs_mount_t *mp)
 {
@@ -1398,12 +1377,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 
                sbp = xfs_getsb(mp, 0);
 
-               /*
-                * mark shared-readonly if desired
-                */
-               if (mp->m_mk_sharedro)
-                       xfs_mark_shared_ro(mp, sbp);
-
                XFS_BUF_UNDONE(sbp);
                XFS_BUF_UNREAD(sbp);
                XFS_BUF_UNDELAYWRITE(sbp);
@@ -1415,8 +1388,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
                if (error)
                        xfs_ioerror_alert("xfs_unmountfs_writesb",
                                          mp, sbp, XFS_BUF_ADDR(sbp));
-               if (error && mp->m_mk_sharedro)
-                       xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting.  Filesystem may not be marked shared readonly");
                xfs_buf_relse(sbp);
        }
        return error;
index f3c1024b1241ed8663568db7e309e75c83f866f0..c1e0284673272990d971396c71f937e4cc665aee 100644 (file)
@@ -18,7 +18,6 @@
 #ifndef __XFS_MOUNT_H__
 #define        __XFS_MOUNT_H__
 
-
 typedef struct xfs_trans_reservations {
        uint    tr_write;       /* extent alloc trans */
        uint    tr_itruncate;   /* truncate trans */
@@ -44,14 +43,16 @@ typedef struct xfs_trans_reservations {
 } xfs_trans_reservations_t;
 
 #ifndef __KERNEL__
-/*
- * Moved here from xfs_ag.h to avoid reordering header files
- */
+
 #define XFS_DADDR_TO_AGNO(mp,d) \
        ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
 #define XFS_DADDR_TO_AGBNO(mp,d) \
        ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
-#else
+
+#else /* __KERNEL__ */
+
+#include "xfs_sync.h"
+
 struct cred;
 struct log;
 struct xfs_mount_args;
@@ -62,6 +63,7 @@ struct xfs_extdelta;
 struct xfs_swapext;
 struct xfs_mru_cache;
 struct xfs_nameops;
+struct xfs_ail;
 
 /*
  * Prototypes and functions for the Data Migration subsystem.
@@ -115,7 +117,7 @@ struct xfs_quotainfo;
 
 typedef int    (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
 typedef int    (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
-typedef int    (*xfs_qmunmount_t)(struct xfs_mount *);
+typedef void   (*xfs_qmunmount_t)(struct xfs_mount *);
 typedef void   (*xfs_qmdone_t)(struct xfs_mount *);
 typedef void   (*xfs_dqrele_t)(struct xfs_dquot *);
 typedef int    (*xfs_dqattach_t)(struct xfs_inode *, uint);
@@ -132,7 +134,7 @@ typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
                        struct xfs_dquot **, struct xfs_dquot *);
 typedef int    (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
                        struct xfs_dquot *, struct xfs_dquot *, uint);
-typedef void   (*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *);
+typedef void   (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
 typedef int    (*xfs_dqsync_t)(struct xfs_mount *, int flags);
 typedef int    (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
 
@@ -223,18 +225,10 @@ extern void       xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
 #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
 #endif
 
-typedef struct xfs_ail {
-       struct list_head        xa_ail;
-       uint                    xa_gen;
-       struct task_struct      *xa_task;
-       xfs_lsn_t               xa_target;
-} xfs_ail_t;
-
 typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
-       spinlock_t              m_ail_lock;     /* fs AIL mutex */
-       xfs_ail_t               m_ail;          /* fs active log item list */
+       struct xfs_ail          *m_ail;         /* fs active log item list */
        xfs_sb_t                m_sb;           /* copy of fs superblock */
        spinlock_t              m_sb_lock;      /* sb counter lock */
        struct xfs_buf          *m_sb_bp;       /* buffer for superblock */
@@ -247,10 +241,6 @@ typedef struct xfs_mount {
        xfs_agnumber_t          m_agirotor;     /* last ag dir inode alloced */
        spinlock_t              m_agirotor_lock;/* .. and lock protecting it */
        xfs_agnumber_t          m_maxagi;       /* highest inode alloc group */
-       struct xfs_inode        *m_inodes;      /* active inode list */
-       struct list_head        m_del_inodes;   /* inodes to reclaim */
-       mutex_t                 m_ilock;        /* inode list mutex */
-       uint                    m_ireclaims;    /* count of calls to reclaim*/
        uint                    m_readio_log;   /* min read size log bytes */
        uint                    m_readio_blocks; /* min read size blocks */
        uint                    m_writeio_log;  /* min write size log bytes */
@@ -267,7 +257,6 @@ typedef struct xfs_mount {
        xfs_buftarg_t           *m_ddev_targp;  /* saves taking the address */
        xfs_buftarg_t           *m_logdev_targp;/* ptr to log device */
        xfs_buftarg_t           *m_rtdev_targp; /* ptr to rt device */
-       __uint8_t               m_dircook_elog; /* log d-cookie entry bits */
        __uint8_t               m_blkbit_log;   /* blocklog + NBBY */
        __uint8_t               m_blkbb_log;    /* blocklog - BBSHIFT */
        __uint8_t               m_agno_log;     /* log #ag's */
@@ -276,12 +265,12 @@ typedef struct xfs_mount {
        uint                    m_blockmask;    /* sb_blocksize-1 */
        uint                    m_blockwsize;   /* sb_blocksize in words */
        uint                    m_blockwmask;   /* blockwsize-1 */
-       uint                    m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */
-       uint                    m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */
-       uint                    m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */
-       uint                    m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */
-       uint                    m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */
-       uint                    m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */
+       uint                    m_alloc_mxr[2]; /* max alloc btree records */
+       uint                    m_alloc_mnr[2]; /* min alloc btree records */
+       uint                    m_bmap_dmxr[2]; /* max bmap btree records */
+       uint                    m_bmap_dmnr[2]; /* min bmap btree records */
+       uint                    m_inobt_mxr[2]; /* max inobt btree records */
+       uint                    m_inobt_mnr[2]; /* min inobt btree records */
        uint                    m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
        uint                    m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
        uint                    m_in_maxlevels; /* XFS_IN_MAXLEVELS */
@@ -312,9 +301,6 @@ typedef struct xfs_mount {
        int                     m_sinoalign;    /* stripe unit inode alignment */
        int                     m_attr_magicpct;/* 37% of the blocksize */
        int                     m_dir_magicpct; /* 37% of the dir blocksize */
-       __uint8_t               m_mk_sharedro;  /* mark shared ro on unmount */
-       __uint8_t               m_inode_quiesce;/* call quiesce on new inodes.
-                                                  field governed by m_ilock */
        __uint8_t               m_sectbb_log;   /* sectlog - BBSHIFT */
        const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
        int                     m_dirblksize;   /* directory block sz--bytes */
@@ -362,7 +348,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_ATTR2                (1ULL << 8)     /* allow use of attr2 format */
 #define XFS_MOUNT_GRPID                (1ULL << 9)     /* group-ID assigned from directory */
 #define XFS_MOUNT_NORECOVERY   (1ULL << 10)    /* no recovery - dirty fs */
-#define XFS_MOUNT_SHARED       (1ULL << 11)    /* shared mount */
 #define XFS_MOUNT_DFLT_IOSIZE  (1ULL << 12)    /* set default i/o size */
 #define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13)    /* o_sync is REALLY o_sync */
                                                /* osyncisdsync is now default*/
@@ -439,6 +424,16 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
 #define xfs_force_shutdown(m,f)        \
        xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
 
+#define SHUTDOWN_META_IO_ERROR 0x0001  /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR  0x0002  /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT  0x0004  /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE        0x0008  /* corrupt in-memory data structures */
+#define SHUTDOWN_REMOTE_REQ    0x0010  /* shutdown came from remote cell */
+#define SHUTDOWN_DEVICE_REQ    0x0020  /* failed all paths to the device */
+
+#define xfs_test_for_freeze(mp)                ((mp)->m_super->s_frozen)
+#define xfs_wait_for_freeze(mp,l)      vfs_check_frozen((mp)->m_super, (l))
+
 /*
  * Flags for xfs_mountfs
  */
@@ -508,14 +503,12 @@ typedef struct xfs_mod_sb {
 #define        XFS_MOUNT_ILOCK(mp)     mutex_lock(&((mp)->m_ilock))
 #define        XFS_MOUNT_IUNLOCK(mp)   mutex_unlock(&((mp)->m_ilock))
 
-extern void    xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern int     xfs_log_sbcount(xfs_mount_t *, uint);
 extern int     xfs_mountfs(xfs_mount_t *mp);
 extern void    xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern void    xfs_unmountfs(xfs_mount_t *);
 extern int     xfs_unmountfs_writesb(xfs_mount_t *);
-extern int     xfs_unmount_flush(xfs_mount_t *, int);
 extern int     xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
 extern int     xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
                        int64_t, int);
@@ -525,20 +518,20 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
 extern int     xfs_readsb(xfs_mount_t *, int);
 extern void    xfs_freesb(xfs_mount_t *);
 extern int     xfs_fs_writable(xfs_mount_t *);
-extern int     xfs_syncsub(xfs_mount_t *, int, int *);
-extern int     xfs_sync_inodes(xfs_mount_t *, int, int *);
-extern xfs_agnumber_t  xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
-extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 extern int     xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
-extern int     xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *);
+extern int     xfs_dmops_get(struct xfs_mount *);
 extern void    xfs_dmops_put(struct xfs_mount *);
-extern int     xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *);
+extern int     xfs_qmops_get(struct xfs_mount *);
 extern void    xfs_qmops_put(struct xfs_mount *);
 
 extern struct xfs_dmops xfs_dmcore_xfs;
 
 #endif /* __KERNEL__ */
 
+extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern xfs_agnumber_t  xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
+extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+
 #endif /* __XFS_MOUNT_H__ */
index a294e58db8dda71c5230c39f39117863c7344d72..27f80581520ad2d19c138419a2e3bca42de86944 100644 (file)
@@ -28,7 +28,6 @@
 #include "xfs_mount.h"
 #include "xfs_quota.h"
 #include "xfs_error.h"
-#include "xfs_clnt.h"
 
 
 STATIC struct xfs_dquot *
@@ -131,9 +130,9 @@ static struct xfs_qmops xfs_qmcore_stub = {
 };
 
 int
-xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args)
+xfs_qmops_get(struct xfs_mount *mp)
 {
-       if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) {
+       if (XFS_IS_QUOTA_RUNNING(mp)) {
 #ifdef CONFIG_XFS_QUOTA
                mp->m_qm_ops = &xfs_qmcore_xfs;
 #else
index 12c4ec775af88a9402071035a6a46a0b946400c2..48965ecaa1558589e5a1b9db4a829e9c367665e4 100644 (file)
@@ -84,11 +84,9 @@ typedef struct xfs_dqblk {
 #define XFS_DQ_USER            0x0001          /* a user quota */
 #define XFS_DQ_PROJ            0x0002          /* project quota */
 #define XFS_DQ_GROUP           0x0004          /* a group quota */
-#define XFS_DQ_FLOCKED         0x0008          /* flush lock taken */
-#define XFS_DQ_DIRTY           0x0010          /* dquot is dirty */
-#define XFS_DQ_WANT            0x0020          /* for lookup/reclaim race */
-#define XFS_DQ_INACTIVE                0x0040          /* dq off mplist & hashlist */
-#define XFS_DQ_MARKER          0x0080          /* sentinel */
+#define XFS_DQ_DIRTY           0x0008          /* dquot is dirty */
+#define XFS_DQ_WANT            0x0010          /* for lookup/reclaim race */
+#define XFS_DQ_INACTIVE                0x0020          /* dq off mplist & hashlist */
 
 #define XFS_DQ_ALLTYPES                (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
 
index c903130be7fd6f6ec3032067e990bab8582b9886..86471bb40fd44021b2d60c32d7e254e12db9c576 100644 (file)
 #include "xfs_vnodeops.h"
 
 
-/*
- * Given an array of up to 4 inode pointers, unlock the pointed to inodes.
- * If there are fewer than 4 entries in the array, the empty entries will
- * be at the end and will have NULL pointers in them.
- */
-STATIC void
-xfs_rename_unlock4(
-       xfs_inode_t     **i_tab,
-       uint            lock_mode)
-{
-       int     i;
-
-       xfs_iunlock(i_tab[0], lock_mode);
-       for (i = 1; i < 4; i++) {
-               if (i_tab[i] == NULL)
-                       break;
-
-               /*
-                * Watch out for duplicate entries in the table.
-                */
-               if (i_tab[i] != i_tab[i-1])
-                       xfs_iunlock(i_tab[i], lock_mode);
-       }
-}
-
 /*
  * Enter all inodes for a rename transaction into a sorted array.
  */
@@ -204,19 +179,6 @@ xfs_rename(
         */
        xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
 
-       /*
-        * If we are using project inheritance, we only allow renames
-        * into our tree when the project IDs are the same; else the
-        * tree quota mechanism would be circumvented.
-        */
-       if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
-                    (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
-               error = XFS_ERROR(EXDEV);
-               xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
-               xfs_trans_cancel(tp, cancel_flags);
-               goto std_return;
-       }
-
        /*
         * Join all the inodes to the transaction. From this point on,
         * we can rely on either trans_commit or trans_cancel to unlock
@@ -241,6 +203,17 @@ xfs_rename(
                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
        }
 
+       /*
+        * If we are using project inheritance, we only allow renames
+        * into our tree when the project IDs are the same; else the
+        * tree quota mechanism would be circumvented.
+        */
+       if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+                    (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
+               error = XFS_ERROR(EXDEV);
+               goto error_return;
+       }
+
        /*
         * Set up the target.
         */
@@ -367,19 +340,11 @@ xfs_rename(
                                        &first_block, &free_list, spaceres);
        if (error)
                goto abort_return;
-       xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
-       /*
-        * Update the generation counts on all the directory inodes
-        * that we're modifying.
-        */
-       src_dp->i_gen++;
+       xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
-
-       if (new_parent) {
-               target_dp->i_gen++;
+       if (new_parent)
                xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
-       }
 
        /*
         * If this is a synchronous mount, make sure that the
index e2f68de161599d28d7b33f3348fd6dea4680f5e7..edf12c7b834c04abd2ad11a3be4cd374940d920a 100644 (file)
@@ -85,7 +85,6 @@ xfs_growfs_rt_alloc(
 {
        xfs_fileoff_t   bno;            /* block number in file */
        xfs_buf_t       *bp;            /* temporary buffer for zeroing */
-       int             cancelflags;    /* flags for xfs_trans_cancel */
        int             committed;      /* transaction committed flag */
        xfs_daddr_t     d;              /* disk block address */
        int             error;          /* error return value */
@@ -96,15 +95,16 @@ xfs_growfs_rt_alloc(
        xfs_bmbt_irec_t map;            /* block map output */
        int             nmap;           /* number of block maps */
        int             resblks;        /* space reservation */
-       xfs_trans_t     *tp;            /* transaction pointer */
 
        /*
         * Allocate space to the file, as necessary.
         */
        while (oblocks < nblocks) {
+               int             cancelflags = 0;
+               xfs_trans_t     *tp;
+
                tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
                resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
-               cancelflags = 0;
                /*
                 * Reserve space & log for one extent added to the file.
                 */
@@ -171,7 +171,9 @@ xfs_growfs_rt_alloc(
                                mp->m_bsize, 0);
                        if (bp == NULL) {
                                error = XFS_ERROR(EIO);
-                               goto error_cancel;
+error_cancel:
+                               xfs_trans_cancel(tp, cancelflags);
+                               goto error;
                        }
                        memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
                        xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
@@ -188,8 +190,6 @@ xfs_growfs_rt_alloc(
                oblocks = map.br_startoff + map.br_blockcount;
        }
        return 0;
-error_cancel:
-       xfs_trans_cancel(tp, cancelflags);
 error:
        return error;
 }
@@ -1856,7 +1856,6 @@ xfs_growfs_rt(
 {
        xfs_rtblock_t   bmbno;          /* bitmap block number */
        xfs_buf_t       *bp;            /* temporary buffer */
-       int             cancelflags;    /* flags for xfs_trans_cancel */
        int             error;          /* error return value */
        xfs_inode_t     *ip;            /* bitmap inode, used as lock */
        xfs_mount_t     *nmp;           /* new (fake) mount structure */
@@ -1872,13 +1871,13 @@ xfs_growfs_rt(
        xfs_extlen_t    rsumblocks;     /* current number of rt summary blks */
        xfs_sb_t        *sbp;           /* old superblock */
        xfs_fsblock_t   sumbno;         /* summary block number */
-       xfs_trans_t     *tp;            /* transaction pointer */
 
        sbp = &mp->m_sb;
-       cancelflags = 0;
        /*
         * Initial error checking.
         */
+       if (!capable(CAP_SYS_ADMIN))
+               return XFS_ERROR(EPERM);
        if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
            (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
            (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
@@ -1942,6 +1941,9 @@ xfs_growfs_rt(
                     ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
             bmbno < nrbmblocks;
             bmbno++) {
+               xfs_trans_t     *tp;
+               int             cancelflags = 0;
+
                *nmp = *mp;
                nsbp = &nmp->m_sb;
                /*
@@ -1967,16 +1969,15 @@ xfs_growfs_rt(
                 * Start a transaction, get the log reservation.
                 */
                tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
-               cancelflags = 0;
                if ((error = xfs_trans_reserve(tp, 0,
                                XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
-                       break;
+                       goto error_cancel;
                /*
                 * Lock out other callers by grabbing the bitmap inode lock.
                 */
                if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
                                                XFS_ILOCK_EXCL, &ip)))
-                       break;
+                       goto error_cancel;
                ASSERT(ip == mp->m_rbmip);
                /*
                 * Update the bitmap inode's size.
@@ -1990,7 +1991,7 @@ xfs_growfs_rt(
                 */
                if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
                                                XFS_ILOCK_EXCL, &ip)))
-                       break;
+                       goto error_cancel;
                ASSERT(ip == mp->m_rsumip);
                /*
                 * Update the summary inode's size.
@@ -2005,7 +2006,7 @@ xfs_growfs_rt(
                    mp->m_rsumlevels != nmp->m_rsumlevels) {
                        error = xfs_rtcopy_summary(mp, nmp, tp);
                        if (error)
-                               break;
+                               goto error_cancel;
                }
                /*
                 * Update superblock fields.
@@ -2031,8 +2032,11 @@ xfs_growfs_rt(
                bp = NULL;
                error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
                        nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
-               if (error)
+               if (error) {
+error_cancel:
+                       xfs_trans_cancel(tp, cancelflags);
                        break;
+               }
                /*
                 * Mark more blocks free in the superblock.
                 */
@@ -2045,15 +2049,10 @@ xfs_growfs_rt(
                mp->m_rsumsize = nrsumsize;
 
                error = xfs_trans_commit(tp, 0);
-               if (error) {
-                       tp = NULL;
+               if (error)
                        break;
-               }
        }
 
-       if (error && tp)
-               xfs_trans_cancel(tp, cancelflags);
-
        /*
         * Free the fake mp structure.
         */
index 3a82576dde9a39659f239adae1d42da66ed26b01..36f3a21c54d295aa835ebabe3f13a390696482c7 100644 (file)
@@ -406,7 +406,7 @@ xfs_bwrite(
         * XXXsup how does this work for quotas.
         */
        XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
-       XFS_BUF_SET_FSPRIVATE3(bp, mp);
+       bp->b_mount = mp;
        XFS_BUF_WRITE(bp);
 
        if ((error = XFS_bwrite(bp))) {
index 3f8cf1587f4cc09521ad17815160af4ad7395cd8..1ed71916e4c9d9aa70f06ebe2df17ab4572d6a2d 100644 (file)
@@ -79,6 +79,7 @@ struct xfs_mount;
 #define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002      /* Superblk counters */
 #define XFS_SB_VERSION2_RESERVED4BIT   0x00000004
 #define XFS_SB_VERSION2_ATTR2BIT       0x00000008      /* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT      0x00000010      /* parent pointers */
 
 #define        XFS_SB_VERSION2_OKREALFBITS     \
        (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -296,30 +297,34 @@ typedef enum {
 
 #define        XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
 
-#ifdef __KERNEL__
 static inline int xfs_sb_good_version(xfs_sb_t *sbp)
 {
-       return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \
-                 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \
-                  ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-                   !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \
-                     ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \
-                      (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \
-                   (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)));
-}
+       /* We always support version 1-3 */
+       if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
+           sbp->sb_versionnum <= XFS_SB_VERSION_3)
+               return 1;
+
+       /* We support version 4 if all feature bits are supported */
+       if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
+               if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
+                   ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+                    (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
+                       return 0;
+
+#ifdef __KERNEL__
+               if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
+                       return 0;
 #else
-static inline int xfs_sb_good_version(xfs_sb_t *sbp)
-{
-       return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \
-                 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \
-                  ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-                   !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \
-                     ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \
-                      (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \
-                 (!(sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \
-                  (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))));
+               if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
+                   sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
+                       return 0;
+#endif
+
+               return 1;
+       }
+
+       return 0;
 }
-#endif /* __KERNEL__ */
 
 /*
  * Detect a mismatched features2 field.  Older kernels read/wrote
@@ -332,123 +337,127 @@ static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
 
 static inline unsigned xfs_sb_version_tonew(unsigned v)
 {
-       return ((((v) == XFS_SB_VERSION_1) ? \
-               0 : \
-               (((v) == XFS_SB_VERSION_2) ? \
-                       XFS_SB_VERSION_ATTRBIT : \
-                       (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \
-               XFS_SB_VERSION_4);
+       if (v == XFS_SB_VERSION_1)
+               return XFS_SB_VERSION_4;
+
+       if (v == XFS_SB_VERSION_2)
+               return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+
+       return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
+               XFS_SB_VERSION_NLINKBIT;
 }
 
 static inline unsigned xfs_sb_version_toold(unsigned v)
 {
-       return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \
-               0 : \
-               (((v) & XFS_SB_VERSION_NLINKBIT) ? \
-                       XFS_SB_VERSION_3 : \
-                       (((v) & XFS_SB_VERSION_ATTRBIT) ?  \
-                               XFS_SB_VERSION_2 : \
-                               XFS_SB_VERSION_1)));
+       if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
+               return 0;
+       if (v & XFS_SB_VERSION_NLINKBIT)
+               return XFS_SB_VERSION_3;
+       if (v & XFS_SB_VERSION_ATTRBIT)
+               return XFS_SB_VERSION_2;
+       return XFS_SB_VERSION_1;
 }
 
 static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
 {
-       return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \
-                ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
-                ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-                 ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
+       return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
+               sbp->sb_versionnum == XFS_SB_VERSION_3 ||
+               (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+                (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
 }
 
 static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
 {
-       (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \
-               XFS_SB_VERSION_2 : \
-               ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \
-                       ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \
-                       (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT)));
+       if (sbp->sb_versionnum == XFS_SB_VERSION_1)
+               sbp->sb_versionnum = XFS_SB_VERSION_2;
+       else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+               sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
+       else
+               sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
 }
 
 static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
 {
-       return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
-                ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-                 ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
+       return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
+                (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+                 (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
 }
 
 static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
 {
-       (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \
-               XFS_SB_VERSION_3 : \
-               ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT));
+       if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
+               sbp->sb_versionnum = XFS_SB_VERSION_3;
+       else
+               sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
 }
 
 static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
 }
 
 static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
 {
-       (sbp)->sb_versionnum = \
-                (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \
-                       ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \
-                       (xfs_sb_version_tonew((sbp)->sb_versionnum) | \
-                        XFS_SB_VERSION_QUOTABIT));
+       if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+               sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
+       else
+               sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
+                                       XFS_SB_VERSION_QUOTABIT;
 }
 
 static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
 }
 
 static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
 }
 
 static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
 }
 
 static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
 }
 
 static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
 }
 
 static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
 }
 
 static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
 }
 
 static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
                (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
 }
 
 static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
 {
-       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-               ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+               (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
 }
 
 /*
@@ -463,22 +472,20 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
 
 static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
 {
-       return (xfs_sb_version_hasmorebits(sbp) &&      \
-               ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
+       return xfs_sb_version_hasmorebits(sbp) &&
+               (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
 }
 
 static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
 {
-       return (xfs_sb_version_hasmorebits(sbp)) &&     \
-               ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
+       return xfs_sb_version_hasmorebits(sbp) &&
+               (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
 }
 
 static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
 {
-       ((sbp)->sb_versionnum = \
-               ((sbp)->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT),    \
-       ((sbp)->sb_features2 =  \
-               ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT)));
+       sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+       sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
 }
 
 static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
index 4e1c22a23be5fb2496b20600767de4ee1a8822ce..8570b826fedd85e2375831aebc754f18e3685ab1 100644 (file)
@@ -290,7 +290,7 @@ xfs_trans_dup(
        ASSERT(tp->t_ticket != NULL);
 
        ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
-       ntp->t_ticket = tp->t_ticket;
+       ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
        ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
        tp->t_blk_res = tp->t_blk_res_used;
        ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
@@ -1259,6 +1259,13 @@ xfs_trans_roll(
 
        trans = *tpp;
 
+       /*
+        * transaction commit worked ok so we can drop the extra ticket
+        * reference that we gained in xfs_trans_dup()
+        */
+       xfs_log_ticket_put(trans->t_ticket);
+
+
        /*
         * Reserve space in the log for th next transaction.
         * This also pushes items in the "AIL", the list of logged items,
@@ -1383,11 +1390,12 @@ xfs_trans_chunk_committed(
        xfs_log_item_desc_t     *lidp;
        xfs_log_item_t          *lip;
        xfs_lsn_t               item_lsn;
-       struct xfs_mount        *mp;
        int                     i;
 
        lidp = licp->lic_descs;
        for (i = 0; i < licp->lic_unused; i++, lidp++) {
+               struct xfs_ail          *ailp;
+
                if (xfs_lic_isfree(licp, i)) {
                        continue;
                }
@@ -1424,19 +1432,19 @@ xfs_trans_chunk_committed(
                 * This would cause the earlier transaction to fail
                 * the test below.
                 */
-               mp = lip->li_mountp;
-               spin_lock(&mp->m_ail_lock);
+               ailp = lip->li_ailp;
+               spin_lock(&ailp->xa_lock);
                if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
                        /*
                         * This will set the item's lsn to item_lsn
                         * and update the position of the item in
                         * the AIL.
                         *
-                        * xfs_trans_update_ail() drops the AIL lock.
+                        * xfs_trans_ail_update() drops the AIL lock.
                         */
-                       xfs_trans_update_ail(mp, lip, item_lsn);
+                       xfs_trans_ail_update(ailp, lip, item_lsn);
                } else {
-                       spin_unlock(&mp->m_ail_lock);
+                       spin_unlock(&ailp->xa_lock);
                }
 
                /*
index 74c80bd2b0ec5a838e325945b9787a2a9c3c10b3..d6fe4a88d79f0557bcdd2a46434a438debd9cb72 100644 (file)
@@ -18,6 +18,8 @@
 #ifndef        __XFS_TRANS_H__
 #define        __XFS_TRANS_H__
 
+struct xfs_log_item;
+
 /*
  * This is the structure written in the log at the head of
  * every transaction. It identifies the type and id of the
@@ -98,76 +100,6 @@ typedef struct xfs_trans_header {
 #define        XFS_TRANS_TYPE_MAX              41
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
-
-#ifdef __KERNEL__
-struct xfs_buf;
-struct xfs_buftarg;
-struct xfs_efd_log_item;
-struct xfs_efi_log_item;
-struct xfs_inode;
-struct xfs_item_ops;
-struct xfs_log_iovec;
-struct xfs_log_item;
-struct xfs_log_item_desc;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_dquot_acct;
-
-typedef struct xfs_log_item {
-       struct list_head                li_ail;         /* AIL pointers */
-       xfs_lsn_t                       li_lsn;         /* last on-disk lsn */
-       struct xfs_log_item_desc        *li_desc;       /* ptr to current desc*/
-       struct xfs_mount                *li_mountp;     /* ptr to fs mount */
-       uint                            li_type;        /* item type */
-       uint                            li_flags;       /* misc flags */
-       struct xfs_log_item             *li_bio_list;   /* buffer item list */
-       void                            (*li_cb)(struct xfs_buf *,
-                                                struct xfs_log_item *);
-                                                       /* buffer item iodone */
-                                                       /* callback func */
-       struct xfs_item_ops             *li_ops;        /* function list */
-} xfs_log_item_t;
-
-#define        XFS_LI_IN_AIL   0x1
-#define XFS_LI_ABORTED 0x2
-
-typedef struct xfs_item_ops {
-       uint (*iop_size)(xfs_log_item_t *);
-       void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
-       void (*iop_pin)(xfs_log_item_t *);
-       void (*iop_unpin)(xfs_log_item_t *, int);
-       void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
-       uint (*iop_trylock)(xfs_log_item_t *);
-       void (*iop_unlock)(xfs_log_item_t *);
-       xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
-       void (*iop_push)(xfs_log_item_t *);
-       void (*iop_pushbuf)(xfs_log_item_t *);
-       void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-} xfs_item_ops_t;
-
-#define IOP_SIZE(ip)           (*(ip)->li_ops->iop_size)(ip)
-#define IOP_FORMAT(ip,vp)      (*(ip)->li_ops->iop_format)(ip, vp)
-#define IOP_PIN(ip)            (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags)   (*(ip)->li_ops->iop_unpin)(ip, flags)
-#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
-#define IOP_TRYLOCK(ip)                (*(ip)->li_ops->iop_trylock)(ip)
-#define IOP_UNLOCK(ip)         (*(ip)->li_ops->iop_unlock)(ip)
-#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
-#define IOP_PUSH(ip)           (*(ip)->li_ops->iop_push)(ip)
-#define IOP_PUSHBUF(ip)                (*(ip)->li_ops->iop_pushbuf)(ip)
-#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
-
-/*
- * Return values for the IOP_TRYLOCK() routines.
- */
-#define        XFS_ITEM_SUCCESS        0
-#define        XFS_ITEM_PINNED         1
-#define        XFS_ITEM_LOCKED         2
-#define        XFS_ITEM_FLUSHING       3
-#define XFS_ITEM_PUSHBUF       4
-
-#endif /* __KERNEL__ */
-
 /*
  * This structure is used to track log items associated with
  * a transaction.  It points to the log item and keeps some
@@ -176,7 +108,7 @@ typedef struct xfs_item_ops {
  * once we get to commit processing (see xfs_trans_commit()).
  */
 typedef struct xfs_log_item_desc {
-       xfs_log_item_t  *lid_item;
+       struct xfs_log_item     *lid_item;
        ushort          lid_size;
        unsigned char   lid_flags;
        unsigned char   lid_index;
@@ -276,94 +208,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
                (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
 }
 
-#ifdef __KERNEL__
-/*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction.  The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-
-typedef struct xfs_log_busy_slot {
-       xfs_agnumber_t          lbc_ag;
-       ushort                  lbc_idx;        /* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-
-#define XFS_LBC_NUM_SLOTS      31
-typedef struct xfs_log_busy_chunk {
-       struct xfs_log_busy_chunk       *lbc_next;
-       uint                            lbc_free;       /* free slots bitmask */
-       ushort                          lbc_unused;     /* first unused */
-       xfs_log_busy_slot_t             lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-
-#define        XFS_LBC_MAX_SLOT        (XFS_LBC_NUM_SLOTS - 1)
-#define        XFS_LBC_FREEMASK        ((1U << XFS_LBC_NUM_SLOTS) - 1)
-
-#define        XFS_LBC_INIT(cp)        ((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define        XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
-#define        XFS_LBC_SLOT(cp, slot)  (&((cp)->lbc_busy[(slot)]))
-#define        XFS_LBC_VACANCY(cp)     (((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define        XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
-
-/*
- * This is the type of function which can be given to xfs_trans_callback()
- * to be called upon the transaction's commit to disk.
- */
-typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
-
-/*
- * This is the structure maintained for every active transaction.
- */
-typedef struct xfs_trans {
-       unsigned int            t_magic;        /* magic number */
-       xfs_log_callback_t      t_logcb;        /* log callback struct */
-       unsigned int            t_type;         /* transaction type */
-       unsigned int            t_log_res;      /* amt of log space resvd */
-       unsigned int            t_log_count;    /* count for perm log res */
-       unsigned int            t_blk_res;      /* # of blocks resvd */
-       unsigned int            t_blk_res_used; /* # of resvd blocks used */
-       unsigned int            t_rtx_res;      /* # of rt extents resvd */
-       unsigned int            t_rtx_res_used; /* # of resvd rt extents used */
-       xfs_log_ticket_t        t_ticket;       /* log mgr ticket */
-       xfs_lsn_t               t_lsn;          /* log seq num of start of
-                                                * transaction. */
-       xfs_lsn_t               t_commit_lsn;   /* log seq num of end of
-                                                * transaction. */
-       struct xfs_mount        *t_mountp;      /* ptr to fs mount struct */
-       struct xfs_dquot_acct   *t_dqinfo;      /* acctg info for dquots */
-       xfs_trans_callback_t    t_callback;     /* transaction callback */
-       void                    *t_callarg;     /* callback arg */
-       unsigned int            t_flags;        /* misc flags */
-       int64_t                 t_icount_delta; /* superblock icount change */
-       int64_t                 t_ifree_delta;  /* superblock ifree change */
-       int64_t                 t_fdblocks_delta; /* superblock fdblocks chg */
-       int64_t                 t_res_fdblocks_delta; /* on-disk only chg */
-       int64_t                 t_frextents_delta;/* superblock freextents chg*/
-       int64_t                 t_res_frextents_delta; /* on-disk only chg */
-#ifdef DEBUG
-       int64_t                 t_ag_freeblks_delta; /* debugging counter */
-       int64_t                 t_ag_flist_delta; /* debugging counter */
-       int64_t                 t_ag_btree_delta; /* debugging counter */
-#endif
-       int64_t                 t_dblocks_delta;/* superblock dblocks change */
-       int64_t                 t_agcount_delta;/* superblock agcount change */
-       int64_t                 t_imaxpct_delta;/* superblock imaxpct change */
-       int64_t                 t_rextsize_delta;/* superblock rextsize chg */
-       int64_t                 t_rbmblocks_delta;/* superblock rbmblocks chg */
-       int64_t                 t_rblocks_delta;/* superblock rblocks change */
-       int64_t                 t_rextents_delta;/* superblocks rextents chg */
-       int64_t                 t_rextslog_delta;/* superblocks rextslog chg */
-       unsigned int            t_items_free;   /* log item descs free */
-       xfs_log_item_chunk_t    t_items;        /* first log item desc chunk */
-       xfs_trans_header_t      t_header;       /* header for in-log trans */
-       unsigned int            t_busy_free;    /* busy descs free */
-       xfs_log_busy_chunk_t    t_busy;         /* busy/async free blocks */
-       unsigned long           t_pflags;       /* saved process flags state */
-} xfs_trans_t;
-
-#endif /* __KERNEL__ */
-
-
 #define        XFS_TRANS_MAGIC         0x5452414E      /* 'TRAN' */
 /*
  * Values for t_flags.
@@ -906,6 +750,157 @@ typedef struct xfs_trans {
 #define        XFS_DQUOT_REF           1
 
 #ifdef __KERNEL__
+
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot_acct;
+
+typedef struct xfs_log_item {
+       struct list_head                li_ail;         /* AIL pointers */
+       xfs_lsn_t                       li_lsn;         /* last on-disk lsn */
+       struct xfs_log_item_desc        *li_desc;       /* ptr to current desc*/
+       struct xfs_mount                *li_mountp;     /* ptr to fs mount */
+       struct xfs_ail                  *li_ailp;       /* ptr to AIL */
+       uint                            li_type;        /* item type */
+       uint                            li_flags;       /* misc flags */
+       struct xfs_log_item             *li_bio_list;   /* buffer item list */
+       void                            (*li_cb)(struct xfs_buf *,
+                                                struct xfs_log_item *);
+                                                       /* buffer item iodone */
+                                                       /* callback func */
+       struct xfs_item_ops             *li_ops;        /* function list */
+} xfs_log_item_t;
+
+#define        XFS_LI_IN_AIL   0x1
+#define XFS_LI_ABORTED 0x2
+
+typedef struct xfs_item_ops {
+       uint (*iop_size)(xfs_log_item_t *);
+       void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+       void (*iop_pin)(xfs_log_item_t *);
+       void (*iop_unpin)(xfs_log_item_t *, int);
+       void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+       uint (*iop_trylock)(xfs_log_item_t *);
+       void (*iop_unlock)(xfs_log_item_t *);
+       xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+       void (*iop_push)(xfs_log_item_t *);
+       void (*iop_pushbuf)(xfs_log_item_t *);
+       void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+} xfs_item_ops_t;
+
+#define IOP_SIZE(ip)           (*(ip)->li_ops->iop_size)(ip)
+#define IOP_FORMAT(ip,vp)      (*(ip)->li_ops->iop_format)(ip, vp)
+#define IOP_PIN(ip)            (*(ip)->li_ops->iop_pin)(ip)
+#define IOP_UNPIN(ip, flags)   (*(ip)->li_ops->iop_unpin)(ip, flags)
+#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_TRYLOCK(ip)                (*(ip)->li_ops->iop_trylock)(ip)
+#define IOP_UNLOCK(ip)         (*(ip)->li_ops->iop_unlock)(ip)
+#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
+#define IOP_PUSH(ip)           (*(ip)->li_ops->iop_push)(ip)
+#define IOP_PUSHBUF(ip)                (*(ip)->li_ops->iop_pushbuf)(ip)
+#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
+
+/*
+ * Return values for the IOP_TRYLOCK() routines.
+ */
+#define        XFS_ITEM_SUCCESS        0
+#define        XFS_ITEM_PINNED         1
+#define        XFS_ITEM_LOCKED         2
+#define        XFS_ITEM_FLUSHING       3
+#define XFS_ITEM_PUSHBUF       4
+
+/*
+ * This structure is used to maintain a list of block ranges that have been
+ * freed in the transaction.  The ranges are listed in the perag[] busy list
+ * between when they're freed and the transaction is committed to disk.
+ */
+
+typedef struct xfs_log_busy_slot {
+       xfs_agnumber_t          lbc_ag;
+       ushort                  lbc_idx;        /* index in perag.busy[] */
+} xfs_log_busy_slot_t;
+
+#define XFS_LBC_NUM_SLOTS      31
+typedef struct xfs_log_busy_chunk {
+       struct xfs_log_busy_chunk       *lbc_next;
+       uint                            lbc_free;       /* free slots bitmask */
+       ushort                          lbc_unused;     /* first unused */
+       xfs_log_busy_slot_t             lbc_busy[XFS_LBC_NUM_SLOTS];
+} xfs_log_busy_chunk_t;
+
+#define        XFS_LBC_MAX_SLOT        (XFS_LBC_NUM_SLOTS - 1)
+#define        XFS_LBC_FREEMASK        ((1U << XFS_LBC_NUM_SLOTS) - 1)
+
+#define        XFS_LBC_INIT(cp)        ((cp)->lbc_free = XFS_LBC_FREEMASK)
+#define        XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
+#define        XFS_LBC_SLOT(cp, slot)  (&((cp)->lbc_busy[(slot)]))
+#define        XFS_LBC_VACANCY(cp)     (((cp)->lbc_free) & XFS_LBC_FREEMASK)
+#define        XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
+
+/*
+ * This is the type of function which can be given to xfs_trans_callback()
+ * to be called upon the transaction's commit to disk.
+ */
+typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
+
+/*
+ * This is the structure maintained for every active transaction.
+ */
+typedef struct xfs_trans {
+       unsigned int            t_magic;        /* magic number */
+       xfs_log_callback_t      t_logcb;        /* log callback struct */
+       unsigned int            t_type;         /* transaction type */
+       unsigned int            t_log_res;      /* amt of log space resvd */
+       unsigned int            t_log_count;    /* count for perm log res */
+       unsigned int            t_blk_res;      /* # of blocks resvd */
+       unsigned int            t_blk_res_used; /* # of resvd blocks used */
+       unsigned int            t_rtx_res;      /* # of rt extents resvd */
+       unsigned int            t_rtx_res_used; /* # of resvd rt extents used */
+       xfs_log_ticket_t        t_ticket;       /* log mgr ticket */
+       xfs_lsn_t               t_lsn;          /* log seq num of start of
+                                                * transaction. */
+       xfs_lsn_t               t_commit_lsn;   /* log seq num of end of
+                                                * transaction. */
+       struct xfs_mount        *t_mountp;      /* ptr to fs mount struct */
+       struct xfs_dquot_acct   *t_dqinfo;      /* acctg info for dquots */
+       xfs_trans_callback_t    t_callback;     /* transaction callback */
+       void                    *t_callarg;     /* callback arg */
+       unsigned int            t_flags;        /* misc flags */
+       int64_t                 t_icount_delta; /* superblock icount change */
+       int64_t                 t_ifree_delta;  /* superblock ifree change */
+       int64_t                 t_fdblocks_delta; /* superblock fdblocks chg */
+       int64_t                 t_res_fdblocks_delta; /* on-disk only chg */
+       int64_t                 t_frextents_delta;/* superblock freextents chg*/
+       int64_t                 t_res_frextents_delta; /* on-disk only chg */
+#ifdef DEBUG
+       int64_t                 t_ag_freeblks_delta; /* debugging counter */
+       int64_t                 t_ag_flist_delta; /* debugging counter */
+       int64_t                 t_ag_btree_delta; /* debugging counter */
+#endif
+       int64_t                 t_dblocks_delta;/* superblock dblocks change */
+       int64_t                 t_agcount_delta;/* superblock agcount change */
+       int64_t                 t_imaxpct_delta;/* superblock imaxpct change */
+       int64_t                 t_rextsize_delta;/* superblock rextsize chg */
+       int64_t                 t_rbmblocks_delta;/* superblock rbmblocks chg */
+       int64_t                 t_rblocks_delta;/* superblock rblocks change */
+       int64_t                 t_rextents_delta;/* superblocks rextents chg */
+       int64_t                 t_rextslog_delta;/* superblocks rextslog chg */
+       unsigned int            t_items_free;   /* log item descs free */
+       xfs_log_item_chunk_t    t_items;        /* first log item desc chunk */
+       xfs_trans_header_t      t_header;       /* header for in-log trans */
+       unsigned int            t_busy_free;    /* busy descs free */
+       xfs_log_busy_chunk_t    t_busy;         /* busy/async free blocks */
+       unsigned long           t_pflags;       /* saved process flags state */
+} xfs_trans_t;
+
 /*
  * XFS transaction mechanism exported interfaces that are
  * actually macros.
@@ -928,7 +923,6 @@ typedef struct xfs_trans {
 /*
  * XFS transaction mechanism exported interfaces.
  */
-void           xfs_trans_init(struct xfs_mount *);
 xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
 xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint);
 xfs_trans_t    *xfs_trans_dup(xfs_trans_t *);
@@ -975,13 +969,8 @@ int                _xfs_trans_commit(xfs_trans_t *,
                                  int *);
 #define xfs_trans_commit(tp, flags)    _xfs_trans_commit(tp, flags, NULL)
 void           xfs_trans_cancel(xfs_trans_t *, int);
-int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 int            xfs_trans_ail_init(struct xfs_mount *);
 void           xfs_trans_ail_destroy(struct xfs_mount *);
-void           xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
-xfs_lsn_t      xfs_trans_tail_ail(struct xfs_mount *);
-void           xfs_trans_unlocked_item(struct xfs_mount *,
-                                       xfs_log_item_t *);
 xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
                                        xfs_agnumber_t ag,
                                        xfs_extlen_t idx);
@@ -990,4 +979,7 @@ extern kmem_zone_t  *xfs_trans_zone;
 
 #endif /* __KERNEL__ */
 
+void           xfs_trans_init(struct xfs_mount *);
+int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
+
 #endif /* __XFS_TRANS_H__ */
index 1f77c00af566be43c8a2c569e2be9222002f555f..2d47f10f8bed4cbe9b2441a29649311c2aa0acae 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008 Dave Chinner
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 
-STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *);
-STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *);
+STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
+STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
 
 #ifdef DEBUG
-STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
+STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *);
 #else
 #define        xfs_ail_check(a,l)
 #endif /* DEBUG */
@@ -50,20 +51,20 @@ STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
  * lsn of the last item in the AIL.
  */
 xfs_lsn_t
-xfs_trans_tail_ail(
-       xfs_mount_t     *mp)
+xfs_trans_ail_tail(
+       struct xfs_ail  *ailp)
 {
        xfs_lsn_t       lsn;
        xfs_log_item_t  *lip;
 
-       spin_lock(&mp->m_ail_lock);
-       lip = xfs_ail_min(&mp->m_ail);
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_ail_min(ailp);
        if (lip == NULL) {
                lsn = (xfs_lsn_t)0;
        } else {
                lsn = lip->li_lsn;
        }
-       spin_unlock(&mp->m_ail_lock);
+       spin_unlock(&ailp->xa_lock);
 
        return lsn;
 }
@@ -85,16 +86,125 @@ xfs_trans_tail_ail(
  * any of the objects, so the lock is not needed.
  */
 void
-xfs_trans_push_ail(
-       xfs_mount_t             *mp,
-       xfs_lsn_t               threshold_lsn)
+xfs_trans_ail_push(
+       struct xfs_ail  *ailp,
+       xfs_lsn_t       threshold_lsn)
 {
-       xfs_log_item_t          *lip;
+       xfs_log_item_t  *lip;
+
+       lip = xfs_ail_min(ailp);
+       if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
+               if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0)
+                       xfsaild_wakeup(ailp, threshold_lsn);
+       }
+}
+
+/*
+ * AIL traversal cursor initialisation.
+ *
+ * The cursor keeps track of where our current traversal is up
+ * to by tracking the next Æ£tem in the list for us. However, for
+ * this to be safe, removing an object from the AIL needs to invalidate
+ * any cursor that points to it. hence the traversal cursor needs to
+ * be linked to the struct xfs_ail so that deletion can search all the
+ * active cursors for invalidation.
+ *
+ * We don't link the push cursor because it is embedded in the struct
+ * xfs_ail and hence easily findable.
+ */
+STATIC void
+xfs_trans_ail_cursor_init(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *cur)
+{
+       cur->item = NULL;
+       if (cur == &ailp->xa_cursors)
+               return;
+
+       cur->next = ailp->xa_cursors.next;
+       ailp->xa_cursors.next = cur;
+}
+
+/*
+ * Set the cursor to the next item, because when we look
+ * up the cursor the current item may have been freed.
+ */
+STATIC void
+xfs_trans_ail_cursor_set(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *cur,
+       struct xfs_log_item     *lip)
+{
+       if (lip)
+               cur->item = xfs_ail_next(ailp, lip);
+}
+
+/*
+ * Get the next item in the traversal and advance the cursor.
+ * If the cursor was invalidated (inidicated by a lip of 1),
+ * restart the traversal.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_next(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *cur)
+{
+       struct xfs_log_item     *lip = cur->item;
+
+       if ((__psint_t)lip & 1)
+               lip = xfs_ail_min(ailp);
+       xfs_trans_ail_cursor_set(ailp, cur, lip);
+       return lip;
+}
+
+/*
+ * Now that the traversal is complete, we need to remove the cursor
+ * from the list of traversing cursors. Avoid removing the embedded
+ * push cursor, but use the fact it is alway present to make the
+ * list deletion simple.
+ */
+void
+xfs_trans_ail_cursor_done(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *done)
+{
+       struct xfs_ail_cursor   *prev = NULL;
+       struct xfs_ail_cursor   *cur;
+
+       done->item = NULL;
+       if (done == &ailp->xa_cursors)
+               return;
+       prev = &ailp->xa_cursors;
+       for (cur = prev->next; cur; prev = cur, cur = prev->next) {
+               if (cur == done) {
+                       prev->next = cur->next;
+                       break;
+               }
+       }
+       ASSERT(cur);
+}
+
+/*
+ * Invalidate any cursor that is pointing to this item. This is
+ * called when an item is removed from the AIL. Any cursor pointing
+ * to this object is now invalid and the traversal needs to be
+ * terminated so it doesn't reference a freed object. We set the
+ * cursor item to a value of 1 so we can distinguish between an
+ * invalidation and the end of the list when getting the next item
+ * from the cursor.
+ */
+STATIC void
+xfs_trans_ail_cursor_clear(
+       struct xfs_ail          *ailp,
+       struct xfs_log_item     *lip)
+{
+       struct xfs_ail_cursor   *cur;
 
-       lip = xfs_ail_min(&mp->m_ail);
-       if (lip && !XFS_FORCED_SHUTDOWN(mp)) {
-               if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0)
-                       xfsaild_wakeup(mp, threshold_lsn);
+       /* need to search all cursors */
+       for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
+               if (cur->item == lip)
+                       cur->item = (struct xfs_log_item *)
+                                       ((__psint_t)cur->item | 1);
        }
 }
 
@@ -103,25 +213,27 @@ xfs_trans_push_ail(
  * Return the current tree generation number for use
  * in calls to xfs_trans_next_ail().
  */
-STATIC xfs_log_item_t *
-xfs_trans_first_push_ail(
-       xfs_mount_t     *mp,
-       int             *gen,
-       xfs_lsn_t       lsn)
+xfs_log_item_t *
+xfs_trans_ail_cursor_first(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *cur,
+       xfs_lsn_t               lsn)
 {
-       xfs_log_item_t  *lip;
+       xfs_log_item_t          *lip;
 
-       lip = xfs_ail_min(&mp->m_ail);
-       *gen = (int)mp->m_ail.xa_gen;
+       xfs_trans_ail_cursor_init(ailp, cur);
+       lip = xfs_ail_min(ailp);
        if (lsn == 0)
-               return lip;
+               goto out;
 
-       list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) {
+       list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
                if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
-                       return lip;
+                       goto out;
        }
-
-       return NULL;
+       lip = NULL;
+out:
+       xfs_trans_ail_cursor_set(ailp, cur, lip);
+       return lip;
 }
 
 /*
@@ -129,29 +241,29 @@ xfs_trans_first_push_ail(
  */
 long
 xfsaild_push(
-       xfs_mount_t     *mp,
+       struct xfs_ail  *ailp,
        xfs_lsn_t       *last_lsn)
 {
        long            tout = 1000; /* milliseconds */
        xfs_lsn_t       last_pushed_lsn = *last_lsn;
-       xfs_lsn_t       target =  mp->m_ail.xa_target;
+       xfs_lsn_t       target =  ailp->xa_target;
        xfs_lsn_t       lsn;
        xfs_log_item_t  *lip;
-       int             gen;
-       int             restarts;
        int             flush_log, count, stuck;
+       xfs_mount_t     *mp = ailp->xa_mount;
+       struct xfs_ail_cursor   *cur = &ailp->xa_cursors;
 
-#define        XFS_TRANS_PUSH_AIL_RESTARTS     10
-
-       spin_lock(&mp->m_ail_lock);
-       lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn);
+       spin_lock(&ailp->xa_lock);
+       xfs_trans_ail_cursor_init(ailp, cur);
+       lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn);
        if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
                /*
                 * AIL is empty or our push has reached the end.
                 */
-               spin_unlock(&mp->m_ail_lock);
+               xfs_trans_ail_cursor_done(ailp, cur);
+               spin_unlock(&ailp->xa_lock);
                last_pushed_lsn = 0;
-               goto out;
+               return tout;
        }
 
        XFS_STATS_INC(xs_push_ail);
@@ -169,7 +281,7 @@ xfsaild_push(
         */
        tout = 10;
        lsn = lip->li_lsn;
-       flush_log = stuck = count = restarts = 0;
+       flush_log = stuck = count = 0;
        while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
                int     lock_result;
                /*
@@ -184,7 +296,7 @@ xfsaild_push(
                 * skip to the next item in the list.
                 */
                lock_result = IOP_TRYLOCK(lip);
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
                switch (lock_result) {
                case XFS_ITEM_SUCCESS:
                        XFS_STATS_INC(xs_push_ail_success);
@@ -221,7 +333,7 @@ xfsaild_push(
                        break;
                }
 
-               spin_lock(&mp->m_ail_lock);
+               spin_lock(&ailp->xa_lock);
                /* should we bother continuing? */
                if (XFS_FORCED_SHUTDOWN(mp))
                        break;
@@ -244,14 +356,13 @@ xfsaild_push(
                if (stuck > 100)
                        break;
 
-               lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
+               lip = xfs_trans_ail_cursor_next(ailp, cur);
                if (lip == NULL)
                        break;
-               if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS)
-                       break;
                lsn = lip->li_lsn;
        }
-       spin_unlock(&mp->m_ail_lock);
+       xfs_trans_ail_cursor_done(ailp, cur);
+       spin_unlock(&ailp->xa_lock);
 
        if (flush_log) {
                /*
@@ -274,8 +385,7 @@ xfsaild_push(
                 */
                tout += 20;
                last_pushed_lsn = 0;
-       } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) ||
-                  ((stuck * 100) / count > 90)) {
+       } else if ((stuck * 100) / count > 90) {
                /*
                 * Either there is a lot of contention on the AIL or we
                 * are stuck due to operations in progress. "Stuck" in this
@@ -287,7 +397,6 @@ xfsaild_push(
                 */
                tout += 10;
        }
-out:
        *last_lsn = last_pushed_lsn;
        return tout;
 }      /* xfsaild_push */
@@ -303,7 +412,7 @@ out:
  */
 void
 xfs_trans_unlocked_item(
-       xfs_mount_t     *mp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 {
        xfs_log_item_t  *min_lip;
@@ -315,7 +424,7 @@ xfs_trans_unlocked_item(
         * over some potentially valid data.
         */
        if (!(lip->li_flags & XFS_LI_IN_AIL) ||
-           XFS_FORCED_SHUTDOWN(mp)) {
+           XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
                return;
        }
 
@@ -331,10 +440,10 @@ xfs_trans_unlocked_item(
         * the call to xfs_log_move_tail() doesn't do anything if there's
         * not enough free space to wake people up so we're safe calling it.
         */
-       min_lip = xfs_ail_min(&mp->m_ail);
+       min_lip = xfs_ail_min(ailp);
 
        if (min_lip == lip)
-               xfs_log_move_tail(mp, 1);
+               xfs_log_move_tail(ailp->xa_mount, 1);
 }      /* xfs_trans_unlocked_item */
 
 
@@ -347,41 +456,37 @@ xfs_trans_unlocked_item(
  * we move in the AIL is the minimum one, update the tail lsn in the
  * log manager.
  *
- * Increment the AIL's generation count to indicate that the tree
- * has changed.
- *
  * This function must be called with the AIL lock held.  The lock
  * is dropped before returning.
  */
 void
-xfs_trans_update_ail(
-       xfs_mount_t     *mp,
+xfs_trans_ail_update(
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip,
-       xfs_lsn_t       lsn) __releases(mp->m_ail_lock)
+       xfs_lsn_t       lsn) __releases(ailp->xa_lock)
 {
-       xfs_log_item_t          *dlip=NULL;
+       xfs_log_item_t          *dlip = NULL;
        xfs_log_item_t          *mlip;  /* ptr to minimum lip */
 
-       mlip = xfs_ail_min(&mp->m_ail);
+       mlip = xfs_ail_min(ailp);
 
        if (lip->li_flags & XFS_LI_IN_AIL) {
-               dlip = xfs_ail_delete(&mp->m_ail, lip);
+               dlip = xfs_ail_delete(ailp, lip);
                ASSERT(dlip == lip);
+               xfs_trans_ail_cursor_clear(ailp, dlip);
        } else {
                lip->li_flags |= XFS_LI_IN_AIL;
        }
 
        lip->li_lsn = lsn;
-
-       xfs_ail_insert(&mp->m_ail, lip);
-       mp->m_ail.xa_gen++;
+       xfs_ail_insert(ailp, lip);
 
        if (mlip == dlip) {
-               mlip = xfs_ail_min(&mp->m_ail);
-               spin_unlock(&mp->m_ail_lock);
-               xfs_log_move_tail(mp, mlip->li_lsn);
+               mlip = xfs_ail_min(ailp);
+               spin_unlock(&ailp->xa_lock);
+               xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
        } else {
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
        }
 
 
@@ -403,29 +508,30 @@ xfs_trans_update_ail(
  * is dropped before returning.
  */
 void
-xfs_trans_delete_ail(
-       xfs_mount_t     *mp,
-       xfs_log_item_t  *lip) __releases(mp->m_ail_lock)
+xfs_trans_ail_delete(
+       struct xfs_ail  *ailp,
+       xfs_log_item_t  *lip) __releases(ailp->xa_lock)
 {
        xfs_log_item_t          *dlip;
        xfs_log_item_t          *mlip;
 
        if (lip->li_flags & XFS_LI_IN_AIL) {
-               mlip = xfs_ail_min(&mp->m_ail);
-               dlip = xfs_ail_delete(&mp->m_ail, lip);
+               mlip = xfs_ail_min(ailp);
+               dlip = xfs_ail_delete(ailp, lip);
                ASSERT(dlip == lip);
+               xfs_trans_ail_cursor_clear(ailp, dlip);
 
 
                lip->li_flags &= ~XFS_LI_IN_AIL;
                lip->li_lsn = 0;
-               mp->m_ail.xa_gen++;
 
                if (mlip == dlip) {
-                       mlip = xfs_ail_min(&mp->m_ail);
-                       spin_unlock(&mp->m_ail_lock);
-                       xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
+                       mlip = xfs_ail_min(ailp);
+                       spin_unlock(&ailp->xa_lock);
+                       xfs_log_move_tail(ailp->xa_mount,
+                                               (mlip ? mlip->li_lsn : 0));
                } else {
-                       spin_unlock(&mp->m_ail_lock);
+                       spin_unlock(&ailp->xa_lock);
                }
        }
        else {
@@ -433,13 +539,13 @@ xfs_trans_delete_ail(
                 * If the file system is not being shutdown, we are in
                 * serious trouble if we get to this stage.
                 */
-               if (XFS_FORCED_SHUTDOWN(mp))
-                       spin_unlock(&mp->m_ail_lock);
-               else {
+               struct xfs_mount        *mp = ailp->xa_mount;
+
+               spin_unlock(&ailp->xa_lock);
+               if (!XFS_FORCED_SHUTDOWN(mp)) {
                        xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
                "%s: attempting to delete a log item that is not in the AIL",
                                        __func__);
-                       spin_unlock(&mp->m_ail_lock);
                        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                }
        }
@@ -447,56 +553,6 @@ xfs_trans_delete_ail(
 
 
 
-/*
- * Return the item in the AIL with the smallest lsn.
- * Return the current tree generation number for use
- * in calls to xfs_trans_next_ail().
- */
-xfs_log_item_t *
-xfs_trans_first_ail(
-       xfs_mount_t     *mp,
-       int             *gen)
-{
-       xfs_log_item_t  *lip;
-
-       lip = xfs_ail_min(&mp->m_ail);
-       *gen = (int)mp->m_ail.xa_gen;
-
-       return lip;
-}
-
-/*
- * If the generation count of the tree has not changed since the
- * caller last took something from the AIL, then return the elmt
- * in the tree which follows the one given.  If the count has changed,
- * then return the minimum elmt of the AIL and bump the restarts counter
- * if one is given.
- */
-xfs_log_item_t *
-xfs_trans_next_ail(
-       xfs_mount_t     *mp,
-       xfs_log_item_t  *lip,
-       int             *gen,
-       int             *restarts)
-{
-       xfs_log_item_t  *nlip;
-
-       ASSERT(mp && lip && gen);
-       if (mp->m_ail.xa_gen == *gen) {
-               nlip = xfs_ail_next(&mp->m_ail, lip);
-       } else {
-               nlip = xfs_ail_min(&mp->m_ail);
-               *gen = (int)mp->m_ail.xa_gen;
-               if (restarts != NULL) {
-                       XFS_STATS_INC(xs_push_ail_restarts);
-                       (*restarts)++;
-               }
-       }
-
-       return (nlip);
-}
-
-
 /*
  * The active item list (AIL) is a doubly linked list of log
  * items sorted by ascending lsn.  The base of the list is
@@ -515,15 +571,35 @@ int
 xfs_trans_ail_init(
        xfs_mount_t     *mp)
 {
-       INIT_LIST_HEAD(&mp->m_ail.xa_ail);
-       return xfsaild_start(mp);
+       struct xfs_ail  *ailp;
+       int             error;
+
+       ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+       if (!ailp)
+               return ENOMEM;
+
+       ailp->xa_mount = mp;
+       INIT_LIST_HEAD(&ailp->xa_ail);
+       spin_lock_init(&ailp->xa_lock);
+       error = xfsaild_start(ailp);
+       if (error)
+               goto out_free_ailp;
+       mp->m_ail = ailp;
+       return 0;
+
+out_free_ailp:
+       kmem_free(ailp);
+       return error;
 }
 
 void
 xfs_trans_ail_destroy(
        xfs_mount_t     *mp)
 {
-       xfsaild_stop(mp);
+       struct xfs_ail  *ailp = mp->m_ail;
+
+       xfsaild_stop(ailp);
+       kmem_free(ailp);
 }
 
 /*
@@ -534,7 +610,7 @@ xfs_trans_ail_destroy(
  */
 STATIC void
 xfs_ail_insert(
-       xfs_ail_t       *ailp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
@@ -568,7 +644,7 @@ xfs_ail_insert(
 /*ARGSUSED*/
 STATIC xfs_log_item_t *
 xfs_ail_delete(
-       xfs_ail_t       *ailp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
@@ -585,7 +661,7 @@ xfs_ail_delete(
  */
 STATIC xfs_log_item_t *
 xfs_ail_min(
-       xfs_ail_t       *ailp)
+       struct xfs_ail  *ailp)
 /* ARGSUSED */
 {
        if (list_empty(&ailp->xa_ail))
@@ -601,7 +677,7 @@ xfs_ail_min(
  */
 STATIC xfs_log_item_t *
 xfs_ail_next(
-       xfs_ail_t       *ailp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
@@ -617,7 +693,7 @@ xfs_ail_next(
  */
 STATIC void
 xfs_ail_check(
-       xfs_ail_t       *ailp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 {
        xfs_log_item_t  *prev_lip;
index 4e855b5ced666eefeb5b910e0599adeed3a0ec17..8ee2f8c8b0a61863c9686a7bbae9b4794616615f 100644 (file)
@@ -527,9 +527,8 @@ xfs_trans_brelse(xfs_trans_t        *tp,
                        lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
                        if (lip->li_type == XFS_LI_BUF) {
                                bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
-                               xfs_trans_unlocked_item(
-                                               bip->bli_item.li_mountp,
-                                               lip);
+                               xfs_trans_unlocked_item(bip->bli_item.li_ailp,
+                                                       lip);
                        }
                }
                xfs_buf_relse(bp);
@@ -626,7 +625,7 @@ xfs_trans_brelse(xfs_trans_t        *tp,
         * tell the AIL that the buffer is being unlocked.
         */
        if (bip != NULL) {
-               xfs_trans_unlocked_item(bip->bli_item.li_mountp,
+               xfs_trans_unlocked_item(bip->bli_item.li_ailp,
                                        (xfs_log_item_t*)bip);
        }
 
index 2a1c0f071f9181e8a045a70dc2bc72dd453c0d05..23d276af2e0c43c0fddb374462adcc9fd0ddc481 100644 (file)
@@ -85,7 +85,6 @@ xfs_trans_iget(
 {
        int                     error;
        xfs_inode_t             *ip;
-       xfs_inode_log_item_t    *iip;
 
        /*
         * If the transaction pointer is NULL, just call the normal
@@ -138,34 +137,7 @@ xfs_trans_iget(
        }
        ASSERT(ip != NULL);
 
-       /*
-        * Get a log_item_desc to point at the new item.
-        */
-       if (ip->i_itemp == NULL)
-               xfs_inode_item_init(ip, mp);
-       iip = ip->i_itemp;
-       (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
-
-       xfs_trans_inode_broot_debug(ip);
-
-       /*
-        * If the IO lock has been acquired, mark that in
-        * the inode log item so we'll know to unlock it
-        * when the transaction commits.
-        */
-       ASSERT(iip->ili_flags == 0);
-       if (lock_flags & XFS_IOLOCK_EXCL) {
-               iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
-       } else if (lock_flags & XFS_IOLOCK_SHARED) {
-               iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
-       }
-
-       /*
-        * Initialize i_transp so we can find it with xfs_inode_incore()
-        * above.
-        */
-       ip->i_transp = tp;
-
+       xfs_trans_ijoin(tp, ip, lock_flags);
        *ipp = ip;
        return 0;
 }
index 3c666e8317f82bd3f0e41451a06bd39d3c9ff8db..e110bf57d7f496ddd70ce9f4927da8d67fa20545 100644 (file)
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
+/* XXX: from here down needed until struct xfs_trans has it's own ailp */
+#include "xfs_bit.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
 
 STATIC int     xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
                                        int, int, xfs_lsn_t);
@@ -79,6 +87,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
                lidp->lid_size = 0;
                lip->li_desc = lidp;
                lip->li_mountp = tp->t_mountp;
+               lip->li_ailp = tp->t_mountp->m_ail;
                return lidp;
        }
 
@@ -120,6 +129,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
        lidp->lid_size = 0;
        lip->li_desc = lidp;
        lip->li_mountp = tp->t_mountp;
+       lip->li_ailp = tp->t_mountp->m_ail;
        return lidp;
 }
 
index 3c748c456ed4f9a58c03babd86d9afd401ca8efe..73e2ad3974328ed9a62a81663feab3aacd0f730d 100644 (file)
@@ -44,25 +44,93 @@ xfs_log_busy_slot_t         *xfs_trans_add_busy(xfs_trans_t *tp,
                                                    xfs_extlen_t idx);
 
 /*
- * From xfs_trans_ail.c
+ * AIL traversal cursor.
+ *
+ * Rather than using a generation number for detecting changes in the ail, use
+ * a cursor that is protected by the ail lock. The aild cursor exists in the
+ * struct xfs_ail, but other traversals can declare it on the stack and link it
+ * to the ail list.
+ *
+ * When an object is deleted from or moved int the AIL, the cursor list is
+ * searched to see if the object is a designated cursor item. If it is, it is
+ * deleted from the cursor so that the next time the cursor is used traversal
+ * will return to the start.
+ *
+ * This means a traversal colliding with a removal will cause a restart of the
+ * list scan, rather than any insertion or deletion anywhere in the list. The
+ * low bit of the item pointer is set if the cursor has been invalidated so
+ * that we can tell the difference between invalidation and reaching the end
+ * of the list to trigger traversal restarts.
  */
-void                   xfs_trans_update_ail(struct xfs_mount *mp,
-                                    struct xfs_log_item *lip, xfs_lsn_t lsn)
-                                    __releases(mp->m_ail_lock);
-void                   xfs_trans_delete_ail(struct xfs_mount *mp,
-                                    struct xfs_log_item *lip)
-                                    __releases(mp->m_ail_lock);
-struct xfs_log_item    *xfs_trans_first_ail(struct xfs_mount *, int *);
-struct xfs_log_item    *xfs_trans_next_ail(struct xfs_mount *,
-                                    struct xfs_log_item *, int *, int *);
+struct xfs_ail_cursor {
+       struct xfs_ail_cursor   *next;
+       struct xfs_log_item     *item;
+};
 
+/*
+ * Private AIL structures.
+ *
+ * Eventually we need to drive the locking in here as well.
+ */
+struct xfs_ail {
+       struct xfs_mount        *xa_mount;
+       struct list_head        xa_ail;
+       uint                    xa_gen;
+       struct task_struct      *xa_task;
+       xfs_lsn_t               xa_target;
+       struct xfs_ail_cursor   xa_cursors;
+       spinlock_t              xa_lock;
+};
 
 /*
- * AIL push thread support
+ * From xfs_trans_ail.c
  */
-long   xfsaild_push(struct xfs_mount *, xfs_lsn_t *);
-void   xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t);
-int    xfsaild_start(struct xfs_mount *);
-void   xfsaild_stop(struct xfs_mount *);
+void                   xfs_trans_ail_update(struct xfs_ail *ailp,
+                                       struct xfs_log_item *lip, xfs_lsn_t lsn)
+                                       __releases(ailp->xa_lock);
+void                   xfs_trans_ail_delete(struct xfs_ail *ailp,
+                                       struct xfs_log_item *lip)
+                                       __releases(ailp->xa_lock);
+void                   xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
+void                   xfs_trans_unlocked_item(struct xfs_ail *,
+                                       xfs_log_item_t *);
+
+xfs_lsn_t              xfs_trans_ail_tail(struct xfs_ail *ailp);
+
+struct xfs_log_item    *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
+                                       struct xfs_ail_cursor *cur,
+                                       xfs_lsn_t lsn);
+struct xfs_log_item    *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
+                                       struct xfs_ail_cursor *cur);
+void                   xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
+                                       struct xfs_ail_cursor *cur);
+
+long   xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
+void   xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
+int    xfsaild_start(struct xfs_ail *);
+void   xfsaild_stop(struct xfs_ail *);
 
+#if BITS_PER_LONG != 64
+static inline void
+xfs_trans_ail_copy_lsn(
+       struct xfs_ail  *ailp,
+       xfs_lsn_t       *dst,
+       xfs_lsn_t       *src)
+{
+       ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
+       spin_lock(&ailp->xa_lock);
+       *dst = *src;
+       spin_unlock(&ailp->xa_lock);
+}
+#else
+static inline void
+xfs_trans_ail_copy_lsn(
+       struct xfs_ail  *ailp,
+       xfs_lsn_t       *dst,
+       xfs_lsn_t       *src)
+{
+       ASSERT(sizeof(xfs_lsn_t) == 8);
+       *dst = *src;
+}
+#endif
 #endif /* __XFS_TRANS_PRIV_H__ */
index 35d4d414bcc273ca0981fad6ed4dddbbf8a9ea42..fcc2285d03ed7c9be2c56586d631242e6fe06874 100644 (file)
@@ -172,6 +172,12 @@ xfs_dir_ialloc(
                        *ipp = NULL;
                        return code;
                }
+
+               /*
+                * transaction commit worked ok so we can drop the extra ticket
+                * reference that we gained in xfs_trans_dup()
+                */
+               xfs_log_ticket_put(tp->t_ticket);
                code = xfs_trans_reserve(tp, 0, log_res, 0,
                                         XFS_TRANS_PERM_LOG_RES, log_count);
                /*
@@ -268,9 +274,9 @@ xfs_bump_ino_vers2(
        xfs_mount_t     *mp;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1);
+       ASSERT(ip->i_d.di_version == 1);
 
-       ip->i_d.di_version = XFS_DINODE_VERSION_2;
+       ip->i_d.di_version = 2;
        ip->i_d.di_onlink = 0;
        memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
        mp = tp->t_mountp;
@@ -302,7 +308,7 @@ xfs_bumplink(
        ASSERT(ip->i_d.di_nlink > 0);
        ip->i_d.di_nlink++;
        inc_nlink(VFS_I(ip));
-       if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
+       if ((ip->i_d.di_version == 1) &&
            (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
                /*
                 * The inode has increased its number of links beyond
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
deleted file mode 100644 (file)
index 439dd39..0000000
+++ /dev/null
@@ -1,757 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_error.h"
-#include "xfs_bmap.h"
-#include "xfs_rw.h"
-#include "xfs_buf_item.h"
-#include "xfs_log_priv.h"
-#include "xfs_dir2_trace.h"
-#include "xfs_extfree_item.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_clnt.h"
-#include "xfs_mru_cache.h"
-#include "xfs_filestream.h"
-#include "xfs_fsops.h"
-#include "xfs_vnodeops.h"
-#include "xfs_vfsops.h"
-#include "xfs_utils.h"
-
-
-STATIC void
-xfs_quiesce_fs(
-       xfs_mount_t             *mp)
-{
-       int                     count = 0, pincount;
-
-       xfs_flush_buftarg(mp->m_ddev_targp, 0);
-       xfs_finish_reclaim_all(mp, 0);
-
-       /* This loop must run at least twice.
-        * The first instance of the loop will flush
-        * most meta data but that will generate more
-        * meta data (typically directory updates).
-        * Which then must be flushed and logged before
-        * we can write the unmount record.
-        */
-       do {
-               xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL);
-               pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
-               if (!pincount) {
-                       delay(50);
-                       count++;
-               }
-       } while (count < 2);
-}
-
-/*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceding.
- */
-void
-xfs_attr_quiesce(
-       xfs_mount_t     *mp)
-{
-       int     error = 0;
-
-       /* wait for all modifications to complete */
-       while (atomic_read(&mp->m_active_trans) > 0)
-               delay(100);
-
-       /* flush inodes and push all remaining buffers out to disk */
-       xfs_quiesce_fs(mp);
-
-       ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
-
-       /* Push the superblock and write an unmount record */
-       error = xfs_log_sbcount(mp, 1);
-       if (error)
-               xfs_fs_cmn_err(CE_WARN, mp,
-                               "xfs_attr_quiesce: failed to log sb changes. "
-                               "Frozen image may not be consistent.");
-       xfs_log_unmount_write(mp);
-       xfs_unmountfs_writesb(mp);
-}
-
-/*
- * xfs_unmount_flush implements a set of flush operation on special
- * inodes, which are needed as a separate set of operations so that
- * they can be called as part of relocation process.
- */
-int
-xfs_unmount_flush(
-       xfs_mount_t     *mp,            /* Mount structure we are getting
-                                          rid of. */
-       int             relocation)     /* Called from vfs relocation. */
-{
-       xfs_inode_t     *rip = mp->m_rootip;
-       xfs_inode_t     *rbmip;
-       xfs_inode_t     *rsumip = NULL;
-       int             error;
-
-       xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-       xfs_iflock(rip);
-
-       /*
-        * Flush out the real time inodes.
-        */
-       if ((rbmip = mp->m_rbmip) != NULL) {
-               xfs_ilock(rbmip, XFS_ILOCK_EXCL);
-               xfs_iflock(rbmip);
-               error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
-               xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
-
-               if (error == EFSCORRUPTED)
-                       goto fscorrupt_out;
-
-               ASSERT(vn_count(VFS_I(rbmip)) == 1);
-
-               rsumip = mp->m_rsumip;
-               xfs_ilock(rsumip, XFS_ILOCK_EXCL);
-               xfs_iflock(rsumip);
-               error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
-               xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
-
-               if (error == EFSCORRUPTED)
-                       goto fscorrupt_out;
-
-               ASSERT(vn_count(VFS_I(rsumip)) == 1);
-       }
-
-       /*
-        * Synchronously flush root inode to disk
-        */
-       error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
-       if (error == EFSCORRUPTED)
-               goto fscorrupt_out2;
-
-       if (vn_count(VFS_I(rip)) != 1 && !relocation) {
-               xfs_iunlock(rip, XFS_ILOCK_EXCL);
-               return XFS_ERROR(EBUSY);
-       }
-
-       /*
-        * Release dquot that rootinode, rbmino and rsumino might be holding,
-        * flush and purge the quota inodes.
-        */
-       error = XFS_QM_UNMOUNT(mp);
-       if (error == EFSCORRUPTED)
-               goto fscorrupt_out2;
-
-       if (rbmip) {
-               IRELE(rbmip);
-               IRELE(rsumip);
-       }
-
-       xfs_iunlock(rip, XFS_ILOCK_EXCL);
-       return 0;
-
-fscorrupt_out:
-       xfs_ifunlock(rip);
-
-fscorrupt_out2:
-       xfs_iunlock(rip, XFS_ILOCK_EXCL);
-
-       return XFS_ERROR(EFSCORRUPTED);
-}
-
-/*
- * xfs_sync flushes any pending I/O to file system vfsp.
- *
- * This routine is called by vfs_sync() to make sure that things make it
- * out to disk eventually, on sync() system calls to flush out everything,
- * and when the file system is unmounted.  For the vfs_sync() case, all
- * we really need to do is sync out the log to make all of our meta-data
- * updates permanent (except for timestamps).  For calls from pflushd(),
- * dirty pages are kept moving by calling pdflush() on the inodes
- * containing them.  We also flush the inodes that we can lock without
- * sleeping and the superblock if we can lock it without sleeping from
- * vfs_sync() so that items at the tail of the log are always moving out.
- *
- * Flags:
- *      SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
- *                    to sleep if we can help it.  All we really need
- *                    to do is ensure that the log is synced at least
- *                    periodically.  We also push the inodes and
- *                    superblock if we can lock them without sleeping
- *                     and they are not pinned.
- *      SYNC_ATTR    - We need to flush the inodes.  If SYNC_BDFLUSH is not
- *                    set, then we really want to lock each inode and flush
- *                    it.
- *      SYNC_WAIT    - All the flushes that take place in this call should
- *                    be synchronous.
- *      SYNC_DELWRI  - This tells us to push dirty pages associated with
- *                    inodes.  SYNC_WAIT and SYNC_BDFLUSH are used to
- *                    determine if they should be flushed sync, async, or
- *                    delwri.
- *      SYNC_CLOSE   - This flag is passed when the system is being
- *                    unmounted.  We should sync and invalidate everything.
- *      SYNC_FSDATA  - This indicates that the caller would like to make
- *                    sure the superblock is safe on disk.  We can ensure
- *                    this by simply making sure the log gets flushed
- *                    if SYNC_BDFLUSH is set, and by actually writing it
- *                    out otherwise.
- *     SYNC_IOWAIT  - The caller wants us to wait for all data I/O to complete
- *                    before we return (including direct I/O). Forms the drain
- *                    side of the write barrier needed to safely quiesce the
- *                    filesystem.
- *
- */
-int
-xfs_sync(
-       xfs_mount_t     *mp,
-       int             flags)
-{
-       int             error;
-
-       /*
-        * Get the Quota Manager to flush the dquots.
-        *
-        * If XFS quota support is not enabled or this filesystem
-        * instance does not use quotas XFS_QM_DQSYNC will always
-        * return zero.
-        */
-       error = XFS_QM_DQSYNC(mp, flags);
-       if (error) {
-               /*
-                * If we got an IO error, we will be shutting down.
-                * So, there's nothing more for us to do here.
-                */
-               ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
-               if (XFS_FORCED_SHUTDOWN(mp))
-                       return XFS_ERROR(error);
-       }
-
-       if (flags & SYNC_IOWAIT)
-               xfs_filestream_flush(mp);
-
-       return xfs_syncsub(mp, flags, NULL);
-}
-
-/*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
- */
-int
-xfs_sync_inodes(
-       xfs_mount_t     *mp,
-       int             flags,
-       int             *bypassed)
-{
-       xfs_inode_t     *ip = NULL;
-       struct inode    *vp = NULL;
-       int             error;
-       int             last_error;
-       uint64_t        fflag;
-       uint            lock_flags;
-       uint            base_lock_flags;
-       boolean_t       mount_locked;
-       boolean_t       vnode_refed;
-       int             preempt;
-       xfs_iptr_t      *ipointer;
-#ifdef DEBUG
-       boolean_t       ipointer_in = B_FALSE;
-
-#define IPOINTER_SET   ipointer_in = B_TRUE
-#define IPOINTER_CLR   ipointer_in = B_FALSE
-#else
-#define IPOINTER_SET
-#define IPOINTER_CLR
-#endif
-
-
-/* Insert a marker record into the inode list after inode ip. The list
- * must be locked when this is called. After the call the list will no
- * longer be locked.
- */
-#define IPOINTER_INSERT(ip, mp)        { \
-               ASSERT(ipointer_in == B_FALSE); \
-               ipointer->ip_mnext = ip->i_mnext; \
-               ipointer->ip_mprev = ip; \
-               ip->i_mnext = (xfs_inode_t *)ipointer; \
-               ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
-               preempt = 0; \
-               XFS_MOUNT_IUNLOCK(mp); \
-               mount_locked = B_FALSE; \
-               IPOINTER_SET; \
-       }
-
-/* Remove the marker from the inode list. If the marker was the only item
- * in the list then there are no remaining inodes and we should zero out
- * the whole list. If we are the current head of the list then move the head
- * past us.
- */
-#define IPOINTER_REMOVE(ip, mp)        { \
-               ASSERT(ipointer_in == B_TRUE); \
-               if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
-                       ip = ipointer->ip_mnext; \
-                       ip->i_mprev = ipointer->ip_mprev; \
-                       ipointer->ip_mprev->i_mnext = ip; \
-                       if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
-                               mp->m_inodes = ip; \
-                       } \
-               } else { \
-                       ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
-                       mp->m_inodes = NULL; \
-                       ip = NULL; \
-               } \
-               IPOINTER_CLR; \
-       }
-
-#define XFS_PREEMPT_MASK       0x7f
-
-       ASSERT(!(flags & SYNC_BDFLUSH));
-
-       if (bypassed)
-               *bypassed = 0;
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return 0;
-       error = 0;
-       last_error = 0;
-       preempt = 0;
-
-       /* Allocate a reference marker */
-       ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
-
-       fflag = XFS_B_ASYNC;            /* default is don't wait */
-       if (flags & SYNC_DELWRI)
-               fflag = XFS_B_DELWRI;
-       if (flags & SYNC_WAIT)
-               fflag = 0;              /* synchronous overrides all */
-
-       base_lock_flags = XFS_ILOCK_SHARED;
-       if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
-               /*
-                * We need the I/O lock if we're going to call any of
-                * the flush/inval routines.
-                */
-               base_lock_flags |= XFS_IOLOCK_SHARED;
-       }
-
-       XFS_MOUNT_ILOCK(mp);
-
-       ip = mp->m_inodes;
-
-       mount_locked = B_TRUE;
-       vnode_refed  = B_FALSE;
-
-       IPOINTER_CLR;
-
-       do {
-               ASSERT(ipointer_in == B_FALSE);
-               ASSERT(vnode_refed == B_FALSE);
-
-               lock_flags = base_lock_flags;
-
-               /*
-                * There were no inodes in the list, just break out
-                * of the loop.
-                */
-               if (ip == NULL) {
-                       break;
-               }
-
-               /*
-                * We found another sync thread marker - skip it
-                */
-               if (ip->i_mount == NULL) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
-
-               vp = VFS_I(ip);
-
-               /*
-                * If the vnode is gone then this is being torn down,
-                * call reclaim if it is flushed, else let regular flush
-                * code deal with it later in the loop.
-                */
-
-               if (vp == NULL) {
-                       /* Skip ones already in reclaim */
-                       if (ip->i_flags & XFS_IRECLAIM) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
-                       if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-                               ip = ip->i_mnext;
-                       } else if ((xfs_ipincount(ip) == 0) &&
-                                   xfs_iflock_nowait(ip)) {
-                               IPOINTER_INSERT(ip, mp);
-
-                               xfs_finish_reclaim(ip, 1,
-                                               XFS_IFLUSH_DELWRI_ELSE_ASYNC);
-
-                               XFS_MOUNT_ILOCK(mp);
-                               mount_locked = B_TRUE;
-                               IPOINTER_REMOVE(ip, mp);
-                       } else {
-                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                               ip = ip->i_mnext;
-                       }
-                       continue;
-               }
-
-               if (VN_BAD(vp)) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
-
-               if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
-                       XFS_MOUNT_IUNLOCK(mp);
-                       kmem_free(ipointer);
-                       return 0;
-               }
-
-               /*
-                * Try to lock without sleeping.  We're out of order with
-                * the inode list lock here, so if we fail we need to drop
-                * the mount lock and try again.  If we're called from
-                * bdflush() here, then don't bother.
-                *
-                * The inode lock here actually coordinates with the
-                * almost spurious inode lock in xfs_ireclaim() to prevent
-                * the vnode we handle here without a reference from
-                * being freed while we reference it.  If we lock the inode
-                * while it's on the mount list here, then the spurious inode
-                * lock in xfs_ireclaim() after the inode is pulled from
-                * the mount list will sleep until we release it here.
-                * This keeps the vnode from being freed while we reference
-                * it.
-                */
-               if (xfs_ilock_nowait(ip, lock_flags) == 0) {
-                       if (vp == NULL) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
-
-                       vp = vn_grab(vp);
-                       if (vp == NULL) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
-
-                       IPOINTER_INSERT(ip, mp);
-                       xfs_ilock(ip, lock_flags);
-
-                       ASSERT(vp == VFS_I(ip));
-                       ASSERT(ip->i_mount == mp);
-
-                       vnode_refed = B_TRUE;
-               }
-
-               /* From here on in the loop we may have a marker record
-                * in the inode list.
-                */
-
-               /*
-                * If we have to flush data or wait for I/O completion
-                * we need to drop the ilock that we currently hold.
-                * If we need to drop the lock, insert a marker if we
-                * have not already done so.
-                */
-               if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
-                   ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
-                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-                       if (flags & SYNC_CLOSE) {
-                               /* Shutdown case. Flush and invalidate. */
-                               if (XFS_FORCED_SHUTDOWN(mp))
-                                       xfs_tosspages(ip, 0, -1,
-                                                            FI_REMAPF);
-                               else
-                                       error = xfs_flushinval_pages(ip,
-                                                       0, -1, FI_REMAPF);
-                       } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
-                               error = xfs_flush_pages(ip, 0,
-                                                       -1, fflag, FI_NONE);
-                       }
-
-                       /*
-                        * When freezing, we need to wait ensure all I/O (including direct
-                        * I/O) is complete to ensure no further data modification can take
-                        * place after this point
-                        */
-                       if (flags & SYNC_IOWAIT)
-                               vn_iowait(ip);
-
-                       xfs_ilock(ip, XFS_ILOCK_SHARED);
-               }
-
-               if ((flags & SYNC_ATTR) &&
-                   (ip->i_update_core ||
-                    (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
-                       if (mount_locked)
-                               IPOINTER_INSERT(ip, mp);
-
-                       if (flags & SYNC_WAIT) {
-                               xfs_iflock(ip);
-                               error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
-
-                       /*
-                        * If we can't acquire the flush lock, then the inode
-                        * is already being flushed so don't bother waiting.
-                        *
-                        * If we can lock it then do a delwri flush so we can
-                        * combine multiple inode flushes in each disk write.
-                        */
-                       } else if (xfs_iflock_nowait(ip)) {
-                               error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
-                       } else if (bypassed) {
-                               (*bypassed)++;
-                       }
-               }
-
-               if (lock_flags != 0) {
-                       xfs_iunlock(ip, lock_flags);
-               }
-
-               if (vnode_refed) {
-                       /*
-                        * If we had to take a reference on the vnode
-                        * above, then wait until after we've unlocked
-                        * the inode to release the reference.  This is
-                        * because we can be already holding the inode
-                        * lock when IRELE() calls xfs_inactive().
-                        *
-                        * Make sure to drop the mount lock before calling
-                        * IRELE() so that we don't trip over ourselves if
-                        * we have to go for the mount lock again in the
-                        * inactive code.
-                        */
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
-
-                       IRELE(ip);
-
-                       vnode_refed = B_FALSE;
-               }
-
-               if (error) {
-                       last_error = error;
-               }
-
-               /*
-                * bail out if the filesystem is corrupted.
-                */
-               if (error == EFSCORRUPTED)  {
-                       if (!mount_locked) {
-                               XFS_MOUNT_ILOCK(mp);
-                               IPOINTER_REMOVE(ip, mp);
-                       }
-                       XFS_MOUNT_IUNLOCK(mp);
-                       ASSERT(ipointer_in == B_FALSE);
-                       kmem_free(ipointer);
-                       return XFS_ERROR(error);
-               }
-
-               /* Let other threads have a chance at the mount lock
-                * if we have looped many times without dropping the
-                * lock.
-                */
-               if ((++preempt & XFS_PREEMPT_MASK) == 0) {
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
-               }
-
-               if (mount_locked == B_FALSE) {
-                       XFS_MOUNT_ILOCK(mp);
-                       mount_locked = B_TRUE;
-                       IPOINTER_REMOVE(ip, mp);
-                       continue;
-               }
-
-               ASSERT(ipointer_in == B_FALSE);
-               ip = ip->i_mnext;
-
-       } while (ip != mp->m_inodes);
-
-       XFS_MOUNT_IUNLOCK(mp);
-
-       ASSERT(ipointer_in == B_FALSE);
-
-       kmem_free(ipointer);
-       return XFS_ERROR(last_error);
-}
-
-/*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
- */
-int
-xfs_syncsub(
-       xfs_mount_t     *mp,
-       int             flags,
-       int             *bypassed)
-{
-       int             error = 0;
-       int             last_error = 0;
-       uint            log_flags = XFS_LOG_FORCE;
-       xfs_buf_t       *bp;
-       xfs_buf_log_item_t      *bip;
-
-       /*
-        * Sync out the log.  This ensures that the log is periodically
-        * flushed even if there is not enough activity to fill it up.
-        */
-       if (flags & SYNC_WAIT)
-               log_flags |= XFS_LOG_SYNC;
-
-       xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
-
-       if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
-               if (flags & SYNC_BDFLUSH)
-                       xfs_finish_reclaim_all(mp, 1);
-               else
-                       error = xfs_sync_inodes(mp, flags, bypassed);
-       }
-
-       /*
-        * Flushing out dirty data above probably generated more
-        * log activity, so if this isn't vfs_sync() then flush
-        * the log again.
-        */
-       if (flags & SYNC_DELWRI) {
-               xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
-       }
-
-       if (flags & SYNC_FSDATA) {
-               /*
-                * If this is vfs_sync() then only sync the superblock
-                * if we can lock it without sleeping and it is not pinned.
-                */
-               if (flags & SYNC_BDFLUSH) {
-                       bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
-                       if (bp != NULL) {
-                               bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
-                               if ((bip != NULL) &&
-                                   xfs_buf_item_dirty(bip)) {
-                                       if (!(XFS_BUF_ISPINNED(bp))) {
-                                               XFS_BUF_ASYNC(bp);
-                                               error = xfs_bwrite(mp, bp);
-                                       } else {
-                                               xfs_buf_relse(bp);
-                                       }
-                               } else {
-                                       xfs_buf_relse(bp);
-                               }
-                       }
-               } else {
-                       bp = xfs_getsb(mp, 0);
-                       /*
-                        * If the buffer is pinned then push on the log so
-                        * we won't get stuck waiting in the write for
-                        * someone, maybe ourselves, to flush the log.
-                        * Even though we just pushed the log above, we
-                        * did not have the superblock buffer locked at
-                        * that point so it can become pinned in between
-                        * there and here.
-                        */
-                       if (XFS_BUF_ISPINNED(bp))
-                               xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-                       if (flags & SYNC_WAIT)
-                               XFS_BUF_UNASYNC(bp);
-                       else
-                               XFS_BUF_ASYNC(bp);
-                       error = xfs_bwrite(mp, bp);
-               }
-               if (error) {
-                       last_error = error;
-               }
-       }
-
-       /*
-        * Now check to see if the log needs a "dummy" transaction.
-        */
-       if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
-               xfs_trans_t *tp;
-               xfs_inode_t *ip;
-
-               /*
-                * Put a dummy transaction in the log to tell
-                * recovery that all others are OK.
-                */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-               if ((error = xfs_trans_reserve(tp, 0,
-                               XFS_ICHANGE_LOG_RES(mp),
-                               0, 0, 0)))  {
-                       xfs_trans_cancel(tp, 0);
-                       return error;
-               }
-
-               ip = mp->m_rootip;
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-               xfs_trans_ihold(tp, ip);
-               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               error = xfs_trans_commit(tp, 0);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
-       }
-
-       /*
-        * When shutting down, we need to insure that the AIL is pushed
-        * to disk or the filesystem can appear corrupt from the PROM.
-        */
-       if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
-               XFS_bflush(mp->m_ddev_targp);
-               if (mp->m_rtdev_targp) {
-                       XFS_bflush(mp->m_rtdev_targp);
-               }
-       }
-
-       return XFS_ERROR(last_error);
-}
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h
deleted file mode 100644 (file)
index a74b050..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _XFS_VFSOPS_H
-#define _XFS_VFSOPS_H 1
-
-struct cred;
-struct xfs_fid;
-struct inode;
-struct kstatfs;
-struct xfs_mount;
-struct xfs_mount_args;
-
-int xfs_sync(struct xfs_mount *mp, int flags);
-void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
-               int lnnum);
-void xfs_attr_quiesce(struct xfs_mount *mp);
-
-#endif /* _XFS_VFSOPS_H */
index 8b6812f66a15bf2895aba5fed5f04be0c22625d8..f07bf8768c3a000a4bd221f2cb740cfc8e56efa8 100644 (file)
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 
-int
-xfs_open(
-       xfs_inode_t     *ip)
-{
-       int             mode;
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return XFS_ERROR(EIO);
-
-       /*
-        * If it's a directory with any blocks, read-ahead block 0
-        * as we're almost certain to have the next operation be a read there.
-        */
-       if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
-               mode = xfs_ilock_map_shared(ip);
-               if (ip->i_d.di_nextents > 0)
-                       (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
-               xfs_iunlock(ip, mode);
-       }
-       return 0;
-}
-
 int
 xfs_setattr(
        struct xfs_inode        *ip,
        struct iattr            *iattr,
-       int                     flags,
-       cred_t                  *credp)
+       int                     flags)
 {
        xfs_mount_t             *mp = ip->i_mount;
        struct inode            *inode = VFS_I(ip);
@@ -93,7 +70,6 @@ xfs_setattr(
        gid_t                   gid=0, igid=0;
        int                     timeflags = 0;
        struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
-       int                     file_owner;
        int                     need_iolock = 1;
 
        xfs_itrace_entry(ip);
@@ -104,6 +80,10 @@ xfs_setattr(
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
 
+       code = -inode_change_ok(inode, iattr);
+       if (code)
+               return code;
+
        olddquot1 = olddquot2 = NULL;
        udqp = gdqp = NULL;
 
@@ -181,62 +161,8 @@ xfs_setattr(
 
        xfs_ilock(ip, lock_flags);
 
-       /* boolean: are we the file owner? */
-       file_owner = (current_fsuid() == ip->i_d.di_uid);
-
-       /*
-        * Change various properties of a file.
-        * Only the owner or users with CAP_FOWNER
-        * capability may do these things.
-        */
-       if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
-               /*
-                * CAP_FOWNER overrides the following restrictions:
-                *
-                * The user ID of the calling process must be equal
-                * to the file owner ID, except in cases where the
-                * CAP_FSETID capability is applicable.
-                */
-               if (!file_owner && !capable(CAP_FOWNER)) {
-                       code = XFS_ERROR(EPERM);
-                       goto error_return;
-               }
-
-               /*
-                * CAP_FSETID overrides the following restrictions:
-                *
-                * The effective user ID of the calling process shall match
-                * the file owner when setting the set-user-ID and
-                * set-group-ID bits on that file.
-                *
-                * The effective group ID or one of the supplementary group
-                * IDs of the calling process shall match the group owner of
-                * the file when setting the set-group-ID bit on that file
-                */
-               if (mask & ATTR_MODE) {
-                       mode_t m = 0;
-
-                       if ((iattr->ia_mode & S_ISUID) && !file_owner)
-                               m |= S_ISUID;
-                       if ((iattr->ia_mode & S_ISGID) &&
-                           !in_group_p((gid_t)ip->i_d.di_gid))
-                               m |= S_ISGID;
-#if 0
-                       /* Linux allows this, Irix doesn't. */
-                       if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
-                               m |= S_ISVTX;
-#endif
-                       if (m && !capable(CAP_FSETID))
-                               iattr->ia_mode &= ~m;
-               }
-       }
-
        /*
         * Change file ownership.  Must be the owner or privileged.
-        * If the system was configured with the "restricted_chown"
-        * option, the owner is not permitted to give away the file,
-        * and can change the group id only to a group of which he
-        * or she is a member.
         */
        if (mask & (ATTR_UID|ATTR_GID)) {
                /*
@@ -250,23 +176,6 @@ xfs_setattr(
                gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
                uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 
-               /*
-                * CAP_CHOWN overrides the following restrictions:
-                *
-                * If _POSIX_CHOWN_RESTRICTED is defined, this capability
-                * shall override the restriction that a process cannot
-                * change the user ID of a file it owns and the restriction
-                * that the group ID supplied to the chown() function
-                * shall be equal to either the group ID or one of the
-                * supplementary group IDs of the calling process.
-                */
-               if (restricted_chown &&
-                   (iuid != uid || (igid != gid &&
-                                    !in_group_p((gid_t)gid))) &&
-                   !capable(CAP_CHOWN)) {
-                       code = XFS_ERROR(EPERM);
-                       goto error_return;
-               }
                /*
                 * Do a quota reservation only if uid/gid is actually
                 * going to change.
@@ -304,36 +213,22 @@ xfs_setattr(
                        code = XFS_ERROR(EINVAL);
                        goto error_return;
                }
+
                /*
                 * Make sure that the dquots are attached to the inode.
                 */
-               if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
+               code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+               if (code)
                        goto error_return;
-       }
-
-       /*
-        * Change file access or modified times.
-        */
-       if (mask & (ATTR_ATIME|ATTR_MTIME)) {
-               if (!file_owner) {
-                       if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
-                           !capable(CAP_FOWNER)) {
-                               code = XFS_ERROR(EPERM);
-                               goto error_return;
-                       }
-               }
-       }
 
-       /*
-        * Now we can make the changes.  Before we join the inode
-        * to the transaction, if ATTR_SIZE is set then take care of
-        * the part of the truncation that must be done without the
-        * inode lock.  This needs to be done before joining the inode
-        * to the transaction, because the inode cannot be unlocked
-        * once it is a part of the transaction.
-        */
-       if (mask & ATTR_SIZE) {
-               code = 0;
+               /*
+                * Now we can make the changes.  Before we join the inode
+                * to the transaction, if ATTR_SIZE is set then take care of
+                * the part of the truncation that must be done without the
+                * inode lock.  This needs to be done before joining the inode
+                * to the transaction, because the inode cannot be unlocked
+                * once it is a part of the transaction.
+                */
                if (iattr->ia_size > ip->i_size) {
                        /*
                         * Do the first part of growing a file: zero any data
@@ -366,7 +261,7 @@ xfs_setattr(
                }
 
                /* wait for all I/O to complete */
-               vn_iowait(ip);
+               xfs_ioend_wait(ip);
 
                if (!code)
                        code = xfs_itruncate_data(ip, iattr->ia_size);
@@ -388,17 +283,10 @@ xfs_setattr(
                }
                commit_flags = XFS_TRANS_RELEASE_LOG_RES;
                xfs_ilock(ip, XFS_ILOCK_EXCL);
-       }
 
-       if (tp) {
                xfs_trans_ijoin(tp, ip, lock_flags);
                xfs_trans_ihold(tp, ip);
-       }
 
-       /*
-        * Truncate file.  Must have write permission and not be a directory.
-        */
-       if (mask & ATTR_SIZE) {
                /*
                 * Only change the c/mtime if we are changing the size
                 * or we are explicitly asked to change it. This handles
@@ -438,28 +326,13 @@ xfs_setattr(
                         */
                        xfs_iflags_set(ip, XFS_ITRUNCATED);
                }
-       }
-
-       /*
-        * Change file access modes.
-        */
-       if (mask & ATTR_MODE) {
-               ip->i_d.di_mode &= S_IFMT;
-               ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
-
-               inode->i_mode &= S_IFMT;
-               inode->i_mode |= iattr->ia_mode & ~S_IFMT;
-
-               xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
-               timeflags |= XFS_ICHGTIME_CHG;
+       } else if (tp) {
+               xfs_trans_ijoin(tp, ip, lock_flags);
+               xfs_trans_ihold(tp, ip);
        }
 
        /*
         * Change file ownership.  Must be the owner or privileged.
-        * If the system was configured with the "restricted_chown"
-        * option, the owner is not permitted to give away the file,
-        * and can change the group id only to a group of which he
-        * or she is a member.
         */
        if (mask & (ATTR_UID|ATTR_GID)) {
                /*
@@ -503,6 +376,24 @@ xfs_setattr(
                timeflags |= XFS_ICHGTIME_CHG;
        }
 
+       /*
+        * Change file access modes.
+        */
+       if (mask & ATTR_MODE) {
+               umode_t mode = iattr->ia_mode;
+
+               if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+                       mode &= ~S_ISGID;
+
+               ip->i_d.di_mode &= S_IFMT;
+               ip->i_d.di_mode |= mode & ~S_IFMT;
+
+               inode->i_mode &= S_IFMT;
+               inode->i_mode |= mode & ~S_IFMT;
+
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+               timeflags |= XFS_ICHGTIME_CHG;
+       }
 
        /*
         * Change file access or modified times.
@@ -713,7 +604,7 @@ xfs_fsync(
                return XFS_ERROR(EIO);
 
        /* capture size updates in I/O completion before writing the inode. */
-       error = filemap_fdatawait(VFS_I(ip)->i_mapping);
+       error = xfs_wait_on_pages(ip, 0, -1);
        if (error)
                return XFS_ERROR(error);
 
@@ -1028,6 +919,12 @@ xfs_inactive_symlink_rmt(
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                goto error0;
        }
+       /*
+        * transaction commit worked ok so we can drop the extra ticket
+        * reference that we gained in xfs_trans_dup()
+        */
+       xfs_log_ticket_put(tp->t_ticket);
+
        /*
         * Remove the memory for extent descriptions (just bookkeeping).
         */
@@ -1625,8 +1522,6 @@ xfs_create(
                xfs_trans_set_sync(tp);
        }
 
-       dp->i_gen++;
-
        /*
         * Attach the dquot(s) to the inodes and modify them incore.
         * These ids of the inode couldn't have changed since the new
@@ -1993,13 +1888,6 @@ xfs_remove(
        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
-       /*
-        * Bump the in memory generation count on the parent
-        * directory so that other can know that it has changed.
-        */
-       dp->i_gen++;
-       xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
        if (is_dir) {
                /*
                 * Drop the link from ip's "..".
@@ -2009,7 +1897,7 @@ xfs_remove(
                        goto out_bmap_cancel;
 
                /*
-                * Drop the link from dp to ip.
+                * Drop the "." link from ip to self.
                 */
                error = xfs_droplink(tp, ip);
                if (error)
@@ -2017,14 +1905,14 @@ xfs_remove(
        } else {
                /*
                 * When removing a non-directory we need to log the parent
-                * inode here for the i_gen update.  For a directory this is
-                * done implicitly by the xfs_droplink call for the ".." entry.
+                * inode here.  For a directory this is done implicitly
+                * by the xfs_droplink call for the ".." entry.
                 */
                xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
        }
 
        /*
-        * Drop the "." link from ip to self.
+        * Drop the link from dp to ip.
         */
        error = xfs_droplink(tp, ip);
        if (error)
@@ -2178,7 +2066,6 @@ xfs_link(
        if (error)
                goto abort_return;
        xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       tdp->i_gen++;
        xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
        error = xfs_bumplink(tp, sip);
@@ -2355,18 +2242,10 @@ xfs_mkdir(
        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
-       /*
-        * Bump the in memory version number of the parent directory
-        * so that other processes accessing it will recognize that
-        * the directory has changed.
-        */
-       dp->i_gen++;
-
        error = xfs_dir_init(tp, cdp, dp);
        if (error)
                goto error2;
 
-       cdp->i_gen = 1;
        error = xfs_bumplink(tp, dp);
        if (error)
                goto error2;
@@ -2652,13 +2531,6 @@ xfs_symlink(
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
-       /*
-        * Bump the in memory version number of the parent directory
-        * so that other processes accessing it will recognize that
-        * the directory has changed.
-        */
-       dp->i_gen++;
-
        /*
         * If this is a synchronous mount, make sure that the
         * symlink transaction goes to disk before returning to
@@ -2809,7 +2681,7 @@ xfs_reclaim(
                return 0;
        }
 
-       vn_iowait(ip);
+       xfs_ioend_wait(ip);
 
        ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
 
@@ -2833,122 +2705,10 @@ xfs_reclaim(
        if (!ip->i_update_core && (ip->i_itemp == NULL)) {
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_iflock(ip);
-               return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
-       } else {
-               xfs_mount_t     *mp = ip->i_mount;
-
-               /* Protect sync and unpin from us */
-               XFS_MOUNT_ILOCK(mp);
-               spin_lock(&ip->i_flags_lock);
-               __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-               VFS_I(ip)->i_private = NULL;
-               ip->i_vnode = NULL;
-               spin_unlock(&ip->i_flags_lock);
-               list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
-               XFS_MOUNT_IUNLOCK(mp);
-       }
-       return 0;
-}
-
-int
-xfs_finish_reclaim(
-       xfs_inode_t     *ip,
-       int             locked,
-       int             sync_mode)
-{
-       xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-       struct inode    *vp = VFS_I(ip);
-
-       if (vp && VN_BAD(vp))
-               goto reclaim;
-
-       /* The hash lock here protects a thread in xfs_iget_core from
-        * racing with us on linking the inode back with a vnode.
-        * Once we have the XFS_IRECLAIM flag set it will not touch
-        * us.
-        */
-       write_lock(&pag->pag_ici_lock);
-       spin_lock(&ip->i_flags_lock);
-       if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
-           (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
-               spin_unlock(&ip->i_flags_lock);
-               write_unlock(&pag->pag_ici_lock);
-               if (locked) {
-                       xfs_ifunlock(ip);
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               }
-               return 1;
-       }
-       __xfs_iflags_set(ip, XFS_IRECLAIM);
-       spin_unlock(&ip->i_flags_lock);
-       write_unlock(&pag->pag_ici_lock);
-       xfs_put_perag(ip->i_mount, pag);
-
-       /*
-        * If the inode is still dirty, then flush it out.  If the inode
-        * is not in the AIL, then it will be OK to flush it delwri as
-        * long as xfs_iflush() does not keep any references to the inode.
-        * We leave that decision up to xfs_iflush() since it has the
-        * knowledge of whether it's OK to simply do a delwri flush of
-        * the inode or whether we need to wait until the inode is
-        * pulled from the AIL.
-        * We get the flush lock regardless, though, just to make sure
-        * we don't free it while it is being flushed.
-        */
-       if (!locked) {
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_iflock(ip);
+               xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+               return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
        }
-
-       /*
-        * In the case of a forced shutdown we rely on xfs_iflush() to
-        * wait for the inode to be unpinned before returning an error.
-        */
-       if (xfs_iflush(ip, sync_mode) == 0) {
-               /* synchronize with xfs_iflush_done */
-               xfs_iflock(ip);
-               xfs_ifunlock(ip);
-       }
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- reclaim:
-       xfs_ireclaim(ip);
-       return 0;
-}
-
-int
-xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
-{
-       int             purged;
-       xfs_inode_t     *ip, *n;
-       int             done = 0;
-
-       while (!done) {
-               purged = 0;
-               XFS_MOUNT_ILOCK(mp);
-               list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
-                       if (noblock) {
-                               if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
-                                       continue;
-                               if (xfs_ipincount(ip) ||
-                                   !xfs_iflock_nowait(ip)) {
-                                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                       continue;
-                               }
-                       }
-                       XFS_MOUNT_IUNLOCK(mp);
-                       if (xfs_finish_reclaim(ip, noblock,
-                                       XFS_IFLUSH_DELWRI_ELSE_ASYNC))
-                               delay(1);
-                       purged = 1;
-                       break;
-               }
-
-               done = !purged;
-       }
-
-       XFS_MOUNT_IUNLOCK(mp);
+       xfs_inode_set_reclaim_tag(ip);
        return 0;
 }
 
@@ -3197,6 +2957,8 @@ xfs_zero_remaining_bytes(
        bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
                                XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp);
+       if (!bp)
+               return XFS_ERROR(ENOMEM);
 
        for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
                offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -3312,7 +3074,8 @@ xfs_free_file_space(
                need_iolock = 0;
        if (need_iolock) {
                xfs_ilock(ip, XFS_IOLOCK_EXCL);
-               vn_iowait(ip);  /* wait for the completion of any pending DIOs */
+               /* wait for the completion of any pending DIOs */
+               xfs_ioend_wait(ip);
        }
 
        rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -3474,7 +3237,6 @@ xfs_change_file_space(
        int             cmd,
        xfs_flock64_t   *bf,
        xfs_off_t       offset,
-       cred_t          *credp,
        int             attr_flags)
 {
        xfs_mount_t     *mp = ip->i_mount;
@@ -3562,7 +3324,7 @@ xfs_change_file_space(
                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = startoffset;
 
-               error = xfs_setattr(ip, &iattr, attr_flags, credp);
+               error = xfs_setattr(ip, &iattr, attr_flags);
 
                if (error)
                        return error;
index 7b0c2ab88333885b5f9075b1b99c2d14b71b4578..76df328c61b42c16641ed4e7a8a18021efba7253 100644 (file)
@@ -14,9 +14,7 @@ struct xfs_inode;
 struct xfs_iomap;
 
 
-int xfs_open(struct xfs_inode *ip);
-int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
-               cred_t *credp);
+int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define        XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
 #define        XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if operation would block */
 #define XFS_ATTR_NOLOCK                0x04    /* Don't grab any conflicting locks */
@@ -44,8 +42,7 @@ int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
-               xfs_flock64_t *bf, xfs_off_t offset,
-               cred_t *credp, int      attr_flags);
+               xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
 int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
                struct xfs_inode *src_ip, struct xfs_inode *target_dp,
                struct xfs_name *target_name, struct xfs_inode *target_ip);
@@ -56,8 +53,6 @@ int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
 int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
                int flags, struct attrlist_cursor_kern *cursor);
-int xfs_ioctl(struct xfs_inode *ip, struct file *filp,
-               int ioflags, unsigned int cmd, void __user *arg);
 ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
                const struct iovec *iovp, unsigned int segs,
                loff_t *offset, int ioflags);
@@ -78,5 +73,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
                xfs_off_t last, int fiopt);
 int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
                xfs_off_t last, uint64_t flags, int fiopt);
+int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
 
 #endif /* _XFS_VNODEOPS_H */
index 195a8cb2a749912bc0e8d930b489a8552e3a8121..001ded4845b4b9d4e1a5105fbbb1682d7961efca 100644 (file)
@@ -82,6 +82,14 @@ extern int dir_notify_enable;
    (specialy hack for floppy.c) */
 #define FMODE_WRITE_IOCTL      ((__force fmode_t)128)
 
+/*
+ * Don't update ctime and mtime.
+ *
+ * Currently a special hack for the XFS open_by_handle ioctl, but we'll
+ * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
+ */
+#define FMODE_NOCMTIME         ((__force fmode_t)2048)
+
 #define RW_MASK                1
 #define RWA_MASK       2
 #define READ 0
@@ -1877,7 +1885,9 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
 
 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
 
+extern struct inode * inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void inode_add_to_lists(struct super_block *, struct inode *);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
index c35da23ab8fb025f0fd0a03b54f90df3fe246cad..fafeb48f27c046d0b6586e2797b40397a6a76f19 100644 (file)
@@ -730,7 +730,6 @@ static const struct trans_ctl_table trans_fs_quota_table[] = {
 };
 
 static const struct trans_ctl_table trans_fs_xfs_table[] = {
-       { XFS_RESTRICT_CHOWN,   "restrict_chown" },
        { XFS_SGID_INHERIT,     "irix_sgid_inherit" },
        { XFS_SYMLINK_MODE,     "irix_symlink_mode" },
        { XFS_PANIC_MASK,       "panic_mask" },