www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
ocfs2: Remove delete inode vote
author: Tiger Yang <tiger.yang@oracle.com>
Tue, 20 Mar 2007 23:01:38 +0000 (16:01 -0700)
committer: Mark Fasheh <mark.fasheh@oracle.com>
Thu, 26 Apr 2007 21:39:48 +0000 (14:39 -0700)
Ocfs2 currently does cluster-wide node messaging to check the open state of
an inode during delete. This patch removes that mechanism in favor of an
inode cluster lock which is taken at shared read when an inode is first read
and dropped in clear_inode(). This allows a deleting node to test the
liveness of an inode by attempting to take an exclusive lock.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
fs/ocfs2/cluster/tcp_internal.h
fs/ocfs2/dlmglue.c
fs/ocfs2/dlmglue.h
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/namei.c
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/ocfs2_lockid.h
fs/ocfs2/super.c

index 4dae5df5e4670eb39e0dac036c8284aae8d82d3b..9606111fe89d0964bb5c29d80427ddaddff6ab7f 100644 (file)
@@ -38,6 +38,9 @@
  * locking semantics of the file system using the protocol.  It should 
  * be somewhere else, I'm sure, but right now it isn't.
  *
+ * New in version 8:
+ *     - Replace delete inode votes with a cluster lock
+ *
  * New in version 7:
  *     - DLM join domain includes the live nodemap
  *
@@ -57,7 +60,7 @@
  *     - full 64 bit i_size in the metadata lock lvbs
  *     - introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
 struct o2net_handshake {
        __be64  protocol_version;
        __be64  connector_id;
index 31d519a6dbd29680587b5e92a761b575c4d8e054..ca4f0e0e75879f47ced061140e0a31a22cb81fb6 100644 (file)
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
        .flags          = 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+       .get_osb        = ocfs2_get_inode_osb,
+       .flags          = 0,    /* open lock sets no lockres flags */
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
                lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
-               lockres->l_type == OCFS2_LOCK_TYPE_RW;
+               lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+               lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
 
 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                case OCFS2_LOCK_TYPE_DATA:
                        ops = &ocfs2_inode_data_lops;
                        break;
+               case OCFS2_LOCK_TYPE_OPEN:
+                       ops = &ocfs2_inode_open_lops;
+                       break;
                default:
                        mlog_bug_on_msg(1, "type: %d\n", type);
                        ops = NULL; /* thanks, gcc */
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
                goto bail;
        }
 
+       ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+       if (ret) {
+               mlog_errno(ret);
+               goto bail;
+       }
+
 bail:
        mlog_exit(ret);
        return ret;
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
        mlog_exit_void();
 }
 
+/*
+ * ocfs2_open_lock always takes the open lock in PR mode. On a local
+ * mount no cluster lock is requested and 0 is returned.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+       int status = 0;
+       struct ocfs2_lock_res *lockres;
+       struct ocfs2_super *osb;
+
+       /* Check inode before it is first dereferenced below. */
+       BUG_ON(!inode);
+       osb = OCFS2_SB(inode->i_sb);
+
+       mlog_entry_void();
+       mlog(0, "inode %llu take PRMODE open lock\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+       if (ocfs2_mount_local(osb))
+               goto out;
+
+       lockres = &OCFS2_I(inode)->ip_open_lockres;
+       status = ocfs2_cluster_lock(osb, lockres, LKM_PRMODE, 0, 0);
+       if (status < 0)
+               mlog_errno(status);
+
+out:
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * Try (LKM_NOQUEUE) to take the open lock: EXMODE if @write, else PRMODE.
+ */
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+       int status = 0, level;
+       struct ocfs2_lock_res *lockres;
+       struct ocfs2_super *osb;
+
+       /* Check inode before it is first dereferenced below. */
+       BUG_ON(!inode);
+       osb = OCFS2_SB(inode->i_sb);
+
+       mlog_entry_void();
+       mlog(0, "inode %llu try to take %s open lock\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+            write ? "EXMODE" : "PRMODE");
+
+       if (ocfs2_mount_local(osb))
+               goto out;
+
+       lockres = &OCFS2_I(inode)->ip_open_lockres;
+       level = write ? LKM_EXMODE : LKM_PRMODE;
+       /*
+        * The file system may already be holding a PRMODE/EXMODE open lock.
+        * Since we pass LKM_NOQUEUE, the request won't block waiting on
+        * other nodes and the -EAGAIN will indicate to the caller that
+        * this inode is still in use.
+        */
+       status = ocfs2_cluster_lock(osb, lockres, level, LKM_NOQUEUE, 0);
+out:
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * ocfs2_open_unlock drops the PR and/or EX mode open lock, whichever
+ * has a non-zero holder count on the inode's open lockres. Nothing is
+ * released on a local mount.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_lock_res *open_res = &OCFS2_I(inode)->ip_open_lockres;
+
+       mlog_entry_void();
+
+       mlog(0, "inode %llu drop open lock\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+       if (!ocfs2_mount_local(osb)) {
+               /* The holder counts say which levels to release. */
+               if (open_res->l_ro_holders)
+                       ocfs2_cluster_unlock(osb, open_res, LKM_PRMODE);
+
+               if (open_res->l_ex_holders)
+                       ocfs2_cluster_unlock(osb, open_res, LKM_EXMODE);
+       }
+
+       mlog_exit_void();
+}
+
 int ocfs2_data_lock_full(struct inode *inode,
                         int write,
                         int arg_flags)
@@ -2455,12 +2563,19 @@ int ocfs2_drop_inode_locks(struct inode *inode)
         * ocfs2_clear_inode has done it for us. */
 
        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-                             &OCFS2_I(inode)->ip_data_lockres);
+                             &OCFS2_I(inode)->ip_open_lockres);
        if (err < 0)
                mlog_errno(err);
 
        status = err;
 
+       err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+                             &OCFS2_I(inode)->ip_data_lockres);
+       if (err < 0)
+               mlog_errno(err);
+       if (err < 0 && !status)
+               status = err;
+
        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
                              &OCFS2_I(inode)->ip_meta_lockres);
        if (err < 0)
index c343fca68cf1ec1643e292f560c8dbf7d7b28414..59cb566e7983b758d1c4bd3f751406bd1ba970aa 100644 (file)
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
                       int write);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_meta_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
                          int *level);
index 28ab56f2b98c50bf8300140c9002b801060c66a9..10d16a9e4fdabc68957f2fb5cf9f12a3d527ab79 100644 (file)
@@ -289,7 +289,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                     (unsigned long long)fe->i_blkno);
 
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
-       OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
        OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
 
        inode->i_nlink = le16_to_cpu(fe->i_links_count);
@@ -347,6 +346,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
                                          OCFS2_LOCK_TYPE_META, 0, inode);
+
+               ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+                                         OCFS2_LOCK_TYPE_OPEN, 0, inode);
        }
 
        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +423,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
         * cluster lock before trusting anything anyway.
         */
        can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
-               && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+               && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
                && !ocfs2_mount_local(osb);
 
        /*
@@ -438,7 +440,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                                  OCFS2_LOCK_TYPE_META,
                                  generation, inode);
 
+       ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+                                 OCFS2_LOCK_TYPE_OPEN,
+                                 0, inode);
+
        if (can_lock) {
+               status = ocfs2_open_lock(inode);
+               if (status) {
+                       make_bad_inode(inode);
+                       mlog_errno(status);
+                       return status;
+               }
                status = ocfs2_meta_lock(inode, NULL, 0);
                if (status) {
                        make_bad_inode(inode);
@@ -447,6 +459,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
                }
        }
 
+       if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+               status = ocfs2_try_open_lock(inode, 0);
+               if (status) {
+                       make_bad_inode(inode);  
+                       return status;
+               }
+       }
+
        status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
                                  can_lock ? inode : NULL);
        if (status < 0) {
@@ -678,10 +698,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
        struct inode *orphan_dir_inode = NULL;
        struct buffer_head *orphan_dir_bh = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_dinode *di;
 
-       /* We've already voted on this so it should be readonly - no
-        * spinlock needed. */
-       orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+       di = (struct ocfs2_dinode *) di_bh->b_data;
+       orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
 
        status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
        if (status)
@@ -787,6 +807,35 @@ bail:
        return ret;
 }
 
+/*
+ * Test via an EX open-lock trylock whether this inode may be deleted.
+ */
+static int ocfs2_request_delete(struct inode *inode)
+{
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int ret;
+
+       if (ocfs2_inode_is_new(inode) ||
+           ocfs2_node_map_is_only(osb, &osb->mounted_map, osb->node_num))
+               return 0;
+       /*
+        * This is how ocfs2 determines whether an inode is still live
+        * within the cluster. Every node takes a shared read lock on
+        * the inode open lock in ocfs2_read_locked_inode(). When we
+        * get to ->delete_inode(), each node tries to convert its
+        * lock to an exclusive. Trylocks are serialized by the inode
+        * meta data lock. If the upconvert succeeds, we know the inode
+        * is no longer live and can be deleted.
+        *
+        * Though we call this with the meta data lock held, the
+        * trylock keeps us from ABBA deadlock.
+        */
+       ret = ocfs2_try_open_lock(inode, 1);
+       if (ret < 0 && ret != -EAGAIN)
+               mlog_errno(ret);
+       return ret;
+}
+
 /* Query the cluster to determine whether we should wipe an inode from
  * disk or not.
  *
@@ -839,11 +888,11 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
                goto bail;
        }
 
-       status = ocfs2_request_delete_vote(inode);
-       /* -EBUSY means that other nodes are still using the
+       status = ocfs2_request_delete(inode);
+       /* -EAGAIN means that other nodes are still using the
         * inode. We're done here though, so avoid doing anything on
         * disk and let them worry about deleting it. */
-       if (status == -EBUSY) {
+       if (status == -EAGAIN) {
                status = 0;
                mlog(0, "Skipping delete of %llu because it is in use on"
                     "other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +903,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
                goto bail;
        }
 
-       spin_lock(&oi->ip_lock);
-       if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
-               /* Nobody knew which slot this inode was orphaned
-                * into. This may happen during node death and
-                * recovery knows how to clean it up so we can safely
-                * ignore this inode for now on. */
-               mlog(0, "Nobody knew where inode %llu was orphaned!\n",
-                    (unsigned long long)oi->ip_blkno);
-       } else {
-               *wipe = 1;
-
-               mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
-                    (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
-       }
-       spin_unlock(&oi->ip_lock);
+       *wipe = 1;
+       mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
+            (unsigned long long)oi->ip_blkno,
+            le16_to_cpu(di->i_orphaned_slot));
 
 bail:
        return status;
@@ -1001,11 +1039,16 @@ void ocfs2_clear_inode(struct inode *inode)
        mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
                        "Inode=%lu\n", inode->i_ino);
 
+       /* The open lock held since the inode was read replaces the old
+        * delete_inode vote; drop our PR and EX open locks now. */
+       ocfs2_open_unlock(inode);
+
        /* Do these before all the other work so that we don't bounce
         * the vote thread while waiting to destroy the locks. */
        ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+       ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
        /* We very well may get a clear_inode before all an inodes
         * metadata has hit disk. Of course, we can't drop any cluster
@@ -1030,6 +1073,7 @@ void ocfs2_clear_inode(struct inode *inode)
        ocfs2_lock_res_free(&oi->ip_rw_lockres);
        ocfs2_lock_res_free(&oi->ip_meta_lockres);
        ocfs2_lock_res_free(&oi->ip_data_lockres);
+       ocfs2_lock_res_free(&oi->ip_open_lockres);
 
        ocfs2_metadata_cache_purge(inode);
 
@@ -1086,9 +1130,6 @@ void ocfs2_drop_inode(struct inode *inode)
        mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
             (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
 
-       /* Testing ip_orphaned_slot here wouldn't work because we may
-        * not have gotten a delete_inode vote from any other nodes
-        * yet. */
        if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
                generic_delete_inode(inode);
        else
index 1a7dd2945b34a7bd5f608dddea23b17f4e2d4b9a..92d4feb34d742193c07b0f138955fb88d511e8bb 100644 (file)
@@ -34,6 +34,7 @@ struct ocfs2_inode_info
        struct ocfs2_lock_res           ip_rw_lockres;
        struct ocfs2_lock_res           ip_meta_lockres;
        struct ocfs2_lock_res           ip_data_lockres;
+       struct ocfs2_lock_res           ip_open_lockres;
 
        /* protects allocation changes on this inode. */
        struct rw_semaphore             ip_alloc_sem;
@@ -119,8 +120,8 @@ void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_NOWAIT   0x1
 #define OCFS2_FI_FLAG_DELETE   0x2
-#define OCFS2_FI_FLAG_SYSFILE  0x4
-#define OCFS2_FI_FLAG_NOLOCK   0x8
+#define OCFS2_FI_FLAG_SYSFILE          0x4
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY  0x8
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
 struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
                                     u64 blkno,
index 825cb0ae1b4c812862bbff0d8da5608497611eb7..12445a31f733d592fb98879ef591fab8bdd079c1 100644 (file)
@@ -1306,7 +1306,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
                                continue;
 
                        iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
-                                         OCFS2_FI_FLAG_NOLOCK);
+                                         OCFS2_FI_FLAG_ORPHAN_RECOVERY);
                        if (IS_ERR(iter))
                                continue;
 
@@ -1418,7 +1418,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                /* Set the proper information to get us going into
                 * ocfs2_delete_inode. */
                oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-               oi->ip_orphaned_slot = slot;
                spin_unlock(&oi->ip_lock);
 
                iput(inode);
index 1fff0c02d98b09f17d03c56c9d940c08937f5b99..a93c15fdcef32c2064c4bbb2ebef9479418a59e1 100644 (file)
@@ -187,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
         * unlink. */
        spin_lock(&oi->ip_lock);
        oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
-       oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
        spin_unlock(&oi->ip_lock);
 
 bail_add:
@@ -2220,9 +2219,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
        /* Record which orphan dir our inode now resides
         * in. delete_inode will use this to determine which orphan
         * dir to lock. */
-       spin_lock(&OCFS2_I(inode)->ip_lock);
-       OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
-       spin_unlock(&OCFS2_I(inode)->ip_lock);
+       fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
 
        mlog(0, "Inode %llu orphaned in slot %d\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
index e61e218f5e0b791e51f351f89a8a1afb9c4fc0a2..a476b63e2e6037f3abaa85d193949a1b5986abae 100644 (file)
@@ -446,7 +446,9 @@ struct ocfs2_dinode {
        __le32 i_ctime_nsec;
        __le32 i_mtime_nsec;
        __le32 i_attr;
-       __le32 i_reserved1;
+       __le16 i_orphaned_slot;         /* Only valid when OCFS2_ORPHANED_FL
+                                          was set in i_flags */
+       __le16 i_reserved1;
 /*70*/ __le64 i_reserved2[8];
 /*B8*/ union {
                __le64 i_pad1;          /* Generic way to refer to this
index 4d5d5655c185085c9e259b6669bcd35cd29d3068..4ca02b1c38ac548ab3abf61424bcc72b9464c26b 100644 (file)
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_RENAME,
        OCFS2_LOCK_TYPE_RW,
        OCFS2_LOCK_TYPE_DENTRY,
+       OCFS2_LOCK_TYPE_OPEN,
        OCFS2_NUM_LOCK_TYPES
 };
 
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_DENTRY:
                        c = 'N';
                        break;
+               case OCFS2_LOCK_TYPE_OPEN:
+                       c = 'O';
+                       break;
                default:
                        c = '\0';
        }
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
         * important job it does, anyway. */
        [OCFS2_LOCK_TYPE_RW] = "Write/Read",
        [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+       [OCFS2_LOCK_TYPE_OPEN] = "Open",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
index 6534f92424dd213a4471a8da553439f9843daa37..16564ea6c1412834ad6e10b23a530152a053bddd 100644 (file)
@@ -963,6 +963,7 @@ static void ocfs2_inode_init_once(void *data,
                ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
                ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
                ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+               ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
                ocfs2_metadata_cache_init(&oi->vfs_inode);