]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfashe...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 18 Apr 2008 17:15:22 +0000 (10:15 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 18 Apr 2008 17:15:22 +0000 (10:15 -0700)
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (64 commits)
  ocfs2/net: Add debug interface to o2net
  ocfs2: Only build ocfs2/dlm with the o2cb stack module
  ocfs2/cluster: Get rid of arguments to the timeout routines
  ocfs2: Put tree in MAINTAINERS
  ocfs2: Use BUG_ON
  ocfs2: Convert ocfs2 over to unlocked_ioctl
  ocfs2: Improve rename locking
  fs/ocfs2/aops.c: test for IS_ERR rather than 0
  ocfs2: Add inode stealing for ocfs2_reserve_new_inode
  ocfs2: Add ac_alloc_slot in ocfs2_alloc_context
  ocfs2: Add a new parameter for ocfs2_reserve_suballoc_bits
  ocfs2: Enable cross extent block merge.
  ocfs2: Add support for cross extent block
  ocfs2: Move /sys/o2cb to /sys/fs/o2cb
  sysfs: Allow removal of symlinks in the sysfs root
  ocfs2:  Reconnect after idle time out.
  ocfs2/dlm: Cleanup lockres print
  ocfs2/dlm: Fix lockname in lockres print function
  ocfs2/dlm: Move dlm_print_one_mle() from dlmmaster.c to dlmdebug.c
  ocfs2/dlm: Dumps the purgelist into a debugfs file
  ...

47 files changed:
Documentation/ABI/obsolete/o2cb [new file with mode: 0644]
Documentation/ABI/stable/o2cb [new file with mode: 0644]
Documentation/ABI/testing/sysfs-ocfs2 [new file with mode: 0644]
Documentation/feature-removal-schedule.txt
MAINTAINERS
fs/Kconfig
fs/ocfs2/Makefile
fs/ocfs2/alloc.c
fs/ocfs2/aops.c
fs/ocfs2/cluster/Makefile
fs/ocfs2/cluster/netdebug.c [new file with mode: 0644]
fs/ocfs2/cluster/nodemanager.c
fs/ocfs2/cluster/sys.c
fs/ocfs2/cluster/tcp.c
fs/ocfs2/cluster/tcp.h
fs/ocfs2/cluster/tcp_internal.h
fs/ocfs2/dlm/Makefile
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmdebug.c
fs/ocfs2/dlm/dlmdebug.h [new file with mode: 0644]
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmlock.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlmglue.c
fs/ocfs2/dlmglue.h
fs/ocfs2/file.c
fs/ocfs2/heartbeat.c
fs/ocfs2/heartbeat.h
fs/ocfs2/ioctl.c
fs/ocfs2/ioctl.h
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/localalloc.c
fs/ocfs2/namei.c
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/ocfs2_lockid.h
fs/ocfs2/slot_map.c
fs/ocfs2/slot_map.h
fs/ocfs2/stack_o2cb.c [new file with mode: 0644]
fs/ocfs2/stack_user.c [new file with mode: 0644]
fs/ocfs2/stackglue.c [new file with mode: 0644]
fs/ocfs2/stackglue.h [new file with mode: 0644]
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/sysfs/symlink.c

diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/obsolete/o2cb
new file mode 100644 (file)
index 0000000..9c49d8e
--- /dev/null
@@ -0,0 +1,11 @@
+What:          /sys/o2cb symlink
+Date:          Dec 2005
+KernelVersion: 2.6.16
+Contact:       ocfs2-devel@oss.oracle.com
+Description:   This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will
+               be removed when new versions of ocfs2-tools which know to look
+               in /sys/fs/o2cb are sufficiently prevalent. Don't code new
+               software to look here; it should try /sys/fs/o2cb instead.
+               See Documentation/ABI/stable/o2cb for more information on usage.
+Users:         ocfs2-tools. It's sufficient to mail proposed changes to
+               ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/stable/o2cb b/Documentation/ABI/stable/o2cb
new file mode 100644 (file)
index 0000000..5eb1545
--- /dev/null
@@ -0,0 +1,10 @@
+What:          /sys/fs/o2cb/ (was /sys/o2cb)
+Date:          Dec 2005
+KernelVersion: 2.6.16
+Contact:       ocfs2-devel@oss.oracle.com
+Description:   Ocfs2-tools looks at 'interface-revision' for versioning
+               information. Each logmask/ file controls a set of debug prints
+               and can be written into with the strings "allow", "deny", or
+               "off". Reading the file returns the current state.
+Users:         ocfs2-tools. It's sufficient to mail proposed changes to
+               ocfs2-devel@oss.oracle.com.
diff --git a/Documentation/ABI/testing/sysfs-ocfs2 b/Documentation/ABI/testing/sysfs-ocfs2
new file mode 100644 (file)
index 0000000..b7cc516
--- /dev/null
@@ -0,0 +1,89 @@
+What:          /sys/fs/ocfs2/
+Date:          April 2008
+Contact:       ocfs2-devel@oss.oracle.com
+Description:
+               The /sys/fs/ocfs2 directory contains knobs used by the
+               ocfs2-tools to interact with the filesystem.
+
+What:          /sys/fs/ocfs2/max_locking_protocol
+Date:          April 2008
+Contact:       ocfs2-devel@oss.oracle.com
+Description:
+               The /sys/fs/ocfs2/max_locking_protocol file displays the
+               version of ocfs2 locking supported by the filesystem.  This
+               version covers how ocfs2 uses distributed locking between
+               cluster nodes.
+
+               The protocol version has a major and minor number.  Two
+               cluster nodes can interoperate if they have an identical
+               major number and an overlapping minor number - thus,
+               a node with version 1.10 can interoperate with a node
+               sporting version 1.8, as long as both use the 1.8 protocol.
+
+               Reading from this file returns a single line, the major
+               number and minor number joined by a period, e.g. "1.10".
+
+               This file is read-only.  The value is compiled into the
+               driver.
+
+What:          /sys/fs/ocfs2/loaded_cluster_plugins
+Date:          April 2008
+Contact:       ocfs2-devel@oss.oracle.com
+Description:
+               The /sys/fs/ocfs2/loaded_cluster_plugins file describes
+               the available plugins to support ocfs2 cluster operation.
+               A cluster plugin is required to use ocfs2 in a cluster.
+               There are currently two available plugins:
+
+               * 'o2cb' - The classic o2cb cluster stack that ocfs2 has
+                       used since its inception.
+               * 'user' - A plugin supporting userspace cluster software
+                       in conjunction with fs/dlm.
+
+               Reading from this file returns the names of all loaded
+               plugins, one per line.
+
+               This file is read-only.  Its contents may change as
+               plugins are loaded or removed.
+
+What:          /sys/fs/ocfs2/active_cluster_plugin
+Date:          April 2008
+Contact:       ocfs2-devel@oss.oracle.com
+Description:
+               The /sys/fs/ocfs2/active_cluster_plugin file displays which
+               cluster plugin is currently in use by the filesystem.
+               The active plugin will appear in the loaded_cluster_plugins
+               file as well.  Only one plugin can be used at a time.
+
+               Reading from this file returns the name of the active plugin
+               on a single line.
+
+               This file is read-only.  Which plugin is active depends on
+               the cluster stack in use.  The contents may change
+               when all filesystems are unmounted and the cluster stack
+               is changed.
+
+What:          /sys/fs/ocfs2/cluster_stack
+Date:          April 2008
+Contact:       ocfs2-devel@oss.oracle.com
+Description:
+               The /sys/fs/ocfs2/cluster_stack file contains the name
+               of the current ocfs2 cluster stack.  This value is set by
+               userspace tools when bringing the cluster stack online.
+
+               Cluster stack names are 4 characters in length.
+
+               When the 'o2cb' cluster stack is used, the 'o2cb' cluster
+               plugin is active.  All other cluster stacks use the 'user'
+               cluster plugin.
+
+               Reading from this file returns the name of the current
+               cluster stack on a single line.
+
+               Writing a new stack name to this file changes the current
+               cluster stack unless there are mounted ocfs2 filesystems.
+               If there are mounted filesystems, attempts to change the
+               stack return an error.
+
+Users:
+       ocfs2-tools <ocfs2-tools-devel@oss.oracle.com>
index 164c89394cff14bd378236420e31a95651aed4c9..4b70622a8a914b900f2910779f52a90042797323 100644 (file)
@@ -318,3 +318,13 @@ Why:       Not used in-tree. The current out-of-tree users used it to
        code / infrastructure should be in the kernel and not in some
        out-of-tree driver.
 Who:   Thomas Gleixner <tglx@linutronix.de>
+
+---------------------------
+
+What:  /sys/o2cb symlink
+When:  January 2010
+Why:   /sys/fs/o2cb is the proper location for this information - /sys/o2cb
+       exists as a symlink for backwards compatibility for old versions of
+       ocfs2-tools. 2 years should be sufficient time to phase in new versions
+       which know to look in /sys/fs/o2cb.
+Who:   ocfs2-devel@oss.oracle.com
index 3eceebb48c929db30d5513a010d104d3102b9cfe..974ee8ddb12cdafa726468781ad65d06737cffc7 100644 (file)
@@ -2952,6 +2952,7 @@ P:        Joel Becker
 M:     joel.becker@oracle.com
 L:     ocfs2-devel@oss.oracle.com
 W:     http://oss.oracle.com/projects/ocfs2/
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2.git
 S:     Supported
 
 OMNIKEY CARDMAN 4000 DRIVER
index c509123bea49175bc2a823f838798d07f58689d9..028ae38ecc52538038d76051c6862345fe431349 100644 (file)
@@ -444,6 +444,32 @@ config OCFS2_FS
          For more information on OCFS2, see the file
          <file:Documentation/filesystems/ocfs2.txt>.
 
+config OCFS2_FS_O2CB
+       tristate "O2CB Kernelspace Clustering"
+       depends on OCFS2_FS
+       default y
+       help
+         OCFS2 includes a simple kernelspace clustering package, the OCFS2
+         Cluster Base.  It only requires a very small userspace component
+         to configure it. This comes with the standard ocfs2-tools package.
+         O2CB is limited to maintaining a cluster for OCFS2 file systems.
+         It cannot manage any other cluster applications.
+
+         It is always safe to say Y here, as the clustering method is
+         run-time selectable.
+
+config OCFS2_FS_USERSPACE_CLUSTER
+       tristate "OCFS2 Userspace Clustering"
+       depends on OCFS2_FS && DLM
+       default y
+       help
+         This option will allow OCFS2 to use userspace clustering services
+         in conjunction with the DLM in fs/dlm.  If you are using a
+         userspace cluster manager, say Y here.
+
+         It is safe to say Y, as the clustering method is run-time
+         selectable.
+
 config OCFS2_DEBUG_MASKLOG
        bool "OCFS2 logging support"
        depends on OCFS2_FS
index 4d4ce48bb42c2eb29f4ecb099b5bd63a97040e2a..f6956de56fdb8e96b0e592f5ed4b95f98a7d3320 100644 (file)
@@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2
 
 EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
 
-obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+obj-$(CONFIG_OCFS2_FS) +=      \
+       ocfs2.o                 \
+       ocfs2_stackglue.o
+
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
+obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 
 ocfs2-objs := \
        alloc.o                 \
@@ -31,5 +36,10 @@ ocfs2-objs := \
        uptodate.o              \
        ver.o
 
+ocfs2_stackglue-objs := stackglue.o
+ocfs2_stack_o2cb-objs := stack_o2cb.o
+ocfs2_stack_user-objs := stack_user.o
+
+# cluster/ is always needed when OCFS2_FS for masklog support
 obj-$(CONFIG_OCFS2_FS) += cluster/
-obj-$(CONFIG_OCFS2_FS) += dlm/
+obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
index 447206eb5c2e85fe1c0a4e1542f3a3083cac0982..41f84c92094fc114854f9dfd5ff98f4f470a3bec 100644 (file)
@@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
        BUG_ON(!next_free);
 
        /* The tree code before us didn't allow enough room in the leaf. */
-       if (el->l_next_free_rec == el->l_count && !has_empty)
-               BUG();
+       BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
 
        /*
         * The easiest way to approach this is to just remove the
@@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
  *   - When our insert into the right path leaf is at the leftmost edge
  *     and requires an update of the path immediately to it's left. This
  *     can occur at the end of some types of rotation and appending inserts.
+ *   - When we've adjusted the last extent record in the left path leaf and the
+ *     1st extent record in the right path leaf during cross extent block merge.
  */
 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
                                       struct ocfs2_path *left_path,
@@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
        }
 }
 
+static int ocfs2_get_right_path(struct inode *inode,
+                               struct ocfs2_path *left_path,
+                               struct ocfs2_path **ret_right_path)
+{
+       int ret;
+       u32 right_cpos;
+       struct ocfs2_path *right_path = NULL;
+       struct ocfs2_extent_list *left_el;
+
+       *ret_right_path = NULL;
+
+       /* This function shouldn't be called for non-trees. */
+       BUG_ON(left_path->p_tree_depth == 0);
+
+       left_el = path_leaf_el(left_path);
+       BUG_ON(left_el->l_next_free_rec != left_el->l_count);
+
+       ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+                                            &right_cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /* This function shouldn't be called for the rightmost leaf. */
+       BUG_ON(right_cpos == 0);
+
+       right_path = ocfs2_new_path(path_root_bh(left_path),
+                                   path_root_el(left_path));
+       if (!right_path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_find_path(inode, right_path, right_cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       *ret_right_path = right_path;
+out:
+       if (ret)
+               ocfs2_free_path(right_path);
+       return ret;
+}
+
 /*
  * Remove split_rec clusters from the record at index and merge them
- * onto the beginning of the record at index + 1.
+ * onto the beginning of the record "next" to it.
+ * For index < l_count - 1, the next means the extent rec at index + 1.
+ * For index == l_count - 1, the "next" means the 1st extent rec of the
+ * next extent block.
  */
-static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
-                               handle_t *handle,
-                               struct ocfs2_extent_rec *split_rec,
-                               struct ocfs2_extent_list *el, int index)
+static int ocfs2_merge_rec_right(struct inode *inode,
+                                struct ocfs2_path *left_path,
+                                handle_t *handle,
+                                struct ocfs2_extent_rec *split_rec,
+                                int index)
 {
-       int ret;
+       int ret, next_free, i;
        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
        struct ocfs2_extent_rec *left_rec;
        struct ocfs2_extent_rec *right_rec;
+       struct ocfs2_extent_list *right_el;
+       struct ocfs2_path *right_path = NULL;
+       int subtree_index = 0;
+       struct ocfs2_extent_list *el = path_leaf_el(left_path);
+       struct buffer_head *bh = path_leaf_bh(left_path);
+       struct buffer_head *root_bh = NULL;
 
        BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
-
        left_rec = &el->l_recs[index];
-       right_rec = &el->l_recs[index + 1];
+
+       if (index == le16_to_cpu(el->l_next_free_rec - 1) &&
+           le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
+               /* we meet with a cross extent block merge. */
+               ret = ocfs2_get_right_path(inode, left_path, &right_path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               right_el = path_leaf_el(right_path);
+               next_free = le16_to_cpu(right_el->l_next_free_rec);
+               BUG_ON(next_free <= 0);
+               right_rec = &right_el->l_recs[0];
+               if (ocfs2_is_empty_extent(right_rec)) {
+                       BUG_ON(le16_to_cpu(next_free) <= 1);
+                       right_rec = &right_el->l_recs[1];
+               }
+
+               BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+                      le16_to_cpu(left_rec->e_leaf_clusters) !=
+                      le32_to_cpu(right_rec->e_cpos));
+
+               subtree_index = ocfs2_find_subtree_root(inode,
+                                                       left_path, right_path);
+
+               ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+                                                     handle->h_buffer_credits,
+                                                     right_path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               root_bh = left_path->p_node[subtree_index].bh;
+               BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+               ret = ocfs2_journal_access(handle, inode, root_bh,
+                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               for (i = subtree_index + 1;
+                    i < path_num_items(right_path); i++) {
+                       ret = ocfs2_journal_access(handle, inode,
+                                                  right_path->p_node[i].bh,
+                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       ret = ocfs2_journal_access(handle, inode,
+                                                  left_path->p_node[i].bh,
+                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+               }
+
+       } else {
+               BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
+               right_rec = &el->l_recs[index + 1];
+       }
 
        ret = ocfs2_journal_access(handle, inode, bh,
                                   OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
        if (ret)
                mlog_errno(ret);
 
+       if (right_path) {
+               ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+               if (ret)
+                       mlog_errno(ret);
+
+               ocfs2_complete_edge_insert(inode, handle, left_path,
+                                          right_path, subtree_index);
+       }
+out:
+       if (right_path)
+               ocfs2_free_path(right_path);
+       return ret;
+}
+
+static int ocfs2_get_left_path(struct inode *inode,
+                              struct ocfs2_path *right_path,
+                              struct ocfs2_path **ret_left_path)
+{
+       int ret;
+       u32 left_cpos;
+       struct ocfs2_path *left_path = NULL;
+
+       *ret_left_path = NULL;
+
+       /* This function shouldn't be called for non-trees. */
+       BUG_ON(right_path->p_tree_depth == 0);
+
+       ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+                                           right_path, &left_cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /* This function shouldn't be called for the leftmost leaf. */
+       BUG_ON(left_cpos == 0);
+
+       left_path = ocfs2_new_path(path_root_bh(right_path),
+                                  path_root_el(right_path));
+       if (!left_path) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_find_path(inode, left_path, left_cpos);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       *ret_left_path = left_path;
 out:
+       if (ret)
+               ocfs2_free_path(left_path);
        return ret;
 }
 
 /*
  * Remove split_rec clusters from the record at index and merge them
- * onto the tail of the record at index - 1.
+ * onto the tail of the record "before" it.
+ * For index > 0, the "before" means the extent rec at index - 1.
+ *
+ * For index == 0, the "before" means the last record of the previous
+ * extent block. And there is also a situation that we may need to
+ * remove the rightmost leaf extent block in the right_path and change
+ * the right path to indicate the new rightmost path.
  */
-static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
+static int ocfs2_merge_rec_left(struct inode *inode,
+                               struct ocfs2_path *right_path,
                                handle_t *handle,
                                struct ocfs2_extent_rec *split_rec,
-                               struct ocfs2_extent_list *el, int index)
+                               struct ocfs2_cached_dealloc_ctxt *dealloc,
+                               int index)
 {
-       int ret, has_empty_extent = 0;
+       int ret, i, subtree_index = 0, has_empty_extent = 0;
        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
        struct ocfs2_extent_rec *left_rec;
        struct ocfs2_extent_rec *right_rec;
+       struct ocfs2_extent_list *el = path_leaf_el(right_path);
+       struct buffer_head *bh = path_leaf_bh(right_path);
+       struct buffer_head *root_bh = NULL;
+       struct ocfs2_path *left_path = NULL;
+       struct ocfs2_extent_list *left_el;
 
-       BUG_ON(index <= 0);
+       BUG_ON(index < 0);
 
-       left_rec = &el->l_recs[index - 1];
        right_rec = &el->l_recs[index];
-       if (ocfs2_is_empty_extent(&el->l_recs[0]))
-               has_empty_extent = 1;
+       if (index == 0) {
+               /* we meet with a cross extent block merge. */
+               ret = ocfs2_get_left_path(inode, right_path, &left_path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               left_el = path_leaf_el(left_path);
+               BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
+                      le16_to_cpu(left_el->l_count));
+
+               left_rec = &left_el->l_recs[
+                               le16_to_cpu(left_el->l_next_free_rec) - 1];
+               BUG_ON(le32_to_cpu(left_rec->e_cpos) +
+                      le16_to_cpu(left_rec->e_leaf_clusters) !=
+                      le32_to_cpu(split_rec->e_cpos));
+
+               subtree_index = ocfs2_find_subtree_root(inode,
+                                                       left_path, right_path);
+
+               ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
+                                                     handle->h_buffer_credits,
+                                                     left_path);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               root_bh = left_path->p_node[subtree_index].bh;
+               BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+               ret = ocfs2_journal_access(handle, inode, root_bh,
+                                          OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               for (i = subtree_index + 1;
+                    i < path_num_items(right_path); i++) {
+                       ret = ocfs2_journal_access(handle, inode,
+                                                  right_path->p_node[i].bh,
+                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       ret = ocfs2_journal_access(handle, inode,
+                                                  left_path->p_node[i].bh,
+                                                  OCFS2_JOURNAL_ACCESS_WRITE);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+               }
+       } else {
+               left_rec = &el->l_recs[index - 1];
+               if (ocfs2_is_empty_extent(&el->l_recs[0]))
+                       has_empty_extent = 1;
+       }
 
        ret = ocfs2_journal_access(handle, inode, bh,
                                   OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
                *left_rec = *split_rec;
 
                has_empty_extent = 0;
-       } else {
+       } else
                le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
-       }
 
        le32_add_cpu(&right_rec->e_cpos, split_clusters);
        le64_add_cpu(&right_rec->e_blkno,
@@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
        if (ret)
                mlog_errno(ret);
 
+       if (left_path) {
+               ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+               if (ret)
+                       mlog_errno(ret);
+
+               /*
+                * In the situation that the right_rec is empty and the extent
+                * block is empty also,  ocfs2_complete_edge_insert can't handle
+                * it and we need to delete the right extent block.
+                */
+               if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
+                   le16_to_cpu(el->l_next_free_rec) == 1) {
+
+                       ret = ocfs2_remove_rightmost_path(inode, handle,
+                                                         right_path, dealloc);
+                       if (ret) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       /* Now the rightmost extent block has been deleted.
+                        * So we use the new rightmost path.
+                        */
+                       ocfs2_mv_path(right_path, left_path);
+                       left_path = NULL;
+               } else
+                       ocfs2_complete_edge_insert(inode, handle, left_path,
+                                                  right_path, subtree_index);
+       }
 out:
+       if (left_path)
+               ocfs2_free_path(left_path);
        return ret;
 }
 
 static int ocfs2_try_to_merge_extent(struct inode *inode,
                                     handle_t *handle,
-                                    struct ocfs2_path *left_path,
+                                    struct ocfs2_path *path,
                                     int split_index,
                                     struct ocfs2_extent_rec *split_rec,
                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 
 {
        int ret = 0;
-       struct ocfs2_extent_list *el = path_leaf_el(left_path);
+       struct ocfs2_extent_list *el = path_leaf_el(path);
        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
 
        BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
@@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 * extents - having more than one in a leaf is
                 * illegal.
                 */
-               ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+               ret = ocfs2_rotate_tree_left(inode, handle, path,
                                             dealloc);
                if (ret) {
                        mlog_errno(ret);
@@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 * Left-right contig implies this.
                 */
                BUG_ON(!ctxt->c_split_covers_rec);
-               BUG_ON(split_index == 0);
 
                /*
                 * Since the leftright insert always covers the entire
@@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 * Since the adding of an empty extent shifts
                 * everything back to the right, there's no need to
                 * update split_index here.
+                *
+                * When the split_index is zero, we need to merge it to the
+                * previous extent block. It is more efficient and easier
+                * if we do merge_right first and merge_left later.
                 */
-               ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
-                                          handle, split_rec, el, split_index);
+               ret = ocfs2_merge_rec_right(inode, path,
+                                           handle, split_rec,
+                                           split_index);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 */
                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
-               /*
-                * The left merge left us with an empty extent, remove
-                * it.
-                */
-               ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
+               /* The merge left us with an empty extent, remove it. */
+               ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
-               split_index--;
+
                rec = &el->l_recs[split_index];
 
                /*
                 * Note that we don't pass split_rec here on purpose -
-                * we've merged it into the left side.
+                * we've merged it into the rec already.
                 */
-               ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
-                                           handle, rec, el, split_index);
+               ret = ocfs2_merge_rec_left(inode, path,
+                                          handle, rec,
+                                          dealloc,
+                                          split_index);
+
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }
 
-               BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
-
-               ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+               ret = ocfs2_rotate_tree_left(inode, handle, path,
                                             dealloc);
                /*
                 * Error from this last rotate is not critical, so
@@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                 */
                if (ctxt->c_contig_type == CONTIG_RIGHT) {
                        ret = ocfs2_merge_rec_left(inode,
-                                                  path_leaf_bh(left_path),
-                                                  handle, split_rec, el,
+                                                  path,
+                                                  handle, split_rec,
+                                                  dealloc,
                                                   split_index);
                        if (ret) {
                                mlog_errno(ret);
@@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                        }
                } else {
                        ret = ocfs2_merge_rec_right(inode,
-                                                   path_leaf_bh(left_path),
-                                                   handle, split_rec, el,
+                                                   path,
+                                                   handle, split_rec,
                                                    split_index);
                        if (ret) {
                                mlog_errno(ret);
@@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
                         * The merge may have left an empty extent in
                         * our leaf. Try to rotate it away.
                         */
-                       ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+                       ret = ocfs2_rotate_tree_left(inode, handle, path,
                                                     dealloc);
                        if (ret)
                                mlog_errno(ret);
@@ -3498,20 +3781,57 @@ out:
 }
 
 static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct inode *inode,
+ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
                               struct ocfs2_extent_list *el, int index,
                               struct ocfs2_extent_rec *split_rec)
 {
-       struct ocfs2_extent_rec *rec;
+       int status;
        enum ocfs2_contig_type ret = CONTIG_NONE;
+       u32 left_cpos, right_cpos;
+       struct ocfs2_extent_rec *rec = NULL;
+       struct ocfs2_extent_list *new_el;
+       struct ocfs2_path *left_path = NULL, *right_path = NULL;
+       struct buffer_head *bh;
+       struct ocfs2_extent_block *eb;
+
+       if (index > 0) {
+               rec = &el->l_recs[index - 1];
+       } else if (path->p_tree_depth > 0) {
+               status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+                                                      path, &left_cpos);
+               if (status)
+                       goto out;
+
+               if (left_cpos != 0) {
+                       left_path = ocfs2_new_path(path_root_bh(path),
+                                                  path_root_el(path));
+                       if (!left_path)
+                               goto out;
+
+                       status = ocfs2_find_path(inode, left_path, left_cpos);
+                       if (status)
+                               goto out;
+
+                       new_el = path_leaf_el(left_path);
+
+                       if (le16_to_cpu(new_el->l_next_free_rec) !=
+                           le16_to_cpu(new_el->l_count)) {
+                               bh = path_leaf_bh(left_path);
+                               eb = (struct ocfs2_extent_block *)bh->b_data;
+                               OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                                                eb);
+                               goto out;
+                       }
+                       rec = &new_el->l_recs[
+                               le16_to_cpu(new_el->l_next_free_rec) - 1];
+               }
+       }
 
        /*
         * We're careful to check for an empty extent record here -
         * the merge code will know what to do if it sees one.
         */
-
-       if (index > 0) {
-               rec = &el->l_recs[index - 1];
+       if (rec) {
                if (index == 1 && ocfs2_is_empty_extent(rec)) {
                        if (split_rec->e_cpos == el->l_recs[index].e_cpos)
                                ret = CONTIG_RIGHT;
@@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
                }
        }
 
-       if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
+       rec = NULL;
+       if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
+               rec = &el->l_recs[index + 1];
+       else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
+                path->p_tree_depth > 0) {
+               status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
+                                                       path, &right_cpos);
+               if (status)
+                       goto out;
+
+               if (right_cpos == 0)
+                       goto out;
+
+               right_path = ocfs2_new_path(path_root_bh(path),
+                                           path_root_el(path));
+               if (!right_path)
+                       goto out;
+
+               status = ocfs2_find_path(inode, right_path, right_cpos);
+               if (status)
+                       goto out;
+
+               new_el = path_leaf_el(right_path);
+               rec = &new_el->l_recs[0];
+               if (ocfs2_is_empty_extent(rec)) {
+                       if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
+                               bh = path_leaf_bh(right_path);
+                               eb = (struct ocfs2_extent_block *)bh->b_data;
+                               OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+                                                                eb);
+                               goto out;
+                       }
+                       rec = &new_el->l_recs[1];
+               }
+       }
+
+       if (rec) {
                enum ocfs2_contig_type contig_type;
 
-               rec = &el->l_recs[index + 1];
                contig_type = ocfs2_extent_contig(inode, rec, split_rec);
 
                if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
@@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
                        ret = contig_type;
        }
 
+out:
+       if (left_path)
+               ocfs2_free_path(left_path);
+       if (right_path)
+               ocfs2_free_path(right_path);
+
        return ret;
 }
 
@@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
                goto out;
        }
 
-       ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
+       ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
                                                            split_index,
                                                            split_rec);
 
@@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
        status = ocfs2_flush_truncate_log(osb);
        if (status < 0)
                mlog_errno(status);
+       else
+               ocfs2_init_inode_steal_slot(osb);
 
        mlog_exit(status);
 }
index 90383ed6100530d6d10e5edae9b437278138fa0e..17964c0505a9785daa29a88f3086dcfab3ac2c1e 100644 (file)
@@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
                                                         unsigned to)
 {
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-       handle_t *handle = NULL;
+       handle_t *handle;
        int ret = 0;
 
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-       if (!handle) {
+       if (IS_ERR(handle)) {
                ret = -ENOMEM;
                mlog_errno(ret);
                goto out;
@@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
        }
 out:
        if (ret) {
-               if (handle)
+               if (!IS_ERR(handle))
                        ocfs2_commit_trans(osb, handle);
                handle = ERR_PTR(ret);
        }
index cdd162f13650b167dd8a9cf49db83b1e2ea5339e..bc8c5e7d8608bc027085aed634d75350071c3d80 100644 (file)
@@ -1,4 +1,4 @@
 obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
 
 ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
-       quorum.o tcp.o ver.o
+       quorum.o tcp.o netdebug.o ver.o
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644 (file)
index 0000000..7bf3c0e
--- /dev/null
@@ -0,0 +1,441 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * netdebug.c
+ *
+ * debug functionality for o2net
+ *
+ * Copyright (C) 2005, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+
+#include <linux/uaccess.h>
+
+#include "tcp.h"
+#include "nodemanager.h"
+#define MLOG_MASK_PREFIX ML_TCP
+#include "masklog.h"
+
+#include "tcp_internal.h"
+
+#define O2NET_DEBUG_DIR                "o2net"
+#define SC_DEBUG_NAME          "sock_containers"
+#define NST_DEBUG_NAME         "send_tracking"
+
+static struct dentry *o2net_dentry;
+static struct dentry *sc_dentry;
+static struct dentry *nst_dentry;
+
+static DEFINE_SPINLOCK(o2net_debug_lock);
+
+static LIST_HEAD(sock_containers);
+static LIST_HEAD(send_tracking);
+
+void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+       spin_lock(&o2net_debug_lock);
+       list_add(&nst->st_net_debug_item, &send_tracking);
+       spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+       spin_lock(&o2net_debug_lock);
+       if (!list_empty(&nst->st_net_debug_item))
+               list_del_init(&nst->st_net_debug_item);
+       spin_unlock(&o2net_debug_lock);
+}
+
+static struct o2net_send_tracking
+                       *next_nst(struct o2net_send_tracking *nst_start)
+{
+       struct o2net_send_tracking *nst, *ret = NULL;
+
+       assert_spin_locked(&o2net_debug_lock);
+
+       list_for_each_entry(nst, &nst_start->st_net_debug_item,
+                           st_net_debug_item) {
+               /* discover the head of the list */
+               if (&nst->st_net_debug_item == &send_tracking)
+                       break;
+
+               /* use st_task to detect real nsts in the list */
+               if (nst->st_task != NULL) {
+                       ret = nst;
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+       spin_lock(&o2net_debug_lock);
+       nst = next_nst(dummy_nst);
+       spin_unlock(&o2net_debug_lock);
+
+       return nst;
+}
+
+static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+       spin_lock(&o2net_debug_lock);
+       nst = next_nst(dummy_nst);
+       list_del_init(&dummy_nst->st_net_debug_item);
+       if (nst)
+               list_add(&dummy_nst->st_net_debug_item,
+                        &nst->st_net_debug_item);
+       spin_unlock(&o2net_debug_lock);
+
+       return nst; /* unused, just needs to be null when done */
+}
+
+static int nst_seq_show(struct seq_file *seq, void *v)
+{
+       struct o2net_send_tracking *nst, *dummy_nst = seq->private;
+
+       spin_lock(&o2net_debug_lock);
+       nst = next_nst(dummy_nst);
+
+       if (nst != NULL) {
+               /* get_task_comm isn't exported.  oh well. */
+               seq_printf(seq, "%p:\n"
+                          "  pid:          %lu\n"
+                          "  tgid:         %lu\n"
+                          "  process name: %s\n"
+                          "  node:         %u\n"
+                          "  sc:           %p\n"
+                          "  message id:   %d\n"
+                          "  message type: %u\n"
+                          "  message key:  0x%08x\n"
+                          "  sock acquiry: %lu.%lu\n"
+                          "  send start:   %lu.%lu\n"
+                          "  wait start:   %lu.%lu\n",
+                          nst, (unsigned long)nst->st_task->pid,
+                          (unsigned long)nst->st_task->tgid,
+                          nst->st_task->comm, nst->st_node,
+                          nst->st_sc, nst->st_id, nst->st_msg_type,
+                          nst->st_msg_key,
+                          nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
+                          nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
+                          nst->st_status_time.tv_sec,
+                          nst->st_status_time.tv_usec);
+       }
+
+       spin_unlock(&o2net_debug_lock);
+
+       return 0;
+}
+
+static void nst_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations nst_seq_ops = {
+       .start = nst_seq_start,
+       .next = nst_seq_next,
+       .stop = nst_seq_stop,
+       .show = nst_seq_show,
+};
+
+static int nst_fop_open(struct inode *inode, struct file *file)
+{
+       struct o2net_send_tracking *dummy_nst;
+       struct seq_file *seq;
+       int ret;
+
+       dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
+       if (dummy_nst == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       dummy_nst->st_task = NULL;
+
+       ret = seq_open(file, &nst_seq_ops);
+       if (ret)
+               goto out;
+
+       seq = file->private_data;
+       seq->private = dummy_nst;
+       o2net_debug_add_nst(dummy_nst);
+
+       dummy_nst = NULL;
+
+out:
+       kfree(dummy_nst);
+       return ret;
+}
+
+static int nst_fop_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+       struct o2net_send_tracking *dummy_nst = seq->private;
+
+       o2net_debug_del_nst(dummy_nst);
+       return seq_release_private(inode, file);
+}
+
+static struct file_operations nst_seq_fops = {
+       .open = nst_fop_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = nst_fop_release,
+};
+
+void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+       spin_lock(&o2net_debug_lock);
+       list_add(&sc->sc_net_debug_item, &sock_containers);
+       spin_unlock(&o2net_debug_lock);
+}
+
+void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+       spin_lock(&o2net_debug_lock);
+       list_del_init(&sc->sc_net_debug_item);
+       spin_unlock(&o2net_debug_lock);
+}
+
+static struct o2net_sock_container
+                       *next_sc(struct o2net_sock_container *sc_start)
+{
+       struct o2net_sock_container *sc, *ret = NULL;
+
+       assert_spin_locked(&o2net_debug_lock);
+
+       list_for_each_entry(sc, &sc_start->sc_net_debug_item,
+                           sc_net_debug_item) {
+               /* discover the head of the list miscast as a sc */
+               if (&sc->sc_net_debug_item == &sock_containers)
+                       break;
+
+               /* use sc_page to detect real scs in the list */
+               if (sc->sc_page != NULL) {
+                       ret = sc;
+                       break;
+               }
+       }
+
+       return ret;
+}
+
+static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+       spin_lock(&o2net_debug_lock);
+       sc = next_sc(dummy_sc);
+       spin_unlock(&o2net_debug_lock);
+
+       return sc;
+}
+
+static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+       spin_lock(&o2net_debug_lock);
+       sc = next_sc(dummy_sc);
+       list_del_init(&dummy_sc->sc_net_debug_item);
+       if (sc)
+               list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
+       spin_unlock(&o2net_debug_lock);
+
+       return sc; /* unused, just needs to be null when done */
+}
+
+#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
+
+static int sc_seq_show(struct seq_file *seq, void *v)
+{
+       struct o2net_sock_container *sc, *dummy_sc = seq->private;
+
+       spin_lock(&o2net_debug_lock);
+       sc = next_sc(dummy_sc);
+
+       if (sc != NULL) {
+               struct inet_sock *inet = NULL;
+
+               __be32 saddr = 0, daddr = 0;
+               __be16 sport = 0, dport = 0;
+
+               if (sc->sc_sock) {
+                       inet = inet_sk(sc->sc_sock->sk);
+                       /* the stack's structs aren't sparse endian clean */
+                       saddr = (__force __be32)inet->saddr;
+                       daddr = (__force __be32)inet->daddr;
+                       sport = (__force __be16)inet->sport;
+                       dport = (__force __be16)inet->dport;
+               }
+
+               /* XXX sigh, inet-> doesn't have sparse annotation so any
+                * use of it here generates a warning with -Wbitwise */
+               seq_printf(seq, "%p:\n"
+                          "  krefs:           %d\n"
+                          "  sock:            %u.%u.%u.%u:%u -> "
+                                             "%u.%u.%u.%u:%u\n"
+                          "  remote node:     %s\n"
+                          "  page off:        %zu\n"
+                          "  handshake ok:    %u\n"
+                          "  timer:           %lu.%lu\n"
+                          "  data ready:      %lu.%lu\n"
+                          "  advance start:   %lu.%lu\n"
+                          "  advance stop:    %lu.%lu\n"
+                          "  func start:      %lu.%lu\n"
+                          "  func stop:       %lu.%lu\n"
+                          "  func key:        %u\n"
+                          "  func type:       %u\n",
+                          sc,
+                          atomic_read(&sc->sc_kref.refcount),
+                          NIPQUAD(saddr), inet ? ntohs(sport) : 0,
+                          NIPQUAD(daddr), inet ? ntohs(dport) : 0,
+                          sc->sc_node->nd_name,
+                          sc->sc_page_off,
+                          sc->sc_handshake_ok,
+                          TV_SEC_USEC(sc->sc_tv_timer),
+                          TV_SEC_USEC(sc->sc_tv_data_ready),
+                          TV_SEC_USEC(sc->sc_tv_advance_start),
+                          TV_SEC_USEC(sc->sc_tv_advance_stop),
+                          TV_SEC_USEC(sc->sc_tv_func_start),
+                          TV_SEC_USEC(sc->sc_tv_func_stop),
+                          sc->sc_msg_key,
+                          sc->sc_msg_type);
+       }
+
+
+       spin_unlock(&o2net_debug_lock);
+
+       return 0;
+}
+
+static void sc_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static struct seq_operations sc_seq_ops = {
+       .start = sc_seq_start,
+       .next = sc_seq_next,
+       .stop = sc_seq_stop,
+       .show = sc_seq_show,
+};
+
+static int sc_fop_open(struct inode *inode, struct file *file)
+{
+       struct o2net_sock_container *dummy_sc;
+       struct seq_file *seq;
+       int ret;
+
+       dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
+       if (dummy_sc == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       dummy_sc->sc_page = NULL;
+
+       ret = seq_open(file, &sc_seq_ops);
+       if (ret)
+               goto out;
+
+       seq = file->private_data;
+       seq->private = dummy_sc;
+       o2net_debug_add_sc(dummy_sc);
+
+       dummy_sc = NULL;
+
+out:
+       kfree(dummy_sc);
+       return ret;
+}
+
+static int sc_fop_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+       struct o2net_sock_container *dummy_sc = seq->private;
+
+       o2net_debug_del_sc(dummy_sc);
+       return seq_release_private(inode, file);
+}
+
+static struct file_operations sc_seq_fops = {
+       .open = sc_fop_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = sc_fop_release,
+};
+
+int o2net_debugfs_init(void)
+{
+       o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+       if (!o2net_dentry) {
+               mlog_errno(-ENOMEM);
+               goto bail;
+       }
+
+       nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
+                                        o2net_dentry, NULL,
+                                        &nst_seq_fops);
+       if (!nst_dentry) {
+               mlog_errno(-ENOMEM);
+               goto bail;
+       }
+
+       sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
+                                       o2net_dentry, NULL,
+                                       &sc_seq_fops);
+       if (!sc_dentry) {
+               mlog_errno(-ENOMEM);
+               goto bail;
+       }
+
+       return 0;
+bail:
+       if (sc_dentry)
+               debugfs_remove(sc_dentry);
+       if (nst_dentry)
+               debugfs_remove(nst_dentry);
+       if (o2net_dentry)
+               debugfs_remove(o2net_dentry);
+       return -ENOMEM;
+}
+
+void o2net_debugfs_exit(void)
+{
+       if (sc_dentry)
+               debugfs_remove(sc_dentry);
+       if (nst_dentry)
+               debugfs_remove(nst_dentry);
+       if (o2net_dentry)
+               debugfs_remove(o2net_dentry);
+}
+
+#endif /* CONFIG_DEBUG_FS */
index 709fba25bf7e1b4ed4b1f89b12f6fb632450a5b8..cf9401e8cd0b221e4187dbd4624c05e2070a3d89 100644 (file)
@@ -959,7 +959,10 @@ static int __init init_o2nm(void)
        cluster_print_version();
 
        o2hb_init();
-       o2net_init();
+
+       ret = o2net_init();
+       if (ret)
+               goto out;
 
        ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
        if (!ocfs2_table_header) {
index 0c095ce7723d87db01bbb5d980e0fd4643764eb7..98429fd684996c8b34b448f304ebc38d552f5ca7 100644 (file)
@@ -57,6 +57,7 @@ static struct kset *o2cb_kset;
 void o2cb_sys_shutdown(void)
 {
        mlog_sys_shutdown();
+       sysfs_remove_link(NULL, "o2cb");
        kset_unregister(o2cb_kset);
 }
 
@@ -68,6 +69,14 @@ int o2cb_sys_init(void)
        if (!o2cb_kset)
                return -ENOMEM;
 
+       /*
+        * Create this symlink for backwards compatibility with old
+        * versions of ocfs2-tools which look for things in /sys/o2cb.
+        */
+       ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
+       if (ret)
+               goto error;
+
        ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
        if (ret)
                goto error;
index b8057c51b20523f560a1bb2cbe5e12b8d6a3403a..1e44ad14881a70ea205230da6c2d313f5b0b0dae 100644 (file)
@@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
 
-/*
- * FIXME: These should use to_o2nm_cluster_from_node(), but we end up
- * losing our parent link to the cluster during shutdown. This can be
- * solved by adding a pre-removal callback to configfs, or passing
- * around the cluster with the node. -jeffm
- */
-static inline int o2net_reconnect_delay(struct o2nm_node *node)
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+                          u32 msgkey, struct task_struct *task, u8 node)
+{
+#ifdef CONFIG_DEBUG_FS
+       INIT_LIST_HEAD(&nst->st_net_debug_item);
+       nst->st_task = task;
+       nst->st_msg_type = msgtype;
+       nst->st_msg_key = msgkey;
+       nst->st_node = node;
+#endif
+}
+
+static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+       do_gettimeofday(&nst->st_sock_time);
+#endif
+}
+
+static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+       do_gettimeofday(&nst->st_send_time);
+#endif
+}
+
+static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+#ifdef CONFIG_DEBUG_FS
+       do_gettimeofday(&nst->st_status_time);
+#endif
+}
+
+static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+                                        struct o2net_sock_container *sc)
+{
+#ifdef CONFIG_DEBUG_FS
+       nst->st_sc = sc;
+#endif
+}
+
+static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+{
+#ifdef CONFIG_DEBUG_FS
+       nst->st_id = msg_id;
+#endif
+}
+
+static inline int o2net_reconnect_delay(void)
 {
        return o2nm_single_cluster->cl_reconnect_delay_ms;
 }
 
-static inline int o2net_keepalive_delay(struct o2nm_node *node)
+static inline int o2net_keepalive_delay(void)
 {
        return o2nm_single_cluster->cl_keepalive_delay_ms;
 }
 
-static inline int o2net_idle_timeout(struct o2nm_node *node)
+static inline int o2net_idle_timeout(void)
 {
        return o2nm_single_cluster->cl_idle_timeout_ms;
 }
@@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref)
        o2nm_node_put(sc->sc_node);
        sc->sc_node = NULL;
 
+       o2net_debug_del_sc(sc);
        kfree(sc);
 }
 
@@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 
        ret = sc;
        sc->sc_page = page;
+       o2net_debug_add_sc(sc);
        sc = NULL;
        page = NULL;
 
@@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
        mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
        mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
 
-       /* we won't reconnect after our valid conn goes away for
-        * this hb iteration.. here so it shows up in the logs */
        if (was_valid && !valid && err == 0)
                err = -ENOTCONN;
 
@@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 
        if (!was_valid && valid) {
                o2quo_conn_up(o2net_num_from_nn(nn));
-               /* this is a bit of a hack.  we only try reconnecting
-                * when heartbeating starts until we get a connection.
-                * if that connection then dies we don't try reconnecting.
-                * the only way to start connecting again is to down
-                * heartbeat and bring it back up. */
                cancel_delayed_work(&nn->nn_connect_expired);
                printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
                       o2nm_this_node() > sc->sc_node->nd_num ?
@@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn,
                /* delay if we're within a RECONNECT_DELAY of the
                 * last attempt */
                delay = (nn->nn_last_connect_attempt +
-                        msecs_to_jiffies(o2net_reconnect_delay(NULL)))
+                        msecs_to_jiffies(o2net_reconnect_delay()))
                        - jiffies;
-               if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL)))
+               if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
                        delay = 0;
                mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
                queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
+
+               /*
+                * Delay the expired work after idle timeout.
+                *
+                * We might have lots of failed connection attempts that run
+                * through here but we only cancel the connect_expired work when
+                * a connection attempt succeeds.  So only the first enqueue of
+                * the connect_expired work will do anything.  The rest will see
+                * that it's already queued and do nothing.
+                */
+               delay += msecs_to_jiffies(o2net_idle_timeout());
+               queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
        }
 
        /* keep track of the nn's sc ref for the caller */
@@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        struct o2net_status_wait nsw = {
                .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
        };
+       struct o2net_send_tracking nst;
+
+       o2net_init_nst(&nst, msg_type, key, current, target_node);
 
        if (o2net_wq == NULL) {
                mlog(0, "attempt to tx without o2netd running\n");
@@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
                goto out;
        }
 
+       o2net_debug_add_nst(&nst);
+
+       o2net_set_nst_sock_time(&nst);
+
        ret = wait_event_interruptible(nn->nn_sc_wq,
                                       o2net_tx_can_proceed(nn, &sc, &error));
        if (!ret && error)
@@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        if (ret)
                goto out;
 
+       o2net_set_nst_sock_container(&nst, sc);
+
        veclen = caller_veclen + 1;
        vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
        if (vec == NULL) {
@@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
                goto out;
 
        msg->msg_num = cpu_to_be32(nsw.ns_id);
+       o2net_set_nst_msg_id(&nst, nsw.ns_id);
+
+       o2net_set_nst_send_time(&nst);
 
        /* finally, convert the message header to network byte-order
         * and send */
@@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        }
 
        /* wait on other node's handler */
+       o2net_set_nst_status_time(&nst);
        wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
 
        /* Note that we avoid overwriting the callers status return
@@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
        mlog(0, "woken, returning system status %d, user status %d\n",
             ret, nsw.ns_status);
 out:
+       o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
        if (sc)
                sc_put(sc);
        if (vec)
@@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
         * but isn't. This can ultimately cause corruption.
         */
        if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
-                               o2net_idle_timeout(sc->sc_node)) {
+                               o2net_idle_timeout()) {
                mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
                     "%u ms, but we use %u ms locally.  disconnecting\n",
                     SC_NODEF_ARGS(sc),
                     be32_to_cpu(hand->o2net_idle_timeout_ms),
-                    o2net_idle_timeout(sc->sc_node));
+                    o2net_idle_timeout());
                o2net_ensure_shutdown(nn, sc, -ENOTCONN);
                return -1;
        }
 
        if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
-                       o2net_keepalive_delay(sc->sc_node)) {
+                       o2net_keepalive_delay()) {
                mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
                     "%u ms, but we use %u ms locally.  disconnecting\n",
                     SC_NODEF_ARGS(sc),
                     be32_to_cpu(hand->o2net_keepalive_delay_ms),
-                    o2net_keepalive_delay(sc->sc_node));
+                    o2net_keepalive_delay());
                o2net_ensure_shutdown(nn, sc, -ENOTCONN);
                return -1;
        }
@@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
         * shut down already */
        if (nn->nn_sc == sc) {
                o2net_sc_reset_idle_timer(sc);
+               atomic_set(&nn->nn_timeout, 0);
                o2net_set_nn_state(nn, sc, 1, 0);
        }
        spin_unlock(&nn->nn_lock);
@@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void)
 {
        o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
                O2HB_MAX_WRITE_TIMEOUT_MS);
-       o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(
-               o2net_idle_timeout(NULL));
+       o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
        o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
-               o2net_keepalive_delay(NULL));
+               o2net_keepalive_delay());
        o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
-               o2net_reconnect_delay(NULL));
+               o2net_reconnect_delay());
 }
 
 /* ------------------------------------------------------------ */
@@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
 static void o2net_idle_timer(unsigned long data)
 {
        struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+       struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
        struct timeval now;
 
        do_gettimeofday(&now);
 
        printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
             "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
-                    o2net_idle_timeout(sc->sc_node) / 1000,
-                    o2net_idle_timeout(sc->sc_node) % 1000);
+                    o2net_idle_timeout() / 1000,
+                    o2net_idle_timeout() % 1000);
        mlog(ML_NOTICE, "here are some times that might help debug the "
             "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
             "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
@@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data)
             sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
             sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
 
+       /*
+        * Initialize the nn_timeout so that the next connection attempt
+        * will continue in o2net_start_connect.
+        */
+       atomic_set(&nn->nn_timeout, 1);
+
        o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
 
@@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 {
        o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
        o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
-                     msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node)));
+                     msecs_to_jiffies(o2net_keepalive_delay()));
        do_gettimeofday(&sc->sc_tv_timer);
        mod_timer(&sc->sc_idle_timeout,
-              jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node)));
+              jiffies + msecs_to_jiffies(o2net_idle_timeout()));
 }
 
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
@@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work)
        struct socket *sock = NULL;
        struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
        int ret = 0, stop;
+       unsigned int timeout;
 
        /* if we're greater we initiate tx, otherwise we accept */
        if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work)
        }
 
        spin_lock(&nn->nn_lock);
-       /* see if we already have one pending or have given up */
-       stop = (nn->nn_sc || nn->nn_persistent_error);
+       /*
+        * see if we already have one pending or have given up.
+        * For nn_timeout, it is set when we close the connection
+        * because of the idle time out. So it means that we have
+        * at least connected to that node successfully once,
+        * now try to connect to it again.
+        */
+       timeout = atomic_read(&nn->nn_timeout);
+       stop = (nn->nn_sc ||
+               (nn->nn_persistent_error &&
+               (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
        spin_unlock(&nn->nn_lock);
        if (stop)
                goto out;
@@ -1555,8 +1635,8 @@ static void o2net_connect_expired(struct work_struct *work)
                mlog(ML_ERROR, "no connection established with node %u after "
                     "%u.%u seconds, giving up and returning errors.\n",
                     o2net_num_from_nn(nn),
-                    o2net_idle_timeout(NULL) / 1000,
-                    o2net_idle_timeout(NULL) % 1000);
+                    o2net_idle_timeout() / 1000,
+                    o2net_idle_timeout() % 1000);
 
                o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
        }
@@ -1579,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
 
        /* don't reconnect until it's heartbeating again */
        spin_lock(&nn->nn_lock);
+       atomic_set(&nn->nn_timeout, 0);
        o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
        spin_unlock(&nn->nn_lock);
 
@@ -1610,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 
        /* ensure an immediate connect attempt */
        nn->nn_last_connect_attempt = jiffies -
-               (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1);
+               (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
 
        if (node_num != o2nm_this_node()) {
-               /* heartbeat doesn't work unless a local node number is
-                * configured and doing so brings up the o2net_wq, so we can
-                * use it.. */
-               queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
-                                  msecs_to_jiffies(o2net_idle_timeout(node)));
-
                /* believe it or not, accept and node hearbeating testing
                 * can succeed for this node before we got here.. so
                 * only use set_nn_state to clear the persistent error
                 * if that hasn't already happened */
                spin_lock(&nn->nn_lock);
+               atomic_set(&nn->nn_timeout, 0);
                if (nn->nn_persistent_error)
                        o2net_set_nn_state(nn, NULL, 0, 0);
                spin_unlock(&nn->nn_lock);
@@ -1747,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock)
        new_sock = NULL;
 
        spin_lock(&nn->nn_lock);
+       atomic_set(&nn->nn_timeout, 0);
        o2net_set_nn_state(nn, sc, 0, 0);
        spin_unlock(&nn->nn_lock);
 
@@ -1922,6 +1999,9 @@ int o2net_init(void)
 
        o2quo_init();
 
+       if (o2net_debugfs_init())
+               return -ENOMEM;
+
        o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
        o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
        o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1941,6 +2021,7 @@ int o2net_init(void)
        for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
                struct o2net_node *nn = o2net_nn_from_num(i);
 
+               atomic_set(&nn->nn_timeout, 0);
                spin_lock_init(&nn->nn_lock);
                INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
                INIT_DELAYED_WORK(&nn->nn_connect_expired,
@@ -1962,4 +2043,5 @@ void o2net_exit(void)
        kfree(o2net_hand);
        kfree(o2net_keep_req);
        kfree(o2net_keep_resp);
+       o2net_debugfs_exit();
 }
index f36f66aab3dd788e200ddca6be96546939af03ad..a705d5d1903688ee7e3decb224e15c49f55ca787 100644 (file)
@@ -117,4 +117,36 @@ int o2net_num_connected_peers(void);
 int o2net_init(void);
 void o2net_exit(void);
 
+struct o2net_send_tracking;
+struct o2net_sock_container;
+
+#ifdef CONFIG_DEBUG_FS
+int o2net_debugfs_init(void);
+void o2net_debugfs_exit(void);
+void o2net_debug_add_nst(struct o2net_send_tracking *nst);
+void o2net_debug_del_nst(struct o2net_send_tracking *nst);
+void o2net_debug_add_sc(struct o2net_sock_container *sc);
+void o2net_debug_del_sc(struct o2net_sock_container *sc);
+#else /* !CONFIG_DEBUG_FS: no-op stubs so callers need no #ifdefs */
+static inline int o2net_debugfs_init(void)
+{
+       return 0;
+}
+static inline void o2net_debugfs_exit(void)
+{
+}
+static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_debug_add_sc(struct o2net_sock_container *sc)
+{
+}
+static inline void o2net_debug_del_sc(struct o2net_sock_container *sc)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
 #endif /* O2CLUSTER_TCP_H */
index d25b9af28500d99963623a873e8db3e1298b4683..8d58cfe410b13babe15d68c35b3f62a6be7331c3 100644 (file)
@@ -95,6 +95,8 @@ struct o2net_node {
        unsigned                        nn_sc_valid:1;
        /* if this is set tx just returns it */
        int                             nn_persistent_error;
+       /* It is only set to 1 after the idle time out. */
+       atomic_t                        nn_timeout;
 
        /* threads waiting for an sc to arrive wait on the wq for generation
         * to increase.  it is increased when a connecting socket succeeds
@@ -164,7 +166,9 @@ struct o2net_sock_container {
        /* original handlers for the sockets */
        void                    (*sc_state_change)(struct sock *sk);
        void                    (*sc_data_ready)(struct sock *sk, int bytes);
-
+#ifdef CONFIG_DEBUG_FS
+       struct list_head        sc_net_debug_item;
+#endif
        struct timeval          sc_tv_timer;
        struct timeval          sc_tv_data_ready;
        struct timeval          sc_tv_advance_start;
@@ -206,4 +210,24 @@ struct o2net_status_wait {
        struct list_head        ns_node_item;
 };
 
+#ifdef CONFIG_DEBUG_FS
+/* just for state dumps */
+struct o2net_send_tracking {
+       struct list_head                st_net_debug_item;
+       struct task_struct              *st_task;
+       struct o2net_sock_container     *st_sc;
+       u32                             st_id;
+       u32                             st_msg_type;
+       u32                             st_msg_key;
+       u8                              st_node;
+       struct timeval                  st_sock_time;
+       struct timeval                  st_send_time;
+       struct timeval                  st_status_time;
+};
+#else
+struct o2net_send_tracking {
+       u32     dummy;
+};
+#endif /* CONFIG_DEBUG_FS */
+
 #endif /* O2CLUSTER_TCP_INTERNAL_H */
index ce3f7c29d27013944d65a6c3144d8853ff5882e2..190361375700aa02aeb43f6e87623e9dea39e73a 100644 (file)
@@ -1,6 +1,6 @@
 EXTRA_CFLAGS += -Ifs/ocfs2
 
-obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
        dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
index dc8ea666efdb77004aece531c3a7a6670d6288db..d5a86fb81a4902adff4426b552cd3ae19a5d926d 100644 (file)
 /* Intended to make it easier for us to switch out hash functions */
 #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
 
+enum dlm_mle_type {
+       DLM_MLE_BLOCK,
+       DLM_MLE_MASTER,
+       DLM_MLE_MIGRATION
+};
+
+struct dlm_lock_name {
+       u8 len;
+       u8 name[DLM_LOCKID_NAME_MAX];
+};
+
+struct dlm_master_list_entry {
+       struct list_head list;
+       struct list_head hb_events;
+       struct dlm_ctxt *dlm;
+       spinlock_t spinlock;
+       wait_queue_head_t wq;
+       atomic_t woken;
+       struct kref mle_refs;
+       int inuse;
+       unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       u8 master;
+       u8 new_master;
+       enum dlm_mle_type type;
+       struct o2hb_callback_func mle_hb_up;
+       struct o2hb_callback_func mle_hb_down;
+       union {
+               struct dlm_lock_resource *res;
+               struct dlm_lock_name name;
+       } u;
+};
+
 enum dlm_ast_type {
        DLM_AST = 0,
        DLM_BAST,
@@ -101,6 +136,7 @@ struct dlm_ctxt
        struct list_head purge_list;
        struct list_head pending_asts;
        struct list_head pending_basts;
+       struct list_head tracking_list;
        unsigned int purge_count;
        spinlock_t spinlock;
        spinlock_t ast_lock;
@@ -122,6 +158,9 @@ struct dlm_ctxt
        atomic_t remote_resources;
        atomic_t unknown_resources;
 
+       struct dlm_debug_ctxt *dlm_debug_ctxt;
+       struct dentry *dlm_debugfs_subroot;
+
        /* NOTE: Next three are protected by dlm_domain_lock */
        struct kref dlm_refs;
        enum dlm_ctxt_state dlm_state;
@@ -270,6 +309,9 @@ struct dlm_lock_resource
        struct list_head dirty;
        struct list_head recovering; // dlm_recovery_ctxt.resources list
 
+       /* Added during init and removed during release */
+       struct list_head tracking;      /* dlm->tracking_list */
+
        /* unused lock resources have their last_used stamped and are
         * put on a list for the dlm thread to run. */
        unsigned long    last_used;
@@ -963,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
                                          DLM_LOCK_RES_MIGRATING));
 }
 
+/* create/destroy slab caches */
+int dlm_init_master_caches(void);
+void dlm_destroy_master_caches(void);
+
+int dlm_init_lock_cache(void);
+void dlm_destroy_lock_cache(void);
 
 int dlm_init_mle_cache(void);
 void dlm_destroy_mle_cache(void);
+
 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
                         struct dlm_lock_resource *res);
index 64239b37e5d4bcd9a48c902469f8a85881f38568..5f6d858770a2b9992a7301561467e6b3fdbbdd45 100644 (file)
@@ -5,7 +5,7 @@
  *
  * debug functionality for the dlm
  *
- * Copyright (C) 2004 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -30,6 +30,7 @@
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
 #include <linux/spinlock.h>
+#include <linux/debugfs.h>
 
 #include "cluster/heartbeat.h"
 #include "cluster/nodemanager.h"
 
 #include "dlmapi.h"
 #include "dlmcommon.h"
-
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
 
+int stringify_lockname(const char *lockname, int locklen, char *buf, int len);
+
 void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 {
-       mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
-              res->lockname.len, res->lockname.name,
-              res->owner, res->state);
        spin_lock(&res->spinlock);
        __dlm_print_one_lock_resource(res);
        spin_unlock(&res->spinlock);
@@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
        int bit;
        assert_spin_locked(&res->spinlock);
 
-       mlog(ML_NOTICE, "  refmap nodes: [ ");
+       printk("  refmap nodes: [ ");
        bit = 0;
        while (1) {
                bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
@@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
        printk("], inflight=%u\n", res->inflight_locks);
 }
 
+static void __dlm_print_lock(struct dlm_lock *lock)
+{
+       spin_lock(&lock->spinlock);
+
+       printk("    type=%d, conv=%d, node=%u, cookie=%u:%llu, "
+              "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
+              "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
+              lock->ml.type, lock->ml.convert_type, lock->ml.node,
+              dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+              dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+              atomic_read(&lock->lock_refs.refcount),
+              (list_empty(&lock->ast_list) ? 'y' : 'n'),
+              (lock->ast_pending ? 'y' : 'n'),
+              (list_empty(&lock->bast_list) ? 'y' : 'n'),
+              (lock->bast_pending ? 'y' : 'n'),
+              (lock->convert_pending ? 'y' : 'n'),
+              (lock->lock_pending ? 'y' : 'n'),
+              (lock->cancel_pending ? 'y' : 'n'),
+              (lock->unlock_pending ? 'y' : 'n'));
+
+       spin_unlock(&lock->spinlock);
+}
+
 void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 {
        struct list_head *iter2;
        struct dlm_lock *lock;
+       char buf[DLM_LOCKID_NAME_MAX];
 
        assert_spin_locked(&res->spinlock);
 
-       mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
-              res->lockname.len, res->lockname.name,
-              res->owner, res->state);
-       mlog(ML_NOTICE, "  last used: %lu, on purge list: %s\n",
-            res->last_used, list_empty(&res->purge) ? "no" : "yes");
+       stringify_lockname(res->lockname.name, res->lockname.len,
+                          buf, sizeof(buf) - 1);
+       printk("lockres: %s, owner=%u, state=%u\n",
+              buf, res->owner, res->state);
+       printk("  last used: %lu, refcnt: %u, on purge list: %s\n",
+              res->last_used, atomic_read(&res->refs.refcount),
+              list_empty(&res->purge) ? "no" : "yes");
+       printk("  on dirty list: %s, on reco list: %s, "
+              "migrating pending: %s\n",
+              list_empty(&res->dirty) ? "no" : "yes",
+              list_empty(&res->recovering) ? "no" : "yes",
+              res->migration_pending ? "yes" : "no");
+       printk("  inflight locks: %d, asts reserved: %d\n",
+              res->inflight_locks, atomic_read(&res->asts_reserved));
        dlm_print_lockres_refmap(res);
-       mlog(ML_NOTICE, "  granted queue: \n");
+       printk("  granted queue:\n");
        list_for_each(iter2, &res->granted) {
                lock = list_entry(iter2, struct dlm_lock, list);
-               spin_lock(&lock->spinlock);
-               mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-                      "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-                      lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                    dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                    dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-                      list_empty(&lock->ast_list) ? 'y' : 'n',
-                      lock->ast_pending ? 'y' : 'n',
-                      list_empty(&lock->bast_list) ? 'y' : 'n',
-                      lock->bast_pending ? 'y' : 'n');
-               spin_unlock(&lock->spinlock);
+               __dlm_print_lock(lock);
        }
-       mlog(ML_NOTICE, "  converting queue: \n");
+       printk("  converting queue:\n");
        list_for_each(iter2, &res->converting) {
                lock = list_entry(iter2, struct dlm_lock, list);
-               spin_lock(&lock->spinlock);
-               mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-                      "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-                      lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                    dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                    dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-                      list_empty(&lock->ast_list) ? 'y' : 'n',
-                      lock->ast_pending ? 'y' : 'n',
-                      list_empty(&lock->bast_list) ? 'y' : 'n',
-                      lock->bast_pending ? 'y' : 'n');
-               spin_unlock(&lock->spinlock);
+               __dlm_print_lock(lock);
        }
-       mlog(ML_NOTICE, "  blocked queue: \n");
+       printk("  blocked queue:\n");
        list_for_each(iter2, &res->blocked) {
                lock = list_entry(iter2, struct dlm_lock, list);
-               spin_lock(&lock->spinlock);
-               mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-                      "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-                      lock->ml.type, lock->ml.convert_type, lock->ml.node, 
-                    dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
-                    dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-                      list_empty(&lock->ast_list) ? 'y' : 'n',
-                      lock->ast_pending ? 'y' : 'n',
-                      list_empty(&lock->bast_list) ? 'y' : 'n',
-                      lock->bast_pending ? 'y' : 'n');
-               spin_unlock(&lock->spinlock);
+               __dlm_print_lock(lock);
        }
 }
 
@@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
 }
 EXPORT_SYMBOL_GPL(dlm_print_one_lock);
 
-#if 0
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
-{
-       struct dlm_lock_resource *res;
-       struct hlist_node *iter;
-       struct hlist_head *bucket;
-       int i;
-
-       mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
-                 dlm->name, dlm->node_num, dlm->key);
-       if (!dlm || !dlm->name) {
-               mlog(ML_ERROR, "dlm=%p\n", dlm);
-               return;
-       }
-
-       spin_lock(&dlm->spinlock);
-       for (i=0; i<DLM_HASH_BUCKETS; i++) {
-               bucket = dlm_lockres_hash(dlm, i);
-               hlist_for_each_entry(res, iter, bucket, hash_node)
-                       dlm_print_one_lock_resource(res);
-       }
-       spin_unlock(&dlm->spinlock);
-}
-#endif  /*  0  */
-
 static const char *dlm_errnames[] = {
        [DLM_NORMAL] =                  "DLM_NORMAL",
        [DLM_GRANTED] =                 "DLM_GRANTED",
@@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err)
        return dlm_errnames[err];
 }
 EXPORT_SYMBOL_GPL(dlm_errname);
+
+/* Render a lockname as printable text in buf (len bytes, NUL included) and
+ * return the number of characters stored. This uses knowledge of the
+ * lockname format that should be outside the purview of the dlm; it is
+ * here only to make dlm debugging slightly easier.
+ * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
+ */
+int stringify_lockname(const char *lockname, int locklen, char *buf, int len)
+{
+       __be64 inode_blkno_be;
+
+#define OCFS2_DENTRY_LOCK_INO_START    18
+       if (*lockname != 'N')
+               return scnprintf(buf, len, "%.*s", locklen, lockname);
+
+       /* 'N' names carry a raw big-endian 64-bit value at a fixed offset
+        * (an inode block number, per the variable name): print the text
+        * before it, then the value's low 32 bits in hex.
+        */
+       memcpy(&inode_blkno_be, &lockname[OCFS2_DENTRY_LOCK_INO_START],
+              sizeof(inode_blkno_be));
+       return scnprintf(buf, len, "%.*s%08x",
+                        OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
+                        (unsigned int)be64_to_cpu(inode_blkno_be));
+}
+
+/* Append each set bit of nodemap below maxnodes as "N "; returns length. */
+static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
+                            char *buf, int len)
+{
+       int out = 0, i = -1;
+
+       while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
+               out += scnprintf(buf + out, len - out, "%d ", i);
+
+       return out;
+}
+
+/* Format one mle into buf (len bytes): a summary line, then its Maybe, Vote,
+ * Response and Node maps. Returns the number of characters added. */
+static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
+{
+       int out = 0;
+       unsigned int namelen;
+       const char *name;
+       char *mle_type;
+
+       if (mle->type != DLM_MLE_MASTER) {
+               namelen = mle->u.name.len;
+               name = mle->u.name.name;
+       } else {        /* master entries take the name from their lockres */
+               namelen = mle->u.res->lockname.len;
+               name = mle->u.res->lockname.name;
+       }
+
+       if (mle->type == DLM_MLE_BLOCK)
+               mle_type = "BLK";
+       else if (mle->type == DLM_MLE_MASTER)
+               mle_type = "MAS";
+       else
+               mle_type = "MIG";
+
+       out += stringify_lockname(name, namelen, buf + out, len - out);
+       out += scnprintf(buf + out, len - out,
+                        "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
+                        mle_type, mle->master, mle->new_master,
+                        !list_empty(&mle->hb_events),
+                        !!mle->inuse,
+                        atomic_read(&mle->mle_refs.refcount));
+
+       out += scnprintf(buf + out, len - out, "Maybe=");
+       out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
+                                buf + out, len - out);
+       out += scnprintf(buf + out, len - out, "\n");
+
+       out += scnprintf(buf + out, len - out, "Vote=");
+       out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
+                                buf + out, len - out);
+       out += scnprintf(buf + out, len - out, "\n");
+
+       out += scnprintf(buf + out, len - out, "Response=");
+       out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
+                                buf + out, len - out);
+       out += scnprintf(buf + out, len - out, "\n");
+
+       out += scnprintf(buf + out, len - out, "Node=");
+       out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
+                                buf + out, len - out);
+       out += scnprintf(buf + out, len - out, "\n\n"); /* + blank separator */
+
+       return out;
+}
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+{
+       char *buf = (char *) get_zeroed_page(GFP_NOFS); /* scratch page */
+
+       if (buf) {
+               dump_mle(mle, buf, PAGE_SIZE - 1);
+               printk("%s", buf);      /* emit it: formatting alone shows nothing */
+               free_page((unsigned long)buf);
+       }
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static struct dentry *dlm_debugfs_root = NULL;
+
+#define DLM_DEBUGFS_DIR                                "o2dlm"
+#define DLM_DEBUGFS_DLM_STATE                  "dlm_state"
+#define DLM_DEBUGFS_LOCKING_STATE              "locking_state"
+#define DLM_DEBUGFS_MLE_STATE                  "mle_state"
+#define DLM_DEBUGFS_PURGE_LIST                 "purge_list"
+
+/* begin - utils funcs */
+static void dlm_debug_free(struct kref *kref)
+{
+       struct dlm_debug_ctxt *dc;
+
+       dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
+
+       kfree(dc);
+}
+
+void dlm_debug_put(struct dlm_debug_ctxt *dc)
+{
+       if (dc)
+               kref_put(&dc->debug_refcnt, dlm_debug_free);
+}
+
+static void dlm_debug_get(struct dlm_debug_ctxt *dc)
+{
+       kref_get(&dc->debug_refcnt);
+}
+
+static struct debug_buffer *debug_buffer_allocate(void)
+{
+       struct debug_buffer *db = NULL;
+
+       db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
+       if (!db)
+               goto bail;
+
+       db->len = PAGE_SIZE;
+       db->buf = kmalloc(db->len, GFP_KERNEL);
+       if (!db->buf)
+               goto bail;
+
+       return db;
+bail:
+       kfree(db);
+       return NULL;
+}
+
+static ssize_t debug_buffer_read(struct file *file, char __user *buf,
+                                size_t nbytes, loff_t *ppos)
+{
+       struct debug_buffer *db = file->private_data;
+
+       return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
+}
+
+static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
+{
+       struct debug_buffer *db = file->private_data;
+       loff_t new = -1;
+
+       switch (whence) {
+       case 0:
+               new = off;
+               break;
+       case 1:
+               new = file->f_pos + off;
+               break;
+       }
+
+       if (new < 0 || new > db->len)
+               return -EINVAL;
+
+       return (file->f_pos = new);
+}
+
+static int debug_buffer_release(struct inode *inode, struct file *file)
+{
+       struct debug_buffer *db = (struct debug_buffer *)file->private_data;
+
+       if (db)
+               kfree(db->buf);
+       kfree(db);
+
+       return 0;
+}
+/* end - util funcs */
+
+/* begin - purge list funcs */
+/* Dump dlm->purge_list into db->buf; returns the accumulated length. */
+static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+       struct dlm_lock_resource *res;
+       int out = 0;
+       unsigned long total = 0;
+
+       out += scnprintf(db->buf + out, db->len - out,
+                        "Dumping Purgelist for Domain: %s\n", dlm->name);
+
+       spin_lock(&dlm->spinlock);
+       list_for_each_entry(res, &dlm->purge_list, purge) {
+               ++total;
+               if (db->len - out < 100)
+                       continue;       /* nearly full: count, don't print */
+               spin_lock(&res->spinlock);
+               out += stringify_lockname(res->lockname.name, res->lockname.len,
+                                         db->buf + out, db->len - out);
+               out += scnprintf(db->buf + out, db->len - out, "\t%lu\n",
+                                (jiffies - res->last_used)/HZ);
+               spin_unlock(&res->spinlock);
+       }
+       spin_unlock(&dlm->spinlock);
+
+       out += scnprintf(db->buf + out, db->len - out,
+                        "Total on list: %lu\n", total);
+
+       return out;
+}
+
+static int debug_purgelist_open(struct inode *inode, struct file *file)
+{
+       struct dlm_ctxt *dlm = inode->i_private;
+       struct debug_buffer *db;
+
+       db = debug_buffer_allocate();
+       if (!db)
+               goto bail;
+
+       db->len = debug_purgelist_print(dlm, db);
+
+       file->private_data = db;
+
+       return 0;
+bail:
+       return -ENOMEM;
+}
+
+static struct file_operations debug_purgelist_fops = {
+       .open =         debug_purgelist_open,
+       .release =      debug_buffer_release,
+       .read =         debug_buffer_read,
+       .llseek =       debug_buffer_llseek,
+};
+/* end - purge list funcs */
+
+/* begin - debug mle funcs */
+/* Dump dlm->master_list into db->buf; returns the accumulated length. */
+static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+       struct dlm_master_list_entry *mle;
+       int out = 0;
+       unsigned long total = 0;
+
+       out += scnprintf(db->buf + out, db->len - out,
+                        "Dumping MLEs for Domain: %s\n", dlm->name);
+
+       spin_lock(&dlm->master_lock);
+       list_for_each_entry(mle, &dlm->master_list, list) {
+               ++total;
+               if (db->len - out >= 200)       /* else just count it */
+                       out += dump_mle(mle, db->buf + out, db->len - out);
+       }
+       spin_unlock(&dlm->master_lock);
+
+       out += scnprintf(db->buf + out, db->len - out,
+                        "Total on list: %lu\n", total);
+       return out;
+}
+
+static int debug_mle_open(struct inode *inode, struct file *file)
+{
+       struct dlm_ctxt *dlm = inode->i_private;
+       struct debug_buffer *db;
+
+       db = debug_buffer_allocate();
+       if (!db)
+               goto bail;
+
+       db->len = debug_mle_print(dlm, db);
+
+       file->private_data = db;
+
+       return 0;
+bail:
+       return -ENOMEM;
+}
+
+static struct file_operations debug_mle_fops = {
+       .open =         debug_mle_open,
+       .release =      debug_buffer_release,
+       .read =         debug_buffer_read,
+       .llseek =       debug_buffer_llseek,
+};
+
+/* end - debug mle funcs */
+
+/* begin - debug lockres funcs */
+/* Emit one "LOCK:" record for lock into buf; returns characters stored. */
+static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
+{
+       int out;
+
+#define DEBUG_LOCK_VERSION     1
+       spin_lock(&lock->spinlock);
+       out = scnprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+                       "%d,%d,%d,%d\n",
+                       DEBUG_LOCK_VERSION, list_type, lock->ml.type,
+                       lock->ml.convert_type, lock->ml.node,
+                       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+                       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+                       !list_empty(&lock->ast_list),
+                       !list_empty(&lock->bast_list),
+                       lock->ast_pending, lock->bast_pending,
+                       lock->convert_pending, lock->lock_pending,
+                       lock->cancel_pending, lock->unlock_pending,
+                       atomic_read(&lock->lock_refs.refcount));
+       spin_unlock(&lock->spinlock);
+
+       return out;
+}
+
+/*
+ * Format a lock resource into buf as a multi-line record:
+ *
+ *   NAME: lock name
+ *   LRES: version, owner, state, last_used, list membership flags,
+ *         inflight/migration counters and reference counts
+ *   RMAP: refmap node list
+ *   LVBX: the lock value block as hex
+ *   LOCK: one line per lock on the granted (0), converting (1) and
+ *         blocked (2) lists
+ *
+ * followed by an empty line.  The visible caller, lockres_seq_start(),
+ * holds res->spinlock across the call.
+ *
+ * Returns the number of characters stored.  The pieces formatted here use
+ * scnprintf(), which returns what was actually written, so that "out"
+ * cannot be pushed beyond len by a truncated piece (a resource with many
+ * locks can need more than the one page the caller provides).
+ * NOTE(review): stringify_lockname() and stringify_nodemap() are defined
+ * outside this view; confirm they also return the stored length.
+ */
+static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
+{
+       struct dlm_lock *lock;
+       int i;
+       int out = 0;
+
+       out += scnprintf(buf + out, len - out, "NAME:");
+       out += stringify_lockname(res->lockname.name, res->lockname.len,
+                                 buf + out, len - out);
+       out += scnprintf(buf + out, len - out, "\n");
+
+#define DEBUG_LRES_VERSION     1
+       out += scnprintf(buf + out, len - out,
+                       "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
+                       DEBUG_LRES_VERSION,
+                       res->owner, res->state, res->last_used,
+                       !list_empty(&res->purge),
+                       !list_empty(&res->dirty),
+                       !list_empty(&res->recovering),
+                       res->inflight_locks, res->migration_pending,
+                       atomic_read(&res->asts_reserved),
+                       atomic_read(&res->refs.refcount));
+
+       /* refmap */
+       out += scnprintf(buf + out, len - out, "RMAP:");
+       out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
+                                buf + out, len - out);
+       out += scnprintf(buf + out, len - out, "\n");
+
+       /* lvb */
+       out += scnprintf(buf + out, len - out, "LVBX:");
+       for (i = 0; i < DLM_LVB_LEN; i++)
+               out += scnprintf(buf + out, len - out,
+                                       "%02x", (unsigned char)res->lvb[i]);
+       out += scnprintf(buf + out, len - out, "\n");
+
+       /* granted */
+       list_for_each_entry(lock, &res->granted, list)
+               out += dump_lock(lock, 0, buf + out, len - out);
+
+       /* converting */
+       list_for_each_entry(lock, &res->converting, list)
+               out += dump_lock(lock, 1, buf + out, len - out);
+
+       /* blocked */
+       list_for_each_entry(lock, &res->blocked, list)
+               out += dump_lock(lock, 2, buf + out, len - out);
+
+       out += scnprintf(buf + out, len - out, "\n");
+
+       return out;
+}
+
+/*
+ * seq_file ->start: choose the lock resource to show on this pass and
+ * format it into dl->dl_buf.
+ *
+ * dl->dl_res is the cursor: the resource dumped on the previous pass, held
+ * with a reference.  When it is set, step to the entry that follows it on
+ * the tracking list; otherwise begin with the first entry of
+ * dlm->tracking_list.  The reference is moved from the old cursor to the
+ * new one.  *pos is not consulted.
+ *
+ * Returns dl (which ->show receives as v), or NULL when the list is empty
+ * or the walk has wrapped around to the list head.  dlm->spinlock is held
+ * for the whole function and the resource's own spinlock around the dump.
+ */
+static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
+{
+       struct debug_lockres *dl = m->private;
+       struct dlm_ctxt *dlm = dl->dl_ctxt;
+       struct dlm_lock_resource *res = NULL;
+
+       spin_lock(&dlm->spinlock);
+
+       if (dl->dl_res) {
+               /*
+                * Every path through the body ends in break, so this runs
+                * at most once: res is the entry after the cursor.
+                */
+               list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
+                       /* drop the reference held on the old cursor */
+                       if (dl->dl_res) {
+                               dlm_lockres_put(dl->dl_res);
+                               dl->dl_res = NULL;
+                       }
+                       /* stepped onto the list head: no more resources */
+                       if (&res->tracking == &dlm->tracking_list) {
+                               mlog(0, "End of list found, %p\n", res);
+                               dl = NULL;
+                               break;
+                       }
+                       dlm_lockres_get(res);
+                       dl->dl_res = res;
+                       break;
+               }
+       } else {
+               if (!list_empty(&dlm->tracking_list)) {
+                       /* break at once: only the first entry is wanted */
+                       list_for_each_entry(res, &dlm->tracking_list, tracking)
+                               break;
+                       dlm_lockres_get(res);
+                       dl->dl_res = res;
+               } else
+                       dl = NULL;
+       }
+
+       if (dl) {
+               spin_lock(&dl->dl_res->spinlock);
+               /* dump_lockres()'s return value (length) is not used */
+               dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
+               spin_unlock(&dl->dl_res->spinlock);
+       }
+
+       spin_unlock(&dlm->spinlock);
+
+       return dl;
+}
+
+/*
+ * seq_file ->stop: nothing to undo here; lockres_seq_start() releases the
+ * locks it takes before returning.
+ */
+static void lockres_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+/*
+ * seq_file ->next: always ends the current pass.  The cursor is advanced
+ * by lockres_seq_start() itself, so each start/show pass emits exactly one
+ * lock resource.
+ */
+static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       return NULL;
+}
+
+/*
+ * seq_file ->show: v is the debug_lockres returned by lockres_seq_start();
+ * emit the text that was formatted into its dl_buf.
+ */
+static int lockres_seq_show(struct seq_file *s, void *v)
+{
+       struct debug_lockres *dl = v;
+
+       seq_printf(s, "%s", dl->dl_buf);
+       return 0;
+}
+
+/* seq_file iterator used by the lock resource dump file. */
+static struct seq_operations debug_lockres_ops = {
+       .start =        lockres_seq_start,
+       .next =         lockres_seq_next,
+       .stop =         lockres_seq_stop,
+       .show =         lockres_seq_show,
+};
+
+/*
+ * Open handler for the lock resource dump file.
+ *
+ * Allocates the per-open debug_lockres state plus a PAGE_SIZE text buffer,
+ * opens the seq_file, attaches the state as seq->private and takes a
+ * reference on the domain via dlm_grab().  Returns 0 on success, -ENOMEM
+ * if an allocation fails, or the error reported by seq_open().
+ */
+static int debug_lockres_open(struct inode *inode, struct file *file)
+{
+       struct dlm_ctxt *dlm = inode->i_private;
+       struct debug_lockres *dl;
+       struct seq_file *seq;
+       int ret = -ENOMEM;
+
+       dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
+       if (!dl) {
+               mlog_errno(ret);
+               return ret;
+       }
+
+       dl->dl_len = PAGE_SIZE;
+       dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
+       if (!dl->dl_buf) {
+               mlog_errno(ret);
+               goto free_dl;
+       }
+
+       ret = seq_open(file, &debug_lockres_ops);
+       if (ret) {
+               mlog_errno(ret);
+               goto free_buf;
+       }
+
+       /* seq_open() left its seq_file in file->private_data */
+       seq = file->private_data;
+       seq->private = dl;
+
+       dlm_grab(dlm);
+       dl->dl_ctxt = dlm;
+
+       return 0;
+
+free_buf:
+       kfree(dl->dl_buf);
+free_dl:
+       kfree(dl);
+       return ret;
+}
+
+/*
+ * Release handler for the lock resource dump file: drop the reference on
+ * the cursor resource (if any) and on the domain, free the text buffer,
+ * then let seq_release_private() finish the teardown.
+ */
+static int debug_lockres_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+       struct debug_lockres *dl = seq->private;
+       struct dlm_lock_resource *cursor = dl->dl_res;
+
+       if (cursor)
+               dlm_lockres_put(cursor);
+       dlm_put(dl->dl_ctxt);
+       kfree(dl->dl_buf);
+
+       return seq_release_private(inode, file);
+}
+
+/* File operations behind the per-domain lock resource dump file. */
+static struct file_operations debug_lockres_fops = {
+       .open =         debug_lockres_open,
+       .read =         seq_read,
+       .llseek =       seq_lseek,
+       .release =      debug_lockres_release,
+};
+/* end - debug lockres funcs */
+
+/* begin - debug state funcs */
+/*
+ * Format a human-readable summary of the domain state into db->buf:
+ * domain name/key, thread pid, node number and state, join information,
+ * the domain/live/recovery node maps, resource counters, which lists are
+ * in use, reference counts and the per-node recovery state.
+ *
+ * On entry db->len is the capacity of db->buf.  Returns the number of
+ * characters stored; debug_state_open() stores that value back into
+ * db->len.  scnprintf() is used because snprintf() returns the length
+ * the text *would* have had, so the total could exceed the capacity once
+ * anything is truncated.
+ * NOTE(review): stringify_nodemap() is defined outside this view; confirm
+ * it also returns the stored length.
+ * NOTE(review): dlm_thread_task and dlm_reco_thread_task are dereferenced
+ * unchecked, while dlm_join_domain() calls dlm_debug_init() before
+ * dlm_launch_thread(); confirm they cannot be NULL when this runs.
+ *
+ * The resource counters are sampled before dlm->spinlock is taken;
+ * everything else is read under it.
+ */
+static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
+{
+       int out = 0;
+       struct dlm_reco_node_data *node;
+       char *state;
+       int lres, rres, ures, tres;
+
+       lres = atomic_read(&dlm->local_resources);
+       rres = atomic_read(&dlm->remote_resources);
+       ures = atomic_read(&dlm->unknown_resources);
+       tres = lres + rres + ures;
+
+       spin_lock(&dlm->spinlock);
+
+       switch (dlm->dlm_state) {
+       case DLM_CTXT_NEW:
+               state = "NEW"; break;
+       case DLM_CTXT_JOINED:
+               state = "JOINED"; break;
+       case DLM_CTXT_IN_SHUTDOWN:
+               state = "SHUTDOWN"; break;
+       case DLM_CTXT_LEAVING:
+               state = "LEAVING"; break;
+       default:
+               state = "UNKNOWN"; break;
+       }
+
+       /* Domain: xxxxxxxxxx  Key: 0xdfbac769 */
+       out += scnprintf(db->buf + out, db->len - out,
+                       "Domain: %s  Key: 0x%08x\n", dlm->name, dlm->key);
+
+       /* Thread Pid: xxx  Node: xxx  State: xxxxx */
+       out += scnprintf(db->buf + out, db->len - out,
+                       "Thread Pid: %d  Node: %d  State: %s\n",
+                       dlm->dlm_thread_task->pid, dlm->node_num, state);
+
+       /* Number of Joins: xxx  Joining Node: xxx */
+       out += scnprintf(db->buf + out, db->len - out,
+                       "Number of Joins: %d  Joining Node: %d\n",
+                       dlm->num_joins, dlm->joining_node);
+
+       /* Domain Map: xx xx xx */
+       out += scnprintf(db->buf + out, db->len - out, "Domain Map: ");
+       out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
+                                db->buf + out, db->len - out);
+       out += scnprintf(db->buf + out, db->len - out, "\n");
+
+       /* Live Map: xx xx xx */
+       out += scnprintf(db->buf + out, db->len - out, "Live Map: ");
+       out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
+                                db->buf + out, db->len - out);
+       out += scnprintf(db->buf + out, db->len - out, "\n");
+
+       /* Mastered Resources Total: xxx  Locally: xxx  Remotely: ... */
+       out += scnprintf(db->buf + out, db->len - out,
+                       "Mastered Resources Total: %d  Locally: %d  "
+                       "Remotely: %d  Unknown: %d\n",
+                       tres, lres, rres, ures);
+
+       /* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
+       out += scnprintf(db->buf + out, db->len - out,
+                       "Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
+                       "PendingBASTs=%s  Master=%s\n",
+                       (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
+                       (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
+                       (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
+                       (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
+                       (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+
+       /* Purge Count: xxx  Refs: xxx */
+       out += scnprintf(db->buf + out, db->len - out,
+                       "Purge Count: %d  Refs: %d\n", dlm->purge_count,
+                       atomic_read(&dlm->dlm_refs.refcount));
+
+       /* Dead Node: xxx */
+       out += scnprintf(db->buf + out, db->len - out,
+                       "Dead Node: %d\n", dlm->reco.dead_node);
+
+       /* What about DLM_RECO_STATE_FINALIZE? */
+       if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
+               state = "ACTIVE";
+       else
+               state = "INACTIVE";
+
+       /* Recovery Pid: xxxx  Master: xxx  State: xxxx */
+       out += scnprintf(db->buf + out, db->len - out,
+                       "Recovery Pid: %d  Master: %d  State: %s\n",
+                       dlm->dlm_reco_thread_task->pid,
+                       dlm->reco.new_master, state);
+
+       /* Recovery Map: xx xx */
+       out += scnprintf(db->buf + out, db->len - out, "Recovery Map: ");
+       out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
+                                db->buf + out, db->len - out);
+       out += scnprintf(db->buf + out, db->len - out, "\n");
+
+       /* Recovery Node State: */
+       out += scnprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
+       list_for_each_entry(node, &dlm->reco.node_data, list) {
+               switch (node->state) {
+               case DLM_RECO_NODE_DATA_INIT:
+                       state = "INIT";
+                       break;
+               case DLM_RECO_NODE_DATA_REQUESTING:
+                       state = "REQUESTING";
+                       break;
+               case DLM_RECO_NODE_DATA_DEAD:
+                       state = "DEAD";
+                       break;
+               case DLM_RECO_NODE_DATA_RECEIVING:
+                       state = "RECEIVING";
+                       break;
+               case DLM_RECO_NODE_DATA_REQUESTED:
+                       state = "REQUESTED";
+                       break;
+               case DLM_RECO_NODE_DATA_DONE:
+                       state = "DONE";
+                       break;
+               case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+                       state = "FINALIZE-SENT";
+                       break;
+               default:
+                       state = "BAD";
+                       break;
+               }
+               out += scnprintf(db->buf + out, db->len - out, "\t%u - %s\n",
+                               node->node_num, state);
+       }
+
+       spin_unlock(&dlm->spinlock);
+
+       return out;
+}
+
+/*
+ * Open handler for the domain state file: allocate a debug_buffer, fill
+ * it with the current state summary and store it in file->private_data.
+ * Returns 0, or -ENOMEM if no buffer could be had.
+ */
+static int debug_state_open(struct inode *inode, struct file *file)
+{
+       struct dlm_ctxt *dlm = inode->i_private;
+       struct debug_buffer *snap = debug_buffer_allocate();
+
+       if (!snap)
+               return -ENOMEM;
+
+       /* len is overwritten with the value debug_state_print() returns */
+       snap->len = debug_state_print(dlm, snap);
+       file->private_data = snap;
+
+       return 0;
+}
+
+/* File operations behind the per-domain state summary file. */
+static struct file_operations debug_state_fops = {
+       .open =         debug_state_open,
+       .read =         debug_buffer_read,
+       .llseek =       debug_buffer_llseek,
+       .release =      debug_buffer_release,
+};
+/* end  - debug state funcs */
+
+/* files in subroot */
+/*
+ * Create the per-domain debugfs files (dlm state, lock resources, MLEs,
+ * purge list) under dlm->dlm_debugfs_subroot and, on success, take a
+ * reference on the debug context.
+ *
+ * Returns 0 on success or -ENOMEM if any file could not be created.
+ *
+ * On failure only the files created here are removed, and their dentry
+ * pointers are reset to NULL.  dlm_debug_shutdown() is deliberately NOT
+ * called from the error path: it drops a reference that is only taken
+ * on the success path, and dlm_join_domain() calls dlm_debug_shutdown()
+ * itself when this function fails, so calling it here too would put the
+ * context twice and hand already-removed dentries to debugfs_remove().
+ */
+int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+       struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+       /* for dumping dlm_ctxt */
+       dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
+                                                    S_IFREG|S_IRUSR,
+                                                    dlm->dlm_debugfs_subroot,
+                                                    dlm, &debug_state_fops);
+       if (!dc->debug_state_dentry) {
+               mlog_errno(-ENOMEM);
+               goto bail;
+       }
+
+       /* for dumping lockres */
+       dc->debug_lockres_dentry =
+                       debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
+                                           S_IFREG|S_IRUSR,
+                                           dlm->dlm_debugfs_subroot,
+                                           dlm, &debug_lockres_fops);
+       if (!dc->debug_lockres_dentry) {
+               mlog_errno(-ENOMEM);
+               goto bail;
+       }
+
+       /* for dumping mles */
+       dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
+                                                  S_IFREG|S_IRUSR,
+                                                  dlm->dlm_debugfs_subroot,
+                                                  dlm, &debug_mle_fops);
+       if (!dc->debug_mle_dentry) {
+               mlog_errno(-ENOMEM);
+               goto bail;
+       }
+
+       /* for dumping lockres on the purge list */
+       dc->debug_purgelist_dentry =
+                       debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
+                                           S_IFREG|S_IRUSR,
+                                           dlm->dlm_debugfs_subroot,
+                                           dlm, &debug_purgelist_fops);
+       if (!dc->debug_purgelist_dentry) {
+               mlog_errno(-ENOMEM);
+               goto bail;
+       }
+
+       dlm_debug_get(dc);
+       return 0;
+
+bail:
+       /* remove in reverse order of creation; NULL marks "already gone" */
+       if (dc->debug_purgelist_dentry) {
+               debugfs_remove(dc->debug_purgelist_dentry);
+               dc->debug_purgelist_dentry = NULL;
+       }
+       if (dc->debug_mle_dentry) {
+               debugfs_remove(dc->debug_mle_dentry);
+               dc->debug_mle_dentry = NULL;
+       }
+       if (dc->debug_lockres_dentry) {
+               debugfs_remove(dc->debug_lockres_dentry);
+               dc->debug_lockres_dentry = NULL;
+       }
+       if (dc->debug_state_dentry) {
+               debugfs_remove(dc->debug_state_dentry);
+               dc->debug_state_dentry = NULL;
+       }
+       return -ENOMEM;
+}
+
+/*
+ * Remove the per-domain debugfs files (in reverse order of creation) and
+ * drop one reference on the debug context.  Does nothing when the domain
+ * has no debug context.
+ */
+void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+       struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
+
+       if (!dc)
+               return;
+
+       if (dc->debug_purgelist_dentry)
+               debugfs_remove(dc->debug_purgelist_dentry);
+       if (dc->debug_mle_dentry)
+               debugfs_remove(dc->debug_mle_dentry);
+       if (dc->debug_lockres_dentry)
+               debugfs_remove(dc->debug_lockres_dentry);
+       if (dc->debug_state_dentry)
+               debugfs_remove(dc->debug_state_dentry);
+
+       dlm_debug_put(dc);
+}
+
+/* subroot - domain dir */
+/*
+ * Create the domain's directory (named after the domain) below the dlm
+ * debugfs root and allocate the zeroed, reference-counted debug context.
+ * Returns 0 on success; on any failure the directory is removed again and
+ * -ENOMEM is returned.
+ */
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+       struct dlm_debug_ctxt *dc;
+
+       dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
+                                                     dlm_debugfs_root);
+       if (!dlm->dlm_debugfs_subroot) {
+               mlog_errno(-ENOMEM);
+               goto fail;
+       }
+
+       dc = kzalloc(sizeof(struct dlm_debug_ctxt), GFP_KERNEL);
+       dlm->dlm_debug_ctxt = dc;
+       if (!dc) {
+               mlog_errno(-ENOMEM);
+               goto fail;
+       }
+
+       kref_init(&dc->debug_refcnt);
+       return 0;
+
+fail:
+       dlm_destroy_debugfs_subroot(dlm);
+       return -ENOMEM;
+}
+
+/* Remove the domain's debugfs directory, if one was created. */
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+       struct dentry *subroot = dlm->dlm_debugfs_subroot;
+
+       if (!subroot)
+               return;
+
+       debugfs_remove(subroot);
+}
+
+/* debugfs root */
+/*
+ * Create the top-level dlm debugfs directory (DLM_DEBUGFS_DIR).
+ * Returns 0 on success or -ENOMEM if the directory could not be created.
+ */
+int dlm_create_debugfs_root(void)
+{
+       dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
+       if (dlm_debugfs_root)
+               return 0;
+
+       mlog_errno(-ENOMEM);
+       return -ENOMEM;
+}
+
+/* Remove the top-level dlm debugfs directory, if it was created. */
+void dlm_destroy_debugfs_root(void)
+{
+       if (!dlm_debugfs_root)
+               return;
+
+       debugfs_remove(dlm_debugfs_root);
+}
+#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644 (file)
index 0000000..d34a62a
--- /dev/null
@@ -0,0 +1,86 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmdebug.h
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifndef DLMDEBUG_H
+#define DLMDEBUG_H
+
+void dlm_print_one_mle(struct dlm_master_list_entry *mle);
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Per-domain debugfs bookkeeping, allocated by
+ * dlm_create_debugfs_subroot().  The dentries are the files created by
+ * dlm_debug_init() and removed by dlm_debug_shutdown(); a NULL dentry
+ * means the file does not exist.
+ */
+struct dlm_debug_ctxt {
+       struct kref debug_refcnt;               /* taken/dropped via dlm_debug_get/put */
+       struct dentry *debug_state_dentry;      /* domain state summary */
+       struct dentry *debug_lockres_dentry;    /* lock resource dump */
+       struct dentry *debug_mle_dentry;        /* master list entry dump */
+       struct dentry *debug_purgelist_dentry;  /* lock resources on the purge list */
+};
+
+/*
+ * Text snapshot attached to an open debugfs file.  The print routines
+ * treat len as the space available in buf; the open handlers then
+ * overwrite len with the value the print routine returned.
+ */
+struct debug_buffer {
+       int len;
+       char *buf;
+};
+
+/*
+ * Per-open state of the lock resource dump file (stored as the seq_file's
+ * private pointer by debug_lockres_open()).
+ */
+struct debug_lockres {
+       int dl_len;                             /* size of dl_buf (PAGE_SIZE) */
+       char *dl_buf;                           /* text of the resource being shown */
+       struct dlm_ctxt *dl_ctxt;               /* domain, grabbed for the open's lifetime */
+       struct dlm_lock_resource *dl_res;       /* cursor: last resource dumped, ref held */
+};
+
+int dlm_debug_init(struct dlm_ctxt *dlm);
+void dlm_debug_shutdown(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
+void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
+
+int dlm_create_debugfs_root(void);
+void dlm_destroy_debugfs_root(void);
+
+#else
+
+/*
+ * debugfs support compiled out: no-op stand-ins so that callers need no
+ * #ifdefs.  They are "static inline" rather than plain "static" so that a
+ * file which includes this header without calling every one of them does
+ * not get "defined but not used" warnings.
+ */
+static inline int dlm_debug_init(struct dlm_ctxt *dlm)
+{
+       return 0;
+}
+static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+       return 0;
+}
+static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+{
+}
+static inline int dlm_create_debugfs_root(void)
+{
+       return 0;
+}
+static inline void dlm_destroy_debugfs_root(void)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+#endif /* DLMDEBUG_H */
index 0879d86113e347d2706d215223b81bda575cc002..63f8125824e8200a08c5daa3bebdcec6bf8374e1 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/spinlock.h>
 #include <linux/delay.h>
 #include <linux/err.h>
+#include <linux/debugfs.h>
 
 #include "cluster/heartbeat.h"
 #include "cluster/nodemanager.h"
@@ -40,8 +41,8 @@
 
 #include "dlmapi.h"
 #include "dlmcommon.h"
-
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 
 #include "dlmver.h"
 
@@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
 
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
+       dlm_destroy_debugfs_subroot(dlm);
+
        if (dlm->lockres_hash)
                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
@@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
 {
        dlm_unregister_domain_handlers(dlm);
+       dlm_debug_shutdown(dlm);
        dlm_complete_thread(dlm);
        dlm_complete_recovery_thread(dlm);
        dlm_destroy_dlm_worker(dlm);
@@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm)
 void dlm_unregister_domain(struct dlm_ctxt *dlm)
 {
        int leave = 0;
+       struct dlm_lock_resource *res;
 
        spin_lock(&dlm_domain_lock);
        BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
@@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
                        msleep(500);
                        mlog(0, "%s: more migration to do\n", dlm->name);
                }
+
+               /* This list should be empty. If not, print remaining lockres */
+               if (!list_empty(&dlm->tracking_list)) {
+                       mlog(ML_ERROR, "Following lockres' are still on the "
+                            "tracking list:\n");
+                       list_for_each_entry(res, &dlm->tracking_list, tracking)
+                               dlm_print_one_lock_resource(res);
+               }
+
                dlm_mark_domain_leaving(dlm);
                dlm_leave_domain(dlm);
                dlm_complete_dlm_shutdown(dlm);
@@ -1405,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
                goto bail;
        }
 
+       status = dlm_debug_init(dlm);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
        status = dlm_launch_thread(dlm);
        if (status < 0) {
                mlog_errno(status);
@@ -1472,6 +1492,7 @@ bail:
 
        if (status) {
                dlm_unregister_domain_handlers(dlm);
+               dlm_debug_shutdown(dlm);
                dlm_complete_thread(dlm);
                dlm_complete_recovery_thread(dlm);
                dlm_destroy_dlm_worker(dlm);
@@ -1484,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
                                u32 key)
 {
        int i;
+       int ret;
        struct dlm_ctxt *dlm = NULL;
 
        dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
@@ -1516,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        dlm->key = key;
        dlm->node_num = o2nm_this_node();
 
+       ret = dlm_create_debugfs_subroot(dlm);
+       if (ret < 0) {
+               dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+               kfree(dlm->name);
+               kfree(dlm);
+               dlm = NULL;
+               goto leave;
+       }
+
        spin_lock_init(&dlm->spinlock);
        spin_lock_init(&dlm->master_lock);
        spin_lock_init(&dlm->ast_lock);
@@ -1526,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        INIT_LIST_HEAD(&dlm->reco.node_data);
        INIT_LIST_HEAD(&dlm->purge_list);
        INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
+       INIT_LIST_HEAD(&dlm->tracking_list);
        dlm->reco.state = 0;
 
        INIT_LIST_HEAD(&dlm->pending_asts);
@@ -1816,21 +1848,49 @@ static int __init dlm_init(void)
        dlm_print_version();
 
        status = dlm_init_mle_cache();
-       if (status)
-               return -1;
+       if (status) {
+               mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
+               goto error;
+       }
+
+       status = dlm_init_master_caches();
+       if (status) {
+               mlog(ML_ERROR, "Could not create o2dlm_lockres and "
+                    "o2dlm_lockname slabcaches\n");
+               goto error;
+       }
+
+       status = dlm_init_lock_cache();
+       if (status) {
+               mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
+               goto error;
+       }
 
        status = dlm_register_net_handlers();
        if (status) {
-               dlm_destroy_mle_cache();
-               return -1;
+               mlog(ML_ERROR, "Unable to register network handlers\n");
+               goto error;
        }
 
+       status = dlm_create_debugfs_root();
+       if (status)
+               goto error;
+
        return 0;
+error:
+       dlm_unregister_net_handlers();
+       dlm_destroy_lock_cache();
+       dlm_destroy_master_caches();
+       dlm_destroy_mle_cache();
+       return -1;
 }
 
 static void __exit dlm_exit (void)
 {
+       dlm_destroy_debugfs_root();
        dlm_unregister_net_handlers();
+       dlm_destroy_lock_cache();
+       dlm_destroy_master_caches();
        dlm_destroy_mle_cache();
 }
 
index 52578d907d9ae3518d9c2106ea3f0ce559a0be2e..83a9f2972ac8189ca581ac9237ecf94a590b6f0d 100644 (file)
@@ -53,6 +53,8 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
 
+static struct kmem_cache *dlm_lock_cache = NULL;
+
 static DEFINE_SPINLOCK(dlm_cookie_lock);
 static u64 dlm_next_cookie = 1;
 
@@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
 static void dlm_lock_release(struct kref *kref);
 static void dlm_lock_detach_lockres(struct dlm_lock *lock);
 
+/*
+ * Create the "o2dlm_lock" slab cache used for struct dlm_lock.
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int dlm_init_lock_cache(void)
+{
+       dlm_lock_cache = kmem_cache_create("o2dlm_lock",
+                                          sizeof(struct dlm_lock),
+                                          0, SLAB_HWCACHE_ALIGN, NULL);
+
+       return dlm_lock_cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the "o2dlm_lock" slab cache, if it was created. */
+void dlm_destroy_lock_cache(void)
+{
+       if (!dlm_lock_cache)
+               return;
+
+       kmem_cache_destroy(dlm_lock_cache);
+}
+
 /* Tell us whether we can grant a new lock request.
  * locking:
  *   caller needs:  res->spinlock
@@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref)
                mlog(0, "freeing kernel-allocated lksb\n");
                kfree(lock->lksb);
        }
-       kfree(lock);
+       kmem_cache_free(dlm_lock_cache, lock);
 }
 
 /* associate a lock with it's lockres, getting a ref on the lockres */
@@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
        struct dlm_lock *lock;
        int kernel_allocated = 0;
 
-       lock = kzalloc(sizeof(*lock), GFP_NOFS);
+       lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
        if (!lock)
                return NULL;
 
index ea6b8957786062ad91ab213155fc44ea078724b1..efc015c6128aecc9aae54772dbf71c9a88b39211 100644 (file)
 #include "dlmapi.h"
 #include "dlmcommon.h"
 #include "dlmdomain.h"
+#include "dlmdebug.h"
 
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
 #include "cluster/masklog.h"
 
-enum dlm_mle_type {
-       DLM_MLE_BLOCK,
-       DLM_MLE_MASTER,
-       DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name
-{
-       u8 len;
-       u8 name[DLM_LOCKID_NAME_MAX];
-};
-
-struct dlm_master_list_entry
-{
-       struct list_head list;
-       struct list_head hb_events;
-       struct dlm_ctxt *dlm;
-       spinlock_t spinlock;
-       wait_queue_head_t wq;
-       atomic_t woken;
-       struct kref mle_refs;
-       int inuse;
-       unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-       unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-       unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-       unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-       u8 master;
-       u8 new_master;
-       enum dlm_mle_type type;
-       struct o2hb_callback_func mle_hb_up;
-       struct o2hb_callback_func mle_hb_down;
-       union {
-               struct dlm_lock_resource *res;
-               struct dlm_lock_name name;
-       } u;
-};
-
 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
                              struct dlm_master_list_entry *mle,
                              struct o2nm_node *node,
@@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
        return 1;
 }
 
-#define dlm_print_nodemap(m)  _dlm_print_nodemap(m,#m)
-static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
-{
-       int i;
-       printk("%s=[ ", mapname);
-       for (i=0; i<O2NM_MAX_NODES; i++)
-               if (test_bit(i, map))
-                       printk("%d ", i);
-       printk("]");
-}
-
-static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
-{
-       int refs;
-       char *type;
-       char attached;
-       u8 master;
-       unsigned int namelen;
-       const char *name;
-       struct kref *k;
-       unsigned long *maybe = mle->maybe_map,
-                     *vote = mle->vote_map,
-                     *resp = mle->response_map,
-                     *node = mle->node_map;
-
-       k = &mle->mle_refs;
-       if (mle->type == DLM_MLE_BLOCK)
-               type = "BLK";
-       else if (mle->type == DLM_MLE_MASTER)
-               type = "MAS";
-       else
-               type = "MIG";
-       refs = atomic_read(&k->refcount);
-       master = mle->master;
-       attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
-
-       if (mle->type != DLM_MLE_MASTER) {
-               namelen = mle->u.name.len;
-               name = mle->u.name.name;
-       } else {
-               namelen = mle->u.res->lockname.len;
-               name = mle->u.res->lockname.name;
-       }
-
-       mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
-                 namelen, name, type, refs, master, mle->new_master, attached,
-                 mle->inuse);
-       dlm_print_nodemap(maybe);
-       printk(", ");
-       dlm_print_nodemap(vote);
-       printk(", ");
-       dlm_print_nodemap(resp);
-       printk(", ");
-       dlm_print_nodemap(node);
-       printk(", ");
-       printk("\n");
-}
-
-#if 0
-/* Code here is included but defined out as it aids debugging */
-
-static void dlm_dump_mles(struct dlm_ctxt *dlm)
-{
-       struct dlm_master_list_entry *mle;
-       
-       mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
-       spin_lock(&dlm->master_lock);
-       list_for_each_entry(mle, &dlm->master_list, list)
-               dlm_print_one_mle(mle);
-       spin_unlock(&dlm->master_lock);
-}
-
-int dlm_dump_all_mles(const char __user *data, unsigned int len)
-{
-       struct dlm_ctxt *dlm;
-
-       spin_lock(&dlm_domain_lock);
-       list_for_each_entry(dlm, &dlm_domains, list) {
-               mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
-               dlm_dump_mles(dlm);
-       }
-       spin_unlock(&dlm_domain_lock);
-       return len;
-}
-EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
-
-#endif  /*  0  */
-
-
+static struct kmem_cache *dlm_lockres_cache = NULL;
+static struct kmem_cache *dlm_lockname_cache = NULL;
 static struct kmem_cache *dlm_mle_cache = NULL;
 
-
 static void dlm_mle_release(struct kref *kref);
 static void dlm_init_mle(struct dlm_master_list_entry *mle,
                        enum dlm_mle_type type,
@@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
 
 int dlm_init_mle_cache(void)
 {
-       dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
+       dlm_mle_cache = kmem_cache_create("o2dlm_mle",
                                          sizeof(struct dlm_master_list_entry),
                                          0, SLAB_HWCACHE_ALIGN,
                                          NULL);
@@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref)
  * LOCK RESOURCE FUNCTIONS
  */
 
+/*
+ * Create the slab caches for lock resources ("o2dlm_lockres") and their
+ * names ("o2dlm_lockname", DLM_LOCKID_NAME_MAX bytes per object).
+ * Returns 0 on success; if either cache cannot be created, whatever was
+ * created is destroyed again and -ENOMEM is returned.
+ */
+int dlm_init_master_caches(void)
+{
+       dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
+                                             sizeof(struct dlm_lock_resource),
+                                             0, SLAB_HWCACHE_ALIGN, NULL);
+       if (dlm_lockres_cache) {
+               dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
+                                                      DLM_LOCKID_NAME_MAX, 0,
+                                                      SLAB_HWCACHE_ALIGN, NULL);
+               if (dlm_lockname_cache)
+                       return 0;
+       }
+
+       dlm_destroy_master_caches();
+       return -ENOMEM;
+}
+
+/*
+ * Destroy the lock name and lock resource slab caches, if they exist.
+ *
+ * Each pointer is reset to NULL after its cache is destroyed so that the
+ * function is safe to call more than once: dlm_init_master_caches() calls
+ * it when the second cache cannot be created, and dlm_init()'s error path
+ * then calls it again.  Without the reset the lockres cache would be
+ * destroyed twice.
+ */
+void dlm_destroy_master_caches(void)
+{
+       if (dlm_lockname_cache) {
+               kmem_cache_destroy(dlm_lockname_cache);
+               dlm_lockname_cache = NULL;
+       }
+
+       if (dlm_lockres_cache) {
+               kmem_cache_destroy(dlm_lockres_cache);
+               dlm_lockres_cache = NULL;
+       }
+}
+
 static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
                                  struct dlm_lock_resource *res,
                                  u8 owner)
@@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref)
        mlog(0, "destroying lockres %.*s\n", res->lockname.len,
             res->lockname.name);
 
+       if (!list_empty(&res->tracking))
+               list_del_init(&res->tracking);
+       else {
+               mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+                    res->lockname.len, res->lockname.name);
+               dlm_print_one_lock_resource(res);
+       }
+
        if (!hlist_unhashed(&res->hash_node) ||
            !list_empty(&res->granted) ||
            !list_empty(&res->converting) ||
@@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref)
        BUG_ON(!list_empty(&res->recovering));
        BUG_ON(!list_empty(&res->purge));
 
-       kfree(res->lockname.name);
+       kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
 
-       kfree(res);
+       kmem_cache_free(dlm_lockres_cache, res);
 }
 
 void dlm_lockres_put(struct dlm_lock_resource *res)
@@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        INIT_LIST_HEAD(&res->dirty);
        INIT_LIST_HEAD(&res->recovering);
        INIT_LIST_HEAD(&res->purge);
+       INIT_LIST_HEAD(&res->tracking);
        atomic_set(&res->asts_reserved, 0);
        res->migration_pending = 0;
        res->inflight_locks = 0;
@@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
        res->last_used = 0;
 
+       list_add_tail(&res->tracking, &dlm->tracking_list);
+
        memset(res->lvb, 0, DLM_LVB_LEN);
        memset(res->refmap, 0, sizeof(res->refmap));
 }
@@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
                                   const char *name,
                                   unsigned int namelen)
 {
-       struct dlm_lock_resource *res;
+       struct dlm_lock_resource *res = NULL;
 
-       res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
+       res = (struct dlm_lock_resource *)
+                               kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
        if (!res)
-               return NULL;
+               goto error;
 
-       res->lockname.name = kmalloc(namelen, GFP_NOFS);
-       if (!res->lockname.name) {
-               kfree(res);
-               return NULL;
-       }
+       res->lockname.name = (char *)
+                               kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
+       if (!res->lockname.name)
+               goto error;
 
        dlm_init_lockres(dlm, res, name, namelen);
        return res;
+
+error:
+       if (res && res->lockname.name)
+               kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
+
+       if (res)
+               kmem_cache_free(dlm_lockres_cache, res);
+       return NULL;
 }
 
 void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
index 1f1873bf41fb3f5f50f61582b38ecf0c5cb2e0fe..394d25a131a54d406b4fd9422639cde7c1d7c320 100644 (file)
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
-#include <linux/crc32.h>
 #include <linux/kthread.h>
 #include <linux/pagemap.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
-#include <cluster/heartbeat.h>
-#include <cluster/nodemanager.h>
-#include <cluster/tcp.h>
-
-#include <dlm/dlmapi.h>
-
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
 
@@ -53,6 +46,7 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
+#include "stackglue.h"
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
@@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
                                     unsigned int line,
                                     struct ocfs2_lock_res *lockres)
 {
-       struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+       struct ocfs2_meta_lvb *lvb =
+               (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
 
        mlog(level, "LVB information for %s (called from %s:%u):\n",
             lockres->l_name, function, line);
@@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
        .flags          = 0,
 };
 
-/*
- * This is the filesystem locking protocol version.
- *
- * Whenever the filesystem does new things with locks (adds or removes a
- * lock, orders them differently, does different things underneath a lock),
- * the version must be changed.  The protocol is negotiated when joining
- * the dlm domain.  A node may join the domain if its major version is
- * identical to all other nodes and its minor version is greater than
- * or equal to all other nodes.  When its minor version is greater than
- * the other nodes, it will run at the minor version specified by the
- * other nodes.
- *
- * If a locking change is made that will not be compatible with older
- * versions, the major number must be increased and the minor version set
- * to zero.  If a change merely adds a behavior that can be disabled when
- * speaking to older versions, the minor version must be increased.  If a
- * change adds a fully backwards compatible change (eg, LVB changes that
- * are just ignored by older versions), the version does not need to be
- * updated.
- */
-const struct dlm_protocol_version ocfs2_locking_protocol = {
-       .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
-       .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-};
-
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l
 static int ocfs2_lock_create(struct ocfs2_super *osb,
                             struct ocfs2_lock_res *lockres,
                             int level,
-                            int dlm_flags);
+                            u32 dlm_flags);
 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
                                                     int wanted);
 static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
@@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
                                        struct ocfs2_lock_res *lockres);
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
                                                int convert);
-#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {       \
-       mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "  \
-               "resource %s: %s\n", dlm_errname(_stat), _func, \
-               _lockres->l_name, dlm_errmsg(_stat));           \
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do {                        \
+       mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
+            _err, _func, _lockres->l_name);                            \
 } while (0)
 static int ocfs2_downconvert_thread(void *arg);
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode,
                                  struct buffer_head **bh);
 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
 static inline int ocfs2_highest_compat_lock_level(int level);
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
-                                     int new_level);
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+                                             int new_level);
 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
                                  struct ocfs2_lock_res *lockres,
                                  int new_level,
-                                 int lvb);
+                                 int lvb,
+                                 unsigned int generation);
 static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
                                        struct ocfs2_lock_res *lockres);
 static int ocfs2_cancel_convert(struct ocfs2_super *osb,
@@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
        res->l_ops           = ops;
        res->l_priv          = priv;
 
-       res->l_level         = LKM_IVMODE;
-       res->l_requested     = LKM_IVMODE;
-       res->l_blocking      = LKM_IVMODE;
+       res->l_level         = DLM_LOCK_IV;
+       res->l_requested     = DLM_LOCK_IV;
+       res->l_blocking      = DLM_LOCK_IV;
        res->l_action        = OCFS2_AST_INVALID;
        res->l_unlock_action = OCFS2_UNLOCK_INVALID;
 
@@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
        BUG_ON(!lockres);
 
        switch(level) {
-       case LKM_EXMODE:
+       case DLM_LOCK_EX:
                lockres->l_ex_holders++;
                break;
-       case LKM_PRMODE:
+       case DLM_LOCK_PR:
                lockres->l_ro_holders++;
                break;
        default:
@@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
        BUG_ON(!lockres);
 
        switch(level) {
-       case LKM_EXMODE:
+       case DLM_LOCK_EX:
                BUG_ON(!lockres->l_ex_holders);
                lockres->l_ex_holders--;
                break;
-       case LKM_PRMODE:
+       case DLM_LOCK_PR:
                BUG_ON(!lockres->l_ro_holders);
                lockres->l_ro_holders--;
                break;
@@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
  * lock types are added. */
 static inline int ocfs2_highest_compat_lock_level(int level)
 {
-       int new_level = LKM_EXMODE;
+       int new_level = DLM_LOCK_EX;
 
-       if (level == LKM_EXMODE)
-               new_level = LKM_NLMODE;
-       else if (level == LKM_PRMODE)
-               new_level = LKM_PRMODE;
+       if (level == DLM_LOCK_EX)
+               new_level = DLM_LOCK_NL;
+       else if (level == DLM_LOCK_PR)
+               new_level = DLM_LOCK_PR;
        return new_level;
 }
 
@@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
        BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
        BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
        BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
-       BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+       BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
 
        lockres->l_level = lockres->l_requested;
        if (lockres->l_level <=
            ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
-               lockres->l_blocking = LKM_NLMODE;
+               lockres->l_blocking = DLM_LOCK_NL;
                lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
        }
        lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
@@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
         * information is already up to data. Convert from NL to
         * *anything* however should mark ourselves as needing an
         * update */
-       if (lockres->l_level == LKM_NLMODE &&
+       if (lockres->l_level == DLM_LOCK_NL &&
            lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
                lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
 
@@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
        BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
        BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
 
-       if (lockres->l_requested > LKM_NLMODE &&
+       if (lockres->l_requested > DLM_LOCK_NL &&
            !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
            lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
                lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
        return needs_downconvert;
 }
 
+/*
+ * OCFS2_LOCK_PENDING and l_pending_gen.
+ *
+ * Why does OCFS2_LOCK_PENDING exist?  To close a race between setting
+ * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock().  See ocfs2_unblock_lock()
+ * for more details on the race.
+ *
+ * OCFS2_LOCK_PENDING closes the race quite nicely.  However, it introduces
+ * a race of its own.  In o2dlm, we can get the ast before ocfs2_dlm_lock()
+ * returns.  The ast clears OCFS2_LOCK_BUSY, and must therefore clear
+ * OCFS2_LOCK_PENDING at the same time.  When ocfs2_dlm_lock() returns,
+ * the caller is going to try to clear PENDING again.  If nothing else is
+ * happening, __lockres_clear_pending() sees PENDING is unset and does
+ * nothing.
+ *
+ * But what if another path (eg downconvert thread) has just started a
+ * new locking action?  The other path has re-set PENDING.  Our path
+ * cannot clear PENDING, because that will re-open the original race
+ * window.
+ *
+ * [Example]
+ *
+ * ocfs2_meta_lock()
+ *  ocfs2_cluster_lock()
+ *   set BUSY
+ *   set PENDING
+ *   drop l_lock
+ *   ocfs2_dlm_lock()
+ *    ocfs2_locking_ast()              ocfs2_downconvert_thread()
+ *     clear PENDING                    ocfs2_unblock_lock()
+ *                                       take l_lock
+ *                                       !BUSY
+ *                                       ocfs2_prepare_downconvert()
+ *                                        set BUSY
+ *                                        set PENDING
+ *                                       drop l_lock
+ *   take l_lock
+ *   clear PENDING
+ *   drop l_lock
+ *                     <window>
+ *                                       ocfs2_dlm_lock()
+ *
+ * So as you can see, we now have a window where l_lock is not held,
+ * PENDING is not set, and ocfs2_dlm_lock() has not been called.
+ *
+ * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
+ * set by ocfs2_prepare_downconvert().  That wasn't nice.
+ *
+ * To solve this we introduce l_pending_gen.  A call to
+ * lockres_clear_pending() will only do so when it is passed a generation
+ * number that matches the lockres.  lockres_set_pending() will return the
+ * current generation number.  When ocfs2_cluster_lock() goes to clear
+ * PENDING, it passes the generation it got from set_pending().  In our
+ * example above, the generation numbers will *not* match.  Thus,
+ * ocfs2_cluster_lock() will not clear the PENDING set by
+ * ocfs2_prepare_downconvert().
+ */
+
+/* Unlocked version for ocfs2_locking_ast() */
+static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
+                                   unsigned int generation,
+                                   struct ocfs2_super *osb)
+{
+       assert_spin_locked(&lockres->l_lock);
+
+       /*
+        * The ast and locking functions can race us here.  The winner
+        * will clear pending, the loser will not.
+        */
+       if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
+           (lockres->l_pending_gen != generation))
+               return;
+
+       lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
+       lockres->l_pending_gen++;
+
+       /*
+        * The downconvert thread may have skipped us because we
+        * were PENDING.  Wake it up.
+        */
+       if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+               ocfs2_wake_downconvert_thread(osb);
+}
+
+/* Locked version for callers of ocfs2_dlm_lock() */
+static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
+                                 unsigned int generation,
+                                 struct ocfs2_super *osb)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&lockres->l_lock, flags);
+       __lockres_clear_pending(lockres, generation, osb);
+       spin_unlock_irqrestore(&lockres->l_lock, flags);
+}
+
+static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
+{
+       assert_spin_locked(&lockres->l_lock);
+       BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
+
+       lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
+
+       return lockres->l_pending_gen;
+}
+
+
 static void ocfs2_blocking_ast(void *opaque, int level)
 {
        struct ocfs2_lock_res *lockres = opaque;
@@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
        int needs_downconvert;
        unsigned long flags;
 
-       BUG_ON(level <= LKM_NLMODE);
+       BUG_ON(level <= DLM_LOCK_NL);
 
        mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
             lockres->l_name, level, lockres->l_level,
@@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 static void ocfs2_locking_ast(void *opaque)
 {
        struct ocfs2_lock_res *lockres = opaque;
-       struct dlm_lockstatus *lksb = &lockres->l_lksb;
+       struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
        unsigned long flags;
+       int status;
 
        spin_lock_irqsave(&lockres->l_lock, flags);
 
-       if (lksb->status != DLM_NORMAL) {
-               mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
-                    lockres->l_name, lksb->status);
+       status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+
+       if (status == -EAGAIN) {
+               lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+               goto out;
+       }
+
+       if (status) {
+               mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
+                    lockres->l_name, status);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
                return;
        }
@@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque)
                     lockres->l_unlock_action);
                BUG();
        }
-
+out:
        /* set it to something invalid so if we get called again we
         * can catch it. */
        lockres->l_action = OCFS2_AST_INVALID;
 
+       /* Did we try to cancel this lock?  Clear that state */
+       if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
+               lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+
+       /*
+        * We may have beaten the locking functions here.  We certainly
+        * know that dlm_lock() has been called :-)
+        * Because we can't have two lock calls in flight at once, we
+        * can use lockres->l_pending_gen.
+        */
+       __lockres_clear_pending(lockres, lockres->l_pending_gen,  osb);
+
        wake_up(&lockres->l_event);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
 }
@@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 static int ocfs2_lock_create(struct ocfs2_super *osb,
                             struct ocfs2_lock_res *lockres,
                             int level,
-                            int dlm_flags)
+                            u32 dlm_flags)
 {
        int ret = 0;
-       enum dlm_status status = DLM_NORMAL;
        unsigned long flags;
+       unsigned int gen;
 
        mlog_entry_void();
 
-       mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
+       mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
             dlm_flags);
 
        spin_lock_irqsave(&lockres->l_lock, flags);
@@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
        lockres->l_action = OCFS2_AST_ATTACH;
        lockres->l_requested = level;
        lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+       gen = lockres_set_pending(lockres);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-       status = dlmlock(osb->dlm,
-                        level,
-                        &lockres->l_lksb,
-                        dlm_flags,
-                        lockres->l_name,
-                        OCFS2_LOCK_ID_MAX_LEN - 1,
-                        ocfs2_locking_ast,
-                        lockres,
-                        ocfs2_blocking_ast);
-       if (status != DLM_NORMAL) {
-               ocfs2_log_dlm_error("dlmlock", status, lockres);
-               ret = -EINVAL;
+       ret = ocfs2_dlm_lock(osb->cconn,
+                            level,
+                            &lockres->l_lksb,
+                            dlm_flags,
+                            lockres->l_name,
+                            OCFS2_LOCK_ID_MAX_LEN - 1,
+                            lockres);
+       lockres_clear_pending(lockres, gen, osb);
+       if (ret) {
+               ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
                ocfs2_recover_from_dlm_error(lockres, 1);
        }
 
-       mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
+       mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
 
 bail:
        mlog_exit(ret);
@@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
 static int ocfs2_cluster_lock(struct ocfs2_super *osb,
                              struct ocfs2_lock_res *lockres,
                              int level,
-                             int lkm_flags,
+                             u32 lkm_flags,
                              int arg_flags)
 {
        struct ocfs2_mask_waiter mw;
-       enum dlm_status status;
        int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
        int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
        unsigned long flags;
+       unsigned int gen;
+       int noqueue_attempted = 0;
 
        mlog_entry_void();
 
        ocfs2_init_mask_waiter(&mw);
 
        if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
-               lkm_flags |= LKM_VALBLK;
+               lkm_flags |= DLM_LKF_VALBLK;
 
 again:
        wait = 0;
@@ -1068,52 +1165,56 @@ again:
        }
 
        if (level > lockres->l_level) {
+               if (noqueue_attempted > 0) {
+                       ret = -EAGAIN;
+                       goto unlock;
+               }
+               if (lkm_flags & DLM_LKF_NOQUEUE)
+                       noqueue_attempted = 1;
+
                if (lockres->l_action != OCFS2_AST_INVALID)
                        mlog(ML_ERROR, "lockres %s has action %u pending\n",
                             lockres->l_name, lockres->l_action);
 
                if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
                        lockres->l_action = OCFS2_AST_ATTACH;
-                       lkm_flags &= ~LKM_CONVERT;
+                       lkm_flags &= ~DLM_LKF_CONVERT;
                } else {
                        lockres->l_action = OCFS2_AST_CONVERT;
-                       lkm_flags |= LKM_CONVERT;
+                       lkm_flags |= DLM_LKF_CONVERT;
                }
 
                lockres->l_requested = level;
                lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+               gen = lockres_set_pending(lockres);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-               BUG_ON(level == LKM_IVMODE);
-               BUG_ON(level == LKM_NLMODE);
+               BUG_ON(level == DLM_LOCK_IV);
+               BUG_ON(level == DLM_LOCK_NL);
 
                mlog(0, "lock %s, convert from %d to level = %d\n",
                     lockres->l_name, lockres->l_level, level);
 
                /* call dlm_lock to upgrade lock now */
-               status = dlmlock(osb->dlm,
-                                level,
-                                &lockres->l_lksb,
-                                lkm_flags,
-                                lockres->l_name,
-                                OCFS2_LOCK_ID_MAX_LEN - 1,
-                                ocfs2_locking_ast,
-                                lockres,
-                                ocfs2_blocking_ast);
-               if (status != DLM_NORMAL) {
-                       if ((lkm_flags & LKM_NOQUEUE) &&
-                           (status == DLM_NOTQUEUED))
-                               ret = -EAGAIN;
-                       else {
-                               ocfs2_log_dlm_error("dlmlock", status,
-                                                   lockres);
-                               ret = -EINVAL;
+               ret = ocfs2_dlm_lock(osb->cconn,
+                                    level,
+                                    &lockres->l_lksb,
+                                    lkm_flags,
+                                    lockres->l_name,
+                                    OCFS2_LOCK_ID_MAX_LEN - 1,
+                                    lockres);
+               lockres_clear_pending(lockres, gen, osb);
+               if (ret) {
+                       if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
+                           (ret != -EAGAIN)) {
+                               ocfs2_log_dlm_error("ocfs2_dlm_lock",
+                                                   ret, lockres);
                        }
                        ocfs2_recover_from_dlm_error(lockres, 1);
                        goto out;
                }
 
-               mlog(0, "lock %s, successfull return from dlmlock\n",
+               mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
                     lockres->l_name);
 
                /* At this point we've gone inside the dlm and need to
@@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb,
                                 int ex,
                                 int local)
 {
-       int level =  ex ? LKM_EXMODE : LKM_PRMODE;
+       int level =  ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        unsigned long flags;
-       int lkm_flags = local ? LKM_LOCAL : 0;
+       u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
 
        spin_lock_irqsave(&lockres->l_lock, flags);
        BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
        }
 
        /*
-        * We don't want to use LKM_LOCAL on a meta data lock as they
+        * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
         * don't use a generation in their lock names.
         */
        ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
@@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 
        lockres = &OCFS2_I(inode)->ip_rw_lockres;
 
-       level = write ? LKM_EXMODE : LKM_PRMODE;
+       level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
 
        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
                                    0);
@@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 
 void ocfs2_rw_unlock(struct inode *inode, int write)
 {
-       int level = write ? LKM_EXMODE : LKM_PRMODE;
+       int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
@@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode)
        lockres = &OCFS2_I(inode)->ip_open_lockres;
 
        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
-                                   LKM_PRMODE, 0, 0);
+                                   DLM_LOCK_PR, 0, 0);
        if (status < 0)
                mlog_errno(status);
 
@@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
 
        lockres = &OCFS2_I(inode)->ip_open_lockres;
 
-       level = write ? LKM_EXMODE : LKM_PRMODE;
+       level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
 
        /*
         * The file system may already holding a PRMODE/EXMODE open lock.
-        * Since we pass LKM_NOQUEUE, the request won't block waiting on
+        * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
         * other nodes and the -EAGAIN will indicate to the caller that
         * this inode is still in use.
         */
        status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
-                                   level, LKM_NOQUEUE, 0);
+                                   level, DLM_LKF_NOQUEUE, 0);
 
 out:
        mlog_exit(status);
@@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode)
 
        if(lockres->l_ro_holders)
                ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
-                                    LKM_PRMODE);
+                                    DLM_LOCK_PR);
        if(lockres->l_ex_holders)
                ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
-                                    LKM_EXMODE);
+                                    DLM_LOCK_EX);
 
 out:
        mlog_exit_void();
@@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
        ocfs2_init_mask_waiter(&mw);
 
        if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
-           (lockres->l_level > LKM_NLMODE)) {
+           (lockres->l_level > DLM_LOCK_NL)) {
                mlog(ML_ERROR,
                     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
                     "level: %u\n", lockres->l_name, lockres->l_flags,
@@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-       ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
-                     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
-                     ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
-       if (ret != DLM_NORMAL) {
-               if (trylock && ret == DLM_NOTQUEUED)
-                       ret = -EAGAIN;
-               else {
-                       ocfs2_log_dlm_error("dlmlock", ret, lockres);
+       ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
+                            lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+                            lockres);
+       if (ret) {
+               if (!trylock || (ret != -EAGAIN)) {
+                       ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
                        ret = -EINVAL;
                }
 
@@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
                 * to just bubble sucess back up to the user.
                 */
                ret = ocfs2_flock_handle_signal(lockres, level);
+       } else if (!ret && (level > lockres->l_level)) {
+               /* Trylock failed asynchronously */
+               BUG_ON(!trylock);
+               ret = -EAGAIN;
        }
 
 out:
@@ -1549,6 +1652,7 @@ out:
 void ocfs2_file_unlock(struct file *file)
 {
        int ret;
+       unsigned int gen;
        unsigned long flags;
        struct ocfs2_file_private *fp = file->private_data;
        struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file)
         * Fake a blocking ast for the downconvert code.
         */
        lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-       lockres->l_blocking = LKM_EXMODE;
+       lockres->l_blocking = DLM_LOCK_EX;
 
-       ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+       gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
        lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
 
-       ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);
+       ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
        if (ret) {
                mlog_errno(ret);
                return;
@@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
         * condition. */
        if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
                switch(lockres->l_blocking) {
-               case LKM_EXMODE:
+               case DLM_LOCK_EX:
                        if (!lockres->l_ex_holders && !lockres->l_ro_holders)
                                kick = 1;
                        break;
-               case LKM_PRMODE:
+               case DLM_LOCK_PR:
                        if (!lockres->l_ex_holders)
                                kick = 1;
                        break;
@@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 
        mlog_entry_void();
 
-       lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+       lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
 
        /*
         * Invalidate the LVB of a deleted inode - this way other
@@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 
        mlog_meta_lvb(0, lockres);
 
-       lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+       lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
 
        /* We're safe here without the lockres lock... */
        spin_lock(&oi->ip_lock);
@@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
                                              struct ocfs2_lock_res *lockres)
 {
-       struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+       struct ocfs2_meta_lvb *lvb =
+               (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
 
        if (lvb->lvb_version == OCFS2_LVB_VERSION
            && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
                         int ex,
                         int arg_flags)
 {
-       int status, level, dlm_flags, acquired;
+       int status, level, acquired;
+       u32 dlm_flags;
        struct ocfs2_lock_res *lockres = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct buffer_head *local_bh = NULL;
@@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode,
                goto local;
 
        if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
-               wait_event(osb->recovery_event,
-                          ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+               ocfs2_wait_for_recovery(osb);
 
        lockres = &OCFS2_I(inode)->ip_inode_lockres;
-       level = ex ? LKM_EXMODE : LKM_PRMODE;
+       level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        dlm_flags = 0;
        if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
-               dlm_flags |= LKM_NOQUEUE;
+               dlm_flags |= DLM_LKF_NOQUEUE;
 
        status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
        if (status < 0) {
@@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
         * committed to owning this lock so we don't allow signals to
         * abort the operation. */
        if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
-               wait_event(osb->recovery_event,
-                          ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+               ocfs2_wait_for_recovery(osb);
 
 local:
        /*
@@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode,
 void ocfs2_inode_unlock(struct inode *inode,
                       int ex)
 {
-       int level = ex ? LKM_EXMODE : LKM_PRMODE;
+       int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
@@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
                     int ex)
 {
        int status = 0;
-       int level = ex ? LKM_EXMODE : LKM_PRMODE;
+       int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
-       struct buffer_head *bh;
-       struct ocfs2_slot_info *si = osb->slot_info;
 
        mlog_entry_void();
 
@@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
                goto bail;
        }
        if (status) {
-               bh = si->si_bh;
-               status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
-                                         si->si_inode);
-               if (status == 0)
-                       ocfs2_update_slot_info(si);
+               status = ocfs2_refresh_slot_info(osb);
 
                ocfs2_complete_lock_res_refresh(lockres, status);
 
@@ -2178,7 +2276,7 @@ bail:
 void ocfs2_super_unlock(struct ocfs2_super *osb,
                        int ex)
 {
-       int level = ex ? LKM_EXMODE : LKM_PRMODE;
+       int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 
        if (!ocfs2_mount_local(osb))
@@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
        if (ocfs2_mount_local(osb))
                return 0;
 
-       status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
+       status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
        if (status < 0)
                mlog_errno(status);
 
@@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
        struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
 
        if (!ocfs2_mount_local(osb))
-               ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
+               ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }
 
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
        int ret;
-       int level = ex ? LKM_EXMODE : LKM_PRMODE;
+       int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
@@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
 {
-       int level = ex ? LKM_EXMODE : LKM_PRMODE;
+       int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
        struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
@@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
                   lockres->l_blocking);
 
        /* Dump the raw LVB */
-       lvb = lockres->l_lksb.lvb;
+       lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
        for(i = 0; i < DLM_LVB_LEN; i++)
                seq_printf(m, "0x%x\t", lvb[i]);
 
@@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
 int ocfs2_dlm_init(struct ocfs2_super *osb)
 {
        int status = 0;
-       u32 dlm_key;
-       struct dlm_ctxt *dlm = NULL;
+       struct ocfs2_cluster_connection *conn = NULL;
 
        mlog_entry_void();
 
-       if (ocfs2_mount_local(osb))
+       if (ocfs2_mount_local(osb)) {
+               osb->node_num = 0;
                goto local;
+       }
 
        status = ocfs2_dlm_init_debug(osb);
        if (status < 0) {
@@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
                goto bail;
        }
 
-       /* used by the dlm code to make message headers unique, each
-        * node in this domain must agree on this. */
-       dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
-
        /* for now, uuid == domain */
-       dlm = dlm_register_domain(osb->uuid_str, dlm_key,
-                                 &osb->osb_locking_proto);
-       if (IS_ERR(dlm)) {
-               status = PTR_ERR(dlm);
+       status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+                                      osb->uuid_str,
+                                      strlen(osb->uuid_str),
+                                      ocfs2_do_node_down, osb,
+                                      &conn);
+       if (status) {
                mlog_errno(status);
                goto bail;
        }
 
-       dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
+       status = ocfs2_cluster_this_node(&osb->node_num);
+       if (status < 0) {
+               mlog_errno(status);
+               mlog(ML_ERROR,
+                    "could not find this host's node number\n");
+               ocfs2_cluster_disconnect(conn, 0);
+               goto bail;
+       }
 
 local:
        ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
        ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
 
-       osb->dlm = dlm;
+       osb->cconn = conn;
 
        status = 0;
 bail:
@@ -2560,14 +2664,19 @@ bail:
        return status;
 }
 
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
+                       int hangup_pending)
 {
        mlog_entry_void();
 
-       dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
-
        ocfs2_drop_osb_locks(osb);
 
+       /*
+        * Now that we have dropped all locks and ocfs2_dismount_volume()
+        * has disabled recovery, the DLM won't be talking to us.  It's
+        * safe to tear things down before disconnecting the cluster.
+        */
+
        if (osb->dc_task) {
                kthread_stop(osb->dc_task);
                osb->dc_task = NULL;
@@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
        ocfs2_lock_res_free(&osb->osb_super_lockres);
        ocfs2_lock_res_free(&osb->osb_rename_lockres);
 
-       dlm_unregister_domain(osb->dlm);
-       osb->dlm = NULL;
+       ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+       osb->cconn = NULL;
 
        ocfs2_dlm_shutdown_debug(osb);
 
        mlog_exit_void();
 }
 
-static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
+static void ocfs2_unlock_ast(void *opaque, int error)
 {
        struct ocfs2_lock_res *lockres = opaque;
        unsigned long flags;
@@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
             lockres->l_unlock_action);
 
        spin_lock_irqsave(&lockres->l_lock, flags);
-       /* We tried to cancel a convert request, but it was already
-        * granted. All we want to do here is clear our unlock
-        * state. The wake_up call done at the bottom is redundant
-        * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
-        * hurt anything anyway */
-       if (status == DLM_CANCELGRANT &&
-           lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
-               mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
-
-               /* We don't clear the busy flag in this case as it
-                * should have been cleared by the ast which the dlm
-                * has called. */
-               goto complete_unlock;
-       }
-
-       if (status != DLM_NORMAL) {
-               mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
-                    "unlock_action %d\n", status, lockres->l_name,
+       if (error) {
+               mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
+                    "unlock_action %d\n", error, lockres->l_name,
                     lockres->l_unlock_action);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
                return;
@@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
                lockres->l_action = OCFS2_AST_INVALID;
                break;
        case OCFS2_UNLOCK_DROP_LOCK:
-               lockres->l_level = LKM_IVMODE;
+               lockres->l_level = DLM_LOCK_IV;
                break;
        default:
                BUG();
        }
 
        lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-complete_unlock:
        lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
        spin_unlock_irqrestore(&lockres->l_lock, flags);
 
@@ -2643,16 +2736,16 @@ complete_unlock:
 static int ocfs2_drop_lock(struct ocfs2_super *osb,
                           struct ocfs2_lock_res *lockres)
 {
-       enum dlm_status status;
+       int ret;
        unsigned long flags;
-       int lkm_flags = 0;
+       u32 lkm_flags = 0;
 
        /* We didn't get anywhere near actually using this lockres. */
        if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
                goto out;
 
        if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
-               lkm_flags |= LKM_VALBLK;
+               lkm_flags |= DLM_LKF_VALBLK;
 
        spin_lock_irqsave(&lockres->l_lock, flags);
 
@@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
        if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
                if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
-                   lockres->l_level == LKM_EXMODE &&
+                   lockres->l_level == DLM_LOCK_EX &&
                    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
                        lockres->l_ops->set_lvb(lockres);
        }
@@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
        mlog(0, "lock %s\n", lockres->l_name);
 
-       status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags,
-                          ocfs2_unlock_ast, lockres);
-       if (status != DLM_NORMAL) {
-               ocfs2_log_dlm_error("dlmunlock", status, lockres);
+       ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
+                              lockres);
+       if (ret) {
+               ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
                mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
-               dlm_print_one_lock(lockres->l_lksb.lockid);
+               ocfs2_dlm_dump_lksb(&lockres->l_lksb);
                BUG();
        }
-       mlog(0, "lock %s, successfull return from dlmunlock\n",
+       mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
             lockres->l_name);
 
        ocfs2_wait_on_busy_lock(lockres);
@@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode)
        return status;
 }
 
-static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
-                                     int new_level)
+static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+                                             int new_level)
 {
        assert_spin_locked(&lockres->l_lock);
 
-       BUG_ON(lockres->l_blocking <= LKM_NLMODE);
+       BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
 
        if (lockres->l_level <= new_level) {
-               mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
+               mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
                     lockres->l_level, new_level);
                BUG();
        }
@@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
        lockres->l_action = OCFS2_AST_DOWNCONVERT;
        lockres->l_requested = new_level;
        lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+       return lockres_set_pending(lockres);
 }
 
 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
                                  struct ocfs2_lock_res *lockres,
                                  int new_level,
-                                 int lvb)
+                                 int lvb,
+                                 unsigned int generation)
 {
-       int ret, dlm_flags = LKM_CONVERT;
-       enum dlm_status status;
+       int ret;
+       u32 dlm_flags = DLM_LKF_CONVERT;
 
        mlog_entry_void();
 
        if (lvb)
-               dlm_flags |= LKM_VALBLK;
-
-       status = dlmlock(osb->dlm,
-                        new_level,
-                        &lockres->l_lksb,
-                        dlm_flags,
-                        lockres->l_name,
-                        OCFS2_LOCK_ID_MAX_LEN - 1,
-                        ocfs2_locking_ast,
-                        lockres,
-                        ocfs2_blocking_ast);
-       if (status != DLM_NORMAL) {
-               ocfs2_log_dlm_error("dlmlock", status, lockres);
-               ret = -EINVAL;
+               dlm_flags |= DLM_LKF_VALBLK;
+
+       ret = ocfs2_dlm_lock(osb->cconn,
+                            new_level,
+                            &lockres->l_lksb,
+                            dlm_flags,
+                            lockres->l_name,
+                            OCFS2_LOCK_ID_MAX_LEN - 1,
+                            lockres);
+       lockres_clear_pending(lockres, generation, osb);
+       if (ret) {
+               ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
                ocfs2_recover_from_dlm_error(lockres, 1);
                goto bail;
        }
@@ -2862,7 +2955,7 @@ bail:
        return ret;
 }
 
-/* returns 1 when the caller should unlock and call dlmunlock */
+/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
 static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
                                        struct ocfs2_lock_res *lockres)
 {
@@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
                                struct ocfs2_lock_res *lockres)
 {
        int ret;
-       enum dlm_status status;
 
        mlog_entry_void();
        mlog(0, "lock %s\n", lockres->l_name);
 
-       ret = 0;
-       status = dlmunlock(osb->dlm,
-                          &lockres->l_lksb,
-                          LKM_CANCEL,
-                          ocfs2_unlock_ast,
-                          lockres);
-       if (status != DLM_NORMAL) {
-               ocfs2_log_dlm_error("dlmunlock", status, lockres);
-               ret = -EINVAL;
+       ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
+                              DLM_LKF_CANCEL, lockres);
+       if (ret) {
+               ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
                ocfs2_recover_from_dlm_error(lockres, 0);
        }
 
-       mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
+       mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
 
        mlog_exit(ret);
        return ret;
@@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
        int new_level;
        int ret = 0;
        int set_lvb = 0;
+       unsigned int gen;
 
        mlog_entry_void();
 
@@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
 
 recheck:
        if (lockres->l_flags & OCFS2_LOCK_BUSY) {
+               /* XXX
+                * This is a *big* race.  The OCFS2_LOCK_PENDING flag
+                * exists entirely for one reason - another thread has set
+                * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
+                *
+                * If we do ocfs2_cancel_convert() before the other thread
+                * calls dlm_lock(), our cancel will do nothing.  We will
+                * get no ast, and we will have no way of knowing the
+                * cancel failed.  Meanwhile, the other thread will call
+                * into dlm_lock() and wait...forever.
+                *
+                * Why forever?  Because another node has asked for the
+                * lock first; that's why we're here in unblock_lock().
+                *
+                * The solution is OCFS2_LOCK_PENDING.  When PENDING is
+                * set, we just requeue the unblock.  Only when the other
+                * thread has called dlm_lock() and cleared PENDING will
+                * we then cancel their request.
+                *
+                * All callers of dlm_lock() must set OCFS2_LOCK_PENDING
+                * at the same time they set OCFS2_LOCK_BUSY.  They must
+                * clear OCFS2_LOCK_PENDING after dlm_lock() returns.
+                */
+               if (lockres->l_flags & OCFS2_LOCK_PENDING)
+                       goto leave_requeue;
+
                ctl->requeue = 1;
                ret = ocfs2_prepare_cancel_convert(osb, lockres);
                spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2952,13 +3066,13 @@ recheck:
 
        /* if we're blocking an exclusive and we have *any* holders,
         * then requeue. */
-       if ((lockres->l_blocking == LKM_EXMODE)
+       if ((lockres->l_blocking == DLM_LOCK_EX)
            && (lockres->l_ex_holders || lockres->l_ro_holders))
                goto leave_requeue;
 
        /* If it's a PR we're blocking, then only
         * requeue if we've got any EX holders */
-       if (lockres->l_blocking == LKM_PRMODE &&
+       if (lockres->l_blocking == DLM_LOCK_PR &&
            lockres->l_ex_holders)
                goto leave_requeue;
 
@@ -3005,7 +3119,7 @@ downconvert:
        ctl->requeue = 0;
 
        if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
-               if (lockres->l_level == LKM_EXMODE)
+               if (lockres->l_level == DLM_LOCK_EX)
                        set_lvb = 1;
 
                /*
@@ -3018,9 +3132,11 @@ downconvert:
                        lockres->l_ops->set_lvb(lockres);
        }
 
-       ocfs2_prepare_downconvert(lockres, new_level);
+       gen = ocfs2_prepare_downconvert(lockres, new_level);
        spin_unlock_irqrestore(&lockres->l_lock, flags);
-       ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
+       ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
+                                    gen);
+
 leave:
        mlog_exit(ret);
        return ret;
@@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
        }
        sync_mapping_buffers(mapping);
-       if (blocking == LKM_EXMODE) {
+       if (blocking == DLM_LOCK_EX) {
                truncate_inode_pages(mapping, 0);
        } else {
                /* We only need to wait on the I/O if we're not also
@@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
        struct inode *inode = ocfs2_lock_res_inode(lockres);
        int checkpointed = ocfs2_inode_fully_checkpointed(inode);
 
-       BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
-       BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed);
+       BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
+       BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
 
        if (checkpointed)
                return 1;
@@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
         * valid. The downconvert code will retain a PR for this node,
         * so there's no further work to do.
         */
-       if (blocking == LKM_PRMODE)
+       if (blocking == DLM_LOCK_PR)
                return UNBLOCK_CONTINUE;
 
        /*
@@ -3219,6 +3335,45 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
        return UNBLOCK_CONTINUE_POST;
 }
 
+/*
+ * This is the filesystem locking protocol.  It provides the lock handling
+ * hooks for the underlying DLM.  It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed.  The protocol is negotiated when joining
+ * the dlm domain.  A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes.  When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero.  If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased.  If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
+static struct ocfs2_locking_protocol lproto = {
+       .lp_max_version = {
+               .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+               .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+       },
+       .lp_lock_ast            = ocfs2_locking_ast,
+       .lp_blocking_ast        = ocfs2_blocking_ast,
+       .lp_unlock_ast          = ocfs2_unlock_ast,
+};
+
+void ocfs2_set_locking_protocol(void)
+{
+       ocfs2_stack_glue_set_locking_protocol(&lproto);
+}
+
+
 static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
                                       struct ocfs2_lock_res *lockres)
 {
index e3cf902404b45e3caaec642d450cf626d09ecb74..2bb01f09c1b11eb39bbb6efa35a438f69f1cd8bb 100644 (file)
@@ -58,7 +58,7 @@ struct ocfs2_meta_lvb {
 #define OCFS2_LOCK_NONBLOCK            (0x04)
 
 int ocfs2_dlm_init(struct ocfs2_super *osb);
-void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
+void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                               enum ocfs2_lock_type type,
@@ -114,5 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
 
-extern const struct dlm_protocol_version ocfs2_locking_protocol;
+/* To set the locking protocol on module initialization */
+void ocfs2_set_locking_protocol(void);
 #endif /* DLMGLUE_H */
index ed5d5232e85d9cef09b0d7f312d5a11ee8bd7426..9154c82d3258e5afc4b88fde94ecc68fc0aa91e2 100644 (file)
@@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = {
        .open           = ocfs2_file_open,
        .aio_read       = ocfs2_file_aio_read,
        .aio_write      = ocfs2_file_aio_write,
-       .ioctl          = ocfs2_ioctl,
+       .unlocked_ioctl = ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
 #endif
@@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = {
        .fsync          = ocfs2_sync_file,
        .release        = ocfs2_dir_release,
        .open           = ocfs2_dir_open,
-       .ioctl          = ocfs2_ioctl,
+       .unlocked_ioctl = ocfs2_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ocfs2_compat_ioctl,
 #endif
index 0758daf64da07c41fcf4f73a7b55d634589ab87d..c6e7213db8688744bdd21d05d5f6296a5e7dcc91 100644 (file)
@@ -28,9 +28,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/kmod.h>
-
-#include <dlm/dlmapi.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -48,7 +45,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
                                            int bit);
 static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
                                              int bit);
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
 
 /* special case -1 for now
  * TODO: should *really* make sure the calling func never passes -1!!  */
@@ -62,23 +58,23 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map)
 void ocfs2_init_node_maps(struct ocfs2_super *osb)
 {
        spin_lock_init(&osb->node_map_lock);
-       ocfs2_node_map_init(&osb->recovery_map);
        ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
 
-static void ocfs2_do_node_down(int node_num,
-                              struct ocfs2_super *osb)
+void ocfs2_do_node_down(int node_num, void *data)
 {
+       struct ocfs2_super *osb = data;
+
        BUG_ON(osb->node_num == node_num);
 
        mlog(0, "ocfs2: node down event for %d\n", node_num);
 
-       if (!osb->dlm) {
+       if (!osb->cconn) {
                /*
-                * No DLM means we're not even ready to participate yet.
-                * We check the slots after the DLM comes up, so we will
-                * notice the node death then.  We can safely ignore it
-                * here.
+                * No cluster connection means we're not even ready to
+                * participate yet.  We check the slots after the cluster
+                * comes up, so we will notice the node death then.  We
+                * can safely ignore it here.
                 */
                return;
        }
@@ -86,61 +82,6 @@ static void ocfs2_do_node_down(int node_num,
        ocfs2_recovery_thread(osb, node_num);
 }
 
-/* Called from the dlm when it's about to evict a node. We may also
- * get a heartbeat callback later. */
-static void ocfs2_dlm_eviction_cb(int node_num,
-                                 void *data)
-{
-       struct ocfs2_super *osb = (struct ocfs2_super *) data;
-       struct super_block *sb = osb->sb;
-
-       mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
-            MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
-
-       ocfs2_do_node_down(node_num, osb);
-}
-
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
-{
-       /* Not exactly a heartbeat callback, but leads to essentially
-        * the same path so we set it up here. */
-       dlm_setup_eviction_cb(&osb->osb_eviction_cb,
-                             ocfs2_dlm_eviction_cb,
-                             osb);
-}
-
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
-{
-       int ret;
-       char *argv[5], *envp[3];
-
-       if (ocfs2_mount_local(osb))
-               return;
-
-       if (!osb->uuid_str) {
-               /* This can happen if we don't get far enough in mount... */
-               mlog(0, "No UUID with which to stop heartbeat!\n\n");
-               return;
-       }
-
-       argv[0] = (char *)o2nm_get_hb_ctl_path();
-       argv[1] = "-K";
-       argv[2] = "-u";
-       argv[3] = osb->uuid_str;
-       argv[4] = NULL;
-
-       mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
-
-       /* minimal command environment taken from cpu_run_sbin_hotplug */
-       envp[0] = "HOME=/";
-       envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-       envp[2] = NULL;
-
-       ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
-       if (ret < 0)
-               mlog_errno(ret);
-}
-
 static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
                                            int bit)
 {
@@ -192,112 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
        return ret;
 }
 
-static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
-{
-       int bit;
-       bit = find_next_bit(map->map, map->num_nodes, 0);
-       if (bit < map->num_nodes)
-               return 0;
-       return 1;
-}
-
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
-                           struct ocfs2_node_map *map)
-{
-       int ret;
-       BUG_ON(map->num_nodes == 0);
-       spin_lock(&osb->node_map_lock);
-       ret = __ocfs2_node_map_is_empty(map);
-       spin_unlock(&osb->node_map_lock);
-       return ret;
-}
-
-#if 0
-
-static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
-                                struct ocfs2_node_map *from)
-{
-       BUG_ON(from->num_nodes == 0);
-       ocfs2_node_map_init(target);
-       __ocfs2_node_map_set(target, from);
-}
-
-/* returns 1 if bit is the only bit set in target, 0 otherwise */
-int ocfs2_node_map_is_only(struct ocfs2_super *osb,
-                          struct ocfs2_node_map *target,
-                          int bit)
-{
-       struct ocfs2_node_map temp;
-       int ret;
-
-       spin_lock(&osb->node_map_lock);
-       __ocfs2_node_map_dup(&temp, target);
-       __ocfs2_node_map_clear_bit(&temp, bit);
-       ret = __ocfs2_node_map_is_empty(&temp);
-       spin_unlock(&osb->node_map_lock);
-
-       return ret;
-}
-
-static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
-                                struct ocfs2_node_map *from)
-{
-       int num_longs, i;
-
-       BUG_ON(target->num_nodes != from->num_nodes);
-       BUG_ON(target->num_nodes == 0);
-
-       num_longs = BITS_TO_LONGS(target->num_nodes);
-       for (i = 0; i < num_longs; i++)
-               target->map[i] = from->map[i];
-}
-
-#endif  /*  0  */
-
-/* Returns whether the recovery bit was actually set - it may not be
- * if a node is still marked as needing recovery */
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
-                          int num)
-{
-       int set = 0;
-
-       spin_lock(&osb->node_map_lock);
-
-       if (!test_bit(num, osb->recovery_map.map)) {
-           __ocfs2_node_map_set_bit(&osb->recovery_map, num);
-           set = 1;
-       }
-
-       spin_unlock(&osb->node_map_lock);
-
-       return set;
-}
-
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
-                             int num)
-{
-       ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
-}
-
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
-                          struct ocfs2_node_map *map,
-                          int idx)
-{
-       int i = idx;
-
-       idx = O2NM_INVALID_NODE_NUM;
-       spin_lock(&osb->node_map_lock);
-       if ((i != O2NM_INVALID_NODE_NUM) &&
-           (i >= 0) &&
-           (i < map->num_nodes)) {
-               while(i < map->num_nodes) {
-                       if (test_bit(i, map->map)) {
-                               idx = i;
-                               break;
-                       }
-                       i++;
-               }
-       }
-       spin_unlock(&osb->node_map_lock);
-       return idx;
-}
index eac63aed7611c2105d5026c58affea7f2b562ed4..74b9c5dda28d307a7c0d571f7ae5f3f173730716 100644 (file)
 
 void ocfs2_init_node_maps(struct ocfs2_super *osb);
 
-void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
-void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
+void ocfs2_do_node_down(int node_num, void *data);
 
 /* node map functions - used to keep track of mounted and in-recovery
  * nodes. */
-int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
-                           struct ocfs2_node_map *map);
 void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
                            struct ocfs2_node_map *map,
                            int bit);
@@ -44,17 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
 int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
                            struct ocfs2_node_map *map,
                            int bit);
-int ocfs2_node_map_iterate(struct ocfs2_super *osb,
-                          struct ocfs2_node_map *map,
-                          int idx);
-static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
-                                              struct ocfs2_node_map *map)
-{
-       return ocfs2_node_map_iterate(osb, map, 0);
-}
-int ocfs2_recovery_map_set(struct ocfs2_super *osb,
-                          int num);
-void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
-                             int num);
 
 #endif /* OCFS2_HEARTBEAT_H */
index 5177fba5162b55616c77d8a1a2db7b5c28842297..b413166dd16340c0a159abbbf24efdd71d69ad2c 100644 (file)
@@ -7,6 +7,7 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/smp_lock.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -112,9 +113,9 @@ bail:
        return status;
 }
 
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
-       unsigned int cmd, unsigned long arg)
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+       struct inode *inode = filp->f_path.dentry->d_inode;
        unsigned int flags;
        int new_clusters;
        int status;
@@ -168,9 +169,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
 #ifdef CONFIG_COMPAT
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-       struct inode *inode = file->f_path.dentry->d_inode;
-       int ret;
-
        switch (cmd) {
        case OCFS2_IOC32_GETFLAGS:
                cmd = OCFS2_IOC_GETFLAGS;
@@ -190,9 +188,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                return -ENOIOCTLCMD;
        }
 
-       lock_kernel();
-       ret = ocfs2_ioctl(inode, file, cmd, arg);
-       unlock_kernel();
-       return ret;
+       return ocfs2_ioctl(file, cmd, arg);
 }
 #endif
index 4d6c4f430d0d14e174bf89c490128014421c14f4..cf9a5ee30fefb718369e83b761bbce9bafe3dc90 100644 (file)
@@ -10,8 +10,7 @@
 #ifndef OCFS2_IOCTL_H
 #define OCFS2_IOCTL_H
 
-int ocfs2_ioctl(struct inode * inode, struct file * filp,
-       unsigned int cmd, unsigned long arg);
+long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
 #endif /* OCFS2_IOCTL_H */
index f31c7e8c19c32bf091117a8fbd114b69c200f7ae..9698338adc3915e66a3db0eda77268e33b51f607 100644 (file)
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                 int slot);
 static int ocfs2_commit_thread(void *arg);
 
+
+/*
+ * The recovery map is a simple array of node numbers to recover.
+ * It is protected by osb->osb_lock.
+ */
+
+struct ocfs2_recovery_map {
+       unsigned int rm_used;
+       unsigned int *rm_entries;
+};
+
+int ocfs2_recovery_init(struct ocfs2_super *osb)
+{
+       struct ocfs2_recovery_map *rm;
+
+       mutex_init(&osb->recovery_lock);
+       osb->disable_recovery = 0;
+       osb->recovery_thread_task = NULL;
+       init_waitqueue_head(&osb->recovery_event);
+
+       rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
+                    osb->max_slots * sizeof(unsigned int),
+                    GFP_KERNEL);
+       if (!rm) {
+               mlog_errno(-ENOMEM);
+               return -ENOMEM;
+       }
+
+       rm->rm_entries = (unsigned int *)((char *)rm +
+                                         sizeof(struct ocfs2_recovery_map));
+       osb->recovery_map = rm;
+
+       return 0;
+}
+
+/* we can't grab the goofy sem lock from inside wait_event, so we use
+ * memory barriers to make sure that we'll see the null task before
+ * being woken up */
+static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
+{
+       mb();
+       return osb->recovery_thread_task != NULL;
+}
+
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+       struct ocfs2_recovery_map *rm;
+
+       /* disable any new recovery threads and wait for any currently
+        * running ones to exit. Do this before setting the vol_state. */
+       mutex_lock(&osb->recovery_lock);
+       osb->disable_recovery = 1;
+       mutex_unlock(&osb->recovery_lock);
+       wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
+
+       /* At this point, we know that no more recovery threads can be
+        * launched, so wait for any recovery completion work to
+        * complete. */
+       flush_workqueue(ocfs2_wq);
+
+       /*
+        * Now that recovery is shut down, and the osb is about to be
+        * freed,  the osb_lock is not taken here.
+        */
+       rm = osb->recovery_map;
+       /* XXX: Should we bug if there are dirty entries? */
+
+       kfree(rm);
+}
+
+static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
+                                    unsigned int node_num)
+{
+       int i;
+       struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+       assert_spin_locked(&osb->osb_lock);
+
+       for (i = 0; i < rm->rm_used; i++) {
+               if (rm->rm_entries[i] == node_num)
+                       return 1;
+       }
+
+       return 0;
+}
+
+/* Behaves like test-and-set.  Returns the previous value */
+static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
+                                 unsigned int node_num)
+{
+       struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+       spin_lock(&osb->osb_lock);
+       if (__ocfs2_recovery_map_test(osb, node_num)) {
+               spin_unlock(&osb->osb_lock);
+               return 1;
+       }
+
+       /* XXX: Can this be exploited? Not from o2dlm... */
+       BUG_ON(rm->rm_used >= osb->max_slots);
+
+       rm->rm_entries[rm->rm_used] = node_num;
+       rm->rm_used++;
+       spin_unlock(&osb->osb_lock);
+
+       return 0;
+}
+
+static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
+                                    unsigned int node_num)
+{
+       int i;
+       struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+       spin_lock(&osb->osb_lock);
+
+       for (i = 0; i < rm->rm_used; i++) {
+               if (rm->rm_entries[i] == node_num)
+                       break;
+       }
+
+       if (i < rm->rm_used) {
+               /* XXX: be careful with the pointer math */
+               memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
+                       (rm->rm_used - i - 1) * sizeof(unsigned int));
+               rm->rm_used--;
+       }
+
+       spin_unlock(&osb->osb_lock);
+}
+
 static int ocfs2_commit_cache(struct ocfs2_super *osb)
 {
        int status = 0;
@@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
 
        mlog_entry_void();
 
-       if (!journal)
-               BUG();
+       BUG_ON(!journal);
 
        osb = journal->j_osb;
 
@@ -650,6 +780,23 @@ bail:
        return status;
 }
 
+static int ocfs2_recovery_completed(struct ocfs2_super *osb)
+{
+       int empty;
+       struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+       spin_lock(&osb->osb_lock);
+       empty = (rm->rm_used == 0);
+       spin_unlock(&osb->osb_lock);
+
+       return empty;
+}
+
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
+{
+       wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
+}
+
 /*
  * JBD Might read a cached version of another nodes journal file. We
  * don't want this as this file changes often and we get no
@@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg)
 {
        int status, node_num;
        struct ocfs2_super *osb = arg;
+       struct ocfs2_recovery_map *rm = osb->recovery_map;
 
        mlog_entry_void();
 
@@ -863,26 +1011,29 @@ restart:
                goto bail;
        }
 
-       while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
-               node_num = ocfs2_node_map_first_set_bit(osb,
-                                                       &osb->recovery_map);
-               if (node_num == O2NM_INVALID_NODE_NUM) {
-                       mlog(0, "Out of nodes to recover.\n");
-                       break;
-               }
+       spin_lock(&osb->osb_lock);
+       while (rm->rm_used) {
+               /* It's always safe to remove entry zero, as we won't
+                * clear it until ocfs2_recover_node() has succeeded. */
+               node_num = rm->rm_entries[0];
+               spin_unlock(&osb->osb_lock);
 
                status = ocfs2_recover_node(osb, node_num);
-               if (status < 0) {
+               if (!status) {
+                       ocfs2_recovery_map_clear(osb, node_num);
+               } else {
                        mlog(ML_ERROR,
                             "Error %d recovering node %d on device (%u,%u)!\n",
                             status, node_num,
                             MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
                        mlog(ML_ERROR, "Volume requires unmount.\n");
-                       continue;
                }
 
-               ocfs2_recovery_map_clear(osb, node_num);
+               spin_lock(&osb->osb_lock);
        }
+       spin_unlock(&osb->osb_lock);
+       mlog(0, "All nodes recovered\n");
+
        ocfs2_super_unlock(osb, 1);
 
        /* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1044,7 @@ restart:
 
 bail:
        mutex_lock(&osb->recovery_lock);
-       if (!status &&
-           !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
+       if (!status && !ocfs2_recovery_completed(osb)) {
                mutex_unlock(&osb->recovery_lock);
                goto restart;
        }
@@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 
        /* People waiting on recovery will wait on
         * the recovery map to empty. */
-       if (!ocfs2_recovery_map_set(osb, node_num))
-               mlog(0, "node %d already be in recovery.\n", node_num);
+       if (ocfs2_recovery_map_set(osb, node_num))
+               mlog(0, "node %d already in recovery map.\n", node_num);
 
        mlog(0, "starting recovery thread...\n");
 
@@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 {
        int status = 0;
        int slot_num;
-       struct ocfs2_slot_info *si = osb->slot_info;
        struct ocfs2_dinode *la_copy = NULL;
        struct ocfs2_dinode *tl_copy = NULL;
 
@@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
         * case we should've called ocfs2_journal_load instead. */
        BUG_ON(osb->node_num == node_num);
 
-       slot_num = ocfs2_node_num_to_slot(si, node_num);
-       if (slot_num == OCFS2_INVALID_SLOT) {
+       slot_num = ocfs2_node_num_to_slot(osb, node_num);
+       if (slot_num == -ENOENT) {
                status = 0;
                mlog(0, "no slot for this node, so no recovery required.\n");
                goto done;
@@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
        /* Likewise, this would be a strange but ultimately not so
         * harmful place to get an error... */
-       ocfs2_clear_slot(si, slot_num);
-       status = ocfs2_update_disk_slots(osb, si);
+       status = ocfs2_clear_slot(osb, slot_num);
        if (status < 0)
                mlog_errno(status);
 
@@ -1184,23 +1332,24 @@ bail:
  * slot info struct has been updated from disk. */
 int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 {
-       int status, i, node_num;
-       struct ocfs2_slot_info *si = osb->slot_info;
+       unsigned int node_num;
+       int status, i;
 
        /* This is called with the super block cluster lock, so we
         * know that the slot map can't change underneath us. */
 
-       spin_lock(&si->si_lock);
-       for(i = 0; i < si->si_num_slots; i++) {
+       spin_lock(&osb->osb_lock);
+       for (i = 0; i < osb->max_slots; i++) {
                if (i == osb->slot_num)
                        continue;
-               if (ocfs2_is_empty_slot(si, i))
+
+               status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
+               if (status == -ENOENT)
                        continue;
 
-               node_num = si->si_global_node_nums[i];
-               if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
+               if (__ocfs2_recovery_map_test(osb, node_num))
                        continue;
-               spin_unlock(&si->si_lock);
+               spin_unlock(&osb->osb_lock);
 
                /* Ok, we have a slot occupied by another node which
                 * is not in the recovery map. We trylock his journal
@@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
                        goto bail;
                }
 
-               spin_lock(&si->si_lock);
+               spin_lock(&osb->osb_lock);
        }
-       spin_unlock(&si->si_lock);
+       spin_unlock(&osb->osb_lock);
 
        status = 0;
 bail:
index 220f3e818e78c4148db49bb6dea4fc5e22ca0728..db82be2532ed5ef333c8a58a14e19cfbd3bbd6aa 100644 (file)
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
 
 /* Exported only for the journal struct init code in super.c. Do not call. */
 void ocfs2_complete_recovery(struct work_struct *work);
+void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
+
+int ocfs2_recovery_init(struct ocfs2_super *osb);
+void ocfs2_recovery_exit(struct ocfs2_super *osb);
 
 /*
  *  Journal Control:
index ab83fd5624294561541dba9663f128336f296116..ce0dc147602acdb409c339b0812027fa985731c2 100644 (file)
@@ -447,6 +447,8 @@ out_mutex:
        iput(main_bm_inode);
 
 out:
+       if (!status)
+               ocfs2_init_inode_steal_slot(osb);
        mlog_exit(status);
        return status;
 }
@@ -523,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
        }
 
        ac->ac_inode = local_alloc_inode;
+       /* We should never use localalloc from another slot */
+       ac->ac_alloc_slot = osb->slot_num;
        ac->ac_which = OCFS2_AC_USE_LOCAL;
        get_bh(osb->local_alloc_bh);
        ac->ac_bh = osb->local_alloc_bh;
index ae9ad9587516641366ddcb5753b61a120d9b137b..d5d808fe0140f7fb1e590c0255dd045542219bd1 100644 (file)
@@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
        fe->i_blkno = cpu_to_le64(fe_blkno);
        fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
-       fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
+       fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
        fe->i_uid = cpu_to_le32(current->fsuid);
        if (dir->i_mode & S_ISGID) {
                fe->i_gid = cpu_to_le32(dir->i_gid);
@@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir,
         *
         * And that's why, just like the VFS, we need a file system
         * rename lock. */
-       if (old_dentry != new_dentry) {
+       if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
                status = ocfs2_rename_lock(osb);
                if (status < 0) {
                        mlog_errno(status);
index 6546cef212e3d1a14ca1797127d789c4cfc53eec..31692379c17059154640ed2bd9ed1e09b0d0ff03 100644 (file)
 #include <linux/mutex.h>
 #include <linux/jbd.h>
 
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlm/dlmapi.h"
+/* For union ocfs2_dlm_lksb */
+#include "stackglue.h"
 
 #include "ocfs2_fs.h"
 #include "ocfs2_lockid.h"
@@ -101,6 +98,9 @@ enum ocfs2_unlock_action {
                                               * dropped. */
 #define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
 #define OCFS2_LOCK_NOCACHE       (0x00000200) /* don't use a holder count */
+#define OCFS2_LOCK_PENDING       (0x00000400) /* This lockres is pending a
+                                                call to dlm_lock.  Only
+                                                exists with BUSY set. */
 
 struct ocfs2_lock_res_ops;
 
@@ -120,13 +120,14 @@ struct ocfs2_lock_res {
        int                      l_level;
        unsigned int             l_ro_holders;
        unsigned int             l_ex_holders;
-       struct dlm_lockstatus    l_lksb;
+       union ocfs2_dlm_lksb     l_lksb;
 
        /* used from AST/BAST funcs. */
        enum ocfs2_ast_action    l_action;
        enum ocfs2_unlock_action l_unlock_action;
        int                      l_requested;
        int                      l_blocking;
+       unsigned int             l_pending_gen;
 
        wait_queue_head_t        l_event;
 
@@ -179,6 +180,8 @@ enum ocfs2_mount_options
 #define OCFS2_DEFAULT_ATIME_QUANTUM    60
 
 struct ocfs2_journal;
+struct ocfs2_slot_info;
+struct ocfs2_recovery_map;
 struct ocfs2_super
 {
        struct task_struct *commit_task;
@@ -190,7 +193,6 @@ struct ocfs2_super
        struct ocfs2_slot_info *slot_info;
 
        spinlock_t node_map_lock;
-       struct ocfs2_node_map recovery_map;
 
        u64 root_blkno;
        u64 system_dir_blkno;
@@ -206,25 +208,29 @@ struct ocfs2_super
        u32 s_feature_incompat;
        u32 s_feature_ro_compat;
 
-       /* Protects s_next_generaion, osb_flags. Could protect more on
-        * osb as it's very short lived. */
+       /* Protects s_next_generation, osb_flags and s_inode_steal_slot.
+        * Could protect more on osb as it's very short lived.
+        */
        spinlock_t osb_lock;
        u32 s_next_generation;
        unsigned long osb_flags;
+       s16 s_inode_steal_slot;
+       atomic_t s_num_inodes_stolen;
 
        unsigned long s_mount_opt;
        unsigned int s_atime_quantum;
 
-       u16 max_slots;
-       s16 node_num;
-       s16 slot_num;
-       s16 preferred_slot;
+       unsigned int max_slots;
+       unsigned int node_num;
+       int slot_num;
+       int preferred_slot;
        int s_sectsize_bits;
        int s_clustersize;
        int s_clustersize_bits;
 
        atomic_t vol_state;
        struct mutex recovery_lock;
+       struct ocfs2_recovery_map *recovery_map;
        struct task_struct *recovery_thread_task;
        int disable_recovery;
        wait_queue_head_t checkpoint_event;
@@ -245,12 +251,11 @@ struct ocfs2_super
        struct ocfs2_alloc_stats alloc_stats;
        char dev_str[20];               /* "major,minor" of the device */
 
-       struct dlm_ctxt *dlm;
+       char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+       struct ocfs2_cluster_connection *cconn;
        struct ocfs2_lock_res osb_super_lockres;
        struct ocfs2_lock_res osb_rename_lockres;
-       struct dlm_eviction_cb osb_eviction_cb;
        struct ocfs2_dlm_debug *osb_dlm_debug;
-       struct dlm_protocol_version osb_locking_proto;
 
        struct dentry *osb_debug_root;
 
@@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
        return ret;
 }
 
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+       return (osb->s_feature_incompat &
+               OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+}
+
 static inline int ocfs2_mount_local(struct ocfs2_super *osb)
 {
        return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
 }
 
+static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
+{
+       return (osb->s_feature_incompat &
+               OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
+}
+
+
 #define OCFS2_IS_VALID_DINODE(ptr)                                     \
        (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
 
@@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
        return pages_per_cluster;
 }
 
+static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+       spin_lock(&osb->osb_lock);
+       osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+       spin_unlock(&osb->osb_lock);
+       atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+
+static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
+                                             s16 slot)
+{
+       spin_lock(&osb->osb_lock);
+       osb->s_inode_steal_slot = slot;
+       spin_unlock(&osb->osb_lock);
+}
+
+static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+       s16 slot;
+
+       spin_lock(&osb->osb_lock);
+       slot = osb->s_inode_steal_slot;
+       spin_unlock(&osb->osb_lock);
+
+       return slot;
+}
+
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
index 3633edd3982f731a1c1c67bf1cdab945eea0689e..52c426665154312cb0d7c812bed5f0b928a8e62d 100644 (file)
@@ -88,7 +88,9 @@
 #define OCFS2_FEATURE_COMPAT_SUPP      OCFS2_FEATURE_COMPAT_BACKUP_SB
 #define OCFS2_FEATURE_INCOMPAT_SUPP    (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
                                         | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
-                                        | OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
+                                        | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
+                                        | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
+                                        | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP   OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 
 /*
 /* Support for data packed into inode blocks */
 #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA     0x0040
 
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+
+/*
+ * Support for alternate, userspace cluster stacks.  If set, the superblock
+ * field s_cluster_info contains a tag for the alternate stack in use as
+ * well as the name of the cluster being joined.
+ * mount.ocfs2 must pass in a matching stack name.
+ *
+ * If not set, the classic stack will be used.  This is compatible with
+ * all older versions.
+ */
+#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -267,6 +284,10 @@ struct ocfs2_new_group_input {
 #define OCFS2_VOL_UUID_LEN             16
 #define OCFS2_MAX_VOL_LABEL_LEN                64
 
+/* The alternate, userspace stack fields */
+#define OCFS2_STACK_LABEL_LEN          4
+#define OCFS2_CLUSTER_NAME_LEN         16
+
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE         (4 * 1024 * 1024)
 
@@ -474,6 +495,47 @@ struct ocfs2_extent_block
 /* Actual on-disk size is one block */
 };
 
+/*
+ * On disk slot map for OCFS2.  This defines the contents of the "slot_map"
+ * system file.  A slot is valid if it contains a node number >= 0.  The
+ * value -1 (0xFFFF) is OCFS2_INVALID_SLOT.  This marks a slot empty.
+ */
+struct ocfs2_slot_map {
+/*00*/ __le16 sm_slots[0];
+/*
+ * Actual on-disk size is one block.  OCFS2_MAX_SLOTS is 255,
+ * 255 * sizeof(__le16) == 510B, within the 512B minimum blocksize.
+ */
+};
+
+struct ocfs2_extended_slot {
+/*00*/ __u8    es_valid;
+       __u8    es_reserved1[3];
+       __le32  es_node_num;
+/*08*/
+};
+
+/*
+ * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
+ * is set.  It separates out the valid marker from the node number, and
+ * has room to grow.  Unlike the old slot map, this format is defined by
+ * i_size.
+ */
+struct ocfs2_slot_map_extended {
+/*00*/ struct ocfs2_extended_slot se_slots[0];
+/*
+ * Actual size is i_size of the slot_map system file.  It should
+ * match s_max_slots * sizeof(struct ocfs2_extended_slot)
+ */
+};
+
+struct ocfs2_cluster_info {
+/*00*/ __u8   ci_stack[OCFS2_STACK_LABEL_LEN];
+       __le32 ci_reserved;
+/*08*/ __u8   ci_cluster[OCFS2_CLUSTER_NAME_LEN];
+/*18*/
+};
+
 /*
  * On disk superblock for OCFS2
  * Note that it is contained inside an ocfs2_dinode, so all offsets
@@ -506,7 +568,20 @@ struct ocfs2_super_block {
                                         * group header */
 /*50*/ __u8  s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
 /*90*/ __u8  s_uuid[OCFS2_VOL_UUID_LEN];       /* 128-bit uuid */
-/*A0*/
+/*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
+                                                    stack.  Only valid
+                                                    with INCOMPAT flag. */
+/*B8*/  __le64 s_reserved2[17];                /* Fill out superblock */
+/*140*/
+
+       /*
+        * NOTE: As stated above, all offsets are relative to
+        * ocfs2_dinode.id2, which is at 0xC0 in the inode.
+        * 0xC0 + 0x140 = 0x200 or 512 bytes.  A superblock must fit within
+        * our smallest blocksize, which is 512 bytes.  To ensure this,
+        * we reserve the space in s_reserved2.  Anything past s_reserved2
+        * will not be available on the smallest blocksize.
+        */
 };
 
 /*
index 86f3e3799c2b00d91f5c456792b3c73ed791be51..82c200f7a8f1e327b34b065770c6758892ea70ba 100644 (file)
@@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = {
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
 {
 #ifdef __KERNEL__
-       mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
+       BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
 #endif
        return ocfs2_lock_type_strings[type];
 }
index 3a50ce555e641f339c72b820a52eae07b047343b..bb5ff8939bf1ef28a76dfbda74715ea05a4828f7 100644 (file)
 
 #include "buffer_head_io.h"
 
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-                                   s16 global);
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
-                             s16 slot_num,
-                             s16 node_num);
-
-/* post the slot information on disk into our slot_info struct. */
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+
+struct ocfs2_slot {
+       int sl_valid;
+       unsigned int sl_node_num;
+};
+
+struct ocfs2_slot_info {
+       int si_extended;
+       int si_slots_per_block;
+       struct inode *si_inode;
+       unsigned int si_blocks;
+       struct buffer_head **si_bh;
+       unsigned int si_num_slots;
+       struct ocfs2_slot *si_slots;
+};
+
+
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+                                   unsigned int node_num);
+
+static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
+                                 int slot_num)
+{
+       BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+       si->si_slots[slot_num].sl_valid = 0;
+}
+
+static void ocfs2_set_slot(struct ocfs2_slot_info *si,
+                          int slot_num, unsigned int node_num)
+{
+       BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
+
+       si->si_slots[slot_num].sl_valid = 1;
+       si->si_slots[slot_num].sl_node_num = node_num;
+}
+
+/* This version is for the extended slot map */
+static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
+{
+       int b, i, slotno;
+       struct ocfs2_slot_map_extended *se;
+
+       slotno = 0;
+       for (b = 0; b < si->si_blocks; b++) {
+               se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
+               for (i = 0;
+                    (i < si->si_slots_per_block) &&
+                    (slotno < si->si_num_slots);
+                    i++, slotno++) {
+                       if (se->se_slots[i].es_valid)
+                               ocfs2_set_slot(si, slotno,
+                                              le32_to_cpu(se->se_slots[i].es_node_num));
+                       else
+                               ocfs2_invalidate_slot(si, slotno);
+               }
+       }
+}
+
+/*
+ * Post the slot information on disk into our slot_info struct.
+ * Must be protected by osb_lock.
+ */
+static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
 {
        int i;
-       __le16 *disk_info;
+       struct ocfs2_slot_map *sm;
 
-       /* we don't read the slot block here as ocfs2_super_lock
-        * should've made sure we have the most recent copy. */
-       spin_lock(&si->si_lock);
-       disk_info = (__le16 *) si->si_bh->b_data;
+       sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
 
-       for (i = 0; i < si->si_size; i++)
-               si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
+       for (i = 0; i < si->si_num_slots; i++) {
+               if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
+                       ocfs2_invalidate_slot(si, i);
+               else
+                       ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
+       }
+}
 
-       spin_unlock(&si->si_lock);
+static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
+{
+       /*
+        * The slot data will have been refreshed when ocfs2_super_lock
+        * was taken.
+        */
+       if (si->si_extended)
+               ocfs2_update_slot_info_extended(si);
+       else
+               ocfs2_update_slot_info_old(si);
+}
+
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
+{
+       int ret;
+       struct ocfs2_slot_info *si = osb->slot_info;
+
+       if (si == NULL)
+               return 0;
+
+       BUG_ON(si->si_blocks == 0);
+       BUG_ON(si->si_bh == NULL);
+
+       mlog(0, "Refreshing slot map, reading %u block(s)\n",
+            si->si_blocks);
+
+       /*
+        * We pass -1 as blocknr because we expect all of si->si_bh to
+        * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
+        * this is not true, the read of -1 (UINT64_MAX) will fail.
+        */
+       ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
+                               si->si_inode);
+       if (ret == 0) {
+               spin_lock(&osb->osb_lock);
+               ocfs2_update_slot_info(si);
+               spin_unlock(&osb->osb_lock);
+       }
+
+       return ret;
 }
 
 /* post the our slot info stuff into it's destination bh and write it
  * out. */
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
-                           struct ocfs2_slot_info *si)
+static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
+                                           int slot_num,
+                                           struct buffer_head **bh)
 {
-       int status, i;
-       __le16 *disk_info = (__le16 *) si->si_bh->b_data;
+       int blkind = slot_num / si->si_slots_per_block;
+       int slotno = slot_num % si->si_slots_per_block;
+       struct ocfs2_slot_map_extended *se;
+
+       BUG_ON(blkind >= si->si_blocks);
+
+       se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
+       se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
+       if (si->si_slots[slot_num].sl_valid)
+               se->se_slots[slotno].es_node_num =
+                       cpu_to_le32(si->si_slots[slot_num].sl_node_num);
+       *bh = si->si_bh[blkind];
+}
 
-       spin_lock(&si->si_lock);
-       for (i = 0; i < si->si_size; i++)
-               disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
-       spin_unlock(&si->si_lock);
+static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
+                                      int slot_num,
+                                      struct buffer_head **bh)
+{
+       int i;
+       struct ocfs2_slot_map *sm;
+
+       sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
+       for (i = 0; i < si->si_num_slots; i++) {
+               if (si->si_slots[i].sl_valid)
+                       sm->sm_slots[i] =
+                               cpu_to_le16(si->si_slots[i].sl_node_num);
+               else
+                       sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
+       }
+       *bh = si->si_bh[0];
+}
+
+static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
+                                 struct ocfs2_slot_info *si,
+                                 int slot_num)
+{
+       int status;
+       struct buffer_head *bh;
+
+       spin_lock(&osb->osb_lock);
+       if (si->si_extended)
+               ocfs2_update_disk_slot_extended(si, slot_num, &bh);
+       else
+               ocfs2_update_disk_slot_old(si, slot_num, &bh);
+       spin_unlock(&osb->osb_lock);
 
-       status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
+       status = ocfs2_write_block(osb, bh, si->si_inode);
        if (status < 0)
                mlog_errno(status);
 
        return status;
 }
 
-/* try to find global node in the slot info. Returns
- * OCFS2_INVALID_SLOT if nothing is found. */
-static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-                                   s16 global)
+/*
+ * Calculate how many bytes are needed by the slot map.  Returns
+ * an error if the slot map file is too small.
+ */
+static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
+                                       struct inode *inode,
+                                       unsigned long long *bytes)
 {
-       int i;
-       s16 ret = OCFS2_INVALID_SLOT;
+       unsigned long long bytes_needed;
+
+       if (ocfs2_uses_extended_slot_map(osb)) {
+               bytes_needed = osb->max_slots *
+                       sizeof(struct ocfs2_extended_slot);
+       } else {
+               bytes_needed = osb->max_slots * sizeof(__le16);
+       }
+       if (bytes_needed > i_size_read(inode)) {
+               mlog(ML_ERROR,
+                    "Slot map file is too small!  (size %llu, needed %llu)\n",
+                    i_size_read(inode), bytes_needed);
+               return -ENOSPC;
+       }
+
+       *bytes = bytes_needed;
+       return 0;
+}
+
+/* try to find global node in the slot info. Returns -ENOENT
+ * if nothing is found. */
+static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
+                                   unsigned int node_num)
+{
+       int i, ret = -ENOENT;
 
        for(i = 0; i < si->si_num_slots; i++) {
-               if (global == si->si_global_node_nums[i]) {
-                       ret = (s16) i;
+               if (si->si_slots[i].sl_valid &&
+                   (node_num == si->si_slots[i].sl_node_num)) {
+                       ret = i;
                        break;
                }
        }
+
        return ret;
 }
 
-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
+static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
+                                  int preferred)
 {
-       int i;
-       s16 ret = OCFS2_INVALID_SLOT;
+       int i, ret = -ENOSPC;
 
-       if (preferred >= 0 && preferred < si->si_num_slots) {
-               if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
+       if ((preferred >= 0) && (preferred < si->si_num_slots)) {
+               if (!si->si_slots[preferred].sl_valid) {
                        ret = preferred;
                        goto out;
                }
        }
 
        for(i = 0; i < si->si_num_slots; i++) {
-               if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
-                       ret = (s16) i;
+               if (!si->si_slots[i].sl_valid) {
+                       ret = i;
                        break;
                }
        }
@@ -124,58 +287,155 @@ out:
        return ret;
 }
 
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-                          s16 global)
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
 {
-       s16 ret;
+       int slot;
+       struct ocfs2_slot_info *si = osb->slot_info;
 
-       spin_lock(&si->si_lock);
-       ret = __ocfs2_node_num_to_slot(si, global);
-       spin_unlock(&si->si_lock);
-       return ret;
+       spin_lock(&osb->osb_lock);
+       slot = __ocfs2_node_num_to_slot(si, node_num);
+       spin_unlock(&osb->osb_lock);
+
+       return slot;
+}
+
+/*
+ * Look up which node currently owns slot_num.
+ *
+ * The caller must hold osb->osb_lock.  On success the owning node number
+ * is stored in *node_num and 0 is returned.  Returns -ENOENT if the slot
+ * is not marked valid.  A slot_num outside 0..max_slots-1 is a bug.
+ */
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+                                 unsigned int *node_num)
+{
+       struct ocfs2_slot_info *si = osb->slot_info;
+
+       assert_spin_locked(&osb->osb_lock);
+
+       BUG_ON(slot_num < 0);
+       /* si_slots holds exactly max_slots entries, so max_slots itself
+        * is already one past the end. */
+       BUG_ON(slot_num >= osb->max_slots);
+
+       if (!si->si_slots[slot_num].sl_valid)
+               return -ENOENT;
+
+       *node_num = si->si_slots[slot_num].sl_node_num;
+       return 0;
 }
 
-static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
-                             s16 slot_num,
-                             s16 node_num)
+static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
 {
-       BUG_ON(slot_num == OCFS2_INVALID_SLOT);
-       BUG_ON(slot_num >= si->si_num_slots);
-       BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
-              (node_num >= O2NM_MAX_NODES));
+       unsigned int i;
+
+       if (si == NULL)
+               return;
+
+       if (si->si_inode)
+               iput(si->si_inode);
+       if (si->si_bh) {
+               for (i = 0; i < si->si_blocks; i++) {
+                       if (si->si_bh[i]) {
+                               brelse(si->si_bh[i]);
+                               si->si_bh[i] = NULL;
+                       }
+               }
+               kfree(si->si_bh);
+       }
 
-       si->si_global_node_nums[slot_num] = node_num;
+       kfree(si);
 }
 
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
-                     s16 slot_num)
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
 {
-       spin_lock(&si->si_lock);
-       __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
-       spin_unlock(&si->si_lock);
+       struct ocfs2_slot_info *si = osb->slot_info;
+
+       if (si == NULL)
+               return 0;
+
+       spin_lock(&osb->osb_lock);
+       ocfs2_invalidate_slot(si, slot_num);
+       spin_unlock(&osb->osb_lock);
+
+       return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
 }
 
-int ocfs2_init_slot_info(struct ocfs2_super *osb)
+static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
+                                 struct ocfs2_slot_info *si)
 {
-       int status, i;
+       int status = 0;
        u64 blkno;
+       unsigned long long blocks, bytes;
+       unsigned int i;
+       struct buffer_head *bh;
+
+       status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
+       if (status)
+               goto bail;
+
+       blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
+       BUG_ON(blocks > UINT_MAX);
+       si->si_blocks = blocks;
+       if (!si->si_blocks)
+               goto bail;
+
+       if (si->si_extended)
+               si->si_slots_per_block =
+                       (osb->sb->s_blocksize /
+                        sizeof(struct ocfs2_extended_slot));
+       else
+               si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
+
+       /* The size checks above should ensure this */
+       BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
+
+       mlog(0, "Slot map needs %u buffers for %llu bytes\n",
+            si->si_blocks, bytes);
+
+       si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
+                           GFP_KERNEL);
+       if (!si->si_bh) {
+               status = -ENOMEM;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       for (i = 0; i < si->si_blocks; i++) {
+               status = ocfs2_extent_map_get_blocks(si->si_inode, i,
+                                                    &blkno, NULL, NULL);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               mlog(0, "Reading slot map block %u at %llu\n", i,
+                    (unsigned long long)blkno);
+
+               bh = NULL;  /* Acquire a fresh bh */
+               status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               si->si_bh[i] = bh;
+       }
+
+bail:
+       return status;
+}
+
+int ocfs2_init_slot_info(struct ocfs2_super *osb)
+{
+       int status;
        struct inode *inode = NULL;
-       struct buffer_head *bh = NULL;
        struct ocfs2_slot_info *si;
 
-       si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL);
+       si = kzalloc(sizeof(struct ocfs2_slot_info) +
+                    (sizeof(struct ocfs2_slot) * osb->max_slots),
+                    GFP_KERNEL);
        if (!si) {
                status = -ENOMEM;
                mlog_errno(status);
                goto bail;
        }
 
-       spin_lock_init(&si->si_lock);
+       si->si_extended = ocfs2_uses_extended_slot_map(osb);
        si->si_num_slots = osb->max_slots;
-       si->si_size = OCFS2_MAX_SLOTS;
-
-       for(i = 0; i < si->si_num_slots; i++)
-               si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
+       si->si_slots = (struct ocfs2_slot *)((char *)si +
+                                            sizeof(struct ocfs2_slot_info));
 
        inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
                                            OCFS2_INVALID_SLOT);
@@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
                goto bail;
        }
 
-       status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
-
-       status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
+       si->si_inode = inode;
+       status = ocfs2_map_slot_buffers(osb, si);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
 
-       si->si_inode = inode;
-       si->si_bh = bh;
-       osb->slot_info = si;
+       osb->slot_info = (struct ocfs2_slot_info *)si;
 bail:
        if (status < 0 && si)
-               ocfs2_free_slot_info(si);
+               __ocfs2_free_slot_info(si);
 
        return status;
 }
 
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
+void ocfs2_free_slot_info(struct ocfs2_super *osb)
 {
-       if (si->si_inode)
-               iput(si->si_inode);
-       if (si->si_bh)
-               brelse(si->si_bh);
-       kfree(si);
+       struct ocfs2_slot_info *si = osb->slot_info;
+
+       osb->slot_info = NULL;
+       __ocfs2_free_slot_info(si);
 }
 
 int ocfs2_find_slot(struct ocfs2_super *osb)
 {
        int status;
-       s16 slot;
+       int slot;
        struct ocfs2_slot_info *si;
 
        mlog_entry_void();
 
        si = osb->slot_info;
 
+       spin_lock(&osb->osb_lock);
        ocfs2_update_slot_info(si);
 
-       spin_lock(&si->si_lock);
        /* search for ourselves first and take the slot if it already
         * exists. Perhaps we need to mark this in a variable for our
         * own journal recovery? Possibly not, though we certainly
         * need to warn to the user */
        slot = __ocfs2_node_num_to_slot(si, osb->node_num);
-       if (slot == OCFS2_INVALID_SLOT) {
+       if (slot < 0) {
                /* if no slot yet, then just take 1st available
                 * one. */
                slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
-               if (slot == OCFS2_INVALID_SLOT) {
-                       spin_unlock(&si->si_lock);
+               if (slot < 0) {
+                       spin_unlock(&osb->osb_lock);
                        mlog(ML_ERROR, "no free slots available!\n");
                        status = -EINVAL;
                        goto bail;
@@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
                mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
                     slot);
 
-       __ocfs2_fill_slot(si, slot, osb->node_num);
+       ocfs2_set_slot(si, slot, osb->node_num);
        osb->slot_num = slot;
-       spin_unlock(&si->si_lock);
+       spin_unlock(&osb->osb_lock);
 
        mlog(0, "taking node slot %d\n", osb->slot_num);
 
-       status = ocfs2_update_disk_slots(osb, si);
+       status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
        if (status < 0)
                mlog_errno(status);
 
@@ -265,27 +517,27 @@ bail:
 
 void ocfs2_put_slot(struct ocfs2_super *osb)
 {
-       int status;
+       int status, slot_num;
        struct ocfs2_slot_info *si = osb->slot_info;
 
        if (!si)
                return;
 
+       spin_lock(&osb->osb_lock);
        ocfs2_update_slot_info(si);
 
-       spin_lock(&si->si_lock);
-       __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
+       slot_num = osb->slot_num;
+       ocfs2_invalidate_slot(si, osb->slot_num);
        osb->slot_num = OCFS2_INVALID_SLOT;
-       spin_unlock(&si->si_lock);
+       spin_unlock(&osb->osb_lock);
 
-       status = ocfs2_update_disk_slots(osb, si);
+       status = ocfs2_update_disk_slot(osb, si, slot_num);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
 
 bail:
-       osb->slot_info = NULL;
-       ocfs2_free_slot_info(si);
+       ocfs2_free_slot_info(osb);
 }
 
index 1025872aaade6cac6ede633418576dc96f0215e6..601c95fd700300e850644d54a47e8fa01188731b 100644 (file)
 #ifndef SLOTMAP_H
 #define SLOTMAP_H
 
-struct ocfs2_slot_info {
-       spinlock_t si_lock;
-
-               struct inode *si_inode;
-       struct buffer_head *si_bh;
-       unsigned int si_num_slots;
-       unsigned int si_size;
-       s16 si_global_node_nums[OCFS2_MAX_SLOTS];
-};
-
 int ocfs2_init_slot_info(struct ocfs2_super *osb);
-void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
+void ocfs2_free_slot_info(struct ocfs2_super *osb);
 
 int ocfs2_find_slot(struct ocfs2_super *osb);
 void ocfs2_put_slot(struct ocfs2_super *osb);
 
-void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
-int ocfs2_update_disk_slots(struct ocfs2_super *osb,
-                           struct ocfs2_slot_info *si);
-
-s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
-                          s16 global);
-void ocfs2_clear_slot(struct ocfs2_slot_info *si,
-                     s16 slot_num);
+int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
 
-static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
-                                     int slot_num)
-{
-       BUG_ON(slot_num == OCFS2_INVALID_SLOT);
-       assert_spin_locked(&si->si_lock);
+int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
+int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
+                                 unsigned int *node_num);
 
-       return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
-}
+int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
 
 #endif
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
new file mode 100644 (file)
index 0000000..ac1d74c
--- /dev/null
@@ -0,0 +1,420 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_o2cb.c
+ *
+ * Code which interfaces ocfs2 with the o2cb stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/crc32.h>
+#include <linux/module.h>
+
+/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
+#include <linux/fs.h>
+
+#include "cluster/masklog.h"
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+
+#include "stackglue.h"
+
+/*
+ * Private state the o2cb stack hangs off a cluster connection
+ * (conn->cc_private).  It holds the eviction callback that is registered
+ * with o2dlm at connect time and unregistered at disconnect time.
+ */
+struct o2dlm_private {
+       struct dlm_eviction_cb op_eviction_cb;
+};
+
+static struct ocfs2_stack_plugin o2cb_stack;
+
+/* These should be identical */
+#if (DLM_LOCK_IV != LKM_IVMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_NL != LKM_NLMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CR != LKM_CRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_CW != LKM_CWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PR != LKM_PRMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_PW != LKM_PWMODE)
+# error Lock modes do not match
+#endif
+#if (DLM_LOCK_EX != LKM_EXMODE)
+# error Lock modes do not match
+#endif
+/*
+ * Convert a generic DLM_LOCK_* mode to the o2dlm LKM_* mode.  The
+ * preprocessor checks above guarantee the two sets share values, so this
+ * only range-checks the mode and returns it unchanged.
+ */
+static inline int mode_to_o2dlm(int mode)
+{
+       BUG_ON(mode > LKM_MAXMODE);
+
+       return mode;
+}
+
+/*
+ * map_flag() moves one bit: if _generic is set in the local 'flags' it is
+ * cleared there and _o2dlm is set in the local 'o2dlm_flags'.  It relies
+ * on both locals being in scope and is undefined again right after
+ * flags_to_o2dlm().
+ */
+#define map_flag(_generic, _o2dlm)             \
+       if (flags & (_generic)) {               \
+               flags &= ~(_generic);           \
+               o2dlm_flags |= (_o2dlm);        \
+       }
+/*
+ * Translate generic DLM_LKF_* flags into o2dlm LKM_* flags.  Any bit left
+ * in 'flags' after all known flags have been mapped is a bug.
+ */
+static int flags_to_o2dlm(u32 flags)
+{
+       int o2dlm_flags = 0;
+
+       map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
+       map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
+       map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
+       map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
+       map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
+       map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
+       map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
+       map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
+       map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
+
+       /* map_flag() should have cleared every flag passed in */
+       BUG_ON(flags != 0);
+
+       return o2dlm_flags;
+}
+#undef map_flag
+
+/*
+ * Map an o2dlm status to standard errno values.
+ *
+ * o2dlm only uses a handful of these, and returns even fewer to the
+ * caller. Still, we try to assign sane values to each error.
+ *
+ * The following value pairs have special meanings to dlmglue, thus
+ * the right hand side needs to stay unique - never duplicate the
+ * mapping elsewhere in the table!
+ *
+ * DLM_NORMAL:         0
+ * DLM_NOTQUEUED:      -EAGAIN
+ * DLM_CANCELGRANT:    -EBUSY
+ * DLM_CANCEL:         -DLM_ECANCEL
+ */
+/*
+ * Keep in sync with dlmapi.h.
+ *
+ * The table is indexed by enum dlm_status (see dlm_status_to_errno()).
+ * Designated initializers leave any index not listed here as 0, so a
+ * status missing from this table would be reported as success.
+ */
+static int status_map[] = {
+       [DLM_NORMAL]                    = 0,            /* Success */
+       [DLM_GRANTED]                   = -EINVAL,
+       [DLM_DENIED]                    = -EACCES,
+       [DLM_DENIED_NOLOCKS]            = -EACCES,
+       [DLM_WORKING]                   = -EACCES,
+       [DLM_BLOCKED]                   = -EINVAL,
+       [DLM_BLOCKED_ORPHAN]            = -EINVAL,
+       [DLM_DENIED_GRACE_PERIOD]       = -EACCES,
+       [DLM_SYSERR]                    = -ENOMEM,      /* It is what it is */
+       [DLM_NOSUPPORT]                 = -EPROTO,
+       [DLM_CANCELGRANT]               = -EBUSY,       /* Cancel after grant */
+       [DLM_IVLOCKID]                  = -EINVAL,
+       [DLM_SYNC]                      = -EINVAL,
+       [DLM_BADTYPE]                   = -EINVAL,
+       [DLM_BADRESOURCE]               = -EINVAL,
+       [DLM_MAXHANDLES]                = -ENOMEM,
+       [DLM_NOCLINFO]                  = -EINVAL,
+       [DLM_NOLOCKMGR]                 = -EINVAL,
+       [DLM_NOPURGED]                  = -EINVAL,
+       [DLM_BADARGS]                   = -EINVAL,
+       [DLM_VOID]                      = -EINVAL,
+       [DLM_NOTQUEUED]                 = -EAGAIN,      /* Trylock failed */
+       [DLM_IVBUFLEN]                  = -EINVAL,
+       [DLM_CVTUNGRANT]                = -EPERM,
+       [DLM_BADPARAM]                  = -EINVAL,
+       [DLM_VALNOTVALID]               = -EINVAL,
+       [DLM_REJECTED]                  = -EPERM,
+       [DLM_ABORT]                     = -EINVAL,
+       [DLM_CANCEL]                    = -DLM_ECANCEL, /* Successful cancel */
+       [DLM_IVRESHANDLE]               = -EINVAL,
+       [DLM_DEADLOCK]                  = -EDEADLK,
+       [DLM_DENIED_NOASTS]             = -EINVAL,
+       [DLM_FORWARD]                   = -EINVAL,
+       [DLM_TIMEOUT]                   = -ETIMEDOUT,
+       [DLM_IVGROUPID]                 = -EINVAL,
+       [DLM_VERS_CONFLICT]             = -EOPNOTSUPP,
+       [DLM_BAD_DEVICE_PATH]           = -ENOENT,
+       [DLM_NO_DEVICE_PERMISSION]      = -EPERM,
+       [DLM_NO_CONTROL_DEVICE]         = -ENOENT,
+       [DLM_RECOVERING]                = -ENOTCONN,
+       [DLM_MIGRATING]                 = -ERESTART,
+       [DLM_MAXSTATS]                  = -EINVAL,
+};
+
+/*
+ * Translate an o2dlm status code into the errno value listed in
+ * status_map[].  A status that does not index into the table is a bug.
+ */
+static int dlm_status_to_errno(enum dlm_status status)
+{
+       /* status indexes status_map[], so it must be strictly below the
+        * element count; a value equal to the count is one past the end. */
+       BUG_ON(status >= (sizeof(status_map) / sizeof(status_map[0])));
+
+       return status_map[status];
+}
+
+/*
+ * Lock AST handed to dlmlock().  Forwards astarg to the lp_lock_ast()
+ * callback of the protocol attached to o2cb_stack; it is a bug for no
+ * protocol to be attached.
+ */
+static void o2dlm_lock_ast_wrapper(void *astarg)
+{
+       BUG_ON(o2cb_stack.sp_proto == NULL);
+
+       o2cb_stack.sp_proto->lp_lock_ast(astarg);
+}
+
+/*
+ * Blocking AST handed to dlmlock().  Forwards astarg and the lock level
+ * to the lp_blocking_ast() callback of the protocol attached to
+ * o2cb_stack; it is a bug for no protocol to be attached.
+ */
+static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
+{
+       BUG_ON(o2cb_stack.sp_proto == NULL);
+
+       o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
+}
+
+/*
+ * Unlock AST handed to dlmunlock().  Converts the o2dlm status to an
+ * errno and forwards it to lp_unlock_ast(), except for DLM_CANCELGRANT,
+ * which is dropped (see below).
+ */
+static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
+{
+       int error = dlm_status_to_errno(status);
+
+       BUG_ON(o2cb_stack.sp_proto == NULL);
+
+       /*
+        * In o2dlm, you can get both the lock_ast() for the lock being
+        * granted and the unlock_ast() for the CANCEL failing.  A
+        * successful cancel sends DLM_NORMAL here.  If the
+        * lock grant happened before the cancel arrived, you get
+        * DLM_CANCELGRANT.
+        *
+        * There's no need for the double-ast.  If we see DLM_CANCELGRANT,
+        * we just ignore it.  We expect the lock_ast() to handle the
+        * granted lock.
+        */
+       if (status == DLM_CANCELGRANT)
+               return;
+
+       o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
+}
+
+/*
+ * ->dlm_lock() for the o2cb stack.  Converts the generic mode and flags
+ * to their o2dlm forms, issues dlmlock() on the connection's lockspace
+ * with the wrapper ASTs, and returns the result as an errno.
+ */
+static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
+                        int mode,
+                        union ocfs2_dlm_lksb *lksb,
+                        u32 flags,
+                        void *name,
+                        unsigned int namelen,
+                        void *astarg)
+{
+       int o2dlm_mode = mode_to_o2dlm(mode);
+       int o2dlm_flags = flags_to_o2dlm(flags);
+       enum dlm_status dlm_rc;
+
+       dlm_rc = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
+                        o2dlm_flags, name, namelen,
+                        o2dlm_lock_ast_wrapper, astarg,
+                        o2dlm_blocking_ast_wrapper);
+
+       return dlm_status_to_errno(dlm_rc);
+}
+
+/*
+ * ->dlm_unlock() for the o2cb stack.  Converts the generic flags to
+ * their o2dlm form, issues dlmunlock() on the connection's lockspace
+ * with the unlock AST wrapper, and returns the result as an errno.
+ */
+static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
+                          union ocfs2_dlm_lksb *lksb,
+                          u32 flags,
+                          void *astarg)
+{
+       int o2dlm_flags = flags_to_o2dlm(flags);
+       enum dlm_status dlm_rc;
+
+       dlm_rc = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
+                          o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
+
+       return dlm_status_to_errno(dlm_rc);
+}
+
+/* ->lock_status(): return the o2dlm status stored in the lksb as an errno. */
+static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+       return dlm_status_to_errno(lksb->lksb_o2dlm.status);
+}
+
+/* ->lock_lvb(): return a pointer to the lvb member of the o2dlm lksb. */
+static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+       return (void *)(lksb->lksb_o2dlm.lvb);
+}
+
+/* ->dump_lksb(): hand the lksb's o2dlm lockid to dlm_print_one_lock(). */
+static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+       dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
+}
+
+/*
+ * Called from the dlm when it's about to evict a node. This is how the
+ * classic stack signals node death.
+ *
+ * node_num is the node being evicted.  data is the cluster connection
+ * that was passed to dlm_setup_eviction_cb() at connect time; the
+ * eviction is logged and then passed on to that connection's
+ * cc_recovery_handler.
+ */
+static void o2dlm_eviction_cb(int node_num, void *data)
+{
+       struct ocfs2_cluster_connection *conn = data;
+
+       mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
+            node_num, conn->cc_namelen, conn->cc_name);
+
+       conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
+}
+
+/*
+ * ->connect() for the o2cb stack.
+ *
+ * Checks that the local node is heartbeating, allocates the private
+ * eviction-callback state, registers the DLM domain named by the
+ * connection and finally registers the eviction callback.  On success
+ * conn->cc_private, conn->cc_lockspace and conn->cc_version are filled
+ * in.
+ *
+ * Returns 0 on success, -EINVAL if the local node is not heartbeating,
+ * -ENOMEM on allocation failure, or the error from
+ * dlm_register_domain().  On failure conn->cc_private is left NULL so
+ * no pointer to freed memory remains in the connection.
+ */
+static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+       int rc = 0;
+       u32 dlm_key;
+       struct dlm_ctxt *dlm;
+       struct o2dlm_private *priv;
+       struct dlm_protocol_version dlm_version;
+
+       BUG_ON(conn == NULL);
+       BUG_ON(o2cb_stack.sp_proto == NULL);
+
+       /* for now we only have one cluster/node, make sure we see it
+        * in the heartbeat universe */
+       if (!o2hb_check_local_node_heartbeating()) {
+               rc = -EINVAL;
+               goto out;
+       }
+
+       priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
+       if (!priv) {
+               /* Nothing of ours to free yet. */
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       /* This just fills the structure in.  It is safe to pass conn. */
+       dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
+                             conn);
+
+       conn->cc_private = priv;
+
+       /* used by the dlm code to make message headers unique, each
+        * node in this domain must agree on this. */
+       dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
+       dlm_version.pv_major = conn->cc_version.pv_major;
+       dlm_version.pv_minor = conn->cc_version.pv_minor;
+
+       dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
+       if (IS_ERR(dlm)) {
+               rc = PTR_ERR(dlm);
+               mlog_errno(rc);
+               goto out_free;
+       }
+
+       conn->cc_version.pv_major = dlm_version.pv_major;
+       conn->cc_version.pv_minor = dlm_version.pv_minor;
+       conn->cc_lockspace = dlm;
+
+       dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
+
+out_free:
+       if (rc) {
+               /* Drop the reference before freeing so the connection
+                * never holds a dangling pointer. */
+               conn->cc_private = NULL;
+               kfree(priv);
+       }
+
+out:
+       return rc;
+}
+
+/*
+ * ->disconnect() for the o2cb stack.  Undoes o2cb_cluster_connect():
+ * unregisters the eviction callback, frees the private data and
+ * unregisters the DLM domain.  hangup_pending is not used here.
+ * Always returns 0.
+ */
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+                                  int hangup_pending)
+{
+       struct dlm_ctxt *dlm = conn->cc_lockspace;
+       struct o2dlm_private *priv = conn->cc_private;
+
+       /* The callback structure lives inside priv, so it has to be
+        * unregistered before priv is freed. */
+       dlm_unregister_eviction_cb(&priv->op_eviction_cb);
+       conn->cc_private = NULL;
+       kfree(priv);
+
+       dlm_unregister_domain(dlm);
+       conn->cc_lockspace = NULL;
+
+       return 0;
+}
+
+/*
+ * Run the userspace helper whose path is returned by
+ * o2nm_get_hb_ctl_path() as "<helper> -K -u <group>", using a minimal
+ * environment.  A negative return from call_usermodehelper() is only
+ * logged; nothing is reported back to the caller.
+ */
+static void o2hb_stop(const char *group)
+{
+       int ret;
+       char *argv[5], *envp[3];
+
+       argv[0] = (char *)o2nm_get_hb_ctl_path();
+       argv[1] = "-K";
+       argv[2] = "-u";
+       argv[3] = (char *)group;
+       argv[4] = NULL;
+
+       mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
+
+       /* minimal command environment taken from cpu_run_sbin_hotplug */
+       envp[0] = "HOME=/";
+       envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+       envp[2] = NULL;
+
+       ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+       if (ret < 0)
+               mlog_errno(ret);
+}
+
+/*
+ * Hangup is a hack for tools compatibility.  Older ocfs2-tools software
+ * expects the filesystem to call "ocfs2_hb_ctl" during unmount.  This
+ * happens regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect().  We bring the o2hb_stop() function into
+ * the glue and provide a "hangup" API for super.c to call.
+ *
+ * Other stacks will eventually provide a NULL ->hangup() pointer.
+ *
+ * Only 'group' is used; 'grouplen' is ignored by this implementation.
+ */
+static void o2cb_cluster_hangup(const char *group, int grouplen)
+{
+       o2hb_stop(group);
+}
+
+/*
+ * ->this_node() for the o2cb stack.  Stores the local node number from
+ * o2nm_this_node() in *node and returns 0.  Returns -ENOENT if the node
+ * number is O2NM_INVALID_NODE_NUM and -EOVERFLOW if it is not below
+ * O2NM_MAX_NODES; *node is left untouched in both error cases.
+ */
+static int o2cb_cluster_this_node(unsigned int *node)
+{
+       int this_node = o2nm_this_node();
+       int rc = 0;
+
+       if (this_node == O2NM_INVALID_NODE_NUM)
+               rc = -ENOENT;
+       else if (this_node >= O2NM_MAX_NODES)
+               rc = -EOVERFLOW;
+       else
+               *node = this_node;
+
+       return rc;
+}
+
+/*
+ * Operations table for the o2cb stack; referenced by o2cb_stack.sp_ops.
+ * Every entry points at one of the static functions defined above.
+ */
+struct ocfs2_stack_operations o2cb_stack_ops = {
+       .connect        = o2cb_cluster_connect,
+       .disconnect     = o2cb_cluster_disconnect,
+       .hangup         = o2cb_cluster_hangup,
+       .this_node      = o2cb_cluster_this_node,
+       .dlm_lock       = o2cb_dlm_lock,
+       .dlm_unlock     = o2cb_dlm_unlock,
+       .lock_status    = o2cb_dlm_lock_status,
+       .lock_lvb       = o2cb_dlm_lvb,
+       .dump_lksb      = o2cb_dump_lksb,
+};
+
+/*
+ * Plugin descriptor for the "o2cb" stack.  It is passed to
+ * ocfs2_stack_glue_register() at module init and to
+ * ocfs2_stack_glue_unregister() at module exit.
+ */
+static struct ocfs2_stack_plugin o2cb_stack = {
+       .sp_name        = "o2cb",
+       .sp_ops         = &o2cb_stack_ops,
+       .sp_owner       = THIS_MODULE,
+};
+
+/* Module init: register the o2cb plugin and return the glue's result. */
+static int __init o2cb_stack_init(void)
+{
+       return ocfs2_stack_glue_register(&o2cb_stack);
+}
+
+/* Module exit: unregister the o2cb plugin from the stack glue. */
+static void __exit o2cb_stack_exit(void)
+{
+       ocfs2_stack_glue_unregister(&o2cb_stack);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
+MODULE_LICENSE("GPL");
+module_init(o2cb_stack_init);
+module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
new file mode 100644 (file)
index 0000000..7428663
--- /dev/null
@@ -0,0 +1,883 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stack_user.c
+ *
+ * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/reboot.h>
+#include <asm/uaccess.h>
+
+#include "ocfs2.h"  /* For struct ocfs2_lock_res */
+#include "stackglue.h"
+
+
+/*
+ * The control protocol starts with a handshake.  Until the handshake
+ * is complete, the control device will fail all write(2)s.
+ *
+ * The handshake is simple.  First, the client reads until EOF.  Each line
+ * of output is a supported protocol tag.  All protocol tags are a single
+ * character followed by a two hex digit version number.  Currently the
+ * only thing supported is T01, for "Text-based version 0x01".  Next, the
+ * client writes the version they would like to use, including the newline.
+ * Thus, the protocol tag is 'T01\n'.  If the version tag written is
+ * unknown, -EINVAL is returned.  Once the negotiation is complete, the
+ * client can start sending messages.
+ *
+ * The T01 protocol has three messages.  First is the "SETN" message.
+ * It has the following syntax:
+ *
+ *  SETN<space><8-char-hex-nodenum><newline>
+ *
+ * This is 14 characters.
+ *
+ * The "SETN" message must be the first message following the protocol.
+ * It tells ocfs2_control the local node number.
+ *
+ * Next comes the "SETV" message.  It has the following syntax:
+ *
+ *  SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
+ *
+ * This is 11 characters.
+ *
+ * The "SETV" message sets the filesystem locking protocol version as
+ * negotiated by the client.  The client negotiates based on the maximum
+ * version advertised in /sys/fs/ocfs2/max_locking_protocol.  The major
+ * number from the "SETV" message must match
+ * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
+ * must be less than or equal to ...->lp_max_version.pv_minor.
+ *
+ * Once this information has been set, mounts will be allowed.  From this
+ * point on, the "DOWN" message can be sent for node down notification.
+ * It has the following syntax:
+ *
+ *  DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
+ *
+ * eg:
+ *
+ *  DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
+ *
+ * This is 47 characters.
+ */
+
+/*
+ * Whether or not the client has done the handshake.
+ * For now, we have just one protocol version.
+ */
+#define OCFS2_CONTROL_PROTO                    "T01\n"
+#define OCFS2_CONTROL_PROTO_LEN                        4
+
+/* Handshake states */
+#define OCFS2_CONTROL_HANDSHAKE_INVALID                (0)
+#define OCFS2_CONTROL_HANDSHAKE_READ           (1)
+#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL       (2)
+#define OCFS2_CONTROL_HANDSHAKE_VALID          (3)
+
+/* Messages */
+#define OCFS2_CONTROL_MESSAGE_OP_LEN           4
+#define OCFS2_CONTROL_MESSAGE_SETNODE_OP       "SETN"
+#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN        14
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP    "SETV"
+#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN     11
+#define OCFS2_CONTROL_MESSAGE_DOWN_OP          "DOWN"
+#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN   47
+#define OCFS2_TEXT_UUID_LEN                    32
+#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN       2
+#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN      8
+
+/*
+ * ocfs2_live_connection is refcounted because the filesystem and
+ * miscdevice sides can detach in different order.  Let's just be safe.
+ *
+ * NOTE(review): the structure as defined here carries no reference
+ * count, only a list linkage and the connection pointer -- the
+ * "refcounted" statement above looks stale; confirm.
+ */
+struct ocfs2_live_connection {
+       struct list_head                oc_list;        /* entry on ocfs2_live_connection_list */
+       struct ocfs2_cluster_connection *oc_conn;       /* the cluster connection this entry tracks */
+};
+
+/*
+ * Per-open state of the control device, reached through
+ * file->private_data by the handshake-state helpers.
+ */
+struct ocfs2_control_private {
+       struct list_head op_list;       /* presumably links onto ocfs2_control_private_list -- confirm */
+       int op_state;                   /* handshake state; presumably an OCFS2_CONTROL_HANDSHAKE_* value */
+       int op_this_node;               /* presumably the node number supplied by "SETN" -- confirm */
+       struct ocfs2_protocol_version op_proto; /* presumably the version supplied by "SETV" -- confirm */
+};
+
+/* SETN<space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_setn {
+       char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+       char    space;
+       char    nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+       char    newline;
+};
+
+/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
+struct ocfs2_control_message_setv {
+       char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+       char    space1;
+       char    major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+       char    space2;
+       char    minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
+       char    newline;
+};
+
+/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
+struct ocfs2_control_message_down {
+       char    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+       char    space1;
+       char    uuid[OCFS2_TEXT_UUID_LEN];
+       char    space2;
+       char    nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
+       char    newline;
+};
+
+/*
+ * Overlay of every T01 message layout.  'tag' aliases the 4-character
+ * opcode that starts each message.  The size of this union is the upper
+ * bound ocfs2_control_cfu() places on a single write.
+ */
+union ocfs2_control_message {
+       char                                    tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
+       struct ocfs2_control_message_setn       u_setn;
+       struct ocfs2_control_message_setv       u_setv;
+       struct ocfs2_control_message_down       u_down;
+};
+
+static struct ocfs2_stack_plugin user_stack;
+
+static atomic_t ocfs2_control_opened;
+static int ocfs2_control_this_node = -1;
+static struct ocfs2_protocol_version running_proto;
+
+static LIST_HEAD(ocfs2_live_connection_list);
+static LIST_HEAD(ocfs2_control_private_list);
+static DEFINE_MUTEX(ocfs2_control_lock);
+
+/*
+ * Store 'state' as the handshake state in the ocfs2_control_private
+ * hanging off file->private_data.  No locking is done here.
+ */
+static inline void ocfs2_control_set_handshake_state(struct file *file,
+                                                    int state)
+{
+       struct ocfs2_control_private *p = file->private_data;
+       p->op_state = state;
+}
+
+/*
+ * Return the handshake state stored in the ocfs2_control_private
+ * hanging off file->private_data.  No locking is done here.
+ */
+static inline int ocfs2_control_get_handshake_state(struct file *file)
+{
+       struct ocfs2_control_private *p = file->private_data;
+       return p->op_state;
+}
+
+/*
+ * Look up the live connection whose cluster connection name is exactly
+ * 'name'.  The caller must hold ocfs2_control_lock.
+ *
+ * Returns the matching entry, or NULL if no live connection has that
+ * name.
+ */
+static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
+{
+       size_t len = strlen(name);
+       struct ocfs2_live_connection *c;
+
+       BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
+
+       list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
+               if ((c->oc_conn->cc_namelen == len) &&
+                   !strncmp(c->oc_conn->cc_name, name, len))
+                       return c;
+       }
+
+       /* When the loop runs to completion the cursor no longer refers
+        * to a real entry, so it must not be handed back. */
+       return NULL;
+}
+
+/*
+ * Allocate a live connection for @conn and, if a control daemon has the
+ * control device open, link it onto ocfs2_live_connection_list.
+ *
+ * These structures are created underneath the ocfs2 mount path.  The VFS
+ * prevents multiple calls to fill_super(), so duplicates cannot occur.
+ *
+ * Returns 0 and stores the new object in *c_ret, -ENOMEM if allocation
+ * fails, or -ESRCH when no control daemon is present.
+ */
+static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
+                                    struct ocfs2_live_connection **c_ret)
+{
+       struct ocfs2_live_connection *live;
+       int status = -ESRCH;
+
+       live = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+       if (!live)
+               return -ENOMEM;
+
+       mutex_lock(&ocfs2_control_lock);
+       live->oc_conn = conn;
+
+       if (atomic_read(&ocfs2_control_opened)) {
+               list_add(&live->oc_list, &ocfs2_live_connection_list);
+               status = 0;
+       } else {
+               printk(KERN_ERR
+                      "ocfs2: Userspace control daemon is not present\n");
+       }
+
+       mutex_unlock(&ocfs2_control_lock);
+
+       if (status) {
+               kfree(live);
+               return status;
+       }
+
+       *c_ret = live;
+       return 0;
+}
+
+/*
+ * This function disconnects the cluster connection from ocfs2_control.
+ * Afterwards, userspace can't affect the cluster connection.
+ *
+ * The entry is unlinked and its connection pointer cleared under
+ * ocfs2_control_lock; the structure itself is freed after the lock is
+ * released.
+ */
+static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
+{
+       mutex_lock(&ocfs2_control_lock);
+       list_del_init(&c->oc_list);
+       c->oc_conn = NULL;
+       mutex_unlock(&ocfs2_control_lock);
+
+       kfree(c);
+}
+
+/*
+ * Copy exactly @target_len bytes of a userspace write into @target.
+ *
+ * Returns 0 on success, -EINVAL when @count is not @target_len or exceeds
+ * the size of the largest control message, and -EFAULT when the copy from
+ * userspace fails.
+ */
+static int ocfs2_control_cfu(void *target, size_t target_len,
+                            const char __user *buf, size_t count)
+{
+       /* The T01 expects write(2) calls to have exactly one command */
+       if (count != target_len)
+               return -EINVAL;
+       if (count > sizeof(union ocfs2_control_message))
+               return -EINVAL;
+
+       return copy_from_user(target, buf, target_len) ? -EFAULT : 0;
+}
+
+/*
+ * Handle the write in which userspace echoes back the protocol string.
+ *
+ * On an exact match with OCFS2_CONTROL_PROTO the file moves to the
+ * PROTOCOL handshake state and @count is returned; otherwise a negative
+ * errno is returned and the state is left alone.
+ */
+static ssize_t ocfs2_control_validate_protocol(struct file *file,
+                                              const char __user *buf,
+                                              size_t count)
+{
+       char proto[OCFS2_CONTROL_PROTO_LEN];
+       ssize_t rc;
+
+       rc = ocfs2_control_cfu(proto, OCFS2_CONTROL_PROTO_LEN, buf, count);
+       if (rc)
+               return rc;
+
+       if (strncmp(proto, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN) != 0)
+               return -EINVAL;
+
+       ocfs2_control_set_handshake_state(file,
+                                         OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+       return count;
+}
+
+/*
+ * Deliver a node-down event for @nodenum to the live connection named
+ * @uuid, if there is one, by invoking that connection's recovery handler.
+ * The lookup and the callback both run under ocfs2_control_lock.
+ */
+static void ocfs2_control_send_down(const char *uuid,
+                                   int nodenum)
+{
+       struct ocfs2_live_connection *c;
+
+       mutex_lock(&ocfs2_control_lock);
+
+       c = ocfs2_connection_find(uuid);
+       if (c) {
+               /* Entries on the live list always carry a connection */
+               BUG_ON(c->oc_conn == NULL);
+               c->oc_conn->cc_recovery_handler(nodenum,
+                                               c->oc_conn->cc_recovery_data);
+       }
+
+       mutex_unlock(&ocfs2_control_lock);
+}
+
+/*
+ * Called whenever configuration elements are sent to /dev/ocfs2_control.
+ * If all configuration elements are present, try to set the global
+ * values.  If there is a problem, return an error.  Skip any missing
+ * elements, and only bump ocfs2_control_opened when we have all elements
+ * and are successful.
+ *
+ * Returns 0 (whether or not the globals were installed) or -EINVAL when
+ * the values on this file conflict with those already in effect.
+ */
+static int ocfs2_control_install_private(struct file *file)
+{
+       int rc = 0;
+       /* Stays 1 only if both the node number and the version are set */
+       int set_p = 1;
+       struct ocfs2_control_private *p = file->private_data;
+
+       BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
+
+       mutex_lock(&ocfs2_control_lock);
+
+       if (p->op_this_node < 0) {
+               /* Node number not supplied yet */
+               set_p = 0;
+       } else if ((ocfs2_control_this_node >= 0) &&
+                  (ocfs2_control_this_node != p->op_this_node)) {
+               /* A different node number is already installed */
+               rc = -EINVAL;
+               goto out_unlock;
+       }
+
+       if (!p->op_proto.pv_major) {
+               /* Protocol version not supplied yet */
+               set_p = 0;
+       } else if (!list_empty(&ocfs2_live_connection_list) &&
+                  ((running_proto.pv_major != p->op_proto.pv_major) ||
+                   (running_proto.pv_minor != p->op_proto.pv_minor))) {
+               /* Live connections exist under a different version */
+               rc = -EINVAL;
+               goto out_unlock;
+       }
+
+       if (set_p) {
+               ocfs2_control_this_node = p->op_this_node;
+               running_proto.pv_major = p->op_proto.pv_major;
+               running_proto.pv_minor = p->op_proto.pv_minor;
+       }
+
+out_unlock:
+       mutex_unlock(&ocfs2_control_lock);
+
+       if (!rc && set_p) {
+               /* We set the global values successfully */
+               atomic_inc(&ocfs2_control_opened);
+               ocfs2_control_set_handshake_state(file,
+                                       OCFS2_CONTROL_HANDSHAKE_VALID);
+       }
+
+       return rc;
+}
+
+/*
+ * Read the installed node number under ocfs2_control_lock.
+ * Returns the node number, or -EINVAL while none has been installed.
+ */
+static int ocfs2_control_get_this_node(void)
+{
+       int node;
+
+       mutex_lock(&ocfs2_control_lock);
+       node = ocfs2_control_this_node;
+       mutex_unlock(&ocfs2_control_lock);
+
+       return (node < 0) ? -EINVAL : node;
+}
+
+/*
+ * Parse a set-node message and record the node number on this file.
+ *
+ * Only accepted in the PROTOCOL handshake state.  Returns -EINVAL for a
+ * malformed message, -ERANGE for a node number outside 0..INT_MAX, and
+ * otherwise the result of ocfs2_control_install_private().
+ */
+static int ocfs2_control_do_setnode_msg(struct file *file,
+                                       struct ocfs2_control_message_setn *msg)
+{
+       struct ocfs2_control_private *priv = file->private_data;
+       char *end = NULL;
+       long node;
+
+       if (ocfs2_control_get_handshake_state(file) !=
+           OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+               return -EINVAL;
+
+       if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+                   OCFS2_CONTROL_MESSAGE_OP_LEN) != 0)
+               return -EINVAL;
+
+       if (msg->space != ' ')
+               return -EINVAL;
+       if (msg->newline != '\n')
+               return -EINVAL;
+
+       /* Overwrite the separators with NULs before parsing the number */
+       msg->newline = '\0';
+       msg->space = '\0';
+
+       node = simple_strtol(msg->nodestr, &end, 16);
+       if (!end || *end)
+               return -EINVAL;
+
+       if ((node == LONG_MIN) || (node == LONG_MAX) ||
+           (node > INT_MAX) || (node < 0))
+               return -ERANGE;
+
+       priv->op_this_node = node;
+       return ocfs2_control_install_private(file);
+}
+
+/*
+ * Parse a set-version message and record the locking protocol version on
+ * this file.
+ *
+ * Only accepted in the PROTOCOL handshake state.  Returns -EINVAL for a
+ * malformed message or a version the filesystem cannot run, -ERANGE for
+ * numbers outside the allowed range, and otherwise the result of
+ * ocfs2_control_install_private().
+ */
+static int ocfs2_control_do_setversion_msg(struct file *file,
+                                          struct ocfs2_control_message_setv *msg)
+{
+       struct ocfs2_control_private *priv = file->private_data;
+       struct ocfs2_protocol_version *limit =
+               &user_stack.sp_proto->lp_max_version;
+       char *end = NULL;
+       long major, minor;
+
+       if (ocfs2_control_get_handshake_state(file) !=
+           OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
+               return -EINVAL;
+
+       if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+                   OCFS2_CONTROL_MESSAGE_OP_LEN) != 0)
+               return -EINVAL;
+
+       if (msg->space1 != ' ')
+               return -EINVAL;
+       if (msg->space2 != ' ')
+               return -EINVAL;
+       if (msg->newline != '\n')
+               return -EINVAL;
+
+       /* Overwrite the separators with NULs so each number ends cleanly */
+       msg->newline = '\0';
+       msg->space2 = '\0';
+       msg->space1 = '\0';
+
+       major = simple_strtol(msg->major, &end, 16);
+       if (!end || *end)
+               return -EINVAL;
+
+       minor = simple_strtol(msg->minor, &end, 16);
+       if (!end || *end)
+               return -EINVAL;
+
+       /*
+        * Major is limited to 1..255 and minor to 0..255.  Beyond that the
+        * major must equal the filesystem's maximum major and the minor
+        * must not exceed the filesystem's maximum minor.
+        */
+       if ((major == LONG_MIN) || (major == LONG_MAX) ||
+           (major > (u8)-1) || (major < 1))
+               return -ERANGE;
+       if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
+           (minor > (u8)-1) || (minor < 0))
+               return -ERANGE;
+       if ((major != limit->pv_major) ||
+           (minor > limit->pv_minor))
+               return -EINVAL;
+
+       priv->op_proto.pv_major = major;
+       priv->op_proto.pv_minor = minor;
+
+       return ocfs2_control_install_private(file);
+}
+
+/*
+ * Parse a node-down message and forward it to the matching connection.
+ *
+ * Only accepted in the VALID handshake state.  Returns 0 once the event
+ * has been passed to ocfs2_control_send_down(), -EINVAL for a malformed
+ * message, or -ERANGE for a node number outside 0..INT_MAX.
+ */
+static int ocfs2_control_do_down_msg(struct file *file,
+                                    struct ocfs2_control_message_down *msg)
+{
+       long nodenum;
+       char *p = NULL;
+
+       if (ocfs2_control_get_handshake_state(file) !=
+           OCFS2_CONTROL_HANDSHAKE_VALID)
+               return -EINVAL;
+
+       if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+                   OCFS2_CONTROL_MESSAGE_OP_LEN))
+               return -EINVAL;
+
+       if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
+           (msg->newline != '\n'))
+               return -EINVAL;
+       /* NUL out the separators so uuid and nodestr become C strings */
+       msg->space1 = msg->space2 = msg->newline = '\0';
+
+       nodenum = simple_strtol(msg->nodestr, &p, 16);
+       if (!p || *p)
+               return -EINVAL;
+
+       if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
+           (nodenum > INT_MAX) || (nodenum < 0))
+               return -ERANGE;
+
+       ocfs2_control_send_down(msg->uuid, nodenum);
+
+       return 0;
+}
+
+/*
+ * Handle one command written to the control device after the protocol
+ * string has been accepted.
+ *
+ * The write is copied into a zeroed message buffer and dispatched, by its
+ * exact length and leading tag, to the set-node, set-version or node-down
+ * handler.  Returns @count on success or a negative errno.
+ */
+static ssize_t ocfs2_control_message(struct file *file,
+                                    const char __user *buf,
+                                    size_t count)
+{
+       ssize_t ret;
+       union ocfs2_control_message msg;
+
+       /* Try to catch padding issues */
+       WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
+               (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
+
+       memset(&msg, 0, sizeof(union ocfs2_control_message));
+       /* The helper rejects any count larger than the message buffer */
+       ret = ocfs2_control_cfu(&msg, count, buf, count);
+       if (ret)
+               goto out;
+
+       if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
+           !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
+                    OCFS2_CONTROL_MESSAGE_OP_LEN))
+               ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
+       else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
+                !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
+                         OCFS2_CONTROL_MESSAGE_OP_LEN))
+               ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
+       else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
+                !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
+                         OCFS2_CONTROL_MESSAGE_OP_LEN))
+               ret = ocfs2_control_do_down_msg(file, &msg.u_down);
+       else
+               ret = -EINVAL;
+
+out:
+       /* Handlers return 0 on success; report the whole write as consumed */
+       return ret ? ret : count;
+}
+
+/*
+ * write() handler for the control device.  What a write means depends on
+ * how far the handshake on this file has progressed:
+ *   INVALID          - writes are rejected with -EINVAL
+ *   READ             - the write must be the protocol string
+ *   PROTOCOL / VALID - the write is a control message
+ */
+static ssize_t ocfs2_control_write(struct file *file,
+                                  const char __user *buf,
+                                  size_t count,
+                                  loff_t *ppos)
+{
+       switch (ocfs2_control_get_handshake_state(file)) {
+       case OCFS2_CONTROL_HANDSHAKE_INVALID:
+               return -EINVAL;
+
+       case OCFS2_CONTROL_HANDSHAKE_READ:
+               return ocfs2_control_validate_protocol(file, buf, count);
+
+       case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
+       case OCFS2_CONTROL_HANDSHAKE_VALID:
+               return ocfs2_control_message(file, buf, count);
+
+       default:
+               BUG();
+               return -EIO;
+       }
+}
+
+/*
+ * This is a naive version.  If we ever have a new protocol, we'll expand
+ * it.  Probably using seq_file.
+ *
+ * read() handler for the control device: hands out OCFS2_CONTROL_PROTO
+ * starting at *ppos.  Once the whole string has been read the file moves
+ * to the READ handshake state.  Returns the number of bytes copied, 0 at
+ * end of string, or -EFAULT if the copy to userspace fails.
+ */
+static ssize_t ocfs2_control_read(struct file *file,
+                                 char __user *buf,
+                                 size_t count,
+                                 loff_t *ppos)
+{
+       char *proto_string = OCFS2_CONTROL_PROTO;
+       size_t to_write = 0;
+
+       if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+               return 0;
+
+       /* Copy whatever is left of the string, capped at the buffer size */
+       to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
+       if (to_write > count)
+               to_write = count;
+       if (copy_to_user(buf, proto_string + *ppos, to_write))
+               return -EFAULT;
+
+       *ppos += to_write;
+
+       /* Have we read the whole protocol list? */
+       if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+               ocfs2_control_set_handshake_state(file,
+                                                 OCFS2_CONTROL_HANDSHAKE_READ);
+
+       return to_write;
+}
+
+/*
+ * release() handler for the control device.
+ *
+ * A file that never reached the VALID handshake state is simply unlinked
+ * and freed.  Closing a VALID file drops ocfs2_control_opened; when that
+ * was the last one, the installed node number and locking protocol
+ * version are cleared.  If live connections still exist at that point an
+ * emergency restart is triggered.  Always returns 0.
+ */
+static int ocfs2_control_release(struct inode *inode, struct file *file)
+{
+       struct ocfs2_control_private *p = file->private_data;
+
+       mutex_lock(&ocfs2_control_lock);
+
+       if (ocfs2_control_get_handshake_state(file) !=
+           OCFS2_CONTROL_HANDSHAKE_VALID)
+               goto out;
+
+       if (atomic_dec_and_test(&ocfs2_control_opened)) {
+               if (!list_empty(&ocfs2_live_connection_list)) {
+                       /* XXX: Do bad things! */
+                       printk(KERN_ERR
+                              "ocfs2: Unexpected release of ocfs2_control!\n"
+                              "       Loss of cluster connection requires "
+                              "an emergency restart!\n");
+                       emergency_restart();
+               }
+               /*
+                * Last valid close clears the node number and resets
+                * the locking protocol version.  Both halves of the
+                * version must be cleared so a stale minor cannot be
+                * left behind for the next daemon.
+                */
+               ocfs2_control_this_node = -1;
+               running_proto.pv_major = 0;
+               running_proto.pv_minor = 0;
+       }
+
+out:
+       list_del_init(&p->op_list);
+       file->private_data = NULL;
+
+       mutex_unlock(&ocfs2_control_lock);
+
+       kfree(p);
+
+       return 0;
+}
+
+/*
+ * open() handler for the control device: allocates the zeroed per-open
+ * state with its node number marked unset, attaches it to @file and
+ * links it onto ocfs2_control_private_list.  Returns 0 or -ENOMEM.
+ */
+static int ocfs2_control_open(struct inode *inode, struct file *file)
+{
+       struct ocfs2_control_private *p;
+
+       p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+       /* Negative means "no node number supplied yet" */
+       p->op_this_node = -1;
+
+       mutex_lock(&ocfs2_control_lock);
+       file->private_data = p;
+       list_add(&p->op_list, &ocfs2_control_private_list);
+       mutex_unlock(&ocfs2_control_lock);
+
+       return 0;
+}
+
+/* File operations backing the ocfs2_control misc device */
+static const struct file_operations ocfs2_control_fops = {
+       .open    = ocfs2_control_open,
+       .release = ocfs2_control_release,
+       .read    = ocfs2_control_read,
+       .write   = ocfs2_control_write,
+       .owner   = THIS_MODULE,
+};
+
+/* Misc device named "ocfs2_control", registered with a dynamic minor */
+struct miscdevice ocfs2_control_device = {
+       .minor          = MISC_DYNAMIC_MINOR,
+       .name           = "ocfs2_control",
+       .fops           = &ocfs2_control_fops,
+};
+
+/*
+ * Reset the open count and register the control misc device.
+ * Returns 0 or the error from misc_register(), which is also logged.
+ */
+static int ocfs2_control_init(void)
+{
+       int err;
+
+       atomic_set(&ocfs2_control_opened, 0);
+
+       err = misc_register(&ocfs2_control_device);
+       if (!err)
+               return 0;
+
+       printk(KERN_ERR
+              "ocfs2: Unable to register ocfs2_control device "
+              "(errno %d)\n",
+              -err);
+       return err;
+}
+
+/* Deregister the control misc device, logging any failure. */
+static void ocfs2_control_exit(void)
+{
+       int err = misc_deregister(&ocfs2_control_device);
+
+       if (!err)
+               return;
+
+       printk(KERN_ERR
+              "ocfs2: Unable to deregister ocfs2_control device "
+              "(errno %d)\n",
+              -err);
+}
+
+/*
+ * Treat @astarg as a struct ocfs2_lock_res and return the fs/dlm lock
+ * status block embedded in it.
+ */
+static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
+{
+       struct ocfs2_lock_res *res = astarg;
+       return &res->l_lksb.lksb_fsdlm;
+}
+
+/*
+ * Completion callback passed to dlm_lock().  It inspects the status in the
+ * lksb and routes the event to either the unlock or the lock completion
+ * routine of the registered locking protocol.
+ */
+static void fsdlm_lock_ast_wrapper(void *astarg)
+{
+       struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
+       int status = lksb->sb_status;
+       int unlocked;
+
+       BUG_ON(user_stack.sp_proto == NULL);
+
+       /*
+        * Other non-standard errors, where it is unclear whether the
+        * unlock_ast or the lock_ast should run, are deliberately not
+        * handled for now.  The main candidate is EINVAL, which signals
+        * invalid arguments and shouldn't be possible since the caller is
+        * under our control.  Anything else probably falls into the same
+        * category or is fatal, in which case we can't carry on anyway.
+        */
+       unlocked = (status == -DLM_EUNLOCK) || (status == -DLM_ECANCEL);
+       if (!unlocked) {
+               user_stack.sp_proto->lp_lock_ast(astarg);
+               return;
+       }
+
+       user_stack.sp_proto->lp_unlock_ast(astarg, 0);
+}
+
+/*
+ * Blocking callback passed to dlm_lock(); forwards @astarg and @level to
+ * the blocking routine of the registered locking protocol.
+ */
+static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
+{
+       BUG_ON(user_stack.sp_proto == NULL);
+
+       user_stack.sp_proto->lp_blocking_ast(astarg, level);
+}
+
+/*
+ * Stack operation: request a lock from fs/dlm in the connection's
+ * lockspace.  DLM_LKF_NODLCKWT is always added to @flags.  Returns the
+ * value of dlm_lock().
+ */
+static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
+                        int mode,
+                        union ocfs2_dlm_lksb *lksb,
+                        u32 flags,
+                        void *name,
+                        unsigned int namelen,
+                        void *astarg)
+{
+       struct dlm_lksb *fsdlm_lksb = &lksb->lksb_fsdlm;
+
+       /* First use: aim the LVB pointer just past the dlm_lksb */
+       if (fsdlm_lksb->sb_lvbptr == NULL)
+               fsdlm_lksb->sb_lvbptr = (char *)lksb +
+                                       sizeof(struct dlm_lksb);
+
+       return dlm_lock(conn->cc_lockspace, mode, fsdlm_lksb,
+                       flags | DLM_LKF_NODLCKWT, name, namelen, 0,
+                       fsdlm_lock_ast_wrapper, astarg,
+                       fsdlm_blocking_ast_wrapper);
+}
+
+/*
+ * Stack operation: release the lock identified by the lksb's lock id in
+ * the connection's lockspace.  Returns the value of dlm_unlock().
+ */
+static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
+                          union ocfs2_dlm_lksb *lksb,
+                          u32 flags,
+                          void *astarg)
+{
+       struct dlm_lksb *fsdlm_lksb = &lksb->lksb_fsdlm;
+
+       return dlm_unlock(conn->cc_lockspace, fsdlm_lksb->sb_lkid,
+                         flags, fsdlm_lksb, astarg);
+}
+
+/* Stack operation: return the status field of the fs/dlm lksb. */
+static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+       return lksb->lksb_fsdlm.sb_status;
+}
+
+/* Stack operation: return the LVB pointer stored in the fs/dlm lksb. */
+static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+       return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
+}
+
+/* Stack operation: lksb dumping is intentionally a no-op for this stack. */
+static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+}
+
+/*
+ * Check a requested locking protocol version against the one in effect.
+ *
+ * The versions are incompatible (return 1) when the majors differ or when
+ * the existing minor is newer than the requested one.  Otherwise they are
+ * compatible (return 0) and the request is lowered, if necessary, to the
+ * existing minor so the requester runs at the current version.
+ */
+static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
+                              struct ocfs2_protocol_version *request)
+{
+       if ((existing->pv_major != request->pv_major) ||
+           (existing->pv_minor > request->pv_minor))
+               return 1;
+
+       /* Compatible: clamp the request down to the running minor */
+       if (request->pv_minor > existing->pv_minor)
+               request->pv_minor = existing->pv_minor;
+
+       return 0;
+}
+
+/*
+ * Stack operation: attach @conn to the userspace-controlled cluster.
+ *
+ * Creates the live connection, checks the connection's locking protocol
+ * version against the one installed by the control daemon, and creates
+ * the fs/dlm lockspace.  On success the live connection and lockspace are
+ * stored on @conn and 0 is returned; on failure the live connection is
+ * dropped again and a negative errno is returned.
+ */
+static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
+{
+       struct ocfs2_live_connection *live;
+       dlm_lockspace_t *lockspace;
+       int status;
+
+       BUG_ON(conn == NULL);
+
+       status = ocfs2_live_connection_new(conn, &live);
+       if (status)
+               return status;
+
+       /*
+        * running_proto must have been set before we allowed any mounts
+        * to proceed.
+        */
+       if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
+               printk(KERN_ERR
+                      "Unable to mount with fs locking protocol version "
+                      "%u.%u because the userspace control daemon has "
+                      "negotiated %u.%u\n",
+                      conn->cc_version.pv_major, conn->cc_version.pv_minor,
+                      running_proto.pv_major, running_proto.pv_minor);
+               status = -EPROTO;
+               goto drop_live;
+       }
+
+       status = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
+                                  &lockspace, DLM_LSFL_FS, DLM_LVB_LEN);
+       if (status)
+               goto drop_live;
+
+       conn->cc_private = live;
+       conn->cc_lockspace = lockspace;
+       return 0;
+
+drop_live:
+       ocfs2_live_connection_drop(live);
+       return status;
+}
+
+/*
+ * Stack operation: release the connection's lockspace and drop its live
+ * connection, clearing both pointers on @conn.  @hangup_pending is not
+ * used by this stack.  Always returns 0.
+ */
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+                                  int hangup_pending)
+{
+       /* NOTE(review): meaning of the literal 2 is defined by fs/dlm */
+       dlm_release_lockspace(conn->cc_lockspace, 2);
+       conn->cc_lockspace = NULL;
+       ocfs2_live_connection_drop(conn->cc_private);
+       conn->cc_private = NULL;
+       return 0;
+}
+
+/*
+ * Stack operation: store the installed node number in *this_node.
+ * Returns 0 on success or the negative errno from
+ * ocfs2_control_get_this_node(), leaving *this_node untouched.
+ */
+static int user_cluster_this_node(unsigned int *this_node)
+{
+       int node = ocfs2_control_get_this_node();
+
+       if (node >= 0) {
+               *this_node = node;
+               return 0;
+       }
+
+       return node;
+}
+
+/* Operations table for the "user" stack; no .hangup operation is set */
+static struct ocfs2_stack_operations user_stack_ops = {
+       .connect        = user_cluster_connect,
+       .disconnect     = user_cluster_disconnect,
+       .this_node      = user_cluster_this_node,
+       .dlm_lock       = user_dlm_lock,
+       .dlm_unlock     = user_dlm_unlock,
+       .lock_status    = user_dlm_lock_status,
+       .lock_lvb       = user_dlm_lvb,
+       .dump_lksb      = user_dlm_dump_lksb,
+};
+
+/* Plugin descriptor handed to the stack glue under the name "user" */
+static struct ocfs2_stack_plugin user_stack = {
+       .sp_name        = "user",
+       .sp_ops         = &user_stack_ops,
+       .sp_owner       = THIS_MODULE,
+};
+
+
+/*
+ * Module init: bring up the control device, then register the plugin
+ * with the stack glue.  If registration fails the control device is torn
+ * down again.  Returns 0 or the first error encountered.
+ */
+static int __init user_stack_init(void)
+{
+       int err = ocfs2_control_init();
+
+       if (err)
+               return err;
+
+       err = ocfs2_stack_glue_register(&user_stack);
+       if (err)
+               ocfs2_control_exit();
+
+       return err;
+}
+
+/* Module exit: undo user_stack_init() in reverse order. */
+static void __exit user_stack_exit(void)
+{
+       ocfs2_stack_glue_unregister(&user_stack);
+       ocfs2_control_exit();
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
+MODULE_LICENSE("GPL");
+module_init(user_stack_init);
+module_exit(user_stack_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
new file mode 100644 (file)
index 0000000..119f60c
--- /dev/null
@@ -0,0 +1,568 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.c
+ *
+ * Code which implements an OCFS2 specific interface to underlying
+ * cluster stacks.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+
+#include "ocfs2_fs.h"
+
+#include "stackglue.h"
+
+#define OCFS2_STACK_PLUGIN_O2CB                "o2cb"
+#define OCFS2_STACK_PLUGIN_USER                "user"
+
+/* Locking protocol handed to every plugin on the list */
+static struct ocfs2_locking_protocol *lproto;
+/* Taken around accesses to the plugin list and active_stack */
+static DEFINE_SPINLOCK(ocfs2_stack_lock);
+/* All registered stack plugins */
+static LIST_HEAD(ocfs2_stack_list);
+/* Currently selected stack label; requests for any other label fail */
+static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
+
+/*
+ * The stack currently in use.  If not null, active_stack->sp_count > 0,
+ * the module is pinned, and the locking protocol cannot be changed.
+ */
+static struct ocfs2_stack_plugin *active_stack;
+
+/*
+ * Find the registered plugin called @name.  The caller must hold
+ * ocfs2_stack_lock.  Returns the plugin, or NULL if none matches.
+ */
+static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
+{
+       struct ocfs2_stack_plugin *cur;
+       struct ocfs2_stack_plugin *found = NULL;
+
+       assert_spin_locked(&ocfs2_stack_lock);
+
+       list_for_each_entry(cur, &ocfs2_stack_list, sp_list) {
+               if (strcmp(cur->sp_name, name) == 0) {
+                       found = cur;
+                       break;
+               }
+       }
+
+       return found;
+}
+
+/*
+ * Take a reference on the stack plugin @plugin_name on behalf of the
+ * cluster stack labelled @stack_name.
+ *
+ * Returns 0 with active_stack set and its sp_count raised by one,
+ * -EBUSY if @stack_name is not the selected stack or a different plugin
+ * is already active, and -ENOENT if the plugin is not registered or its
+ * module cannot be pinned.
+ */
+static int ocfs2_stack_driver_request(const char *stack_name,
+                                     const char *plugin_name)
+{
+       int rc;
+       struct ocfs2_stack_plugin *p;
+
+       spin_lock(&ocfs2_stack_lock);
+
+       /*
+        * If the stack passed by the filesystem isn't the selected one,
+        * we can't continue.
+        */
+       if (strcmp(stack_name, cluster_stack_name)) {
+               rc = -EBUSY;
+               goto out;
+       }
+
+       if (active_stack) {
+               /*
+                * If the active stack isn't the one we want, it cannot
+                * be selected right now.
+                */
+               if (!strcmp(active_stack->sp_name, plugin_name))
+                       rc = 0;
+               else
+                       rc = -EBUSY;
+               goto out;
+       }
+
+       p = ocfs2_stack_lookup(plugin_name);
+       if (!p || !try_module_get(p->sp_owner)) {
+               rc = -ENOENT;
+               goto out;
+       }
+
+       /* First user: the module is now pinned */
+       active_stack = p;
+       rc = 0;
+
+out:
+       /*
+        * Every successful request takes a reference, including one that
+        * found the stack already active.  Each of them is balanced by a
+        * later ocfs2_stack_driver_put(), so counting only the first
+        * request would unpin the stack while it is still in use.
+        */
+       if (!rc)
+               active_stack->sp_count++;
+
+       spin_unlock(&ocfs2_stack_lock);
+       return rc;
+}
+
+/*
+ * This function looks up the appropriate stack and makes it active.  If
+ * there is no stack, it tries to load it.  It will fail if the stack still
+ * cannot be found.  It will also fail if a different stack is in use.
+ *
+ * Returns 0 on success, -EINVAL for a label of the wrong length, or the
+ * -ENOENT / -EBUSY result of ocfs2_stack_driver_request().
+ */
+static int ocfs2_stack_driver_get(const char *stack_name)
+{
+       int rc;
+       char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
+
+       /*
+        * Classic stack does not pass in a stack name.  This is
+        * compatible with older tools as well.
+        */
+       if (!stack_name || !*stack_name)
+               stack_name = OCFS2_STACK_PLUGIN_O2CB;
+
+       if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
+               printk(KERN_ERR
+                      "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
+                      stack_name);
+               return -EINVAL;
+       }
+
+       /* Anything that isn't the classic stack is a user stack */
+       if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
+               plugin_name = OCFS2_STACK_PLUGIN_USER;
+
+       rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+       if (rc == -ENOENT) {
+               /* Not registered yet: try loading the module, then retry */
+               request_module("ocfs2_stack_%s", plugin_name);
+               rc = ocfs2_stack_driver_request(stack_name, plugin_name);
+       }
+
+       if (rc == -ENOENT) {
+               printk(KERN_ERR
+                      "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
+                      plugin_name);
+       } else if (rc == -EBUSY) {
+               printk(KERN_ERR
+                      "ocfs2: A different cluster stack is in use\n");
+       }
+
+       return rc;
+}
+
+/*
+ * Drop one reference on the active stack.  When the count reaches zero
+ * the plugin's module reference is released and active_stack is cleared.
+ * Calling this with no active stack or a zero count is a bug.
+ */
+static void ocfs2_stack_driver_put(void)
+{
+       spin_lock(&ocfs2_stack_lock);
+       BUG_ON(active_stack == NULL);
+       BUG_ON(active_stack->sp_count == 0);
+
+       active_stack->sp_count--;
+       if (!active_stack->sp_count) {
+               module_put(active_stack->sp_owner);
+               active_stack = NULL;
+       }
+       spin_unlock(&ocfs2_stack_lock);
+}
+
+/*
+ * Add @plugin to the list of known cluster stacks, resetting its use
+ * count and giving it the current locking protocol.  Returns 0, or
+ * -EEXIST when a plugin of the same name is already registered.
+ */
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
+{
+       int rc = -EEXIST;
+
+       spin_lock(&ocfs2_stack_lock);
+       if (ocfs2_stack_lookup(plugin->sp_name)) {
+               printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
+                      plugin->sp_name);
+       } else {
+               plugin->sp_count = 0;
+               plugin->sp_proto = lproto;
+               list_add(&plugin->sp_list, &ocfs2_stack_list);
+               printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
+                      plugin->sp_name);
+               rc = 0;
+       }
+       spin_unlock(&ocfs2_stack_lock);
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
+
+/*
+ * Remove @plugin from the list of known cluster stacks.  It is a bug to
+ * unregister a plugin that is active or still has users; an unknown
+ * plugin only produces an error message.
+ */
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
+{
+       struct ocfs2_stack_plugin *registered;
+
+       spin_lock(&ocfs2_stack_lock);
+
+       registered = ocfs2_stack_lookup(plugin->sp_name);
+       if (!registered) {
+               printk(KERN_ERR "Stack \"%s\" is not registered\n",
+                      plugin->sp_name);
+               goto out_unlock;
+       }
+
+       BUG_ON(registered != plugin);
+       BUG_ON(plugin == active_stack);
+       BUG_ON(plugin->sp_count != 0);
+       list_del_init(&plugin->sp_list);
+       printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
+              plugin->sp_name);
+
+out_unlock:
+       spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
+
+/*
+ * Install @proto as the locking protocol and push it to every plugin
+ * already registered.  Must not be called with a NULL @proto or while a
+ * stack is active.
+ */
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
+{
+       struct ocfs2_stack_plugin *p;
+
+       BUG_ON(proto == NULL);
+
+       spin_lock(&ocfs2_stack_lock);
+       BUG_ON(active_stack != NULL);
+
+       lproto = proto;
+       list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+               p->sp_proto = lproto;
+       }
+
+       spin_unlock(&ocfs2_stack_lock);
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
+
+
+/*
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
+ * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
+ * underlying stack plugins need to pilfer the lksb off of the lock_res.
+ * If some other structure needs to be passed as an astarg, the plugins
+ * will need to be given a different avenue to the lksb.
+ *
+ * ocfs2_dlm_lock() forwards the request unchanged to the active stack's
+ * dlm_lock operation and returns its result.
+ */
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+                  int mode,
+                  union ocfs2_dlm_lksb *lksb,
+                  u32 flags,
+                  void *name,
+                  unsigned int namelen,
+                  struct ocfs2_lock_res *astarg)
+{
+       BUG_ON(lproto == NULL);
+
+       return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
+                                             name, namelen, astarg);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
+
+/*
+ * Forward an unlock request to the active stack's dlm_unlock operation
+ * and return its result.  A locking protocol must have been set.
+ */
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+                    union ocfs2_dlm_lksb *lksb,
+                    u32 flags,
+                    struct ocfs2_lock_res *astarg)
+{
+       BUG_ON(lproto == NULL);
+
+       return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
+
+/* Return the lock status of @lksb as reported by the active stack. */
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+{
+       return active_stack->sp_ops->lock_status(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
+
+/*
+ * Return the lock value block of @lksb as reported by the active stack.
+ *
+ * Why don't we cast to ocfs2_meta_lvb?  The "clean" answer is that we
+ * don't cast at the glue level.  The real answer is that the header
+ * ordering is nigh impossible.
+ */
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+{
+       return active_stack->sp_ops->lock_lvb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
+
+/* Ask the active stack to dump @lksb. */
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+{
+       active_stack->sp_ops->dump_lksb(lksb);
+}
+EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
+
+/*
+ * Create a cluster connection for @group on the stack labelled
+ * @stack_name.
+ *
+ * On success the stack driver is pinned, the new connection is stored in
+ * *conn and 0 is returned.  On failure nothing is stored in *conn and a
+ * negative errno is returned: -EINVAL if @grouplen exceeds
+ * GROUP_NAME_MAX, -ENOMEM if allocation fails, or whatever the stack
+ * lookup or the stack's connect operation reported.
+ */
+int ocfs2_cluster_connect(const char *stack_name,
+                         const char *group,
+                         int grouplen,
+                         void (*recovery_handler)(int node_num,
+                                                  void *recovery_data),
+                         void *recovery_data,
+                         struct ocfs2_cluster_connection **conn)
+{
+       int rc = 0;
+       struct ocfs2_cluster_connection *new_conn;
+
+       BUG_ON(group == NULL);
+       BUG_ON(conn == NULL);
+       BUG_ON(recovery_handler == NULL);
+
+       if (grouplen > GROUP_NAME_MAX) {
+               rc = -EINVAL;
+               goto out;
+       }
+
+       new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
+                          GFP_KERNEL);
+       if (!new_conn) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       memcpy(new_conn->cc_name, group, grouplen);
+       new_conn->cc_namelen = grouplen;
+       new_conn->cc_recovery_handler = recovery_handler;
+       new_conn->cc_recovery_data = recovery_data;
+
+       /* Start the new connection at our maximum compatibility level */
+       new_conn->cc_version = lproto->lp_max_version;
+
+       /* This will pin the stack driver if successful */
+       rc = ocfs2_stack_driver_get(stack_name);
+       if (rc)
+               goto out_free;
+
+       rc = active_stack->sp_ops->connect(new_conn);
+       if (rc) {
+               /* Give back the reference taken just above */
+               ocfs2_stack_driver_put();
+               goto out_free;
+       }
+
+       *conn = new_conn;
+
+out_free:
+       /* Reached on success too; only free when something failed */
+       if (rc)
+               kfree(new_conn);
+
+out:
+       return rc;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
+
+/*
+ * Detach a connection from the active stack.  When ->disconnect()
+ * succeeds the connection is freed, and if hangup_pending is 0 the
+ * stack driver is dropped as well.  A failing ->disconnect() leaves the
+ * connection allocated and its error is returned.
+ */
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+                            int hangup_pending)
+{
+       int rc;
+
+       BUG_ON(conn == NULL);
+
+       rc = active_stack->sp_ops->disconnect(conn, hangup_pending);
+
+       /* XXX Should we free it anyway? */
+       if (rc)
+               return rc;
+
+       kfree(conn);
+       if (!hangup_pending)
+               ocfs2_stack_driver_put();
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
+
+/*
+ * Run the active stack's ->hangup() operation for the group, if the
+ * stack provides one, and then drop the stack driver.  group must hold a
+ * NUL byte at group[grouplen].
+ */
+void ocfs2_cluster_hangup(const char *group, int grouplen)
+{
+       void (*hangup_op)(const char *group, int grouplen);
+
+       BUG_ON(group == NULL);
+       BUG_ON(group[grouplen] != '\0');
+
+       hangup_op = active_stack->sp_ops->hangup;
+       if (hangup_op != NULL)
+               hangup_op(group, grouplen);
+
+       /* cluster_disconnect() was called with hangup_pending==1 */
+       ocfs2_stack_driver_put();
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
+
+/*
+ * Fetch the local node's identifier through the active stack's
+ * ->this_node() operation and return that operation's result.
+ */
+int ocfs2_cluster_this_node(unsigned int *node)
+{
+       struct ocfs2_stack_operations *ops = active_stack->sp_ops;
+
+       return ops->this_node(node);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
+
+
+/*
+ * Sysfs bits
+ */
+
+/*
+ * sysfs show handler: print the registered locking protocol's maximum
+ * version as "major.minor".  Nothing is printed (0 is returned) while no
+ * locking protocol is set.
+ */
+static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
+                                              struct kobj_attribute *attr,
+                                              char *buf)
+{
+       ssize_t len = 0;
+
+       spin_lock(&ocfs2_stack_lock);
+       if (lproto != NULL) {
+               len = snprintf(buf, PAGE_SIZE, "%u.%u\n",
+                              lproto->lp_max_version.pv_major,
+                              lproto->lp_max_version.pv_minor);
+       }
+       spin_unlock(&ocfs2_stack_lock);
+
+       return len;
+}
+
+static struct kobj_attribute ocfs2_attr_max_locking_protocol =
+       __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+              ocfs2_max_locking_protocol_show, NULL);
+
+/*
+ * sysfs show handler: list the name of every plugin on ocfs2_stack_list,
+ * one per line.  Returns the number of bytes written, a negative
+ * snprintf() result, or -E2BIG when the names do not fit in the
+ * PAGE_SIZE buffer.
+ */
+static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
+                                                struct kobj_attribute *attr,
+                                                char *buf)
+{
+       ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+       struct ocfs2_stack_plugin *p;
+
+       spin_lock(&ocfs2_stack_lock);
+       list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+               /* Append after the names already written */
+               ret = snprintf(buf + total, remain, "%s\n",
+                              p->sp_name);
+               if (ret < 0) {
+                       total = ret;
+                       break;
+               }
+               /*
+                * snprintf() returns the length it wanted to write, so
+                * anything >= remain means the output was truncated.
+                */
+               if (ret >= remain) {
+                       /* snprintf() didn't fit */
+                       total = -E2BIG;
+                       break;
+               }
+               total += ret;
+               remain -= ret;
+       }
+       spin_unlock(&ocfs2_stack_lock);
+
+       return total;
+}
+
+static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
+       __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+              ocfs2_loaded_cluster_plugins_show, NULL);
+
+/*
+ * sysfs show handler: print the name of the active stack plugin followed
+ * by a newline.  Returns 0 when no stack is active and -E2BIG when the
+ * name does not fit in the PAGE_SIZE buffer.
+ */
+static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
+                                               struct kobj_attribute *attr,
+                                               char *buf)
+{
+       ssize_t ret = 0;
+
+       spin_lock(&ocfs2_stack_lock);
+       if (active_stack) {
+               ret = snprintf(buf, PAGE_SIZE, "%s\n",
+                              active_stack->sp_name);
+               /*
+                * snprintf() returns the length it wanted to write, so
+                * anything >= PAGE_SIZE means the output was truncated.
+                */
+               if (ret >= PAGE_SIZE)
+                       ret = -E2BIG;
+       }
+       spin_unlock(&ocfs2_stack_lock);
+
+       return ret;
+}
+
+static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
+       __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+              ocfs2_active_cluster_plugin_show, NULL);
+
+/*
+ * sysfs show handler: print cluster_stack_name followed by a newline,
+ * reading it under ocfs2_stack_lock.
+ */
+static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
+                                       struct kobj_attribute *attr,
+                                       char *buf)
+{
+       ssize_t len;
+
+       spin_lock(&ocfs2_stack_lock);
+       len = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
+       spin_unlock(&ocfs2_stack_lock);
+
+       return len;
+}
+
+/*
+ * sysfs store handler for the cluster stack name.
+ *
+ * One trailing newline is ignored.  What remains must be exactly
+ * OCFS2_STACK_LABEL_LEN bytes with no embedded NUL, else -EINVAL.  While
+ * a stack is active the name cannot change: writing the current name
+ * succeeds, anything else gets -EBUSY.  With no active stack the name is
+ * copied into cluster_stack_name.  Success returns count.
+ */
+static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
+                                        struct kobj_attribute *attr,
+                                        const char *buf, size_t count)
+{
+       size_t name_len = count;
+       ssize_t rc = count;
+
+       if (!count)
+               return 0;
+
+       if (buf[name_len - 1] == '\n')
+               name_len--;
+
+       if (name_len != OCFS2_STACK_LABEL_LEN)
+               return -EINVAL;
+       if (strnlen(buf, name_len) != name_len)
+               return -EINVAL;
+
+       spin_lock(&ocfs2_stack_lock);
+       if (!active_stack)
+               memcpy(cluster_stack_name, buf, name_len);
+       else if (strncmp(buf, cluster_stack_name, name_len))
+               rc = -EBUSY;
+       spin_unlock(&ocfs2_stack_lock);
+
+       return rc;
+}
+
+
+/* cluster_stack is the only attribute here with a store handler. */
+static struct kobj_attribute ocfs2_attr_cluster_stack =
+       __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+              ocfs2_cluster_stack_show,
+              ocfs2_cluster_stack_store);
+
+/* NULL-terminated list of the attributes collected into ocfs2_attr_group. */
+static struct attribute *ocfs2_attrs[] = {
+       &ocfs2_attr_max_locking_protocol.attr,
+       &ocfs2_attr_loaded_cluster_plugins.attr,
+       &ocfs2_attr_active_cluster_plugin.attr,
+       &ocfs2_attr_cluster_stack.attr,
+       NULL,
+};
+
+/* Attribute group that ocfs2_sysfs_init() attaches to the ocfs2 kset. */
+static struct attribute_group ocfs2_attr_group = {
+       .attrs = ocfs2_attrs,
+};
+
+/* The "ocfs2" kset that ocfs2_sysfs_init() creates under fs_kobj. */
+static struct kset *ocfs2_kset;
+
+/* Tear down the sysfs presence by unregistering ocfs2_kset. */
+static void ocfs2_sysfs_exit(void)
+{
+       kset_unregister(ocfs2_kset);
+}
+
+/*
+ * Create the "ocfs2" kset under fs_kobj and attach ocfs2_attr_group to
+ * it.  Returns 0 on success, -ENOMEM if the kset could not be created,
+ * or the sysfs_create_group() error after unregistering the kset again.
+ */
+static int ocfs2_sysfs_init(void)
+{
+       int err;
+
+       ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
+       if (ocfs2_kset == NULL)
+               return -ENOMEM;
+
+       err = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
+       if (err)
+               kset_unregister(ocfs2_kset);
+
+       return err;
+}
+
+/*
+ * Module init: seed cluster_stack_name with OCFS2_STACK_PLUGIN_O2CB and
+ * set up the sysfs files.  Returns the result of ocfs2_sysfs_init().
+ */
+static int __init ocfs2_stack_glue_init(void)
+{
+       strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
+
+       return ocfs2_sysfs_init();
+}
+
+/* Module exit: forget the locking protocol and remove the sysfs files. */
+static void __exit ocfs2_stack_glue_exit(void)
+{
+       lproto = NULL;
+       ocfs2_sysfs_exit();
+}
+
+/* Module metadata and entry points for the stack glue layer. */
+MODULE_AUTHOR("Oracle");
+MODULE_DESCRIPTION("ocfs2 cluster stack glue layer");
+MODULE_LICENSE("GPL");
+module_init(ocfs2_stack_glue_init);
+module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
new file mode 100644 (file)
index 0000000..005e4f1
--- /dev/null
@@ -0,0 +1,261 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * stackglue.h
+ *
+ * Glue to the underlying cluster stack.
+ *
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef STACKGLUE_H
+#define STACKGLUE_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/dlmconstants.h>
+
+#include "dlm/dlmapi.h"
+#include <linux/dlm.h>
+
+/*
+ * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
+ * some day, but right now we need it.  Let's fake it.  This value is larger
+ * than any flag in dlmconstants.h.
+ */
+#define DLM_LKF_LOCAL          0x00100000
+
+/*
+ * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h.  That probably
+ * wants to be in a public header.
+ */
+#define GROUP_NAME_MAX         64
+
+
+/*
+ * ocfs2_protocol_version changes when ocfs2 does something different in
+ * its inter-node behavior.  See dlmglue.c for more information.
+ */
+struct ocfs2_protocol_version {
+       u8 pv_major;
+       u8 pv_minor;
+};
+
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ */
+struct ocfs2_locking_protocol {
+       struct ocfs2_protocol_version lp_max_version;
+       void (*lp_lock_ast)(void *astarg);
+       void (*lp_blocking_ast)(void *astarg, int level);
+       void (*lp_unlock_ast)(void *astarg, int error);
+};
+
+
+/*
+ * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
+ * has a pointer to separately allocated lvb space.  This struct exists only to
+ * include in the lksb union to make space for a combined dlm_lksb and lvb.
+ */
+struct fsdlm_lksb_plus_lvb {
+       struct dlm_lksb lksb;
+       char lvb[DLM_LVB_LEN];
+};
+
+/*
+ * A union of all lock status structures.  We define it here so that the
+ * size of the union is known.  Lock status structures are embedded in
+ * ocfs2 inodes.
+ */
+union ocfs2_dlm_lksb {
+       struct dlm_lockstatus lksb_o2dlm;
+       struct dlm_lksb lksb_fsdlm;
+       struct fsdlm_lksb_plus_lvb padding;
+};
+
+/*
+ * A cluster connection.  Mostly opaque to ocfs2, the connection holds
+ * state for the underlying stack.  ocfs2 does use cc_version to determine
+ * locking compatibility.
+ */
+struct ocfs2_cluster_connection {
+       /* Group name bytes; length is in cc_namelen */
+       char cc_name[GROUP_NAME_MAX];
+       int cc_namelen;
+       struct ocfs2_protocol_version cc_version;
+       /* Recovery callback and the opaque data pointer stored with it */
+       void (*cc_recovery_handler)(int node_num, void *recovery_data);
+       void *cc_recovery_data;
+       /* Set by the stack's ->connect(): the filesystem lockspace */
+       void *cc_lockspace;
+       /* Any other information the stack wants to keep */
+       void *cc_private;
+};
+
+/*
+ * Each cluster stack implements the stack operations structure.  Not used
+ * in the ocfs2 code, the stackglue code translates generic cluster calls
+ * into stack operations.
+ */
+struct ocfs2_stack_operations {
+       /*
+        * The fs code calls ocfs2_cluster_connect() to attach a new
+        * filesystem to the cluster stack.  The ->connect() op is passed
+        * an ocfs2_cluster_connection with the name and recovery field
+        * filled in.
+        *
+        * The stack must set up any notification mechanisms and create
+        * the filesystem lockspace in the DLM.  The lockspace should be
+        * stored on cc_lockspace.  Any other information can be stored on
+        * cc_private.
+        *
+        * ->connect() must not return until it is guaranteed that
+        *
+        *  - Node down notifications for the filesystem will be received
+        *    and passed to conn->cc_recovery_handler().
+        *  - Locking requests for the filesystem will be processed.
+        */
+       int (*connect)(struct ocfs2_cluster_connection *conn);
+
+       /*
+        * The fs code calls ocfs2_cluster_disconnect() when a filesystem
+        * no longer needs cluster services.  All DLM locks have been
+        * dropped, and recovery notification is being ignored by the
+        * fs code.  The stack must disengage from the DLM and discontinue
+        * recovery notification.
+        *
+        * Once ->disconnect() has returned, the connection structure will
+        * be freed.  Thus, a stack must not return from ->disconnect()
+        * until it will no longer reference the conn pointer.
+        *
+        * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
+        * be dropping the reference on the module.
+        */
+       int (*disconnect)(struct ocfs2_cluster_connection *conn,
+                         int hangup_pending);
+
+       /*
+        * ocfs2_cluster_hangup() exists for compatibility with older
+        * ocfs2 tools.  Only the classic stack really needs it.  As such
+        * ->hangup() is not required of all stacks.  See the comment by
+        * ocfs2_cluster_hangup() for more details.
+        *
+        * Note that ocfs2_cluster_hangup() can only be called if
+        * hangup_pending was passed to ocfs2_cluster_disconnect().
+        */
+       void (*hangup)(const char *group, int grouplen);
+
+       /*
+        * ->this_node() returns the cluster's unique identifier for the
+        * local node.
+        */
+       int (*this_node)(unsigned int *node);
+
+       /*
+        * Call the underlying dlm lock function.  The ->dlm_lock()
+        * callback should convert the flags and mode as appropriate.
+        *
+        * ast and bast functions are not part of the call because the
+        * stack will likely want to wrap ast and bast calls before passing
+        * them to stack->sp_proto.
+        */
+       int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
+                       int mode,
+                       union ocfs2_dlm_lksb *lksb,
+                       u32 flags,
+                       void *name,
+                       unsigned int namelen,
+                       void *astarg);
+
+       /*
+        * Call the underlying dlm unlock function.  The ->dlm_unlock()
+        * function should convert the flags as appropriate.
+        *
+        * The unlock ast is not passed, as the stack will want to wrap
+        * it before calling stack->sp_proto->lp_unlock_ast().
+        */
+       int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
+                         union ocfs2_dlm_lksb *lksb,
+                         u32 flags,
+                         void *astarg);
+
+       /*
+        * Return the status of the current lock status block.  The fs
+        * code should never dereference the union.  The ->lock_status()
+        * callback pulls out the stack-specific lksb, converts the status
+        * to a proper errno, and returns it.
+        */
+       int (*lock_status)(union ocfs2_dlm_lksb *lksb);
+
+       /*
+        * Pull the lvb pointer off of the stack-specific lksb.
+        */
+       void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
+
+       /*
+        * This is an optional debugging hook.  If provided, the
+        * stack can dump debugging information about this lock.
+        */
+       void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
+};
+
+/*
+ * Each stack plugin must describe itself by registering a
+ * ocfs2_stack_plugin structure.  This is only seen by stackglue and the
+ * stack driver.
+ */
+struct ocfs2_stack_plugin {
+       char *sp_name;
+       struct ocfs2_stack_operations *sp_ops;
+       struct module *sp_owner;
+
+       /* These are managed by the stackglue code. */
+       struct list_head sp_list;
+       unsigned int sp_count;
+       struct ocfs2_locking_protocol *sp_proto;
+};
+
+
+/* Used by the filesystem */
+int ocfs2_cluster_connect(const char *stack_name,
+                         const char *group,
+                         int grouplen,
+                         void (*recovery_handler)(int node_num,
+                                                  void *recovery_data),
+                         void *recovery_data,
+                         struct ocfs2_cluster_connection **conn);
+int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
+                            int hangup_pending);
+void ocfs2_cluster_hangup(const char *group, int grouplen);
+int ocfs2_cluster_this_node(unsigned int *node);
+
+struct ocfs2_lock_res;
+int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
+                  int mode,
+                  union ocfs2_dlm_lksb *lksb,
+                  u32 flags,
+                  void *name,
+                  unsigned int namelen,
+                  struct ocfs2_lock_res *astarg);
+int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
+                    union ocfs2_dlm_lksb *lksb,
+                    u32 flags,
+                    struct ocfs2_lock_res *astarg);
+
+int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
+
+void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
+
+
+/* Used by stack plugins */
+int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
+void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+#endif  /* STACKGLUE_H */
index 72c198a004df46744c1c025e253763727ce635bb..d2d278fb9819ee3da92577a367b79f0bdc18681f 100644 (file)
 
 #include "buffer_head_io.h"
 
+#define NOT_ALLOC_NEW_GROUP            0
+#define ALLOC_NEW_GROUP                        1
+
+#define OCFS2_MAX_INODES_TO_STEAL      1024
+
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
                                                u64 *bg_blkno,
                                                u16 *bg_bit_off);
 
-void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 {
        struct inode *inode = ac->ac_inode;
 
@@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
                mutex_unlock(&inode->i_mutex);
 
                iput(inode);
+               ac->ac_inode = NULL;
        }
-       if (ac->ac_bh)
+       if (ac->ac_bh) {
                brelse(ac->ac_bh);
+               ac->ac_bh = NULL;
+       }
+}
+
+void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
+{
+       ocfs2_free_ac_resource(ac);
        kfree(ac);
 }
 
@@ -391,7 +404,8 @@ bail:
 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                                       struct ocfs2_alloc_context *ac,
                                       int type,
-                                      u32 slot)
+                                      u32 slot,
+                                      int alloc_new_group)
 {
        int status;
        u32 bits_wanted = ac->ac_bits_wanted;
@@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
        }
 
        ac->ac_inode = alloc_inode;
+       ac->ac_alloc_slot = slot;
 
        fe = (struct ocfs2_dinode *) bh->b_data;
        if (!OCFS2_IS_VALID_DINODE(fe)) {
@@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                        goto bail;
                }
 
+               if (alloc_new_group != ALLOC_NEW_GROUP) {
+                       mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
+                            "and we don't alloc a new group for it.\n",
+                            slot, bits_wanted, free_bits);
+                       status = -ENOSPC;
+                       goto bail;
+               }
+
                status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
                if (status < 0) {
                        if (status != -ENOSPC)
@@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
        (*ac)->ac_group_search = ocfs2_block_group_search;
 
        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
-                                            EXTENT_ALLOC_SYSTEM_INODE, slot);
+                                            EXTENT_ALLOC_SYSTEM_INODE,
+                                            slot, ALLOC_NEW_GROUP);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -508,10 +532,42 @@ bail:
        return status;
 }
 
+/*
+ * Try to reserve inode suballocator bits from another node's slot.
+ *
+ * The search starts at the slot returned by ocfs2_get_inode_steal_slot(),
+ * or at the slot after ours when that is OCFS2_INVALID_SLOT.  It walks
+ * max_slots slot numbers, wrapping to 0 and skipping our own slot.  Each
+ * attempt passes NOT_ALLOC_NEW_GROUP to ocfs2_reserve_suballoc_bits().
+ *
+ * On success the slot is recorded with ocfs2_set_inode_steal_slot() and
+ * the reservation status is returned.  Otherwise the status of the last
+ * attempt is returned, or -ENOSPC if no slot was tried.
+ */
+static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
+                                             struct ocfs2_alloc_context *ac)
+{
+       int i, status = -ENOSPC;
+       s16 slot = ocfs2_get_inode_steal_slot(osb);
+
+       /* Start to steal inodes from the first slot after ours. */
+       if (slot == OCFS2_INVALID_SLOT)
+               slot = osb->slot_num + 1;
+
+       /* slot advances on every pass, including the skipped one */
+       for (i = 0; i < osb->max_slots; i++, slot++) {
+               /* Wrap around past the last slot */
+               if (slot == osb->max_slots)
+                       slot = 0;
+
+               /* Our own slot is not a steal target */
+               if (slot == osb->slot_num)
+                       continue;
+
+               status = ocfs2_reserve_suballoc_bits(osb, ac,
+                                                    INODE_ALLOC_SYSTEM_INODE,
+                                                    slot, NOT_ALLOC_NEW_GROUP);
+               if (status >= 0) {
+                       ocfs2_set_inode_steal_slot(osb, slot);
+                       break;
+               }
+
+               /* Release what the failed attempt left on ac before retrying */
+               ocfs2_free_ac_resource(ac);
+       }
+
+       return status;
+}
+
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
                            struct ocfs2_alloc_context **ac)
 {
        int status;
+       s16 slot = ocfs2_get_inode_steal_slot(osb);
 
        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
        if (!(*ac)) {
@@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 
        (*ac)->ac_group_search = ocfs2_block_group_search;
 
+       /*
+        * slot is set when we successfully steal inode from other nodes.
+        * It is reset in 3 places:
+        * 1. when we flush the truncate log
+        * 2. when we complete local alloc recovery.
+        * 3. when we successfully allocate from our own slot.
+        * After it is set, we will go on stealing inodes until we find the
+        * need to check our slots to see whether there is some space for us.
+        */
+       if (slot != OCFS2_INVALID_SLOT &&
+           atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
+               goto inode_steal;
+
+       atomic_set(&osb->s_num_inodes_stolen, 0);
        status = ocfs2_reserve_suballoc_bits(osb, *ac,
                                             INODE_ALLOC_SYSTEM_INODE,
-                                            osb->slot_num);
+                                            osb->slot_num, ALLOC_NEW_GROUP);
+       if (status >= 0) {
+               status = 0;
+
+               /*
+                * Some inodes must be freed by us, so try to allocate
+                * from our own next time.
+                */
+               if (slot != OCFS2_INVALID_SLOT)
+                       ocfs2_init_inode_steal_slot(osb);
+               goto bail;
+       } else if (status < 0 && status != -ENOSPC) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       ocfs2_free_ac_resource(*ac);
+
+inode_steal:
+       status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
+       atomic_inc(&osb->s_num_inodes_stolen);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 
        status = ocfs2_reserve_suballoc_bits(osb, ac,
                                             GLOBAL_BITMAP_SYSTEM_INODE,
-                                            OCFS2_INVALID_SLOT);
+                                            OCFS2_INVALID_SLOT,
+                                            ALLOC_NEW_GROUP);
        if (status < 0 && status != -ENOSPC) {
                mlog_errno(status);
                goto bail;
index 8799033bb459d9adc4b558d0e50c1b4a52562b61..544c600662bd074c1b76ca1c9ead16ab0f24402a 100644 (file)
@@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *,
 struct ocfs2_alloc_context {
        struct inode *ac_inode;    /* which bitmap are we allocating from? */
        struct buffer_head *ac_bh; /* file entry bh */
+       u32    ac_alloc_slot;   /* which slot are we allocating from? */
        u32    ac_bits_wanted;
        u32    ac_bits_given;
 #define OCFS2_AC_USE_LOCAL 1
index bec75aff3d9f0d4746e71ceb1d70a56032eee3ec..df63ba20ae9016d7957930da638b729d9ab242bb 100644 (file)
@@ -40,8 +40,7 @@
 #include <linux/crc32.h>
 #include <linux/debugfs.h>
 #include <linux/mount.h>
-
-#include <cluster/nodemanager.h>
+#include <linux/seq_file.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -88,6 +87,7 @@ struct mount_options
        unsigned int    atime_quantum;
        signed short    slot;
        unsigned int    localalloc_opt;
+       char            cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
 };
 
 static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
 static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
 static int ocfs2_check_volume(struct ocfs2_super *osb);
 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
                               struct buffer_head *bh,
@@ -154,6 +153,7 @@ enum {
        Opt_commit,
        Opt_localalloc,
        Opt_localflocks,
+       Opt_stack,
        Opt_err,
 };
 
@@ -172,6 +172,7 @@ static match_table_t tokens = {
        {Opt_commit, "commit=%u"},
        {Opt_localalloc, "localalloc=%d"},
        {Opt_localflocks, "localflocks"},
+       {Opt_stack, "cluster_stack=%s"},
        {Opt_err, NULL}
 };
 
@@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
                }
        }
 
+       if (ocfs2_userspace_stack(osb)) {
+               if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+                       mlog(ML_ERROR, "Userspace stack expected, but "
+                            "o2cb heartbeat arguments passed to mount\n");
+                       return -EINVAL;
+               }
+       }
+
        if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
-               if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) {
+               if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
+                   !ocfs2_userspace_stack(osb)) {
                        mlog(ML_ERROR, "Heartbeat has to be started to mount "
                             "a read-write clustered device.\n");
                        return -EINVAL;
@@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
        return 0;
 }
 
+/*
+ * Check the cluster_stack mount option against the filesystem.  On a
+ * userspace-stack filesystem the option must match the stack name on
+ * disk; on any other filesystem no stack may be passed at all.
+ * Returns 0 when consistent, -EINVAL otherwise.
+ */
+static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
+                                       struct mount_options *mopt)
+{
+       if (!ocfs2_userspace_stack(osb)) {
+               if (!mopt->cluster_stack[0])
+                       return 0;
+
+               mlog(ML_ERROR,
+                    "cluster stack passed to mount, but this filesystem "
+                    "does not support it\n");
+               return -EINVAL;
+       }
+
+       if (strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
+                   OCFS2_STACK_LABEL_LEN)) {
+               mlog(ML_ERROR,
+                    "cluster stack passed to mount (\"%s\") does not "
+                    "match the filesystem (\"%s\")\n",
+                    mopt->cluster_stack,
+                    osb->osb_cluster_stack);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct dentry *root;
@@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
                goto read_super_error;
        }
 
-       /* for now we only have one cluster/node, make sure we see it
-        * in the heartbeat universe */
-       if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
-               if (!o2hb_check_local_node_heartbeating()) {
-                       status = -EINVAL;
-                       goto read_super_error;
-               }
-       }
-
        /* probe for superblock */
        status = ocfs2_sb_probe(sb, &bh, &sector_size);
        if (status < 0) {
@@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        osb->osb_commit_interval = parsed_options.commit_interval;
        osb->local_alloc_size = parsed_options.localalloc_opt;
 
+       status = ocfs2_verify_userspace_stack(osb, &parsed_options);
+       if (status)
+               goto read_super_error;
+
        sb->s_magic = OCFS2_SUPER_MAGIC;
 
        /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        if (ocfs2_mount_local(osb))
                snprintf(nodestr, sizeof(nodestr), "local");
        else
-               snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+               snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
 
        printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
               "with %s data mode.\n",
@@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb,
        mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
        mopt->slot = OCFS2_INVALID_SLOT;
        mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+       mopt->cluster_stack[0] = '\0';
 
        if (!options) {
                status = 1;
@@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb,
                        if (!is_remount)
                                mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
                        break;
+               case Opt_stack:
+                       /* Check both that the option we were passed
+                        * is of the right length and that it is a proper
+                        * string of the right length.
+                        */
+                       if (((args[0].to - args[0].from) !=
+                            OCFS2_STACK_LABEL_LEN) ||
+                           (strnlen(args[0].from,
+                                    OCFS2_STACK_LABEL_LEN) !=
+                            OCFS2_STACK_LABEL_LEN)) {
+                               mlog(ML_ERROR,
+                                    "Invalid cluster_stack option\n");
+                               status = 0;
+                               goto bail;
+                       }
+                       memcpy(mopt->cluster_stack, args[0].from,
+                              OCFS2_STACK_LABEL_LEN);
+                       mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+                       break;
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (opts & OCFS2_MOUNT_LOCALFLOCKS)
                seq_printf(s, ",localflocks,");
 
+       if (osb->osb_cluster_stack[0])
+               seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
+                          osb->osb_cluster_stack);
+
        return 0;
 }
 
@@ -957,6 +1015,8 @@ static int __init ocfs2_init(void)
                mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
        }
 
+       ocfs2_set_locking_protocol();
+
 leave:
        if (status < 0) {
                ocfs2_free_mem_caches();
@@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb,
        return 0;
 }
 
-/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
-static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
-{
-       int status;
-
-       /* XXX hold a ref on the node while mounte?  easy enough, if
-        * desirable. */
-       if (ocfs2_mount_local(osb))
-               osb->node_num = 0;
-       else
-               osb->node_num = o2nm_this_node();
-
-       if (osb->node_num == O2NM_MAX_NODES) {
-               mlog(ML_ERROR, "could not find this host's node number\n");
-               status = -ENOENT;
-               goto bail;
-       }
-
-       mlog(0, "I am node %d\n", osb->node_num);
-
-       status = 0;
-bail:
-       return status;
-}
-
 static int ocfs2_mount_volume(struct super_block *sb)
 {
        int status = 0;
@@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
        if (ocfs2_is_hard_readonly(osb))
                goto leave;
 
-       status = ocfs2_fill_local_node_info(osb);
-       if (status < 0) {
-               mlog_errno(status);
-               goto leave;
-       }
-
        status = ocfs2_dlm_init(osb);
        if (status < 0) {
                mlog_errno(status);
@@ -1224,18 +1253,9 @@ leave:
        return status;
 }
 
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
-static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
-{
-       mb();
-       return osb->recovery_thread_task != NULL;
-}
-
 static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 {
-       int tmp;
+       int tmp, hangup_needed = 0;
        struct ocfs2_super *osb = NULL;
        char nodestr[8];
 
@@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
        ocfs2_truncate_log_shutdown(osb);
 
-       /* disable any new recovery threads and wait for any currently
-        * running ones to exit. Do this before setting the vol_state. */
-       mutex_lock(&osb->recovery_lock);
-       osb->disable_recovery = 1;
-       mutex_unlock(&osb->recovery_lock);
-       wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
-
-       /* At this point, we know that no more recovery threads can be
-        * launched, so wait for any recovery completion work to
-        * complete. */
-       flush_workqueue(ocfs2_wq);
+       /* This will disable recovery and flush any recovery work. */
+       ocfs2_recovery_exit(osb);
 
        ocfs2_journal_shutdown(osb);
 
        ocfs2_sync_blockdev(sb);
 
-       /* No dlm means we've failed during mount, so skip all the
-        * steps which depended on that to complete. */
-       if (osb->dlm) {
+       /* No cluster connection means we've failed during mount, so skip
+        * all the steps which depended on that to complete. */
+       if (osb->cconn) {
                tmp = ocfs2_super_lock(osb, 1);
                if (tmp < 0) {
                        mlog_errno(tmp);
@@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        if (osb->slot_num != OCFS2_INVALID_SLOT)
                ocfs2_put_slot(osb);
 
-       if (osb->dlm)
+       if (osb->cconn)
                ocfs2_super_unlock(osb, 1);
 
        ocfs2_release_system_inodes(osb);
 
-       if (osb->dlm)
-               ocfs2_dlm_shutdown(osb);
+       /*
+        * If we're dismounting due to mount error, mount.ocfs2 will clean
+        * up heartbeat.  If we're a local mount, there is no heartbeat.
+        * If we failed before we got a uuid_str, we can't stop
+        * heartbeat.  Otherwise, do it.
+        */
+       if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+               hangup_needed = 1;
+
+       if (osb->cconn)
+               ocfs2_dlm_shutdown(osb, hangup_needed);
 
        debugfs_remove(osb->osb_debug_root);
 
-       if (!mnt_err)
-               ocfs2_stop_heartbeat(osb);
+       if (hangup_needed)
+               ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
 
        atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
 
        if (ocfs2_mount_local(osb))
                snprintf(nodestr, sizeof(nodestr), "local");
        else
-               snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num);
+               snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
 
        printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
               osb->dev_str, nodestr);
@@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        sb->s_fs_info = osb;
        sb->s_op = &ocfs2_sops;
        sb->s_export_op = &ocfs2_export_ops;
-       osb->osb_locking_proto = ocfs2_locking_protocol;
        sb->s_time_gran = 1;
        sb->s_flags |= MS_NOATIME;
        /* this is needed to support O_LARGEFILE */
@@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        osb->s_sectsize_bits = blksize_bits(sector_size);
        BUG_ON(!osb->s_sectsize_bits);
 
-       init_waitqueue_head(&osb->recovery_event);
        spin_lock_init(&osb->dc_task_lock);
        init_waitqueue_head(&osb->dc_event);
        osb->dc_work_sequence = 0;
@@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
        INIT_LIST_HEAD(&osb->blocked_lock_list);
        osb->blocked_lock_count = 0;
        spin_lock_init(&osb->osb_lock);
+       ocfs2_init_inode_steal_slot(osb);
 
        atomic_set(&osb->alloc_stats.moves, 0);
        atomic_set(&osb->alloc_stats.local_data, 0);
@@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb,
        snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
                 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
-       mutex_init(&osb->recovery_lock);
-
-       osb->disable_recovery = 0;
-       osb->recovery_thread_task = NULL;
+       status = ocfs2_recovery_init(osb);
+       if (status) {
+               mlog(ML_ERROR, "Unable to initialize recovery state\n");
+               mlog_errno(status);
+               goto bail;
+       }
 
        init_waitqueue_head(&osb->checkpoint_event);
        atomic_set(&osb->needs_checkpoint, 0);
 
        osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 
-       osb->node_num = O2NM_INVALID_NODE_NUM;
        osb->slot_num = OCFS2_INVALID_SLOT;
 
        osb->local_alloc_state = OCFS2_LA_UNUSED;
        osb->local_alloc_bh = NULL;
 
-       ocfs2_setup_hb_callbacks(osb);
-
        init_waitqueue_head(&osb->osb_mount_event);
 
        osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
@@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb,
                goto bail;
        }
 
+       if (ocfs2_userspace_stack(osb)) {
+               memcpy(osb->osb_cluster_stack,
+                      OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
+                      OCFS2_STACK_LABEL_LEN);
+               osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+               if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
+                       mlog(ML_ERROR,
+                            "couldn't mount because of an invalid "
+                            "cluster stack label (%s) \n",
+                            osb->osb_cluster_stack);
+                       status = -EINVAL;
+                       goto bail;
+               }
+       } else {
+               /* The empty string is identical with classic tools that
+                * don't know about s_cluster_info. */
+               osb->osb_cluster_stack[0] = '\0';
+       }
+
        get_random_bytes(&osb->s_next_generation, sizeof(u32));
 
        /* FIXME
@@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 
        /* This function assumes that the caller has the main osb resource */
 
-       if (osb->slot_info)
-               ocfs2_free_slot_info(osb->slot_info);
+       ocfs2_free_slot_info(osb);
 
        kfree(osb->osb_orphan_wipes);
        /* FIXME
index 5f66c446615176eff46e1864a7343f228bac19e6..817f5966edcac2c2e36033902f2ac2ce0f89f626 100644 (file)
@@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 
 void sysfs_remove_link(struct kobject * kobj, const char * name)
 {
-       sysfs_hash_and_remove(kobj->sd, name);
+       struct sysfs_dirent *parent_sd = NULL;
+
+       if (!kobj)
+               parent_sd = &sysfs_root;
+       else
+               parent_sd = kobj->sd;
+
+       sysfs_hash_and_remove(parent_sd, name);
 }
 
 static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,