[PATCH 1/2] ocfs2: add flock lock type

author Mark Fasheh <mark.fasheh@oracle.com>

Fri, 21 Dec 2007 00:43:10 +0000 (16:43 -0800)

committer Mark Fasheh <mark.fasheh@oracle.com>

Fri, 25 Jan 2008 23:05:43 +0000 (15:05 -0800)
author Mark Fasheh <mark.fasheh@oracle.com>
Fri, 21 Dec 2007 00:43:10 +0000 (16:43 -0800)
committer Mark Fasheh <mark.fasheh@oracle.com>
Fri, 25 Jan 2008 23:05:43 +0000 (15:05 -0800)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c

index fa5e3bdc295d2a89a2298665939f74a50996b5db..3867244fb144a4c639f0e1ed2fdd41dcffc7798d 100644 (file)
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -68,6 +68,7 @@ struct ocfs2_mask_waiter {
  
  static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
  static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
  
  static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
  static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
  
  /*
   * Return value from ->downconvert_worker functions.
  
  /*
   * Return value from ->downconvert_worker functions.
@@ -252,6 +253,11 @@ static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
         .flags          = 0,
  };
  
         .flags          = 0,
  };
  
+static struct ocfs2_lock_res_ops ocfs2_flock_lops = {   /* ops used for OCFS2_LOCK_TYPE_FLOCK lock resources */
+       .get_osb        = ocfs2_get_file_osb,   /* osb is reached through the lockres' l_priv */
+       .flags          = 0,
+};
+
  static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
  {
         return lockres->l_type == OCFS2_LOCK_TYPE_META ||
  static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
  {
         return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -310,6 +316,17 @@ static int ocfs2_inode_lock_update(struct inode *inode,
                                   struct buffer_head **bh);
  static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
  static inline int ocfs2_highest_compat_lock_level(int level);
                                   struct buffer_head **bh);
  static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
  static inline int ocfs2_highest_compat_lock_level(int level);
+static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
+                                     int new_level);
+static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
+                                 struct ocfs2_lock_res *lockres,
+                                 int new_level,
+                                 int lvb);
+static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
+                                       struct ocfs2_lock_res *lockres);
+static int ocfs2_cancel_convert(struct ocfs2_super *osb,
+                               struct ocfs2_lock_res *lockres);
+
  
  static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
                                   u64 blkno,
  
  static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
                                   u64 blkno,
@@ -419,6 +436,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
         return OCFS2_SB(inode->i_sb);
  }
  
         return OCFS2_SB(inode->i_sb);
  }
  
+static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
+{
+       struct ocfs2_file_private *fp = lockres->l_priv;        /* l_priv is treated as the file's private data */
+
+       return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);    /* superblock of the inode behind the file */
+}
+
  static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
  {
         __be64 inode_blkno_be;
  static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
  {
         __be64 inode_blkno_be;
@@ -499,6 +523,21 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
                                    &ocfs2_rename_lops, osb);
  }
  
                                    &ocfs2_rename_lops, osb);
  }
  
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+                             struct ocfs2_file_private *fp)
+{
+       struct inode *inode = fp->fp_file->f_mapping->host;     /* inode behind the open file */
+       struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+       ocfs2_lock_res_init_once(lockres);
+       ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
+                             inode->i_generation, lockres->l_name);    /* name built from type, block number and generation */
+       ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
+                                  OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
+                                  fp); /* presumably stored as l_priv, which ocfs2_get_file_osb() reads -- confirm */
+       lockres->l_flags |= OCFS2_LOCK_NOCACHE; /* ocfs2_blocking_ast() returns early for NOCACHE locks */
+}
+
  void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
  {
         mlog_entry_void();
  void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
  {
         mlog_entry_void();
@@ -715,6 +754,13 @@ static void ocfs2_blocking_ast(void *opaque, int level)
              lockres->l_name, level, lockres->l_level,
              ocfs2_lock_type_string(lockres->l_type));
  
              lockres->l_name, level, lockres->l_level,
              ocfs2_lock_type_string(lockres->l_type));
  
+       /*
+        * We can skip the bast for locks which don't enable caching -
+        * they'll be dropped at the earliest possible time anyway.
+        */
+       if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
+               return;
+
         spin_lock_irqsave(&lockres->l_lock, flags);
         needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
         if (needs_downconvert)
         spin_lock_irqsave(&lockres->l_lock, flags);
         needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
         if (needs_downconvert)
@@ -926,6 +972,21 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
  
  }
  
  
  }
  
+static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
+                                            struct ocfs2_lock_res *lockres)
+{
+       int ret;
+
+       ret = wait_for_completion_interruptible(&mw->mw_complete);
+       if (ret)        /* wait did not complete normally: take mw off the lockres */
+               lockres_remove_mask_waiter(lockres, mw);
+       else
+               ret = mw->mw_status;    /* completed: report the status recorded in the waiter */
+       /* Re-arm the completion in case we want to wait on it again */
+       INIT_COMPLETION(mw->mw_complete);
+       return ret;
+}
+
  static int ocfs2_cluster_lock(struct ocfs2_super *osb,
                               struct ocfs2_lock_res *lockres,
                               int level,
  static int ocfs2_cluster_lock(struct ocfs2_super *osb,
                               struct ocfs2_lock_res *lockres,
                               int level,
@@ -1296,6 +1357,212 @@ out:
         mlog_exit_void();
  }
  
         mlog_exit_void();
  }
  
+static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
+                                    int level)
+{
+       int ret;
+       struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
+       unsigned long flags;
+       struct ocfs2_mask_waiter mw;
+
+       ocfs2_init_mask_waiter(&mw);
+
+retry_cancel:
+       spin_lock_irqsave(&lockres->l_lock, flags);
+       if (lockres->l_flags & OCFS2_LOCK_BUSY) {       /* a request is still outstanding */
+               ret = ocfs2_prepare_cancel_convert(osb, lockres);
+               if (ret) {      /* non-zero: issue the cancel, then re-check from the top */
+                       spin_unlock_irqrestore(&lockres->l_lock, flags);
+                       ret = ocfs2_cancel_convert(osb, lockres);
+                       if (ret < 0) {
+                               mlog_errno(ret);
+                               goto out;
+                       }
+                       goto retry_cancel;
+               }
+               lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);      /* waiter on the BUSY flag, goal 0 */
+               spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+               ocfs2_wait_for_mask(&mw);       /* return value deliberately unused; state is re-read after the retry */
+               goto retry_cancel;
+       }
+
+       ret = -ERESTARTSYS;     /* l_lock is still held on this path */
+       /*
+        * We may still have gotten the lock, in which case there's no
+        * point to restarting the syscall.
+        */
+       if (lockres->l_level == level)
+               ret = 0;
+
+       mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
+            lockres->l_flags, lockres->l_level, lockres->l_action);
+
+       spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+out:
+       return ret;
+}
+
+/*
+ * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
+ * flock() calls. The locking approach this requires is sufficiently
+ * different from all other cluster lock types that we implement a
+ * separate path to the "low-level" dlm calls. In particular:
+ *
+ * - No optimization of lock levels is done - we take at exactly
+ *   what's been requested.
+ *
+ * - No lock caching is employed. We immediately downconvert to
+ *   no-lock at unlock time. This also means flock locks never go on
+ *   the blocking list.
+ *
+ * - Since userspace can trivially deadlock itself with flock, we make
+ *   sure to allow cancellation of a misbehaving application's flock()
+ *   request.
+ *
+ * - Access to any flock lockres doesn't require concurrency, so we
+ *   can simplify the code by requiring the caller to guarantee
+ *   serialization of dlmglue flock calls.
+ */
+int ocfs2_file_lock(struct file *file, int ex, int trylock)
+{
+       int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;  /* ex selects EX mode, otherwise PR */
+       unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;     /* trylock asks the dlm not to queue the request */
+       unsigned long flags;
+       struct ocfs2_file_private *fp = file->private_data;
+       struct ocfs2_lock_res *lockres = &fp->fp_flock;
+       struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+       struct ocfs2_mask_waiter mw;
+
+       ocfs2_init_mask_waiter(&mw);
+
+       if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
+           (lockres->l_level > LKM_NLMODE)) {  /* read without l_lock; callers must serialize flock calls */
+               mlog(ML_ERROR,
+                    "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
+                    "level: %u\n", lockres->l_name, lockres->l_flags,
+                    lockres->l_level);
+               return -EINVAL;
+       }
+
+       spin_lock_irqsave(&lockres->l_lock, flags);
+       if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
+               lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+               spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+               /*
+                * Get the lock at NLMODE to start - that way we
+                * can cancel the upconvert request if need be.
+                */
+               ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;       /* NOTE(review): mw was queued above and is not removed on this path -- confirm */
+               }
+
+               ret = ocfs2_wait_for_mask(&mw);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               spin_lock_irqsave(&lockres->l_lock, flags);     /* retake l_lock for the convert setup below */
+       }
+
+       lockres->l_action = OCFS2_AST_CONVERT;
+       lkm_flags |= LKM_CONVERT;       /* lock exists at this point, so the request is a convert */
+       lockres->l_requested = level;
+       lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
+
+       lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
+       spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+       ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags,
+                     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
+                     ocfs2_locking_ast, lockres, ocfs2_blocking_ast);
+       if (ret != DLM_NORMAL) {
+               if (trylock && ret == DLM_NOTQUEUED)
+                       ret = -EAGAIN;  /* trylock request was not queued: report as "would block" */
+               else {
+                       ocfs2_log_dlm_error("dlmlock", ret, lockres);
+                       ret = -EINVAL;
+               }
+
+               ocfs2_recover_from_dlm_error(lockres, 1);
+               lockres_remove_mask_waiter(lockres, &mw);
+               goto out;
+       }
+
+       ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
+       if (ret == -ERESTARTSYS) {
+               /*
+                * Userspace can deadlock itself with
+                * flock(). Current behavior locally is to allow the
+                * deadlock, but abort the system call if a signal is
+                * received. We follow this example, otherwise a
+                * poorly written program could sit in kernel until
+                * reboot.
+                *
+                * Handling this is a bit more complicated for Ocfs2
+                * though. We can't exit this function with an
+                * outstanding lock request, so a cancel convert is
+                * required. We intentionally overwrite 'ret' - if the
+                * cancel fails and the lock was granted, it's easier
+                * to just bubble success back up to the user.
+                */
+               ret = ocfs2_flock_handle_signal(lockres, level);
+       }
+
+out:
+
+       mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
+            lockres->l_name, ex, trylock, ret);
+       return ret;
+}
+
+void ocfs2_file_unlock(struct file *file)
+{
+       int ret;
+       unsigned long flags;
+       struct ocfs2_file_private *fp = file->private_data;
+       struct ocfs2_lock_res *lockres = &fp->fp_flock;
+       struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
+       struct ocfs2_mask_waiter mw;
+
+       ocfs2_init_mask_waiter(&mw);
+
+       if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))  /* ATTACHED not set: nothing to downconvert */
+               return;
+
+       if (lockres->l_level == LKM_NLMODE)     /* already at no-lock */
+               return;
+
+       mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
+            lockres->l_name, lockres->l_flags, lockres->l_level,
+            lockres->l_action);
+
+       spin_lock_irqsave(&lockres->l_lock, flags);
+       /*
+        * Fake a blocking ast for the downconvert code.
+        */
+       lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
+       lockres->l_blocking = LKM_EXMODE;
+
+       ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+       lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);      /* waiter on the BUSY flag, goal 0 */
+       spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+       ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0);      /* drop to no-lock, lvb argument 0 */
+       if (ret) {
+               mlog_errno(ret);
+               return; /* NOTE(review): mw was queued above and is not removed on this path -- confirm */
+       }
+
+       ret = ocfs2_wait_for_mask(&mw);
+       if (ret)
+               mlog_errno(ret);        /* void function: a failed wait is only logged */
+}
+
  static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
                                         struct ocfs2_lock_res *lockres)
  {
  static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
                                         struct ocfs2_lock_res *lockres)
  {
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h

index 6dcbc944e8cefa3c38a6da4844513bbf68f683f4..5f17243ba5017dfe3af0f2384bee683a4d77f6d3 100644 (file)
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -66,6 +66,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
                                struct inode *inode);
  void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
                                 u64 parent, struct inode *inode);
                                struct inode *inode);
  void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
                                 u64 parent, struct inode *inode);
+struct ocfs2_file_private;
+void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
+                             struct ocfs2_file_private *fp);
  void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
  int ocfs2_create_new_inode_locks(struct inode *inode);
  int ocfs2_drop_inode_locks(struct inode *inode);
  void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
  int ocfs2_create_new_inode_locks(struct inode *inode);
  int ocfs2_drop_inode_locks(struct inode *inode);
@@ -98,6 +101,8 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
  void ocfs2_rename_unlock(struct ocfs2_super *osb);
  int ocfs2_dentry_lock(struct dentry *dentry, int ex);
  void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
  void ocfs2_rename_unlock(struct ocfs2_super *osb);
  int ocfs2_dentry_lock(struct dentry *dentry, int ex);
  void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
+int ocfs2_file_lock(struct file *file, int ex, int trylock);
+void ocfs2_file_unlock(struct file *file);
  
  void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
  void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
  
  void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
  void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h

index 066f14add3a8c2f06ad465b2d9b81011ffaf0e4b..048ddcaf5c80e9b2bb76b1e06d3f6ddb340856c8 100644 (file)
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -32,6 +32,12 @@ extern const struct inode_operations ocfs2_file_iops;
  extern const struct inode_operations ocfs2_special_file_iops;
  struct ocfs2_alloc_context;
  
  extern const struct inode_operations ocfs2_special_file_iops;
  struct ocfs2_alloc_context;
  
+struct ocfs2_file_private {    /* reached via file->private_data in ocfs2_file_lock()/ocfs2_file_unlock() */
+       struct file             *fp_file;       /* file used by dlmglue to reach the inode and superblock */
+       struct mutex            fp_mutex;       /* not used in this patch; presumably serializes flock calls -- confirm */
+       struct ocfs2_lock_res   fp_flock;       /* cluster lock resource backing flock() on this file */
+};
+
  enum ocfs2_alloc_restarted {
         RESTART_NONE = 0,
         RESTART_TRANS,
  enum ocfs2_alloc_restarted {
         RESTART_NONE = 0,
         RESTART_TRANS,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h

index d12bd7036da7ad52ada64b73d31727ae34ffa2ee..63c131e1cc77b4e3eb1218a8990a411cda07f8f5 100644 (file)
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -101,6 +101,7 @@ enum ocfs2_unlock_action {
                                                * about to be
                                                * dropped. */
  #define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
                                                * about to be
                                                * dropped. */
  #define OCFS2_LOCK_QUEUED        (0x00000100) /* queued for downconvert */
+#define OCFS2_LOCK_NOCACHE       (0x00000200) /* don't use a holder count */
  
  struct ocfs2_lock_res_ops;
  
  
  struct ocfs2_lock_res_ops;
  
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h

index 4ca02b1c38ac548ab3abf61424bcc72b9464c26b..86f3e3799c2b00d91f5c456792b3c73ed791be51 100644 (file)
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -45,6 +45,7 @@ enum ocfs2_lock_type {
         OCFS2_LOCK_TYPE_RW,
         OCFS2_LOCK_TYPE_DENTRY,
         OCFS2_LOCK_TYPE_OPEN,
         OCFS2_LOCK_TYPE_RW,
         OCFS2_LOCK_TYPE_DENTRY,
         OCFS2_LOCK_TYPE_OPEN,
+       OCFS2_LOCK_TYPE_FLOCK,
         OCFS2_NUM_LOCK_TYPES
  };
  
         OCFS2_NUM_LOCK_TYPES
  };
  
@@ -73,6 +74,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                 case OCFS2_LOCK_TYPE_OPEN:
                         c = 'O';
                         break;
                 case OCFS2_LOCK_TYPE_OPEN:
                         c = 'O';
                         break;
+               case OCFS2_LOCK_TYPE_FLOCK:
+                       c = 'F';
+                       break;
                 default:
                         c = '\0';
         }
                 default:
                         c = '\0';
         }
@@ -90,6 +94,7 @@ static char *ocfs2_lock_type_strings[] = {
         [OCFS2_LOCK_TYPE_RW] = "Write/Read",
         [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
         [OCFS2_LOCK_TYPE_OPEN] = "Open",
         [OCFS2_LOCK_TYPE_RW] = "Write/Read",
         [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
         [OCFS2_LOCK_TYPE_OPEN] = "Open",
+       [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
  };
  
  static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
  };
  
  static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
author	Mark Fasheh <mark.fasheh@oracle.com>
	Fri, 21 Dec 2007 00:43:10 +0000 (16:43 -0800)
committer	Mark Fasheh <mark.fasheh@oracle.com>
	Fri, 25 Jan 2008 23:05:43 +0000 (15:05 -0800)
fs/ocfs2/dlmglue.c		patch \| blob \| history
fs/ocfs2/dlmglue.h		patch \| blob \| history
fs/ocfs2/file.h		patch \| blob \| history
fs/ocfs2/ocfs2.h		patch \| blob \| history
fs/ocfs2/ocfs2_lockid.h		patch \| blob \| history