rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
        }
 
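+       /*
+        * Ordered pagebufs are sent down as barrier writes: the block
+        * layer keeps them ordered against other I/O, either with
+        * ordered tags or by draining the queue and flushing the drive
+        * write cache around the request.
+        */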
+       if (pb->pb_flags & PBF_ORDERED) {
+               ASSERT(!(pb->pb_flags & PBF_READ));
+               rw = WRITE_BARRIER;
+       }
+
        /* Special code path for reading a sub page size pagebuf in --
         * we populate up the whole page, and hence the other metadata
         * in the same page.  This optimization is only valid when the
 
        PBF_DELWRI = (1 << 6),  /* buffer has dirty pages                  */
        PBF_STALE = (1 << 7),   /* buffer has been staled, do not find it  */
        PBF_FS_MANAGED = (1 << 8),  /* filesystem controls freeing memory  */
-       PBF_FLUSH = (1 << 11),      /* flush disk write cache              */
+       PBF_ORDERED = (1 << 11),    /* use ordered writes                  */
        PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead             */
 
        /* flags used only as arguments to access routines */
 #define XFS_BUF_UNASYNC(x)      ((x)->pb_flags &= ~PBF_ASYNC)
 #define XFS_BUF_ISASYNC(x)      ((x)->pb_flags & PBF_ASYNC)
 
-#define XFS_BUF_FLUSH(x)        ((x)->pb_flags |= PBF_FLUSH)
-#define XFS_BUF_UNFLUSH(x)      ((x)->pb_flags &= ~PBF_FLUSH)
-#define XFS_BUF_ISFLUSH(x)      ((x)->pb_flags & PBF_FLUSH)
+#define XFS_BUF_ORDERED(x)      ((x)->pb_flags |= PBF_ORDERED)
+#define XFS_BUF_UNORDERED(x)    ((x)->pb_flags &= ~PBF_ORDERED)
+#define XFS_BUF_ISORDERED(x)    ((x)->pb_flags & PBF_ORDERED)
 
 #define XFS_BUF_SHUT(x)                 printk("XFS_BUF_SHUT not implemented yet\n")
 #define XFS_BUF_UNSHUT(x)       printk("XFS_BUF_UNSHUT not implemented yet\n")
 
                close_bdev_excl(bdev);
 }
 
+/*
+ * Try to write out the superblock as an ordered write to test whether
+ * the underlying device actually supports barriers.
+ */
+STATIC int
+xfs_barrier_test(
+       xfs_mount_t     *mp)
+{
+       xfs_buf_t       *sbp = xfs_getsb(mp, 0);
+       int             error;
+
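+       /*
+        * Clear any state left on the buffer by xfs_getsb() and set it
+        * up for a synchronous ordered write.
+        */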
+       XFS_BUF_UNDONE(sbp);
+       XFS_BUF_UNREAD(sbp);
+       XFS_BUF_UNDELAYWRITE(sbp);
+       XFS_BUF_WRITE(sbp);
+       XFS_BUF_UNASYNC(sbp);
+       XFS_BUF_ORDERED(sbp);
+
+       xfsbdstrat(mp, sbp);
+       error = xfs_iowait(sbp);
+
+       /*
+        * Clear all the flags we set and any error state in the
+        * buffer.  The write was done only to test whether barriers
+        * work and must not leave any trace in the superblock buffer.
+        */
+       XFS_BUF_DONE(sbp);
+       XFS_BUF_ERROR(sbp, 0);
+       XFS_BUF_UNORDERED(sbp);
+
+       xfs_buf_relse(sbp);
+       return error;
+}
+
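+/*
+ * Check whether barriers can actually be used on this filesystem and
+ * clear XFS_MOUNT_BARRIER again if not: an external log device, a
+ * device without ordered-write support, or a failing trial barrier
+ * write each disable them.
+ */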
+void
+xfs_mountfs_check_barriers(xfs_mount_t *mp)
+{
+       int error;
+
+       if (mp->m_logdev_targp != mp->m_ddev_targp) {
+               xfs_fs_cmn_err(CE_NOTE, mp,
+                 "Disabling barriers, not supported with external log device");
+               mp->m_flags &= ~XFS_MOUNT_BARRIER;
+               return;
+       }
+
+       if (mp->m_ddev_targp->pbr_bdev->bd_disk->queue->ordered ==
+                                       QUEUE_ORDERED_NONE) {
+               xfs_fs_cmn_err(CE_NOTE, mp,
+                 "Disabling barriers, not supported by the underlying device");
+               mp->m_flags &= ~XFS_MOUNT_BARRIER;
+               return;
+       }
+
+       error = xfs_barrier_test(mp);
+       if (error) {
+               xfs_fs_cmn_err(CE_NOTE, mp,
+                 "Disabling barriers, trial barrier write failed");
+               mp->m_flags &= ~XFS_MOUNT_BARRIER;
+       }
+}
+
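+/*
+ * Flush the write cache of the device backing a buftarg.
+ */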
+void
+xfs_blkdev_issue_flush(
+       xfs_buftarg_t           *buftarg)
+{
+       blkdev_issue_flush(buftarg->pbr_bdev, NULL);
+}
 
 STATIC struct inode *
 linvfs_alloc_inode(
 
 extern int  xfs_blkdev_get(struct xfs_mount *, const char *,
                                struct block_device **);
 extern void xfs_blkdev_put(struct block_device *);
+extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern struct export_operations linvfs_export_ops;
 
 
                                                 * enforcement */
 #define XFSMNT_NOUUID          0x01000000      /* Ignore fs uuid */
 #define XFSMNT_DMAPI           0x02000000      /* enable dmapi/xdsm */
-#define XFSMNT_NOLOGFLUSH      0x04000000      /* Don't flush for log blocks */
+#define XFSMNT_BARRIER         0x04000000      /* use write barriers */
 #define XFSMNT_IDELETE         0x08000000      /* inode cluster delete */
 #define XFSMNT_SWALLOC         0x10000000      /* turn on stripe width
                                                 * allocation */
 
 STATIC void xlog_state_switch_iclogs(xlog_t            *log,
                                     xlog_in_core_t *iclog,
                                     int                eventual_size);
-STATIC int  xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
-STATIC int  xlog_state_sync_all(xlog_t *log, uint flags);
+STATIC int  xlog_state_sync(xlog_t                     *log,
+                           xfs_lsn_t                   lsn,
+                           uint                        flags,
+                           int                         *log_flushed);
+STATIC int  xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
 STATIC void xlog_state_want_sync(xlog_t        *log, xlog_in_core_t *iclog);
 
 /* local functions to manipulate grant head */
  * semaphore.
  */
 int
-xfs_log_force(xfs_mount_t *mp,
-             xfs_lsn_t   lsn,
-             uint        flags)
+_xfs_log_force(
+       xfs_mount_t     *mp,
+       xfs_lsn_t       lsn,
+       uint            flags,
+       int             *log_flushed)
 {
-       int     rval;
-       xlog_t *log = mp->m_log;
+       xlog_t          *log = mp->m_log;
+       int             dummy;
+
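+       /* Callers that don't care whether the log was flushed pass NULL. */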
+       if (!log_flushed)
+               log_flushed = &dummy;
 
 #if defined(DEBUG) || defined(XLOG_NOLOG)
        if (!xlog_debug && xlog_target == log->l_targ)
 
        XFS_STATS_INC(xs_log_force);
 
-       if ((log->l_flags & XLOG_IO_ERROR) == 0) {
-               if (lsn == 0)
-                       rval = xlog_state_sync_all(log, flags);
-               else
-                       rval = xlog_state_sync(log, lsn, flags);
-       } else {
-               rval = XFS_ERROR(EIO);
-       }
-
-       return rval;
-
+       if (log->l_flags & XLOG_IO_ERROR)
+               return XFS_ERROR(EIO);
+       if (lsn == 0)
+               return xlog_state_sync_all(log, flags, log_flushed);
+       else
+               return xlog_state_sync(log, lsn, flags, log_flushed);
 }      /* xfs_log_force */
 
 /*
        XFS_BUF_BUSY(bp);
        XFS_BUF_ASYNC(bp);
        /*
-        * Do a disk write cache flush for the log block.
-        * This is a bit of a sledgehammer, it would be better
-        * to use a tag barrier here that just prevents reordering.
+        * Do an ordered write for the log block.
+        *
         * It may not be needed to flush the first split block in the log wrap
         * case, but do it anyways to be safe -AK
         */
-       if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
-               XFS_BUF_FLUSH(bp);
+       if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+               XFS_BUF_ORDERED(bp);
 
        ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
        ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
                XFS_BUF_SET_FSPRIVATE(bp, iclog);
                XFS_BUF_BUSY(bp);
                XFS_BUF_ASYNC(bp);
-               if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
-                       XFS_BUF_FLUSH(bp);
+               if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
+                       XFS_BUF_ORDERED(bp);
                dptr = XFS_BUF_PTR(bp);
                /*
                 * Bump the cycle numbers at the start of each block
  *             not in the active nor dirty state.
  */
 STATIC int
-xlog_state_sync_all(xlog_t *log, uint flags)
+xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 {
        xlog_in_core_t  *iclog;
        xfs_lsn_t       lsn;
 
                                if (xlog_state_release_iclog(log, iclog))
                                        return XFS_ERROR(EIO);
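+                               /* a log flush has been started */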
+                               *log_flushed = 1;
                                s = LOG_LOCK(log);
                                if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
                                    iclog->ic_state != XLOG_STATE_DIRTY)
                 */
                if (iclog->ic_state & XLOG_STATE_IOERROR)
                        return XFS_ERROR(EIO);
+               *log_flushed = 1;
 
        } else {
 
 int
 xlog_state_sync(xlog_t   *log,
                xfs_lsn_t lsn,
-               uint      flags)
+               uint      flags,
+               int       *log_flushed)
 {
     xlog_in_core_t     *iclog;
     int                        already_slept = 0;
                        XFS_STATS_INC(xs_log_force_sleep);
                        sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
                                &log->l_icloglock, s);
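+                       /* the previous iclog write completed while we slept */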
+                       *log_flushed = 1;
                        already_slept = 1;
                        goto try_again;
                } else {
                        LOG_UNLOCK(log, s);
                        if (xlog_state_release_iclog(log, iclog))
                                return XFS_ERROR(EIO);
+                       *log_flushed = 1;
                        s = LOG_LOCK(log);
                }
        }
                 */
                if (iclog->ic_state & XLOG_STATE_IOERROR)
                        return XFS_ERROR(EIO);
+               *log_flushed = 1;
        } else {                /* just return */
                LOG_UNLOCK(log, s);
        }
        xlog_ticket_t   *tic;
        xlog_t          *log;
        int             retval;
+       int             dummy;
        SPLDECL(s);
        SPLDECL(s2);
 
                 * Force the incore logs to disk before shutting the
                 * log down completely.
                 */
-               xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC);
+               xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy);
                s2 = LOG_LOCK(log);
                retval = xlog_state_ioerror(log);
                LOG_UNLOCK(log, s2);
 
                       xfs_log_ticket_t ticket,
                       void             **iclog,
                       uint             flags);
-int      xfs_log_force(struct xfs_mount *mp,
-                       xfs_lsn_t        lsn,
-                       uint             flags);
+int      _xfs_log_force(struct xfs_mount *mp,
+                        xfs_lsn_t      lsn,
+                        uint           flags,
+                        int            *log_flushed);
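+/* Wrapper for callers that don't need to know whether the log was flushed. */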
+#define xfs_log_force(mp, lsn, flags) \
+       _xfs_log_force(mp, lsn, flags, NULL)
 int      xfs_log_mount(struct xfs_mount        *mp,
                        struct xfs_buftarg      *log_target,
                        xfs_daddr_t             start_block,
 
                                                 * 32 bits in size */
 #define XFS_MOUNT_32BITINOOPT  0x00008000      /* saved mount option state */
 #define XFS_MOUNT_NOUUID       0x00010000      /* ignore uuid during mount */
-#define XFS_MOUNT_NOLOGFLUSH   0x00020000
+#define XFS_MOUNT_BARRIER      0x00020000      /* use write barriers */
 #define XFS_MOUNT_IDELETE      0x00040000      /* delete empty inode clusters*/
 #define XFS_MOUNT_SWALLOC      0x00080000      /* turn on stripe width
                                                 * allocation */
 extern void    xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern void    xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
 extern int     xfs_mountfs(struct vfs *, xfs_mount_t *mp, int);
+extern void    xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern int     xfs_unmountfs(xfs_mount_t *, struct cred *);
 extern void    xfs_unmountfs_close(xfs_mount_t *, struct cred *);
 
  */
  /*ARGSUSED*/
 int
-xfs_trans_commit(
+_xfs_trans_commit(
        xfs_trans_t     *tp,
        uint            flags,
-       xfs_lsn_t       *commit_lsn_p)
+       xfs_lsn_t       *commit_lsn_p,
+       int             *log_flushed)
 {
        xfs_log_iovec_t         *log_vector;
        int                     nvec;
         * log out now and wait for it.
         */
        if (sync) {
-               if (!error)
-                       error = xfs_log_force(mp, commit_lsn,
-                                     XFS_LOG_FORCE | XFS_LOG_SYNC);
+               if (!error) {
+                       error = _xfs_log_force(mp, commit_lsn,
+                                     XFS_LOG_FORCE | XFS_LOG_SYNC,
+                                     log_flushed);
+               }
                XFS_STATS_INC(xs_trans_sync);
        } else {
                XFS_STATS_INC(xs_trans_async);
 
                                         struct xfs_efd_log_item *,
                                         xfs_fsblock_t,
                                         xfs_extlen_t);
-int            xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *);
+int            _xfs_trans_commit(xfs_trans_t *,
+                                 uint flags,
+                                 xfs_lsn_t *,
+                                 int *);
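+/* Wrapper for callers that don't need the log-flushed state back. */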
+#define xfs_trans_commit(tp, flags, lsn) \
+       _xfs_trans_commit(tp, flags, lsn, NULL)
 void           xfs_trans_cancel(xfs_trans_t *, int);
 void           xfs_trans_ail_init(struct xfs_mount *);
 xfs_lsn_t      xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
 
 
        if (ap->flags & XFSMNT_NOUUID)
                mp->m_flags |= XFS_MOUNT_NOUUID;
-       if (ap->flags & XFSMNT_NOLOGFLUSH)
-               mp->m_flags |= XFS_MOUNT_NOLOGFLUSH;
+       if (ap->flags & XFSMNT_BARRIER)
+               mp->m_flags |= XFS_MOUNT_BARRIER;
 
        return 0;
 }
                goto error2;
 
        error = XFS_IOINIT(vfsp, args, flags);
-       if (!error)
-               return 0;
+       if (error)
+               goto error2;
+
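+       /*
+        * Barriers only matter on writable mounts; verify support now
+        * that the devices are set up.
+        */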
+       if ((args->flags & XFSMNT_BARRIER) &&
+           !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY))
+               xfs_mountfs_check_barriers(mp);
+       return 0;
+
 error2:
        if (mp->m_sb_bp)
                xfs_freesb(mp);
        else
                mp->m_flags &= ~XFS_MOUNT_NOATIME;
 
-       if (!(vfsp->vfs_flag & VFS_RDONLY)) {
-               VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+       if ((vfsp->vfs_flag & VFS_RDONLY) &&
+           !(*flags & MS_RDONLY)) {
+               vfsp->vfs_flag &= ~VFS_RDONLY;
+
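+               /* going from read-only to read-write: recheck barriers */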
+               if (args->flags & XFSMNT_BARRIER)
+                       xfs_mountfs_check_barriers(mp);
        }
 
-       if (*flags & MS_RDONLY) {
+       if (!(vfsp->vfs_flag & VFS_RDONLY) &&
+           (*flags & MS_RDONLY)) {
+               VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+
                xfs_quiesce_fs(mp);
 
                /* Ok now write out an unmount record */
                xfs_log_unmount_write(mp);
                xfs_unmountfs_writesb(mp);
                vfsp->vfs_flag |= VFS_RDONLY;
-       } else {
-               vfsp->vfs_flag &= ~VFS_RDONLY;
        }
 
        return 0;
 #define MNTOPT_ALLOCSIZE    "allocsize"    /* preferred allocation size */
 #define MNTOPT_IHASHSIZE    "ihashsize"    /* size of inode hash table */
 #define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
-#define MNTOPT_NOLOGFLUSH   "nologflush"   /* don't hard flush on log writes */
+#define MNTOPT_BARRIER      "barrier"      /* use write barriers for log writes
+                                            * and unwritten extent conversion */
 #define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
 #define MNTOPT_64BITINODE   "inode64"  /* inodes can be allocated anywhere */
 #define MNTOPT_IKEEP   "ikeep"         /* do not free empty inode clusters */
 #endif
                } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
                        args->flags |= XFSMNT_NOUUID;
-               } else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) {
-                       args->flags |= XFSMNT_NOLOGFLUSH;
+               } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
+                       args->flags |= XFSMNT_BARRIER;
                } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
                        args->flags &= ~XFSMNT_IDELETE;
                } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
                { XFS_MOUNT_NOUUID,             "," MNTOPT_NOUUID },
                { XFS_MOUNT_NORECOVERY,         "," MNTOPT_NORECOVERY },
                { XFS_MOUNT_OSYNCISOSYNC,       "," MNTOPT_OSYNCISOSYNC },
-               { XFS_MOUNT_NOLOGFLUSH,         "," MNTOPT_NOLOGFLUSH },
+               { XFS_MOUNT_BARRIER,            "," MNTOPT_BARRIER },
                { XFS_MOUNT_IDELETE,            "," MNTOPT_NOIKEEP },
                { 0, NULL }
        };
 
        xfs_inode_t     *ip;
        xfs_trans_t     *tp;
        int             error;
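+       /*
+        * changed tracks whether there is anything to flush at all;
+        * log_flushed is set if the log force pushed an iclog to disk,
+        * which with barriers enabled implies a device cache flush.
+        */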
+       int             log_flushed = 0, changed = 1;
 
        vn_trace_entry(BHV_TO_VNODE(bdp),
                        __FUNCTION__, (inst_t *)__return_address);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
                if (xfs_ipincount(ip)) {
-                       xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+                       _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
                                      XFS_LOG_FORCE |
                                      ((flag & FSYNC_WAIT)
-                                      ? XFS_LOG_SYNC : 0));
+                                      ? XFS_LOG_SYNC : 0),
+                                     &log_flushed);
+               } else {
+                       /*
+                        * If the inode is not pinned and nothing
+                        * has changed we don't need to flush the
+                        * cache.
+                        */
+                       changed = 0;
                }
                error = 0;
        } else  {
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                if (flag & FSYNC_WAIT)
                        xfs_trans_set_sync(tp);
-               error = xfs_trans_commit(tp, 0, NULL);
+               error = _xfs_trans_commit(tp, 0, NULL, &log_flushed);
 
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
+
+       if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
+               /*
+                * If the log write didn't issue an ordered tag we need
+                * to flush the disk cache for the data device now.
+                */
+               if (!log_flushed)
+                       xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
+
+               /*
+                * If this inode is on the RT dev we need to flush that
+                * cache as well.
+                */
+               if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
+                       xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
+       }
+
        return error;
 }