From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 3 Apr 2009 18:10:33 +0000 (-0700)
Subject: Merge branch 'ext3-latency-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git... 
X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=20bec8ab1458c24bed0d5492ee15d87807fc415a;hp=-c;p=linux-2.6-omap-h63xx.git

Merge branch 'ext3-latency-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'ext3-latency-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext3: Add replace-on-rename heuristics for data=writeback mode
  ext3: Add replace-on-truncate heuristics for data=writeback mode
  ext3: Use WRITE_SYNC for commits which are caused by fsync()
  block_write_full_page: Use synchronous writes for WBC_SYNC_ALL writebacks
---

20bec8ab1458c24bed0d5492ee15d87807fc415a
diff --combined fs/buffer.c
index c2fa1be4923,e7ebd95e0c6..5d55a896ff7
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@@ -165,6 -165,151 +165,6 @@@ void end_buffer_write_sync(struct buffe
  	put_bh(bh);
  }
  
 -/*
 - * Write out and wait upon all the dirty data associated with a block
 - * device via its mapping.  Does not take the superblock lock.
 - */
 -int sync_blockdev(struct block_device *bdev)
 -{
 -	int ret = 0;
 -
 -	if (bdev)
 -		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
 -	return ret;
 -}
 -EXPORT_SYMBOL(sync_blockdev);
 -
 -/*
 - * Write out and wait upon all dirty data associated with this
 - * device.   Filesystem data as well as the underlying block
 - * device.  Takes the superblock lock.
 - */
 -int fsync_bdev(struct block_device *bdev)
 -{
 -	struct super_block *sb = get_super(bdev);
 -	if (sb) {
 -		int res = fsync_super(sb);
 -		drop_super(sb);
 -		return res;
 -	}
 -	return sync_blockdev(bdev);
 -}
 -
 -/**
 - * freeze_bdev  --  lock a filesystem and force it into a consistent state
 - * @bdev:	blockdevice to lock
 - *
 - * This takes the block device bd_mount_sem to make sure no new mounts
 - * happen on bdev until thaw_bdev() is called.
 - * If a superblock is found on this device, we take the s_umount semaphore
 - * on it to make sure nobody unmounts until the snapshot creation is done.
 - * The reference counter (bd_fsfreeze_count) guarantees that only the last
 - * unfreeze process can unfreeze the frozen filesystem actually when multiple
 - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
 - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
 - * actually.
 - */
 -struct super_block *freeze_bdev(struct block_device *bdev)
 -{
 -	struct super_block *sb;
 -	int error = 0;
 -
 -	mutex_lock(&bdev->bd_fsfreeze_mutex);
 -	if (bdev->bd_fsfreeze_count > 0) {
 -		bdev->bd_fsfreeze_count++;
 -		sb = get_super(bdev);
 -		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 -		return sb;
 -	}
 -	bdev->bd_fsfreeze_count++;
 -
 -	down(&bdev->bd_mount_sem);
 -	sb = get_super(bdev);
 -	if (sb && !(sb->s_flags & MS_RDONLY)) {
 -		sb->s_frozen = SB_FREEZE_WRITE;
 -		smp_wmb();
 -
 -		__fsync_super(sb);
 -
 -		sb->s_frozen = SB_FREEZE_TRANS;
 -		smp_wmb();
 -
 -		sync_blockdev(sb->s_bdev);
 -
 -		if (sb->s_op->freeze_fs) {
 -			error = sb->s_op->freeze_fs(sb);
 -			if (error) {
 -				printk(KERN_ERR
 -					"VFS:Filesystem freeze failed\n");
 -				sb->s_frozen = SB_UNFROZEN;
 -				drop_super(sb);
 -				up(&bdev->bd_mount_sem);
 -				bdev->bd_fsfreeze_count--;
 -				mutex_unlock(&bdev->bd_fsfreeze_mutex);
 -				return ERR_PTR(error);
 -			}
 -		}
 -	}
 -
 -	sync_blockdev(bdev);
 -	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 -
 -	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
 -}
 -EXPORT_SYMBOL(freeze_bdev);
 -
 -/**
 - * thaw_bdev  -- unlock filesystem
 - * @bdev:	blockdevice to unlock
 - * @sb:		associated superblock
 - *
 - * Unlocks the filesystem and marks it writeable again after freeze_bdev().
 - */
 -int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 -{
 -	int error = 0;
 -
 -	mutex_lock(&bdev->bd_fsfreeze_mutex);
 -	if (!bdev->bd_fsfreeze_count) {
 -		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 -		return -EINVAL;
 -	}
 -
 -	bdev->bd_fsfreeze_count--;
 -	if (bdev->bd_fsfreeze_count > 0) {
 -		if (sb)
 -			drop_super(sb);
 -		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 -		return 0;
 -	}
 -
 -	if (sb) {
 -		BUG_ON(sb->s_bdev != bdev);
 -		if (!(sb->s_flags & MS_RDONLY)) {
 -			if (sb->s_op->unfreeze_fs) {
 -				error = sb->s_op->unfreeze_fs(sb);
 -				if (error) {
 -					printk(KERN_ERR
 -						"VFS:Filesystem thaw failed\n");
 -					sb->s_frozen = SB_FREEZE_TRANS;
 -					bdev->bd_fsfreeze_count++;
 -					mutex_unlock(&bdev->bd_fsfreeze_mutex);
 -					return error;
 -				}
 -			}
 -			sb->s_frozen = SB_UNFROZEN;
 -			smp_wmb();
 -			wake_up(&sb->s_wait_unfrozen);
 -		}
 -		drop_super(sb);
 -	}
 -
 -	up(&bdev->bd_mount_sem);
 -	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 -	return 0;
 -}
 -EXPORT_SYMBOL(thaw_bdev);
 -
  /*
   * Various filesystems appear to want __find_get_block to be non-blocking.
   * But it's the page lock which protects the buffers.  To get around this,
@@@ -199,13 -344,13 +199,13 @@@ __find_get_block_slow(struct block_devi
  	head = page_buffers(page);
  	bh = head;
  	do {
 -		if (bh->b_blocknr == block) {
 +		if (!buffer_mapped(bh))
 +			all_mapped = 0;
 +		else if (bh->b_blocknr == block) {
  			ret = bh;
  			get_bh(bh);
  			goto out_unlock;
  		}
 -		if (!buffer_mapped(bh))
 -			all_mapped = 0;
  		bh = bh->b_this_page;
  	} while (bh != head);
  
@@@ -290,7 -435,7 +290,7 @@@ static void free_more_memory(void
  						&zone);
  		if (zone)
  			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 -						GFP_NOFS);
 +						GFP_NOFS, NULL);
  	}
  }
  
@@@ -547,39 -692,6 +547,39 @@@ repeat
  	return err;
  }
  
 +void do_thaw_all(unsigned long unused)
 +{
 +	struct super_block *sb;
 +	char b[BDEVNAME_SIZE];
 +
 +	spin_lock(&sb_lock);
 +restart:
 +	list_for_each_entry(sb, &super_blocks, s_list) {
 +		sb->s_count++;
 +		spin_unlock(&sb_lock);
 +		down_read(&sb->s_umount);
 +		while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
 +			printk(KERN_WARNING "Emergency Thaw on %s\n",
 +			       bdevname(sb->s_bdev, b));
 +		up_read(&sb->s_umount);
 +		spin_lock(&sb_lock);
 +		if (__put_super_and_need_restart(sb))
 +			goto restart;
 +	}
 +	spin_unlock(&sb_lock);
 +	printk(KERN_WARNING "Emergency Thaw complete\n");
 +}
 +
 +/**
 + * emergency_thaw_all -- forcibly thaw every frozen filesystem
 + *
 + * Used for emergency unfreeze of all filesystems via SysRq
 + */
 +void emergency_thaw_all(void)
 +{
 +	pdflush_operation(do_thaw_all, 0);
 +}
 +
  /**
   * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
   * @mapping: the mapping which wants those buffers written
@@@ -654,7 -766,14 +654,7 @@@ static void __set_page_dirty(struct pag
  	spin_lock_irq(&mapping->tree_lock);
  	if (page->mapping) {	/* Race with truncate? */
  		WARN_ON_ONCE(warn && !PageUptodate(page));
 -
 -		if (mapping_cap_account_dirty(mapping)) {
 -			__inc_zone_page_state(page, NR_FILE_DIRTY);
 -			__inc_bdi_stat(mapping->backing_dev_info,
 -					BDI_RECLAIMABLE);
 -			task_dirty_inc(current);
 -			task_io_account_write(PAGE_CACHE_SIZE);
 -		}
 +		account_page_dirtied(page, mapping);
  		radix_tree_tag_set(&mapping->page_tree,
  				page_index(page), PAGECACHE_TAG_DIRTY);
  	}
@@@ -1595,6 -1714,7 +1595,7 @@@ static int __block_write_full_page(stru
  	struct buffer_head *bh, *head;
  	const unsigned blocksize = 1 << inode->i_blkbits;
  	int nr_underway = 0;
+ 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
  
  	BUG_ON(!PageLocked(page));
  
@@@ -1686,7 -1806,7 +1687,7 @@@
  	do {
  		struct buffer_head *next = bh->b_this_page;
  		if (buffer_async_write(bh)) {
- 			submit_bh(WRITE, bh);
+ 			submit_bh(write_op, bh);
  			nr_underway++;
  		}
  		bh = next;
@@@ -1740,7 -1860,7 +1741,7 @@@ recover
  		struct buffer_head *next = bh->b_this_page;
  		if (buffer_async_write(bh)) {
  			clear_buffer_dirty(bh);
- 			submit_bh(WRITE, bh);
+ 			submit_bh(write_op, bh);
  			nr_underway++;
  		}
  		bh = next;
@@@ -2346,14 -2466,13 +2347,14 @@@ int block_commit_write(struct page *pag
   * unlock the page.
   */
  int
 -block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 +block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
  		   get_block_t get_block)
  {
 +	struct page *page = vmf->page;
  	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
  	unsigned long end;
  	loff_t size;
 -	int ret = -EINVAL;
 +	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
  
  	lock_page(page);
  	size = i_size_read(inode);
@@@ -2373,13 -2492,6 +2374,13 @@@
  	if (!ret)
  		ret = block_commit_write(page, 0, end);
  
 +	if (unlikely(ret)) {
 +		if (ret == -ENOMEM)
 +			ret = VM_FAULT_OOM;
 +		else /* -ENOSPC, -EIO, etc */
 +			ret = VM_FAULT_SIGBUS;
 +	}
 +
  out_unlock:
  	unlock_page(page);
  	return ret;
@@@ -3315,6 -3427,7 +3316,6 @@@ EXPORT_SYMBOL(cont_write_begin)
  EXPORT_SYMBOL(end_buffer_read_sync);
  EXPORT_SYMBOL(end_buffer_write_sync);
  EXPORT_SYMBOL(file_fsync);
 -EXPORT_SYMBOL(fsync_bdev);
  EXPORT_SYMBOL(generic_block_bmap);
  EXPORT_SYMBOL(generic_cont_expand_simple);
  EXPORT_SYMBOL(init_buffer);
diff --combined fs/ext3/file.c
index 521f8238b2f,4a04cbb1c23..5b49704b231
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@@ -33,6 -33,10 +33,10 @@@
   */
  static int ext3_release_file (struct inode * inode, struct file * filp)
  {
+ 	if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+ 		filemap_flush(inode->i_mapping);
+ 		EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+ 	}
  	/* if we are the last writer on the inode, drop the block reservation */
  	if ((filp->f_mode & FMODE_WRITE) &&
  			(atomic_read(&inode->i_writecount) == 1))
@@@ -112,7 -116,7 +116,7 @@@ const struct file_operations ext3_file_
  	.write		= do_sync_write,
  	.aio_read	= generic_file_aio_read,
  	.aio_write	= ext3_file_write,
 -	.ioctl		= ext3_ioctl,
 +	.unlocked_ioctl	= ext3_ioctl,
  #ifdef CONFIG_COMPAT
  	.compat_ioctl	= ext3_compat_ioctl,
  #endif
diff --combined fs/ext3/inode.c
index d3ef6566b01,0f5bca0d82f..466a332e0bd
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@@ -1149,15 -1149,12 +1149,15 @@@ static int ext3_write_begin(struct fil
  				struct page **pagep, void **fsdata)
  {
  	struct inode *inode = mapping->host;
 -	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
 +	int ret;
  	handle_t *handle;
  	int retries = 0;
  	struct page *page;
  	pgoff_t index;
  	unsigned from, to;
 +	/* Reserve one block more for addition to orphan list in case
 +	 * we allocate blocks but write fails for some reason */
 +	int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
  
  	index = pos >> PAGE_CACHE_SHIFT;
  	from = pos & (PAGE_CACHE_SIZE - 1);
@@@ -1187,19 -1184,14 +1187,19 @@@ retry
  	}
  write_begin_failed:
  	if (ret) {
 -		ext3_journal_stop(handle);
 -		unlock_page(page);
 -		page_cache_release(page);
  		/*
  		 * block_write_begin may have instantiated a few blocks
  		 * outside i_size.  Trim these off again. Don't need
  		 * i_size_read because we hold i_mutex.
 +		 *
 +		 * Add inode to orphan list in case we crash before truncate
 +		 * finishes.
  		 */
 +		if (pos + len > inode->i_size)
 +			ext3_orphan_add(handle, inode);
 +		ext3_journal_stop(handle);
 +		unlock_page(page);
 +		page_cache_release(page);
  		if (pos + len > inode->i_size)
  			vmtruncate(inode, inode->i_size);
  	}
@@@ -1219,18 -1211,6 +1219,18 @@@ int ext3_journal_dirty_data(handle_t *h
  	return err;
  }
  
 +/* For ordered writepage and write_end functions */
 +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 +{
 +	/*
 +	 * Write could have mapped the buffer but it didn't copy the data in
 +	 * yet. So avoid filing such buffer into a transaction.
 +	 */
 +	if (buffer_mapped(bh) && buffer_uptodate(bh))
 +		return ext3_journal_dirty_data(handle, bh);
 +	return 0;
 +}
 +
  /* For write_end() in data=journal mode */
  static int write_end_fn(handle_t *handle, struct buffer_head *bh)
  {
@@@ -1241,20 -1221,26 +1241,20 @@@
  }
  
  /*
 - * Generic write_end handler for ordered and writeback ext3 journal modes.
 - * We can't use generic_write_end, because that unlocks the page and we need to
 - * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
 - * after block_write_end.
 + * This is nasty and subtle: ext3_write_begin() could have allocated blocks
 + * for the whole page but later we failed to copy the data in. Update inode
 + * size according to what we managed to copy. The rest is going to be
 + * truncated in write_end function.
   */
 -static int ext3_generic_write_end(struct file *file,
 -				struct address_space *mapping,
 -				loff_t pos, unsigned len, unsigned copied,
 -				struct page *page, void *fsdata)
 +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
  {
 -	struct inode *inode = file->f_mapping->host;
 -
 -	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 -
 -	if (pos+copied > inode->i_size) {
 -		i_size_write(inode, pos+copied);
 +	/* What matters to us is i_disksize. We don't write i_size anywhere */
 +	if (pos + copied > inode->i_size)
 +		i_size_write(inode, pos + copied);
 +	if (pos + copied > EXT3_I(inode)->i_disksize) {
 +		EXT3_I(inode)->i_disksize = pos + copied;
  		mark_inode_dirty(inode);
  	}
 -
 -	return copied;
  }
  
  /*
@@@ -1274,29 -1260,35 +1274,29 @@@ static int ext3_ordered_write_end(struc
  	unsigned from, to;
  	int ret = 0, ret2;
  
 -	from = pos & (PAGE_CACHE_SIZE - 1);
 -	to = from + len;
 +	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
  
 +	from = pos & (PAGE_CACHE_SIZE - 1);
 +	to = from + copied;
  	ret = walk_page_buffers(handle, page_buffers(page),
 -		from, to, NULL, ext3_journal_dirty_data);
 +		from, to, NULL, journal_dirty_data_fn);
  
 -	if (ret == 0) {
 -		/*
 -		 * generic_write_end() will run mark_inode_dirty() if i_size
 -		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
 -		 * into that.
 -		 */
 -		loff_t new_i_size;
 -
 -		new_i_size = pos + copied;
 -		if (new_i_size > EXT3_I(inode)->i_disksize)
 -			EXT3_I(inode)->i_disksize = new_i_size;
 -		ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
 -							page, fsdata);
 -		copied = ret2;
 -		if (ret2 < 0)
 -			ret = ret2;
 -	}
 +	if (ret == 0)
 +		update_file_sizes(inode, pos, copied);
 +	/*
 +	 * There may be allocated blocks outside of i_size because
 +	 * we failed to copy some data. Prepare for truncate.
 +	 */
 +	if (pos + len > inode->i_size)
 +		ext3_orphan_add(handle, inode);
  	ret2 = ext3_journal_stop(handle);
  	if (!ret)
  		ret = ret2;
  	unlock_page(page);
  	page_cache_release(page);
  
 +	if (pos + len > inode->i_size)
 +		vmtruncate(inode, inode->i_size);
  	return ret ? ret : copied;
  }
  
@@@ -1307,22 -1299,25 +1307,22 @@@ static int ext3_writeback_write_end(str
  {
  	handle_t *handle = ext3_journal_current_handle();
  	struct inode *inode = file->f_mapping->host;
 -	int ret = 0, ret2;
 -	loff_t new_i_size;
 -
 -	new_i_size = pos + copied;
 -	if (new_i_size > EXT3_I(inode)->i_disksize)
 -		EXT3_I(inode)->i_disksize = new_i_size;
 -
 -	ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
 -							page, fsdata);
 -	copied = ret2;
 -	if (ret2 < 0)
 -		ret = ret2;
 +	int ret;
  
 -	ret2 = ext3_journal_stop(handle);
 -	if (!ret)
 -		ret = ret2;
 +	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 +	update_file_sizes(inode, pos, copied);
 +	/*
 +	 * There may be allocated blocks outside of i_size because
 +	 * we failed to copy some data. Prepare for truncate.
 +	 */
 +	if (pos + len > inode->i_size)
 +		ext3_orphan_add(handle, inode);
 +	ret = ext3_journal_stop(handle);
  	unlock_page(page);
  	page_cache_release(page);
  
 +	if (pos + len > inode->i_size)
 +		vmtruncate(inode, inode->i_size);
  	return ret ? ret : copied;
  }
  
@@@ -1343,23 -1338,15 +1343,23 @@@ static int ext3_journalled_write_end(st
  	if (copied < len) {
  		if (!PageUptodate(page))
  			copied = 0;
 -		page_zero_new_buffers(page, from+copied, to);
 +		page_zero_new_buffers(page, from + copied, to);
 +		to = from + copied;
  	}
  
  	ret = walk_page_buffers(handle, page_buffers(page), from,
  				to, &partial, write_end_fn);
  	if (!partial)
  		SetPageUptodate(page);
 -	if (pos+copied > inode->i_size)
 -		i_size_write(inode, pos+copied);
 +
 +	if (pos + copied > inode->i_size)
 +		i_size_write(inode, pos + copied);
 +	/*
 +	 * There may be allocated blocks outside of i_size because
 +	 * we failed to copy some data. Prepare for truncate.
 +	 */
 +	if (pos + len > inode->i_size)
 +		ext3_orphan_add(handle, inode);
  	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
  	if (inode->i_size > EXT3_I(inode)->i_disksize) {
  		EXT3_I(inode)->i_disksize = inode->i_size;
@@@ -1374,8 -1361,6 +1374,8 @@@
  	unlock_page(page);
  	page_cache_release(page);
  
 +	if (pos + len > inode->i_size)
 +		vmtruncate(inode, inode->i_size);
  	return ret ? ret : copied;
  }
  
@@@ -1443,9 -1428,11 +1443,9 @@@ static int bput_one(handle_t *handle, s
  	return 0;
  }
  
 -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 +static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
  {
 -	if (buffer_mapped(bh))
 -		return ext3_journal_dirty_data(handle, bh);
 -	return 0;
 +	return !buffer_mapped(bh);
  }
  
  /*
@@@ -1518,15 -1505,6 +1518,15 @@@ static int ext3_ordered_writepage(struc
  	if (ext3_journal_current_handle())
  		goto out_fail;
  
 +	if (!page_has_buffers(page)) {
 +		create_empty_buffers(page, inode->i_sb->s_blocksize,
 +				(1 << BH_Dirty)|(1 << BH_Uptodate));
 +	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
 +		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
 +		return block_write_full_page(page, NULL, wbc);
 +	}
 +	page_bufs = page_buffers(page);
 +
  	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
  
  	if (IS_ERR(handle)) {
@@@ -1534,6 -1512,11 +1534,6 @@@
  		goto out_fail;
  	}
  
 -	if (!page_has_buffers(page)) {
 -		create_empty_buffers(page, inode->i_sb->s_blocksize,
 -				(1 << BH_Dirty)|(1 << BH_Uptodate));
 -	}
 -	page_bufs = page_buffers(page);
  	walk_page_buffers(handle, page_bufs, 0,
  			PAGE_CACHE_SIZE, NULL, bget_one);
  
@@@ -2363,6 -2346,9 +2363,9 @@@ void ext3_truncate(struct inode *inode
  	if (!ext3_can_truncate(inode))
  		return;
  
+ 	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+ 		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+ 
  	/*
  	 * We have to lock the EOF page here, because lock_page() nests
  	 * outside journal_start().
@@@ -3072,7 -3058,7 +3075,7 @@@ int ext3_setattr(struct dentry *dentry
  			error = PTR_ERR(handle);
  			goto err_out;
  		}
 -		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
 +		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
  		if (error) {
  			ext3_journal_stop(handle);
  			return error;
@@@ -3163,7 -3149,7 +3166,7 @@@ static int ext3_writepage_trans_blocks(
  		ret = 2 * (bpp + indirects) + 2;
  
  #ifdef CONFIG_QUOTA
 -	/* We know that structure was already allocated during DQUOT_INIT so
 +	/* We know that structure was already allocated during vfs_dq_init so
  	 * we will be updating only the data blocks + inodes */
  	ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
  #endif
@@@ -3254,7 -3240,7 +3257,7 @@@ int ext3_mark_inode_dirty(handle_t *han
   * i_size has been changed by generic_commit_write() and we thus need
   * to include the updated inode in the current transaction.
   *
 - * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
 + * Also, vfs_dq_alloc_space() will always dirty the inode when blocks
   * are allocated to the file.
   *
   * If the inode is marked synchronous, we don't honour that here - doing
diff --combined fs/ext3/namei.c
index 6ddaa0a42b2,ab98a66ab8c..6ff7b973023
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@@ -161,12 -161,12 +161,12 @@@ static struct dx_frame *dx_probe(struc
  				 struct dx_frame *frame,
  				 int *err);
  static void dx_release (struct dx_frame *frames);
 -static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
 +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
  			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
  static void dx_sort_map(struct dx_map_entry *map, unsigned count);
  static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
  		struct dx_map_entry *offsets, int count);
 -static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
 +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
  static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
  static int ext3_htree_next_block(struct inode *dir, __u32 hash,
  				 struct dx_frame *frame,
@@@ -708,14 -708,14 +708,14 @@@ errout
   * Create map of hash values, offsets, and sizes, stored at end of block.
   * Returns number of entries mapped.
   */
 -static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
 -			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
 +static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
 +		struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
  {
  	int count = 0;
  	char *base = (char *) de;
  	struct dx_hash_info h = *hinfo;
  
 -	while ((char *) de < base + size)
 +	while ((char *) de < base + blocksize)
  	{
  		if (de->name_len && de->inode) {
  			ext3fs_dirhash(de->name, de->name_len, &h);
@@@ -1047,16 -1047,8 +1047,16 @@@ static struct dentry *ext3_lookup(struc
  			return ERR_PTR(-EIO);
  		}
  		inode = ext3_iget(dir->i_sb, ino);
 -		if (IS_ERR(inode))
 -			return ERR_CAST(inode);
 +		if (unlikely(IS_ERR(inode))) {
 +			if (PTR_ERR(inode) == -ESTALE) {
 +				ext3_error(dir->i_sb, __func__,
 +						"deleted inode referenced: %lu",
 +						ino);
 +				return ERR_PTR(-EIO);
 +			} else {
 +				return ERR_CAST(inode);
 +			}
 +		}
  	}
  	return d_splice_alias(inode, dentry);
  }
@@@ -1128,14 -1120,13 +1128,14 @@@ dx_move_dirents(char *from, char *to, s
   * Compact each dir entry in the range to the minimal rec_len.
   * Returns pointer to last entry in range.
   */
 -static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
 +static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
  {
 -	struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
 +	struct ext3_dir_entry_2 *next, *to, *prev;
 +	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
  	unsigned rec_len = 0;
  
  	prev = to = de;
 -	while ((char*)de < base + size) {
 +	while ((char *)de < base + blocksize) {
  		next = ext3_next_entry(de);
  		if (de->inode && de->name_len) {
  			rec_len = EXT3_DIR_REC_LEN(de->name_len);
@@@ -2058,7 -2049,7 +2058,7 @@@ static int ext3_rmdir (struct inode * d
  
  	/* Initialize quotas before so that eventual writes go in
  	 * separate transaction */
 -	DQUOT_INIT(dentry->d_inode);
 +	vfs_dq_init(dentry->d_inode);
  	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
  	if (IS_ERR(handle))
  		return PTR_ERR(handle);
@@@ -2117,7 -2108,7 +2117,7 @@@ static int ext3_unlink(struct inode * d
  
  	/* Initialize quotas before so that eventual writes go
  	 * in separate transaction */
 -	DQUOT_INIT(dentry->d_inode);
 +	vfs_dq_init(dentry->d_inode);
  	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
  	if (IS_ERR(handle))
  		return PTR_ERR(handle);
@@@ -2274,14 -2265,14 +2274,14 @@@ static int ext3_rename (struct inode * 
  	struct inode * old_inode, * new_inode;
  	struct buffer_head * old_bh, * new_bh, * dir_bh;
  	struct ext3_dir_entry_2 * old_de, * new_de;
- 	int retval;
+ 	int retval, flush_file = 0;
  
  	old_bh = new_bh = dir_bh = NULL;
  
  	/* Initialize quotas before so that eventual writes go
  	 * in separate transaction */
  	if (new_dentry->d_inode)
 -		DQUOT_INIT(new_dentry->d_inode);
 +		vfs_dq_init(new_dentry->d_inode);
  	handle = ext3_journal_start(old_dir, 2 *
  					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
  					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@@ -2410,6 -2401,8 +2410,8 @@@
  		ext3_mark_inode_dirty(handle, new_inode);
  		if (!new_inode->i_nlink)
  			ext3_orphan_add(handle, new_inode);
+ 		if (ext3_should_writeback_data(new_inode))
+ 			flush_file = 1;
  	}
  	retval = 0;
  
@@@ -2418,6 -2411,8 +2420,8 @@@ end_rename
  	brelse (old_bh);
  	brelse (new_bh);
  	ext3_journal_stop(handle);
+ 	if (retval == 0 && flush_file)
+ 		filemap_flush(old_inode->i_mapping);
  	return retval;
  }
  
diff --combined include/linux/ext3_fs.h
index e263acaa405,d2630c56cb3..634a5e5aba3
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@@ -208,6 -208,7 +208,7 @@@ static inline __u32 ext3_mask_flags(umo
  #define EXT3_STATE_JDATA		0x00000001 /* journaled data exists */
  #define EXT3_STATE_NEW			0x00000002 /* inode is newly created */
  #define EXT3_STATE_XATTR		0x00000004 /* has in-inode xattrs */
+ #define EXT3_STATE_FLUSH_ON_CLOSE	0x00000008
  
  /* Used to pass group descriptor data when online resize is done */
  struct ext3_new_group_input {
@@@ -893,8 -894,9 +894,8 @@@ extern int ext3_fiemap(struct inode *in
  		       u64 start, u64 len);
  
  /* ioctl.c */
 -extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
 -		       unsigned long);
 -extern long ext3_compat_ioctl (struct file *, unsigned int, unsigned long);
 +extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
 +extern long ext3_compat_ioctl(struct file *, unsigned int, unsigned long);
  
  /* namei.c */
  extern int ext3_orphan_add(handle_t *, struct inode *);