www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
Merge git://git.kernel.org/pub/scm/linux/kernel/git/czankel/xtensa-2.6
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 3 Apr 2009 22:15:25 +0000 (15:15 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 3 Apr 2009 22:15:25 +0000 (15:15 -0700)
* git://git.kernel.org/pub/scm/linux/kernel/git/czankel/xtensa-2.6: (21 commits)
  xtensa: we don't need to include asm/io.h
  xtensa: only build platform or variant if they contain a Makefile
  xtensa: make startup code discardable
  xtensa: ccount clocksource
  xtensa: remove platform rtc hooks
  xtensa: use generic sched_clock()
  xtensa: platform: s6105
  xtensa: let platform override KERNELOFFSET
  xtensa: s6000 variant
  xtensa: s6000 variant core definitions
  xtensa: variant irq set callbacks
  xtensa: variant-specific code
  xtensa: nommu support
  xtensa: add flat support
  xtensa: enforce slab alignment to maximum register width
  xtensa: cope with ram beginning at higher addresses
  xtensa: don't make bootmem bitmap larger than required
  xtensa: fix init_bootmem_node() argument order
  xtensa: use correct stack pointer for stack traces
  xtensa: beat Kconfig into shape
  ...

514 files changed:
Documentation/ABI/testing/sysfs-class-regulator
Documentation/RCU/listRCU.txt
Documentation/RCU/rcu.txt
Documentation/RCU/rculist_nulls.txt
Documentation/filesystems/caching/backend-api.txt [new file with mode: 0644]
Documentation/filesystems/caching/cachefiles.txt [new file with mode: 0644]
Documentation/filesystems/caching/fscache.txt [new file with mode: 0644]
Documentation/filesystems/caching/netfs-api.txt [new file with mode: 0644]
Documentation/filesystems/caching/object.txt [new file with mode: 0644]
Documentation/filesystems/caching/operations.txt [new file with mode: 0644]
Documentation/filesystems/exofs.txt [new file with mode: 0644]
Documentation/filesystems/udf.txt
Documentation/kernel-parameters.txt
Documentation/md.txt
Documentation/slow-work.txt [new file with mode: 0644]
MAINTAINERS
arch/arm/configs/omap_ldp_defconfig
arch/arm/configs/pcm037_defconfig
arch/arm/configs/realview-smp_defconfig
arch/arm/configs/realview_defconfig
arch/arm/mach-at91/pm.c
arch/arm/mach-gemini/include/mach/system.h
arch/arm/mach-mmp/include/mach/system.h
arch/arm/mach-mx3/pcm037.c
arch/arm/mach-omap2/Makefile
arch/arm/mach-omap2/board-ldp.c
arch/arm/mach-omap2/board-overo.c
arch/arm/mach-realview/core.c
arch/arm/mach-realview/localtimer.c
arch/arm/mm/abort-ev6.S
arch/arm/mm/cache-feroceon-l2.c
arch/arm/vfp/entry.S
arch/arm/vfp/vfphw.S
arch/arm/vfp/vfpmodule.c
arch/m68k/include/asm/bootinfo.h
arch/m68k/include/asm/bootinfo_mm.h [deleted file]
arch/m68k/include/asm/bootinfo_no.h [deleted file]
arch/m68k/include/asm/bug.h
arch/m68k/include/asm/bug_mm.h [deleted file]
arch/m68k/include/asm/bug_no.h [deleted file]
arch/m68k/include/asm/bugs.h
arch/m68k/include/asm/bugs_mm.h [deleted file]
arch/m68k/include/asm/bugs_no.h [deleted file]
arch/m68k/include/asm/cache.h
arch/m68k/include/asm/cache_mm.h [deleted file]
arch/m68k/include/asm/cache_no.h [deleted file]
arch/m68k/include/asm/current.h
arch/m68k/include/asm/current_mm.h [deleted file]
arch/m68k/include/asm/current_no.h [deleted file]
arch/m68k/include/asm/div64.h
arch/m68k/include/asm/div64_mm.h [deleted file]
arch/m68k/include/asm/div64_no.h [deleted file]
arch/m68k/include/asm/dma-mapping.h
arch/m68k/include/asm/dma-mapping_mm.h [deleted file]
arch/m68k/include/asm/dma-mapping_no.h [deleted file]
arch/m68k/include/asm/elf.h
arch/m68k/include/asm/elf_mm.h [deleted file]
arch/m68k/include/asm/elf_no.h [deleted file]
arch/m68k/include/asm/fb.h
arch/m68k/include/asm/fb_mm.h [deleted file]
arch/m68k/include/asm/fb_no.h [deleted file]
arch/m68k/include/asm/fpu.h
arch/m68k/include/asm/fpu_mm.h [deleted file]
arch/m68k/include/asm/fpu_no.h [deleted file]
arch/m68k/include/asm/hw_irq.h
arch/m68k/include/asm/hw_irq_mm.h [deleted file]
arch/m68k/include/asm/hw_irq_no.h [deleted file]
arch/m68k/include/asm/kmap_types.h
arch/m68k/include/asm/kmap_types_mm.h [deleted file]
arch/m68k/include/asm/kmap_types_no.h [deleted file]
arch/m68k/include/asm/m532xsim.h
arch/m68k/include/asm/mc146818rtc.h
arch/m68k/include/asm/mc146818rtc_mm.h [deleted file]
arch/m68k/include/asm/mc146818rtc_no.h [deleted file]
arch/m68k/include/asm/mcfpci.h [deleted file]
arch/m68k/include/asm/mmu.h
arch/m68k/include/asm/mmu_context.h
arch/m68k/include/asm/mmu_context_mm.h [deleted file]
arch/m68k/include/asm/mmu_context_no.h [deleted file]
arch/m68k/include/asm/mmu_mm.h [deleted file]
arch/m68k/include/asm/mmu_no.h [deleted file]
arch/m68k/include/asm/module.h
arch/m68k/include/asm/module_mm.h [deleted file]
arch/m68k/include/asm/module_no.h [deleted file]
arch/m68k/include/asm/page_offset.h
arch/m68k/include/asm/page_offset_mm.h [deleted file]
arch/m68k/include/asm/page_offset_no.h [deleted file]
arch/m68k/include/asm/pci.h
arch/m68k/include/asm/pci_mm.h [deleted file]
arch/m68k/include/asm/pci_no.h [deleted file]
arch/m68k/include/asm/pgalloc.h
arch/m68k/include/asm/pgalloc_mm.h [deleted file]
arch/m68k/include/asm/pgalloc_no.h [deleted file]
arch/m68k/include/asm/pgtable_no.h
arch/m68k/include/asm/rtc.h
arch/m68k/include/asm/scatterlist.h
arch/m68k/include/asm/scatterlist_mm.h [deleted file]
arch/m68k/include/asm/scatterlist_no.h [deleted file]
arch/m68k/include/asm/segment.h
arch/m68k/include/asm/segment_mm.h [deleted file]
arch/m68k/include/asm/segment_no.h [deleted file]
arch/m68k/include/asm/timex.h
arch/m68k/include/asm/timex_mm.h [deleted file]
arch/m68k/include/asm/timex_no.h [deleted file]
arch/m68k/include/asm/tlbflush.h
arch/m68k/include/asm/tlbflush_mm.h [deleted file]
arch/m68k/include/asm/tlbflush_no.h [deleted file]
arch/m68k/include/asm/ucontext.h
arch/m68k/include/asm/ucontext_mm.h [deleted file]
arch/m68k/include/asm/ucontext_no.h [deleted file]
arch/m68k/include/asm/unaligned.h
arch/m68k/include/asm/unaligned_mm.h [deleted file]
arch/m68k/include/asm/unaligned_no.h [deleted file]
arch/m68k/kernel/time.c
arch/m68knommu/Makefile
arch/m68knommu/kernel/dma.c
arch/m68knommu/kernel/irq.c
arch/m68knommu/mm/init.c
arch/m68knommu/platform/5249/config.c
arch/m68knommu/platform/5307/config.c
arch/m68knommu/platform/5407/config.c
arch/m68knommu/platform/coldfire/Makefile
arch/m68knommu/platform/coldfire/clk.c [new file with mode: 0644]
arch/parisc/Kconfig
arch/parisc/Makefile
arch/parisc/include/asm/atomic.h
arch/parisc/include/asm/cacheflush.h
arch/parisc/include/asm/elf.h
arch/parisc/include/asm/ftrace.h [new file with mode: 0644]
arch/parisc/include/asm/page.h
arch/parisc/include/asm/pdc.h
arch/parisc/include/asm/pgtable.h
arch/parisc/include/asm/smp.h
arch/parisc/kernel/Makefile
arch/parisc/kernel/entry.S
arch/parisc/kernel/firmware.c
arch/parisc/kernel/ftrace.c [new file with mode: 0644]
arch/parisc/kernel/irq.c
arch/parisc/kernel/module.c
arch/parisc/kernel/parisc_ksyms.c
arch/parisc/kernel/process.c
arch/parisc/kernel/processor.c
arch/parisc/kernel/smp.c
arch/parisc/kernel/stacktrace.c [new file with mode: 0644]
arch/parisc/kernel/syscall.S
arch/parisc/kernel/time.c
arch/parisc/kernel/traps.c
arch/parisc/kernel/vmlinux.lds.S
arch/parisc/mm/init.c
arch/powerpc/include/asm/ps3.h
arch/powerpc/kernel/time.c
arch/powerpc/platforms/ps3/os-area.c
arch/powerpc/platforms/ps3/platform.h
arch/powerpc/platforms/ps3/setup.c
arch/powerpc/platforms/ps3/time.c
arch/um/drivers/ubd_kern.c
arch/x86/Kconfig
arch/x86/kernel/amd_iommu.c
arch/x86/mm/highmem_32.c
arch/x86/mm/iomap_32.c
arch/x86/pci/i386.c
crypto/async_tx/async_tx.c
crypto/async_tx/async_xor.c
crypto/shash.c
crypto/xor.c
drivers/base/iommu.c
drivers/block/aoe/aoecmd.c
drivers/block/hd.c
drivers/block/xsysace.c
drivers/char/hw_random/timeriomem-rng.c
drivers/crypto/ixp4xx_crypto.c
drivers/dma/Kconfig
drivers/dma/dmaengine.c
drivers/dma/dmatest.c
drivers/dma/dw_dmac.c
drivers/dma/dw_dmac_regs.h
drivers/dma/fsldma.c
drivers/dma/ioat_dma.c
drivers/dma/iop-adma.c
drivers/dma/ipu/ipu_idmac.c
drivers/dma/ipu/ipu_irq.c
drivers/dma/mv_xor.c
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/bitmap.c
drivers/md/bitmap.h [moved from include/linux/raid/bitmap.h with 100% similarity]
drivers/md/dm-bio-list.h
drivers/md/dm-bio-record.h
drivers/md/dm-crypt.c
drivers/md/dm-exception-store.c
drivers/md/dm-exception-store.h
drivers/md/dm-io.c
drivers/md/dm-log.c
drivers/md/dm-path-selector.c
drivers/md/dm-raid1.c
drivers/md/dm-snap-persistent.c
drivers/md/dm-snap-transient.c
drivers/md/dm-snap.c
drivers/md/dm-snap.h [deleted file]
drivers/md/dm-table.c
drivers/md/dm-target.c
drivers/md/dm.c
drivers/md/dm.h
drivers/md/faulty.c
drivers/md/linear.c
drivers/md/linear.h [moved from include/linux/raid/linear.h with 95% similarity]
drivers/md/md.c
drivers/md/md.h [moved from include/linux/raid/md_k.h with 83% similarity]
drivers/md/mktables.c
drivers/md/multipath.c
drivers/md/multipath.h [moved from include/linux/raid/multipath.h with 96% similarity]
drivers/md/raid0.c
drivers/md/raid0.h [moved from include/linux/raid/raid0.h with 96% similarity]
drivers/md/raid1.c
drivers/md/raid1.h [moved from include/linux/raid/raid1.h with 99% similarity]
drivers/md/raid10.c
drivers/md/raid10.h [moved from include/linux/raid/raid10.h with 99% similarity]
drivers/md/raid5.c
drivers/md/raid5.h [moved from include/linux/raid/raid5.h with 81% similarity]
drivers/md/raid6algos.c
drivers/md/raid6altivec.uc
drivers/md/raid6int.uc
drivers/md/raid6mmx.c
drivers/md/raid6recov.c
drivers/md/raid6sse1.c
drivers/md/raid6sse2.c
drivers/md/raid6test/Makefile
drivers/md/raid6test/test.c
drivers/md/raid6x86.h
drivers/mfd/twl4030-core.c
drivers/mmc/core/core.c
drivers/mtd/maps/pxa2xx-flash.c
drivers/parisc/asp.c
drivers/parisc/ccio-dma.c
drivers/parisc/dino.c
drivers/parisc/eisa.c
drivers/parisc/eisa_enumerator.c
drivers/parisc/iosapic.c
drivers/parisc/led.c
drivers/pci/intel-iommu.c
drivers/pcmcia/pxa2xx_cm_x255.c
drivers/regulator/Kconfig
drivers/regulator/Makefile
drivers/regulator/bq24022.c
drivers/regulator/core.c
drivers/regulator/da903x.c
drivers/regulator/fixed.c
drivers/regulator/pcf50633-regulator.c
drivers/regulator/twl4030-regulator.c [new file with mode: 0644]
drivers/regulator/virtual.c
drivers/regulator/wm8350-regulator.c
drivers/regulator/wm8400-regulator.c
drivers/rtc/Kconfig
drivers/rtc/Makefile
drivers/rtc/rtc-generic.c [new file with mode: 0644]
drivers/rtc/rtc-parisc.c [deleted file]
drivers/rtc/rtc-ppc.c [deleted file]
drivers/rtc/rtc-ps3.c [new file with mode: 0644]
drivers/serial/mcf.c
drivers/usb/storage/isd200.c
fs/Kconfig
fs/Makefile
fs/afs/Kconfig
fs/afs/Makefile
fs/afs/cache.c
fs/afs/cache.h
fs/afs/cell.c
fs/afs/file.c
fs/afs/inode.c
fs/afs/internal.h
fs/afs/main.c
fs/afs/mntpt.c
fs/afs/vlocation.c
fs/afs/volume.c
fs/afs/write.c
fs/btrfs/async-thread.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-ref.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_map.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-cache.h [new file with mode: 0644]
fs/btrfs/inode.c
fs/btrfs/locking.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/buffer.c
fs/cachefiles/Kconfig [new file with mode: 0644]
fs/cachefiles/Makefile [new file with mode: 0644]
fs/cachefiles/bind.c [new file with mode: 0644]
fs/cachefiles/daemon.c [new file with mode: 0644]
fs/cachefiles/interface.c [new file with mode: 0644]
fs/cachefiles/internal.h [new file with mode: 0644]
fs/cachefiles/key.c [new file with mode: 0644]
fs/cachefiles/main.c [new file with mode: 0644]
fs/cachefiles/namei.c [new file with mode: 0644]
fs/cachefiles/proc.c [new file with mode: 0644]
fs/cachefiles/rdwr.c [new file with mode: 0644]
fs/cachefiles/security.c [new file with mode: 0644]
fs/cachefiles/xattr.c [new file with mode: 0644]
fs/compat_ioctl.c
fs/exofs/BUGS [new file with mode: 0644]
fs/exofs/Kbuild [new file with mode: 0644]
fs/exofs/Kconfig [new file with mode: 0644]
fs/exofs/common.h [new file with mode: 0644]
fs/exofs/dir.c [new file with mode: 0644]
fs/exofs/exofs.h [new file with mode: 0644]
fs/exofs/file.c [new file with mode: 0644]
fs/exofs/inode.c [new file with mode: 0644]
fs/exofs/namei.c [new file with mode: 0644]
fs/exofs/osd.c [new file with mode: 0644]
fs/exofs/super.c [new file with mode: 0644]
fs/exofs/symlink.c [new file with mode: 0644]
fs/ext3/file.c
fs/ext3/inode.c
fs/ext3/namei.c
fs/fscache/Kconfig [new file with mode: 0644]
fs/fscache/Makefile [new file with mode: 0644]
fs/fscache/cache.c [new file with mode: 0644]
fs/fscache/cookie.c [new file with mode: 0644]
fs/fscache/fsdef.c [new file with mode: 0644]
fs/fscache/histogram.c [new file with mode: 0644]
fs/fscache/internal.h [new file with mode: 0644]
fs/fscache/main.c [new file with mode: 0644]
fs/fscache/netfs.c [new file with mode: 0644]
fs/fscache/object.c [new file with mode: 0644]
fs/fscache/operation.c [new file with mode: 0644]
fs/fscache/page.c [new file with mode: 0644]
fs/fscache/proc.c [new file with mode: 0644]
fs/fscache/stats.c [new file with mode: 0644]
fs/jbd/commit.c
fs/jbd/transaction.c
fs/nfs/Kconfig
fs/nfs/Makefile
fs/nfs/client.c
fs/nfs/file.c
fs/nfs/fscache-index.c [new file with mode: 0644]
fs/nfs/fscache.c [new file with mode: 0644]
fs/nfs/fscache.h [new file with mode: 0644]
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/iostat.h
fs/nfs/read.c
fs/nfs/super.c
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/aops.c
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/cluster/heartbeat.h
fs/ocfs2/cluster/nodemanager.c
fs/ocfs2/dir.c
fs/ocfs2/dir.h
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmdebug.c
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlm/dlmthread.c
fs/ocfs2/dlmglue.c
fs/ocfs2/dlmglue.h
fs/ocfs2/export.c
fs/ocfs2/inode.c
fs/ocfs2/inode.h
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/localalloc.c
fs/ocfs2/namei.c
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_fs.h
fs/ocfs2/ocfs2_lockid.h
fs/ocfs2/suballoc.c
fs/ocfs2/suballoc.h
fs/ocfs2/super.c
fs/ocfs2/xattr.c
fs/ocfs2/xattr.h
fs/splice.c
fs/super.c
fs/udf/balloc.c
fs/udf/dir.c
fs/udf/directory.c
fs/udf/ecma_167.h
fs/udf/ialloc.c
fs/udf/inode.c
fs/udf/misc.c
fs/udf/namei.c
fs/udf/osta_udf.h
fs/udf/partition.c
fs/udf/super.c
fs/udf/truncate.c
fs/udf/udf_i.h
fs/udf/udf_sb.h
fs/udf/udfdecl.h
fs/udf/udfend.h
fs/udf/udftime.c
fs/udf/unicode.c
fs/xfs/Makefile
fs/xfs/linux-2.6/mutex.h [deleted file]
fs/xfs/linux-2.6/xfs_aops.c
fs/xfs/linux-2.6/xfs_ioctl.c
fs/xfs/linux-2.6/xfs_iops.c
fs/xfs/linux-2.6/xfs_linux.h
fs/xfs/linux-2.6/xfs_quotaops.c [new file with mode: 0644]
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/linux-2.6/xfs_super.h
fs/xfs/linux-2.6/xfs_sync.h
fs/xfs/linux-2.6/xfs_vnode.h
fs/xfs/quota/xfs_dquot.c
fs/xfs/quota/xfs_dquot.h
fs/xfs/quota/xfs_qm.c
fs/xfs/quota/xfs_qm.h
fs/xfs/quota/xfs_qm_bhv.c
fs/xfs/quota/xfs_qm_syscalls.c
fs/xfs/quota/xfs_quota_priv.h
fs/xfs/quota/xfs_trans_dquot.c
fs/xfs/support/debug.c
fs/xfs/support/uuid.c
fs/xfs/support/uuid.h
fs/xfs/xfs_ag.h
fs/xfs/xfs_alloc.c
fs/xfs/xfs_alloc.h
fs/xfs/xfs_attr_leaf.c
fs/xfs/xfs_bmap.c
fs/xfs/xfs_bmap.h
fs/xfs/xfs_btree.c
fs/xfs/xfs_btree.h
fs/xfs/xfs_da_btree.c
fs/xfs/xfs_da_btree.h
fs/xfs/xfs_dfrag.c
fs/xfs/xfs_dinode.h
fs/xfs/xfs_dir2.c
fs/xfs/xfs_dir2_block.c
fs/xfs/xfs_dir2_data.h
fs/xfs/xfs_dir2_leaf.c
fs/xfs/xfs_dir2_node.c
fs/xfs/xfs_dir2_sf.c
fs/xfs/xfs_extfree_item.h
fs/xfs/xfs_filestream.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_ialloc.c
fs/xfs/xfs_ialloc_btree.c
fs/xfs/xfs_ialloc_btree.h
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.h
fs/xfs/xfs_iomap.h
fs/xfs/xfs_itable.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_qmops.c
fs/xfs/xfs_quota.h
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_rtalloc.h
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_item.c
fs/xfs/xfs_trans_space.h
fs/xfs/xfs_types.h
fs/xfs/xfs_utils.c
fs/xfs/xfs_vnodeops.c
fs/xfs/xfs_vnodeops.h
include/linux/async_tx.h
include/linux/device-mapper.h
include/linux/dm-dirty-log.h
include/linux/dma_remapping.h
include/linux/dmaengine.h
include/linux/dw_dmac.h
include/linux/ext3_fs.h
include/linux/fscache-cache.h [new file with mode: 0644]
include/linux/fscache.h [new file with mode: 0644]
include/linux/hdreg.h
include/linux/highmem.h
include/linux/i2c/twl4030.h
include/linux/intel-iommu.h
include/linux/iommu.h
include/linux/jbd.h
include/linux/mmc/host.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_iostat.h
include/linux/page-flags.h
include/linux/pagemap.h
include/linux/raid/md.h [deleted file]
include/linux/raid/md_u.h
include/linux/raid/pq.h [moved from drivers/md/raid6.h with 86% similarity]
include/linux/raid/xor.h
include/linux/regulator/bq24022.h
include/linux/regulator/consumer.h
include/linux/regulator/driver.h
include/linux/regulator/fixed.h
include/linux/regulator/machine.h
include/linux/slow-work.h [new file with mode: 0644]
include/linux/timeriomem-rng.h
init/Kconfig
init/do_mounts.h
init/do_mounts_md.c
kernel/Makefile
kernel/slow-work.c [new file with mode: 0644]
kernel/sysctl.c
mm/filemap.c
mm/migrate.c
mm/readahead.c
mm/swap.c
mm/truncate.c
mm/vmscan.c
scripts/package/buildtar
security/security.c

index 873ef1fc1569ae1fe0733c4691fa58947ae8c434..e091fa8737929966b0939e1a96e4e9fe05d89c39 100644 (file)
@@ -4,8 +4,8 @@ KernelVersion:  2.6.26
 Contact:       Liam Girdwood <lrg@slimlogic.co.uk>
 Description:
                Some regulator directories will contain a field called
-               state. This reports the regulator enable status, for
-               regulators which can report that value.
+               state. This reports the regulator enable control, for
+               regulators which can report that input value.
 
                This will be one of the following strings:
 
@@ -14,16 +14,54 @@ Description:
                'unknown'
 
                'enabled' means the regulator output is ON and is supplying
-               power to the system.
+               power to the system (assuming no error prevents it).
 
                'disabled' means the regulator output is OFF and is not
-               supplying power to the system..
+               supplying power to the system (unless some non-Linux
+               control has enabled it).
 
                'unknown' means software cannot determine the state, or
                the reported state is invalid.
 
                NOTE: this field can be used in conjunction with microvolts
-               and microamps to determine regulator output levels.
+               or microamps to determine configured regulator output levels.
+
+
+What:          /sys/class/regulator/.../status
+Description:
+               Some regulator directories will contain a field called
+               "status". This reports the current regulator status, for
+               regulators which can report that output value.
+
+               This will be one of the following strings:
+
+                       off
+                       on
+                       error
+                       fast
+                       normal
+                       idle
+                       standby
+
+               "off" means the regulator is not supplying power to the
+               system.
+
+               "on" means the regulator is supplying power to the system,
+               and the regulator can't report a detailed operation mode.
+
+               "error" indicates an out-of-regulation status such as being
+               disabled due to thermal shutdown, or voltage being unstable
+               because of problems with the input power supply.
+
+               "fast", "normal", "idle", and "standby" are all detailed
+               regulator operation modes (described elsewhere).  They
+               imply "on", but provide more detail.
+
+               Note that regulator status is a function of many inputs,
+               not limited to control inputs from Linux.  For example,
+               the actual load presented may trigger "error" status; or
+               a regulator may be enabled by another user, even though
+               Linux did not enable it.
 
 
 What:          /sys/class/regulator/.../type
@@ -58,7 +96,7 @@ Description:
                Some regulator directories will contain a field called
                microvolts. This holds the regulator output voltage setting
                measured in microvolts (i.e. E-6 Volts), for regulators
-               which can report that voltage.
+               which can report the control input for voltage.
 
                NOTE: This value should not be used to determine the regulator
                output voltage level as this value is the same regardless of
@@ -73,7 +111,7 @@ Description:
                Some regulator directories will contain a field called
                microamps. This holds the regulator output current limit
                setting measured in microamps (i.e. E-6 Amps), for regulators
-               which can report that current.
+               which can report the control input for a current limit.
 
                NOTE: This value should not be used to determine the regulator
                output current level as this value is the same regardless of
@@ -87,7 +125,7 @@ Contact:     Liam Girdwood <lrg@slimlogic.co.uk>
 Description:
                Some regulator directories will contain a field called
                opmode. This holds the current regulator operating mode,
-               for regulators which can report it.
+               for regulators which can report that control input value.
 
                The opmode value can be one of the following strings:
 
@@ -101,7 +139,8 @@ Description:
 
                NOTE: This value should not be used to determine the regulator
                output operating mode as this value is the same regardless of
-               whether the regulator is enabled or disabled.
+               whether the regulator is enabled or disabled.  A "status"
+               attribute may be available to determine the actual mode.
 
 
 What:          /sys/class/regulator/.../min_microvolts
index 1fd175368a875107892f72f412a980e9a93f7d77..4349c1487e919ce0e49702313229027a18bc7245 100644 (file)
@@ -118,7 +118,7 @@ Following are the RCU equivalents for these two functions:
                list_for_each_entry(e, list, list) {
                        if (!audit_compare_rule(rule, &e->rule)) {
                                list_del_rcu(&e->list);
-                               call_rcu(&e->rcu, audit_free_rule, e);
+                               call_rcu(&e->rcu, audit_free_rule);
                                return 0;
                        }
                }
@@ -206,7 +206,7 @@ RCU ("read-copy update") its name.  The RCU code is as follows:
                                ne->rule.action = newaction;
                                ne->rule.file_count = newfield_count;
                                list_replace_rcu(e, ne);
-                               call_rcu(&e->rcu, audit_free_rule, e);
+                               call_rcu(&e->rcu, audit_free_rule);
                                return 0;
                        }
                }
@@ -283,7 +283,7 @@ flag under the spinlock as follows:
                                list_del_rcu(&e->list);
                                e->deleted = 1;
                                spin_unlock(&e->lock);
-                               call_rcu(&e->rcu, audit_free_rule, e);
+                               call_rcu(&e->rcu, audit_free_rule);
                                return 0;
                        }
                }
index 95821a29ae418b6b275b01869fd40e29900baadf..7aa2002ade7780515de469435ab5bfc65d2f3176 100644 (file)
@@ -81,7 +81,7 @@ o     I hear that RCU needs work in order to support realtime kernels?
        This work is largely completed.  Realtime-friendly RCU can be
        enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
        However, work is in progress for enabling priority boosting of
-       preempted RCU read-side critical sections.This is needed if you
+       preempted RCU read-side critical sections.  This is needed if you
        have CPU-bound realtime threads.
 
 o      Where can I find more information on RCU?
index 239f542d48baa9425f29a54b6b43f00468d986c2..6389dec33459e84228ef43a00f9d80be158bab5d 100644 (file)
@@ -21,7 +21,7 @@ if (obj) {
   /*
    * Because a writer could delete object, and a writer could
    * reuse these object before the RCU grace period, we
-   * must check key after geting the reference on object
+   * must check key after getting the reference on object
    */
   if (obj->key != key) { // not the object we expected
      put_ref(obj);
@@ -117,7 +117,7 @@ a race (some writer did a delete and/or a move of an object
 to another chain) checking the final 'nulls' value if
 the lookup met the end of chain. If final 'nulls' value
 is not the slot number, then we must restart the lookup at
-the begining. If the object was moved to same chain,
+the beginning. If the object was moved to the same chain,
 then the reader doesnt care : It might eventually
 scan the list again without harm.
 
diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt
new file mode 100644 (file)
index 0000000..382d52c
--- /dev/null
@@ -0,0 +1,658 @@
+                         ==========================
+                         FS-CACHE CACHE BACKEND API
+                         ==========================
+
+The FS-Cache system provides an API by which actual caches can be supplied to
+FS-Cache for it to then serve out to network filesystems and other interested
+parties.
+
+This API is declared in <linux/fscache-cache.h>.
+
+
+====================================
+INITIALISING AND REGISTERING A CACHE
+====================================
+
+To start off, a cache definition must be initialised and registered for each
+cache the backend wants to make available.  For instance, CacheFS does this in
+the fill_super() operation on mounting.
+
+The cache definition (struct fscache_cache) should be initialised by calling:
+
+       void fscache_init_cache(struct fscache_cache *cache,
+                               struct fscache_cache_ops *ops,
+                               const char *idfmt,
+                               ...);
+
+Where:
+
+ (*) "cache" is a pointer to the cache definition;
+
+ (*) "ops" is a pointer to the table of operations that the backend supports on
+     this cache; and
+
+ (*) "idfmt" is a format and printf-style arguments for constructing a label
+     for the cache.
+
+
+The cache should then be registered with FS-Cache by passing a pointer to the
+previously initialised cache definition to:
+
+       int fscache_add_cache(struct fscache_cache *cache,
+                             struct fscache_object *fsdef,
+                             const char *tagname);
+
+Two extra arguments should also be supplied:
+
+ (*) "fsdef" which should point to the object representation for the FS-Cache
+     master index in this cache.  Netfs primary index entries will be created
+     here.  FS-Cache keeps the caller's reference to the index object if
+     successful and will release it upon withdrawal of the cache.
+
+ (*) "tagname" which, if given, should be a text string naming this cache.  If
+     this is NULL, the identifier will be used instead.  For CacheFS, the
+     identifier is set to name the underlying block device and the tag can be
+     supplied by mount.
+
+This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag
+is already in use.  0 will be returned on success.
+
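An illustrative sketch of that registration sequence (not taken from this patch; my_cache_ops and my_fsdef are hypothetical backend-private definitions):

	static struct fscache_cache my_cache;

	static int my_backend_bind_cache(void)
	{
		/* build the cache's identifier from the printf-style format */
		fscache_init_cache(&my_cache, &my_cache_ops,
				   "mybackend(%s)", "cache0");

		/* hand over the master-index object and an optional tag */
		return fscache_add_cache(&my_cache, &my_fsdef, "mycache");
	}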
+
+=====================
+UNREGISTERING A CACHE
+=====================
+
+A cache can be withdrawn from the system by calling this function with a
+pointer to the cache definition:
+
+       void fscache_withdraw_cache(struct fscache_cache *cache);
+
+In CacheFS's case, this is called by put_super().
+
+
+========
+SECURITY
+========
+
+The cache methods are executed in one of two contexts:
+
+ (1) that of the userspace process that issued the netfs operation that caused
+     the cache method to be invoked, or
+
+ (2) that of one of the processes in the FS-Cache thread pool.
+
+In either case, this may not be an appropriate context in which to access the
+cache.
+
+The calling process's fsuid, fsgid and SELinux security identities may need to
+be masqueraded for the duration of the cache driver's access to the cache.
+This is left to the cache to handle; FS-Cache makes no effort in this regard.
+
+
+===================================
+CONTROL AND STATISTICS PRESENTATION
+===================================
+
+The cache may present data to the outside world through FS-Cache's interfaces
+in sysfs and procfs - the former for control and the latter for statistics.
+
+A sysfs directory called /sys/fs/fscache/<cachetag>/ is created if CONFIG_SYSFS
+is enabled.  This is accessible through the kobject struct fscache_cache::kobj
+and is for use by the cache as it sees fit.
+
+
+========================
+RELEVANT DATA STRUCTURES
+========================
+
+ (*) Index/Data file FS-Cache representation cookie:
+
+       struct fscache_cookie {
+               struct fscache_object_def       *def;
+               struct fscache_netfs            *netfs;
+               void                            *netfs_data;
+               ...
+       };
+
+     The fields that might be of use to the backend describe the object
+     definition, the netfs definition and the netfs's data for this cookie.
+     The object definition contains functions supplied by the netfs for loading
+     and matching index entries; these are required to provide some of the
+     cache operations.
+
+
+ (*) In-cache object representation:
+
+       struct fscache_object {
+               int                             debug_id;
+               enum {
+                       FSCACHE_OBJECT_RECYCLING,
+                       ...
+               }                               state;
+               spinlock_t                      lock;
+               struct fscache_cache            *cache;
+               struct fscache_cookie           *cookie;
+               ...
+       };
+
+     Structures of this type should be allocated by the cache backend and
+     passed to FS-Cache when requested by the appropriate cache operation.  In
+     the case of CacheFS, they're embedded in CacheFS's internal object
+     structures.
+
+     The debug_id is a simple integer that can be used in debugging messages
+     that refer to a particular object.  In such a case it should be printed
+     using "OBJ%x" to be consistent with FS-Cache.
+
+     Each object contains a pointer to the cookie that represents the object it
+     is backing.  An object should be retired when put_object() is called if it is
+     in state FSCACHE_OBJECT_RECYCLING.  The fscache_object struct should be
+     initialised by calling fscache_object_init(object).
+
+
+ (*) FS-Cache operation record:
+
+       struct fscache_operation {
+               atomic_t                usage;
+               struct fscache_object   *object;
+               unsigned long           flags;
+       #define FSCACHE_OP_EXCLUSIVE
+               void (*processor)(struct fscache_operation *op);
+               void (*release)(struct fscache_operation *op);
+               ...
+       };
+
+     FS-Cache has a pool of threads that it uses to give CPU time to the
+     various asynchronous operations that need to be done as part of driving
+     the cache.  These are represented by the above structure.  The processor
+     method is called to give the op CPU time, and the release method to get
+     rid of it when its usage count reaches 0.
+
+     An operation can be made exclusive upon an object by setting the
+     appropriate flag before enqueuing it with fscache_enqueue_operation().  If
+     an operation needs more processing time, it should be enqueued again.
+
+
+ (*) FS-Cache retrieval operation record:
+
+       struct fscache_retrieval {
+               struct fscache_operation op;
+               struct address_space    *mapping;
+               struct list_head        *to_do;
+               ...
+       };
+
+     A structure of this type is allocated by FS-Cache to record retrieval and
+     allocation requests made by the netfs.  This struct is then passed to the
+     backend to do the operation.  The backend may get extra refs to it by
+     calling fscache_get_retrieval() and refs may be discarded by calling
+     fscache_put_retrieval().
+
+     A retrieval operation can be used by the backend to do retrieval work.  To
+     do this, the retrieval->op.processor method pointer should be set
+     appropriately by the backend and fscache_enqueue_retrieval() called to
+     submit it to the thread pool.  CacheFiles, for example, uses this to queue
+     page examination when it detects PG_lock being cleared.
+
+     The to_do field is an empty list available for the cache backend to use as
+     it sees fit.
+
+
+ (*) FS-Cache storage operation record:
+
+       struct fscache_storage {
+               struct fscache_operation op;
+               pgoff_t                 store_limit;
+               ...
+       };
+
+     A structure of this type is allocated by FS-Cache to record outstanding
+     writes to be made.  FS-Cache itself enqueues this operation and invokes
+     the write_page() method on the object at appropriate times to effect
+     storage.
+
+
+================
+CACHE OPERATIONS
+================
+
+The cache backend provides FS-Cache with a table of operations that can be
+performed on the denizens of the cache.  These are held in a structure of type:
+
+       struct fscache_cache_ops
+
+ (*) Name of cache provider [mandatory]:
+
+       const char *name
+
+     This isn't strictly an operation, but should be pointed at a string naming
+     the backend.
+
+
+ (*) Allocate a new object [mandatory]:
+
+       struct fscache_object *(*alloc_object)(struct fscache_cache *cache,
+                                              struct fscache_cookie *cookie)
+
+     This method is used to allocate a cache object representation to back a
+     cookie in a particular cache.  fscache_object_init() should be called on
+     the object to initialise it prior to returning.
+
+     This function may also be used to parse the index key to be used for
+     multiple lookup calls to turn it into a more convenient form.  FS-Cache
+     will call the lookup_complete() method to allow the cache to release the
+     form once lookup is complete or aborted.
+
+
+ (*) Look up and create object [mandatory]:
+
+       void (*lookup_object)(struct fscache_object *object)
+
+     This method is used to look up an object, given that the object is already
+     allocated and attached to the cookie.  This should instantiate that object
+     in the cache if it can.
+
+     The method should call fscache_object_lookup_negative() as soon as
+     possible if it determines the object doesn't exist in the cache.  If the
+     object is found to exist and the netfs indicates that it is valid then
+     fscache_obtained_object() should be called once the object is in a
+     position to have data stored in it.  Similarly, fscache_obtained_object()
+     should also be called once a non-present object has been created.
+
+     If a lookup error occurs, fscache_object_lookup_error() should be called
+     to abort the lookup of that object.
+
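A skeleton of that lookup flow (illustrative only; struct my_object, my_lookup_on_disk() and my_create_on_disk() are hypothetical backend helpers, and the object is assumed to be embedded as the "fscache" member):

	static void my_lookup_object(struct fscache_object *object)
	{
		struct my_object *obj =
			container_of(object, struct my_object, fscache);

		if (!my_lookup_on_disk(obj)) {
			/* nothing in the cache: let pending reads hit the server */
			fscache_object_lookup_negative(object);

			if (my_create_on_disk(obj) < 0) {
				fscache_object_lookup_error(object);
				return;
			}
		}

		/* the object is now in a position to have data stored in it */
		fscache_obtained_object(object);
	}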
+
+ (*) Release lookup data [mandatory]:
+
+       void (*lookup_complete)(struct fscache_object *object)
+
+     This method is called to ask the cache to release any resources it was
+     using to perform a lookup.
+
+
+ (*) Increment object refcount [mandatory]:
+
+       struct fscache_object *(*grab_object)(struct fscache_object *object)
+
+     This method is called to increment the reference count on an object.  It
+     may fail (for instance if the cache is being withdrawn) by returning NULL.
+     It should return the object pointer if successful.
+
+
+ (*) Lock/Unlock object [mandatory]:
+
+       void (*lock_object)(struct fscache_object *object)
+       void (*unlock_object)(struct fscache_object *object)
+
+     These methods are used to exclusively lock an object.  It must be possible
+     to schedule with the lock held, so a spinlock isn't sufficient.
+
+
+ (*) Pin/Unpin object [optional]:
+
+       int (*pin_object)(struct fscache_object *object)
+       void (*unpin_object)(struct fscache_object *object)
+
+     These methods are used to pin an object into the cache.  Once pinned an
+     object cannot be reclaimed to make space.  Return -ENOSPC if there's not
+     enough space in the cache to permit this.
+
+
+ (*) Update object [mandatory]:
+
+       int (*update_object)(struct fscache_object *object)
+
+     This is called to update the index entry for the specified object.  The
+     new information should be in object->cookie->netfs_data.  This can be
+     obtained by calling object->cookie->def->get_aux()/get_attr().
+
+
+ (*) Discard object [mandatory]:
+
+       void (*drop_object)(struct fscache_object *object)
+
+     This method is called to indicate that an object has been unbound from its
+     cookie, and that the cache should release the object's resources and
+     retire it if it's in state FSCACHE_OBJECT_RECYCLING.
+
+     This method should not attempt to release any references held by the
+     caller.  The caller will invoke the put_object() method as appropriate.
+
+
+ (*) Release object reference [mandatory]:
+
+       void (*put_object)(struct fscache_object *object)
+
+     This method is used to discard a reference to an object.  The object may
+     be freed when all the references to it are released.
+
+
+ (*) Synchronise a cache [mandatory]:
+
+       void (*sync)(struct fscache_cache *cache)
+
+     This is called to ask the backend to synchronise a cache with its backing
+     device.
+
+
+ (*) Dissociate a cache [mandatory]:
+
+       void (*dissociate_pages)(struct fscache_cache *cache)
+
+     This is called to ask a cache to perform any page dissociations as part of
+     cache withdrawal.
+
+
+ (*) Notification that the attributes on a netfs file changed [mandatory]:
+
+       int (*attr_changed)(struct fscache_object *object);
+
+     This is called to indicate to the cache that certain attributes on a netfs
+     file have changed (for example the maximum size a file may reach).  The
+     cache can read these from the netfs by calling the cookie's get_attr()
+     method.
+
+     The cache may use the file size information to reserve space on the cache.
+     It should also call fscache_set_store_limit() to indicate to FS-Cache the
+     highest byte it's willing to store for an object.
+
+     This method may return -ve if an error occurred or the cache object cannot
+     be expanded.  In such a case, the object will be withdrawn from service.
+
+     This operation is run asynchronously from FS-Cache's thread pool, and
+     storage and retrieval operations from the netfs are excluded during the
+     execution of this operation.
+
+
+ (*) Reserve cache space for an object's data [optional]:
+
+       int (*reserve_space)(struct fscache_object *object, loff_t size);
+
+     This is called to request that cache space be reserved to hold the data
+     for an object and the metadata used to track it.  Zero size should be
+     taken as a request to cancel a reservation.
+
+     This should return 0 if successful, -ENOSPC if there isn't enough space
+     available, or -ENOMEM or -EIO on other errors.
+
+     The reservation may exceed the current size of the object, thus permitting
+     future expansion.  If the amount of space consumed by an object would
+     exceed the reservation, it's permitted to refuse requests to allocate
+     pages, but not required.  An object may be pruned down to its reservation
+     size if larger than that already.
+
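A sketch of that return convention (my_cache_space_available() and the reservation bookkeeping are hypothetical):

	static int my_reserve_space(struct fscache_object *object, loff_t size)
	{
		if (size == 0) {
			/* zero size cancels any existing reservation */
			return 0;
		}

		if (!my_cache_space_available(size))
			return -ENOSPC;

		/* record the reservation against the object here */
		return 0;
	}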
+
+ (*) Request page be read from cache [mandatory]:
+
+       int (*read_or_alloc_page)(struct fscache_retrieval *op,
+                                 struct page *page,
+                                 gfp_t gfp)
+
+     This is called to attempt to read a netfs page from the cache, or to
+     reserve a backing block if not.  FS-Cache will have done as much checking
+     as it can before calling, but most of the work belongs to the backend.
+
+     If there's no page in the cache, then -ENODATA should be returned if the
+     backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it
+     didn't.
+
+     If there is suitable data in the cache, then a read operation should be
+     queued and 0 returned.  When the read finishes, fscache_end_io() should be
+     called.
+
+     fscache_mark_pages_cached() should be called for the page if any cache
+     metadata is retained.  This will indicate to the netfs that the page needs
+     explicit uncaching.  This operation takes a pagevec, thus allowing several
+     pages to be marked at once.
+
+     The retrieval record pointed to by op should be retained for each page
+     queued and released when I/O on the page has been formally ended.
+     fscache_get/put_retrieval() are available for this purpose.
+
+     The retrieval record may be used to get CPU time via the FS-Cache thread
+     pool.  If this is desired, the op->op.processor should be set to point to
+     the appropriate processing routine, and fscache_enqueue_retrieval() should
+     be called at an appropriate point to request CPU time.  For instance, the
+     retrieval routine could be enqueued upon the completion of a disk read.
+     The to_do field in the retrieval record is provided to aid in this.
+
+     If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS
+     returned if possible or fscache_end_io() called with a suitable error
+     code.
+
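A skeleton showing those return conventions (the my_* helpers are hypothetical; the fscache_* calls and the op->op fields are the ones described above):

	static int my_read_or_alloc_page(struct fscache_retrieval *op,
					 struct page *page, gfp_t gfp)
	{
		struct my_object *obj =
			container_of(op->op.object, struct my_object, fscache);

		if (!my_page_is_backed(obj, page)) {
			if (my_reserve_block(obj, page, gfp) < 0)
				return -ENOBUFS;	/* no backing block */
			return -ENODATA;		/* block reserved, no data yet */
		}

		/* data present: queue the read; it ends with fscache_end_io() */
		fscache_get_retrieval(op);
		op->op.processor = my_read_processor;
		fscache_enqueue_retrieval(op);
		return 0;
	}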
+
+ (*) Request pages be read from cache [mandatory]:
+
+       int (*read_or_alloc_pages)(struct fscache_retrieval *op,
+                                  struct list_head *pages,
+                                  unsigned *nr_pages,
+                                  gfp_t gfp)
+
+     This is like the read_or_alloc_page() method, except it is handed a list
+     of pages instead of one page.  Any pages on which a read operation is
+     started must be added to the page cache for the specified mapping and also
+     to the LRU.  Such pages must also be removed from the pages list and
+     *nr_pages decremented per page.
+
+     If there was an error such as -ENOMEM, then that should be returned; else
+     if one or more pages couldn't be read or allocated, then -ENOBUFS should
+     be returned; else if one or more pages couldn't be read, then -ENODATA
+     should be returned.  If all the pages are dispatched then 0 should be
+     returned.
+
+
+ (*) Request page be allocated in the cache [mandatory]:
+
+       int (*allocate_page)(struct fscache_retrieval *op,
+                            struct page *page,
+                            gfp_t gfp)
+
+     This is like the read_or_alloc_page() method, except that it shouldn't
+     read from the cache, even if there's data there that could be retrieved.
+     It should, however, set up any internal metadata required such that
+     the write_page() method can write to the cache.
+
+     If there's no backing block available, then -ENOBUFS should be returned
+     (or -ENOMEM if there were other problems).  If a block is successfully
+     allocated, then the netfs page should be marked and 0 returned.
+
+
+ (*) Request pages be allocated in the cache [mandatory]:
+
+       int (*allocate_pages)(struct fscache_retrieval *op,
+                             struct list_head *pages,
+                             unsigned *nr_pages,
+                             gfp_t gfp)
+
+     This is a multiple-page version of the allocate_page() method.  pages and
+     nr_pages should be treated as for the read_or_alloc_pages() method.
+
+
+ (*) Request page be written to cache [mandatory]:
+
+       int (*write_page)(struct fscache_storage *op,
+                         struct page *page);
+
+     This is called to write from a page on which there was a previously
+     successful read_or_alloc_page() call or similar.  FS-Cache filters out
+     pages that don't have mappings.
+
+     This method is called asynchronously from the FS-Cache thread pool.  It is
+     not required to actually store anything, provided -ENODATA is then
+     returned to the next read of this page.
+
+     If an error occurred, then a negative error code should be returned,
+     otherwise zero should be returned.  FS-Cache will take appropriate action
+     in response to an error, such as withdrawing this object.
+
+     If this method returns success then FS-Cache will inform the netfs
+     appropriately.
+
+
+ (*) Discard retained per-page metadata [mandatory]:
+
+       void (*uncache_page)(struct fscache_object *object, struct page *page)
+
+     This is called when a netfs page is being evicted from the pagecache.  The
+     cache backend should tear down any internal representation or tracking it
+     maintains for this page.
+
+
+==================
+FS-CACHE UTILITIES
+==================
+
+FS-Cache provides some utilities that a cache backend may make use of:
+
+ (*) Note occurrence of an I/O error in a cache:
+
+       void fscache_io_error(struct fscache_cache *cache)
+
+     This tells FS-Cache that an I/O error occurred in the cache.  After this
+     has been called, only resource dissociation operations (object and page
+     release) will be passed from the netfs to the cache backend for the
+     specified cache.
+
+     This does not actually withdraw the cache.  That must be done separately.
+
+
+ (*) Invoke the retrieval I/O completion function:
+
+       void fscache_end_io(struct fscache_retrieval *op, struct page *page,
+                           int error);
+
+     This is called to note the end of an attempt to retrieve a page.  The
+     error value should be 0 if successful and an error otherwise.
+
+
+ (*) Set highest store limit:
+
+       void fscache_set_store_limit(struct fscache_object *object,
+                                    loff_t i_size);
+
+     This sets the limit FS-Cache imposes on the highest byte it's willing to
+     try and store for a netfs.  Any page over this limit is automatically
+     rejected by fscache_read_alloc_page() and co with -ENOBUFS.
+
+
+ (*) Mark pages as being cached:
+
+       void fscache_mark_pages_cached(struct fscache_retrieval *op,
+                                      struct pagevec *pagevec);
+
+     This marks a set of pages as being cached.  After this has been called,
+     the netfs must call fscache_uncache_page() to unmark the pages.
+
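Since the call takes a pagevec, marking even a single page looks like this sketch:

	struct pagevec pagevec;

	pagevec_init(&pagevec, 0);
	pagevec_add(&pagevec, page);
	fscache_mark_pages_cached(op, &pagevec);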
+
+ (*) Perform coherency check on an object:
+
+       enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
+                                               const void *data,
+                                               uint16_t datalen);
+
+     This asks the netfs to perform a coherency check on an object that has
+     just been looked up.  The cookie attached to the object will determine the
+     netfs to use.  data and datalen should specify where the auxiliary data
+     retrieved from the cache can be found.
+
+     One of three values will be returned:
+
+       (*) FSCACHE_CHECKAUX_OKAY
+
+           The coherency data indicates the object is valid as is.
+
+       (*) FSCACHE_CHECKAUX_NEEDS_UPDATE
+
+           The coherency data needs updating, but otherwise the object is
+           valid.
+
+       (*) FSCACHE_CHECKAUX_OBSOLETE
+
+           The coherency data indicates that the object is obsolete and should
+           be discarded.
+
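A sketch of acting on that result when validating an object, with auxbuf/auxlen standing in for auxiliary data the backend has read back from its store:

	switch (fscache_check_aux(object, auxbuf, auxlen)) {
	case FSCACHE_CHECKAUX_OKAY:
		break;			/* object usable as is */

	case FSCACHE_CHECKAUX_NEEDS_UPDATE:
		/* rewrite the stored auxiliary data, then use the object */
		break;

	case FSCACHE_CHECKAUX_OBSOLETE:
		/* retire the object's backing store */
		break;
	}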
+
+ (*) Initialise a freshly allocated object:
+
+       void fscache_object_init(struct fscache_object *object);
+
+     This initialises all the fields in an object representation.
+
+
+ (*) Indicate the destruction of an object:
+
+       void fscache_object_destroyed(struct fscache_cache *cache);
+
+     This must be called to inform FS-Cache that an object that belonged to a
+     cache has been destroyed and deallocated.  This will allow continuation
+     of the cache withdrawal process when it is stopped pending destruction of
+     all the objects.
+
+
+ (*) Indicate negative lookup on an object:
+
+       void fscache_object_lookup_negative(struct fscache_object *object);
+
+     This is called to indicate to FS-Cache that a lookup process for an object
+     found a negative result.
+
+     This changes the state of an object to permit reads pending on lookup
+     completion to go off and start fetching data from the netfs server as it's
+     known at this point that there can't be any data in the cache.
+
+     This may be called multiple times on an object.  Only the first call is
+     significant - all subsequent calls are ignored.
+
+
+ (*) Indicate an object has been obtained:
+
+       void fscache_obtained_object(struct fscache_object *object);
+
+     This is called to indicate to FS-Cache that a lookup process for an object
+     produced a positive result, or that an object was created.  This should
+     only be called once for any particular object.
+
+     This changes the state of an object to indicate:
+
+       (1) if no call to fscache_object_lookup_negative() has been made on
+           this object, that there may be data available, and that reads can
+           now go and look for it; and
+
+        (2) that writes may now proceed against this object.
+
+
+ (*) Indicate that object lookup failed:
+
+       void fscache_object_lookup_error(struct fscache_object *object);
+
+     This marks an object as having encountered a fatal error (usually EIO)
+     and causes it to move into a state whereby it will be withdrawn as soon
+     as possible.
+
+
+ (*) Get and release references on a retrieval record:
+
+       void fscache_get_retrieval(struct fscache_retrieval *op);
+       void fscache_put_retrieval(struct fscache_retrieval *op);
+
+     These two functions are used to retain a retrieval record whilst doing
+     asynchronous data retrieval and block allocation.
+
+
+ (*) Enqueue a retrieval record for processing.
+
+       void fscache_enqueue_retrieval(struct fscache_retrieval *op);
+
+     This enqueues a retrieval record for processing by the FS-Cache thread
+     pool.  One of the threads in the pool will invoke the retrieval record's
+     op->op.processor callback function.  This function may be called from
+     within the callback function.
+
+
+ (*) List of object state names:
+
+       const char *fscache_object_states[];
+
+     For debugging purposes, this may be used to turn the state that an object
+     is in into a text string for display purposes.
diff --git a/Documentation/filesystems/caching/cachefiles.txt b/Documentation/filesystems/caching/cachefiles.txt
new file mode 100644 (file)
index 0000000..c78a49b
--- /dev/null
@@ -0,0 +1,501 @@
+              ===============================================
+              CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM
+              ===============================================
+
+Contents:
+
+ (*) Overview.
+
+ (*) Requirements.
+
+ (*) Configuration.
+
+ (*) Starting the cache.
+
+ (*) Things to avoid.
+
+ (*) Cache culling.
+
+ (*) Cache structure.
+
+ (*) Security model and SELinux.
+
+ (*) A note on security.
+
+ (*) Statistical information.
+
+ (*) Debugging.
+
+
+========
+OVERVIEW
+========
+
+CacheFiles is a caching backend that uses a directory on an already mounted
+filesystem of a local type (such as Ext3) as its cache.
+
+CacheFiles uses a userspace daemon to do some of the cache management - such as
+reaping stale nodes and culling.  This is called cachefilesd and lives in
+/sbin.
+
+The filesystem and data integrity of the cache are only as good as those of the
+filesystem providing the backing services.  Note that CacheFiles does not
+attempt to journal anything since the journalling interfaces of the various
+filesystems are very specific in nature.
+
+CacheFiles creates a misc character device - "/dev/cachefiles" - that is used
+to communicate with the daemon.  Only one thing may have this open at once,
+and whilst it is open, a cache is at least partially in existence.  The daemon
+opens this and sends commands down it to control the cache.
+
+CacheFiles is currently limited to a single cache.
+
+CacheFiles attempts to maintain at least a certain percentage of free space on
+the filesystem, shrinking the cache by culling the objects it contains to make
+space if necessary - see the "Cache Culling" section.  This means it can be
+placed on the same medium as a live set of data, and will expand to make use of
+spare space and automatically contract when the set of data requires more
+space.
+
+
+============
+REQUIREMENTS
+============
+
+The use of CacheFiles and its daemon requires the following features to be
+available in the system and in the cache filesystem:
+
+       - dnotify.
+
+       - extended attributes (xattrs).
+
+       - openat() and friends.
+
+       - bmap() support on files in the filesystem (FIBMAP ioctl).
+
+       - The use of bmap() to detect a partial page at the end of the file.
+
+It is strongly recommended that the "dir_index" option is enabled on Ext3
+filesystems being used as a cache.
+
+
+=============
+CONFIGURATION
+=============
+
+The cache is configured by a script in /etc/cachefilesd.conf.  These commands
+set up the cache ready for use.  The following script commands are available:
+
+ (*) brun <N>%
+ (*) bcull <N>%
+ (*) bstop <N>%
+ (*) frun <N>%
+ (*) fcull <N>%
+ (*) fstop <N>%
+
+       Configure the culling limits.  Optional.  See the section on culling
+       below.  The defaults are 7% (run), 5% (cull) and 1% (stop) respectively.
+
+       The commands beginning with a 'b' are file space (block) limits, those
+       beginning with an 'f' are file count limits.
+
+ (*) dir <path>
+
+       Specify the directory containing the root of the cache.  Mandatory.
+
+ (*) tag <name>
+
+       Specify a tag to FS-Cache to use in distinguishing multiple caches.
+       Optional.  The default is "CacheFiles".
+
+ (*) debug <mask>
+
+       Specify a numeric bitmask to control debugging in the kernel module.
+       Optional.  The default is zero (all off).  The following values can be
+       OR'd into the mask to collect various information:
+
+               1       Turn on trace of function entry (_enter() macros)
+               2       Turn on trace of function exit (_leave() macros)
+               4       Turn on trace of internal debug points (_debug())
+
+       This mask can also be set through sysfs, eg:
+
+               echo 5 >/sys/module/cachefiles/parameters/debug
+
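A hypothetical /etc/cachefilesd.conf built from the commands above (the directory path is only an example; the limits satisfy bstop < bcull < brun):

	dir /var/fscache
	tag mycache
	brun 10%
	bcull 7%
	bstop 3%
	frun 10%
	fcull 7%
	fstop 3%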
+
+==================
+STARTING THE CACHE
+==================
+
+The cache is started by running the daemon.  The daemon opens the cache device,
+configures the cache and tells it to begin caching.  At that point the cache
+binds to fscache and the cache becomes live.
+
+The daemon is run as follows:
+
+       /sbin/cachefilesd [-d]* [-s] [-n] [-f <configfile>]
+
+The flags are:
+
+ (*) -d
+
+       Increase the debugging level.  This can be specified multiple times and
+       is cumulative with itself.
+
+ (*) -s
+
+       Send messages to stderr instead of syslog.
+
+ (*) -n
+
+       Don't daemonise and go into background.
+
+ (*) -f <configfile>
+
+       Use an alternative configuration file rather than the default one.
+
+
+===============
+THINGS TO AVOID
+===============
+
+Do not mount other things within the cache as this will cause problems.  The
+kernel module contains its own very cut-down path walking facility that ignores
+mountpoints, but the daemon can't avoid them.
+
+Do not create, rename or unlink files and directories in the cache whilst the
+cache is active, as this may cause the state to become uncertain.
+
+Renaming files in the cache might make objects appear to be other objects (the
+filename is part of the lookup key).
+
+Do not change or remove the extended attributes attached to cache files by the
+cache as this will cause the cache state management to get confused.
+
+Do not create files or directories in the cache, lest the cache get confused or
+serve incorrect data.
+
+Do not chmod files in the cache.  The module creates things with minimal
+permissions to prevent random users being able to access them directly.
+
+
+=============
+CACHE CULLING
+=============
+
+The cache may need culling occasionally to make space.  This involves
+discarding objects from the cache that have been used less recently than
+anything else.  Culling is based on the access time of data objects.  Empty
+directories are culled if not in use.
+
+Cache culling is done on the basis of the percentage of blocks and the
+percentage of files available in the underlying filesystem.  There are six
+"limits":
+
+ (*) brun
+ (*) frun
+
+     If the amount of free space and the number of available files in the cache
+     rise above both these limits, then culling is turned off.
+
+ (*) bcull
+ (*) fcull
+
+     If the amount of available space or the number of available files in the
+     cache falls below either of these limits, then culling is started.
+
+ (*) bstop
+ (*) fstop
+
+     If the amount of available space or the number of available files in the
+     cache falls below either of these limits, then no further allocation of
+     disk space or files is permitted until culling has raised things above
+     these limits again.
+
+These limits must be configured such that:
+
+       0 <= bstop < bcull < brun < 100
+       0 <= fstop < fcull < frun < 100
+
+Note that these are percentages of available space and available files, and do
+_not_ appear as 100 minus the percentage displayed by the "df" program.
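+
+For example, with bstop 3%, bcull 7% and brun 10% (illustrative values),
+culling starts when free space or free files fall below 7%, no further disk
+space or files may be allocated once either falls below 3%, and culling stops
+again only when both have risen back above 10%.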
+
+The userspace daemon scans the cache to build up a table of cullable objects.
+These are then culled in least recently used order.  A new scan of the cache is
+started as soon as space is made in the table.  Objects will be skipped if
+their atimes have changed or if the kernel module says it is still using them.
+
+
+===============
+CACHE STRUCTURE
+===============
+
+The CacheFiles module will create two directories in the directory it was
+given:
+
+ (*) cache/
+
+ (*) graveyard/
+
+The active cache objects all reside in the first directory.  The CacheFiles
+kernel module moves any retired or culled objects that it can't simply unlink
+to the graveyard, from which the daemon will actually delete them.
+
+The daemon uses dnotify to monitor the graveyard directory, and will delete
+anything that appears therein.
+
+
+The module represents index objects as directories with the filename "I..." or
+"J...".  Note that the "cache/" directory is itself a special index.
+
+Data objects are represented as files if they have no children, or directories
+if they do.  Their filenames all begin "D..." or "E...".  If represented as a
+directory, data objects will have a file in the directory called "data" that
+actually holds the data.
+
+Special objects are similar to data objects, except their filenames begin
+"S..." or "T...".
+
+
+If an object has children, then it will be represented as a directory.
+Immediately within the representative directory is a collection of directories
+named for hash values of the child object keys with an '@' prepended.  Into
+these directories, if possible, will be placed the representations of the child
+objects:
+
+       INDEX     INDEX      INDEX                             DATA FILES
+       ========= ========== ================================= ================
+       cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400
+       cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...DB1ry
+       cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...N22ry
+       cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...FP1ry
+
+
+If the key is so long that it exceeds NAME_MAX with the decorations added on to
+it, then it will be cut into pieces, the first few of which will be used to
+make a nest of directories, and the last one of which will be the objects
+inside the last directory.  The names of the intermediate directories will have
+'+' prepended:
+
+       J1223/@23/+xy...z/+kl...m/Epqr
+
+
+Note that keys are raw data, and not only may they exceed NAME_MAX in size,
+they may also contain things like '/' and NUL characters, and so they may not
+be suitable for turning directly into a filename.
+
+To handle this, CacheFiles will use a suitably printable filename directly and
+"base-64" encode ones that aren't directly suitable.  The two versions of
+object filenames indicate the encoding:
+
+       OBJECT TYPE     PRINTABLE       ENCODED
+       =============== =============== ===============
+       Index           "I..."          "J..."
+       Data            "D..."          "E..."
+       Special         "S..."          "T..."
+
+Intermediate directories are always "@" or "+" as appropriate.
+
+
+Each object in the cache has an extended attribute label that holds the object
+type ID (required to distinguish special objects) and the auxiliary data from
+the netfs.  The latter is used to detect stale objects in the cache and update
+or retire them.
+
+
+Note that CacheFiles will erase from the cache any file it doesn't recognise or
+any file of an incorrect type (such as a FIFO file or a device file).
+
+
+==========================
+SECURITY MODEL AND SELINUX
+==========================
+
+CacheFiles is implemented to deal properly with the LSM security features of
+the Linux kernel and the SELinux facility.
+
+One of the problems that CacheFiles faces is that it is generally acting on
+behalf of a process, and running in that process's context, and that includes a
+security context that is not appropriate for accessing the cache - either
+because the files in the cache are inaccessible to that process, or because if
+the process creates a file in the cache, that file may be inaccessible to other
+processes.
+
+The way CacheFiles works is to temporarily change the security context (fsuid,
+fsgid and actor security label) that the process acts as - without changing the
+security context of the process when it is the target of an operation performed
+by some other process (so signalling and suchlike still work correctly).
+
+
+When the CacheFiles module is asked to bind to its cache, it:
+
+ (1) Finds the security label attached to the root cache directory and uses
+     that as the security label with which it will create files.  By default,
+     this is:
+
+       cachefiles_var_t
+
+ (2) Finds the security label of the process which issued the bind request
+     (presumed to be the cachefilesd daemon), which by default will be:
+
+       cachefilesd_t
+
+     and asks LSM to supply a security ID as which it should act given the
+     daemon's label.  By default, this will be:
+
+       cachefiles_kernel_t
+
+     SELinux transitions the daemon's security ID to the module's security ID
+     based on a rule of this form in the policy:
+
+       type_transition <daemon's-ID> kernel_t : process <module's-ID>;
+
+     For instance:
+
+       type_transition cachefilesd_t kernel_t : process cachefiles_kernel_t;
+
+
+The module's security ID gives it permission to create, move and remove files
+and directories in the cache, to find and access directories and files in the
+cache, to set and access extended attributes on cache objects, and to read and
+write files in the cache.
+
+The daemon's security ID gives it only a very restricted set of permissions: it
+may scan directories, stat files and erase files and directories.  It may
+not read or write files in the cache, and so it is precluded from accessing the
+data cached therein; nor is it permitted to create new files in the cache.
+
+
+There are policy source files available in:
+
+       http://people.redhat.com/~dhowells/fscache/cachefilesd-0.8.tar.bz2
+
+and later versions.  In that tarball, see the files:
+
+       cachefilesd.te
+       cachefilesd.fc
+       cachefilesd.if
+
+They are built and installed directly by the RPM.
+
+If a non-RPM based system is being used, then copy the above files to their own
+directory and run:
+
+       make -f /usr/share/selinux/devel/Makefile
+       semodule -i cachefilesd.pp
+
+You will need checkpolicy and selinux-policy-devel installed prior to the
+build.
+
+
+By default, the cache is located in /var/fscache, but if it is desirable that
+it should be elsewhere, then either the above policy files must be altered, or
+an auxiliary policy must be installed to label the alternate location of the
+cache.
+
+For instructions on how to add an auxiliary policy to enable the cache to be
+located elsewhere when SELinux is in enforcing mode, please see:
+
+       /usr/share/doc/cachefilesd-*/move-cache.txt
+
+which is installed with the cachefilesd rpm; alternatively, the document can be
+found in the sources.
+
+
+==================
+A NOTE ON SECURITY
+==================
+
+CacheFiles makes use of the split security in the task_struct.  It allocates
+its own task_security structure, and redirects current->act_as to point to it
+when it acts on behalf of another process, in that process's context.
+
+The reason it does this is that it calls vfs_mkdir() and suchlike rather than
+bypassing security and calling inode ops directly.  Therefore the VFS and LSM
+may deny the CacheFiles module access to the cache data because under some
+circumstances the caching code is running in the security context of whatever
+process issued the original syscall on the netfs.
+
+Furthermore, should CacheFiles create a file or directory, the security
+parameters with which that object is created (UID, GID, security label) would
+be derived from the process that issued the system call, thus potentially
+preventing other processes from accessing the cache - including CacheFiles's
+cache management daemon (cachefilesd).
+
+What is required is to temporarily override the security of the process that
+issued the system call.  We can't, however, just do an in-place change of the
+security data as that affects the process as an object, not just as a subject.
+This means it may lose signals or ptrace events for example, and affects what
+the process looks like in /proc.
+
+So CacheFiles makes use of a logical split in the security between the
+objective security (task->sec) and the subjective security (task->act_as).  The
+objective security holds the intrinsic security properties of a process and is
+never overridden.  This is what appears in /proc, and is what is used when a
+process is the target of an operation by some other process (SIGKILL for
+example).
+
+The subjective security holds the active security properties of a process, and
+may be overridden.  This is not seen externally, and is used when a process
+acts upon another object, for example SIGKILLing another process or opening a
+file.
+
+LSM hooks exist that allow SELinux (or Smack or whatever) to reject a request
+for CacheFiles to run in a context of a specific security label, or to create
+files and directories with another security label.
+
+
+=======================
+STATISTICAL INFORMATION
+=======================
+
+If CacheFiles is compiled with the following option enabled:
+
+       CONFIG_CACHEFILES_HISTOGRAM=y
+
+then it will gather certain statistics and display them through a proc file.
+
+ (*) /proc/fs/cachefiles/histogram
+
+       cat /proc/fs/cachefiles/histogram
+       JIFS  SECS  LOOKUPS   MKDIRS    CREATES
+       ===== ===== ========= ========= =========
+
+     This shows, for each duration between 0 jiffies and HZ-1 jiffies, how
+     many times a variety of tasks took that long to run.  The columns are as
+     follows:
+
+       COLUMN          TIME MEASUREMENT
+       =======         =======================================================
+       LOOKUPS         Length of time to perform a lookup on the backing fs
+       MKDIRS          Length of time to perform a mkdir on the backing fs
+       CREATES         Length of time to perform a create on the backing fs
+
+     Each row shows the number of events that took a particular range of times.
+     Each step is 1 jiffy in size.  The JIFS column indicates the particular
+     jiffy range covered, and the SECS field the equivalent number of seconds.
+
+
+=========
+DEBUGGING
+=========
+
+If CONFIG_CACHEFILES_DEBUG is enabled, the CacheFiles facility can have runtime
+debugging enabled by adjusting the value in:
+
+       /sys/module/cachefiles/parameters/debug
+
+This is a bitmask of debugging streams to enable:
+
+       BIT     VALUE   STREAM                          POINT
+       ======= ======= =============================== =======================
+       0       1       General                         Function entry trace
+       1       2                                       Function exit trace
+       2       4                                       General
+
+The appropriate set of values should be OR'd together and the result written to
+the control file.  For example:
+
+               echo $((1|4)) >/sys/module/cachefiles/parameters/debug
+
+will turn on function entry tracing and the general debugging points.
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
new file mode 100644 (file)
index 0000000..9e94b94
--- /dev/null
@@ -0,0 +1,333 @@
+                         ==========================
+                         General Filesystem Caching
+                         ==========================
+
+========
+OVERVIEW
+========
+
+This facility is a general purpose cache for network filesystems, though it
+could be used for caching other things such as ISO9660 filesystems too.
+
+FS-Cache mediates between cache backends (such as CacheFS) and network
+filesystems:
+
+       +---------+
+       |         |                        +--------------+
+       |   NFS   |--+                     |              |
+       |         |  |                 +-->|   CacheFS    |
+       +---------+  |   +----------+  |   |  /dev/hda5   |
+                    |   |          |  |   +--------------+
+       +---------+  +-->|          |  |
+       |         |      |          |--+
+       |   AFS   |----->| FS-Cache |
+       |         |      |          |--+
+       +---------+  +-->|          |  |
+                    |   |          |  |   +--------------+
+       +---------+  |   +----------+  |   |              |
+       |         |  |                 +-->|  CacheFiles  |
+       |  ISOFS  |--+                     |  /var/cache  |
+       |         |                        +--------------+
+       +---------+
+
+Or to look at it another way, FS-Cache is a module that provides a caching
+facility to a network filesystem such that the cache is transparent to the
+user:
+
+       +---------+
+       |         |
+       | Server  |
+       |         |
+       +---------+
+            |                  NETWORK
+       ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+            |
+            |           +----------+
+            V           |          |
+       +---------+      |          |
+       |         |      |          |
+       |   NFS   |----->| FS-Cache |
+       |         |      |          |--+
+       +---------+      |          |  |   +--------------+   +--------------+
+            |           |          |  |   |              |   |              |
+            V           +----------+  +-->|  CacheFiles  |-->|  Ext3        |
+       +---------+                        |  /var/cache  |   |  /dev/sda6   |
+       |         |                        +--------------+   +--------------+
+       |   VFS   |                                ^                     ^
+       |         |                                |                     |
+       +---------+                                +--------------+      |
+            |                  KERNEL SPACE                      |      |
+       ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~
+            |                  USER SPACE                        |      |
+            V                                                    |      |
+       +---------+                                           +--------------+
+       |         |                                           |              |
+       | Process |                                           | cachefilesd  |
+       |         |                                           |              |
+       +---------+                                           +--------------+
+
+
+FS-Cache does not follow the model of completely loading every opened netfs
+file into a cache before permitting it to be accessed and then serving the
+pages out of that cache rather than the netfs inode, because:
+
+ (1) It must be practical to operate without a cache.
+
+ (2) The size of any accessible file must not be limited to the size of the
+     cache.
+
+ (3) The combined size of all opened files (this includes mapped libraries)
+     must not be limited to the size of the cache.
+
+ (4) The user should not be forced to download an entire file just to do a
+     one-off access of a small portion of it (such as might be done with the
+     "file" program).
+
+It instead serves the cache out in PAGE_SIZE chunks as and when requested by
+the netfs's using it.
+
+
+FS-Cache provides the following facilities:
+
+ (1) More than one cache can be used at once.  Caches can be selected
+     explicitly by use of tags.
+
+ (2) Caches can be added / removed at any time.
+
+ (3) The netfs is provided with an interface that allows either party to
+     withdraw caching facilities from a file (required for (2)).
+
+ (4) The interface to the netfs returns as few errors as possible, preferring
+     rather to let the netfs remain oblivious.
+
+ (5) Cookies are used to represent indices, files and other objects to the
+     netfs.  The simplest cookie is just a NULL pointer - indicating nothing
+     cached there.
+
+ (6) The netfs is allowed to propose - dynamically - any index hierarchy it
+     desires, though it must be aware that the index search function is
+     recursive, stack space is limited, and indices can only be children of
+     indices.
+
+ (7) Data I/O is done direct to and from the netfs's pages.  The netfs
+     indicates that page A is at index B of the data-file represented by cookie
+     C, and that it should be read or written.  The cache backend may or may
+     not start I/O on that page, but if it does, a netfs callback will be
+     invoked to indicate completion.  The I/O may be either synchronous or
+     asynchronous.
+
+ (8) Cookies can be "retired" upon release.  At this point FS-Cache will mark
+     them as obsolete and the index hierarchy rooted at that point will get
+     recycled.
+
+ (9) The netfs provides a "match" function for index searches.  In addition to
+     saying whether a match was made or not, this can also specify that an
+     entry should be updated or deleted.
+
+(10) As much as possible is done asynchronously.
+
+
+FS-Cache maintains a virtual indexing tree in which all indices, files, objects
+and pages are kept.  Bits of this tree may actually reside in one or more
+caches.
+
+                                           FSDEF
+                                             |
+                        +------------------------------------+
+                        |                                    |
+                       NFS                                  AFS
+                        |                                    |
+           +--------------------------+                +-----------+
+           |                          |                |           |
+        homedir                     mirror          afs.org   redhat.com
+           |                          |                            |
+     +------------+           +---------------+              +----------+
+     |            |           |               |              |          |
+   00001        00002       00007           00125        vol00001   vol00002
+     |            |           |               |                         |
+ +---+---+     +-----+      +---+      +------+------+            +-----+----+
+ |   |   |     |     |      |   |      |      |      |            |     |    |
+PG0 PG1 PG2   PG0  XATTR   PG0 PG1   DIRENT DIRENT DIRENT        R/W   R/O  Bak
+                     |                                            |
+                    PG0                                       +-------+
+                                                              |       |
+                                                            00001   00003
+                                                              |
+                                                          +---+---+
+                                                          |   |   |
+                                                         PG0 PG1 PG2
+
+In the example above, you can see two netfs's being backed: NFS and AFS.  These
+have different index hierarchies:
+
+ (*) The NFS primary index contains per-server indices.  Each server index is
+     indexed by NFS file handles to get data file objects.  Each data file
+     object can have an array of pages, but may also have further child
+     objects, such as extended attributes and directory entries.  Extended
+     attribute objects themselves have page-array contents.
+
+ (*) The AFS primary index contains per-cell indices.  Each cell index contains
+     per-logical-volume indices.  Each volume index contains up to three
+     indices for the read-write, read-only and backup mirrors of those volumes.
+     Each of these contains vnode data file objects, each of which contains an
+     array of pages.
+
+The very top index is the FS-Cache master index in which individual netfs's
+have entries.
+
+Any index object may reside in more than one cache, provided it only has index
+children.  Any index with non-index object children will be assumed to only
+reside in one cache.
+
+
+The netfs API to FS-Cache can be found in:
+
+       Documentation/filesystems/caching/netfs-api.txt
+
+The cache backend API to FS-Cache can be found in:
+
+       Documentation/filesystems/caching/backend-api.txt
+
+A description of the internal representations and object state machine can be
+found in:
+
+       Documentation/filesystems/caching/object.txt
+
+
+=======================
+STATISTICAL INFORMATION
+=======================
+
+If FS-Cache is compiled with the following options enabled:
+
+       CONFIG_FSCACHE_STATS=y
+       CONFIG_FSCACHE_HISTOGRAM=y
+
+then it will gather certain statistics and display them through a number of
+proc files.
+
+ (*) /proc/fs/fscache/stats
+
+     This shows counts of a number of events that can happen in FS-Cache:
+
+       CLASS   EVENT   MEANING
+       ======= ======= =======================================================
+       Cookies idx=N   Number of index cookies allocated
+               dat=N   Number of data storage cookies allocated
+               spc=N   Number of special cookies allocated
+       Objects alc=N   Number of objects allocated
+               nal=N   Number of object allocation failures
+               avl=N   Number of objects that reached the available state
+               ded=N   Number of objects that reached the dead state
+       ChkAux  non=N   Number of objects that didn't have a coherency check
+               ok=N    Number of objects that passed a coherency check
+               upd=N   Number of objects that needed a coherency data update
+               obs=N   Number of objects that were declared obsolete
+       Pages   mrk=N   Number of pages marked as being cached
+               unc=N   Number of uncache page requests seen
+       Acquire n=N     Number of acquire cookie requests seen
+               nul=N   Number of acq reqs given a NULL parent
+               noc=N   Number of acq reqs rejected due to no cache available
+               ok=N    Number of acq reqs succeeded
+               nbf=N   Number of acq reqs rejected due to error
+               oom=N   Number of acq reqs failed on ENOMEM
+       Lookups n=N     Number of lookup calls made on cache backends
+               neg=N   Number of negative lookups made
+               pos=N   Number of positive lookups made
+               crt=N   Number of objects created by lookup
+       Updates n=N     Number of update cookie requests seen
+               nul=N   Number of upd reqs given a NULL parent
+               run=N   Number of upd reqs granted CPU time
+       Relinqs n=N     Number of relinquish cookie requests seen
+               nul=N   Number of rlq reqs given a NULL parent
+               wcr=N   Number of rlq reqs waited on completion of creation
+       AttrChg n=N     Number of attribute changed requests seen
+               ok=N    Number of attr changed requests queued
+               nbf=N   Number of attr changed rejected -ENOBUFS
+               oom=N   Number of attr changed failed -ENOMEM
+               run=N   Number of attr changed ops given CPU time
+       Allocs  n=N     Number of allocation requests seen
+               ok=N    Number of successful alloc reqs
+               wt=N    Number of alloc reqs that waited on lookup completion
+               nbf=N   Number of alloc reqs rejected -ENOBUFS
+               ops=N   Number of alloc reqs submitted
+               owt=N   Number of alloc reqs waited for CPU time
+       Retrvls n=N     Number of retrieval (read) requests seen
+               ok=N    Number of successful retr reqs
+               wt=N    Number of retr reqs that waited on lookup completion
+               nod=N   Number of retr reqs returned -ENODATA
+               nbf=N   Number of retr reqs rejected -ENOBUFS
+               int=N   Number of retr reqs aborted -ERESTARTSYS
+               oom=N   Number of retr reqs failed -ENOMEM
+               ops=N   Number of retr reqs submitted
+               owt=N   Number of retr reqs waited for CPU time
+       Stores  n=N     Number of storage (write) requests seen
+               ok=N    Number of successful store reqs
+               agn=N   Number of store reqs on a page already pending storage
+               nbf=N   Number of store reqs rejected -ENOBUFS
+               oom=N   Number of store reqs failed -ENOMEM
+               ops=N   Number of store reqs submitted
+               run=N   Number of store reqs granted CPU time
+       Ops     pend=N  Number of times async ops added to pending queues
+               run=N   Number of times async ops given CPU time
+               enq=N   Number of times async ops queued for processing
+               dfr=N   Number of async ops queued for deferred release
+               rel=N   Number of async ops released
+               gc=N    Number of deferred-release async ops garbage collected
+
+
+ (*) /proc/fs/fscache/histogram
+
+       cat /proc/fs/fscache/histogram
+       JIFS  SECS  OBJ INST  OP RUNS   OBJ RUNS  RETRV DLY RETRIEVLS
+       ===== ===== ========= ========= ========= ========= =========
+
+     This shows, for each duration between 0 jiffies and HZ-1 jiffies, how
+     many times a variety of tasks took that long to run.  The columns are as
+     follows:
+
+       COLUMN          TIME MEASUREMENT
+       =======         =======================================================
+       OBJ INST        Length of time to instantiate an object
+       OP RUNS         Length of time a call to process an operation took
+       OBJ RUNS        Length of time a call to process an object event took
+       RETRV DLY       Time between requesting a read and the lookup completing
+       RETRIEVLS       Time between beginning and end of a retrieval
+
+     Each row shows the number of events that took a particular range of times.
+     Each step is 1 jiffy in size.  The JIFS column indicates the particular
+     jiffy range covered, and the SECS field the equivalent number of seconds.
+
+
+=========
+DEBUGGING
+=========
+
+If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime
+debugging enabled by adjusting the value in:
+
+       /sys/module/fscache/parameters/debug
+
+This is a bitmask of debugging streams to enable:
+
+       BIT     VALUE   STREAM                          POINT
+       ======= ======= =============================== =======================
+       0       1       Cache management                Function entry trace
+       1       2                                       Function exit trace
+       2       4                                       General
+       3       8       Cookie management               Function entry trace
+       4       16                                      Function exit trace
+       5       32                                      General
+       6       64      Page handling                   Function entry trace
+       7       128                                     Function exit trace
+       8       256                                     General
+       9       512     Operation management            Function entry trace
+       10      1024                                    Function exit trace
+       11      2048                                    General
+
+The appropriate set of values should be OR'd together and the result written to
+the control file.  For example:
+
+       echo $((1|8|64)) >/sys/module/fscache/parameters/debug
+
+will turn on all function entry debugging.
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
new file mode 100644 (file)
index 0000000..4db125b
--- /dev/null
@@ -0,0 +1,778 @@
+                       ===============================
+                       FS-CACHE NETWORK FILESYSTEM API
+                       ===============================
+
+There's an API by which a network filesystem can make use of the FS-Cache
+facilities.  This is based around a number of principles:
+
+ (1) Caches can store a number of different object types.  There are two main
+     object types: indices and files.  The first is a special type used by
+     FS-Cache to make finding objects faster and to make retiring of groups of
+     objects easier.
+
+ (2) Every index, file or other object is represented by a cookie.  This cookie
+     may or may not have anything associated with it, but the netfs doesn't
+     need to care.
+
+ (3) Barring the top-level index (one entry per cached netfs), the index
+     hierarchy for each netfs is structured according to the whim of the netfs.
+
+This API is declared in <linux/fscache.h>.
+
+This document contains the following sections:
+
+        (1) Network filesystem definition
+        (2) Index definition
+        (3) Object definition
+        (4) Network filesystem (un)registration
+        (5) Cache tag lookup
+        (6) Index registration
+        (7) Data file registration
+        (8) Miscellaneous object registration
+        (9) Setting the data file size
+       (10) Page alloc/read/write
+       (11) Page uncaching
+       (12) Index and data file update
+       (13) Miscellaneous cookie operations
+       (14) Cookie unregistration
+       (15) Index and data file invalidation
+       (16) FS-Cache specific page flags.
+
+
+=============================
+NETWORK FILESYSTEM DEFINITION
+=============================
+
+FS-Cache needs a description of the network filesystem.  This is specified
+using a record of the following structure:
+
+       struct fscache_netfs {
+               uint32_t                        version;
+               const char                      *name;
+               struct fscache_cookie           *primary_index;
+               ...
+       };
+
+The first two fields should be filled in before registration, and the third
+will be filled in by the registration function; any other fields should just be
+ignored and are for internal use only.
+
+The fields are:
+
+ (1) The name of the netfs (used as the key in the toplevel index).
+
+ (2) The version of the netfs (if the name matches but the version doesn't, the
+     entire in-cache hierarchy for this netfs will be scrapped and begun
+     afresh).
+
+ (3) The cookie representing the primary index will be allocated according to
+     another parameter passed into the registration function.
+
+For example, kAFS (linux/fs/afs/) uses the following definitions to describe
+itself:
+
+       struct fscache_netfs afs_cache_netfs = {
+               .version        = 0,
+               .name           = "afs",
+       };
+
+
+================
+INDEX DEFINITION
+================
+
+Indices are used for two purposes:
+
+ (1) To aid the finding of a file based on a series of keys (such as AFS's
+     "cell", "volume ID", "vnode ID").
+
+ (2) To make it easier to discard a subset of all the files cached based around
+     a particular key - for instance to mirror the removal of an AFS volume.
+
+However, since it's unlikely that any two netfs's are going to want to define
+their index hierarchies in quite the same way, FS-Cache tries to impose as few
+restraints as possible on how an index is structured and where it is placed in
+the tree.  The netfs can even mix indices and data files at the same level, but
+it's not recommended.
+
+Each index entry consists of a key of indeterminate length plus some auxiliary
+data, also of indeterminate length.
+
+There are some limits on indices:
+
+ (1) Any index containing non-index objects should be restricted to a single
+     cache.  Any such objects created within an index will be created in the
+     first cache only.  The cache in which an index is created can be
+     controlled by cache tags (see below).
+
+ (2) The entry data must be atomically journallable, so it is limited to about
+     400 bytes at present.  At least 400 bytes will be available.
+
+ (3) The depth of the index tree should be judged with care as the search
+     function is recursive.  Too many layers will run the kernel out of stack.
+
+
+=================
+OBJECT DEFINITION
+=================
+
+To define an object, a structure of the following type should be filled out:
+
+       struct fscache_cookie_def
+       {
+               uint8_t name[16];
+               uint8_t type;
+
+               struct fscache_cache_tag *(*select_cache)(
+                       const void *parent_netfs_data,
+                       const void *cookie_netfs_data);
+
+               uint16_t (*get_key)(const void *cookie_netfs_data,
+                                   void *buffer,
+                                   uint16_t bufmax);
+
+               void (*get_attr)(const void *cookie_netfs_data,
+                                uint64_t *size);
+
+               uint16_t (*get_aux)(const void *cookie_netfs_data,
+                                   void *buffer,
+                                   uint16_t bufmax);
+
+               enum fscache_checkaux (*check_aux)(void *cookie_netfs_data,
+                                                  const void *data,
+                                                  uint16_t datalen);
+
+               void (*get_context)(void *cookie_netfs_data, void *context);
+
+               void (*put_context)(void *cookie_netfs_data, void *context);
+
+               void (*mark_pages_cached)(void *cookie_netfs_data,
+                                         struct address_space *mapping,
+                                         struct pagevec *cached_pvec);
+
+               void (*now_uncached)(void *cookie_netfs_data);
+       };
+
+This has the following fields:
+
+ (1) The type of the object [mandatory].
+
+     This is one of the following values:
+
+       (*) FSCACHE_COOKIE_TYPE_INDEX
+
+           This defines an index, which is a special FS-Cache type.
+
+       (*) FSCACHE_COOKIE_TYPE_DATAFILE
+
+           This defines an ordinary data file.
+
+       (*) Any other value between 2 and 255
+
+           This defines an extraordinary object such as an XATTR.
+
+ (2) The name of the object type (NUL terminated unless all 16 chars are used)
+     [optional].
+
+ (3) A function to select the cache in which to store an index [optional].
+
+     This function is invoked when an index needs to be instantiated in a cache
+     during the instantiation of a non-index object.  Only the immediate index
+     parent for the non-index object will be queried.  Any indices above that
+     in the hierarchy may be stored in multiple caches.  This function does not
+     need to be supplied for any non-index object or any index that will only
+     have index children.
+
+     If this function is not supplied or if it returns NULL then the first
+     cache in the parent's list will be chosen, or failing that, the first
+     cache in the master list.
+
+ (4) A function to retrieve an object's key from the netfs [mandatory].
+
+     This function will be called with the netfs data that was passed to the
+     cookie acquisition function and the maximum length of key data that it may
+     provide.  It should write the required key data into the given buffer and
+     return the quantity it wrote.
+
+ (5) A function to retrieve attribute data from the netfs [optional].
+
+     This function will be called with the netfs data that was passed to the
+     cookie acquisition function.  It should return the size of the file if
+     this is a data file.  The size may be used to govern how much space must
+     be reserved for this file in the cache.
+
+     If the function is absent, a file size of 0 is assumed.
+
+ (6) A function to retrieve auxiliary data from the netfs [optional].
+
+     This function will be called with the netfs data that was passed to the
+     cookie acquisition function and the maximum length of auxiliary data that
+     it may provide.  It should write the auxiliary data into the given buffer
+     and return the quantity it wrote.
+
+     If this function is absent, the auxiliary data length will be set to 0.
+
+     The length of the auxiliary data buffer may be dependent on the key
+     length.  A netfs mustn't rely on being able to provide more than 400 bytes
+     for both.
+
+ (7) A function to check the auxiliary data [optional].
+
+     This function will be called to check that a match found in the cache for
+     this object is valid.  For instance with AFS it could check the auxiliary
+     data against the data version number returned by the server to determine
+     whether the index entry in a cache is still valid.
+
+     If this function is absent, it will be assumed that matching objects in a
+     cache are always valid.
+
+     If present, the function should return one of the following values:
+
+       (*) FSCACHE_CHECKAUX_OKAY               - the entry is okay as is
+       (*) FSCACHE_CHECKAUX_NEEDS_UPDATE       - the entry requires update
+       (*) FSCACHE_CHECKAUX_OBSOLETE           - the entry should be deleted
+
+     This function can also be used to extract data from the auxiliary data in
+     the cache and copy it into the netfs's structures.
+
+ (8) A pair of functions to manage contexts for the completion callback
+     [optional].
+
+     The cache read/write functions are passed a context which is then passed
+     to the I/O completion callback function.  To ensure this context remains
+     valid until after the I/O completion is called, two functions may be
+     provided: one to get an extra reference on the context, and one to drop a
+     reference to it.
+
+     If the context is not used or is a type of object that won't go out of
+     scope, then these functions are not required.  These functions are not
+     required for indices as indices may not contain data.  These functions may
+     be called in interrupt context and so may not sleep.
+
+ (9) A function to mark a page as retaining cache metadata [optional].
+
+     This is called by the cache to indicate that it is retaining in-memory
+     information for this page and that the netfs should uncache the page when
+     it has finished.  This does not indicate whether there's data on the disk
+     or not.  Note that several pages at once may be presented for marking.
+
+     The PG_fscache bit is set on the pages before this function would be
+     called, so the function need not be provided if this is sufficient.
+
+     This function is not required for indices as they may not carry data.
+
+(10) A function to unmark all the pages retaining cache metadata [mandatory].
+
+     This is called by FS-Cache to indicate that a backing store is being
+     unbound from a cookie and that all the marks on the pages should be
+     cleared to prevent confusion.  Note that the cache will have torn down all
+     its tracking information so that the pages don't need to be explicitly
+     uncached.
+
+     This function is not required for indices as they may not carry data.
+
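+As a purely illustrative sketch (the "myfs" structures, field names and helpers
+here are hypothetical, not part of the API), a netfs might fill out a data file
+definition along these lines:
+
+       static uint16_t myfs_get_key(const void *cookie_netfs_data,
+                                    void *buffer, uint16_t bufmax)
+       {
+               const struct myfs_inode *inode = cookie_netfs_data;
+
+               /* use the server's file ID as the lookup key; a real
+                * implementation would check bufmax before copying */
+               memcpy(buffer, &inode->fid, sizeof(inode->fid));
+               return sizeof(inode->fid);
+       }
+
+       static void myfs_get_attr(const void *cookie_netfs_data,
+                                 uint64_t *size)
+       {
+               const struct myfs_inode *inode = cookie_netfs_data;
+
+               *size = inode->remote_size;
+       }
+
+       static struct fscache_cookie_def myfs_file_cache_def = {
+               .name           = "myfs.file",
+               .type           = FSCACHE_COOKIE_TYPE_DATAFILE,
+               .get_key        = myfs_get_key,
+               .get_attr       = myfs_get_attr,
+       };
+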
+
+===================================
+NETWORK FILESYSTEM (UN)REGISTRATION
+===================================
+
+The first step is to declare the network filesystem to the cache.  This also
+involves specifying the layout of the primary index (for AFS, this would be the
+"cell" level).
+
+The registration function is:
+
+       int fscache_register_netfs(struct fscache_netfs *netfs);
+
+It just takes a pointer to the netfs definition.  It returns 0 or an error as
+appropriate.
+
+For kAFS, registration is done as follows:
+
+       ret = fscache_register_netfs(&afs_cache_netfs);
+
+The last step is, of course, unregistration:
+
+       void fscache_unregister_netfs(struct fscache_netfs *netfs);
+
+
+================
+CACHE TAG LOOKUP
+================
+
+FS-Cache permits the use of more than one cache.  To permit particular index
+subtrees to be bound to particular caches, the second step is to look up cache
+representation tags.  This step is optional; it can be left entirely up to
+FS-Cache as to which cache should be used.  The problem with doing that is that
+FS-Cache will always pick the first cache that was registered.
+
+To get the representation for a named tag:
+
+       struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name);
+
+This takes a text string as the name and returns a representation of a tag.  It
+will never return an error.  It may return a dummy tag, however, if it runs out
+of memory; this will inhibit caching with this tag.
+
+Any representation so obtained must be released by passing it to this function:
+
+       void fscache_release_cache_tag(struct fscache_cache_tag *tag);
+
+The tag will be retrieved by FS-Cache when it calls the object definition
+operation select_cache().
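+
+As a hypothetical example (the myfs names are illustrative), a netfs that
+looked a tag up at mount time could simply hand it back from its select_cache()
+operation:
+
+       static struct fscache_cache_tag *
+       myfs_select_cache(const void *parent_netfs_data,
+                         const void *cookie_netfs_data)
+       {
+               const struct myfs_sb_info *sbi = parent_netfs_data;
+
+               /* tag previously obtained with fscache_lookup_cache_tag() */
+               return sbi->cache_tag;
+       }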
+
+
+==================
+INDEX REGISTRATION
+==================
+
+The third step is to inform FS-Cache about part of an index hierarchy that can
+be used to locate files.  This is done by requesting a cookie for each index in
+the path to the file:
+
+       struct fscache_cookie *
+       fscache_acquire_cookie(struct fscache_cookie *parent,
+                              const struct fscache_object_def *def,
+                              void *netfs_data);
+
+This function creates an index entry in the index represented by parent,
+filling in the index entry by calling the operations pointed to by def.
+
+Note that this function never returns an error - all errors are handled
+internally.  It may, however, return NULL to indicate no cookie.  It is quite
+acceptable to pass this token back to this function as the parent to another
+acquisition (or even to the relinquish cookie, read page and write page
+functions - see below).
+
+Note also that no indices are actually created in a cache until a non-index
+object needs to be created somewhere down the hierarchy.  Furthermore, an index
+may be created in several different caches independently at different times.
+This is all handled transparently, and the netfs doesn't see any of it.
+
+For example, with AFS, a cell would be added to the primary index.  This index
+entry would have a dependent inode containing a volume location index for the
+volume mappings within this cell:
+
+       cell->cache =
+               fscache_acquire_cookie(afs_cache_netfs.primary_index,
+                                      &afs_cell_cache_index_def,
+                                      cell);
+
+Then when a volume location was accessed, it would be entered into the cell's
+index and an inode would be allocated that acts as a volume type and hash chain
+combination:
+
+       vlocation->cache =
+               fscache_acquire_cookie(cell->cache,
+                                      &afs_vlocation_cache_index_def,
+                                      vlocation);
+
+And then a particular flavour of volume (R/O for example) could be added to
+that index, creating another index for vnodes (AFS inode equivalents):
+
+       volume->cache =
+               fscache_acquire_cookie(vlocation->cache,
+                                      &afs_volume_cache_index_def,
+                                      volume);
+
+
+======================
+DATA FILE REGISTRATION
+======================
+
+The fourth step is to request a data file be created in the cache.  This is
+identical to index cookie acquisition.  The only difference is that the type in
+the object definition should be something other than index type.
+
+       vnode->cache =
+               fscache_acquire_cookie(volume->cache,
+                                      &afs_vnode_cache_object_def,
+                                      vnode);
+
+
+=================================
+MISCELLANEOUS OBJECT REGISTRATION
+=================================
+
+An optional step is to request an object of miscellaneous type be created in
+the cache.  This is almost identical to index cookie acquisition.  The only
+difference is that the type in the object definition should be something other
+than index type.  Whilst the parent object could be an index, it's more likely
+it would be some other type of object such as a data file.
+
+       xattr->cache =
+               fscache_acquire_cookie(vnode->cache,
+                                      &afs_xattr_cache_object_def,
+                                      xattr);
+
+Miscellaneous objects might be used to store extended attributes or directory
+entries for example.
+
+
+==========================
+SETTING THE DATA FILE SIZE
+==========================
+
+The fifth step is to set the physical attributes of the file, such as its size.
+This doesn't automatically reserve any space in the cache, but permits the
+cache to adjust its metadata for data tracking appropriately:
+
+       int fscache_attr_changed(struct fscache_cookie *cookie);
+
+The cache will return -ENOBUFS if there is no backing cache or if there is no
+space to allocate any extra metadata required in the cache.  The attributes
+will be accessed with the get_attr() cookie definition operation.
+
+Note that attempts to read or write data pages in the cache over this size may
+be rebuffed with -ENOBUFS.
+
+This operation schedules an attribute adjustment to happen asynchronously at
+some point in the future, and as such, it may happen after the function returns
+to the caller.  The attribute adjustment excludes read and write operations.
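+
+For instance, a netfs might call this when it learns of a change in a file's
+size on the server (the vnode name follows the earlier examples; the return
+value may be checked for -ENOBUFS if the caller cares):
+
+       fscache_attr_changed(vnode->cache);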
+
+
+=====================
+PAGE READ/ALLOC/WRITE
+=====================
+
+And the sixth step is to store and retrieve pages in the cache.  There are
+three functions that are used to do this.
+
+Note:
+
+ (1) A page should not be re-read or re-allocated without uncaching it first.
+
+ (2) A read or allocated page must be uncached when the netfs page is released
+     from the pagecache.
+
+ (3) A page should only be written to the cache if previously read or allocated.
+
+This permits the cache to maintain its page tracking in proper order.
+
+
+PAGE READ
+---------
+
+Firstly, the netfs should ask FS-Cache to examine the caches and read the
+contents cached for a particular page of a particular file if present, or else
+allocate space to store the contents if not:
+
+       typedef
+       void (*fscache_rw_complete_t)(struct page *page,
+                                     void *context,
+                                     int error);
+
+       int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
+                                      struct page *page,
+                                      fscache_rw_complete_t end_io_func,
+                                      void *context,
+                                      gfp_t gfp);
+
+The cookie argument must specify a cookie for an object that isn't an index,
+the page specified will have the data loaded into it (and is also used to
+specify the page number), and the gfp argument is used to control how any
+memory allocations made are satisfied.
+
+If the cookie indicates the inode is not cached:
+
+ (1) The function will return -ENOBUFS.
+
+Else if there's a copy of the page resident in the cache:
+
+ (1) The mark_pages_cached() cookie operation will be called on that page.
+
+ (2) The function will submit a request to read the data from the cache's
+     backing device directly into the page specified.
+
+ (3) The function will return 0.
+
+ (4) When the read is complete, end_io_func() will be invoked with:
+
+     (*) The netfs data supplied when the cookie was created.
+
+     (*) The page descriptor.
+
+     (*) The context argument passed to the above function.  This will be
+         maintained with the get_context/put_context functions mentioned above.
+
+     (*) An argument that's 0 on success or negative for an error code.
+
+     If an error occurs, it should be assumed that the page contains no usable
+     data.
+
+     end_io_func() will be called in process context if the read results in
+     an error, but it might be called in interrupt context if the read is
+     successful.
+
+Otherwise, if there's not a copy available in cache, but the cache may be able
+to store the page:
+
+ (1) The mark_pages_cached() cookie operation will be called on that page.
+
+ (2) A block may be reserved in the cache and attached to the object at the
+     appropriate place.
+
+ (3) The function will return -ENODATA.
+
+This function may also return -ENOMEM or -EINTR, in which case it won't have
+read any data from the cache.
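+
+As a rough sketch of how a netfs's readpage() implementation might use this
+(the myfs_* helpers, MYFS_I() and the server fallback path are hypothetical),
+assuming the cookie was acquired as described earlier:
+
+       static int myfs_readpage(struct file *file, struct page *page)
+       {
+               struct myfs_inode *inode = MYFS_I(page->mapping->host);
+               int ret;
+
+               ret = fscache_read_or_alloc_page(inode->cache, page,
+                                                myfs_readpage_complete,
+                                                NULL, GFP_KERNEL);
+               switch (ret) {
+               case 0:         /* read submitted from the cache */
+                       return 0;
+               case -ENODATA:  /* block allocated, but no data yet */
+               case -ENOBUFS:  /* not cached and can't be cached now */
+               default:
+                       /* fall back to reading from the server */
+                       return myfs_readpage_from_server(file, page);
+               }
+       }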
+
+
+PAGE ALLOCATE
+-------------
+
+Alternatively, if there's not expected to be any data in the cache for a page
+because the file has been extended, a block can simply be allocated instead:
+
+       int fscache_alloc_page(struct fscache_cookie *cookie,
+                              struct page *page,
+                              gfp_t gfp);
+
+This is similar to the fscache_read_or_alloc_page() function, except that it
+never reads from the cache.  It will return 0 if a block has been allocated,
+rather than -ENODATA as the other would.  One or the other must be performed
+before writing to the cache.
+
+The mark_pages_cached() cookie operation will be called on the page if
+successful.
+
+
+PAGE WRITE
+----------
+
+Secondly, if the netfs changes the contents of the page (either due to an
+initial download or if a user performs a write), then the page should be
+written back to the cache:
+
+       int fscache_write_page(struct fscache_cookie *cookie,
+                              struct page *page,
+                              gfp_t gfp);
+
+The cookie argument must specify a data file cookie, the page specified should
+contain the data to be written (and is also used to specify the page number),
+and the gfp argument is used to control how any memory allocations made are
+satisfied.
+
+The page must have first been read or allocated successfully and must not have
+been uncached before writing is performed.
+
+If the cookie indicates the inode is not cached then:
+
+ (1) The function will return -ENOBUFS.
+
+Else if space can be allocated in the cache to hold this page:
+
+ (1) PG_fscache_write will be set on the page.
+
+ (2) The function will submit a request to write the data to the cache's
+     backing device directly from the page specified.
+
+ (3) The function will return 0.
+
+ (4) When the write is complete PG_fscache_write is cleared on the page and
+     anyone waiting for that bit will be woken up.
+
+Else if there's no space available in the cache, -ENOBUFS will be returned.  It
+is also possible for the PG_fscache_write bit to be cleared when no write took
+place if unforeseen circumstances arose (such as a disk error).
+
+Writing takes place asynchronously.
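+
+As an illustrative fragment only (error handling policy is up to the netfs), a
+page that was previously passed to the read/alloc functions and has now been
+filled with data from the server might be copied into the cache like this:
+
+       if (fscache_write_page(inode->cache, page, GFP_KERNEL) != 0) {
+               /* the write couldn't be started (e.g. -ENOBUFS), so drop the
+                * cache's interest in this page */
+               fscache_uncache_page(inode->cache, page);
+       }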
+
+
+MULTIPLE PAGE READ
+------------------
+
+A facility is provided to read several pages at once, as requested by the
+readpages() address space operation:
+
+       int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
+                                       struct address_space *mapping,
+                                       struct list_head *pages,
+                                       int *nr_pages,
+                                       fscache_rw_complete_t end_io_func,
+                                       void *context,
+                                       gfp_t gfp);
+
+This works in a similar way to fscache_read_or_alloc_page(), except:
+
+ (1) Any page it can retrieve data for is removed from pages and nr_pages and
+     dispatched for reading from the disk.  Reads of adjacent pages on disk may
+     be merged for greater efficiency.
+
+ (2) The mark_pages_cached() cookie operation will be called on several pages
+     at once if they're being read or allocated.
+
+ (3) If there was a general error, then that error will be returned.
+
+     Else if some pages couldn't be allocated or read, then -ENOBUFS will be
+     returned.
+
+     Else if some pages couldn't be read but were allocated, then -ENODATA will
+     be returned.
+
+     Otherwise, if all pages had reads dispatched, then 0 will be returned, the
+     list will be empty and *nr_pages will be 0.
+
+ (4) end_io_func will be called once for each page being read as the reads
+     complete.  It will be called in process context if error != 0, but it may
+     be called in interrupt context if there is no error.
+
+Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude
+some of the pages being read and some being allocated.  Those pages will have
+been marked appropriately and will need uncaching.
+
+
+==============
+PAGE UNCACHING
+==============
+
+To uncache a page, this function should be called:
+
+       void fscache_uncache_page(struct fscache_cookie *cookie,
+                                 struct page *page);
+
+This function permits the cache to release any in-memory representation it
+might be holding for this netfs page.  This function must be called once for
+each page on which the read or write page functions above have been called to
+make sure the cache's in-memory tracking information gets torn down.
+
+Note that pages can't be explicitly deleted from a data file.  The whole
+data file must be retired (see the relinquish cookie function below).
+
+Furthermore, note that this does not cancel the asynchronous read or write
+operation started by the read/alloc and write functions, so the page
+invalidation and release functions must use:
+
+       bool fscache_check_page_write(struct fscache_cookie *cookie,
+                                     struct page *page);
+
+to see if a page is being written to the cache, and:
+
+       void fscache_wait_on_page_write(struct fscache_cookie *cookie,
+                                       struct page *page);
+
+to wait for it to finish if it is.
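+
+A hypothetical releasepage() fragment combining these calls might look like
+this (whether to wait or to refuse the release is a netfs policy decision; the
+PageFsCache() test is described in the page flag section below):
+
+       if (PageFsCache(page)) {
+               if (fscache_check_page_write(cookie, page))
+                       fscache_wait_on_page_write(cookie, page);
+               fscache_uncache_page(cookie, page);
+       }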
+
+
+==========================
+INDEX AND DATA FILE UPDATE
+==========================
+
+To request an update of the index data for an index or other object, the
+following function should be called:
+
+       void fscache_update_cookie(struct fscache_cookie *cookie);
+
+This function will refer back to the netfs_data pointer stored in the cookie by
+the acquisition function to obtain the data to write into each revised index
+entry.  The update method in the parent index definition will be called to
+transfer the data.
+
+Note that partial updates may happen automatically at other times, such as when
+data blocks are added to a data file object.
+
+
+===============================
+MISCELLANEOUS COOKIE OPERATIONS
+===============================
+
+There are a number of operations that can be used to control cookies:
+
+ (*) Cookie pinning:
+
+       int fscache_pin_cookie(struct fscache_cookie *cookie);
+       void fscache_unpin_cookie(struct fscache_cookie *cookie);
+
+     These operations permit data cookies to be pinned into the cache and to
+     have the pinning removed.  They are not permitted on index cookies.
+
+     The pinning function will return 0 if successful, -ENOBUFS if the cookie
+     isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning,
+     -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
+     -EIO if there's any other problem.
+
+ (*) Data space reservation:
+
+       int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size);
+
+     This permits a netfs to request cache space be reserved to store up to the
+     given amount of a file.  It is permitted to ask for more than the current
+     size of the file to allow for future file expansion.
+
+     If size is given as zero then the reservation will be cancelled.
+
+     The function will return 0 if successful, -ENOBUFS if the cookie isn't
+     backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations,
+     -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or
+     -EIO if there's any other problem.
+
+     Note that this doesn't pin an object in a cache; it can still be culled to
+     make space if it's not in use.
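+
+     As a sketch of how a netfs might use the reservation call when it learns
+     a file's size (mynetfs_reserve() is a hypothetical helper; the return
+     codes handled are those listed above):
+
+	static void mynetfs_reserve(struct fscache_cookie *cookie, loff_t size)
+	{
+		int ret = fscache_reserve_space(cookie, size);
+
+		switch (ret) {
+		case 0:			/* reservation made */
+		case -ENOBUFS:		/* cookie not backed by a cache */
+		case -EOPNOTSUPP:	/* cache doesn't support reservations */
+			break;
+		default:
+			printk(KERN_WARNING
+			       "mynetfs: cache reservation failed (%d)\n", ret);
+			break;
+		}
+	}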
+
+
+=====================
+COOKIE UNREGISTRATION
+=====================
+
+To get rid of a cookie, this function should be called:
+
+       void fscache_relinquish_cookie(struct fscache_cookie *cookie,
+                                      int retire);
+
+If retire is non-zero, then the object will be marked for recycling, and all
+copies of it will be removed from all active caches in which it is present.
+Not only that but all child objects will also be retired.
+
+If retire is zero, then the object may be available again when next the
+acquisition function is called.  Retirement here will overrule the pinning on a
+cookie.
+
+One very important note - relinquish must NOT be called for a cookie unless all
+the cookies for "child" indices, objects and pages have been relinquished
+first.
+
+
+================================
+INDEX AND DATA FILE INVALIDATION
+================================
+
+There is no direct way to invalidate an index subtree or a data file.  To do
+this, the caller should relinquish and retire the cookie they have, and then
+acquire a new one.
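+
+As a hedged sketch (the mynetfs_* names are hypothetical; the acquisition
+call is the one described earlier in this document):
+
+	static void mynetfs_invalidate_cache(struct mynetfs_inode *inode)
+	{
+		/* all pages must already have been uncached and all child
+		 * cookies relinquished */
+		fscache_relinquish_cookie(inode->cache, 1 /* retire */);
+
+		/* get a fresh, empty backing object for the same index key */
+		inode->cache = fscache_acquire_cookie(mynetfs_volume_cookie,
+						      &mynetfs_file_cookie_def,
+						      inode);
+	}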
+
+
+===========================
+FS-CACHE SPECIFIC PAGE FLAG
+===========================
+
+FS-Cache makes use of a page flag, PG_private_2, for its own purpose.  This is
+given the alternative name PG_fscache.
+
+PG_fscache is used to indicate that the page is known by the cache, and that
+the cache must be informed if the page is going to go away.  It's an indication
+to the netfs that the cache has an interest in this page, where an interest may
+be a pointer to it, resources allocated or reserved for it, or I/O in progress
+upon it.
+
+The netfs can use this information in methods such as releasepage() to
+determine whether it needs to uncache a page or update it.
+
+Furthermore, if this bit is set, releasepage() and invalidatepage() operations
+will be called on a page to get rid of it, even if PG_private is not set.  This
+allows caching to be attempted on a page before read_cache_pages() is called
+after fscache_read_or_alloc_pages(), as the former will try to release pages
+it was given under certain circumstances.
+
+This bit does not overlap with flags such as PG_private.  This means that
+FS-Cache can be used with a filesystem that uses the block buffering code.
+
+There are a number of operations defined on this flag:
+
+       int PageFsCache(struct page *page);
+       void SetPageFsCache(struct page *page);
+       void ClearPageFsCache(struct page *page);
+       int TestSetPageFsCache(struct page *page);
+       int TestClearPageFsCache(struct page *page);
+
+These functions are bit test, bit set, bit clear, bit test and set and bit
+test and clear operations on PG_fscache.
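+
+As a hedged sketch, a netfs releasepage() implementation might combine these
+with the uncaching functions described earlier (the mynetfs_* names are
+illustrative):
+
+	static int mynetfs_releasepage(struct page *page, gfp_t gfp)
+	{
+		struct fscache_cookie *cookie = mynetfs_page_to_cookie(page);
+
+		if (PageFsCache(page)) {
+			/* refuse to release the page whilst the cache is
+			 * still writing it out */
+			if (fscache_check_page_write(cookie, page))
+				return 0;
+			fscache_uncache_page(cookie, page);
+			ClearPageFsCache(page);
+		}
+		return 1;
+	}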
diff --git a/Documentation/filesystems/caching/object.txt b/Documentation/filesystems/caching/object.txt
new file mode 100644 (file)
index 0000000..e8b0a35
--- /dev/null
@@ -0,0 +1,313 @@
+            ====================================================
+            IN-KERNEL CACHE OBJECT REPRESENTATION AND MANAGEMENT
+            ====================================================
+
+By: David Howells <dhowells@redhat.com>
+
+Contents:
+
+ (*) Representation
+
+ (*) Object management state machine.
+
+     - Provision of cpu time.
+     - Locking simplification.
+
+ (*) The set of states.
+
+ (*) The set of events.
+
+
+==============
+REPRESENTATION
+==============
+
+FS-Cache maintains an in-kernel representation of each object that a netfs is
+currently interested in.  Such objects are represented by the fscache_cookie
+struct and are referred to as cookies.
+
+FS-Cache also maintains a separate in-kernel representation of the objects that
+a cache backend is currently actively caching.  Such objects are represented by
+the fscache_object struct.  The cache backends allocate these upon request, and
+are expected to embed them in their own representations.  These are referred to
+as objects.
+
+There is a 1:N relationship between cookies and objects.  A cookie may be
+represented by multiple objects - an index may exist in more than one cache -
+or even by no objects (it may not be cached).
+
+Furthermore, both cookies and objects are hierarchical.  The two hierarchies
+correspond, but the cookies tree is a superset of the union of the object trees
+of multiple caches:
+
+           NETFS INDEX TREE               :      CACHE 1     :      CACHE 2
+                                          :                  :
+                                          :   +-----------+  :
+                                 +----------->|  IObject  |  :
+             +-----------+       |        :   +-----------+  :
+             |  ICookie  |-------+        :         |        :
+             +-----------+       |        :         |        :   +-----------+
+                   |             +------------------------------>|  IObject  |
+                   |                      :         |        :   +-----------+
+                   |                      :         V        :         |
+                   |                      :   +-----------+  :         |
+                   V             +----------->|  IObject  |  :         |
+             +-----------+       |        :   +-----------+  :         |
+             |  ICookie  |-------+        :         |        :         V
+             +-----------+       |        :         |        :   +-----------+
+                   |             +------------------------------>|  IObject  |
+             +-----+-----+                :         |        :   +-----------+
+             |           |                :         |        :         |
+             V           |                :         V        :         |
+       +-----------+     |                :   +-----------+  :         |
+       |  ICookie  |------------------------->|  IObject  |  :         |
+       +-----------+     |                :   +-----------+  :         |
+             |           V                :         |        :         V
+             |     +-----------+          :         |        :   +-----------+
+             |     |  ICookie  |-------------------------------->|  IObject  |
+             |     +-----------+          :         |        :   +-----------+
+             V           |                :         V        :         |
+       +-----------+     |                :   +-----------+  :         |
+       |  DCookie  |------------------------->|  DObject  |  :         |
+       +-----------+     |                :   +-----------+  :         |
+                         |                :                  :         |
+                 +-------+-------+        :                  :         |
+                 |               |        :                  :         |
+                 V               V        :                  :         V
+           +-----------+   +-----------+  :                  :   +-----------+
+           |  DCookie  |   |  DCookie  |------------------------>|  DObject  |
+           +-----------+   +-----------+  :                  :   +-----------+
+                                          :                  :
+
+In the above illustration, ICookie and IObject represent indices and DCookie
+and DObject represent data storage objects.  Indices may have representation in
+multiple caches, but currently, non-index objects may not.  Objects of any type
+may also be entirely unrepresented.
+
+As far as the netfs API goes, the netfs is only actually permitted to see
+pointers to the cookies.  The cookies themselves and any objects attached to
+those cookies are hidden from it.
+
+
+===============================
+OBJECT MANAGEMENT STATE MACHINE
+===============================
+
+Within FS-Cache, each active object is managed by its own individual state
+machine.  The state for an object is kept in the fscache_object struct, in
+object->state.  A cookie may point to a set of objects that are in different
+states.
+
+Each state has an action associated with it that is invoked when the machine
+wakes up in that state.  There are four logical sets of states:
+
+ (1) Preparation: states that wait for the parent objects to become ready.  The
+     representations are hierarchical, and it is expected that an object must
+     be created or accessed with respect to its parent object.
+
+ (2) Initialisation: states that perform lookups in the cache and validate
+     what's found and that create on disk any missing metadata.
+
+ (3) Normal running: states that allow netfs operations on objects to proceed
+     and that update the state of objects.
+
+ (4) Termination: states that detach objects from their netfs cookies, that
+     delete objects from disk, that handle disk and system errors and that free
+     up in-memory resources.
+
+
+In most cases, transitioning between states is in response to signalled events.
+When a state has finished processing, it will usually set the mask of events in
+which it is interested (object->event_mask) and relinquish the worker thread.
+Then when an event is raised (by calling fscache_raise_event()), if the event
+is not masked, the object will be queued for processing (by calling
+fscache_enqueue_object()).
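+
+As a minimal illustration of this flow (the exact field and argument types
+are assumptions based on the names above):
+
+	/* a state action finishes and notes the events it cares about */
+	object->event_mask = (1 << FSCACHE_OBJECT_EV_UPDATE) |
+			     (1 << FSCACHE_OBJECT_EV_ERROR);
+
+	/* later, someone signals an event; if it isn't masked off, the
+	 * object is queued so that its state action runs again */
+	fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);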
+
+
+PROVISION OF CPU TIME
+---------------------
+
+The work to be done by the various states is given CPU time by the threads of
+the slow work facility (see Documentation/slow-work.txt).  This is used in
+preference to the workqueue facility because:
+
+ (1) Threads may be completely occupied for very long periods of time by a
+     particular work item.  These state actions may be doing sequences of
+     synchronous, journalled disk accesses (lookup, mkdir, create, setxattr,
+     getxattr, truncate, unlink, rmdir, rename).
+
+ (2) Threads may do little actual work, but may rather spend a lot of time
+     sleeping on I/O.  This means that single-threaded and 1-per-CPU-threaded
+     workqueues don't necessarily have the right numbers of threads.
+
+
+LOCKING SIMPLIFICATION
+----------------------
+
+Because only one worker thread may be operating on any particular object's
+state machine at once, this simplifies the locking, particularly with respect
+to disconnecting the netfs's representation of a cache object (fscache_cookie)
+from the cache backend's representation (fscache_object) - which may be
+requested from either end.
+
+
+=================
+THE SET OF STATES
+=================
+
+The object state machine has a set of states that it can be in.  There are
+preparation states in which the object sets itself up and waits for its parent
+object to transit to a state that allows access to its children:
+
+ (1) State FSCACHE_OBJECT_INIT.
+
+     Initialise the object and wait for the parent object to become active.  In
+     the cache, it is expected that it will not be possible to look an object
+     up from the parent object, until that parent object itself has been looked
+     up.
+
+There are initialisation states in which the object sets itself up and accesses
+disk for the object metadata:
+
+ (2) State FSCACHE_OBJECT_LOOKING_UP.
+
+     Look up the object on disk, using the parent as a starting point.
+     FS-Cache expects the cache backend to probe the cache to see whether this
+     object is represented there, and if it is, to see if it's valid (coherency
+     management).
+
+     The cache should call fscache_object_lookup_negative() to indicate lookup
+     failure for whatever reason, and should call fscache_obtained_object() to
+     indicate success.
+
+     At the completion of lookup, FS-Cache will let the netfs go ahead with
+     read operations, no matter whether the file is yet cached.  If not yet
+     cached, read operations will be immediately rejected with ENODATA until
+     the first known page is uncached - as to that point there can be no data
+     to be read out of the cache for that file that isn't currently also held
+     in the pagecache.
+
+ (3) State FSCACHE_OBJECT_CREATING.
+
+     Create an object on disk, using the parent as a starting point.  This
+     happens if the lookup failed to find the object, or if the object's
+     coherency data indicated what's on disk is out of date.  In this state,
+     FS-Cache expects the cache to create the object.
+
+     The cache should call fscache_obtained_object() if creation completes
+     successfully, fscache_object_lookup_negative() otherwise.
+
+     At the completion of creation, FS-Cache will start processing write
+     operations the netfs has queued for an object.  If creation failed, the
+     write ops will be transparently discarded, and nothing recorded in the
+     cache.
+
+There are some normal running states in which the object spends its time
+servicing netfs requests:
+
+ (4) State FSCACHE_OBJECT_AVAILABLE.
+
+     A transient state in which pending operations are started, child objects
+     are permitted to advance from FSCACHE_OBJECT_INIT state, and temporary
+     lookup data is freed.
+
+ (5) State FSCACHE_OBJECT_ACTIVE.
+
+     The normal running state.  In this state, requests the netfs makes will be
+     passed on to the cache.
+
+ (6) State FSCACHE_OBJECT_UPDATING.
+
+     The state machine comes here to update the object in the cache from the
+     netfs's records.  This involves updating the auxiliary data that is used
+     to maintain coherency.
+
+And there are terminal states in which an object cleans itself up, deallocates
+memory and potentially deletes stuff from disk:
+
+ (7) State FSCACHE_OBJECT_LC_DYING.
+
+     The object comes here if it is dying because of a lookup or creation
+     error.  This would be due to a disk error or system error of some sort.
+     Temporary data is cleaned up, and the parent is released.
+
+ (8) State FSCACHE_OBJECT_DYING.
+
+     The object comes here if it is dying due to an error, because its parent
+     cookie has been relinquished by the netfs or because the cache is being
+     withdrawn.
+
+     Any child objects waiting on this one are given CPU time so that they too
+     can destroy themselves.  This object waits for all its children to go away
+     before advancing to the next state.
+
+ (9) State FSCACHE_OBJECT_ABORT_INIT.
+
+     The object comes to this state if it was waiting on its parent in
+     FSCACHE_OBJECT_INIT, but its parent died.  The object will destroy itself
+     so that the parent may proceed from the FSCACHE_OBJECT_DYING state.
+
+(10) State FSCACHE_OBJECT_RELEASING.
+(11) State FSCACHE_OBJECT_RECYCLING.
+
+     The object comes to one of these two states when dying once it is rid of
+     all its children, if it is dying because the netfs relinquished its
+     cookie.  In the first state, the cached data is expected to persist, and
+     in the second it will be deleted.
+
+(12) State FSCACHE_OBJECT_WITHDRAWING.
+
+     The object transits to this state if the cache decides it wants to
+     withdraw the object from service, perhaps to make space, but also due to
+     error or just because the whole cache is being withdrawn.
+
+(13) State FSCACHE_OBJECT_DEAD.
+
+     The object transits to this state when the in-memory object record is
+     ready to be deleted.  The object processor shouldn't ever see an object in
+     this state.
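+
+As an illustration of states (2) and (3) above, a hedged sketch of how a
+cache backend's lookup routine might report its result (the mycache_* helpers
+are illustrative; only the fscache_*() calls are named by this document):
+
+	static void mycache_lookup_object(struct fscache_object *object)
+	{
+		if (mycache_find_valid_on_disk(object)) {
+			/* found and coherent: the object can go active */
+			fscache_obtained_object(object);
+			return;
+		}
+
+		/* not present or stale: reads may now be rejected with
+		 * ENODATA whilst the object is created on disk */
+		fscache_object_lookup_negative(object);
+		mycache_create_on_disk(object);
+		fscache_obtained_object(object);
+	}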
+
+
+THE SET OF EVENTS
+-----------------
+
+There are a number of events that can be raised to an object state machine:
+
+ (*) FSCACHE_OBJECT_EV_UPDATE
+
+     The netfs requested that an object be updated.  The state machine will ask
+     the cache backend to update the object, and the cache backend will ask the
+     netfs for details of the change through its cookie definition ops.
+
+ (*) FSCACHE_OBJECT_EV_CLEARED
+
+     This is signalled in two circumstances:
+
+     (a) when an object's last child object is dropped and
+
+     (b) when the last operation outstanding on an object is completed.
+
+     This is used to proceed from the dying state.
+
+ (*) FSCACHE_OBJECT_EV_ERROR
+
+     This is signalled when an I/O error occurs during the processing of some
+     object.
+
+ (*) FSCACHE_OBJECT_EV_RELEASE
+ (*) FSCACHE_OBJECT_EV_RETIRE
+
+     These are signalled when the netfs relinquishes a cookie it was using.
+     The event selected depends on whether the netfs asks for the backing
+     object to be retired (deleted) or retained.
+
+ (*) FSCACHE_OBJECT_EV_WITHDRAW
+
+     This is signalled when the cache backend wants to withdraw an object.
+     This means that the object will have to be detached from the netfs's
+     cookie.
+
+Because the withdrawing, releasing and retiring events are all handled by the
+object state machine, it doesn't matter if there's a collision with both ends
+trying to sever the connection at the same time.  The state machine can just
+pick which one it wants to honour, and that effects the other.
diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt
new file mode 100644 (file)
index 0000000..b6b070c
--- /dev/null
@@ -0,0 +1,213 @@
+                      ================================
+                      ASYNCHRONOUS OPERATIONS HANDLING
+                      ================================
+
+By: David Howells <dhowells@redhat.com>
+
+Contents:
+
+ (*) Overview.
+
+ (*) Operation record initialisation.
+
+ (*) Parameters.
+
+ (*) Procedure.
+
+ (*) Asynchronous callback.
+
+
+========
+OVERVIEW
+========
+
+FS-Cache has an asynchronous operations handling facility that it uses for its
+data storage and retrieval routines.  Its operations are represented by
+fscache_operation structs, though these are usually embedded into some other
+structure.
+
+This facility is available to and expected to be used by the cache backends,
+and FS-Cache will create operations and pass them off to the appropriate cache
+backend for completion.
+
+To make use of this facility, <linux/fscache-cache.h> should be #included.
+
+
+===============================
+OPERATION RECORD INITIALISATION
+===============================
+
+An operation is recorded in an fscache_operation struct:
+
+       struct fscache_operation {
+               union {
+                       struct work_struct fast_work;
+                       struct slow_work slow_work;
+               };
+               unsigned long           flags;
+               fscache_operation_processor_t processor;
+               ...
+       };
+
+Someone wanting to issue an operation should allocate something with this
+struct embedded in it.  They should initialise it by calling:
+
+       void fscache_operation_init(struct fscache_operation *op,
+                                   fscache_operation_release_t release);
+
+with the operation to be initialised and the release function to use.
+
+The op->flags parameter should be set to indicate the CPU time provision and
+the exclusivity (see the Parameters section).
+
+The op->fast_work, op->slow_work and op->processor fields should be set as
+appropriate for the CPU time provision (see the Parameters section).
+
+FSCACHE_OP_WAITING may be set in op->flags prior to each submission of the
+operation and waited for afterwards.
+
+
+==========
+PARAMETERS
+==========
+
+There are a number of parameters that can be set in the operation record's flag
+parameter.  There are three options for the provision of CPU time in these
+operations:
+
+ (1) The operation may be done synchronously (FSCACHE_OP_MYTHREAD).  A thread
+     may decide it wants to handle an operation itself without deferring it to
+     another thread.
+
+     This is, for example, used in read operations for calling readpages() on
+     the backing filesystem in CacheFiles.  Although readpages() does an
+     asynchronous data fetch, the determination of whether pages exist is done
+     synchronously - and the netfs does not proceed until this has been
+     determined.
+
+     If this option is to be used, FSCACHE_OP_WAITING must be set in op->flags
+     before submitting the operation, and the operating thread must wait for it
+     to be cleared before proceeding:
+
+               wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
+                           fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+
+
+ (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it
+     will be given to keventd to process.  Such an operation is not permitted
+     to sleep on I/O.
+
+     This is, for example, used by CacheFiles to copy data from a backing fs
+     page to a netfs page after the backing fs has read the page in.
+
+     If this option is used, op->fast_work and op->processor must be
+     initialised before submitting the operation:
+
+               INIT_WORK(&op->fast_work, do_some_work);
+
+
+ (3) The operation may be slow asynchronous (FSCACHE_OP_SLOW), in which case it
+     will be given to the slow work facility to process.  Such an operation is
+     permitted to sleep on I/O.
+
+     This is, for example, used by FS-Cache to handle background writes of
+     pages that have just been fetched from a remote server.
+
+     If this option is used, op->slow_work and op->processor must be
+     initialised before submitting the operation:
+
+               fscache_operation_init_slow(op, processor);
+
+
+Furthermore, operations may be one of two types:
+
+ (1) Exclusive (FSCACHE_OP_EXCLUSIVE).  Operations of this type may not run in
+     conjunction with any other operation on the object being operated upon.
+
+     An example of this is the attribute change operation, in which the file
+     being written to may need truncation.
+
+ (2) Shareable.  Operations of this type may be running simultaneously.  It's
+     up to the operation implementation to prevent interference from other
+     operations running at the same time.
+
+
+=========
+PROCEDURE
+=========
+
+Operations are used through the following procedure:
+
+ (1) The submitting thread must allocate the operation and initialise it
+     itself.  Normally this would be part of a more specific structure with the
+     generic op embedded within.
+
+ (2) The submitting thread must then submit the operation for processing using
+     one of the following two functions:
+
+       int fscache_submit_op(struct fscache_object *object,
+                             struct fscache_operation *op);
+
+       int fscache_submit_exclusive_op(struct fscache_object *object,
+                                       struct fscache_operation *op);
+
+     The first function should be used to submit non-exclusive ops and the
+     second to submit exclusive ones.  The caller must still set the
+     FSCACHE_OP_EXCLUSIVE flag.
+
+     If successful, both functions will assign the operation to the specified
+     object and return 0.  -ENOBUFS will be returned if the object specified is
+     permanently unavailable.
+
+     The operation manager will defer operations on an object that is still
+     undergoing lookup or creation.  The operation will also be deferred if an
+     operation of conflicting exclusivity is in progress on the object.
+
+     If the operation is asynchronous, the manager will retain a reference to
+     it, so the caller should put their reference to it by passing it to:
+
+       void fscache_put_operation(struct fscache_operation *op);
+
+ (3) If the submitting thread wants to do the work itself, and has marked the
+     operation with FSCACHE_OP_MYTHREAD, then it should monitor
+     FSCACHE_OP_WAITING as described above and check the state of the object if
+     necessary (the object might have died whilst the thread was waiting).
+
+     When it has finished doing its processing, it should call
+     fscache_put_operation() on it.
+
+ (4) The operation holds an effective lock upon the object, preventing other
+     exclusive ops conflicting until it is released.  The operation can be
+     enqueued for further immediate asynchronous processing by adjusting the
+     CPU time provisioning option if necessary, eg:
+
+       op->flags &= ~FSCACHE_OP_TYPE;
+       op->flags |= FSCACHE_OP_FAST;
+
+     and calling:
+
+       void fscache_enqueue_operation(struct fscache_operation *op)
+
+     This can be used to allow other things to have use of the worker thread
+     pools.
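+
+Putting the steps above together, a hedged sketch of allocating, submitting
+and releasing an operation (the my_* names are illustrative; the CPU time
+provision fields are set as described in the Parameters section):
+
+	struct my_op {
+		struct fscache_operation op;
+		/* ... operation-specific data ... */
+	};
+
+	static void my_release(struct fscache_operation *_op)
+	{
+		kfree(container_of(_op, struct my_op, op));
+	}
+
+	static int my_issue(struct fscache_object *object)
+	{
+		struct my_op *my;
+		int ret;
+
+		my = kzalloc(sizeof(*my), GFP_KERNEL);
+		if (!my)
+			return -ENOMEM;
+
+		fscache_operation_init(&my->op, my_release);
+		/* ... set my->op.flags and the fast_work/slow_work/processor
+		 *     fields for the chosen CPU time provision ... */
+
+		ret = fscache_submit_op(object, &my->op);
+
+		/* for an asynchronous op the manager keeps its own reference;
+		 * either way, drop the submitter's reference */
+		fscache_put_operation(&my->op);
+		return ret;
+	}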
+
+
+=====================
+ASYNCHRONOUS CALLBACK
+=====================
+
+When used in asynchronous mode, the worker thread pool will invoke the
+processor method with a pointer to the operation.  This should then get at the
+container struct by using container_of():
+
+       static void fscache_write_op(struct fscache_operation *_op)
+       {
+               struct fscache_storage *op =
+                       container_of(_op, struct fscache_storage, op);
+       ...
+       }
+
+The caller holds a reference on the operation, and will invoke
+fscache_put_operation() when the processor function returns.  The processor
+function is at liberty to call fscache_enqueue_operation() or to take extra
+references.
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
new file mode 100644 (file)
index 0000000..0ced74c
--- /dev/null
@@ -0,0 +1,176 @@
+===============================================================================
+WHAT IS EXOFS?
+===============================================================================
+
+exofs is a file system that uses an OSD and exports the API of a normal Linux
+file system. Users access exofs like any other local file system, and exofs
+will in turn issue commands to the local OSD initiator.
+
+OSD is a new T10 command set that views storage devices not as a large/flat
+array of sectors but as a container of objects, each having a length, quota,
+time attributes and more. Each object is addressed by a 64bit ID, and is
+contained in a 64bit ID partition. Each object has associated attributes
+attached to it, which are an integral part of the object and provide metadata
+about the object. The standard defines some common obligatory attributes, but
+user attributes can be added as needed.
+
+===============================================================================
+ENVIRONMENT
+===============================================================================
+
+To use this file system, you need to have an object store to run it on.  You
+may download a target from:
+http://open-osd.org
+
+See Documentation/scsi/osd.txt for how to setup a working osd environment.
+
+===============================================================================
+USAGE
+===============================================================================
+
+1. Download and compile exofs and open-osd initiator:
+  You need an external kernel source tree or kernel headers from your
+  distribution (anything based on 2.6.26 or later).
+
+  a. download open-osd including exofs source using:
+     [parent-directory]$ git clone git://git.open-osd.org/open-osd.git
+
+  b. Build the library module like this:
+     [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd
+
+     This will build both the open-osd initiator as well as the exofs kernel
+     module. Use whatever parameters you compiled your kernel with, with
+     $(KER_DIR) above pointing to the kernel you are compiling against. See
+     the file open-osd/top-level-Makefile for an example.
+
+2. Get the OSD initiator and target set up properly, and login to the target.
+  See Documentation/scsi/osd.txt for further instructions. Also see ./do-osd
+  for an example script that does all these steps.
+
+3. Insmod the exofs.ko module:
+   [exofs]$ insmod exofs.ko
+
+4. Make sure the directory where you want to mount exists. If not, create it.
+   (For example, mkdir /mnt/exofs)
+
+5. At first run you will need to invoke the mkfs.exofs application
+
+   As an example, this will create the file system on:
+   /dev/osd0 partition ID 65536
+
+   mkfs.exofs --pid=65536 --format /dev/osd0
+
+   The --format option is optional. If not specified, no OSD_FORMAT will be
+   performed and a clean file system will be created in the specified pid,
+   in the available space of the target. (Use --format=size_in_meg to limit
+   the total LUN space available.)
+
+   If the pid already exists, it will be deleted and a new one will be created
+   in its place. Be careful.
+
+   An exofs lives inside a single OSD partition. You can create multiple exofs
+   filesystems on the same device using multiple pids.
+
+   (run mkfs.exofs without any parameters for usage help message)
+
+6. Mount the file system.
+
+   For example, to mount /dev/osd0, partition ID 0x10000 on /mnt/exofs:
+
+       mount -t exofs -o pid=65536 /dev/osd0 /mnt/exofs/
+
+7. For reference (See do-exofs example script):
+       do-exofs start - an example of how to perform the above steps.
+       do-exofs stop -  an example of how to unmount the file system.
+       do-exofs format - an example of how to format and mkfs a new exofs.
+
+8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
+       CONFIG_EXOFS_DEBUG - for debug messages and extra checks.
+
+===============================================================================
+exofs mount options
+===============================================================================
+Similar to any mount command:
+       mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory
+
+Where:
+    -t exofs: specifies the exofs file system
+
+    /dev/osdX: X is a decimal number. /dev/osdX was created after a successful
+               login into an OSD target.
+
+    mount_exofs_directory: The directory to mount the file system on
+
+    exofs specific options: Options are separated by commas (,)
+               pid=<integer> - The partition number to mount/create as
+                                container of the filesystem.
+                                This option is mandatory.
+                to=<integer>  - Timeout in ticks for a single command;
+                                default is (60 * HZ). [for debugging only]
+
+===============================================================================
+DESIGN
+===============================================================================
+
+* The file system control block (AKA on-disk superblock) resides in an object
+  with a special ID (defined in common.h).
+  Information included in the file system control block is used to fill the
+  in-memory superblock structure at mount time. This object is created by
+  mkexofs.c before the file system is used. It contains information such as:
+       - The file system's magic number
+       - The next inode number to be allocated
+
+* Each file resides in its own object and contains the data (and it will be
+  possible to extend the file over multiple objects, though this has not been
+  implemented yet).
+
+* A directory is treated as a file, and essentially contains a list of <file
+  name, inode #> pairs for files that are found in that directory. The object
+  IDs correspond to the files' inode numbers and will eventually be allocated
+  according to a bitmap (stored in a separate object); for now they are
+  allocated using a counter.
+
+* Each file's control block (AKA on-disk inode) is stored in its object's
+  attributes. This applies to both regular files and other types (directories,
+  device files, symlinks, etc.).
+
+* Credentials are generated per object (inode and superblock) when the object
+  is created in memory (read off disk or newly created). The credential works
+  for all operations and is used as long as the object remains in memory.
+
+* Async OSD operations are used whenever possible, but the target may execute
+  them out of order. The operations that concern us are create, delete,
+  readpage, writepage, update_inode, and truncate. The following pairs of
+  operations should execute in the order written, and we need to prevent them
+  from executing in reverse order:
+       - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
+         flags. OBJ_CREATED is set when we know the object exists on the OSD -
+         in create's callback function, and when we successfully do a read_inode.
+         OBJ_2BCREATED is set in the beginning of the create function, so we
+         know that we should wait.
+               - create/delete: delete should wait until the object is created
+                 on the OSD.
+               - create/readpage: readpage should be able to return a page
+                 full of zeroes in this case. If there was a write already
+                 en-route (i.e. create, writepage, readpage) then the page
+                 would be locked, and so it would really be the same as
+                 create/writepage.
+               - create/writepage: if writepage is called for a sync write, it
+                 should wait until the object is created on the OSD.
+                 Otherwise, it should just return.
+               - create/truncate: truncate should wait until the object is
+                 created on the OSD.
+               - create/update_inode: update_inode should wait until the
+                 object is created on the OSD.
+       - Handled by VFS locks:
+               - readpage/delete: shouldn't happen because of page lock.
+               - writepage/delete: shouldn't happen because of page lock.
+               - readpage/writepage: shouldn't happen because of page lock.
+
+===============================================================================
+LICENSE/COPYRIGHT
+===============================================================================
+The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel
+version 2.6.10).  All files include the original copyrights, and the license
+is GPL version 2 (only version 2, as is true for the Linux kernel).  The
+Linux kernel can be downloaded from www.kernel.org.
index fde829a756e6510a9d5cd4d56c981e204151fa8c..902b95d0ee511a95825dc61193347ce72cb6a5d2 100644 (file)
@@ -24,6 +24,8 @@ The following mount options are supported:
 
        gid=            Set the default group.
        umask=          Set the default umask.
+       mode=           Set the default file permissions.
+       dmode=          Set the default directory permissions.
        uid=            Set the default user.
        bs=             Set the block size.
        unhide          Show otherwise hidden files.
index 240257dd4238fc55608989541b79c37e40101abc..bdc0c433e88c6852898220a1c11f63c29f89b41d 100644 (file)
@@ -1523,7 +1523,9 @@ and is between 256 and 4096 characters. It is defined in the file
 
        noclflush       [BUGS=X86] Don't use the CLFLUSH instruction
 
-       nohlt           [BUGS=ARM,SH]
+       nohlt           [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or
+                       wfi(ARM) instruction doesn't work correctly and not to
+                       use it. This is also useful when using a JTAG debugger.
 
        no-hlt          [BUGS=X86-32] Tells the kernel that the hlt
                        instruction doesn't work correctly and not to
index 1da9d1b1793f3436b3561a48de7fbcd8c893e74e..4edd39ec7db91abcbcbac352fc3c5e6d3e3c3eca 100644 (file)
@@ -164,15 +164,19 @@ All md devices contain:
   raid_disks
      a text file with a simple number indicating the number of devices
      in a fully functional array.  If this is not yet known, the file
-     will be empty.  If an array is being resized (not currently
-     possible) this will contain the larger of the old and new sizes.
-     Some raid level (RAID1) allow this value to be set while the
-     array is active.  This will reconfigure the array.   Otherwise
-     it can only be set while assembling an array.
+     will be empty.  If an array is being resized this will contain
+     the new number of devices.
+     Some raid levels allow this value to be set while the array is
+     active.  This will reconfigure the array.   Otherwise it can only
+     be set while assembling an array.
+     A change to this attribute will not be permitted if it would
+     reduce the size of the array.  To reduce the number of drives
+     in an e.g. raid5, the array size must first be reduced by
+     setting the 'array_size' attribute.
 
   chunk_size
-     This is the size if bytes for 'chunks' and is only relevant to
-     raid levels that involve striping (1,4,5,6,10). The address space
+     This is the size in bytes for 'chunks' and is only relevant to
+     raid levels that involve striping (0,4,5,6,10). The address space
      of the array is conceptually divided into chunks and consecutive
      chunks are striped onto neighbouring devices.
      The size should be at least PAGE_SIZE (4k) and should be a power
@@ -183,6 +187,20 @@ All md devices contain:
      simply a number that is interpretted differently by different
      levels.  It can be written while assembling an array.
 
+  array_size
+     This can be used to artificially constrain the available space in
+     the array to be less than is actually available on the combined
+     devices.  Writing a number (in Kilobytes) which is less than
+     the available size will set the size.  Any reconfiguration of the
+     array (e.g. adding devices) will not cause the size to change.
+     Writing the word 'default' will cause the effective size of the
+     array to be whatever size is actually available based on
+     'level', 'chunk_size' and 'component_size'.
+
+     This can be used to reduce the size of the array before reducing
+     the number of devices in a raid4/5/6, or to support external
+     metadata formats which mandate such clipping.
+
   reshape_position
      This is either "none" or a sector number within the devices of
      the array where "reshape" is up to.  If this is set, the three
@@ -207,6 +225,11 @@ All md devices contain:
      about the array.  It can be 0.90 (traditional format), 1.0, 1.1,
      1.2 (newer format in varying locations) or "none" indicating that
      the kernel isn't managing metadata at all.
+     Alternately it can be "external:" followed by a string which
+     is set by user-space.  This indicates that metadata is managed
+     by a user-space program.  Any device failure or other event that
+     requires a metadata update will cause array activity to be
+     suspended until the event is acknowledged.
 
   resync_start
      The point at which resync should start.  If no resync is needed,
diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
new file mode 100644 (file)
index 0000000..ebc50f8
--- /dev/null
@@ -0,0 +1,174 @@
+                    ====================================
+                    SLOW WORK ITEM EXECUTION THREAD POOL
+                    ====================================
+
+By: David Howells <dhowells@redhat.com>
+
+The slow work item execution thread pool is a pool of threads for performing
+things that take a relatively long time, such as making mkdir calls.
+Typically, when processing something, these items will spend a lot of time
+blocking a thread on I/O, thus making that thread unavailable for doing other
+work.
+
+The standard workqueue model is unsuitable for this class of work item as that
+limits the owner to a single thread or a single thread per CPU.  For some
+tasks, however, more threads - or fewer - are required.
+
+There is just one pool per system.  It contains no threads unless something
+wants to use it - and that something must register its interest first.  When
+the pool is active, the number of threads it contains is dynamic, varying
+between a maximum and minimum setting, depending on the load.
+
+
+====================
+CLASSES OF WORK ITEM
+====================
+
+This pool supports two classes of work items:
+
+ (*) Slow work items.
+
+ (*) Very slow work items.
+
+The former are expected to finish much quicker than the latter.
+
+An operation of the very slow class may do a batch combination of several
+lookups, mkdirs, and a create for instance.
+
+An operation of the ordinarily slow class may, for example, write stuff or
+expand files, provided the time taken to do so isn't too long.
+
+Operations of both types may sleep during execution, thus tying up the thread
+loaned to them.
+
+
+THREAD-TO-CLASS ALLOCATION
+--------------------------
+
+Not all the threads in the pool are available to work on very slow work items.
+The number will be between one and one fewer than the number of active threads.
+This is configurable (see the "Pool Configuration" section).
+
+All the threads are available to work on ordinarily slow work items, but a
+percentage of the threads will prefer to work on very slow work items.
+
+The configuration ensures that at least one thread will be available to work on
+very slow work items, and at least one thread will be available that won't work
+on very slow work items at all.
+
+
+=====================
+USING SLOW WORK ITEMS
+=====================
+
+Firstly, a module or subsystem wanting to make use of slow work items must
+register its interest:
+
+        int ret = slow_work_register_user();
+
+This will return 0 if successful, or a -ve error upon failure.
+
+
+Slow work items may then be set up by:
+
+ (1) Declaring a slow_work struct type variable:
+
+       #include <linux/slow-work.h>
+
+       struct slow_work myitem;
+
+ (2) Declaring the operations to be used for this item:
+
+       struct slow_work_ops myitem_ops = {
+               .get_ref = myitem_get_ref,
+               .put_ref = myitem_put_ref,
+               .execute = myitem_execute,
+       };
+
+     [*] For a description of the ops, see section "Item Operations".
+
+ (3) Initialising the item:
+
+       slow_work_init(&myitem, &myitem_ops);
+
+     or:
+
+       vslow_work_init(&myitem, &myitem_ops);
+
+     depending on its class.
+
+A suitably set up work item can then be enqueued for processing:
+
+       int ret = slow_work_enqueue(&myitem);
+
+This will return a -ve error if the thread pool is unable to gain a reference
+on the item, 0 otherwise.
+
+
+The items are reference counted, so there ought to be no need for a flush
+operation.  When all a module's slow work items have been processed, and the
+module has no further interest in the facility, it should unregister its
+interest:
+
+       slow_work_unregister_user();
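+
+As a small sketch tying the above to module lifetime (myitem and myitem_ops
+are the declarations from the steps above):
+
+	static int __init mymodule_init(void)
+	{
+		int ret;
+
+		ret = slow_work_register_user();
+		if (ret < 0)
+			return ret;
+
+		slow_work_init(&myitem, &myitem_ops);
+		return 0;
+	}
+
+	static void __exit mymodule_exit(void)
+	{
+		/* all of this module's items must have been processed by now */
+		slow_work_unregister_user();
+	}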
+
+
+===============
+ITEM OPERATIONS
+===============
+
+Each work item requires a table of operations of type struct slow_work_ops.
+All members are required:
+
+ (*) Get a reference on an item:
+
+       int (*get_ref)(struct slow_work *work);
+
+     This allows the thread pool to attempt to pin an item by getting a
+     reference on it.  This function should return 0 if the reference was
+     granted, or a -ve error otherwise.  If an error is returned,
+     slow_work_enqueue() will fail.
+
+     The reference is held whilst the item is queued and whilst it is being
+     executed.  The item may then be requeued with the same reference held, or
+     the reference will be released.
+
+ (*) Release a reference on an item:
+
+       void (*put_ref)(struct slow_work *work);
+
+     This allows the thread pool to unpin an item by releasing the reference on
+     it.  The thread pool will not touch the item again once this has been
+     called.
+
+ (*) Execute an item:
+
+       void (*execute)(struct slow_work *work);
+
+     This should perform the work required of the item.  It may sleep, it may
+     perform disk I/O and it may wait for locks.
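+
+As a hedged sketch of how these operations might be implemented around a
+reference-counted item (the my_* names are illustrative):
+
+	struct my_slow_item {
+		struct slow_work	work;
+		struct kref		ref;
+		/* ... payload ... */
+	};
+
+	static void my_item_free(struct kref *ref)
+	{
+		kfree(container_of(ref, struct my_slow_item, ref));
+	}
+
+	static int my_get_ref(struct slow_work *work)
+	{
+		kref_get(&container_of(work, struct my_slow_item, work)->ref);
+		return 0;
+	}
+
+	static void my_put_ref(struct slow_work *work)
+	{
+		struct my_slow_item *item =
+			container_of(work, struct my_slow_item, work);
+
+		kref_put(&item->ref, my_item_free);
+	}
+
+	static void my_execute(struct slow_work *work)
+	{
+		/* the long-running work goes here; sleeping and disk I/O
+		 * are permitted */
+	}
+
+	static struct slow_work_ops my_item_ops = {
+		.get_ref	= my_get_ref,
+		.put_ref	= my_put_ref,
+		.execute	= my_execute,
+	};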
+
+
+==================
+POOL CONFIGURATION
+==================
+
+The slow-work thread pool has a number of configurables:
+
+ (*) /proc/sys/kernel/slow-work/min-threads
+
+     The minimum number of threads that should be in the pool whilst it is in
+     use.  This may be anywhere between 2 and max-threads.
+
+ (*) /proc/sys/kernel/slow-work/max-threads
+
+     The maximum number of threads that should be in the pool.  This may be
+     anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater.
+
+ (*) /proc/sys/kernel/slow-work/vslow-percentage
+
+     The percentage of active threads in the pool that may be used to execute
+     very slow work items.  This may be between 1 and 99.  The resultant number
+     is bounded to between 1 and one fewer than the number of active threads.
+     This ensures there is always at least one thread that can process very
+     slow work items, and always at least one thread that won't.
index ebaf77ebd8b717e8fe42208f7b49c9b38f55e180..6fe6f39a3d310e9b897859c9f3c664eca3d975a4 100644 (file)
@@ -4847,7 +4847,7 @@ M:        lrg@slimlogic.co.uk
 P:     Mark Brown
 M:     broonie@opensource.wolfsonmicro.com
 W:     http://opensource.wolfsonmicro.com/node/15
-W:     http://www.slimlogic.co.uk/?page_id=5
+W:     http://www.slimlogic.co.uk/?p=48
 T:     git kernel.org/pub/scm/linux/kernel/git/lrg/voltage-2.6.git
 S:     Supported
 
@@ -4969,7 +4969,8 @@ S:        Supported
 
 XFS FILESYSTEM
 P:     Silicon Graphics Inc
-P:     Bill O'Donnell
+P:     Felix Blyakher
+M:     felixb@sgi.com
 M:     xfs-masters@oss.sgi.com
 L:     xfs@oss.sgi.com
 W:     http://oss.sgi.com/projects/xfs
index aa9d34feddc666b1dea00bab8445a51673cf88d6..679a4a3e265e8222f91c67a17ab58d838a1dddb0 100644 (file)
@@ -474,14 +474,34 @@ CONFIG_NETDEVICES=y
 # CONFIG_EQUALIZER is not set
 # CONFIG_TUN is not set
 # CONFIG_VETH is not set
-# CONFIG_PHYLIB is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+CONFIG_SMSC_PHY=y
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_FIXED_PHY is not set
+# CONFIG_MDIO_BITBANG is not set
 CONFIG_NET_ETHERNET=y
 CONFIG_MII=y
 # CONFIG_AX88796 is not set
 # CONFIG_SMC91X is not set
 # CONFIG_DM9000 is not set
 # CONFIG_ENC28J60 is not set
-CONFIG_SMC911X=y
+# CONFIG_SMC911X is not set
+CONFIG_SMSC911X=y
 # CONFIG_IBM_NEW_EMAC_ZMII is not set
 # CONFIG_IBM_NEW_EMAC_RGMII is not set
 # CONFIG_IBM_NEW_EMAC_TAH is not set
index 62747458647027e15bfa956587adb693d2ce25ba..6e37c77c47605f06a08e1876d80fe64c0981cc50 100644 (file)
@@ -465,12 +465,33 @@ CONFIG_NETDEVICES=y
 # CONFIG_EQUALIZER is not set
 # CONFIG_TUN is not set
 # CONFIG_VETH is not set
-# CONFIG_PHYLIB is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+CONFIG_SMSC_PHY=y
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_FIXED_PHY is not set
+# CONFIG_MDIO_BITBANG is not set
 CONFIG_NET_ETHERNET=y
 CONFIG_MII=y
 # CONFIG_AX88796 is not set
 CONFIG_SMC91X=y
 # CONFIG_DM9000 is not set
+# CONFIG_SMC911X is not set
+CONFIG_SMSC911X=y
 # CONFIG_IBM_NEW_EMAC_ZMII is not set
 # CONFIG_IBM_NEW_EMAC_RGMII is not set
 # CONFIG_IBM_NEW_EMAC_TAH is not set
index cd29824d791c7bf55457608912b3ca51716a2bd2..21db4b3ec8ff104c8b95e9c9eb1d3854536c2d25 100644 (file)
@@ -496,13 +496,33 @@ CONFIG_NETDEVICES=y
 # CONFIG_EQUALIZER is not set
 # CONFIG_TUN is not set
 # CONFIG_VETH is not set
-# CONFIG_PHYLIB is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+CONFIG_SMSC_PHY=y
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_FIXED_PHY is not set
+# CONFIG_MDIO_BITBANG is not set
 CONFIG_NET_ETHERNET=y
 CONFIG_MII=y
 # CONFIG_AX88796 is not set
 CONFIG_SMC91X=y
 # CONFIG_DM9000 is not set
-CONFIG_SMC911X=y
+# CONFIG_SMC911X is not set
+CONFIG_SMSC911X=y
 # CONFIG_IBM_NEW_EMAC_ZMII is not set
 # CONFIG_IBM_NEW_EMAC_RGMII is not set
 # CONFIG_IBM_NEW_EMAC_TAH is not set
index 7e253f58ed18a34af0e5997a5ab497cc07ef3fdb..9a75c30b910d4564a5dc78b55cfb7a378dcf21d6 100644 (file)
@@ -490,13 +490,33 @@ CONFIG_NETDEVICES=y
 # CONFIG_EQUALIZER is not set
 # CONFIG_TUN is not set
 # CONFIG_VETH is not set
-# CONFIG_PHYLIB is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+CONFIG_SMSC_PHY=y
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_FIXED_PHY is not set
+# CONFIG_MDIO_BITBANG is not set
 CONFIG_NET_ETHERNET=y
 CONFIG_MII=y
 # CONFIG_AX88796 is not set
 CONFIG_SMC91X=y
 # CONFIG_DM9000 is not set
-CONFIG_SMC911X=y
+# CONFIG_SMC911X is not set
+CONFIG_SMSC911X=y
 # CONFIG_IBM_NEW_EMAC_ZMII is not set
 # CONFIG_IBM_NEW_EMAC_RGMII is not set
 # CONFIG_IBM_NEW_EMAC_TAH is not set
index 7ac812dc055a792476ffdc5a93e6568ae75e069a..e26c4fe61faeaa6180c0735cff5c94c18d33073b 100644 (file)
@@ -198,17 +198,17 @@ static int at91_pm_verify_clocks(void)
        /* USB must not be using PLLB */
        if (cpu_is_at91rm9200()) {
                if ((scsr & (AT91RM9200_PMC_UHP | AT91RM9200_PMC_UDP)) != 0) {
-                       pr_debug("AT91: PM - Suspend-to-RAM with USB still active\n");
+                       pr_err("AT91: PM - Suspend-to-RAM with USB still active\n");
                        return 0;
                }
        } else if (cpu_is_at91sam9260() || cpu_is_at91sam9261() || cpu_is_at91sam9263() || cpu_is_at91sam9g20()) {
                if ((scsr & (AT91SAM926x_PMC_UHP | AT91SAM926x_PMC_UDP)) != 0) {
-                       pr_debug("AT91: PM - Suspend-to-RAM with USB still active\n");
+                       pr_err("AT91: PM - Suspend-to-RAM with USB still active\n");
                        return 0;
                }
        } else if (cpu_is_at91cap9()) {
                if ((scsr & AT91CAP9_PMC_UHP) != 0) {
-                       pr_debug("AT91: PM - Suspend-to-RAM with USB still active\n");
+                       pr_err("AT91: PM - Suspend-to-RAM with USB still active\n");
                        return 0;
                }
        }
@@ -223,7 +223,7 @@ static int at91_pm_verify_clocks(void)
 
                css = at91_sys_read(AT91_PMC_PCKR(i)) & AT91_PMC_CSS;
                if (css != AT91_PMC_CSS_SLOW) {
-                       pr_debug("AT91: PM - Suspend-to-RAM with PCK%d src %d\n", i, css);
+                       pr_err("AT91: PM - Suspend-to-RAM with PCK%d src %d\n", i, css);
                        return 0;
                }
        }
index bbbd72767a02eadeea8752f3670d79e6c39b4cff..4d9c1f872472a8e89abf73e8f1633990598d95ee 100644 (file)
@@ -28,7 +28,7 @@ static inline void arch_idle(void)
        cpu_do_idle();
 }
 
-static inline void arch_reset(char mode)
+static inline void arch_reset(char mode, const char *cmd)
 {
        __raw_writel(RESET_GLOBAL | RESET_CPU1,
                     IO_ADDRESS(GEMINI_GLOBAL_BASE) + GLOBAL_RESET);
index 001edfefec195e37752f7f26fd24d8a3c97723f2..4f5b0e0ce6cf8f87e0e843b82ce2f8b0b4cbc34c 100644 (file)
@@ -14,7 +14,7 @@ static inline void arch_idle(void)
        cpu_do_idle();
 }
 
-static inline void arch_reset(char mode)
+static inline void arch_reset(char mode, const char *cmd)
 {
        cpu_reset(0);
 }
index 5fce022114dece491665a5774945e45645030c69..c3648eff51371fe6382e3a1a75b82fb8b5e32071 100644 (file)
@@ -24,7 +24,7 @@
 #include <linux/mtd/plat-ram.h>
 #include <linux/memory.h>
 #include <linux/gpio.h>
-#include <linux/smc911x.h>
+#include <linux/smsc911x.h>
 #include <linux/interrupt.h>
 #include <linux/i2c.h>
 #include <linux/i2c/at24.h>
@@ -70,7 +70,7 @@ static struct imxuart_platform_data uart_pdata = {
        .flags = IMXUART_HAVE_RTSCTS,
 };
 
-static struct resource smc911x_resources[] = {
+static struct resource smsc911x_resources[] = {
        [0] = {
                .start          = CS1_BASE_ADDR + 0x300,
                .end            = CS1_BASE_ADDR + 0x300 + SZ_64K - 1,
@@ -79,22 +79,25 @@ static struct resource smc911x_resources[] = {
        [1] = {
                .start          = IOMUX_TO_IRQ(MX31_PIN_GPIO3_1),
                .end            = IOMUX_TO_IRQ(MX31_PIN_GPIO3_1),
-               .flags          = IORESOURCE_IRQ,
+               .flags          = IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
        },
 };
 
-static struct smc911x_platdata smc911x_info = {
-       .flags          = SMC911X_USE_32BIT,
-       .irq_flags      = IRQF_SHARED | IRQF_TRIGGER_LOW,
+static struct smsc911x_platform_config smsc911x_info = {
+       .flags          = SMSC911X_USE_32BIT | SMSC911X_FORCE_INTERNAL_PHY |
+                         SMSC911X_SAVE_MAC_ADDRESS,
+       .irq_polarity   = SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
+       .irq_type       = SMSC911X_IRQ_TYPE_OPEN_DRAIN,
+       .phy_interface  = PHY_INTERFACE_MODE_MII,
 };
 
 static struct platform_device pcm037_eth = {
-       .name           = "smc911x",
+       .name           = "smsc911x",
        .id             = -1,
-       .num_resources  = ARRAY_SIZE(smc911x_resources),
-       .resource       = smc911x_resources,
+       .num_resources  = ARRAY_SIZE(smsc911x_resources),
+       .resource       = smsc911x_resources,
        .dev            = {
-               .platform_data = &smc911x_info,
+               .platform_data = &smsc911x_info,
        },
 };
 
index a2c3fcc27a22990e4d1a2f3220b3d6a079100148..c49d9bfa3abde7694e261115e40424f586f1152f 100644 (file)
@@ -47,6 +47,8 @@ obj-$(CONFIG_MACH_OMAP_3430SDP)               += board-3430sdp.o \
 
 obj-$(CONFIG_MACH_NOKIA_RX51)          += board-rx51.o \
                                           board-rx51-peripherals.o \
+                                          mmc-twl4030.o
+
 # Platform specific device init code
 ifeq ($(CONFIG_USB_MUSB_SOC),y)
 obj-y                                  += usb-musb.o
index e096f776f996cf51df6aeb5271764cc88d7368be..da57b0fcda14c5c2ca0ac92e1da076f7176763ee 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/spi/ads7846.h>
 #include <linux/i2c/twl4030.h>
 #include <linux/io.h>
+#include <linux/smsc911x.h>
 
 #include <mach/hardware.h>
 #include <asm/mach-types.h>
 
 #include "mmc-twl4030.h"
 
-#define LDP_SMC911X_CS         1
-#define LDP_SMC911X_GPIO       152
+#define LDP_SMSC911X_CS                1
+#define LDP_SMSC911X_GPIO      152
 #define DEBUG_BASE             0x08000000
 #define LDP_ETHR_START         DEBUG_BASE
 
-static struct resource ldp_smc911x_resources[] = {
+static struct resource ldp_smsc911x_resources[] = {
        [0] = {
                .start  = LDP_ETHR_START,
                .end    = LDP_ETHR_START + SZ_4K,
@@ -59,40 +60,50 @@ static struct resource ldp_smc911x_resources[] = {
        },
 };
 
-static struct platform_device ldp_smc911x_device = {
-       .name           = "smc911x",
+static struct smsc911x_platform_config ldp_smsc911x_config = {
+       .irq_polarity   = SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
+       .irq_type       = SMSC911X_IRQ_TYPE_OPEN_DRAIN,
+       .flags          = SMSC911X_USE_32BIT,
+       .phy_interface  = PHY_INTERFACE_MODE_MII,
+};
+
+static struct platform_device ldp_smsc911x_device = {
+       .name           = "smsc911x",
        .id             = -1,
-       .num_resources  = ARRAY_SIZE(ldp_smc911x_resources),
-       .resource       = ldp_smc911x_resources,
+       .num_resources  = ARRAY_SIZE(ldp_smsc911x_resources),
+       .resource       = ldp_smsc911x_resources,
+       .dev            = {
+               .platform_data = &ldp_smsc911x_config,
+       },
 };
 
 static struct platform_device *ldp_devices[] __initdata = {
-       &ldp_smc911x_device,
+       &ldp_smsc911x_device,
 };
 
-static inline void __init ldp_init_smc911x(void)
+static inline void __init ldp_init_smsc911x(void)
 {
        int eth_cs;
        unsigned long cs_mem_base;
        int eth_gpio = 0;
 
-       eth_cs = LDP_SMC911X_CS;
+       eth_cs = LDP_SMSC911X_CS;
 
        if (gpmc_cs_request(eth_cs, SZ_16M, &cs_mem_base) < 0) {
-               printk(KERN_ERR "Failed to request GPMC mem for smc911x\n");
+               printk(KERN_ERR "Failed to request GPMC mem for smsc911x\n");
                return;
        }
 
-       ldp_smc911x_resources[0].start = cs_mem_base + 0x0;
-       ldp_smc911x_resources[0].end   = cs_mem_base + 0xff;
+       ldp_smsc911x_resources[0].start = cs_mem_base + 0x0;
+       ldp_smsc911x_resources[0].end   = cs_mem_base + 0xff;
        udelay(100);
 
-       eth_gpio = LDP_SMC911X_GPIO;
+       eth_gpio = LDP_SMSC911X_GPIO;
 
-       ldp_smc911x_resources[1].start = OMAP_GPIO_IRQ(eth_gpio);
+       ldp_smsc911x_resources[1].start = OMAP_GPIO_IRQ(eth_gpio);
 
-       if (gpio_request(eth_gpio, "smc911x irq") < 0) {
-               printk(KERN_ERR "Failed to request GPIO%d for smc911x IRQ\n",
+       if (gpio_request(eth_gpio, "smsc911x irq") < 0) {
+               printk(KERN_ERR "Failed to request GPIO%d for smsc911x IRQ\n",
                                eth_gpio);
                return;
        }
@@ -104,7 +115,7 @@ static void __init omap_ldp_init_irq(void)
        omap2_init_common_hw(NULL);
        omap_init_irq();
        omap_gpio_init();
-       ldp_init_smc911x();
+       ldp_init_smsc911x();
 }
 
 static struct omap_uart_config ldp_uart_config __initdata = {
index b3f6e9d81807c9197c4b9d63ef52891f82c90844..b1f23bea863fec71ff972b5c9aec8952222f9fc4 100644 (file)
@@ -57,6 +57,9 @@
 #define GPMC_CS0_BASE  0x60
 #define GPMC_CS_SIZE   0x30
 
+#define OVERO_SMSC911X_CS      5
+#define OVERO_SMSC911X_GPIO    176
+
 #if defined(CONFIG_TOUCHSCREEN_ADS7846) || \
        defined(CONFIG_TOUCHSCREEN_ADS7846_MODULE)
 
@@ -116,6 +119,67 @@ static void __init overo_ads7846_init(void)
 static inline void __init overo_ads7846_init(void) { return; }
 #endif
 
+#if defined(CONFIG_SMSC911X) || defined(CONFIG_SMSC911X_MODULE)
+
+#include <linux/smsc911x.h>
+
+static struct resource overo_smsc911x_resources[] = {
+       {
+               .name   = "smsc911x-memory",
+               .flags  = IORESOURCE_MEM,
+       },
+       {
+               .flags  = IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL,
+       },
+};
+
+static struct smsc911x_platform_config overo_smsc911x_config = {
+       .irq_polarity   = SMSC911X_IRQ_POLARITY_ACTIVE_LOW,
+       .irq_type       = SMSC911X_IRQ_TYPE_OPEN_DRAIN,
+       .flags          = SMSC911X_USE_32BIT,
+       .phy_interface  = PHY_INTERFACE_MODE_MII,
+};
+
+static struct platform_device overo_smsc911x_device = {
+       .name           = "smsc911x",
+       .id             = -1,
+       .num_resources  = ARRAY_SIZE(overo_smsc911x_resources),
+       .resource       = overo_smsc911x_resources,
+       .dev            = {
+               .platform_data = &overo_smsc911x_config,
+       },
+};
+
+static inline void __init overo_init_smsc911x(void)
+{
+       unsigned long cs_mem_base;
+
+       if (gpmc_cs_request(OVERO_SMSC911X_CS, SZ_16M, &cs_mem_base) < 0) {
+               printk(KERN_ERR "Failed to request GPMC mem for smsc911x\n");
+               return;
+       }
+
+       overo_smsc911x_resources[0].start = cs_mem_base + 0x0;
+       overo_smsc911x_resources[0].end   = cs_mem_base + 0xff;
+
+       if ((gpio_request(OVERO_SMSC911X_GPIO, "SMSC911X IRQ") == 0) &&
+           (gpio_direction_input(OVERO_SMSC911X_GPIO) == 0)) {
+               gpio_export(OVERO_SMSC911X_GPIO, 0);
+       } else {
+               printk(KERN_ERR "could not obtain gpio for SMSC911X IRQ\n");
+               return;
+       }
+
+       overo_smsc911x_resources[1].start = OMAP_GPIO_IRQ(OVERO_SMSC911X_GPIO);
+       overo_smsc911x_resources[1].end   = 0;
+
+       platform_device_register(&overo_smsc911x_device);
+}
+
+#else
+static inline void __init overo_init_smsc911x(void) { return; }
+#endif
+
 static struct mtd_partition overo_nand_partitions[] = {
        {
                .name           = "xloader",
@@ -290,6 +354,7 @@ static void __init overo_init(void)
        overo_flash_init();
        usb_musb_init();
        overo_ads7846_init();
+       overo_init_smsc911x();
 
        if ((gpio_request(OVERO_GPIO_W2W_NRESET,
                          "OVERO_GPIO_W2W_NRESET") == 0) &&
index d6766685cfc7d1b9e1c2eb95d94cdfbbc9798aca..9ab947c14f260c4be91dbc988d30e64954bf4e46 100644 (file)
@@ -28,7 +28,7 @@
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
 #include <linux/io.h>
-#include <linux/smc911x.h>
+#include <linux/smsc911x.h>
 #include <linux/ata_platform.h>
 
 #include <asm/clkdev.h>
@@ -128,14 +128,15 @@ int realview_flash_register(struct resource *res, u32 num)
        return platform_device_register(&realview_flash_device);
 }
 
-static struct smc911x_platdata realview_smc911x_platdata = {
-       .flags          = SMC911X_USE_32BIT,
-       .irq_flags      = IRQF_SHARED,
-       .irq_polarity   = 1,
+static struct smsc911x_platform_config smsc911x_config = {
+       .flags          = SMSC911X_USE_32BIT,
+       .irq_polarity   = SMSC911X_IRQ_POLARITY_ACTIVE_HIGH,
+       .irq_type       = SMSC911X_IRQ_TYPE_PUSH_PULL,
+       .phy_interface  = PHY_INTERFACE_MODE_MII,
 };
 
 static struct platform_device realview_eth_device = {
-       .name           = "smc911x",
+       .name           = "smsc911x",
        .id             = 0,
        .num_resources  = 2,
 };
@@ -145,8 +146,8 @@ int realview_eth_register(const char *name, struct resource *res)
        if (name)
                realview_eth_device.name = name;
        realview_eth_device.resource = res;
-       if (strcmp(realview_eth_device.name, "smc911x") == 0)
-               realview_eth_device.dev.platform_data = &realview_smc911x_platdata;
+       if (strcmp(realview_eth_device.name, "smsc911x") == 0)
+               realview_eth_device.dev.platform_data = &smsc911x_config;
 
        return platform_device_register(&realview_eth_device);
 }
index 67d6d9cc68b2a693b5edc2a89aa8d78125e294d3..d0d39adf640777c9f56222fbe9be9028f0926243 100644 (file)
@@ -191,6 +191,7 @@ void __cpuinit local_timer_setup(void)
        clk->name               = "dummy_timer";
        clk->features           = CLOCK_EVT_FEAT_DUMMY;
        clk->rating             = 200;
+       clk->mult               = 1;
        clk->set_mode           = dummy_timer_set_mode;
        clk->broadcast          = smp_timer_broadcast;
        clk->cpumask            = cpumask_of(cpu);
index 94077fbd96b7691850275d71114eb2b9d328f5d3..6f7e70907e443c708b6cd67e59df07ba86f56302 100644 (file)
@@ -29,10 +29,10 @@ ENTRY(v6_early_abort)
        mrc     p15, 0, r1, c5, c0, 0           @ get FSR
        mrc     p15, 0, r0, c6, c0, 0           @ get FAR
 /*
- * Faulty SWP instruction on 1136 doesn't set bit 11 in DFSR.
+ * Faulty SWP instruction on 1136 doesn't set bit 11 in DFSR (erratum 326103).
  * The test below covers all the write situations, including Java bytecodes
  */
-       bic     r1, r1, #1 << 11 | 1 << 10      @ clear bits 11 and 10 of FSR
+       bic     r1, r1, #1 << 11                @ clear bit 11 of FSR
        tst     r3, #PSR_J_BIT                  @ Java?
        movne   pc, lr
        do_thumb_abort
index d6dd83826f8af0baffd833ad11807d18377d5cfb..6e77c042d8e9417ad5b9141c6eab37767693e3ed 100644 (file)
@@ -115,6 +115,10 @@ static inline void l2_inv_pa_range(unsigned long start, unsigned long end)
        raw_local_irq_restore(flags);
 }
 
+static inline void l2_inv_all(void)
+{
+       __asm__("mcr p15, 1, %0, c15, c11, 0" : : "r" (0));
+}
 
 /*
  * Linux primitives.
@@ -254,9 +258,7 @@ static void __init enable_dcache(void)
 
 static void __init __invalidate_icache(void)
 {
-       int dummy;
-
-       __asm__ __volatile__("mcr p15, 0, %0, c7, c5, 0" : "=r" (dummy));
+       __asm__("mcr p15, 0, %0, c7, c5, 0" : : "r" (0));
 }
 
 static int __init invalidate_and_disable_icache(void)
@@ -321,6 +323,7 @@ static void __init enable_l2(void)
 
                d = flush_and_disable_dcache();
                i = invalidate_and_disable_icache();
+               l2_inv_all();
                write_extra_features(u | 0x00400000);
                if (i)
                        enable_icache();
index ba592a9e6fb36cfa868c2b4a8ade37248c0743b8..a2bed62aec21900ba3b023c7a9c39a359ea123f1 100644 (file)
  *  r10 = thread_info structure
  *  lr  = failure return
  */
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/asm-offsets.h>
-#include <asm/assembler.h>
+#include <asm/thread_info.h>
 #include <asm/vfpmacros.h>
+#include "../kernel/entry-header.S"
 
 ENTRY(do_vfp)
+#ifdef CONFIG_PREEMPT
+       ldr     r4, [r10, #TI_PREEMPT]  @ get preempt count
+       add     r11, r4, #1             @ increment it
+       str     r11, [r10, #TI_PREEMPT]
+#endif
        enable_irq
        ldr     r4, .LCvfp
        ldr     r11, [r10, #TI_CPU]     @ CPU number
@@ -30,6 +33,12 @@ ENTRY(do_vfp)
 ENDPROC(do_vfp)
 
 ENTRY(vfp_null_entry)
+#ifdef CONFIG_PREEMPT
+       get_thread_info r10
+       ldr     r4, [r10, #TI_PREEMPT]  @ get preempt count
+       sub     r11, r4, #1             @ decrement it
+       str     r11, [r10, #TI_PREEMPT]
+#endif
        mov     pc, lr
 ENDPROC(vfp_null_entry)
 
@@ -41,6 +50,12 @@ ENDPROC(vfp_null_entry)
 
        __INIT
 ENTRY(vfp_testing_entry)
+#ifdef CONFIG_PREEMPT
+       get_thread_info r10
+       ldr     r4, [r10, #TI_PREEMPT]  @ get preempt count
+       sub     r11, r4, #1             @ decrement it
+       str     r11, [r10, #TI_PREEMPT]
+#endif
        ldr     r0, VFP_arch_address
        str     r5, [r0]                @ known non-zero value
        mov     pc, r9                  @ we have handled the fault
index a5a4e57763c391598bc089102c90af94e89def6a..83c4e384b16d07efa738e293ae05ef79ea61be1f 100644 (file)
@@ -137,6 +137,12 @@ check_for_exception:
        VFPFMXR FPEXC, r1               @ restore FPEXC last
        sub     r2, r2, #4
        str     r2, [sp, #S_PC]         @ retry the instruction
+#ifdef CONFIG_PREEMPT
+       get_thread_info r10
+       ldr     r4, [r10, #TI_PREEMPT]  @ get preempt count
+       sub     r11, r4, #1             @ decrement it
+       str     r11, [r10, #TI_PREEMPT]
+#endif
        mov     pc, r9                  @ we think we have handled things
 
 
@@ -155,6 +161,12 @@ look_for_VFP_exceptions:
        @ not recognised by VFP
 
        DBGSTR  "not VFP"
+#ifdef CONFIG_PREEMPT
+       get_thread_info r10
+       ldr     r4, [r10, #TI_PREEMPT]  @ get preempt count
+       sub     r11, r4, #1             @ decrement it
+       str     r11, [r10, #TI_PREEMPT]
+#endif
        mov     pc, lr
 
 process_exception:
index 75457b30d813c1d3a359e346e418bbc4725dbea2..01599c4ef7266f9f3eb27b56bbe0cdd3f3931121 100644 (file)
@@ -266,7 +266,7 @@ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs)
                 * on VFP subarch 1.
                 */
                 vfp_raise_exceptions(VFP_EXCEPTION_ERROR, trigger, fpscr, regs);
-                return;
+               goto exit;
        }
 
        /*
@@ -297,7 +297,7 @@ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs)
         * the FPEXC.FP2V bit is valid only if FPEXC.EX is 1.
         */
        if (fpexc ^ (FPEXC_EX | FPEXC_FP2V))
-               return;
+               goto exit;
 
        /*
         * The barrier() here prevents fpinst2 being read
@@ -310,6 +310,8 @@ void VFP_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs)
        exceptions = vfp_emulate_instruction(trigger, orig_fpscr, regs);
        if (exceptions)
                vfp_raise_exceptions(exceptions, trigger, orig_fpscr, regs);
+ exit:
+       preempt_enable();
 }
 
 static void vfp_enable(void *unused)
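The preempt-count arithmetic added in the VFP assembly above is balanced either by the matching decrements on the early-exit paths or by the preempt_enable() now placed at the end of VFP_bounce(). A conceptual C equivalent of that pairing, using the ordinary primitives (a sketch only, not code from this patch):

static void vfp_preempt_pairing_sketch(void)
{
	preempt_disable();	/* what the TI_PREEMPT increment in do_vfp amounts to */

	/* ... the trap is decoded; work may be handed to VFP_bounce() ... */

	preempt_enable();	/* done either on the assembly exit paths or by
				 * the 'exit:' label at the end of VFP_bounce() */
}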
index fedf3e326121cc6e61b3459b552c2cd993bcfe0b..fb8a06b9ab6a6dd39fc385956983d7c3b8d14339 100644 (file)
@@ -1,5 +1,378 @@
-#ifdef __uClinux__
-#include "bootinfo_no.h"
-#else
-#include "bootinfo_mm.h"
+/*
+** asm/bootinfo.h -- Definition of the Linux/m68k boot information structure
+**
+** Copyright 1992 by Greg Harp
+**
+** This file is subject to the terms and conditions of the GNU General Public
+** License.  See the file COPYING in the main directory of this archive
+** for more details.
+**
+** Created 09/29/92 by Greg Harp
+**
+** 5/2/94 Roman Hodek:
+**   Added bi_atari part of the machine dependent union bi_un; for now it
+**   contains just a model field to distinguish between TT and Falcon.
+** 26/7/96 Roman Zippel:
+**   Renamed to setup.h; added some useful macros to allow gcc some
+**   optimizations if possible.
+** 5/10/96 Geert Uytterhoeven:
+**   Redesign of the boot information structure; renamed to bootinfo.h again
+** 27/11/96 Geert Uytterhoeven:
+**   Backwards compatibility with bootinfo interface version 1.0
+*/
+
+#ifndef _M68K_BOOTINFO_H
+#define _M68K_BOOTINFO_H
+
+
+    /*
+     *  Bootinfo definitions
+     *
+     *  This is an easily parsable and extendable structure containing all
+     *  information to be passed from the bootstrap to the kernel.
+     *
+     *  This way I hope to keep all future changes back/forwards compatible.
+     *  Thus, keep your fingers crossed...
+     *
+     *  This structure is copied right after the kernel bss by the bootstrap
+     *  routine.
+     */
+
+#ifndef __ASSEMBLY__
+
+struct bi_record {
+    unsigned short tag;                        /* tag ID */
+    unsigned short size;               /* size of record (in bytes) */
+    unsigned long data[0];             /* data */
+};
+
+#endif /* __ASSEMBLY__ */
+
+
+    /*
+     *  Tag Definitions
+     *
+     *  Machine independent tags start counting from 0x0000
+     *  Machine dependent tags start counting from 0x8000
+     */
+
+#define BI_LAST                        0x0000  /* last record (sentinel) */
+#define BI_MACHTYPE            0x0001  /* machine type (u_long) */
+#define BI_CPUTYPE             0x0002  /* cpu type (u_long) */
+#define BI_FPUTYPE             0x0003  /* fpu type (u_long) */
+#define BI_MMUTYPE             0x0004  /* mmu type (u_long) */
+#define BI_MEMCHUNK            0x0005  /* memory chunk address and size */
+                                       /* (struct mem_info) */
+#define BI_RAMDISK             0x0006  /* ramdisk address and size */
+                                       /* (struct mem_info) */
+#define BI_COMMAND_LINE                0x0007  /* kernel command line parameters */
+                                       /* (string) */
+
+    /*
+     *  Amiga-specific tags
+     */
+
+#define BI_AMIGA_MODEL         0x8000  /* model (u_long) */
+#define BI_AMIGA_AUTOCON       0x8001  /* AutoConfig device */
+                                       /* (struct ConfigDev) */
+#define BI_AMIGA_CHIP_SIZE     0x8002  /* size of Chip RAM (u_long) */
+#define BI_AMIGA_VBLANK                0x8003  /* VBLANK frequency (u_char) */
+#define BI_AMIGA_PSFREQ                0x8004  /* power supply frequency (u_char) */
+#define BI_AMIGA_ECLOCK                0x8005  /* EClock frequency (u_long) */
+#define BI_AMIGA_CHIPSET       0x8006  /* native chipset present (u_long) */
+#define BI_AMIGA_SERPER                0x8007  /* serial port period (u_short) */
+
+    /*
+     *  Atari-specific tags
+     */
+
+#define BI_ATARI_MCH_COOKIE    0x8000  /* _MCH cookie from TOS (u_long) */
+#define BI_ATARI_MCH_TYPE      0x8001  /* special machine type (u_long) */
+                                       /* (values are ATARI_MACH_* defines) */
+
+/* mch_cookie values (upper word) */
+#define ATARI_MCH_ST           0
+#define ATARI_MCH_STE          1
+#define ATARI_MCH_TT           2
+#define ATARI_MCH_FALCON       3
+
+/* mch_type values */
+#define ATARI_MACH_NORMAL      0       /* no special machine type */
+#define ATARI_MACH_MEDUSA      1       /* Medusa 040 */
+#define ATARI_MACH_HADES       2       /* Hades 040 or 060 */
+#define ATARI_MACH_AB40                3       /* Afterburner040 on Falcon */
+
+    /*
+     *  VME-specific tags
+     */
+
+#define BI_VME_TYPE            0x8000  /* VME sub-architecture (u_long) */
+#define BI_VME_BRDINFO         0x8001  /* VME board information (struct) */
+
+/* BI_VME_TYPE codes */
+#define        VME_TYPE_TP34V          0x0034  /* Tadpole TP34V */
+#define VME_TYPE_MVME147       0x0147  /* Motorola MVME147 */
+#define VME_TYPE_MVME162       0x0162  /* Motorola MVME162 */
+#define VME_TYPE_MVME166       0x0166  /* Motorola MVME166 */
+#define VME_TYPE_MVME167       0x0167  /* Motorola MVME167 */
+#define VME_TYPE_MVME172       0x0172  /* Motorola MVME172 */
+#define VME_TYPE_MVME177       0x0177  /* Motorola MVME177 */
+#define VME_TYPE_BVME4000      0x4000  /* BVM Ltd. BVME4000 */
+#define VME_TYPE_BVME6000      0x6000  /* BVM Ltd. BVME6000 */
+
+/* BI_VME_BRDINFO is a 32 byte struct as returned by the Bug code on
+ * Motorola VME boards.  Contains board number, Bug version, board
+ * configuration options, etc.  See include/asm/mvme16xhw.h for details.
+ */
+
+
+    /*
+     *  Macintosh-specific tags (all u_long)
+     */
+
+#define BI_MAC_MODEL           0x8000  /* Mac Gestalt ID (model type) */
+#define BI_MAC_VADDR           0x8001  /* Mac video base address */
+#define BI_MAC_VDEPTH          0x8002  /* Mac video depth */
+#define BI_MAC_VROW            0x8003  /* Mac video rowbytes */
+#define BI_MAC_VDIM            0x8004  /* Mac video dimensions */
+#define BI_MAC_VLOGICAL                0x8005  /* Mac video logical base */
+#define BI_MAC_SCCBASE         0x8006  /* Mac SCC base address */
+#define BI_MAC_BTIME           0x8007  /* Mac boot time */
+#define BI_MAC_GMTBIAS         0x8008  /* Mac GMT timezone offset */
+#define BI_MAC_MEMSIZE         0x8009  /* Mac RAM size (sanity check) */
+#define BI_MAC_CPUID           0x800a  /* Mac CPU type (sanity check) */
+#define BI_MAC_ROMBASE         0x800b  /* Mac system ROM base address */
+
+    /*
+     *  Macintosh hardware profile data - unused, see macintosh.h for
+     *  reasonable type values
+     */
+
+#define BI_MAC_VIA1BASE                0x8010  /* Mac VIA1 base address (always present) */
+#define BI_MAC_VIA2BASE                0x8011  /* Mac VIA2 base address (type varies) */
+#define BI_MAC_VIA2TYPE                0x8012  /* Mac VIA2 type (VIA, RBV, OSS) */
+#define BI_MAC_ADBTYPE         0x8013  /* Mac ADB interface type */
+#define BI_MAC_ASCBASE         0x8014  /* Mac Apple Sound Chip base address */
+#define BI_MAC_SCSI5380                0x8015  /* Mac NCR 5380 SCSI (base address, multi) */
+#define BI_MAC_SCSIDMA         0x8016  /* Mac SCSI DMA (base address) */
+#define BI_MAC_SCSI5396                0x8017  /* Mac NCR 53C96 SCSI (base address, multi) */
+#define BI_MAC_IDETYPE         0x8018  /* Mac IDE interface type */
+#define BI_MAC_IDEBASE         0x8019  /* Mac IDE interface base address */
+#define BI_MAC_NUBUS           0x801a  /* Mac Nubus type (none, regular, pseudo) */
+#define BI_MAC_SLOTMASK                0x801b  /* Mac Nubus slots present */
+#define BI_MAC_SCCTYPE         0x801c  /* Mac SCC serial type (normal, IOP) */
+#define BI_MAC_ETHTYPE         0x801d  /* Mac builtin ethernet type (Sonic, MACE) */
+#define BI_MAC_ETHBASE         0x801e  /* Mac builtin ethernet base address */
+#define BI_MAC_PMU             0x801f  /* Mac power management / poweroff hardware */
+#define BI_MAC_IOP_SWIM                0x8020  /* Mac SWIM floppy IOP */
+#define BI_MAC_IOP_ADB         0x8021  /* Mac ADB IOP */
+
+    /*
+     * Mac: compatibility with old booter data format (temporarily)
+     * Fields unused with the new bootinfo can be deleted now; instead of
+     * adding new fields the struct might be split into a hardware address
+     * part and a hardware type part
+     */
+
+#ifndef __ASSEMBLY__
+
+struct mac_booter_data
+{
+       unsigned long videoaddr;
+       unsigned long videorow;
+       unsigned long videodepth;
+       unsigned long dimensions;
+       unsigned long args;
+       unsigned long boottime;
+       unsigned long gmtbias;
+       unsigned long bootver;
+       unsigned long videological;
+       unsigned long sccbase;
+       unsigned long id;
+       unsigned long memsize;
+       unsigned long serialmf;
+       unsigned long serialhsk;
+       unsigned long serialgpi;
+       unsigned long printmf;
+       unsigned long printhsk;
+       unsigned long printgpi;
+       unsigned long cpuid;
+       unsigned long rombase;
+       unsigned long adbdelay;
+       unsigned long timedbra;
+};
+
+extern struct mac_booter_data
+       mac_bi_data;
+
 #endif
+
+    /*
+     *  Apollo-specific tags
+     */
+
+#define BI_APOLLO_MODEL         0x8000  /* model (u_long) */
+
+    /*
+     *  HP300-specific tags
+     */
+
+#define BI_HP300_MODEL         0x8000  /* model (u_long) */
+#define BI_HP300_UART_SCODE    0x8001  /* UART select code (u_long) */
+#define BI_HP300_UART_ADDR     0x8002  /* phys. addr of UART (u_long) */
+
+    /*
+     * Stuff for bootinfo interface versioning
+     *
+     * At the start of kernel code, a 'struct bootversion' is located.
+     * bootstrap checks for a matching version of the interface before booting
+     * a kernel, to avoid user confusion if kernel and bootstrap don't work
+     * together :-)
+     *
+     * If incompatible changes are made to the bootinfo interface, the major
+     * number below should be stepped (and the minor reset to 0) for the
+     * appropriate machine. If a change is backward-compatible, the minor
+     * should be stepped. "Backwards-compatible" means that booting will work,
+     * but certain features may not.
+     */
+
+#define BOOTINFOV_MAGIC                        0x4249561A      /* 'BIV^Z' */
+#define MK_BI_VERSION(major,minor)     (((major)<<16)+(minor))
+#define BI_VERSION_MAJOR(v)            (((v) >> 16) & 0xffff)
+#define BI_VERSION_MINOR(v)            ((v) & 0xffff)
+
+#ifndef __ASSEMBLY__
+
+struct bootversion {
+    unsigned short branch;
+    unsigned long magic;
+    struct {
+       unsigned long machtype;
+       unsigned long version;
+    } machversions[0];
+};
+
+#endif /* __ASSEMBLY__ */
+
+#define AMIGA_BOOTI_VERSION    MK_BI_VERSION( 2, 0 )
+#define ATARI_BOOTI_VERSION    MK_BI_VERSION( 2, 1 )
+#define MAC_BOOTI_VERSION      MK_BI_VERSION( 2, 0 )
+#define MVME147_BOOTI_VERSION  MK_BI_VERSION( 2, 0 )
+#define MVME16x_BOOTI_VERSION  MK_BI_VERSION( 2, 0 )
+#define BVME6000_BOOTI_VERSION MK_BI_VERSION( 2, 0 )
+#define Q40_BOOTI_VERSION      MK_BI_VERSION( 2, 0 )
+#define HP300_BOOTI_VERSION    MK_BI_VERSION( 2, 0 )
+
+#ifdef BOOTINFO_COMPAT_1_0
+
+    /*
+     *  Backwards compatibility with bootinfo interface version 1.0
+     */
+
+#define COMPAT_AMIGA_BOOTI_VERSION    MK_BI_VERSION( 1, 0 )
+#define COMPAT_ATARI_BOOTI_VERSION    MK_BI_VERSION( 1, 0 )
+#define COMPAT_MAC_BOOTI_VERSION      MK_BI_VERSION( 1, 0 )
+
+#include <linux/zorro.h>
+
+#define COMPAT_NUM_AUTO    16
+
+struct compat_bi_Amiga {
+    int model;
+    int num_autocon;
+    struct ConfigDev autocon[COMPAT_NUM_AUTO];
+    unsigned long chip_size;
+    unsigned char vblank;
+    unsigned char psfreq;
+    unsigned long eclock;
+    unsigned long chipset;
+    unsigned long hw_present;
+};
+
+struct compat_bi_Atari {
+    unsigned long hw_present;
+    unsigned long mch_cookie;
+};
+
+#ifndef __ASSEMBLY__
+
+struct compat_bi_Macintosh
+{
+       unsigned long videoaddr;
+       unsigned long videorow;
+       unsigned long videodepth;
+       unsigned long dimensions;
+       unsigned long args;
+       unsigned long boottime;
+       unsigned long gmtbias;
+       unsigned long bootver;
+       unsigned long videological;
+       unsigned long sccbase;
+       unsigned long id;
+       unsigned long memsize;
+       unsigned long serialmf;
+       unsigned long serialhsk;
+       unsigned long serialgpi;
+       unsigned long printmf;
+       unsigned long printhsk;
+       unsigned long printgpi;
+       unsigned long cpuid;
+       unsigned long rombase;
+       unsigned long adbdelay;
+       unsigned long timedbra;
+};
+
+#endif
+
+struct compat_mem_info {
+    unsigned long addr;
+    unsigned long size;
+};
+
+#define COMPAT_NUM_MEMINFO  4
+
+#define COMPAT_CPUB_68020 0
+#define COMPAT_CPUB_68030 1
+#define COMPAT_CPUB_68040 2
+#define COMPAT_CPUB_68060 3
+#define COMPAT_FPUB_68881 5
+#define COMPAT_FPUB_68882 6
+#define COMPAT_FPUB_68040 7
+#define COMPAT_FPUB_68060 8
+
+#define COMPAT_CPU_68020    (1<<COMPAT_CPUB_68020)
+#define COMPAT_CPU_68030    (1<<COMPAT_CPUB_68030)
+#define COMPAT_CPU_68040    (1<<COMPAT_CPUB_68040)
+#define COMPAT_CPU_68060    (1<<COMPAT_CPUB_68060)
+#define COMPAT_CPU_MASK     (31)
+#define COMPAT_FPU_68881    (1<<COMPAT_FPUB_68881)
+#define COMPAT_FPU_68882    (1<<COMPAT_FPUB_68882)
+#define COMPAT_FPU_68040    (1<<COMPAT_FPUB_68040)
+#define COMPAT_FPU_68060    (1<<COMPAT_FPUB_68060)
+#define COMPAT_FPU_MASK     (0xfe0)
+
+#define COMPAT_CL_SIZE      (256)
+
+struct compat_bootinfo {
+    unsigned long machtype;
+    unsigned long cputype;
+    struct compat_mem_info memory[COMPAT_NUM_MEMINFO];
+    int num_memory;
+    unsigned long ramdisk_size;
+    unsigned long ramdisk_addr;
+    char command_line[COMPAT_CL_SIZE];
+    union {
+       struct compat_bi_Amiga     bi_ami;
+       struct compat_bi_Atari     bi_ata;
+       struct compat_bi_Macintosh bi_mac;
+    } bi_un;
+};
+
+#define bi_amiga       bi_un.bi_ami
+#define bi_atari       bi_un.bi_ata
+#define bi_mac         bi_un.bi_mac
+
+#endif /* BOOTINFO_COMPAT_1_0 */
+
+
+#endif /* _M68K_BOOTINFO_H */
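A minimal sketch of how a consumer walks these records, based only on the struct bi_record layout and the BI_LAST sentinel defined above (the function name and the origin of the start pointer are assumptions for illustration):

static void __init example_parse_bootinfo(const struct bi_record *record)
{
	while (record->tag != BI_LAST) {
		switch (record->tag) {
		case BI_MACHTYPE:
			/* record->data[0] carries the machine type (u_long) */
			break;
		case BI_COMMAND_LINE:
			/* record->data holds the command-line string */
			break;
		default:
			/* unknown tags are skipped using their size field */
			break;
		}
		/* size is the whole record length in bytes, header included */
		record = (const struct bi_record *)
			 ((unsigned long)record + record->size);
	}
}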
diff --git a/arch/m68k/include/asm/bootinfo_mm.h b/arch/m68k/include/asm/bootinfo_mm.h
deleted file mode 100644 (file)
index fb8a06b..0000000
+++ /dev/null
@@ -1,378 +0,0 @@
-/*
-** asm/bootinfo.h -- Definition of the Linux/m68k boot information structure
-**
-** Copyright 1992 by Greg Harp
-**
-** This file is subject to the terms and conditions of the GNU General Public
-** License.  See the file COPYING in the main directory of this archive
-** for more details.
-**
-** Created 09/29/92 by Greg Harp
-**
-** 5/2/94 Roman Hodek:
-**   Added bi_atari part of the machine dependent union bi_un; for now it
-**   contains just a model field to distinguish between TT and Falcon.
-** 26/7/96 Roman Zippel:
-**   Renamed to setup.h; added some useful macros to allow gcc some
-**   optimizations if possible.
-** 5/10/96 Geert Uytterhoeven:
-**   Redesign of the boot information structure; renamed to bootinfo.h again
-** 27/11/96 Geert Uytterhoeven:
-**   Backwards compatibility with bootinfo interface version 1.0
-*/
-
-#ifndef _M68K_BOOTINFO_H
-#define _M68K_BOOTINFO_H
-
-
-    /*
-     *  Bootinfo definitions
-     *
-     *  This is an easily parsable and extendable structure containing all
-     *  information to be passed from the bootstrap to the kernel.
-     *
-     *  This way I hope to keep all future changes back/forewards compatible.
-     *  Thus, keep your fingers crossed...
-     *
-     *  This structure is copied right after the kernel bss by the bootstrap
-     *  routine.
-     */
-
-#ifndef __ASSEMBLY__
-
-struct bi_record {
-    unsigned short tag;                        /* tag ID */
-    unsigned short size;               /* size of record (in bytes) */
-    unsigned long data[0];             /* data */
-};
-
-#endif /* __ASSEMBLY__ */
-
-
-    /*
-     *  Tag Definitions
-     *
-     *  Machine independent tags start counting from 0x0000
-     *  Machine dependent tags start counting from 0x8000
-     */
-
-#define BI_LAST                        0x0000  /* last record (sentinel) */
-#define BI_MACHTYPE            0x0001  /* machine type (u_long) */
-#define BI_CPUTYPE             0x0002  /* cpu type (u_long) */
-#define BI_FPUTYPE             0x0003  /* fpu type (u_long) */
-#define BI_MMUTYPE             0x0004  /* mmu type (u_long) */
-#define BI_MEMCHUNK            0x0005  /* memory chunk address and size */
-                                       /* (struct mem_info) */
-#define BI_RAMDISK             0x0006  /* ramdisk address and size */
-                                       /* (struct mem_info) */
-#define BI_COMMAND_LINE                0x0007  /* kernel command line parameters */
-                                       /* (string) */
-
-    /*
-     *  Amiga-specific tags
-     */
-
-#define BI_AMIGA_MODEL         0x8000  /* model (u_long) */
-#define BI_AMIGA_AUTOCON       0x8001  /* AutoConfig device */
-                                       /* (struct ConfigDev) */
-#define BI_AMIGA_CHIP_SIZE     0x8002  /* size of Chip RAM (u_long) */
-#define BI_AMIGA_VBLANK                0x8003  /* VBLANK frequency (u_char) */
-#define BI_AMIGA_PSFREQ                0x8004  /* power supply frequency (u_char) */
-#define BI_AMIGA_ECLOCK                0x8005  /* EClock frequency (u_long) */
-#define BI_AMIGA_CHIPSET       0x8006  /* native chipset present (u_long) */
-#define BI_AMIGA_SERPER                0x8007  /* serial port period (u_short) */
-
-    /*
-     *  Atari-specific tags
-     */
-
-#define BI_ATARI_MCH_COOKIE    0x8000  /* _MCH cookie from TOS (u_long) */
-#define BI_ATARI_MCH_TYPE      0x8001  /* special machine type (u_long) */
-                                       /* (values are ATARI_MACH_* defines */
-
-/* mch_cookie values (upper word) */
-#define ATARI_MCH_ST           0
-#define ATARI_MCH_STE          1
-#define ATARI_MCH_TT           2
-#define ATARI_MCH_FALCON       3
-
-/* mch_type values */
-#define ATARI_MACH_NORMAL      0       /* no special machine type */
-#define ATARI_MACH_MEDUSA      1       /* Medusa 040 */
-#define ATARI_MACH_HADES       2       /* Hades 040 or 060 */
-#define ATARI_MACH_AB40                3       /* Afterburner040 on Falcon */
-
-    /*
-     *  VME-specific tags
-     */
-
-#define BI_VME_TYPE            0x8000  /* VME sub-architecture (u_long) */
-#define BI_VME_BRDINFO         0x8001  /* VME board information (struct) */
-
-/* BI_VME_TYPE codes */
-#define        VME_TYPE_TP34V          0x0034  /* Tadpole TP34V */
-#define VME_TYPE_MVME147       0x0147  /* Motorola MVME147 */
-#define VME_TYPE_MVME162       0x0162  /* Motorola MVME162 */
-#define VME_TYPE_MVME166       0x0166  /* Motorola MVME166 */
-#define VME_TYPE_MVME167       0x0167  /* Motorola MVME167 */
-#define VME_TYPE_MVME172       0x0172  /* Motorola MVME172 */
-#define VME_TYPE_MVME177       0x0177  /* Motorola MVME177 */
-#define VME_TYPE_BVME4000      0x4000  /* BVM Ltd. BVME4000 */
-#define VME_TYPE_BVME6000      0x6000  /* BVM Ltd. BVME6000 */
-
-/* BI_VME_BRDINFO is a 32 byte struct as returned by the Bug code on
- * Motorola VME boards.  Contains board number, Bug version, board
- * configuration options, etc.  See include/asm/mvme16xhw.h for details.
- */
-
-
-    /*
-     *  Macintosh-specific tags (all u_long)
-     */
-
-#define BI_MAC_MODEL           0x8000  /* Mac Gestalt ID (model type) */
-#define BI_MAC_VADDR           0x8001  /* Mac video base address */
-#define BI_MAC_VDEPTH          0x8002  /* Mac video depth */
-#define BI_MAC_VROW            0x8003  /* Mac video rowbytes */
-#define BI_MAC_VDIM            0x8004  /* Mac video dimensions */
-#define BI_MAC_VLOGICAL                0x8005  /* Mac video logical base */
-#define BI_MAC_SCCBASE         0x8006  /* Mac SCC base address */
-#define BI_MAC_BTIME           0x8007  /* Mac boot time */
-#define BI_MAC_GMTBIAS         0x8008  /* Mac GMT timezone offset */
-#define BI_MAC_MEMSIZE         0x8009  /* Mac RAM size (sanity check) */
-#define BI_MAC_CPUID           0x800a  /* Mac CPU type (sanity check) */
-#define BI_MAC_ROMBASE         0x800b  /* Mac system ROM base address */
-
-    /*
-     *  Macintosh hardware profile data - unused, see macintosh.h for
-     *  resonable type values
-     */
-
-#define BI_MAC_VIA1BASE                0x8010  /* Mac VIA1 base address (always present) */
-#define BI_MAC_VIA2BASE                0x8011  /* Mac VIA2 base address (type varies) */
-#define BI_MAC_VIA2TYPE                0x8012  /* Mac VIA2 type (VIA, RBV, OSS) */
-#define BI_MAC_ADBTYPE         0x8013  /* Mac ADB interface type */
-#define BI_MAC_ASCBASE         0x8014  /* Mac Apple Sound Chip base address */
-#define BI_MAC_SCSI5380                0x8015  /* Mac NCR 5380 SCSI (base address, multi) */
-#define BI_MAC_SCSIDMA         0x8016  /* Mac SCSI DMA (base address) */
-#define BI_MAC_SCSI5396                0x8017  /* Mac NCR 53C96 SCSI (base address, multi) */
-#define BI_MAC_IDETYPE         0x8018  /* Mac IDE interface type */
-#define BI_MAC_IDEBASE         0x8019  /* Mac IDE interface base address */
-#define BI_MAC_NUBUS           0x801a  /* Mac Nubus type (none, regular, pseudo) */
-#define BI_MAC_SLOTMASK                0x801b  /* Mac Nubus slots present */
-#define BI_MAC_SCCTYPE         0x801c  /* Mac SCC serial type (normal, IOP) */
-#define BI_MAC_ETHTYPE         0x801d  /* Mac builtin ethernet type (Sonic, MACE */
-#define BI_MAC_ETHBASE         0x801e  /* Mac builtin ethernet base address */
-#define BI_MAC_PMU             0x801f  /* Mac power management / poweroff hardware */
-#define BI_MAC_IOP_SWIM                0x8020  /* Mac SWIM floppy IOP */
-#define BI_MAC_IOP_ADB         0x8021  /* Mac ADB IOP */
-
-    /*
-     * Mac: compatibility with old booter data format (temporarily)
-     * Fields unused with the new bootinfo can be deleted now; instead of
-     * adding new fields the struct might be splitted into a hardware address
-     * part and a hardware type part
-     */
-
-#ifndef __ASSEMBLY__
-
-struct mac_booter_data
-{
-       unsigned long videoaddr;
-       unsigned long videorow;
-       unsigned long videodepth;
-       unsigned long dimensions;
-       unsigned long args;
-       unsigned long boottime;
-       unsigned long gmtbias;
-       unsigned long bootver;
-       unsigned long videological;
-       unsigned long sccbase;
-       unsigned long id;
-       unsigned long memsize;
-       unsigned long serialmf;
-       unsigned long serialhsk;
-       unsigned long serialgpi;
-       unsigned long printmf;
-       unsigned long printhsk;
-       unsigned long printgpi;
-       unsigned long cpuid;
-       unsigned long rombase;
-       unsigned long adbdelay;
-       unsigned long timedbra;
-};
-
-extern struct mac_booter_data
-       mac_bi_data;
-
-#endif
-
-    /*
-     *  Apollo-specific tags
-     */
-
-#define BI_APOLLO_MODEL         0x8000  /* model (u_long) */
-
-    /*
-     *  HP300-specific tags
-     */
-
-#define BI_HP300_MODEL         0x8000  /* model (u_long) */
-#define BI_HP300_UART_SCODE    0x8001  /* UART select code (u_long) */
-#define BI_HP300_UART_ADDR     0x8002  /* phys. addr of UART (u_long) */
-
-    /*
-     * Stuff for bootinfo interface versioning
-     *
-     * At the start of kernel code, a 'struct bootversion' is located.
-     * bootstrap checks for a matching version of the interface before booting
-     * a kernel, to avoid user confusion if kernel and bootstrap don't work
-     * together :-)
-     *
-     * If incompatible changes are made to the bootinfo interface, the major
-     * number below should be stepped (and the minor reset to 0) for the
-     * appropriate machine. If a change is backward-compatible, the minor
-     * should be stepped. "Backwards-compatible" means that booting will work,
-     * but certain features may not.
-     */
-
-#define BOOTINFOV_MAGIC                        0x4249561A      /* 'BIV^Z' */
-#define MK_BI_VERSION(major,minor)     (((major)<<16)+(minor))
-#define BI_VERSION_MAJOR(v)            (((v) >> 16) & 0xffff)
-#define BI_VERSION_MINOR(v)            ((v) & 0xffff)
-
-#ifndef __ASSEMBLY__
-
-struct bootversion {
-    unsigned short branch;
-    unsigned long magic;
-    struct {
-       unsigned long machtype;
-       unsigned long version;
-    } machversions[0];
-};
-
-#endif /* __ASSEMBLY__ */
-
-#define AMIGA_BOOTI_VERSION    MK_BI_VERSION( 2, 0 )
-#define ATARI_BOOTI_VERSION    MK_BI_VERSION( 2, 1 )
-#define MAC_BOOTI_VERSION      MK_BI_VERSION( 2, 0 )
-#define MVME147_BOOTI_VERSION  MK_BI_VERSION( 2, 0 )
-#define MVME16x_BOOTI_VERSION  MK_BI_VERSION( 2, 0 )
-#define BVME6000_BOOTI_VERSION MK_BI_VERSION( 2, 0 )
-#define Q40_BOOTI_VERSION      MK_BI_VERSION( 2, 0 )
-#define HP300_BOOTI_VERSION    MK_BI_VERSION( 2, 0 )
-
-#ifdef BOOTINFO_COMPAT_1_0
-
-    /*
-     *  Backwards compatibility with bootinfo interface version 1.0
-     */
-
-#define COMPAT_AMIGA_BOOTI_VERSION    MK_BI_VERSION( 1, 0 )
-#define COMPAT_ATARI_BOOTI_VERSION    MK_BI_VERSION( 1, 0 )
-#define COMPAT_MAC_BOOTI_VERSION      MK_BI_VERSION( 1, 0 )
-
-#include <linux/zorro.h>
-
-#define COMPAT_NUM_AUTO    16
-
-struct compat_bi_Amiga {
-    int model;
-    int num_autocon;
-    struct ConfigDev autocon[COMPAT_NUM_AUTO];
-    unsigned long chip_size;
-    unsigned char vblank;
-    unsigned char psfreq;
-    unsigned long eclock;
-    unsigned long chipset;
-    unsigned long hw_present;
-};
-
-struct compat_bi_Atari {
-    unsigned long hw_present;
-    unsigned long mch_cookie;
-};
-
-#ifndef __ASSEMBLY__
-
-struct compat_bi_Macintosh
-{
-       unsigned long videoaddr;
-       unsigned long videorow;
-       unsigned long videodepth;
-       unsigned long dimensions;
-       unsigned long args;
-       unsigned long boottime;
-       unsigned long gmtbias;
-       unsigned long bootver;
-       unsigned long videological;
-       unsigned long sccbase;
-       unsigned long id;
-       unsigned long memsize;
-       unsigned long serialmf;
-       unsigned long serialhsk;
-       unsigned long serialgpi;
-       unsigned long printmf;
-       unsigned long printhsk;
-       unsigned long printgpi;
-       unsigned long cpuid;
-       unsigned long rombase;
-       unsigned long adbdelay;
-       unsigned long timedbra;
-};
-
-#endif
-
-struct compat_mem_info {
-    unsigned long addr;
-    unsigned long size;
-};
-
-#define COMPAT_NUM_MEMINFO  4
-
-#define COMPAT_CPUB_68020 0
-#define COMPAT_CPUB_68030 1
-#define COMPAT_CPUB_68040 2
-#define COMPAT_CPUB_68060 3
-#define COMPAT_FPUB_68881 5
-#define COMPAT_FPUB_68882 6
-#define COMPAT_FPUB_68040 7
-#define COMPAT_FPUB_68060 8
-
-#define COMPAT_CPU_68020    (1<<COMPAT_CPUB_68020)
-#define COMPAT_CPU_68030    (1<<COMPAT_CPUB_68030)
-#define COMPAT_CPU_68040    (1<<COMPAT_CPUB_68040)
-#define COMPAT_CPU_68060    (1<<COMPAT_CPUB_68060)
-#define COMPAT_CPU_MASK     (31)
-#define COMPAT_FPU_68881    (1<<COMPAT_FPUB_68881)
-#define COMPAT_FPU_68882    (1<<COMPAT_FPUB_68882)
-#define COMPAT_FPU_68040    (1<<COMPAT_FPUB_68040)
-#define COMPAT_FPU_68060    (1<<COMPAT_FPUB_68060)
-#define COMPAT_FPU_MASK     (0xfe0)
-
-#define COMPAT_CL_SIZE      (256)
-
-struct compat_bootinfo {
-    unsigned long machtype;
-    unsigned long cputype;
-    struct compat_mem_info memory[COMPAT_NUM_MEMINFO];
-    int num_memory;
-    unsigned long ramdisk_size;
-    unsigned long ramdisk_addr;
-    char command_line[COMPAT_CL_SIZE];
-    union {
-       struct compat_bi_Amiga     bi_ami;
-       struct compat_bi_Atari     bi_ata;
-       struct compat_bi_Macintosh bi_mac;
-    } bi_un;
-};
-
-#define bi_amiga       bi_un.bi_ami
-#define bi_atari       bi_un.bi_ata
-#define bi_mac         bi_un.bi_mac
-
-#endif /* BOOTINFO_COMPAT_1_0 */
-
-
-#endif /* _M68K_BOOTINFO_H */
diff --git a/arch/m68k/include/asm/bootinfo_no.h b/arch/m68k/include/asm/bootinfo_no.h
deleted file mode 100644 (file)
index c12e526..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-
-/* Nothing for m68knommu */
index 997e0944ebc1568c2609bf3c13ef8fd73482f620..ef9a2e47352f0d8a03c55ccb2c32966b1e0d748f 100644 (file)
@@ -1,5 +1,30 @@
-#ifdef __uClinux__
-#include "bug_no.h"
+#ifndef _M68K_BUG_H
+#define _M68K_BUG_H
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_BUG
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+#ifndef CONFIG_SUN3
+#define BUG() do { \
+       printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+       __builtin_trap(); \
+} while (0)
 #else
-#include "bug_mm.h"
+#define BUG() do { \
+       printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+       panic("BUG!"); \
+} while (0)
+#endif
+#else
+#define BUG() do { \
+       __builtin_trap(); \
+} while (0)
+#endif
+
+#define HAVE_ARCH_BUG
+#endif
+#endif /* CONFIG_MMU */
+
+#include <asm-generic/bug.h>
+
 #endif
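A tiny illustrative use of the macro defined above; with CONFIG_DEBUG_BUGVERBOSE it logs the file and line before trapping, and HAVE_ARCH_BUG keeps asm-generic/bug.h from installing its own definition (the example name is hypothetical):

static void example_require(void *required)
{
	if (!required)
		BUG();	/* prints "kernel BUG at <file>:<line>!" and traps */
}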
diff --git a/arch/m68k/include/asm/bug_mm.h b/arch/m68k/include/asm/bug_mm.h
deleted file mode 100644 (file)
index e5b528d..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _M68K_BUG_H
-#define _M68K_BUG_H
-
-
-#ifdef CONFIG_BUG
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-#ifndef CONFIG_SUN3
-#define BUG() do { \
-       printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
-       __builtin_trap(); \
-} while (0)
-#else
-#define BUG() do { \
-       printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
-       panic("BUG!"); \
-} while (0)
-#endif
-#else
-#define BUG() do { \
-       __builtin_trap(); \
-} while (0)
-#endif
-
-#define HAVE_ARCH_BUG
-#endif
-
-#include <asm-generic/bug.h>
-
-#endif
diff --git a/arch/m68k/include/asm/bug_no.h b/arch/m68k/include/asm/bug_no.h
deleted file mode 100644 (file)
index 70e7dc0..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _M68KNOMMU_BUG_H
-#define _M68KNOMMU_BUG_H
-#include <asm-generic/bug.h>
-#endif
index 01f047d784ec1e6ca568a82a06afdb29ddd2bd01..d06207b9ba5ad25f449831d72975f1136b0d3207 100644 (file)
@@ -1,5 +1,20 @@
-#ifdef __uClinux__
-#include "bugs_no.h"
+/*
+ *  include/asm-m68k/bugs.h
+ *
+ *  Copyright (C) 1994  Linus Torvalds
+ */
+
+/*
+ * This is included by init/main.c to check for architecture-dependent bugs.
+ *
+ * Needs:
+ *     void check_bugs(void);
+ */
+
+#ifdef CONFIG_MMU
+extern void check_bugs(void);  /* in arch/m68k/kernel/setup.c */
 #else
-#include "bugs_mm.h"
+static void check_bugs(void)
+{
+}
 #endif
diff --git a/arch/m68k/include/asm/bugs_mm.h b/arch/m68k/include/asm/bugs_mm.h
deleted file mode 100644 (file)
index d019355..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- *  include/asm-m68k/bugs.h
- *
- *  Copyright (C) 1994  Linus Torvalds
- */
-
-/*
- * This is included by init/main.c to check for architecture-dependent bugs.
- *
- * Needs:
- *     void check_bugs(void);
- */
-
-extern void check_bugs(void);  /* in arch/m68k/kernel/setup.c */
diff --git a/arch/m68k/include/asm/bugs_no.h b/arch/m68k/include/asm/bugs_no.h
deleted file mode 100644 (file)
index 5f382da..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- *  include/asm-m68k/bugs.h
- *
- *  Copyright (C) 1994  Linus Torvalds
- */
-
-/*
- * This is included by init/main.c to check for architecture-dependent bugs.
- *
- * Needs:
- *     void check_bugs(void);
- */
-
-static void check_bugs(void)
-{
-}
index 599c29bc8f40c9a958edbeaabcaf36ee8482d44f..fed3fd30de7e468797a85ff5945119c436f92fce 100644 (file)
@@ -1,5 +1,11 @@
-#ifdef __uClinux__
-#include "cache_no.h"
-#else
-#include "cache_mm.h"
+/*
+ * include/asm-m68k/cache.h
+ */
+#ifndef __ARCH_M68K_CACHE_H
+#define __ARCH_M68K_CACHE_H
+
+/* bytes per L1 cache line */
+#define        L1_CACHE_SHIFT  4
+#define        L1_CACHE_BYTES  (1<< L1_CACHE_SHIFT)
+
 #endif
diff --git a/arch/m68k/include/asm/cache_mm.h b/arch/m68k/include/asm/cache_mm.h
deleted file mode 100644 (file)
index fed3fd3..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * include/asm-m68k/cache.h
- */
-#ifndef __ARCH_M68K_CACHE_H
-#define __ARCH_M68K_CACHE_H
-
-/* bytes per L1 cache line */
-#define        L1_CACHE_SHIFT  4
-#define        L1_CACHE_BYTES  (1<< L1_CACHE_SHIFT)
-
-#endif
diff --git a/arch/m68k/include/asm/cache_no.h b/arch/m68k/include/asm/cache_no.h
deleted file mode 100644 (file)
index 24e9eac..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __ARCH_M68KNOMMU_CACHE_H
-#define __ARCH_M68KNOMMU_CACHE_H
-
-/* bytes per L1 cache line */
-#define        L1_CACHE_BYTES  16      /* this need to be at least 1 */
-
-/* m68k-elf-gcc  2.95.2 doesn't like these */
-
-#define __cacheline_aligned
-#define ____cacheline_aligned
-
-#endif
index 51b056dfaedd7f2bdfcb42582769eb546472257c..91fcc5358cfea7c77b44e62792bb1e6ae497180f 100644 (file)
@@ -1,5 +1,28 @@
-#ifdef __uClinux__
-#include "current_no.h"
+#ifndef _M68K_CURRENT_H
+#define _M68K_CURRENT_H
+
+#ifdef CONFIG_MMU
+
+register struct task_struct *current __asm__("%a2");
+
 #else
-#include "current_mm.h"
-#endif
+
+/*
+ *     Rather than dedicate a register (as the m68k source does), we
+ *     just keep a global; we should probably just change it all to be
+ *     current and lose _current_task.
+ */
+#include <linux/thread_info.h>
+
+struct task_struct;
+
+static inline struct task_struct *get_current(void)
+{
+       return(current_thread_info()->task);
+}
+
+#define        current get_current()
+
+#endif /* CONFIG_MMU */
+
+#endif /* !(_M68K_CURRENT_H) */
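Either variant is used the same way at the call site; a one-line sketch (illustrative only, assumes <linux/sched.h>):

static void example_whoami(void)
{
	printk(KERN_INFO "running in %s (pid %d)\n",
	       current->comm, current->pid);
}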
diff --git a/arch/m68k/include/asm/current_mm.h b/arch/m68k/include/asm/current_mm.h
deleted file mode 100644 (file)
index 8de8f8c..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _M68K_CURRENT_H
-#define _M68K_CURRENT_H
-
-register struct task_struct *current __asm__("%a2");
-
-#endif /* !(_M68K_CURRENT_H) */
diff --git a/arch/m68k/include/asm/current_no.h b/arch/m68k/include/asm/current_no.h
deleted file mode 100644 (file)
index 53ee0f9..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _M68KNOMMU_CURRENT_H
-#define _M68KNOMMU_CURRENT_H
-/*
- *     current.h
- *     (C) Copyright 2000, Lineo, David McCullough <davidm@uclinux.org>
- *     (C) Copyright 2002, Greg Ungerer (gerg@snapgear.com)
- *
- *     rather than dedicate a register (as the m68k source does), we
- *     just keep a global,  we should probably just change it all to be
- *     current and lose _current_task.
- */
-
-#include <linux/thread_info.h>
-
-struct task_struct;
-
-static inline struct task_struct *get_current(void)
-{
-       return(current_thread_info()->task);
-}
-
-#define        current get_current()
-
-#endif /* _M68KNOMMU_CURRENT_H */
index d211d9f54276766317f725a168f0461008381b1f..edb66148a71dc85886e3112d9af2f08019d7856e 100644 (file)
@@ -1,5 +1,34 @@
-#ifdef __uClinux__
-#include "div64_no.h"
+#ifndef _M68K_DIV64_H
+#define _M68K_DIV64_H
+
+#ifdef CONFIG_MMU
+
+#include <linux/types.h>
+
+/* n = n / base; return rem; */
+
+#define do_div(n, base) ({                                     \
+       union {                                                 \
+               unsigned long n32[2];                           \
+               unsigned long long n64;                         \
+       } __n;                                                  \
+       unsigned long __rem, __upper;                           \
+                                                               \
+       __n.n64 = (n);                                          \
+       if ((__upper = __n.n32[0])) {                           \
+               asm ("divul.l %2,%1:%0"                         \
+                       : "=d" (__n.n32[0]), "=d" (__upper)     \
+                       : "d" (base), "0" (__n.n32[0]));        \
+       }                                                       \
+       asm ("divu.l %2,%1:%0"                                  \
+               : "=d" (__n.n32[1]), "=d" (__rem)               \
+               : "d" (base), "1" (__upper), "0" (__n.n32[1])); \
+       (n) = __n.n64;                                          \
+       __rem;                                                  \
+})
+
 #else
-#include "div64_mm.h"
-#endif
+#include <asm-generic/div64.h>
+#endif /* CONFIG_MMU */
+
+#endif /* _M68K_DIV64_H */
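The macro divides its 64-bit first argument in place and evaluates to the remainder, so the usual calling pattern looks like this (variable names are arbitrary):

static u64 example_kib(u64 total_bytes, u32 *remainder)
{
	u64 n = total_bytes;

	*remainder = do_div(n, 1024);	/* n now holds the quotient */
	return n;
}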
diff --git a/arch/m68k/include/asm/div64_mm.h b/arch/m68k/include/asm/div64_mm.h
deleted file mode 100644 (file)
index 8243c93..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _M68K_DIV64_H
-#define _M68K_DIV64_H
-
-#include <linux/types.h>
-
-/* n = n / base; return rem; */
-
-#define do_div(n, base) ({                                     \
-       union {                                                 \
-               unsigned long n32[2];                           \
-               unsigned long long n64;                         \
-       } __n;                                                  \
-       unsigned long __rem, __upper;                           \
-                                                               \
-       __n.n64 = (n);                                          \
-       if ((__upper = __n.n32[0])) {                           \
-               asm ("divul.l %2,%1:%0"                         \
-                       : "=d" (__n.n32[0]), "=d" (__upper)     \
-                       : "d" (base), "0" (__n.n32[0]));        \
-       }                                                       \
-       asm ("divu.l %2,%1:%0"                                  \
-               : "=d" (__n.n32[1]), "=d" (__rem)               \
-               : "d" (base), "1" (__upper), "0" (__n.n32[1])); \
-       (n) = __n.n64;                                          \
-       __rem;                                                  \
-})
-
-#endif /* _M68K_DIV64_H */
diff --git a/arch/m68k/include/asm/div64_no.h b/arch/m68k/include/asm/div64_no.h
deleted file mode 100644 (file)
index 6cd978c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
index f4a4c7638f898997c5346c8f1564b819dafdca9a..26f505488c1197dbe004cc6486d8c41c7285ab65 100644 (file)
@@ -1,5 +1,112 @@
-#ifdef __uClinux__
-#include "dma-mapping_no.h"
+#ifndef _M68K_DMA_MAPPING_H
+#define _M68K_DMA_MAPPING_H
+
+#include <asm/cache.h>
+
+struct scatterlist;
+
+#ifndef CONFIG_MMU_SUN3
+static inline int dma_supported(struct device *dev, u64 mask)
+{
+       return 1;
+}
+
+static inline int dma_set_mask(struct device *dev, u64 mask)
+{
+       return 0;
+}
+
+static inline int dma_get_cache_alignment(void)
+{
+       return 1 << L1_CACHE_SHIFT;
+}
+
+static inline int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
+{
+       return 0;
+}
+
+extern void *dma_alloc_coherent(struct device *, size_t,
+                               dma_addr_t *, gfp_t);
+extern void dma_free_coherent(struct device *, size_t,
+                             void *, dma_addr_t);
+
+static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
+                                         dma_addr_t *handle, gfp_t flag)
+{
+       return dma_alloc_coherent(dev, size, handle, flag);
+}
+static inline void dma_free_noncoherent(struct device *dev, size_t size,
+                                       void *addr, dma_addr_t handle)
+{
+       dma_free_coherent(dev, size, addr, handle);
+}
+static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+                                 enum dma_data_direction dir)
+{
+       /* we use coherent allocation, so not much to do here. */
+}
+
+extern dma_addr_t dma_map_single(struct device *, void *, size_t,
+                                enum dma_data_direction);
+static inline void dma_unmap_single(struct device *dev, dma_addr_t addr,
+                                   size_t size, enum dma_data_direction dir)
+{
+}
+
+extern dma_addr_t dma_map_page(struct device *, struct page *,
+                              unsigned long, size_t size,
+                              enum dma_data_direction);
+static inline void dma_unmap_page(struct device *dev, dma_addr_t address,
+                                 size_t size, enum dma_data_direction dir)
+{
+}
+
+extern int dma_map_sg(struct device *, struct scatterlist *, int,
+                     enum dma_data_direction);
+static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+                               int nhwentries, enum dma_data_direction dir)
+{
+}
+
+extern void dma_sync_single_for_device(struct device *, dma_addr_t, size_t,
+                                      enum dma_data_direction);
+extern void dma_sync_sg_for_device(struct device *, struct scatterlist *, int,
+                                  enum dma_data_direction);
+
+static inline void dma_sync_single_range_for_device(struct device *dev,
+               dma_addr_t dma_handle, unsigned long offset, size_t size,
+               enum dma_data_direction direction)
+{
+       /* just sync everything for now */
+       dma_sync_single_for_device(dev, dma_handle, offset + size, direction);
+}
+
+static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,
+                                          size_t size, enum dma_data_direction dir)
+{
+}
+
+static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+                                      int nents, enum dma_data_direction dir)
+{
+}
+
+static inline void dma_sync_single_range_for_cpu(struct device *dev,
+               dma_addr_t dma_handle, unsigned long offset, size_t size,
+               enum dma_data_direction direction)
+{
+       /* just sync everything for now */
+       dma_sync_single_for_cpu(dev, dma_handle, offset + size, direction);
+}
+
+static inline int dma_mapping_error(struct device *dev, dma_addr_t handle)
+{
+       return 0;
+}
+
 #else
-#include "dma-mapping_mm.h"
+#include <asm-generic/dma-mapping-broken.h>
 #endif
+
+#endif  /* _M68K_DMA_MAPPING_H */
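A hedged sketch of the coherent-allocation interface declared above (the device pointer, size and names are placeholders):

static void *example_dma_buffer(struct device *dev, size_t size,
				dma_addr_t *handle)
{
	/* CPU and device views of a coherent buffer stay in sync */
	void *cpu_addr = dma_alloc_coherent(dev, size, handle, GFP_KERNEL);

	if (!cpu_addr)
		return NULL;

	/* when done: dma_free_coherent(dev, size, cpu_addr, *handle) */
	return cpu_addr;
}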
diff --git a/arch/m68k/include/asm/dma-mapping_mm.h b/arch/m68k/include/asm/dma-mapping_mm.h
deleted file mode 100644 (file)
index 26f5054..0000000
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef _M68K_DMA_MAPPING_H
-#define _M68K_DMA_MAPPING_H
-
-#include <asm/cache.h>
-
-struct scatterlist;
-
-#ifndef CONFIG_MMU_SUN3
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       return 1;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return 0;
-}
-
-static inline int dma_get_cache_alignment(void)
-{
-       return 1 << L1_CACHE_SHIFT;
-}
-
-static inline int dma_is_consistent(struct device *dev, dma_addr_t dma_addr)
-{
-       return 0;
-}
-
-extern void *dma_alloc_coherent(struct device *, size_t,
-                               dma_addr_t *, gfp_t);
-extern void dma_free_coherent(struct device *, size_t,
-                             void *, dma_addr_t);
-
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                                         dma_addr_t *handle, gfp_t flag)
-{
-       return dma_alloc_coherent(dev, size, handle, flag);
-}
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-                                       void *addr, dma_addr_t handle)
-{
-       dma_free_coherent(dev, size, addr, handle);
-}
-static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
-                                 enum dma_data_direction dir)
-{
-       /* we use coherent allocation, so not much to do here. */
-}
-
-extern dma_addr_t dma_map_single(struct device *, void *, size_t,
-                                enum dma_data_direction);
-static inline void dma_unmap_single(struct device *dev, dma_addr_t addr,
-                                   size_t size, enum dma_data_direction dir)
-{
-}
-
-extern dma_addr_t dma_map_page(struct device *, struct page *,
-                              unsigned long, size_t size,
-                              enum dma_data_direction);
-static inline void dma_unmap_page(struct device *dev, dma_addr_t address,
-                                 size_t size, enum dma_data_direction dir)
-{
-}
-
-extern int dma_map_sg(struct device *, struct scatterlist *, int,
-                     enum dma_data_direction);
-static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-                               int nhwentries, enum dma_data_direction dir)
-{
-}
-
-extern void dma_sync_single_for_device(struct device *, dma_addr_t, size_t,
-                                      enum dma_data_direction);
-extern void dma_sync_sg_for_device(struct device *, struct scatterlist *, int,
-                                  enum dma_data_direction);
-
-static inline void dma_sync_single_range_for_device(struct device *dev,
-               dma_addr_t dma_handle, unsigned long offset, size_t size,
-               enum dma_data_direction direction)
-{
-       /* just sync everything for now */
-       dma_sync_single_for_device(dev, dma_handle, offset + size, direction);
-}
-
-static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,
-                                          size_t size, enum dma_data_direction dir)
-{
-}
-
-static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
-                                      int nents, enum dma_data_direction dir)
-{
-}
-
-static inline void dma_sync_single_range_for_cpu(struct device *dev,
-               dma_addr_t dma_handle, unsigned long offset, size_t size,
-               enum dma_data_direction direction)
-{
-       /* just sync everything for now */
-       dma_sync_single_for_cpu(dev, dma_handle, offset + size, direction);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t handle)
-{
-       return 0;
-}
-
-#else
-#include <asm-generic/dma-mapping-broken.h>
-#endif
-
-#endif  /* _M68K_DMA_MAPPING_H */
diff --git a/arch/m68k/include/asm/dma-mapping_no.h b/arch/m68k/include/asm/dma-mapping_no.h
deleted file mode 100644 (file)
index 1748f2b..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _M68KNOMMU_DMA_MAPPING_H
-#define _M68KNOMMU_DMA_MAPPING_H
-
-#include <asm-generic/dma-mapping-broken.h>
-
-#endif  /* _M68KNOMMU_DMA_MAPPING_H */
diff --git a/arch/m68k/include/asm/elf.h b/arch/m68k/include/asm/elf.h
index 04ce488bc63f8d5622c1894c00bbeda2781433c2..0b0f49eb876b11db0ce7eb6cd1703be2657fbd54 100644 (file)
@@ -1,5 +1,119 @@
-#ifdef __uClinux__
-#include "elf_no.h"
+#ifndef __ASMm68k_ELF_H
+#define __ASMm68k_ELF_H
+
+/*
+ * ELF register definitions..
+ */
+
+#include <asm/ptrace.h>
+#include <asm/user.h>
+
+/*
+ * 68k ELF relocation types
+ */
+#define R_68K_NONE     0
+#define R_68K_32       1
+#define R_68K_16       2
+#define R_68K_8                3
+#define R_68K_PC32     4
+#define R_68K_PC16     5
+#define R_68K_PC8      6
+#define R_68K_GOT32    7
+#define R_68K_GOT16    8
+#define R_68K_GOT8     9
+#define R_68K_GOT32O   10
+#define R_68K_GOT16O   11
+#define R_68K_GOT8O    12
+#define R_68K_PLT32    13
+#define R_68K_PLT16    14
+#define R_68K_PLT8     15
+#define R_68K_PLT32O   16
+#define R_68K_PLT16O   17
+#define R_68K_PLT8O    18
+#define R_68K_COPY     19
+#define R_68K_GLOB_DAT 20
+#define R_68K_JMP_SLOT 21
+#define R_68K_RELATIVE 22
+
+typedef unsigned long elf_greg_t;
+
+#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+typedef struct user_m68kfp_struct elf_fpregset_t;
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x) ((x)->e_machine == EM_68K)
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS      ELFCLASS32
+#define ELF_DATA       ELFDATA2MSB
+#define ELF_ARCH       EM_68K
+
+/* For SVR4/m68k the function pointer to be registered with `atexit' is
+   passed in %a1.  Although my copy of the ABI has no such statement, it
+   is actually used on ASV.  */
+#define ELF_PLAT_INIT(_r, load_addr)   _r->a1 = 0
+
+#define USE_ELF_CORE_DUMP
+#ifndef CONFIG_SUN3
+#define ELF_EXEC_PAGESIZE      4096
 #else
-#include "elf_mm.h"
+#define ELF_EXEC_PAGESIZE      8192
+#endif
+
+/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
+   use of this is to invoke "./ld.so someprog" to test out a new version of
+   the loader.  We need to make sure that it is out of the way of the program
+   that it will "exec", and that there is sufficient room for the brk.  */
+
+#ifndef CONFIG_SUN3
+#define ELF_ET_DYN_BASE         0xD0000000UL
+#else
+#define ELF_ET_DYN_BASE         0x0D800000UL
+#endif
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                               \
+       /* Bleech. */                                                   \
+       pr_reg[0] = regs->d1;                                           \
+       pr_reg[1] = regs->d2;                                           \
+       pr_reg[2] = regs->d3;                                           \
+       pr_reg[3] = regs->d4;                                           \
+       pr_reg[4] = regs->d5;                                           \
+       pr_reg[7] = regs->a0;                                           \
+       pr_reg[8] = regs->a1;                                           \
+       pr_reg[9] = regs->a2;                                           \
+       pr_reg[14] = regs->d0;                                          \
+       pr_reg[15] = rdusp();                                           \
+       pr_reg[16] = regs->orig_d0;                                     \
+       pr_reg[17] = regs->sr;                                          \
+       pr_reg[18] = regs->pc;                                          \
+       pr_reg[19] = (regs->format << 12) | regs->vector;               \
+       {                                                               \
+         struct switch_stack *sw = ((struct switch_stack *)regs) - 1;  \
+         pr_reg[5] = sw->d6;                                           \
+         pr_reg[6] = sw->d7;                                           \
+         pr_reg[10] = sw->a3;                                          \
+         pr_reg[11] = sw->a4;                                          \
+         pr_reg[12] = sw->a5;                                          \
+         pr_reg[13] = sw->a6;                                          \
+       }
+
+/* This yields a mask that user programs can use to figure out what
+   instruction set this cpu supports.  */
+
+#define ELF_HWCAP      (0)
+
+/* This yields a string that ld.so will use to load implementation
+   specific libraries for optimization.  This is more specific in
+   intent than poking at uname or /proc/cpuinfo.  */
+
+#define ELF_PLATFORM  (NULL)
+
+#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
+
 #endif
diff --git a/arch/m68k/include/asm/elf_mm.h b/arch/m68k/include/asm/elf_mm.h
deleted file mode 100644 (file)
index 0b0f49e..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef __ASMm68k_ELF_H
-#define __ASMm68k_ELF_H
-
-/*
- * ELF register definitions..
- */
-
-#include <asm/ptrace.h>
-#include <asm/user.h>
-
-/*
- * 68k ELF relocation types
- */
-#define R_68K_NONE     0
-#define R_68K_32       1
-#define R_68K_16       2
-#define R_68K_8                3
-#define R_68K_PC32     4
-#define R_68K_PC16     5
-#define R_68K_PC8      6
-#define R_68K_GOT32    7
-#define R_68K_GOT16    8
-#define R_68K_GOT8     9
-#define R_68K_GOT32O   10
-#define R_68K_GOT16O   11
-#define R_68K_GOT8O    12
-#define R_68K_PLT32    13
-#define R_68K_PLT16    14
-#define R_68K_PLT8     15
-#define R_68K_PLT32O   16
-#define R_68K_PLT16O   17
-#define R_68K_PLT8O    18
-#define R_68K_COPY     19
-#define R_68K_GLOB_DAT 20
-#define R_68K_JMP_SLOT 21
-#define R_68K_RELATIVE 22
-
-typedef unsigned long elf_greg_t;
-
-#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-typedef struct user_m68kfp_struct elf_fpregset_t;
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x) ((x)->e_machine == EM_68K)
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS      ELFCLASS32
-#define ELF_DATA       ELFDATA2MSB
-#define ELF_ARCH       EM_68K
-
-/* For SVR4/m68k the function pointer to be registered with `atexit' is
-   passed in %a1.  Although my copy of the ABI has no such statement, it
-   is actually used on ASV.  */
-#define ELF_PLAT_INIT(_r, load_addr)   _r->a1 = 0
-
-#define USE_ELF_CORE_DUMP
-#ifndef CONFIG_SUN3
-#define ELF_EXEC_PAGESIZE      4096
-#else
-#define ELF_EXEC_PAGESIZE      8192
-#endif
-
-/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
-   use of this is to invoke "./ld.so someprog" to test out a new version of
-   the loader.  We need to make sure that it is out of the way of the program
-   that it will "exec", and that there is sufficient room for the brk.  */
-
-#ifndef CONFIG_SUN3
-#define ELF_ET_DYN_BASE         0xD0000000UL
-#else
-#define ELF_ET_DYN_BASE         0x0D800000UL
-#endif
-
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                               \
-       /* Bleech. */                                                   \
-       pr_reg[0] = regs->d1;                                           \
-       pr_reg[1] = regs->d2;                                           \
-       pr_reg[2] = regs->d3;                                           \
-       pr_reg[3] = regs->d4;                                           \
-       pr_reg[4] = regs->d5;                                           \
-       pr_reg[7] = regs->a0;                                           \
-       pr_reg[8] = regs->a1;                                           \
-       pr_reg[9] = regs->a2;                                           \
-       pr_reg[14] = regs->d0;                                          \
-       pr_reg[15] = rdusp();                                           \
-       pr_reg[16] = regs->orig_d0;                                     \
-       pr_reg[17] = regs->sr;                                          \
-       pr_reg[18] = regs->pc;                                          \
-       pr_reg[19] = (regs->format << 12) | regs->vector;               \
-       {                                                               \
-         struct switch_stack *sw = ((struct switch_stack *)regs) - 1;  \
-         pr_reg[5] = sw->d6;                                           \
-         pr_reg[6] = sw->d7;                                           \
-         pr_reg[10] = sw->a3;                                          \
-         pr_reg[11] = sw->a4;                                          \
-         pr_reg[12] = sw->a5;                                          \
-         pr_reg[13] = sw->a6;                                          \
-       }
-
-/* This yields a mask that user programs can use to figure out what
-   instruction set this cpu supports.  */
-
-#define ELF_HWCAP      (0)
-
-/* This yields a string that ld.so will use to load implementation
-   specific libraries for optimization.  This is more specific in
-   intent than poking at uname or /proc/cpuinfo.  */
-
-#define ELF_PLATFORM  (NULL)
-
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
-
-#endif
diff --git a/arch/m68k/include/asm/elf_no.h b/arch/m68k/include/asm/elf_no.h
deleted file mode 100644 (file)
index b804683..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef __ASMm68k_ELF_H
-#define __ASMm68k_ELF_H
-
-/*
- * ELF register definitions..
- */
-
-#include <asm/ptrace.h>
-#include <asm/user.h>
-
-/*
- * 68k ELF relocation types
- */
-#define R_68K_NONE  0
-#define R_68K_32    1
-#define R_68K_16    2
-#define R_68K_8     3
-#define R_68K_PC32  4
-#define R_68K_PC16  5
-#define R_68K_PC8   6
-#define R_68K_GOT32 7
-#define R_68K_GOT16 8
-#define R_68K_GOT8  9
-#define R_68K_GOT32O    10
-#define R_68K_GOT16O    11
-#define R_68K_GOT8O 12
-#define R_68K_PLT32 13
-#define R_68K_PLT16 14
-#define R_68K_PLT8  15
-#define R_68K_PLT32O    16
-#define R_68K_PLT16O    17
-#define R_68K_PLT8O 18
-#define R_68K_COPY  19
-#define R_68K_GLOB_DAT  20
-#define R_68K_JMP_SLOT  21
-#define R_68K_RELATIVE  22
-
-typedef unsigned long elf_greg_t;
-
-#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-typedef struct user_m68kfp_struct elf_fpregset_t;
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x) ((x)->e_machine == EM_68K)
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS      ELFCLASS32
-#define ELF_DATA       ELFDATA2MSB
-#define ELF_ARCH       EM_68K
-
-/* For SVR4/m68k the function pointer to be registered with `atexit' is
-   passed in %a1.  Although my copy of the ABI has no such statement, it
-   is actually used on ASV.  */
-#define ELF_PLAT_INIT(_r, load_addr)   _r->a1 = 0
-
-#define USE_ELF_CORE_DUMP
-#define ELF_EXEC_PAGESIZE      4096
-
-/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
-   use of this is to invoke "./ld.so someprog" to test out a new version of
-   the loader.  We need to make sure that it is out of the way of the program
-   that it will "exec", and that there is sufficient room for the brk.  */
-
-#define ELF_ET_DYN_BASE         0xD0000000UL
-
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                               \
-       /* Bleech. */                                                   \
-       pr_reg[0] = regs->d1;                                           \
-       pr_reg[1] = regs->d2;                                           \
-       pr_reg[2] = regs->d3;                                           \
-       pr_reg[3] = regs->d4;                                           \
-       pr_reg[4] = regs->d5;                                           \
-       pr_reg[7] = regs->a0;                                           \
-       pr_reg[8] = regs->a1;                                           \
-       pr_reg[14] = regs->d0;                                          \
-       pr_reg[15] = rdusp();                                           \
-       pr_reg[16] = 0 /* regs->orig_d0 */;                             \
-       pr_reg[17] = regs->sr;                                          \
-       pr_reg[18] = regs->pc;                                          \
-       /* pr_reg[19] = (regs->format << 12) | regs->vector; */         \
-       {                                                               \
-         struct switch_stack *sw = ((struct switch_stack *)regs) - 1;  \
-         pr_reg[5] = sw->d6;                                           \
-         pr_reg[6] = sw->d7;                                           \
-         pr_reg[10] = sw->a3;                                          \
-         pr_reg[11] = sw->a4;                                          \
-         pr_reg[12] = sw->a5;                                          \
-         pr_reg[13] = sw->a6;                                          \
-       }
-
-/* This yields a mask that user programs can use to figure out what
-   instruction set this cpu supports.  */
-
-#define ELF_HWCAP      (0)
-
-/* This yields a string that ld.so will use to load implementation
-   specific libraries for optimization.  This is more specific in
-   intent than poking at uname or /proc/cpuinfo.  */
-
-#define ELF_PLATFORM  (NULL)
-
-#define SET_PERSONALITY(ex) set_personality(PER_LINUX)
-
-#endif
diff --git a/arch/m68k/include/asm/fb.h b/arch/m68k/include/asm/fb.h
index 97bcaefd2064e0776e0fb30d00dfa193cb4958ed..be4e4c6797e822eb497e72d59f33145efbb099f1 100644 (file)
@@ -1,5 +1,38 @@
-#ifdef __uClinux__
-#include "fb_no.h"
+#ifndef _ASM_FB_H_
+#define _ASM_FB_H_
+
+#include <linux/fb.h>
+#include <linux/fs.h>
+#include <asm/page.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_SUN3
+static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
+                               unsigned long off)
+{
+       pgprot_val(vma->vm_page_prot) |= SUN3_PAGE_NOCACHE;
+}
 #else
-#include "fb_mm.h"
-#endif
+static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
+                               unsigned long off)
+{
+       if (CPU_IS_020_OR_030)
+               pgprot_val(vma->vm_page_prot) |= _PAGE_NOCACHE030;
+       if (CPU_IS_040_OR_060) {
+               pgprot_val(vma->vm_page_prot) &= _CACHEMASK040;
+               /* Use no-cache mode, serialized */
+               pgprot_val(vma->vm_page_prot) |= _PAGE_NOCACHE_S;
+       }
+}
+#endif /* CONFIG_SUN3 */
+#else
+#define fb_pgprotect(...) do {} while (0)
+#endif /* CONFIG_MMU */
+
+static inline int fb_is_primary_device(struct fb_info *info)
+{
+       return 0;
+}
+
+#endif /* _ASM_FB_H_ */
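
Note: the fb_pgprotect() hook added above is what the generic frame-buffer mmap path calls to pick cache attributes for a user mapping of video memory. A minimal sketch of such a caller, assuming a simplified handler (names and structure are illustrative only, not the actual drivers/video/fbmem.c code):

	/* Sketch only: where fb_pgprotect() sits in an mmap handler. */
	static int example_fb_mmap(struct fb_info *info, struct file *file,
				   struct vm_area_struct *vma)
	{
		unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
		unsigned long start = info->fix.smem_start;

		/* Let the architecture mark the mapping uncached as needed. */
		fb_pgprotect(file, vma, off);

		return io_remap_pfn_range(vma, vma->vm_start,
					  (start + off) >> PAGE_SHIFT,
					  vma->vm_end - vma->vm_start,
					  vma->vm_page_prot);
	}
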
diff --git a/arch/m68k/include/asm/fb_mm.h b/arch/m68k/include/asm/fb_mm.h
deleted file mode 100644 (file)
index 380b97a..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <linux/fb.h>
-#include <linux/fs.h>
-#include <asm/page.h>
-#include <asm/setup.h>
-
-#ifdef CONFIG_SUN3
-static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
-                               unsigned long off)
-{
-       pgprot_val(vma->vm_page_prot) |= SUN3_PAGE_NOCACHE;
-}
-#else
-static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
-                               unsigned long off)
-{
-       if (CPU_IS_020_OR_030)
-               pgprot_val(vma->vm_page_prot) |= _PAGE_NOCACHE030;
-       if (CPU_IS_040_OR_060) {
-               pgprot_val(vma->vm_page_prot) &= _CACHEMASK040;
-               /* Use no-cache mode, serialized */
-               pgprot_val(vma->vm_page_prot) |= _PAGE_NOCACHE_S;
-       }
-}
-#endif /* CONFIG_SUN3 */
-
-static inline int fb_is_primary_device(struct fb_info *info)
-{
-       return 0;
-}
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/m68k/include/asm/fb_no.h b/arch/m68k/include/asm/fb_no.h
deleted file mode 100644 (file)
index c7df380..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-#include <linux/fb.h>
-
-#define fb_pgprotect(...) do {} while (0)
-
-static inline int fb_is_primary_device(struct fb_info *info)
-{
-       return 0;
-}
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/m68k/include/asm/fpu.h b/arch/m68k/include/asm/fpu.h
index e19bc5ed9c377480400782004f7719be439c0d6a..ffb6b8cfc6d59c73851c82060ff5bcbedeb8c615 100644 (file)
@@ -1,5 +1,21 @@
-#ifdef __uClinux__
-#include "fpu_no.h"
+#ifndef __M68K_FPU_H
+#define __M68K_FPU_H
+
+
+/*
+ * MAX floating point unit state size (FSAVE/FRESTORE)
+ */
+
+#if defined(CONFIG_M68020) || defined(CONFIG_M68030)
+#define FPSTATESIZE (216)
+#elif defined(CONFIG_M68040)
+#define FPSTATESIZE (96)
+#elif defined(CONFIG_M68KFPU_EMU)
+#define FPSTATESIZE (28)
+#elif defined(CONFIG_M68060)
+#define FPSTATESIZE (12)
 #else
-#include "fpu_mm.h"
+#define FPSTATESIZE (0)
 #endif
+
+#endif /* __M68K_FPU_H */
diff --git a/arch/m68k/include/asm/fpu_mm.h b/arch/m68k/include/asm/fpu_mm.h
deleted file mode 100644 (file)
index ffb6b8c..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __M68K_FPU_H
-#define __M68K_FPU_H
-
-
-/*
- * MAX floating point unit state size (FSAVE/FRESTORE)
- */
-
-#if defined(CONFIG_M68020) || defined(CONFIG_M68030)
-#define FPSTATESIZE (216)
-#elif defined(CONFIG_M68040)
-#define FPSTATESIZE (96)
-#elif defined(CONFIG_M68KFPU_EMU)
-#define FPSTATESIZE (28)
-#elif defined(CONFIG_M68060)
-#define FPSTATESIZE (12)
-#else
-#define FPSTATESIZE (0)
-#endif
-
-#endif /* __M68K_FPU_H */
diff --git a/arch/m68k/include/asm/fpu_no.h b/arch/m68k/include/asm/fpu_no.h
deleted file mode 100644 (file)
index b16b2e4..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __M68KNOMMU_FPU_H
-#define __M68KNOMMU_FPU_H
-
-
-/*
- * MAX floating point unit state size (FSAVE/FRESTORE)
- */
-#if defined(CONFIG_M68020) || defined(CONFIG_M68030)
-#define FPSTATESIZE (216/sizeof(unsigned char))
-#elif defined(CONFIG_M68040)
-#define FPSTATESIZE (96/sizeof(unsigned char))
-#elif defined(CONFIG_M68KFPU_EMU)
-#define FPSTATESIZE (28/sizeof(unsigned char))
-#elif defined(CONFIG_M68060)
-#define FPSTATESIZE (12/sizeof(unsigned char))
-#else
-/* Assume no FP unit present then... */
-#define FPSTATESIZE (2) /* dummy size */
-#endif
-
-#endif /* __M68K_FPU_H */
diff --git a/arch/m68k/include/asm/hw_irq.h b/arch/m68k/include/asm/hw_irq.h
index e19526015890e2c3bc937cbed6a1fe8bb3c1042d..eacef0951fbf63da8cafe3719033a0fa99559d78 100644 (file)
@@ -1,5 +1,6 @@
-#ifdef __uClinux__
-#include "hw_irq_no.h"
-#else
-#include "hw_irq_mm.h"
+#ifndef __ASM_M68K_HW_IRQ_H
+#define __ASM_M68K_HW_IRQ_H
+
+/* Dummy include. */
+
 #endif
diff --git a/arch/m68k/include/asm/hw_irq_mm.h b/arch/m68k/include/asm/hw_irq_mm.h
deleted file mode 100644 (file)
index eacef09..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ASM_M68K_HW_IRQ_H
-#define __ASM_M68K_HW_IRQ_H
-
-/* Dummy include. */
-
-#endif
diff --git a/arch/m68k/include/asm/hw_irq_no.h b/arch/m68k/include/asm/hw_irq_no.h
deleted file mode 100644 (file)
index f3ec9e5..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __M68KNOMMU_HW_IRQ_H__
-#define __M68KNOMMU_HW_IRQ_H__
-
-#endif /* __M68KNOMMU_HW_IRQ_H__ */
diff --git a/arch/m68k/include/asm/kmap_types.h b/arch/m68k/include/asm/kmap_types.h
index 045d9fd122a216f4b5701373598f821431b7df53..c843c63d380161411f70e046e16c66705b9a6fac 100644 (file)
@@ -1,5 +1,21 @@
-#ifdef __uClinux__
-#include "kmap_types_no.h"
-#else
-#include "kmap_types_mm.h"
-#endif
+#ifndef __ASM_M68K_KMAP_TYPES_H
+#define __ASM_M68K_KMAP_TYPES_H
+
+enum km_type {
+       KM_BOUNCE_READ,
+       KM_SKB_SUNRPC_DATA,
+       KM_SKB_DATA_SOFTIRQ,
+       KM_USER0,
+       KM_USER1,
+       KM_BIO_SRC_IRQ,
+       KM_BIO_DST_IRQ,
+       KM_PTE0,
+       KM_PTE1,
+       KM_IRQ0,
+       KM_IRQ1,
+       KM_SOFTIRQ0,
+       KM_SOFTIRQ1,
+       KM_TYPE_NR
+};
+
+#endif /* __ASM_M68K_KMAP_TYPES_H */
diff --git a/arch/m68k/include/asm/kmap_types_mm.h b/arch/m68k/include/asm/kmap_types_mm.h
deleted file mode 100644 (file)
index c843c63..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __ASM_M68K_KMAP_TYPES_H
-#define __ASM_M68K_KMAP_TYPES_H
-
-enum km_type {
-       KM_BOUNCE_READ,
-       KM_SKB_SUNRPC_DATA,
-       KM_SKB_DATA_SOFTIRQ,
-       KM_USER0,
-       KM_USER1,
-       KM_BIO_SRC_IRQ,
-       KM_BIO_DST_IRQ,
-       KM_PTE0,
-       KM_PTE1,
-       KM_IRQ0,
-       KM_IRQ1,
-       KM_SOFTIRQ0,
-       KM_SOFTIRQ1,
-       KM_TYPE_NR
-};
-
-#endif /* __ASM_M68K_KMAP_TYPES_H */
diff --git a/arch/m68k/include/asm/kmap_types_no.h b/arch/m68k/include/asm/kmap_types_no.h
deleted file mode 100644 (file)
index bfb6707..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __ASM_M68K_KMAP_TYPES_H
-#define __ASM_M68K_KMAP_TYPES_H
-
-enum km_type {
-       KM_BOUNCE_READ,
-       KM_SKB_SUNRPC_DATA,
-       KM_SKB_DATA_SOFTIRQ,
-       KM_USER0,
-       KM_USER1,
-       KM_BIO_SRC_IRQ,
-       KM_BIO_DST_IRQ,
-       KM_PTE0,
-       KM_PTE1,
-       KM_IRQ0,
-       KM_IRQ1,
-       KM_SOFTIRQ0,
-       KM_SOFTIRQ1,
-       KM_TYPE_NR
-};
-
-#endif
index 1835fd20a82cd9c1c5d30e2e174ee40b520a4871..ce603451b55e33c4d7290b61cdd26fcf722ef2a0 100644 (file)
@@ -16,6 +16,7 @@
 #define MCFINT_VECBASE      64
 #define MCFINT_UART0        26          /* Interrupt number for UART0 */
 #define MCFINT_UART1        27          /* Interrupt number for UART1 */
+#define MCFINT_UART2        28          /* Interrupt number for UART2 */
 
 #define MCF_WTM_WCR    MCF_REG16(0xFC098000)
 
diff --git a/arch/m68k/include/asm/mc146818rtc.h b/arch/m68k/include/asm/mc146818rtc.h
index fb90dcf784262b50ebba94dfffc0305f3e7e8027..9f70a01f73dc9960e00817963aceb75b5e89db24 100644 (file)
@@ -1,5 +1,26 @@
-#ifdef __uClinux__
-#include "mc146818rtc_no.h"
-#else
-#include "mc146818rtc_mm.h"
-#endif
+/*
+ * Machine dependent access functions for RTC registers.
+ */
+#ifndef _ASM_MC146818RTC_H
+#define _ASM_MC146818RTC_H
+
+
+#ifdef CONFIG_ATARI
+/* RTC in Atari machines */
+
+#include <asm/atarihw.h>
+
+#define RTC_PORT(x)    (TT_RTC_BAS + 2*(x))
+#define RTC_ALWAYS_BCD 0
+
+#define CMOS_READ(addr) ({ \
+atari_outb_p((addr),RTC_PORT(0)); \
+atari_inb_p(RTC_PORT(1)); \
+})
+#define CMOS_WRITE(val, addr) ({ \
+atari_outb_p((addr),RTC_PORT(0)); \
+atari_outb_p((val),RTC_PORT(1)); \
+})
+#endif /* CONFIG_ATARI */
+
+#endif /* _ASM_MC146818RTC_H */
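
Note: on Atari the RTC sits behind an index/data register pair, so each CMOS_READ()/CMOS_WRITE() above expands to two port accesses. A minimal usage sketch (register 0 is the MC146818 seconds register; illustrative only):

	/* Sketch: read the seconds register, then write it back. */
	unsigned char secs = CMOS_READ(0);	/* select index 0, read data port */
	CMOS_WRITE(secs, 0);			/* select index 0, write the value */
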
diff --git a/arch/m68k/include/asm/mc146818rtc_mm.h b/arch/m68k/include/asm/mc146818rtc_mm.h
deleted file mode 100644 (file)
index 9f70a01..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Machine dependent access functions for RTC registers.
- */
-#ifndef _ASM_MC146818RTC_H
-#define _ASM_MC146818RTC_H
-
-
-#ifdef CONFIG_ATARI
-/* RTC in Atari machines */
-
-#include <asm/atarihw.h>
-
-#define RTC_PORT(x)    (TT_RTC_BAS + 2*(x))
-#define RTC_ALWAYS_BCD 0
-
-#define CMOS_READ(addr) ({ \
-atari_outb_p((addr),RTC_PORT(0)); \
-atari_inb_p(RTC_PORT(1)); \
-})
-#define CMOS_WRITE(val, addr) ({ \
-atari_outb_p((addr),RTC_PORT(0)); \
-atari_outb_p((val),RTC_PORT(1)); \
-})
-#endif /* CONFIG_ATARI */
-
-#endif /* _ASM_MC146818RTC_H */
diff --git a/arch/m68k/include/asm/mc146818rtc_no.h b/arch/m68k/include/asm/mc146818rtc_no.h
deleted file mode 100644 (file)
index 907a048..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * Machine dependent access functions for RTC registers.
- */
-#ifndef _M68KNOMMU_MC146818RTC_H
-#define _M68KNOMMU_MC146818RTC_H
-
-/* empty include file to satisfy the include in genrtc.c/ide-geometry.c */
-
-#endif /* _M68KNOMMU_MC146818RTC_H */
diff --git a/arch/m68k/include/asm/mcfpci.h b/arch/m68k/include/asm/mcfpci.h
deleted file mode 100644 (file)
index f1507dd..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/****************************************************************************/
-
-/*
- *     mcfpci.h -- PCI bridge on ColdFire eval boards.
- *
- *     (C) Copyright 2000, Greg Ungerer (gerg@snapgear.com)
- *     (C) Copyright 2000, Lineo Inc. (www.lineo.com)
- */
-
-/****************************************************************************/
-#ifndef        mcfpci_h
-#define        mcfpci_h
-/****************************************************************************/
-
-
-#ifdef CONFIG_PCI
-
-/*
- *     Address regions in the PCI address space are not mapped into the
- *     normal memory space of the ColdFire. They must be accessed via
- *     handler routines. This is easy for I/O space (inb/outb/etc) but
- *     needs some code changes to support ordinary memory. Interrupts
- *     also need to be vectored through the PCI handler first, then it
- *     will call the actual driver sub-handlers.
- */
-
-/*
- *     Un-define all the standard I/O access routines.
- */
-#undef inb
-#undef inw
-#undef inl
-#undef inb_p
-#undef inw_p
-#undef insb
-#undef insw
-#undef insl
-#undef outb
-#undef outw
-#undef outl
-#undef outb_p
-#undef outw_p
-#undef outsb
-#undef outsw
-#undef outsl
-
-#undef request_irq
-#undef free_irq
-
-#undef bus_to_virt
-#undef virt_to_bus
-
-
-/*
- *     Re-direct all I/O memory accesses functions to PCI specific ones.
- */
-#define        inb     pci_inb
-#define        inw     pci_inw
-#define        inl     pci_inl
-#define        inb_p   pci_inb
-#define        inw_p   pci_inw
-#define        insb    pci_insb
-#define        insw    pci_insw
-#define        insl    pci_insl
-
-#define        outb    pci_outb
-#define        outw    pci_outw
-#define        outl    pci_outl
-#define        outb_p  pci_outb
-#define        outw_p  pci_outw
-#define        outsb   pci_outsb
-#define        outsw   pci_outsw
-#define        outsl   pci_outsl
-
-#define        request_irq     pci_request_irq
-#define        free_irq        pci_free_irq
-
-#define        virt_to_bus     pci_virt_to_bus
-#define        bus_to_virt     pci_bus_to_virt
-
-#define        CONFIG_COMEMPCI 1
-
-
-/*
- *     Prototypes of the real PCI functions (defined in bios32.c).
- */
-unsigned char  pci_inb(unsigned int addr);
-unsigned short pci_inw(unsigned int addr);
-unsigned int   pci_inl(unsigned int addr);
-void           pci_insb(void *addr, void *buf, int len);
-void           pci_insw(void *addr, void *buf, int len);
-void           pci_insl(void *addr, void *buf, int len);
-
-void           pci_outb(unsigned char val, unsigned int addr);
-void           pci_outw(unsigned short val, unsigned int addr);
-void           pci_outl(unsigned int val, unsigned int addr);
-void           pci_outsb(void *addr, void *buf, int len);
-void           pci_outsw(void *addr, void *buf, int len);
-void           pci_outsl(void *addr, void *buf, int len);
-
-int            pci_request_irq(unsigned int irq,
-                       void (*handler)(int, void *, struct pt_regs *),
-                       unsigned long flags,
-                       const char *device,
-                       void *dev_id);
-void           pci_free_irq(unsigned int irq, void *dev_id);
-
-void           *pci_bmalloc(int size);
-void           pci_bmfree(void *bmp, int len);
-void           pci_copytoshmem(unsigned long bmp, void *src, int size);
-void           pci_copyfromshmem(void *dst, unsigned long bmp, int size);
-unsigned long  pci_virt_to_bus(volatile void *address);
-void           *pci_bus_to_virt(unsigned long address);
-void           pci_bmcpyto(void *dst, void *src, int len);
-void           pci_bmcpyfrom(void *dst, void *src, int len);
-
-#endif /* CONFIG_PCI */
-/****************************************************************************/
-#endif /* mcfpci_h */
diff --git a/arch/m68k/include/asm/mmu.h b/arch/m68k/include/asm/mmu.h
index a81d3946675fa7812cce2237deeafd1e0fdb68e0..8a11a63ee15a4e228d7622165f73095f59714326 100644 (file)
@@ -1,5 +1,13 @@
-#ifdef __uClinux__
-#include "mmu_no.h"
+#ifndef __MMU_H
+#define __MMU_H
+
+#ifdef CONFIG_MMU
+/* Default "unsigned long" context */
+typedef unsigned long mm_context_t;
 #else
-#include "mmu_mm.h"
+typedef struct {
+       unsigned long           end_brk;
+} mm_context_t;
+#endif
+
 #endif
diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h
index b440928fc6c79b592d59b39505fdd6cfc07dc28b..7d4341e55a99b85396757228e924c3b5acbbc095 100644 (file)
@@ -1,5 +1,175 @@
-#ifdef __uClinux__
-#include "mmu_context_no.h"
+#ifndef __M68K_MMU_CONTEXT_H
+#define __M68K_MMU_CONTEXT_H
+
+#include <asm-generic/mm_hooks.h>
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+}
+
+#ifdef CONFIG_MMU
+#ifndef CONFIG_SUN3
+
+#include <asm/setup.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+
+static inline int init_new_context(struct task_struct *tsk,
+                                  struct mm_struct *mm)
+{
+       mm->context = virt_to_phys(mm->pgd);
+       return 0;
+}
+
+#define destroy_context(mm)            do { } while(0)
+
+static inline void switch_mm_0230(struct mm_struct *mm)
+{
+       unsigned long crp[2] = {
+               0x80000000 | _PAGE_TABLE, mm->context
+       };
+       unsigned long tmp;
+
+       asm volatile (".chip 68030");
+
+       /* flush MC68030/MC68020 caches (they are virtually addressed) */
+       asm volatile (
+               "movec %%cacr,%0;"
+               "orw %1,%0; "
+               "movec %0,%%cacr"
+               : "=d" (tmp) : "di" (FLUSH_I_AND_D));
+
+       /* Switch the root pointer. For a 030-only kernel,
+        * avoid flushing the whole ATC, we only need to
+        * flush the user entries. The 68851 does this by
+        * itself. Avoid a runtime check here.
+        */
+       asm volatile (
+#ifdef CPU_M68030_ONLY
+               "pmovefd %0,%%crp; "
+               "pflush #0,#4"
 #else
-#include "mmu_context_mm.h"
+               "pmove %0,%%crp"
 #endif
+               : : "m" (crp[0]));
+
+       asm volatile (".chip 68k");
+}
+
+static inline void switch_mm_0460(struct mm_struct *mm)
+{
+       asm volatile (".chip 68040");
+
+       /* flush address translation cache (user entries) */
+       asm volatile ("pflushan");
+
+       /* switch the root pointer */
+       asm volatile ("movec %0,%%urp" : : "r" (mm->context));
+
+       if (CPU_IS_060) {
+               unsigned long tmp;
+
+               /* clear user entries in the branch cache */
+               asm volatile (
+                       "movec %%cacr,%0; "
+                       "orl %1,%0; "
+                       "movec %0,%%cacr"
+                       : "=d" (tmp): "di" (0x00200000));
+       }
+
+       asm volatile (".chip 68k");
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
+{
+       if (prev != next) {
+               if (CPU_IS_020_OR_030)
+                       switch_mm_0230(next);
+               else
+                       switch_mm_0460(next);
+       }
+}
+
+#define deactivate_mm(tsk,mm)  do { } while (0)
+
+static inline void activate_mm(struct mm_struct *prev_mm,
+                              struct mm_struct *next_mm)
+{
+       next_mm->context = virt_to_phys(next_mm->pgd);
+
+       if (CPU_IS_020_OR_030)
+               switch_mm_0230(next_mm);
+       else
+               switch_mm_0460(next_mm);
+}
+
+#else  /* CONFIG_SUN3 */
+#include <asm/sun3mmu.h>
+#include <linux/sched.h>
+
+extern unsigned long get_free_context(struct mm_struct *mm);
+extern void clear_context(unsigned long context);
+
+/* set the context for a new task to unmapped */
+static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       mm->context = SUN3_INVALID_CONTEXT;
+       return 0;
+}
+
+/* find the context given to this process, and if it hasn't already
+   got one, go get one for it. */
+static inline void get_mmu_context(struct mm_struct *mm)
+{
+       if(mm->context == SUN3_INVALID_CONTEXT)
+               mm->context = get_free_context(mm);
+}
+
+/* flush context if allocated... */
+static inline void destroy_context(struct mm_struct *mm)
+{
+       if(mm->context != SUN3_INVALID_CONTEXT)
+               clear_context(mm->context);
+}
+
+static inline void activate_context(struct mm_struct *mm)
+{
+       get_mmu_context(mm);
+       sun3_put_context(mm->context);
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
+{
+       activate_context(tsk->mm);
+}
+
+#define deactivate_mm(tsk,mm)  do { } while (0)
+
+static inline void activate_mm(struct mm_struct *prev_mm,
+                              struct mm_struct *next_mm)
+{
+       activate_context(next_mm);
+}
+
+#endif
+#else /* !CONFIG_MMU */
+
+static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       return 0;
+}
+
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
+{
+}
+
+#define destroy_context(mm)    do { } while (0)
+#define deactivate_mm(tsk,mm)  do { } while (0)
+
+static inline void activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm)
+{
+}
+
+#endif /* CONFIG_MMU */
+#endif /* __M68K_MMU_CONTEXT_H */
diff --git a/arch/m68k/include/asm/mmu_context_mm.h b/arch/m68k/include/asm/mmu_context_mm.h
deleted file mode 100644 (file)
index 894dacb..0000000
+++ /dev/null
@@ -1,154 +0,0 @@
-#ifndef __M68K_MMU_CONTEXT_H
-#define __M68K_MMU_CONTEXT_H
-
-#include <asm-generic/mm_hooks.h>
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
-#ifndef CONFIG_SUN3
-
-#include <asm/setup.h>
-#include <asm/page.h>
-#include <asm/pgalloc.h>
-
-static inline int init_new_context(struct task_struct *tsk,
-                                  struct mm_struct *mm)
-{
-       mm->context = virt_to_phys(mm->pgd);
-       return 0;
-}
-
-#define destroy_context(mm)            do { } while(0)
-
-static inline void switch_mm_0230(struct mm_struct *mm)
-{
-       unsigned long crp[2] = {
-               0x80000000 | _PAGE_TABLE, mm->context
-       };
-       unsigned long tmp;
-
-       asm volatile (".chip 68030");
-
-       /* flush MC68030/MC68020 caches (they are virtually addressed) */
-       asm volatile (
-               "movec %%cacr,%0;"
-               "orw %1,%0; "
-               "movec %0,%%cacr"
-               : "=d" (tmp) : "di" (FLUSH_I_AND_D));
-
-       /* Switch the root pointer. For a 030-only kernel,
-        * avoid flushing the whole ATC, we only need to
-        * flush the user entries. The 68851 does this by
-        * itself. Avoid a runtime check here.
-        */
-       asm volatile (
-#ifdef CPU_M68030_ONLY
-               "pmovefd %0,%%crp; "
-               "pflush #0,#4"
-#else
-               "pmove %0,%%crp"
-#endif
-               : : "m" (crp[0]));
-
-       asm volatile (".chip 68k");
-}
-
-static inline void switch_mm_0460(struct mm_struct *mm)
-{
-       asm volatile (".chip 68040");
-
-       /* flush address translation cache (user entries) */
-       asm volatile ("pflushan");
-
-       /* switch the root pointer */
-       asm volatile ("movec %0,%%urp" : : "r" (mm->context));
-
-       if (CPU_IS_060) {
-               unsigned long tmp;
-
-               /* clear user entries in the branch cache */
-               asm volatile (
-                       "movec %%cacr,%0; "
-                       "orl %1,%0; "
-                       "movec %0,%%cacr"
-                       : "=d" (tmp): "di" (0x00200000));
-       }
-
-       asm volatile (".chip 68k");
-}
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
-{
-       if (prev != next) {
-               if (CPU_IS_020_OR_030)
-                       switch_mm_0230(next);
-               else
-                       switch_mm_0460(next);
-       }
-}
-
-#define deactivate_mm(tsk,mm)  do { } while (0)
-
-static inline void activate_mm(struct mm_struct *prev_mm,
-                              struct mm_struct *next_mm)
-{
-       next_mm->context = virt_to_phys(next_mm->pgd);
-
-       if (CPU_IS_020_OR_030)
-               switch_mm_0230(next_mm);
-       else
-               switch_mm_0460(next_mm);
-}
-
-#else  /* CONFIG_SUN3 */
-#include <asm/sun3mmu.h>
-#include <linux/sched.h>
-
-extern unsigned long get_free_context(struct mm_struct *mm);
-extern void clear_context(unsigned long context);
-
-/* set the context for a new task to unmapped */
-static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-       mm->context = SUN3_INVALID_CONTEXT;
-       return 0;
-}
-
-/* find the context given to this process, and if it hasn't already
-   got one, go get one for it. */
-static inline void get_mmu_context(struct mm_struct *mm)
-{
-       if(mm->context == SUN3_INVALID_CONTEXT)
-               mm->context = get_free_context(mm);
-}
-
-/* flush context if allocated... */
-static inline void destroy_context(struct mm_struct *mm)
-{
-       if(mm->context != SUN3_INVALID_CONTEXT)
-               clear_context(mm->context);
-}
-
-static inline void activate_context(struct mm_struct *mm)
-{
-       get_mmu_context(mm);
-       sun3_put_context(mm->context);
-}
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
-{
-       activate_context(tsk->mm);
-}
-
-#define deactivate_mm(tsk,mm)  do { } while (0)
-
-static inline void activate_mm(struct mm_struct *prev_mm,
-                              struct mm_struct *next_mm)
-{
-       activate_context(next_mm);
-}
-
-#endif
-#endif
diff --git a/arch/m68k/include/asm/mmu_context_no.h b/arch/m68k/include/asm/mmu_context_no.h
deleted file mode 100644 (file)
index 9ccee42..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef __M68KNOMMU_MMU_CONTEXT_H
-#define __M68KNOMMU_MMU_CONTEXT_H
-
-#include <asm/setup.h>
-#include <asm/page.h>
-#include <asm/pgalloc.h>
-#include <asm-generic/mm_hooks.h>
-
-static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
-static inline int
-init_new_context(struct task_struct *tsk, struct mm_struct *mm)
-{
-       // mm->context = virt_to_phys(mm->pgd);
-       return(0);
-}
-
-#define destroy_context(mm)            do { } while(0)
-
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk)
-{
-}
-
-#define deactivate_mm(tsk,mm)  do { } while (0)
-
-static inline void activate_mm(struct mm_struct *prev_mm,
-                              struct mm_struct *next_mm)
-{
-}
-
-#endif
diff --git a/arch/m68k/include/asm/mmu_mm.h b/arch/m68k/include/asm/mmu_mm.h
deleted file mode 100644 (file)
index ccd36d2..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __MMU_H
-#define __MMU_H
-
-/* Default "unsigned long" context */
-typedef unsigned long mm_context_t;
-
-#endif
diff --git a/arch/m68k/include/asm/mmu_no.h b/arch/m68k/include/asm/mmu_no.h
deleted file mode 100644 (file)
index e2da1e6..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __M68KNOMMU_MMU_H
-#define __M68KNOMMU_MMU_H
-
-/* Copyright (C) 2002, David McCullough <davidm@snapgear.com> */
-
-typedef struct {
-       unsigned long           end_brk;
-} mm_context_t;
-
-#endif /* __M68KNOMMU_MMU_H */
diff --git a/arch/m68k/include/asm/module.h b/arch/m68k/include/asm/module.h
index 79b59d137dd0732daeca7d24ef15d9386a07e92d..5f21e11071bdfc1aad9bf5706ff47aa3e6c4a72f 100644 (file)
@@ -1,5 +1,48 @@
-#ifdef __uClinux__
-#include "module_no.h"
+#ifndef _ASM_M68K_MODULE_H
+#define _ASM_M68K_MODULE_H
+
+#ifdef CONFIG_MMU
+
+struct mod_arch_specific {
+       struct m68k_fixup_info *fixup_start, *fixup_end;
+};
+
+#define MODULE_ARCH_INIT {                             \
+       .fixup_start            = __start_fixup,        \
+       .fixup_end              = __stop_fixup,         \
+}
+
+
+enum m68k_fixup_type {
+       m68k_fixup_memoffset,
+       m68k_fixup_vnode_shift,
+};
+
+struct m68k_fixup_info {
+       enum m68k_fixup_type type;
+       void *addr;
+};
+
+#define m68k_fixup(type, addr)                 \
+       "       .section \".m68k_fixup\",\"aw\"\n"      \
+       "       .long " #type "," #addr "\n"    \
+       "       .previous\n"
+
+extern struct m68k_fixup_info __start_fixup[], __stop_fixup[];
+
+struct module;
+extern void module_fixup(struct module *mod, struct m68k_fixup_info *start,
+                        struct m68k_fixup_info *end);
+
 #else
-#include "module_mm.h"
-#endif
+
+struct mod_arch_specific {
+};
+
+#endif /* CONFIG_MMU */
+
+#define Elf_Shdr Elf32_Shdr
+#define Elf_Sym Elf32_Sym
+#define Elf_Ehdr Elf32_Ehdr
+
+#endif /* _ASM_M68K_MODULE_H */
diff --git a/arch/m68k/include/asm/module_mm.h b/arch/m68k/include/asm/module_mm.h
deleted file mode 100644 (file)
index 382d20a..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef _ASM_M68K_MODULE_H
-#define _ASM_M68K_MODULE_H
-
-struct mod_arch_specific {
-       struct m68k_fixup_info *fixup_start, *fixup_end;
-};
-
-#define MODULE_ARCH_INIT {                             \
-       .fixup_start            = __start_fixup,        \
-       .fixup_end              = __stop_fixup,         \
-}
-
-#define Elf_Shdr Elf32_Shdr
-#define Elf_Sym Elf32_Sym
-#define Elf_Ehdr Elf32_Ehdr
-
-
-enum m68k_fixup_type {
-       m68k_fixup_memoffset,
-       m68k_fixup_vnode_shift,
-};
-
-struct m68k_fixup_info {
-       enum m68k_fixup_type type;
-       void *addr;
-};
-
-#define m68k_fixup(type, addr)                 \
-       "       .section \".m68k_fixup\",\"aw\"\n"      \
-       "       .long " #type "," #addr "\n"    \
-       "       .previous\n"
-
-extern struct m68k_fixup_info __start_fixup[], __stop_fixup[];
-
-struct module;
-extern void module_fixup(struct module *mod, struct m68k_fixup_info *start,
-                        struct m68k_fixup_info *end);
-
-#endif /* _ASM_M68K_MODULE_H */
diff --git a/arch/m68k/include/asm/module_no.h b/arch/m68k/include/asm/module_no.h
deleted file mode 100644 (file)
index 2e45ab5..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef ASM_M68KNOMMU_MODULE_H
-#define ASM_M68KNOMMU_MODULE_H
-
-struct mod_arch_specific {
-};
-
-#define Elf_Shdr Elf32_Shdr
-#define Elf_Sym Elf32_Sym
-#define Elf_Ehdr Elf32_Ehdr
-
-#endif /* ASM_M68KNOMMU_MODULE_H */
diff --git a/arch/m68k/include/asm/page_offset.h b/arch/m68k/include/asm/page_offset.h
index 66455c849fbbc7fed7f3aa948e90917a73ed5d2f..1780152d81dace541507110a04e4065f49fb3cdc 100644 (file)
@@ -1,5 +1,11 @@
-#ifdef __uClinux__
-#include "page_offset_no.h"
+/* This handles the memory map.. */
+
+#ifdef CONFIG_MMU
+#ifndef CONFIG_SUN3
+#define PAGE_OFFSET_RAW                0x00000000
 #else
-#include "page_offset_mm.h"
+#define PAGE_OFFSET_RAW                0x0E000000
+#endif
+#else
+#define        PAGE_OFFSET_RAW         CONFIG_RAMBASE
 #endif
diff --git a/arch/m68k/include/asm/page_offset_mm.h b/arch/m68k/include/asm/page_offset_mm.h
deleted file mode 100644 (file)
index 1cbdb7f..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-
-/* This handles the memory map.. */
-#ifndef CONFIG_SUN3
-#define PAGE_OFFSET_RAW                0x00000000
-#else
-#define PAGE_OFFSET_RAW                0x0E000000
-#endif
-
diff --git a/arch/m68k/include/asm/page_offset_no.h b/arch/m68k/include/asm/page_offset_no.h
deleted file mode 100644 (file)
index d4e73e0..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-
-
-/* This handles the memory map.. */
-#define        PAGE_OFFSET_RAW         CONFIG_RAMBASE
-
diff --git a/arch/m68k/include/asm/pci.h b/arch/m68k/include/asm/pci.h
index dbea95373080d26236121b8c498fe89d0028e620..4ad0aea48ab4e9d2b5c96a278d09c2f3eadb3ce5 100644 (file)
@@ -1,5 +1,12 @@
-#ifdef __uClinux__
-#include "pci_no.h"
-#else
-#include "pci_mm.h"
-#endif
+#ifndef _ASM_M68K_PCI_H
+#define _ASM_M68K_PCI_H
+
+#include <asm-generic/pci-dma-compat.h>
+
+/* The PCI address space does equal the physical memory
+ * address space.  The networking and block device layers use
+ * this boolean for bounce buffer decisions.
+ */
+#define PCI_DMA_BUS_IS_PHYS    (1)
+
+#endif /* _ASM_M68K_PCI_H */
diff --git a/arch/m68k/include/asm/pci_mm.h b/arch/m68k/include/asm/pci_mm.h
deleted file mode 100644 (file)
index 4ad0aea..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ASM_M68K_PCI_H
-#define _ASM_M68K_PCI_H
-
-#include <asm-generic/pci-dma-compat.h>
-
-/* The PCI address space does equal the physical memory
- * address space.  The networking and block device layers use
- * this boolean for bounce buffer decisions.
- */
-#define PCI_DMA_BUS_IS_PHYS    (1)
-
-#endif /* _ASM_M68K_PCI_H */
diff --git a/arch/m68k/include/asm/pci_no.h b/arch/m68k/include/asm/pci_no.h
deleted file mode 100644 (file)
index 9abbc03..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef M68KNOMMU_PCI_H
-#define        M68KNOMMU_PCI_H
-
-#include <asm/pci_mm.h>
-
-#ifdef CONFIG_COMEMPCI
-/*
- *     These are pretty much arbitary with the CoMEM implementation.
- *     We have the whole address space to ourselves.
- */
-#define PCIBIOS_MIN_IO         0x100
-#define PCIBIOS_MIN_MEM                0x00010000
-
-#define pcibios_scan_all_fns(a, b)     0
-
-/*
- * Return whether the given PCI device DMA address mask can
- * be supported properly.  For example, if your device can
- * only drive the low 24-bits during PCI bus mastering, then
- * you would pass 0x00ffffff as the mask to this function.
- */
-static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
-{
-       return 1;
-}
-
-#endif /* CONFIG_COMEMPCI */
-
-#endif /* M68KNOMMU_PCI_H */
diff --git a/arch/m68k/include/asm/pgalloc.h b/arch/m68k/include/asm/pgalloc.h
index 059cb73e78fc456006646659fcc076cca916f972..c294aad8a9000bd9d891d5a0a35b9987e5ed9c6b 100644 (file)
@@ -1,5 +1,19 @@
-#ifdef __uClinux__
-#include "pgalloc_no.h"
+#ifndef M68K_PGALLOC_H
+#define M68K_PGALLOC_H
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_MMU
+#include <asm/virtconvert.h>
+#ifdef CONFIG_SUN3
+#include <asm/sun3_pgalloc.h>
 #else
-#include "pgalloc_mm.h"
+#include <asm/motorola_pgalloc.h>
 #endif
+
+extern void m68k_setup_node(int node);
+#endif
+
+#endif /* M68K_PGALLOC_H */
diff --git a/arch/m68k/include/asm/pgalloc_mm.h b/arch/m68k/include/asm/pgalloc_mm.h
deleted file mode 100644 (file)
index 4cb1a57..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-
-#ifndef M68K_PGALLOC_H
-#define M68K_PGALLOC_H
-
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <asm/setup.h>
-#include <asm/virtconvert.h>
-
-
-#ifdef CONFIG_SUN3
-#include <asm/sun3_pgalloc.h>
-#else
-#include <asm/motorola_pgalloc.h>
-#endif
-
-extern void m68k_setup_node(int node);
-
-#endif /* M68K_PGALLOC_H */
diff --git a/arch/m68k/include/asm/pgalloc_no.h b/arch/m68k/include/asm/pgalloc_no.h
deleted file mode 100644 (file)
index d6352f6..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _M68KNOMMU_PGALLOC_H
-#define _M68KNOMMU_PGALLOC_H
-
-#include <asm/setup.h>
-
-#define check_pgt_cache()      do { } while (0)
-
-#endif /* _M68KNOMMU_PGALLOC_H */
diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h
index 46251016e8212bdfef68b594e6ad1cf0e939248d..bf86b29fe64a2b025fe1890da80d90270e730912 100644 (file)
@@ -67,4 +67,6 @@ extern unsigned int kobjsize(const void *objp);
 
 #include <asm-generic/pgtable.h>
 
+#define check_pgt_cache()      do { } while (0)
+
 #endif /* _M68KNOMMU_PGTABLE_H */
diff --git a/arch/m68k/include/asm/rtc.h b/arch/m68k/include/asm/rtc.h
index 5d3e038598441c58b459c37e30aa1e6b0a09ccc5..a4d08ea122ee3f310cd3f7caaf3205aa6b524fc1 100644 (file)
@@ -36,13 +36,16 @@ static inline unsigned int get_rtc_time(struct rtc_time *time)
         * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated
         * by the RTC when initially set to a non-zero value.
         */
-       mach_hwclk(0, time);
+       if (mach_hwclk)
+               mach_hwclk(0, time);
        return RTC_24H;
 }
 
 static inline int set_rtc_time(struct rtc_time *time)
 {
-       return mach_hwclk(1, time);
+       if (mach_hwclk)
+               return mach_hwclk(1, time);
+       return -EINVAL;
 }
 
 static inline unsigned int get_rtc_ss(void)
diff --git a/arch/m68k/include/asm/scatterlist.h b/arch/m68k/include/asm/scatterlist.h
index b7e528636252ec488d43efb2c5684a86ad25e1b4..e27ad902b1cff9bb26792f3fe99ee96a45fa8bc5 100644 (file)
@@ -1,5 +1,23 @@
-#ifdef __uClinux__
-#include "scatterlist_no.h"
-#else
-#include "scatterlist_mm.h"
+#ifndef _M68K_SCATTERLIST_H
+#define _M68K_SCATTERLIST_H
+
+#include <linux/types.h>
+
+struct scatterlist {
+#ifdef CONFIG_DEBUG_SG
+       unsigned long sg_magic;
 #endif
+       unsigned long page_link;
+       unsigned int offset;
+       unsigned int length;
+
+       dma_addr_t dma_address; /* A place to hang host-specific addresses at. */
+};
+
+/* This is bogus and should go away. */
+#define ISA_DMA_THRESHOLD (0x00ffffff)
+
+#define sg_dma_address(sg)     ((sg)->dma_address)
+#define sg_dma_len(sg)         ((sg)->length)
+
+#endif /* !(_M68K_SCATTERLIST_H) */
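
Note: sg_dma_address() and sg_dma_len() above are what a driver reads back after mapping a scatterlist for DMA. A minimal sketch of the usual pattern, assuming <linux/scatterlist.h> and a hypothetical setup_dma_segment() helper (illustrative only):

	/* Sketch: map a scatterlist and program one DMA segment per entry. */
	struct scatterlist *sg;
	int i, nents = dma_map_sg(dev, sgl, count, DMA_TO_DEVICE);

	for_each_sg(sgl, sg, nents, i)
		setup_dma_segment(sg_dma_address(sg), sg_dma_len(sg));
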
diff --git a/arch/m68k/include/asm/scatterlist_mm.h b/arch/m68k/include/asm/scatterlist_mm.h
deleted file mode 100644 (file)
index d3a7a0e..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _M68K_SCATTERLIST_H
-#define _M68K_SCATTERLIST_H
-
-#include <linux/types.h>
-
-struct scatterlist {
-#ifdef CONFIG_DEBUG_SG
-       unsigned long sg_magic;
-#endif
-       unsigned long page_link;
-       unsigned int offset;
-       unsigned int length;
-
-       __u32 dma_address;      /* A place to hang host-specific addresses at. */
-};
-
-/* This is bogus and should go away. */
-#define ISA_DMA_THRESHOLD (0x00ffffff)
-
-#define sg_dma_address(sg)     ((sg)->dma_address)
-#define sg_dma_len(sg)         ((sg)->length)
-
-#endif /* !(_M68K_SCATTERLIST_H) */
diff --git a/arch/m68k/include/asm/scatterlist_no.h b/arch/m68k/include/asm/scatterlist_no.h
deleted file mode 100644 (file)
index afc4788..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _M68KNOMMU_SCATTERLIST_H
-#define _M68KNOMMU_SCATTERLIST_H
-
-#include <linux/mm.h>
-#include <asm/types.h>
-
-struct scatterlist {
-#ifdef CONFIG_DEBUG_SG
-       unsigned long   sg_magic;
-#endif
-       unsigned long   page_link;
-       unsigned int    offset;
-       dma_addr_t      dma_address;
-       unsigned int    length;
-};
-
-#define sg_dma_address(sg)      ((sg)->dma_address)
-#define sg_dma_len(sg)          ((sg)->length)
-
-#define ISA_DMA_THRESHOLD      (0xffffffff)
-
-#endif /* !(_M68KNOMMU_SCATTERLIST_H) */
diff --git a/arch/m68k/include/asm/segment.h b/arch/m68k/include/asm/segment.h
index 82583bc004bd6a3fe5f40ca1a7ba58e46130bd3a..ee959219fdfe0fb1698e456344eeedfa39271ca2 100644 (file)
@@ -1,5 +1,63 @@
-#ifdef __uClinux__
-#include "segment_no.h"
+#ifndef _M68K_SEGMENT_H
+#define _M68K_SEGMENT_H
+
+/* define constants */
+/* Address spaces (FC0-FC2) */
+#define USER_DATA     (1)
+#ifndef __USER_DS
+#define __USER_DS     (USER_DATA)
+#endif
+#define USER_PROGRAM  (2)
+#define SUPER_DATA    (5)
+#ifndef __KERNEL_DS
+#define __KERNEL_DS   (SUPER_DATA)
+#endif
+#define SUPER_PROGRAM (6)
+#define CPU_SPACE     (7)
+
+#ifndef __ASSEMBLY__
+
+typedef struct {
+       unsigned long seg;
+} mm_segment_t;
+
+#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+#define USER_DS                MAKE_MM_SEG(__USER_DS)
+#define KERNEL_DS      MAKE_MM_SEG(__KERNEL_DS)
+
+/*
+ * Get/set the SFC/DFC registers for MOVES instructions
+ */
+
+static inline mm_segment_t get_fs(void)
+{
+#ifdef CONFIG_MMU
+       mm_segment_t _v;
+       __asm__ ("movec %/dfc,%0":"=r" (_v.seg):);
+
+       return _v;
 #else
-#include "segment_mm.h"
+       return USER_DS;
+#endif
+}
+
+static inline mm_segment_t get_ds(void)
+{
+    /* return the supervisor data space code */
+    return KERNEL_DS;
+}
+
+static inline void set_fs(mm_segment_t val)
+{
+#ifdef CONFIG_MMU
+       __asm__ __volatile__ ("movec %0,%/sfc\n\t"
+                             "movec %0,%/dfc\n\t"
+                             : /* no outputs */ : "r" (val.seg) : "memory");
 #endif
+}
+
+#define segment_eq(a,b)        ((a).seg == (b).seg)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _M68K_SEGMENT_H */
diff --git a/arch/m68k/include/asm/segment_mm.h b/arch/m68k/include/asm/segment_mm.h
deleted file mode 100644 (file)
index 7b0b2d3..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef _M68K_SEGMENT_H
-#define _M68K_SEGMENT_H
-
-/* define constants */
-/* Address spaces (FC0-FC2) */
-#define USER_DATA     (1)
-#ifndef __USER_DS
-#define __USER_DS     (USER_DATA)
-#endif
-#define USER_PROGRAM  (2)
-#define SUPER_DATA    (5)
-#ifndef __KERNEL_DS
-#define __KERNEL_DS   (SUPER_DATA)
-#endif
-#define SUPER_PROGRAM (6)
-#define CPU_SPACE     (7)
-
-#ifndef __ASSEMBLY__
-
-typedef struct {
-       unsigned long seg;
-} mm_segment_t;
-
-#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-#define USER_DS                MAKE_MM_SEG(__USER_DS)
-#define KERNEL_DS      MAKE_MM_SEG(__KERNEL_DS)
-
-/*
- * Get/set the SFC/DFC registers for MOVES instructions
- */
-
-static inline mm_segment_t get_fs(void)
-{
-       mm_segment_t _v;
-       __asm__ ("movec %/dfc,%0":"=r" (_v.seg):);
-
-       return _v;
-}
-
-static inline mm_segment_t get_ds(void)
-{
-    /* return the supervisor data space code */
-    return KERNEL_DS;
-}
-
-static inline void set_fs(mm_segment_t val)
-{
-       __asm__ __volatile__ ("movec %0,%/sfc\n\t"
-                             "movec %0,%/dfc\n\t"
-                             : /* no outputs */ : "r" (val.seg) : "memory");
-}
-
-#define segment_eq(a,b)        ((a).seg == (b).seg)
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _M68K_SEGMENT_H */
diff --git a/arch/m68k/include/asm/segment_no.h b/arch/m68k/include/asm/segment_no.h
deleted file mode 100644 (file)
index 42318eb..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef _M68K_SEGMENT_H
-#define _M68K_SEGMENT_H
-
-/* define constants */
-/* Address spaces (FC0-FC2) */
-#define USER_DATA     (1)
-#ifndef __USER_DS
-#define __USER_DS     (USER_DATA)
-#endif
-#define USER_PROGRAM  (2)
-#define SUPER_DATA    (5)
-#ifndef __KERNEL_DS
-#define __KERNEL_DS   (SUPER_DATA)
-#endif
-#define SUPER_PROGRAM (6)
-#define CPU_SPACE     (7)
-
-#ifndef __ASSEMBLY__
-
-typedef struct {
-       unsigned long seg;
-} mm_segment_t;
-
-#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-#define USER_DS                MAKE_MM_SEG(__USER_DS)
-#define KERNEL_DS      MAKE_MM_SEG(__KERNEL_DS)
-
-/*
- * Get/set the SFC/DFC registers for MOVES instructions
- */
-
-static inline mm_segment_t get_fs(void)
-{
-    return USER_DS;
-}
-
-static inline mm_segment_t get_ds(void)
-{
-    /* return the supervisor data space code */
-    return KERNEL_DS;
-}
-
-static inline void set_fs(mm_segment_t val)
-{
-}
-
-#define segment_eq(a,b)        ((a).seg == (b).seg)
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _M68K_SEGMENT_H */
diff --git a/arch/m68k/include/asm/timex.h b/arch/m68k/include/asm/timex.h
index 719762980578daffdaf4dfe67156f59eef4fbb78..b87f2f278f673feda6270d9bf36ce70fe6c126e1 100644 (file)
@@ -1,5 +1,18 @@
-#ifdef __uClinux__
-#include "timex_no.h"
-#else
-#include "timex_mm.h"
+/*
+ * linux/include/asm-m68k/timex.h
+ *
+ * m68k architecture timex specifications
+ */
+#ifndef _ASMm68k_TIMEX_H
+#define _ASMm68k_TIMEX_H
+
+#define CLOCK_TICK_RATE        1193180 /* Underlying HZ */
+
+typedef unsigned long cycles_t;
+
+static inline cycles_t get_cycles(void)
+{
+       return 0;
+}
+
 #endif
diff --git a/arch/m68k/include/asm/timex_mm.h b/arch/m68k/include/asm/timex_mm.h
deleted file mode 100644 (file)
index b87f2f2..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * linux/include/asm-m68k/timex.h
- *
- * m68k architecture timex specifications
- */
-#ifndef _ASMm68k_TIMEX_H
-#define _ASMm68k_TIMEX_H
-
-#define CLOCK_TICK_RATE        1193180 /* Underlying HZ */
-
-typedef unsigned long cycles_t;
-
-static inline cycles_t get_cycles(void)
-{
-       return 0;
-}
-
-#endif
diff --git a/arch/m68k/include/asm/timex_no.h b/arch/m68k/include/asm/timex_no.h
deleted file mode 100644 (file)
index 109050f..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * linux/include/asm-m68knommu/timex.h
- *
- * m68knommu architecture timex specifications
- */
-#ifndef _ASM_M68KNOMMU_TIMEX_H
-#define _ASM_M68KNOMMU_TIMEX_H
-
-#ifdef CONFIG_COLDFIRE
-#include <asm/coldfire.h>
-#define CLOCK_TICK_RATE        MCF_CLK
-#else
-#define CLOCK_TICK_RATE        1193180 /* Underlying HZ */
-#endif
-
-typedef unsigned long cycles_t;
-
-static inline cycles_t get_cycles(void)
-{
-       return 0;
-}
-
-#endif
diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h
index b6f93b30951e0faa799a78aa0694f6b489a59c6e..a6b4ed4fc90faf9acdb31262fca3623f1f09cc40 100644 (file)
@@ -1,5 +1,267 @@
-#ifdef __uClinux__
-#include "tlbflush_no.h"
+#ifndef _M68K_TLBFLUSH_H
+#define _M68K_TLBFLUSH_H
+
+#ifdef CONFIG_MMU
+#ifndef CONFIG_SUN3
+
+#include <asm/current.h>
+
+static inline void flush_tlb_kernel_page(void *addr)
+{
+       if (CPU_IS_040_OR_060) {
+               mm_segment_t old_fs = get_fs();
+               set_fs(KERNEL_DS);
+               __asm__ __volatile__(".chip 68040\n\t"
+                                    "pflush (%0)\n\t"
+                                    ".chip 68k"
+                                    : : "a" (addr));
+               set_fs(old_fs);
+       } else if (CPU_IS_020_OR_030)
+               __asm__ __volatile__("pflush #4,#4,(%0)" : : "a" (addr));
+}
+
+/*
+ * flush all user-space atc entries.
+ */
+static inline void __flush_tlb(void)
+{
+       if (CPU_IS_040_OR_060)
+               __asm__ __volatile__(".chip 68040\n\t"
+                                    "pflushan\n\t"
+                                    ".chip 68k");
+       else if (CPU_IS_020_OR_030)
+               __asm__ __volatile__("pflush #0,#4");
+}
+
+static inline void __flush_tlb040_one(unsigned long addr)
+{
+       __asm__ __volatile__(".chip 68040\n\t"
+                            "pflush (%0)\n\t"
+                            ".chip 68k"
+                            : : "a" (addr));
+}
+
+static inline void __flush_tlb_one(unsigned long addr)
+{
+       if (CPU_IS_040_OR_060)
+               __flush_tlb040_one(addr);
+       else if (CPU_IS_020_OR_030)
+               __asm__ __volatile__("pflush #0,#4,(%0)" : : "a" (addr));
+}
+
+#define flush_tlb() __flush_tlb()
+
+/*
+ * flush all atc entries (both kernel and user-space entries).
+ */
+static inline void flush_tlb_all(void)
+{
+       if (CPU_IS_040_OR_060)
+               __asm__ __volatile__(".chip 68040\n\t"
+                                    "pflusha\n\t"
+                                    ".chip 68k");
+       else if (CPU_IS_020_OR_030)
+               __asm__ __volatile__("pflusha");
+}
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+       if (mm == current->active_mm)
+               __flush_tlb();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
+{
+       if (vma->vm_mm == current->active_mm) {
+               mm_segment_t old_fs = get_fs();
+               set_fs(USER_DS);
+               __flush_tlb_one(addr);
+               set_fs(old_fs);
+       }
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end)
+{
+       if (vma->vm_mm == current->active_mm)
+               __flush_tlb();
+}
+
+static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+       flush_tlb_all();
+}
+
 #else
-#include "tlbflush_mm.h"
+
+
+/* Reserved PMEGs. */
+extern char sun3_reserved_pmeg[SUN3_PMEGS_NUM];
+extern unsigned long pmeg_vaddr[SUN3_PMEGS_NUM];
+extern unsigned char pmeg_alloc[SUN3_PMEGS_NUM];
+extern unsigned char pmeg_ctx[SUN3_PMEGS_NUM];
+
+/* Flush all userspace mappings one by one...  (why no flush command,
+   sun?) */
+static inline void flush_tlb_all(void)
+{
+       unsigned long addr;
+       unsigned char ctx, oldctx;
+
+       oldctx = sun3_get_context();
+       for(addr = 0x00000000; addr < TASK_SIZE; addr += SUN3_PMEG_SIZE) {
+              for(ctx = 0; ctx < 8; ctx++) {
+                      sun3_put_context(ctx);
+                      sun3_put_segmap(addr, SUN3_INVALID_PMEG);
+              }
+       }
+
+       sun3_put_context(oldctx);
+       /* erase all of the userspace pmeg maps, we've clobbered them
+         all anyway */
+       for(addr = 0; addr < SUN3_INVALID_PMEG; addr++) {
+              if(pmeg_alloc[addr] == 1) {
+                      pmeg_alloc[addr] = 0;
+                      pmeg_ctx[addr] = 0;
+                      pmeg_vaddr[addr] = 0;
+              }
+       }
+
+}
+
+/* Clear user TLB entries within the context named in mm */
+static inline void flush_tlb_mm (struct mm_struct *mm)
+{
+     unsigned char oldctx;
+     unsigned char seg;
+     unsigned long i;
+
+     oldctx = sun3_get_context();
+     sun3_put_context(mm->context);
+
+     for(i = 0; i < TASK_SIZE; i += SUN3_PMEG_SIZE) {
+            seg = sun3_get_segmap(i);
+            if(seg == SUN3_INVALID_PMEG)
+                    continue;
+
+            sun3_put_segmap(i, SUN3_INVALID_PMEG);
+            pmeg_alloc[seg] = 0;
+            pmeg_ctx[seg] = 0;
+            pmeg_vaddr[seg] = 0;
+     }
+
+     sun3_put_context(oldctx);
+
+}
+
+/* Flush a single TLB page. In this case, we're limited to flushing a
+   single PMEG */
+static inline void flush_tlb_page (struct vm_area_struct *vma,
+                                  unsigned long addr)
+{
+       unsigned char oldctx;
+       unsigned char i;
+
+       oldctx = sun3_get_context();
+       sun3_put_context(vma->vm_mm->context);
+       addr &= ~SUN3_PMEG_MASK;
+       if((i = sun3_get_segmap(addr)) != SUN3_INVALID_PMEG)
+       {
+               pmeg_alloc[i] = 0;
+               pmeg_ctx[i] = 0;
+               pmeg_vaddr[i] = 0;
+               sun3_put_segmap (addr,  SUN3_INVALID_PMEG);
+       }
+       sun3_put_context(oldctx);
+
+}
+/* Flush a range of pages from TLB. */
+
+static inline void flush_tlb_range (struct vm_area_struct *vma,
+                     unsigned long start, unsigned long end)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       unsigned char seg, oldctx;
+
+       start &= ~SUN3_PMEG_MASK;
+
+       oldctx = sun3_get_context();
+       sun3_put_context(mm->context);
+
+       while(start < end)
+       {
+               if((seg = sun3_get_segmap(start)) == SUN3_INVALID_PMEG)
+                    goto next;
+               if(pmeg_ctx[seg] == mm->context) {
+                       pmeg_alloc[seg] = 0;
+                       pmeg_ctx[seg] = 0;
+                       pmeg_vaddr[seg] = 0;
+               }
+               sun3_put_segmap(start, SUN3_INVALID_PMEG);
+       next:
+               start += SUN3_PMEG_SIZE;
+       }
+}
+
+static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+       flush_tlb_all();
+}
+
+/* Flush kernel page from TLB. */
+static inline void flush_tlb_kernel_page (unsigned long addr)
+{
+       sun3_put_segmap (addr & ~(SUN3_PMEG_SIZE - 1), SUN3_INVALID_PMEG);
+}
+
 #endif
+
+#else /* !CONFIG_MMU */
+
+/*
+ * flush all user-space atc entries.
+ */
+static inline void __flush_tlb(void)
+{
+       BUG();
+}
+
+static inline void __flush_tlb_one(unsigned long addr)
+{
+       BUG();
+}
+
+#define flush_tlb() __flush_tlb()
+
+/*
+ * flush all atc entries (both kernel and user-space entries).
+ */
+static inline void flush_tlb_all(void)
+{
+       BUG();
+}
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+       BUG();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
+{
+       BUG();
+}
+
+static inline void flush_tlb_range(struct mm_struct *mm,
+                                  unsigned long start, unsigned long end)
+{
+       BUG();
+}
+
+static inline void flush_tlb_kernel_page(unsigned long addr)
+{
+       BUG();
+}
+
+#endif /* CONFIG_MMU */
+
+#endif /* _M68K_TLBFLUSH_H */
diff --git a/arch/m68k/include/asm/tlbflush_mm.h b/arch/m68k/include/asm/tlbflush_mm.h
deleted file mode 100644 (file)
index acb6bf2..0000000
+++ /dev/null
@@ -1,219 +0,0 @@
-#ifndef _M68K_TLBFLUSH_H
-#define _M68K_TLBFLUSH_H
-
-
-#ifndef CONFIG_SUN3
-
-#include <asm/current.h>
-
-static inline void flush_tlb_kernel_page(void *addr)
-{
-       if (CPU_IS_040_OR_060) {
-               mm_segment_t old_fs = get_fs();
-               set_fs(KERNEL_DS);
-               __asm__ __volatile__(".chip 68040\n\t"
-                                    "pflush (%0)\n\t"
-                                    ".chip 68k"
-                                    : : "a" (addr));
-               set_fs(old_fs);
-       } else if (CPU_IS_020_OR_030)
-               __asm__ __volatile__("pflush #4,#4,(%0)" : : "a" (addr));
-}
-
-/*
- * flush all user-space atc entries.
- */
-static inline void __flush_tlb(void)
-{
-       if (CPU_IS_040_OR_060)
-               __asm__ __volatile__(".chip 68040\n\t"
-                                    "pflushan\n\t"
-                                    ".chip 68k");
-       else if (CPU_IS_020_OR_030)
-               __asm__ __volatile__("pflush #0,#4");
-}
-
-static inline void __flush_tlb040_one(unsigned long addr)
-{
-       __asm__ __volatile__(".chip 68040\n\t"
-                            "pflush (%0)\n\t"
-                            ".chip 68k"
-                            : : "a" (addr));
-}
-
-static inline void __flush_tlb_one(unsigned long addr)
-{
-       if (CPU_IS_040_OR_060)
-               __flush_tlb040_one(addr);
-       else if (CPU_IS_020_OR_030)
-               __asm__ __volatile__("pflush #0,#4,(%0)" : : "a" (addr));
-}
-
-#define flush_tlb() __flush_tlb()
-
-/*
- * flush all atc entries (both kernel and user-space entries).
- */
-static inline void flush_tlb_all(void)
-{
-       if (CPU_IS_040_OR_060)
-               __asm__ __volatile__(".chip 68040\n\t"
-                                    "pflusha\n\t"
-                                    ".chip 68k");
-       else if (CPU_IS_020_OR_030)
-               __asm__ __volatile__("pflusha");
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
-       if (mm == current->active_mm)
-               __flush_tlb();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
-{
-       if (vma->vm_mm == current->active_mm) {
-               mm_segment_t old_fs = get_fs();
-               set_fs(USER_DS);
-               __flush_tlb_one(addr);
-               set_fs(old_fs);
-       }
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-                                  unsigned long start, unsigned long end)
-{
-       if (vma->vm_mm == current->active_mm)
-               __flush_tlb();
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
-       flush_tlb_all();
-}
-
-#else
-
-
-/* Reserved PMEGs. */
-extern char sun3_reserved_pmeg[SUN3_PMEGS_NUM];
-extern unsigned long pmeg_vaddr[SUN3_PMEGS_NUM];
-extern unsigned char pmeg_alloc[SUN3_PMEGS_NUM];
-extern unsigned char pmeg_ctx[SUN3_PMEGS_NUM];
-
-/* Flush all userspace mappings one by one...  (why no flush command,
-   sun?) */
-static inline void flush_tlb_all(void)
-{
-       unsigned long addr;
-       unsigned char ctx, oldctx;
-
-       oldctx = sun3_get_context();
-       for(addr = 0x00000000; addr < TASK_SIZE; addr += SUN3_PMEG_SIZE) {
-              for(ctx = 0; ctx < 8; ctx++) {
-                      sun3_put_context(ctx);
-                      sun3_put_segmap(addr, SUN3_INVALID_PMEG);
-              }
-       }
-
-       sun3_put_context(oldctx);
-       /* erase all of the userspace pmeg maps, we've clobbered them
-         all anyway */
-       for(addr = 0; addr < SUN3_INVALID_PMEG; addr++) {
-              if(pmeg_alloc[addr] == 1) {
-                      pmeg_alloc[addr] = 0;
-                      pmeg_ctx[addr] = 0;
-                      pmeg_vaddr[addr] = 0;
-              }
-       }
-
-}
-
-/* Clear user TLB entries within the context named in mm */
-static inline void flush_tlb_mm (struct mm_struct *mm)
-{
-     unsigned char oldctx;
-     unsigned char seg;
-     unsigned long i;
-
-     oldctx = sun3_get_context();
-     sun3_put_context(mm->context);
-
-     for(i = 0; i < TASK_SIZE; i += SUN3_PMEG_SIZE) {
-            seg = sun3_get_segmap(i);
-            if(seg == SUN3_INVALID_PMEG)
-                    continue;
-
-            sun3_put_segmap(i, SUN3_INVALID_PMEG);
-            pmeg_alloc[seg] = 0;
-            pmeg_ctx[seg] = 0;
-            pmeg_vaddr[seg] = 0;
-     }
-
-     sun3_put_context(oldctx);
-
-}
-
-/* Flush a single TLB page. In this case, we're limited to flushing a
-   single PMEG */
-static inline void flush_tlb_page (struct vm_area_struct *vma,
-                                  unsigned long addr)
-{
-       unsigned char oldctx;
-       unsigned char i;
-
-       oldctx = sun3_get_context();
-       sun3_put_context(vma->vm_mm->context);
-       addr &= ~SUN3_PMEG_MASK;
-       if((i = sun3_get_segmap(addr)) != SUN3_INVALID_PMEG)
-       {
-               pmeg_alloc[i] = 0;
-               pmeg_ctx[i] = 0;
-               pmeg_vaddr[i] = 0;
-               sun3_put_segmap (addr,  SUN3_INVALID_PMEG);
-       }
-       sun3_put_context(oldctx);
-
-}
-/* Flush a range of pages from TLB. */
-
-static inline void flush_tlb_range (struct vm_area_struct *vma,
-                     unsigned long start, unsigned long end)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned char seg, oldctx;
-
-       start &= ~SUN3_PMEG_MASK;
-
-       oldctx = sun3_get_context();
-       sun3_put_context(mm->context);
-
-       while(start < end)
-       {
-               if((seg = sun3_get_segmap(start)) == SUN3_INVALID_PMEG)
-                    goto next;
-               if(pmeg_ctx[seg] == mm->context) {
-                       pmeg_alloc[seg] = 0;
-                       pmeg_ctx[seg] = 0;
-                       pmeg_vaddr[seg] = 0;
-               }
-               sun3_put_segmap(start, SUN3_INVALID_PMEG);
-       next:
-               start += SUN3_PMEG_SIZE;
-       }
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-{
-       flush_tlb_all();
-}
-
-/* Flush kernel page from TLB. */
-static inline void flush_tlb_kernel_page (unsigned long addr)
-{
-       sun3_put_segmap (addr & ~(SUN3_PMEG_SIZE - 1), SUN3_INVALID_PMEG);
-}
-
-#endif
-
-#endif /* _M68K_TLBFLUSH_H */
diff --git a/arch/m68k/include/asm/tlbflush_no.h b/arch/m68k/include/asm/tlbflush_no.h
deleted file mode 100644 (file)
index a470cfb..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef _M68KNOMMU_TLBFLUSH_H
-#define _M68KNOMMU_TLBFLUSH_H
-
-/*
- * Copyright (C) 2000 Lineo, David McCullough <davidm@uclinux.org>
- * Copyright (C) 2000-2002, Greg Ungerer <gerg@snapgear.com>
- */
-
-#include <asm/setup.h>
-
-/*
- * flush all user-space atc entries.
- */
-static inline void __flush_tlb(void)
-{
-       BUG();
-}
-
-static inline void __flush_tlb_one(unsigned long addr)
-{
-       BUG();
-}
-
-#define flush_tlb() __flush_tlb()
-
-/*
- * flush all atc entries (both kernel and user-space entries).
- */
-static inline void flush_tlb_all(void)
-{
-       BUG();
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
-       BUG();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
-{
-       BUG();
-}
-
-static inline void flush_tlb_range(struct mm_struct *mm,
-                                  unsigned long start, unsigned long end)
-{
-       BUG();
-}
-
-static inline void flush_tlb_kernel_page(unsigned long addr)
-{
-       BUG();
-}
-
-#endif /* _M68KNOMMU_TLBFLUSH_H */
index b53cd160c0b33849d127a687f160d2cd6173d4be..e4e22669edc0669c55c1edc9a1947664b3c1d4bc 100644 (file)
@@ -1,5 +1,30 @@
-#ifdef __uClinux__
-#include "ucontext_no.h"
-#else
-#include "ucontext_mm.h"
+#ifndef _M68K_UCONTEXT_H
+#define _M68K_UCONTEXT_H
+
+typedef int greg_t;
+#define NGREG 18
+typedef greg_t gregset_t[NGREG];
+
+typedef struct fpregset {
+       int f_fpcntl[3];
+       int f_fpregs[8*3];
+} fpregset_t;
+
+struct mcontext {
+       int version;
+       gregset_t gregs;
+       fpregset_t fpregs;
+};
+
+#define MCONTEXT_VERSION 2
+
+struct ucontext {
+       unsigned long     uc_flags;
+       struct ucontext  *uc_link;
+       stack_t           uc_stack;
+       struct mcontext   uc_mcontext;
+       unsigned long     uc_filler[80];
+       sigset_t          uc_sigmask;   /* mask last for extensibility */
+};
+
 #endif
diff --git a/arch/m68k/include/asm/ucontext_mm.h b/arch/m68k/include/asm/ucontext_mm.h
deleted file mode 100644 (file)
index e4e2266..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _M68K_UCONTEXT_H
-#define _M68K_UCONTEXT_H
-
-typedef int greg_t;
-#define NGREG 18
-typedef greg_t gregset_t[NGREG];
-
-typedef struct fpregset {
-       int f_fpcntl[3];
-       int f_fpregs[8*3];
-} fpregset_t;
-
-struct mcontext {
-       int version;
-       gregset_t gregs;
-       fpregset_t fpregs;
-};
-
-#define MCONTEXT_VERSION 2
-
-struct ucontext {
-       unsigned long     uc_flags;
-       struct ucontext  *uc_link;
-       stack_t           uc_stack;
-       struct mcontext   uc_mcontext;
-       unsigned long     uc_filler[80];
-       sigset_t          uc_sigmask;   /* mask last for extensibility */
-};
-
-#endif
diff --git a/arch/m68k/include/asm/ucontext_no.h b/arch/m68k/include/asm/ucontext_no.h
deleted file mode 100644 (file)
index 713a27f..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef _M68KNOMMU_UCONTEXT_H
-#define _M68KNOMMU_UCONTEXT_H
-
-typedef int greg_t;
-#define NGREG 18
-typedef greg_t gregset_t[NGREG];
-
-typedef struct fpregset {
-       int f_pcr;
-       int f_psr;
-       int f_fpiaddr;
-       int f_fpregs[8][3];
-} fpregset_t;
-
-struct mcontext {
-       int version;
-       gregset_t gregs;
-       fpregset_t fpregs;
-};
-
-#define MCONTEXT_VERSION 2
-
-struct ucontext {
-       unsigned long     uc_flags;
-       struct ucontext  *uc_link;
-       stack_t           uc_stack;
-       struct mcontext   uc_mcontext;
-       unsigned long     uc_filler[80];
-       sigset_t          uc_sigmask;   /* mask last for extensibility */
-};
-
-#endif
index c640bba3bdf455f0d1830998198d7a74f2618ab4..019caa740c21122a9a39892f69831a060a549021 100644 (file)
@@ -1,5 +1,25 @@
-#ifdef __uClinux__
-#include "unaligned_no.h"
+#ifndef _ASM_M68K_UNALIGNED_H
+#define _ASM_M68K_UNALIGNED_H
+
+
+#ifdef CONFIG_COLDFIRE
+#include <linux/unaligned/be_struct.h>
+#include <linux/unaligned/le_byteshift.h>
+#include <linux/unaligned/generic.h>
+
+#define get_unaligned  __get_unaligned_be
+#define put_unaligned  __put_unaligned_be
+
 #else
-#include "unaligned_mm.h"
+/*
+ * The m68k can do unaligned accesses itself. 
+ */
+#include <linux/unaligned/access_ok.h>
+#include <linux/unaligned/generic.h>
+
+#define get_unaligned  __get_unaligned_be
+#define put_unaligned  __put_unaligned_be
+
 #endif
+
+#endif /* _ASM_M68K_UNALIGNED_H */
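A hedged usage sketch of the interface above (the helpers below are illustrative, not part of the patch): get_unaligned()/put_unaligned() let callers access multi-byte fields at arbitrary byte offsets, which matters on ColdFire where a plain misaligned load would trap.

	/* read/update a 32-bit field at an arbitrary offset in a raw buffer */
	static u32 read_field32(const u8 *buf, unsigned int off)
	{
		return get_unaligned((const u32 *)(buf + off));
	}

	static void write_field32(u8 *buf, unsigned int off, u32 val)
	{
		put_unaligned(val, (u32 *)(buf + off));
	}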
diff --git a/arch/m68k/include/asm/unaligned_mm.h b/arch/m68k/include/asm/unaligned_mm.h
deleted file mode 100644 (file)
index 77698f2..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _ASM_M68K_UNALIGNED_H
-#define _ASM_M68K_UNALIGNED_H
-
-/*
- * The m68k can do unaligned accesses itself.
- */
-#include <linux/unaligned/access_ok.h>
-#include <linux/unaligned/generic.h>
-
-#define get_unaligned  __get_unaligned_be
-#define put_unaligned  __put_unaligned_be
-
-#endif /* _ASM_M68K_UNALIGNED_H */
diff --git a/arch/m68k/include/asm/unaligned_no.h b/arch/m68k/include/asm/unaligned_no.h
deleted file mode 100644 (file)
index eb1ea4c..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _ASM_M68KNOMMU_UNALIGNED_H
-#define _ASM_M68KNOMMU_UNALIGNED_H
-
-
-#ifdef CONFIG_COLDFIRE
-#include <linux/unaligned/be_struct.h>
-#include <linux/unaligned/le_byteshift.h>
-#include <linux/unaligned/generic.h>
-
-#define get_unaligned  __get_unaligned_be
-#define put_unaligned  __put_unaligned_be
-
-#else
-/*
- * The m68k can do unaligned accesses itself. 
- */
-#include <linux/unaligned/access_ok.h>
-#include <linux/unaligned/generic.h>
-
-#define get_unaligned  __get_unaligned_be
-#define put_unaligned  __put_unaligned_be
-
-#endif
-
-#endif /* _ASM_M68KNOMMU_UNALIGNED_H */
index 7db41594d7b6b92d4beed91da8a6d847e16128af..54d980795fc45ceeaa021ad4fa98bd77f4d65508 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/rtc.h>
+#include <linux/platform_device.h>
 
 #include <asm/machdep.h>
 #include <asm/io.h>
@@ -159,3 +160,20 @@ int do_settimeofday(struct timespec *tv)
 }
 
 EXPORT_SYMBOL(do_settimeofday);
+
+
+static int __init rtc_init(void)
+{
+       struct platform_device *pdev;
+
+       if (!mach_hwclk)
+               return -ENODEV;
+
+       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
+
+       return 0;
+}
+
+module_init(rtc_init);
index fd0fb303d885c1f048a9ec130695c8c3a4b8c21f..ce404bc9ccbdff8acf6c2160719964e4a0cfd6cd 100644 (file)
@@ -88,18 +88,18 @@ export PLATFORM BOARD MODEL CPUCLASS
 #
 # Some CFLAG additions based on specific CPU type.
 #
-cflags-$(CONFIG_M5206)         := -m5200
-cflags-$(CONFIG_M5206e)                := -m5200
-cflags-$(CONFIG_M520x)         := -m5307
+cflags-$(CONFIG_M5206)         := $(call cc-option,-mcpu=5206,-m5200)
+cflags-$(CONFIG_M5206e)                := $(call cc-option,-m5206e,-m5200)
+cflags-$(CONFIG_M520x)         := $(call cc-option,-mcpu=5208,-m5200)
 cflags-$(CONFIG_M523x)         := $(call cc-option,-mcpu=523x,-m5307)
-cflags-$(CONFIG_M5249)         := -m5200
+cflags-$(CONFIG_M5249)         := $(call cc-option,-mcpu=5249,-m5200)
 cflags-$(CONFIG_M5271)         := $(call cc-option,-mcpu=5271,-m5307)
-cflags-$(CONFIG_M5272)         := -m5307
+cflags-$(CONFIG_M5272)         := $(call cc-option,-mcpu=5271,-m5200)
 cflags-$(CONFIG_M5275)         := $(call cc-option,-mcpu=5275,-m5307)
 cflags-$(CONFIG_M528x)         := $(call cc-option,-m528x,-m5307)
-cflags-$(CONFIG_M5307)         := -m5307
+cflags-$(CONFIG_M5307)         := $(call cc-option,-m5307,-m5200)
 cflags-$(CONFIG_M532x)         := $(call cc-option,-mcpu=532x,-m5307)
-cflags-$(CONFIG_M5407)         := -m5200
+cflags-$(CONFIG_M5407)         := $(call cc-option,-m5407,-m5200)
 cflags-$(CONFIG_M68328)                := -m68000
 cflags-$(CONFIG_M68EZ328)      := -m68000
 cflags-$(CONFIG_M68VZ328)      := -m68000
index e10eafc52789e5f8dc3ca428449ff8fd0adb0b1e..93612580663851055d058e89321c791d6dfe4765 100644 (file)
@@ -9,10 +9,11 @@
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/device.h>
+#include <linux/dma-mapping.h>
 #include <asm/io.h>
 
 void *dma_alloc_coherent(struct device *dev, size_t size,
-                          dma_addr_t *dma_handle, int gfp)
+                          dma_addr_t *dma_handle, gfp_t gfp)
 {
        void *ret;
        /* ignore region specifiers */
@@ -34,3 +35,8 @@ void dma_free_coherent(struct device *dev, size_t size,
 {
        free_pages((unsigned long)vaddr, get_order(size));
 }
+
+void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+}
+
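A hedged caller-side sketch showing the corrected gfp_t-typed argument in use ("dev" and the 512-byte size are illustrative only):

	dma_addr_t dma_handle;
	void *buf = dma_alloc_coherent(dev, 512, &dma_handle, GFP_KERNEL);

	if (buf) {
		/* ... fill descriptors, hand dma_handle to the device ... */
		dma_free_coherent(dev, 512, buf, dma_handle);
	}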
index bba1bb48a21f298d1b85ab5e66715d4674e60ad7..56e0f4c55a67bb32b9d9bdf4b147519de774f68d 100644 (file)
@@ -23,7 +23,7 @@ asmlinkage void do_IRQ(int irq, struct pt_regs *regs)
        struct pt_regs *oldregs = set_irq_regs(regs);
 
        irq_enter();
-       __do_IRQ(irq);
+       generic_handle_irq(irq);
        irq_exit();
 
        set_irq_regs(oldregs);
index 3bf249c53e414834114bc845dde4b63afe8e2add..7befc0c357e0850a1635032c26c1632bd580c38c 100644 (file)
@@ -111,11 +111,7 @@ void __init paging_init(void)
        {
                unsigned long zones_size[MAX_NR_ZONES] = {0, };
 
-               zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT;
-               zones_size[ZONE_NORMAL] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT;
-#ifdef CONFIG_HIGHMEM
-               zones_size[ZONE_HIGHMEM] = 0;
-#endif
+               zones_size[ZONE_DMA] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT;
                free_area_init(zones_size);
        }
 }
index d299f7b8768a9981640733e60b51b5bb106a3bfb..9eab19d01eb1098f615694a577395fe77a693a31 100644 (file)
@@ -32,7 +32,8 @@ static struct mcf_platform_uart m5249_uart_platform[] = {
        {
                .mapbase        = MCF_MBAR + MCFUART_BASE2,
                .irq            = 74,
-       }
+       },
+       { },
 };
 
 static struct platform_device m5249_uart = {
@@ -50,12 +51,12 @@ static struct platform_device *m5249_devices[] __initdata = {
 static void __init m5249_uart_init_line(int line, int irq)
 {
        if (line == 0) {
-               writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR);
-               writeb(irq, MCFUART_BASE1 + MCFUART_UIVR);
+               writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR);
+               writeb(irq, MCF_MBAR + MCFUART_BASE1 + MCFUART_UIVR);
                mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1);
        } else if (line == 1) {
-               writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR);
-               writeb(irq, MCFUART_BASE2 + MCFUART_UIVR);
+               writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR);
+               writeb(irq, MCF_MBAR + MCFUART_BASE2 + MCFUART_UIVR);
                mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2);
        }
 }
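The empty initialiser added to m5249_uart_platform[] above acts as an end-of-array sentinel. A hedged sketch of the kind of consumer loop that relies on it (the helper name is illustrative; the ColdFire serial driver is assumed to stop at the first entry with a zero mapbase):

	struct mcf_platform_uart *p;

	for (p = m5249_uart_platform; p->mapbase; p++)
		init_one_port(p);	/* terminates at the zeroed entry */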
index 724faf05852ae350ff827d09f76493d38fa65d76..44803bf70a6e0988868f3bd6a8815e1a35e2312c 100644 (file)
@@ -65,12 +65,12 @@ static struct platform_device *m5307_devices[] __initdata = {
 static void __init m5307_uart_init_line(int line, int irq)
 {
        if (line == 0) {
-               writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR);
-               writeb(irq, MCFUART_BASE1 + MCFUART_UIVR);
+               writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR);
+               writeb(irq, MCF_MBAR + MCFUART_BASE1 + MCFUART_UIVR);
                mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1);
        } else if (line == 1) {
-               writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR);
-               writeb(irq, MCFUART_BASE2 + MCFUART_UIVR);
+               writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR);
+               writeb(irq, MCF_MBAR + MCFUART_BASE2 + MCFUART_UIVR);
                mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2);
        }
 }
index 648b8b778211639fbf54ef9e7fae9d0e9e708e01..0ee8c1a200c87d5a4cad4e5e76421526fe947f1e 100644 (file)
@@ -56,12 +56,12 @@ static struct platform_device *m5407_devices[] __initdata = {
 static void __init m5407_uart_init_line(int line, int irq)
 {
        if (line == 0) {
-               writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR);
-               writeb(irq, MCFUART_BASE1 + MCFUART_UIVR);
+               writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR);
+               writeb(irq, MCF_MBAR + MCFUART_BASE1 + MCFUART_UIVR);
                mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1);
        } else if (line == 1) {
-               writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR);
-               writeb(irq, MCFUART_BASE2 + MCFUART_UIVR);
+               writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR);
+               writeb(irq, MCF_MBAR + MCFUART_BASE2 + MCFUART_UIVR);
                mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2);
        }
 }
index 4f416a91a829b38c9f130ecad667307ea0ef98f6..1bcb9372353fe8496966ce50f7a67cf2d32f80db 100644 (file)
@@ -14,7 +14,7 @@
 
 asflags-$(CONFIG_FULLDEBUG) := -DDEBUGGER_COMPATIBLE_CACHE=1
 
-obj-$(CONFIG_COLDFIRE) += dma.o entry.o vectors.o
+obj-$(CONFIG_COLDFIRE) += clk.o dma.o entry.o vectors.o
 obj-$(CONFIG_M5206)    += timers.o
 obj-$(CONFIG_M5206e)   += timers.o
 obj-$(CONFIG_M520x)    += pit.o
diff --git a/arch/m68knommu/platform/coldfire/clk.c b/arch/m68knommu/platform/coldfire/clk.c
new file mode 100644 (file)
index 0000000..7cdbf44
--- /dev/null
@@ -0,0 +1,40 @@
+/***************************************************************************/
+
+/*
+ *     clk.c -- general ColdFire CPU kernel clk handling
+ *
+ *     Copyright (C) 2009, Greg Ungerer (gerg@snapgear.com)
+ */
+
+/***************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/clk.h>
+#include <asm/coldfire.h>
+
+/***************************************************************************/
+
+struct clk *clk_get(struct device *dev, const char *id)
+{
+       return NULL;
+}
+
+int clk_enable(struct clk *clk)
+{
+       return 0;
+}
+
+void clk_disable(struct clk *clk)
+{
+}
+
+void clk_put(struct clk *clk)
+{
+}
+
+unsigned long clk_get_rate(struct clk *clk)
+{
+       return MCF_CLK;
+}
+
+/***************************************************************************/
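A hedged consumer-side sketch of the stub clock API above ("dev" and the "sys" clock name are illustrative): clk_get() returning NULL is accepted by these stubs, and clk_get_rate() always reports MCF_CLK, so generic drivers that expect the clk API can build and run on ColdFire.

	struct clk *clk = clk_get(dev, "sys");
	unsigned long rate;

	clk_enable(clk);
	rate = clk_get_rate(clk);	/* MCF_CLK on ColdFire */
	/* ... derive a baud-rate or timer divider from rate ... */
	clk_disable(clk);
	clk_put(clk);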
index aacf11d33723edc80aa3638ba5256e83d0c74bf2..9038f39d9d736320996f95e6bae5d10473895da0 100644 (file)
@@ -9,9 +9,13 @@ config PARISC
        def_bool y
        select HAVE_IDE
        select HAVE_OPROFILE
+       select HAVE_FUNCTION_TRACER if 64BIT
+       select HAVE_FUNCTION_GRAPH_TRACER if 64BIT
+       select HAVE_FUNCTION_TRACE_MCOUNT_TEST if 64BIT
        select RTC_CLASS
-       select RTC_DRV_PARISC
+       select RTC_DRV_GENERIC
        select INIT_ALL_POSSIBLE
+       select BUG
        help
          The PA-RISC microprocessor is designed by Hewlett-Packard and used
          in many of their workstations & servers (HP9000 700 and 800 series,
@@ -75,6 +79,9 @@ config GENERIC_HARDIRQS
 config GENERIC_IRQ_PROBE
        def_bool y
 
+config HAVE_LATENCYTOP_SUPPORT
+        def_bool y
+
 config IRQ_PER_CPU
        bool
        default y
@@ -83,6 +90,9 @@ config IRQ_PER_CPU
 config PM
        bool
 
+config STACKTRACE_SUPPORT
+       def_bool y
+
 config ISA_DMA_API
        bool
 
index 0d428278356dd2b526ff904b34d9f421dc35c5d1..da6f66901c92191c2069ba2dbd8144951584936b 100644 (file)
@@ -56,7 +56,9 @@ cflags-y      += -mdisable-fpregs
 
 # Without this, "ld -r" results in .text sections that are too big
 # (> 0x40000) for branches to reach stubs.
-cflags-y       += -ffunction-sections
+ifndef CONFIG_FUNCTION_TRACER
+  cflags-y     += -ffunction-sections
+endif
 
 # select which processor to optimise for
 cflags-$(CONFIG_PA7100)                += -march=1.1 -mschedule=7100
index edbfe25c5fc142475e26d7ffff051d5a61003731..ada3e5364d8254993f85d9586c51abb1278cb6bc 100644 (file)
@@ -25,7 +25,7 @@
  * Since "a" is usually an address, use one spinlock per cacheline.
  */
 #  define ATOMIC_HASH_SIZE 4
-#  define ATOMIC_HASH(a) (&(__atomic_hash[ (((unsigned long) a)/L1_CACHE_BYTES) & (ATOMIC_HASH_SIZE-1) ]))
+#  define ATOMIC_HASH(a) (&(__atomic_hash[ (((unsigned long) (a))/L1_CACHE_BYTES) & (ATOMIC_HASH_SIZE-1) ]))
 
 extern raw_spinlock_t __atomic_hash[ATOMIC_HASH_SIZE] __lock_aligned;
 
@@ -222,13 +222,13 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
 
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
-#define atomic_add(i,v)        ((void)(__atomic_add_return( ((int)i),(v))))
-#define atomic_sub(i,v)        ((void)(__atomic_add_return(-((int)i),(v))))
+#define atomic_add(i,v)        ((void)(__atomic_add_return( ((int)(i)),(v))))
+#define atomic_sub(i,v)        ((void)(__atomic_add_return(-((int)(i)),(v))))
 #define atomic_inc(v)  ((void)(__atomic_add_return(   1,(v))))
 #define atomic_dec(v)  ((void)(__atomic_add_return(  -1,(v))))
 
-#define atomic_add_return(i,v) (__atomic_add_return( ((int)i),(v)))
-#define atomic_sub_return(i,v) (__atomic_add_return(-((int)i),(v)))
+#define atomic_add_return(i,v) (__atomic_add_return( ((int)(i)),(v)))
+#define atomic_sub_return(i,v) (__atomic_add_return(-((int)(i)),(v)))
 #define atomic_inc_return(v)   (__atomic_add_return(   1,(v)))
 #define atomic_dec_return(v)   (__atomic_add_return(  -1,(v)))
 
@@ -289,13 +289,13 @@ atomic64_read(const atomic64_t *v)
        return v->counter;
 }
 
-#define atomic64_add(i,v)      ((void)(__atomic64_add_return( ((s64)i),(v))))
-#define atomic64_sub(i,v)      ((void)(__atomic64_add_return(-((s64)i),(v))))
+#define atomic64_add(i,v)      ((void)(__atomic64_add_return( ((s64)(i)),(v))))
+#define atomic64_sub(i,v)      ((void)(__atomic64_add_return(-((s64)(i)),(v))))
 #define atomic64_inc(v)                ((void)(__atomic64_add_return(   1,(v))))
 #define atomic64_dec(v)                ((void)(__atomic64_add_return(  -1,(v))))
 
-#define atomic64_add_return(i,v)       (__atomic64_add_return( ((s64)i),(v)))
-#define atomic64_sub_return(i,v)       (__atomic64_add_return(-((s64)i),(v)))
+#define atomic64_add_return(i,v)       (__atomic64_add_return( ((s64)(i)),(v)))
+#define atomic64_sub_return(i,v)       (__atomic64_add_return(-((s64)(i)),(v)))
 #define atomic64_inc_return(v)         (__atomic64_add_return(   1,(v)))
 #define atomic64_dec_return(v)         (__atomic64_add_return(  -1,(v)))
 
index b7ca6dc7fddc89d484f9a210ff24ea48eaf48d74..724395143f268c3727d716d9dd02cba87ca2c859 100644 (file)
@@ -97,6 +97,9 @@ void mark_rodata_ro(void);
 
 #ifdef CONFIG_PA8X00
 /* Only pa8800, pa8900 needs this */
+
+#include <asm/kmap_types.h>
+
 #define ARCH_HAS_KMAP
 
 void kunmap_parisc(void *addr);
index 7fa675799e6d2cad21304698f7dec833a12224e7..9c802eb4be8491e95f932c9efb487f4b751ac5d4 100644 (file)
@@ -168,6 +168,16 @@ typedef struct elf64_fdesc {
        __u64   gp;
 } Elf64_Fdesc;
 
+#ifdef __KERNEL__
+
+#ifdef CONFIG_64BIT
+#define Elf_Fdesc      Elf64_Fdesc
+#else
+#define Elf_Fdesc      Elf32_Fdesc
+#endif /*CONFIG_64BIT*/
+
+#endif /*__KERNEL__*/
+
 /* Legal values for p_type field of Elf32_Phdr/Elf64_Phdr.  */
 
 #define PT_HP_TLS              (PT_LOOS + 0x0)
diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h
new file mode 100644 (file)
index 0000000..2fa05dd
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef _ASM_PARISC_FTRACE_H
+#define _ASM_PARISC_FTRACE_H
+
+#ifndef __ASSEMBLY__
+extern void mcount(void);
+
+/*
+ * Stack of return addresses for functions of a thread.
+ * Used in struct thread_info
+ */
+struct ftrace_ret_stack {
+       unsigned long ret;
+       unsigned long func;
+       unsigned long long calltime;
+};
+
+/*
+ * Primary handler of a function return.
+ * It relays on ftrace_return_to_handler.
+ * Defined in entry.S
+ */
+extern void return_to_handler(void);
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_PARISC_FTRACE_H */
index c3941f09a87891e79c52f05f182ed84e0f0fc0c9..7bc5125d7d4c9f83a1f7bcd6b4bf980d4fdbaea3 100644 (file)
@@ -36,16 +36,7 @@ void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
  */
 #define STRICT_MM_TYPECHECKS
 #ifdef STRICT_MM_TYPECHECKS
-typedef struct { unsigned long pte;
-#if !defined(CONFIG_64BIT)
-                 unsigned long future_flags;
- /* XXX: it's possible to remove future_flags and change BITS_PER_PTE_ENTRY
-        to 2, but then strangely the identical 32bit kernel boots on a
-        c3000(pa20), but not any longer on a 715(pa11).
-        Still investigating... HelgeD.
-  */
-#endif
-} pte_t; /* either 32 or 64bit */
+typedef struct { unsigned long pte; } pte_t; /* either 32 or 64bit */
 
 /* NOTE: even on 64 bits, these entries are __u32 because we allocate
  * the pmd and pgd in ZONE_DMA (i.e. under 4GB) */
@@ -111,7 +102,7 @@ extern int npmem_ranges;
 #define BITS_PER_PMD_ENTRY     2
 #define BITS_PER_PGD_ENTRY     2
 #else
-#define BITS_PER_PTE_ENTRY     3
+#define BITS_PER_PTE_ENTRY     2
 #define BITS_PER_PMD_ENTRY     2
 #define BITS_PER_PGD_ENTRY     BITS_PER_PMD_ENTRY
 #endif
index 430f1aeea0b896334a2ec6f1affbc7e44694d01f..4ca510b3c6f800e7dabe3affcf02c73b7f44dede 100644 (file)
@@ -49,6 +49,8 @@
 #define PDC_MODEL_CPU_ID       6       /* returns cpu-id (only newer machines!) */
 #define PDC_MODEL_CAPABILITIES 7       /* returns OS32/OS64-flags      */
 /* Values for PDC_MODEL_CAPABILITIES non-equivalent virtual aliasing support */
+#define  PDC_MODEL_OS64                        (1 << 0)
+#define  PDC_MODEL_OS32                        (1 << 1)
 #define  PDC_MODEL_IOPDIR_FDC          (1 << 2)
 #define  PDC_MODEL_NVA_MASK            (3 << 4)
 #define  PDC_MODEL_NVA_SUPPORTED       (0 << 4)
 
 #ifdef __KERNEL__
 
+#include <asm/page.h> /* for __PAGE_OFFSET */
+
 extern int pdc_type;
 
 /* Values for pdc_type */
index 470a4b88124da2fcfb8c9936f523abd2662079a2..a27d2e200fb2a62519ba1dc281ebf0d3abe8f8be 100644 (file)
        printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, (unsigned long)pgd_val(e))
 
 /* This is the size of the initially mapped kernel memory */
-#ifdef CONFIG_64BIT
 #define KERNEL_INITIAL_ORDER   24      /* 0 to 1<<24 = 16MB */
-#else
-#define KERNEL_INITIAL_ORDER   23      /* 0 to 1<<23 = 8MB */
-#endif
 #define KERNEL_INITIAL_SIZE    (1 << KERNEL_INITIAL_ORDER)
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_PARISC_PAGE_SIZE_4KB)
 
 /* Definitions for 1st level */
 #define PGDIR_SHIFT    (PMD_SHIFT + BITS_PER_PMD)
+#if (PGDIR_SHIFT + PAGE_SHIFT + PGD_ORDER - BITS_PER_PGD_ENTRY) > BITS_PER_LONG
+#define BITS_PER_PGD   (BITS_PER_LONG - PGDIR_SHIFT)
+#else
 #define BITS_PER_PGD   (PAGE_SHIFT + PGD_ORDER - BITS_PER_PGD_ENTRY)
+#endif
 #define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD    (1UL << BITS_PER_PGD)
 #define USER_PTRS_PER_PGD       PTRS_PER_PGD
 
+#ifdef CONFIG_64BIT
 #define MAX_ADDRBITS   (PGDIR_SHIFT + BITS_PER_PGD)
 #define MAX_ADDRESS    (1UL << MAX_ADDRBITS)
-
 #define SPACEID_SHIFT  (MAX_ADDRBITS - 32)
+#else
+#define MAX_ADDRBITS   (BITS_PER_LONG)
+#define MAX_ADDRESS    (1UL << MAX_ADDRBITS)
+#define SPACEID_SHIFT  0
+#endif
 
 /* This calculates the number of initial pages we need for the initial
  * page tables */
index 6ef4b7867b1b356a933ebf1744e65739adc87838..21eb45a526299d15884aa7ca11ba545f82a8111f 100644 (file)
@@ -29,7 +29,8 @@ extern void smp_send_reschedule(int cpu);
 extern void smp_send_all_nop(void);
 
 extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
 
 #endif /* !ASSEMBLY */
 
index 016d3fc4111c6bf3d22872dd23737cc880c15eb4..67db0722e6ca8d4e3c0e323e24ee137325e48c73 100644 (file)
@@ -11,10 +11,25 @@ obj-y               := cache.o pacache.o setup.o traps.o time.o irq.o \
                   process.o processor.o pdc_cons.o pdc_chassis.o unwind.o \
                   topology.o
 
+ifdef CONFIG_FUNCTION_TRACER
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_cache.o = -pg
+CFLAGS_REMOVE_irq.o = -pg
+CFLAGS_REMOVE_pacache.o = -pg
+CFLAGS_REMOVE_perf.o = -pg
+CFLAGS_REMOVE_traps.o = -pg
+CFLAGS_REMOVE_unaligned.o = -pg
+CFLAGS_REMOVE_unwind.o = -pg
+endif
+
 obj-$(CONFIG_SMP)      += smp.o
 obj-$(CONFIG_PA11)     += pci-dma.o
 obj-$(CONFIG_PCI)      += pci.o
 obj-$(CONFIG_MODULES)  += module.o
 obj-$(CONFIG_64BIT)    += binfmt_elf32.o sys_parisc32.o signal32.o
+obj-$(CONFIG_STACKTRACE)+= stacktrace.o
 # only supported for PCX-W/U in 64-bit mode at the moment
 obj-$(CONFIG_64BIT)    += perf.o perf_asm.o
+obj-$(CONFIG_FUNCTION_TRACER)          += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)    += ftrace.o
index 0db9fdcb7709889d7ddef590a7e99c579d97acf0..ae3e70cd1e14e4acf2da03717d77d0b9ea0d7faa 100644 (file)
        STREG           \pte,0(\ptep)
        .endm
 
+       /* bitshift difference between a PFN (based on kernel's PAGE_SIZE)
+        * to a CPU TLB 4k PFN (4k => 12 bits to shift) */
+       #define PAGE_ADD_SHIFT  (PAGE_SHIFT-12)
+
+       /* Drop prot bits and convert to page addr for iitlbt and idtlbt */
+       .macro          convert_for_tlb_insert20 pte
+       extrd,u         \pte,(63-ASM_PFN_PTE_SHIFT)+(63-58)+PAGE_ADD_SHIFT,\
+                               64-PAGE_SHIFT-PAGE_ADD_SHIFT,\pte
+       depdi           _PAGE_SIZE_ENCODING_DEFAULT,63,\
+                               (63-58)+PAGE_ADD_SHIFT,\pte
+       .endm
+
        /* Convert the pte and prot to tlb insertion values.  How
         * this happens is quite subtle, read below */
        .macro          make_insert_tlb spc,pte,prot
        depi            1,12,1,\prot
 
        /* Drop prot bits and convert to page addr for iitlbt and idtlbt */
-       extrd,u         \pte,(63-ASM_PFN_PTE_SHIFT)+(63-58),64-PAGE_SHIFT,\pte
-       depdi           _PAGE_SIZE_ENCODING_DEFAULT,63,63-58,\pte
+       convert_for_tlb_insert20 \pte
        .endm
 
        /* Identical macro to make_insert_tlb above, except it
 
        /* Get rid of prot bits and convert to page addr for iitlba */
 
-       depi            _PAGE_SIZE_ENCODING_DEFAULT,31,ASM_PFN_PTE_SHIFT,\pte
-       extru           \pte,24,25,\pte
+       depi            0,31,ASM_PFN_PTE_SHIFT,\pte
+       SHRREG          \pte,(ASM_PFN_PTE_SHIFT-(31-26)),\pte
        .endm
 
        /* This is for ILP32 PA2.0 only.  The TLB insertion needs
@@ -1244,10 +1255,9 @@ nadtlb_check_flush_20w:
        depdi,z         7,7,3,prot
        depdi           1,10,1,prot
 
-       /* Get rid of prot bits and convert to page addr for idtlbt */
+       /* Drop prot bits from pte and convert to page addr for idtlbt */
+       convert_for_tlb_insert20 pte
 
-       depdi           0,63,12,pte
-       extrd,u         pte,56,52,pte
        idtlbt          pte,prot
 
        rfir
@@ -1337,8 +1347,8 @@ nadtlb_check_flush_11:
 
        /* Get rid of prot bits and convert to page addr for idtlba */
 
-       depi            0,31,12,pte
-       extru           pte,24,25,pte
+       depi            0,31,ASM_PFN_PTE_SHIFT,pte
+       SHRREG          pte,(ASM_PFN_PTE_SHIFT-(31-26)),pte
 
        mfsp            %sr1,t0  /* Save sr1 so we can use it in tlb inserts */
        mtsp            spc,%sr1
@@ -1403,10 +1413,9 @@ nadtlb_check_flush_20:
        depdi,z         7,7,3,prot
        depdi           1,10,1,prot
 
-       /* Get rid of prot bits and convert to page addr for idtlbt */
+       /* Drop prot bits from pte and convert to page addr for idtlbt */
+       convert_for_tlb_insert20 pte
 
-       depdi           0,63,12,pte
-       extrd,u         pte,56,32,pte
        idtlbt          pte,prot
 
        rfir
@@ -2176,6 +2185,33 @@ syscall_do_resched:
 ENDPROC(syscall_exit)
 
 
+#ifdef CONFIG_FUNCTION_TRACER
+       .import ftrace_function_trampoline,code
+ENTRY(_mcount)
+       copy    %r3, %arg2
+       b       ftrace_function_trampoline
+       nop
+ENDPROC(_mcount)
+
+ENTRY(return_to_handler)
+       load32  return_trampoline, %rp
+       copy    %ret0, %arg0
+       copy    %ret1, %arg1
+       b       ftrace_return_to_handler
+       nop
+return_trampoline:
+       copy    %ret0, %rp
+       copy    %r23, %ret0
+       copy    %r24, %ret1
+
+.globl ftrace_stub
+ftrace_stub:
+       bv      %r0(%rp)
+       nop
+ENDPROC(return_to_handler)
+#endif /* CONFIG_FUNCTION_TRACER */
+
+
 get_register:
        /*
         * get_register is used by the non access tlb miss handlers to
index f6d241238a78b39b6a39467726d51e939fd3916d..4c247e02d9b1b0e0655e47ebc15470801b4d9881 100644 (file)
@@ -527,7 +527,11 @@ int pdc_model_capabilities(unsigned long *capabilities)
         pdc_result[0] = 0; /* preset zero (call may not be implemented!) */
         retval = mem_pdc_call(PDC_MODEL, PDC_MODEL_CAPABILITIES, __pa(pdc_result), 0);
         convert_to_wide(pdc_result);
-        *capabilities = pdc_result[0];
+        if (retval == PDC_OK) {
+                *capabilities = pdc_result[0];
+        } else {
+                *capabilities = PDC_MODEL_OS32;
+        }
         spin_unlock_irqrestore(&pdc_lock, flags);
 
         return retval;
diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
new file mode 100644 (file)
index 0000000..9877372
--- /dev/null
@@ -0,0 +1,185 @@
+/*
+ * Code for tracing calls in Linux kernel.
+ * Copyright (C) 2009 Helge Deller <deller@gmx.de>
+ *
+ * based on code for x86 which is:
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * future possible enhancements:
+ *     - add CONFIG_DYNAMIC_FTRACE
+ *     - add CONFIG_STACK_TRACER
+ */
+
+#include <linux/init.h>
+#include <linux/ftrace.h>
+
+#include <asm/sections.h>
+#include <asm/ftrace.h>
+
+
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/* Add a function return address to the trace stack on thread info.*/
+static int push_return_trace(unsigned long ret, unsigned long long time,
+                               unsigned long func, int *depth)
+{
+       int index;
+
+       if (!current->ret_stack)
+               return -EBUSY;
+
+       /* The return trace stack is full */
+       if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+               atomic_inc(&current->trace_overrun);
+               return -EBUSY;
+       }
+
+       index = ++current->curr_ret_stack;
+       barrier();
+       current->ret_stack[index].ret = ret;
+       current->ret_stack[index].func = func;
+       current->ret_stack[index].calltime = time;
+       *depth = index;
+
+       return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+       int index;
+
+       index = current->curr_ret_stack;
+
+       if (unlikely(index < 0)) {
+               ftrace_graph_stop();
+               WARN_ON(1);
+               /* Might as well panic, otherwise we have no where to go */
+               *ret = (unsigned long)
+                       dereference_function_descriptor(&panic);
+               return;
+       }
+
+       *ret = current->ret_stack[index].ret;
+       trace->func = current->ret_stack[index].func;
+       trace->calltime = current->ret_stack[index].calltime;
+       trace->overrun = atomic_read(&current->trace_overrun);
+       trace->depth = index;
+       barrier();
+       current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(unsigned long retval0,
+                                      unsigned long retval1)
+{
+       struct ftrace_graph_ret trace;
+       unsigned long ret;
+
+       pop_return_trace(&trace, &ret);
+       trace.rettime = cpu_clock(raw_smp_processor_id());
+       ftrace_graph_return(&trace);
+
+       if (unlikely(!ret)) {
+               ftrace_graph_stop();
+               WARN_ON(1);
+               /* Might as well panic. What else to do? */
+               ret = (unsigned long)
+                       dereference_function_descriptor(&panic);
+       }
+
+       /* HACK: we hand over the old functions' return values
+          in %r23 and %r24. Assembly in entry.S will take care
+          and move those to their final registers %ret0 and %ret1 */
+       asm( "copy %0, %%r23 \n\t"
+            "copy %1, %%r24 \n" : : "r" (retval0), "r" (retval1) );
+
+       return ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+       unsigned long old;
+       unsigned long long calltime;
+       struct ftrace_graph_ent trace;
+
+       if (unlikely(atomic_read(&current->tracing_graph_pause)))
+               return;
+
+       old = *parent;
+       *parent = (unsigned long)
+                 dereference_function_descriptor(&return_to_handler);
+
+       if (unlikely(!__kernel_text_address(old))) {
+               ftrace_graph_stop();
+               *parent = old;
+               WARN_ON(1);
+               return;
+       }
+
+       calltime = cpu_clock(raw_smp_processor_id());
+
+       if (push_return_trace(old, calltime,
+                               self_addr, &trace.depth) == -EBUSY) {
+               *parent = old;
+               return;
+       }
+
+       trace.func = self_addr;
+
+       /* Only trace if the calling function expects to */
+       if (!ftrace_graph_entry(&trace)) {
+               current->curr_ret_stack--;
+               *parent = old;
+       }
+}
+
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+
+void ftrace_function_trampoline(unsigned long parent,
+                               unsigned long self_addr,
+                               unsigned long org_sp_gr3)
+{
+       extern ftrace_func_t ftrace_trace_function;
+
+       if (function_trace_stop)
+               return;
+
+       if (ftrace_trace_function != ftrace_stub) {
+               ftrace_trace_function(parent, self_addr);
+               return;
+       }
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       if (ftrace_graph_entry && ftrace_graph_return) {
+               unsigned long sp;
+               unsigned long *parent_rp;
+
+                asm volatile ("copy %%r30, %0" : "=r"(sp));
+               /* sanity check: is stack pointer which we got from
+                  assembler function in entry.S in a reasonable
+                  range compared to current stack pointer? */
+               if ((sp - org_sp_gr3) > 0x400)
+                       return;
+
+               /* calculate pointer to %rp in stack */
+               parent_rp = (unsigned long *) org_sp_gr3 - 0x10;
+               /* sanity check: parent_rp should hold parent */
+               if (*parent_rp != parent)
+                       return;
+               
+               prepare_ftrace_return(parent_rp, self_addr);
+               return;
+       }
+#endif
+}
+
index 1c740f5cbd6347f0046dc1560fb52dc806fd3103..4ea4229d765ccc0657148d4f2268931ea3d6b8f9 100644 (file)
@@ -311,12 +311,12 @@ unsigned long txn_alloc_addr(unsigned int virt_irq)
        next_cpu++; /* assign to "next" CPU we want this bugger on */
 
        /* validate entry */
-       while ((next_cpu < NR_CPUS) &&
+       while ((next_cpu < nr_cpu_ids) &&
                (!per_cpu(cpu_data, next_cpu).txn_addr ||
                 !cpu_online(next_cpu)))
                next_cpu++;
 
-       if (next_cpu >= NR_CPUS)
+       if (next_cpu >= nr_cpu_ids)
                next_cpu = 0;   /* nothing else, assign monarch */
 
        return txn_affinity_addr(virt_irq, next_cpu);
index 9013243ceccadec711c8f6dd42b2c50e3e0115e0..ecd1c50244470db01620f67615e5562b87d2bcec 100644 (file)
@@ -61,9 +61,7 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/bug.h>
-#include <linux/uaccess.h>
 
-#include <asm/sections.h>
 #include <asm/unwind.h>
 
 #if 0
@@ -115,8 +113,6 @@ struct got_entry {
        Elf32_Addr addr;
 };
 
-#define Elf_Fdesc      Elf32_Fdesc
-
 struct stub_entry {
        Elf32_Word insns[2]; /* each stub entry has two insns */
 };
@@ -125,8 +121,6 @@ struct got_entry {
        Elf64_Addr addr;
 };
 
-#define Elf_Fdesc      Elf64_Fdesc
-
 struct stub_entry {
        Elf64_Word insns[4]; /* each stub entry has four insns */
 };
@@ -916,15 +910,3 @@ void module_arch_cleanup(struct module *mod)
        deregister_unwind_table(mod);
        module_bug_cleanup(mod);
 }
-
-#ifdef CONFIG_64BIT
-void *dereference_function_descriptor(void *ptr)
-{
-       Elf64_Fdesc *desc = ptr;
-       void *p;
-
-       if (!probe_kernel_address(&desc->addr, p))
-               ptr = p;
-       return ptr;
-}
-#endif
index 0eecfbbc59cdcc99c2206cb299f32c986ff84b35..df653663d3dbdd7efdb858e48225f71d4eeebd0b 100644 (file)
@@ -153,5 +153,10 @@ EXPORT_SYMBOL(node_data);
 EXPORT_SYMBOL(pfnnid_map);
 #endif
 
+#ifdef CONFIG_FUNCTION_TRACER
+extern void _mcount(void);
+EXPORT_SYMBOL(_mcount);
+#endif
+
 /* from pacache.S -- needed for copy_page */
 EXPORT_SYMBOL(copy_user_page_asm);
index 8aa591ed9127d5e1f8aa9982ce3827c6b3bc2982..6f69101f90bb24df54b99259a5419f84a76dbb35 100644 (file)
 #include <linux/stddef.h>
 #include <linux/unistd.h>
 #include <linux/kallsyms.h>
+#include <linux/uaccess.h>
 
 #include <asm/io.h>
 #include <asm/asm-offsets.h>
 #include <asm/pdc.h>
 #include <asm/pdc_chassis.h>
 #include <asm/pgalloc.h>
-#include <asm/uaccess.h>
 #include <asm/unwind.h>
+#include <asm/sections.h>
 
 /*
  * The idle thread. There's no useful work to be
@@ -231,8 +232,8 @@ sys_clone(unsigned long clone_flags, unsigned long usp,
           
           However, these last 3 args are only examined
           if the proper flags are set. */
-       int __user *child_tidptr;
-       int __user *parent_tidptr;
+       int __user *parent_tidptr = (int __user *)regs->gr[24];
+       int __user *child_tidptr  = (int __user *)regs->gr[22];
 
        /* usp must be word aligned.  This also prevents users from
         * passing in the value 1 (which is the signal for a special
@@ -243,16 +244,6 @@ sys_clone(unsigned long clone_flags, unsigned long usp,
        if (usp == 0)
          usp = regs->gr[30];
 
-       if (clone_flags & CLONE_PARENT_SETTID)
-         parent_tidptr = (int __user *)regs->gr[24];
-       else
-         parent_tidptr = NULL;
-       
-       if (clone_flags & (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID))
-         child_tidptr = (int __user *)regs->gr[22];
-       else
-         child_tidptr = NULL;
-
        return do_fork(clone_flags, usp, regs, 0, parent_tidptr, child_tidptr);
 }
 
@@ -400,3 +391,15 @@ get_wchan(struct task_struct *p)
        } while (count++ < 16);
        return 0;
 }
+
+#ifdef CONFIG_64BIT
+void *dereference_function_descriptor(void *ptr)
+{
+       Elf64_Fdesc *desc = ptr;
+       void *p;
+
+       if (!probe_kernel_address(&desc->addr, p))
+               ptr = p;
+       return ptr;
+}
+#endif
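A hedged usage note: on 64-bit parisc the generic printk "%pF" pointer extension resolves symbols through this hook, so a function-descriptor pointer prints as the code address it describes ("my_handler" below is illustrative):

	printk(KERN_INFO "registered handler: %pF\n", my_handler);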
index ecb609342feb73164114b4089aafafe3aa2d6f9d..e09d0f7fb6b047ad2c2cf34d95046f89f2263874 100644 (file)
@@ -100,8 +100,8 @@ static int __cpuinit processor_probe(struct parisc_device *dev)
        struct cpuinfo_parisc *p;
 
 #ifdef CONFIG_SMP
-       if (num_online_cpus() >= NR_CPUS) {
-               printk(KERN_INFO "num_online_cpus() >= NR_CPUS\n");
+       if (num_online_cpus() >= nr_cpu_ids) {
+               printk(KERN_INFO "num_online_cpus() >= nr_cpu_ids\n");
                return 1;
        }
 #else
@@ -214,7 +214,7 @@ static int __cpuinit processor_probe(struct parisc_device *dev)
         */
 #ifdef CONFIG_SMP
        if (cpuid) {
-               cpu_set(cpuid, cpu_present_map);
+               set_cpu_present(cpuid, true);
                cpu_up(cpuid);
        }
 #endif
@@ -364,6 +364,13 @@ show_cpuinfo (struct seq_file *m, void *v)
                                 boot_cpu_data.cpu_hz / 1000000,
                                 boot_cpu_data.cpu_hz % 1000000  );
 
+               seq_printf(m, "capabilities\t:");
+               if (boot_cpu_data.pdc.capabilities & PDC_MODEL_OS32)
+                       seq_printf(m, " os32");
+               if (boot_cpu_data.pdc.capabilities & PDC_MODEL_OS64)
+                       seq_printf(m, " os64");
+               seq_printf(m, "\n");
+
                seq_printf(m, "model\t\t: %s\n"
                                "model name\t: %s\n",
                                 boot_cpu_data.pdc.sys_model_name,
index 9995d7ed58198c42de88223b51ff0d3ac4f75b3b..1fd0f0cec037f3fa19ce2630ee7311dc7cc34dbc 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/err.h>
 #include <linux/delay.h>
 #include <linux/bitops.h>
+#include <linux/ftrace.h>
 
 #include <asm/system.h>
 #include <asm/atomic.h>
@@ -113,14 +114,14 @@ halt_processor(void)
 {
        /* REVISIT : redirect I/O Interrupts to another CPU? */
        /* REVISIT : does PM *know* this CPU isn't available? */
-       cpu_clear(smp_processor_id(), cpu_online_map);
+       set_cpu_online(smp_processor_id(), false);
        local_irq_disable();
        for (;;)
                ;
 }
 
 
-irqreturn_t
+irqreturn_t __irq_entry
 ipi_interrupt(int irq, void *dev_id) 
 {
        int this_cpu = smp_processor_id();
@@ -214,11 +215,11 @@ ipi_send(int cpu, enum ipi_message_type op)
 }
 
 static void
-send_IPI_mask(cpumask_t mask, enum ipi_message_type op)
+send_IPI_mask(const struct cpumask *mask, enum ipi_message_type op)
 {
        int cpu;
 
-       for_each_cpu_mask(cpu, mask)
+       for_each_cpu(cpu, mask)
                ipi_send(cpu, op);
 }
 
@@ -257,7 +258,7 @@ smp_send_all_nop(void)
        send_IPI_allbutself(IPI_NOP);
 }
 
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 {
        send_IPI_mask(mask, IPI_CALL_FUNC);
 }
@@ -296,13 +297,14 @@ smp_cpu_init(int cpunum)
        mb();
 
        /* Well, support 2.4 linux scheme as well. */
-       if (cpu_test_and_set(cpunum, cpu_online_map))
+       if (cpu_isset(cpunum, cpu_online_map))
        {
                extern void machine_halt(void); /* arch/parisc.../process.c */
 
                printk(KERN_CRIT "CPU#%d already initialized!\n", cpunum);
                machine_halt();
        }  
+       set_cpu_online(cpunum, true);
 
        /* Initialise the idle task for this CPU */
        atomic_inc(&init_mm.mm_count);
@@ -424,8 +426,8 @@ void __init smp_prepare_boot_cpu(void)
        /* Setup BSP mappings */
        printk(KERN_INFO "SMP: bootstrap CPU ID is %d\n", bootstrap_processor);
 
-       cpu_set(bootstrap_processor, cpu_online_map);
-       cpu_set(bootstrap_processor, cpu_present_map);
+       set_cpu_online(bootstrap_processor, true);
+       set_cpu_present(bootstrap_processor, true);
 }
 
 
@@ -436,8 +438,7 @@ void __init smp_prepare_boot_cpu(void)
 */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
-       cpus_clear(cpu_present_map);
-       cpu_set(0, cpu_present_map);
+       init_cpu_present(cpumask_of(0));
 
        parisc_max_cpus = max_cpus;
        if (!max_cpus)
diff --git a/arch/parisc/kernel/stacktrace.c b/arch/parisc/kernel/stacktrace.c
new file mode 100644 (file)
index 0000000..2fe914c
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+ * Stack trace management functions
+ *
+ *  Copyright (C) 2009 Helge Deller <deller@gmx.de>
+ *  based on arch/x86/kernel/stacktrace.c by Ingo Molnar <mingo@redhat.com>
+ *  and parisc unwind functions by Randolph Chung <tausq@debian.org>
+ *
+ *  TODO: Userspace stacktrace (CONFIG_USER_STACKTRACE_SUPPORT)
+ */
+#include <linux/module.h>
+#include <linux/stacktrace.h>
+
+#include <asm/unwind.h>
+
+static void dump_trace(struct task_struct *task, struct stack_trace *trace)
+{
+       struct unwind_frame_info info;
+
+       /* initialize unwind info */
+       if (task == current) {
+               unsigned long sp;
+               struct pt_regs r;
+HERE:
+               asm volatile ("copy %%r30, %0" : "=r"(sp));
+               memset(&r, 0, sizeof(struct pt_regs));
+               r.iaoq[0] = (unsigned long)&&HERE;
+               r.gr[2] = (unsigned long)__builtin_return_address(0);
+               r.gr[30] = sp;
+               unwind_frame_init(&info, task, &r);
+       } else {
+               unwind_frame_init_from_blocked_task(&info, task);
+       }
+
+       /* unwind stack and save entries in stack_trace struct */
+       trace->nr_entries = 0;
+       while (trace->nr_entries < trace->max_entries) {
+               if (unwind_once(&info) < 0 || info.ip == 0)
+                       break;
+
+               if (__kernel_text_address(info.ip))
+                       trace->entries[trace->nr_entries++] = info.ip;
+       }
+}
+
+
+/*
+ * Save stack-backtrace addresses into a stack_trace buffer.
+ */
+void save_stack_trace(struct stack_trace *trace)
+{
+       dump_trace(current, trace);
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+EXPORT_SYMBOL_GPL(save_stack_trace);
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+       dump_trace(tsk, trace);
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
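
   [Editor's note] The new stacktrace.c wires parisc into the generic stack_trace API via the unwinder. A hedged usage sketch of a caller (names hypothetical); the API only needs a pre-sized entries[] array:

	#include <linux/stacktrace.h>

	#define EXAMPLE_DEPTH 16

	static void example_dump_current_stack(void)
	{
		unsigned long entries[EXAMPLE_DEPTH];
		struct stack_trace trace = {
			.nr_entries	= 0,
			.max_entries	= EXAMPLE_DEPTH,
			.entries	= entries,
			.skip		= 0,
		};

		save_stack_trace(&trace);
		print_stack_trace(&trace, 0);	/* indent by 0 spaces */
	}
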
index 69b6eebc466ea4860261d6ff9c503bae308a748c..59fc1a43ec3ef094ae514694e549ab6b518aa05a 100644 (file)
@@ -365,17 +365,51 @@ tracesys_sigexit:
 
 
        /*********************************************************
-               Light-weight-syscall code
+               32/64-bit Light-Weight-Syscall ABI
 
-               r20 - lws number
-               r26,r25,r24,r23,r22 - Input registers
-               r28 - Function return register
-               r21 - Error code.
+               * - Indicates a hint for userspace inline asm
+               implementations.
 
-               Scracth: Any of the above that aren't being
-               currently used, including r1. 
+               Syscall number (caller-saves)
+               - %r20
+               * In asm clobber.
 
-               Return pointer: r31 (Not usable)
+               Argument registers (caller-saves)
+               - %r26, %r25, %r24, %r23, %r22
+               * In asm input.
+
+               Return registers (caller-saves)
+               - %r28 (return), %r21 (errno)
+               * In asm output.
+
+               Caller-saves registers
+               - %r1, %r27, %r29
+               - %r2 (return pointer)
+               - %r31 (ble link register)
+               * In asm clobber.
+
+               Callee-saves registers
+               - %r3-%r18
+               - %r30 (stack pointer)
+               * Not in asm clobber.
+
+               If userspace is 32-bit:
+               Callee-saves registers
+               - %r19 (32-bit PIC register)
+
+               Differences from 32-bit calling convention:
+               - Syscall number in %r20
+               - Additional argument register %r22 (arg4)
+               - Callee-saves %r19.
+
+               If userspace is 64-bit:
+               Callee-saves registers
+               - %r27 (64-bit PIC register)
+
+               Differences from 64-bit calling convention:
+               - Syscall number in %r20
+               - Additional argument register %r22 (arg4)
+               - Callee-saves %r27.
 
                Error codes returned by entry path:
 
@@ -473,7 +507,8 @@ lws_compare_and_swap64:
        b,n     lws_compare_and_swap
 #else
        /* If we are not a 64-bit kernel, then we don't
-        * implement having 64-bit input registers
+        * have 64-bit input registers, and calling
+        * the 64-bit LWS CAS returns ENOSYS.
         */
        b,n     lws_exit_nosys
 #endif
@@ -635,12 +670,15 @@ END(sys_call_table64)
        /*
                All light-weight-syscall atomic operations 
                will use this set of locks 
+
+               NOTE: The lws_lock_start symbol must be
+               at least 16-byte aligned for safe use
+               with ldcw.
        */
        .section .data
        .align  PAGE_SIZE
 ENTRY(lws_lock_start)
        /* lws locks */
-       .align 16
        .rept 16
        /* Keep locks aligned at 16-bytes */
        .word 1
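
   [Editor's note] The NOTE added above records that lws_lock_start must stay 16-byte aligned for ldcw. An illustrative sketch of why (standard PA-RISC ldcw usage, not code from this merge; all names hypothetical): the base ldcw instruction atomically loads a word and writes zero back, but only works on 16-byte-aligned words, so each lock occupies a 16-byte slot and "nonzero" means free:

	typedef struct {
		volatile unsigned int lock[4];	/* one 16-byte slot; ldcw hits lock[0] */
	} __attribute__((aligned(16))) example_ldcw_lock_t;

	static inline unsigned int example_ldcw(volatile unsigned int *a)
	{
		unsigned int ret;

		/* atomically load the word at *a and clear it to zero */
		__asm__ __volatile__("ldcw 0(%1),%0" : "=r" (ret) : "r" (a) : "memory");
		return ret;
	}

	static inline void example_lock(example_ldcw_lock_t *l)
	{
		/* ldcw returning zero means another CPU already holds the lock */
		while (example_ldcw(&l->lock[0]) == 0)
			while (l->lock[0] == 0)
				/* spin until the word goes nonzero again */;
	}

	static inline void example_unlock(example_ldcw_lock_t *l)
	{
		__asm__ __volatile__("" : : : "memory");	/* simplified release barrier */
		l->lock[0] = 1;
	}
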
index e75cae6072c574592d9146b8fe30354ebb2cb4a4..d4dd05674c6234b495acdb25431bb5fd9a437261 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/profile.h>
 #include <linux/clocksource.h>
 #include <linux/platform_device.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -53,7 +54,7 @@ static unsigned long clocktick __read_mostly; /* timer cycles per tick */
  * held off for an arbitrarily long period of time by interrupts being
  * disabled, so we may miss one or more ticks.
  */
-irqreturn_t timer_interrupt(int irq, void *dev_id)
+irqreturn_t __irq_entry timer_interrupt(int irq, void *dev_id)
 {
        unsigned long now;
        unsigned long next_tick;
@@ -216,14 +217,14 @@ void __init start_cpu_itimer(void)
        per_cpu(cpu_data, cpu).it_value = next_tick;
 }
 
-static struct platform_device rtc_parisc_dev = {
-       .name = "rtc-parisc",
+static struct platform_device rtc_generic_dev = {
+       .name = "rtc-generic",
        .id = -1,
 };
 
 static int __init rtc_init(void)
 {
-       if (platform_device_register(&rtc_parisc_dev) < 0)
+       if (platform_device_register(&rtc_generic_dev) < 0)
                printk(KERN_ERR "unable to register rtc device...\n");
 
        /* not necessarily an error */
index ba658d2086f77decbc35404da73287338e27618a..c32f5d6d778ec7bd10c028cf40db645af659686a 100644 (file)
@@ -247,6 +247,8 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err)
 
        oops_in_progress = 1;
 
+       oops_enter();
+
        /* Amuse the user in a SPARC fashion */
        if (err) printk(
 KERN_CRIT "      _______________________________ \n"
@@ -293,6 +295,7 @@ KERN_CRIT "                     ||     ||\n");
                panic("Fatal exception");
        }
 
+       oops_exit();
        do_exit(SIGSEGV);
 }
 
@@ -494,7 +497,7 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o
        panic(msg);
 }
 
-void handle_interruption(int code, struct pt_regs *regs)
+void notrace handle_interruption(int code, struct pt_regs *regs)
 {
        unsigned long fault_address = 0;
        unsigned long fault_space = 0;
index 1a3b6ccd362064e80b65412a62eee7e7b0f2182d..fd2cc4fd2b65adc5b84dd46639d4eaea1ea58d77 100644 (file)
@@ -54,6 +54,8 @@ SECTIONS
                TEXT_TEXT
                SCHED_TEXT
                LOCK_TEXT
+               KPROBES_TEXT
+               IRQENTRY_TEXT
                *(.text.do_softirq)
                *(.text.sys_exit)
                *(.text.do_sigaltstack)
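
   [Editor's note] The __irq_entry annotations added to ipi_interrupt() and timer_interrupt() above pair with this linker-script hunk: __irq_entry places a function in .irqentry.text, which the new IRQENTRY_TEXT macro collects, so the ftrace machinery can recognise hard-irq entry points. A minimal sketch (hypothetical handler, not from this merge):

	#include <linux/ftrace.h>
	#include <linux/interrupt.h>

	/* The __irq_entry annotation is the point; the handler body is a stub. */
	static irqreturn_t __irq_entry example_irq_handler(int irq, void *dev_id)
	{
		/* ... acknowledge the device and do the real work here ... */
		return IRQ_HANDLED;
	}
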
index 9d704d9831d1612a84eb3a82c6e94bf17041ad7d..4356ceb1e366d9ed010706ac21dceb8ac24ba151 100644 (file)
@@ -456,6 +456,13 @@ void __init mem_init(void)
 {
        int codesize, reservedpages, datasize, initsize;
 
+       /* Do sanity checks on page table constants */
+       BUILD_BUG_ON(PTE_ENTRY_SIZE != sizeof(pte_t));
+       BUILD_BUG_ON(PMD_ENTRY_SIZE != sizeof(pmd_t));
+       BUILD_BUG_ON(PGD_ENTRY_SIZE != sizeof(pgd_t));
+       BUILD_BUG_ON(PAGE_SHIFT + BITS_PER_PTE + BITS_PER_PMD + BITS_PER_PGD
+                       > BITS_PER_LONG);
+
        high_memory = __va((max_pfn << PAGE_SHIFT));
 
 #ifndef CONFIG_DISCONTIGMEM
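
   [Editor's note] The hunk above adds compile-time sanity checks on the page-table constants. A minimal illustration of the BUILD_BUG_ON() pattern (hypothetical struct, not from this merge): the condition must be a compile-time constant, and a true condition aborts the build rather than the boot:

	#include <linux/kernel.h>

	struct example_entry { unsigned long val; };

	static void example_layout_checks(void)
	{
		/* true conditions here become build errors, not runtime checks */
		BUILD_BUG_ON(sizeof(struct example_entry) != sizeof(unsigned long));
		BUILD_BUG_ON(sizeof(unsigned long) < sizeof(void *));
	}
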
index 67f1812698d2703e02cad2c561fd1d0e6c6d84b2..cdb6fd814de8880541d2c4130b33ea7e7b9f6234 100644 (file)
@@ -50,6 +50,9 @@ enum ps3_param_av_multi_out {
 
 enum ps3_param_av_multi_out ps3_os_area_get_av_multi_out(void);
 
+extern u64 ps3_os_area_get_rtc_diff(void);
+extern void ps3_os_area_set_rtc_diff(u64 rtc_diff);
+
 /* dma routines */
 
 enum ps3_dma_page_size {
index c9564031a2a9c7878f9e9a694fe8e9ca3622a85e..926ea864e34f576b5b5b36518398fb83a83784f4 100644 (file)
@@ -1127,3 +1127,19 @@ void div128_by_32(u64 dividend_high, u64 dividend_low,
        dr->result_low  = ((u64)y << 32) + z;
 
 }
+
+static int __init rtc_init(void)
+{
+       struct platform_device *pdev;
+
+       if (!ppc_md.get_rtc_time)
+               return -ENODEV;
+
+       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
+
+       return 0;
+}
+
+module_init(rtc_init);
index e1c83c23b435a2f8032093def6afa1a844d5d1ab..86e392b1b049ab6855ac8b8f077a26b84a77132b 100644 (file)
@@ -808,6 +808,7 @@ u64 ps3_os_area_get_rtc_diff(void)
 {
        return saved_params.rtc_diff;
 }
+EXPORT_SYMBOL(ps3_os_area_get_rtc_diff);
 
 /**
  * ps3_os_area_set_rtc_diff - Set the rtc diff value.
@@ -823,6 +824,7 @@ void ps3_os_area_set_rtc_diff(u64 rtc_diff)
                os_area_queue_work();
        }
 }
+EXPORT_SYMBOL(ps3_os_area_set_rtc_diff);
 
 /**
  * ps3_os_area_get_av_multi_out - Returns the default video mode.
index 235c13ebacd9a459b2a584de9d536828e945c690..136aa0637d9c0bbc67e10f13b98d96f5540aa288 100644 (file)
@@ -64,8 +64,6 @@ int ps3_set_rtc_time(struct rtc_time *time);
 
 void __init ps3_os_area_save_params(void);
 void __init ps3_os_area_init(void);
-u64 ps3_os_area_get_rtc_diff(void);
-void ps3_os_area_set_rtc_diff(u64 rtc_diff);
 
 /* spu */
 
index 3331ccbb8d389c6d76dd4c548beeb3c30d6404b8..66181821322acc2dba892e26ef3a52182a3ca894 100644 (file)
@@ -270,8 +270,6 @@ define_machine(ps3) {
        .init_IRQ                       = ps3_init_IRQ,
        .panic                          = ps3_panic,
        .get_boot_time                  = ps3_get_boot_time,
-       .set_rtc_time                   = ps3_set_rtc_time,
-       .get_rtc_time                   = ps3_get_rtc_time,
        .set_dabr                       = ps3_set_dabr,
        .calibrate_decr                 = ps3_calibrate_decr,
        .progress                       = ps3_progress,
index d0daf7d6d3b26407d5ec87c0d5caef127943c4fe..b178a1e66c915e8ab6e85b0eec51830b6a05adf1 100644 (file)
@@ -19,6 +19,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/platform_device.h>
 
 #include <asm/rtc.h>
 #include <asm/lv1call.h>
@@ -74,23 +75,20 @@ static u64 read_rtc(void)
        return rtc_val;
 }
 
-int ps3_set_rtc_time(struct rtc_time *tm)
+unsigned long __init ps3_get_boot_time(void)
 {
-       u64 now = mktime(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
-               tm->tm_hour, tm->tm_min, tm->tm_sec);
-
-       ps3_os_area_set_rtc_diff(now - read_rtc());
-       return 0;
+       return read_rtc() + ps3_os_area_get_rtc_diff();
 }
 
-void ps3_get_rtc_time(struct rtc_time *tm)
+static int __init ps3_rtc_init(void)
 {
-       to_tm(read_rtc() + ps3_os_area_get_rtc_diff(), tm);
-       tm->tm_year -= 1900;
-       tm->tm_mon -= 1;
-}
+       struct platform_device *pdev;
 
-unsigned long __init ps3_get_boot_time(void)
-{
-       return read_rtc() + ps3_os_area_get_rtc_diff();
+       pdev = platform_device_register_simple("rtc-ps3", -1, NULL, 0);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
+
+       return 0;
 }
+
+module_init(ps3_rtc_init);
index d42f826a8ab9f009fe5f06d5133e0e8d578799f1..f934225fd8ef9c702c869c274df0c271c9a6a20b 100644 (file)
@@ -22,6 +22,7 @@
 #include "linux/kernel.h"
 #include "linux/module.h"
 #include "linux/blkdev.h"
+#include "linux/ata.h"
 #include "linux/hdreg.h"
 #include "linux/init.h"
 #include "linux/cdrom.h"
@@ -1308,16 +1309,15 @@ static int ubd_ioctl(struct block_device *bdev, fmode_t mode,
                     unsigned int cmd, unsigned long arg)
 {
        struct ubd *ubd_dev = bdev->bd_disk->private_data;
-       struct hd_driveid ubd_id = {
-               .cyls           = 0,
-               .heads          = 128,
-               .sectors        = 32,
-       };
+       u16 ubd_id[ATA_ID_WORDS];
 
        switch (cmd) {
                struct cdrom_volctrl volume;
        case HDIO_GET_IDENTITY:
-               ubd_id.cyls = ubd_dev->size / (128 * 32 * 512);
+               memset(&ubd_id, 0, ATA_ID_WORDS * 2);
+               ubd_id[ATA_ID_CYLS]     = ubd_dev->size / (128 * 32 * 512);
+               ubd_id[ATA_ID_HEADS]    = 128;
+               ubd_id[ATA_ID_SECTORS]  = 32;
                if(copy_to_user((char __user *) arg, (char *) &ubd_id,
                                 sizeof(ubd_id)))
                        return -EFAULT;
index 748e50a1a15257ac6226eca13869b47e32a59de2..3f27e5c0c9c962a3c7e01c5c35ac1bfc23ec84b0 100644 (file)
@@ -1837,8 +1837,8 @@ config PCI_MMCONFIG
 
 config DMAR
        bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
-       depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
-       ---help---
+       depends on PCI_MSI && ACPI && EXPERIMENTAL
+       help
          DMA remapping (DMAR) devices support enables independent address
          translations for Direct Memory Access (DMA) from devices.
          These DMA remapping devices are reported via ACPI tables
index c5962fe3796fbabe71024e3892354e8297d23a5f..a97db99dad52fedd4bbcf6fe9d46bed8784f5e1f 100644 (file)
@@ -1928,6 +1928,12 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
        return paddr;
 }
 
+static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
+                                   unsigned long cap)
+{
+       return 0;
+}
+
 static struct iommu_ops amd_iommu_ops = {
        .domain_init = amd_iommu_domain_init,
        .domain_destroy = amd_iommu_domain_destroy,
@@ -1936,5 +1942,6 @@ static struct iommu_ops amd_iommu_ops = {
        .map = amd_iommu_map_range,
        .unmap = amd_iommu_unmap_range,
        .iova_to_phys = amd_iommu_iova_to_phys,
+       .domain_has_cap = amd_iommu_domain_has_cap,
 };
 
index 5bc5d1688c1c771730ba6fa1ecdea1bafe959767..8126e8d1a2a4a789509cb49af563b6cbb76395ae 100644 (file)
@@ -40,7 +40,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 
        debug_kmap_atomic(type);
 
-       debug_kmap_atomic(type);
        idx = type + KM_TYPE_NR*smp_processor_id();
        vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
        BUG_ON(!pte_none(*(kmap_pte-idx)));
index bff0c9032f8c6f7045518d9fdd2bd417a3be7634..e331f77348a787608c0c6cfe82bd022d81eb42aa 100644 (file)
@@ -39,6 +39,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 
        pagefault_disable();
 
+       debug_kmap_atomic(type);
        idx = type + KM_TYPE_NR * smp_processor_id();
        vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
        set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
@@ -72,7 +73,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
        unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
        enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
 
-       debug_kmap_atomic(type);
        /*
         * Force other mappings to Oops if they'll try to access this pte
         * without first remap it.  Keeping stale mappings around is a bad idea
index f234a37bd428c73aa0e8623079bfbe5e06923948..f1817f71e009a274bab8b4ed4fe9f0a3baadd75e 100644 (file)
@@ -258,24 +258,7 @@ void pcibios_set_master(struct pci_dev *dev)
        pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
 }
 
-static void pci_unmap_page_range(struct vm_area_struct *vma)
-{
-       u64 addr = (u64)vma->vm_pgoff << PAGE_SHIFT;
-       free_memtype(addr, addr + vma->vm_end - vma->vm_start);
-}
-
-static void pci_track_mmap_page_range(struct vm_area_struct *vma)
-{
-       u64 addr = (u64)vma->vm_pgoff << PAGE_SHIFT;
-       unsigned long flags = pgprot_val(vma->vm_page_prot)
-                                               & _PAGE_CACHE_MASK;
-
-       reserve_memtype(addr, addr + vma->vm_end - vma->vm_start, flags, NULL);
-}
-
 static struct vm_operations_struct pci_mmap_ops = {
-       .open  = pci_track_mmap_page_range,
-       .close = pci_unmap_page_range,
        .access = generic_access_phys,
 };
 
@@ -283,11 +266,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                        enum pci_mmap_state mmap_state, int write_combine)
 {
        unsigned long prot;
-       u64 addr = vma->vm_pgoff << PAGE_SHIFT;
-       unsigned long len = vma->vm_end - vma->vm_start;
-       unsigned long flags;
-       unsigned long new_flags;
-       int retval;
 
        /* I/O space cannot be accessed via normal processor loads and
         * stores on this platform.
@@ -308,30 +286,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 
        vma->vm_page_prot = __pgprot(prot);
 
-       flags = pgprot_val(vma->vm_page_prot) & _PAGE_CACHE_MASK;
-       retval = reserve_memtype(addr, addr + len, flags, &new_flags);
-       if (retval)
-               return retval;
-
-       if (flags != new_flags) {
-               if (!is_new_memtype_allowed(flags, new_flags)) {
-                       free_memtype(addr, addr+len);
-                       return -EINVAL;
-               }
-               flags = new_flags;
-               vma->vm_page_prot = __pgprot(
-                       (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK) |
-                       flags);
-       }
-
-       if (((vma->vm_pgoff < max_low_pfn_mapped) ||
-            (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
-             vma->vm_pgoff < max_pfn_mapped)) &&
-           ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
-               free_memtype(addr, addr + len);
-               return -EINVAL;
-       }
-
        if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                               vma->vm_end - vma->vm_start,
                               vma->vm_page_prot))
index f21147f3626a628c2fb78e1e5448a0361f9a630d..06eb6cc09fef97714d12ebf8859e5e712d798df5 100644 (file)
@@ -30,7 +30,7 @@
 #ifdef CONFIG_DMA_ENGINE
 static int __init async_tx_init(void)
 {
-       dmaengine_get();
+       async_dmaengine_get();
 
        printk(KERN_INFO "async_tx: api initialized (async)\n");
 
@@ -39,7 +39,7 @@ static int __init async_tx_init(void)
 
 static void __exit async_tx_exit(void)
 {
-       dmaengine_put();
+       async_dmaengine_put();
 }
 
 /**
@@ -56,7 +56,7 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
        if (depend_tx &&
            dma_has_cap(tx_type, depend_tx->chan->device->cap_mask))
                return depend_tx->chan;
-       return dma_find_channel(tx_type);
+       return async_dma_find_channel(tx_type);
 }
 EXPORT_SYMBOL_GPL(__async_tx_find_channel);
 #else
index 595b78672b36ad90e27cf50ed28126bb43970fb5..95fe2c8d6c516ea0325b9287df8775adb542cee7 100644 (file)
 #include <linux/raid/xor.h>
 #include <linux/async_tx.h>
 
-/* do_async_xor - dma map the pages and perform the xor with an engine.
- *     This routine is marked __always_inline so it can be compiled away
- *     when CONFIG_DMA_ENGINE=n
- */
-static __always_inline struct dma_async_tx_descriptor *
+/* do_async_xor - dma map the pages and perform the xor with an engine */
+static __async_inline struct dma_async_tx_descriptor *
 do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list,
             unsigned int offset, int src_cnt, size_t len,
             enum async_tx_flags flags,
index 7a659733f94a4e1087fabcfc8d5e48fd6c5c4521..2ccc8b0076ce36ed7d67ff6202e44e90076d7ee2 100644 (file)
@@ -77,6 +77,9 @@ static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
        u8 buf[shash_align_buffer_size(unaligned_len, alignmask)]
                __attribute__ ((aligned));
 
+       if (unaligned_len > len)
+               unaligned_len = len;
+
        memcpy(buf, data, unaligned_len);
 
        return shash->update(desc, buf, unaligned_len) ?:
index b2e6db075e4993eac4fcffb9004b82ee4a0bca71..996b6ee57d9e3fb3ff48251ea9ee01d405b648ad 100644 (file)
@@ -18,8 +18,8 @@
 
 #define BH_TRACE 0
 #include <linux/module.h>
-#include <linux/raid/md.h>
 #include <linux/raid/xor.h>
+#include <linux/jiffies.h>
 #include <asm/xor.h>
 
 /* The xor routines to use.  */
index c2d1eed903767484304b1f3208c27719b5ec3195..9f0e672f4be84ff35489b8f5e8724898ec3d9f12 100644 (file)
@@ -98,3 +98,10 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
        return iommu_ops->iova_to_phys(domain, iova);
 }
 EXPORT_SYMBOL_GPL(iommu_iova_to_phys);
+
+int iommu_domain_has_cap(struct iommu_domain *domain,
+                        unsigned long cap)
+{
+       return iommu_ops->domain_has_cap(domain, cap);
+}
+EXPORT_SYMBOL_GPL(iommu_domain_has_cap);
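
   [Editor's note] iommu_domain_has_cap() simply forwards to the active iommu_ops (the AMD IOMMU stub above currently reports no capabilities). A hypothetical caller sketch; IOMMU_CAP_CACHE_COHERENCY is assumed here as the capability bit of interest, check the iommu.h in your tree for the exact constants:

	#include <linux/iommu.h>

	static int example_domain_snoops(struct iommu_domain *domain)
	{
		/* probe an optional capability before relying on it */
		return iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY);
	}
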
index 45c5a33daf498d623e9749bd75e8f4bf94a9e49e..31693bc24444f3541c74dfccc9c9c384edf8fd5a 100644 (file)
@@ -4,6 +4,7 @@
  * Filesystem request handling methods
  */
 
+#include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/blkdev.h>
 #include <linux/skbuff.h>
@@ -267,7 +268,7 @@ aoecmd_ata_rw(struct aoedev *d)
                writebit = 0;
        }
 
-       ah->cmdstat = WIN_READ | writebit | extbit;
+       ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
 
        /* mark all tracking fields and load out */
        buf->nframesout += 1;
@@ -362,10 +363,10 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
        switch (ah->cmdstat) {
        default:
                break;
-       case WIN_READ:
-       case WIN_READ_EXT:
-       case WIN_WRITE:
-       case WIN_WRITE_EXT:
+       case ATA_CMD_PIO_READ:
+       case ATA_CMD_PIO_READ_EXT:
+       case ATA_CMD_PIO_WRITE:
+       case ATA_CMD_PIO_WRITE_EXT:
                put_lba(ah, f->lba);
 
                n = f->bcnt;
@@ -812,8 +813,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
                        d->htgt = NULL;
                n = ahout->scnt << 9;
                switch (ahout->cmdstat) {
-               case WIN_READ:
-               case WIN_READ_EXT:
+               case ATA_CMD_PIO_READ:
+               case ATA_CMD_PIO_READ_EXT:
                        if (skb->len - sizeof *hin - sizeof *ahin < n) {
                                printk(KERN_ERR
                                        "aoe: %s.  skb->len=%d need=%ld\n",
@@ -823,8 +824,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
                                return;
                        }
                        memcpy(f->bufaddr, ahin+1, n);
-               case WIN_WRITE:
-               case WIN_WRITE_EXT:
+               case ATA_CMD_PIO_WRITE:
+               case ATA_CMD_PIO_WRITE_EXT:
                        ifp = getif(t, skb->dev);
                        if (ifp) {
                                ifp->lost = 0;
@@ -838,7 +839,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
                                goto xmit;
                        }
                        break;
-               case WIN_IDENTIFY:
+               case ATA_CMD_ID_ATA:
                        if (skb->len - sizeof *hin - sizeof *ahin < 512) {
                                printk(KERN_INFO
                                        "aoe: runt data size in ataid.  skb->len=%d\n",
@@ -914,7 +915,7 @@ aoecmd_ata_id(struct aoedev *d)
 
        /* set up ata header */
        ah->scnt = 1;
-       ah->cmdstat = WIN_IDENTIFY;
+       ah->cmdstat = ATA_CMD_ID_ATA;
        ah->lba3 = 0xa0;
 
        skb->dev = t->ifp->nd;
index 482c0c4b964f31c39a6fe5f3d95b6768776b4393..3c11f062a18cff77c25ab4f9f3159eb7218a4391 100644 (file)
@@ -42,6 +42,8 @@
 #include <linux/ata.h>
 #include <linux/hdreg.h>
 
+#define HD_IRQ 14
+
 #define REALLY_SLOW_IO
 #include <asm/system.h>
 #include <asm/io.h>
index 119be3442f28f7f2c979fd2c5ee64c1967ff16f8..6cccdc3f5220ce58b46bb0a0f2714bb537c5dc6b 100644 (file)
@@ -89,6 +89,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/platform_device.h>
 #if defined(CONFIG_OF)
@@ -208,7 +209,7 @@ struct ace_device {
        struct gendisk *gd;
 
        /* Inserted CF card parameters */
-       struct hd_driveid cf_id;
+       u16 cf_id[ATA_ID_WORDS];
 };
 
 static int ace_major;
@@ -402,21 +403,14 @@ static void ace_dump_regs(struct ace_device *ace)
                 ace_in32(ace, ACE_CFGLBA), ace_in(ace, ACE_FATSTAT));
 }
 
-void ace_fix_driveid(struct hd_driveid *id)
+void ace_fix_driveid(u16 *id)
 {
 #if defined(__BIG_ENDIAN)
-       u16 *buf = (void *)id;
        int i;
 
        /* All half words have wrong byte order; swap the bytes */
-       for (i = 0; i < sizeof(struct hd_driveid); i += 2, buf++)
-               *buf = le16_to_cpu(*buf);
-
-       /* Some of the data values are 32bit; swap the half words  */
-       id->lba_capacity = ((id->lba_capacity >> 16) & 0x0000FFFF) |
-           ((id->lba_capacity << 16) & 0xFFFF0000);
-       id->spg = ((id->spg >> 16) & 0x0000FFFF) |
-           ((id->spg << 16) & 0xFFFF0000);
+       for (i = 0; i < ATA_ID_WORDS; i++, id++)
+               *id = le16_to_cpu(*id);
 #endif
 }
 
@@ -614,7 +608,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
                break;
 
        case ACE_FSM_STATE_IDENTIFY_COMPLETE:
-               ace_fix_driveid(&ace->cf_id);
+               ace_fix_driveid(&ace->cf_id[0]);
                ace_dump_mem(&ace->cf_id, 512); /* Debug: Dump out disk ID */
 
                if (ace->data_result) {
@@ -627,9 +621,10 @@ static void ace_fsm_dostate(struct ace_device *ace)
                        ace->media_change = 0;
 
                        /* Record disk parameters */
-                       set_capacity(ace->gd, ace->cf_id.lba_capacity);
+                       set_capacity(ace->gd,
+                               ata_id_u32(&ace->cf_id, ATA_ID_LBA_CAPACITY));
                        dev_info(ace->dev, "capacity: %i sectors\n",
-                                ace->cf_id.lba_capacity);
+                               ata_id_u32(&ace->cf_id, ATA_ID_LBA_CAPACITY));
                }
 
                /* We're done, drop to IDLE state and notify waiters */
@@ -928,12 +923,13 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
 static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
        struct ace_device *ace = bdev->bd_disk->private_data;
+       u16 *cf_id = &ace->cf_id[0];
 
        dev_dbg(ace->dev, "ace_getgeo()\n");
 
-       geo->heads = ace->cf_id.heads;
-       geo->sectors = ace->cf_id.sectors;
-       geo->cylinders = ace->cf_id.cyls;
+       geo->heads      = cf_id[ATA_ID_HEADS];
+       geo->sectors    = cf_id[ATA_ID_SECTORS];
+       geo->cylinders  = cf_id[ATA_ID_CYLS];
 
        return 0;
 }
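
   [Editor's note] With struct hd_driveid gone, geometry and capacity are read straight from the raw 256-word ATA IDENTIFY block, as the hunks above do. A small sketch (not from this merge) of the accessors involved: 16-bit fields are indexed by ATA_ID_* word numbers, and 32-bit fields such as the LBA capacity are joined from two adjacent words with ata_id_u32():

	#include <linux/ata.h>

	struct example_geom {
		u16 heads, sectors, cyls;
		u32 lba_capacity;
	};

	static void example_read_identify(const u16 *id, struct example_geom *g)
	{
		g->heads	= id[ATA_ID_HEADS];
		g->sectors	= id[ATA_ID_SECTORS];
		g->cyls		= id[ATA_ID_CYLS];
		g->lba_capacity	= ata_id_u32(id, ATA_ID_LBA_CAPACITY);
	}
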
index 10ad41be5897ec53ab27623d0677aa31ed714365..dcd352ad0e7f1e1378c654efd3266c56bf403604 100644 (file)
@@ -90,10 +90,30 @@ static struct hwrng timeriomem_rng_ops = {
 
 static int __init timeriomem_rng_probe(struct platform_device *pdev)
 {
+       struct resource *res, *mem;
        int ret;
 
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+
+       if (!res)
+               return -ENOENT;
+
+       mem = request_mem_region(res->start, res->end - res->start + 1,
+                                pdev->name);
+       if (mem == NULL)
+               return -EBUSY;
+
+       dev_set_drvdata(&pdev->dev, mem);
+
        timeriomem_rng_data = pdev->dev.platform_data;
 
+       timeriomem_rng_data->address = ioremap(res->start,
+                                               res->end - res->start + 1);
+       if (!timeriomem_rng_data->address) {
+               ret = -ENOMEM;
+               goto err_ioremap;
+       }
+
        if (timeriomem_rng_data->period != 0
                && usecs_to_jiffies(timeriomem_rng_data->period) > 0) {
                timeriomem_rng_timer.expires = jiffies;
@@ -104,23 +124,34 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev)
        timeriomem_rng_data->present = 1;
 
        ret = hwrng_register(&timeriomem_rng_ops);
-       if (ret) {
-               dev_err(&pdev->dev, "problem registering\n");
-               return ret;
-       }
+       if (ret)
+               goto err_register;
 
        dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n",
                        timeriomem_rng_data->address,
                        timeriomem_rng_data->period);
 
        return 0;
+
+err_register:
+       dev_err(&pdev->dev, "problem registering\n");
+       iounmap(timeriomem_rng_data->address);
+err_ioremap:
+       release_resource(mem);
+
+       return ret;
 }
 
 static int __devexit timeriomem_rng_remove(struct platform_device *pdev)
 {
+       struct resource *mem = dev_get_drvdata(&pdev->dev);
+
        del_timer_sync(&timeriomem_rng_timer);
        hwrng_unregister(&timeriomem_rng_ops);
 
+       iounmap(timeriomem_rng_data->address);
+       release_resource(mem);
+
        return 0;
 }
 
index d9e751be8c5fb120e5f0d06ef27684c8f33fa549..af9761ccf9f132a6414aacec926251b06c8b64c6 100644 (file)
@@ -101,6 +101,7 @@ struct buffer_desc {
        u32 phys_addr;
        u32 __reserved[4];
        struct buffer_desc *next;
+       enum dma_data_direction dir;
 };
 
 struct crypt_ctl {
@@ -132,14 +133,10 @@ struct crypt_ctl {
 struct ablk_ctx {
        struct buffer_desc *src;
        struct buffer_desc *dst;
-       unsigned src_nents;
-       unsigned dst_nents;
 };
 
 struct aead_ctx {
        struct buffer_desc *buffer;
-       unsigned short assoc_nents;
-       unsigned short src_nents;
        struct scatterlist ivlist;
        /* used when the hmac is not on one sg entry */
        u8 *hmac_virt;
@@ -312,7 +309,7 @@ static struct crypt_ctl *get_crypt_desc_emerg(void)
        }
 }
 
-static void free_buf_chain(struct buffer_desc *buf, u32 phys)
+static void free_buf_chain(struct device *dev, struct buffer_desc *buf,u32 phys)
 {
        while (buf) {
                struct buffer_desc *buf1;
@@ -320,6 +317,7 @@ static void free_buf_chain(struct buffer_desc *buf, u32 phys)
 
                buf1 = buf->next;
                phys1 = buf->phys_next;
+               dma_unmap_single(dev, buf->phys_next, buf->buf_len, buf->dir);
                dma_pool_free(buffer_pool, buf, phys);
                buf = buf1;
                phys = phys1;
@@ -348,7 +346,6 @@ static void one_packet(dma_addr_t phys)
        struct crypt_ctl *crypt;
        struct ixp_ctx *ctx;
        int failed;
-       enum dma_data_direction src_direction = DMA_BIDIRECTIONAL;
 
        failed = phys & 0x1 ? -EBADMSG : 0;
        phys &= ~0x3;
@@ -358,13 +355,8 @@ static void one_packet(dma_addr_t phys)
        case CTL_FLAG_PERFORM_AEAD: {
                struct aead_request *req = crypt->data.aead_req;
                struct aead_ctx *req_ctx = aead_request_ctx(req);
-               dma_unmap_sg(dev, req->assoc, req_ctx->assoc_nents,
-                               DMA_TO_DEVICE);
-               dma_unmap_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL);
-               dma_unmap_sg(dev, req->src, req_ctx->src_nents,
-                               DMA_BIDIRECTIONAL);
 
-               free_buf_chain(req_ctx->buffer, crypt->src_buf);
+               free_buf_chain(dev, req_ctx->buffer, crypt->src_buf);
                if (req_ctx->hmac_virt) {
                        finish_scattered_hmac(crypt);
                }
@@ -374,16 +366,11 @@ static void one_packet(dma_addr_t phys)
        case CTL_FLAG_PERFORM_ABLK: {
                struct ablkcipher_request *req = crypt->data.ablk_req;
                struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req);
-               int nents;
+
                if (req_ctx->dst) {
-                       nents = req_ctx->dst_nents;
-                       dma_unmap_sg(dev, req->dst, nents, DMA_FROM_DEVICE);
-                       free_buf_chain(req_ctx->dst, crypt->dst_buf);
-                       src_direction = DMA_TO_DEVICE;
+                       free_buf_chain(dev, req_ctx->dst, crypt->dst_buf);
                }
-               nents = req_ctx->src_nents;
-               dma_unmap_sg(dev, req->src, nents, src_direction);
-               free_buf_chain(req_ctx->src, crypt->src_buf);
+               free_buf_chain(dev, req_ctx->src, crypt->src_buf);
                req->base.complete(&req->base, failed);
                break;
        }
@@ -750,56 +737,35 @@ static int setup_cipher(struct crypto_tfm *tfm, int encrypt,
        return 0;
 }
 
-static int count_sg(struct scatterlist *sg, int nbytes)
+static struct buffer_desc *chainup_buffers(struct device *dev,
+               struct scatterlist *sg, unsigned nbytes,
+               struct buffer_desc *buf, gfp_t flags,
+               enum dma_data_direction dir)
 {
-       int i;
-       for (i = 0; nbytes > 0; i++, sg = sg_next(sg))
-               nbytes -= sg->length;
-       return i;
-}
-
-static struct buffer_desc *chainup_buffers(struct scatterlist *sg,
-                       unsigned nbytes, struct buffer_desc *buf, gfp_t flags)
-{
-       int nents = 0;
-
-       while (nbytes > 0) {
+       for (;nbytes > 0; sg = scatterwalk_sg_next(sg)) {
+               unsigned len = min(nbytes, sg->length);
                struct buffer_desc *next_buf;
                u32 next_buf_phys;
-               unsigned len = min(nbytes, sg_dma_len(sg));
+               void *ptr;
 
-               nents++;
                nbytes -= len;
-               if (!buf->phys_addr) {
-                       buf->phys_addr = sg_dma_address(sg);
-                       buf->buf_len = len;
-                       buf->next = NULL;
-                       buf->phys_next = 0;
-                       goto next;
-               }
-               /* Two consecutive chunks on one page may be handled by the old
-                * buffer descriptor, increased by the length of the new one
-                */
-               if (sg_dma_address(sg) == buf->phys_addr + buf->buf_len) {
-                       buf->buf_len += len;
-                       goto next;
-               }
+               ptr = page_address(sg_page(sg)) + sg->offset;
                next_buf = dma_pool_alloc(buffer_pool, flags, &next_buf_phys);
-               if (!next_buf)
-                       return NULL;
+               if (!next_buf) {
+                       buf = NULL;
+                       break;
+               }
+               sg_dma_address(sg) = dma_map_single(dev, ptr, len, dir);
                buf->next = next_buf;
                buf->phys_next = next_buf_phys;
-
                buf = next_buf;
-               buf->next = NULL;
-               buf->phys_next = 0;
+
                buf->phys_addr = sg_dma_address(sg);
                buf->buf_len = len;
-next:
-               if (nbytes > 0) {
-                       sg = sg_next(sg);
-               }
+               buf->dir = dir;
        }
+       buf->next = NULL;
+       buf->phys_next = 0;
        return buf;
 }
 
@@ -860,12 +826,12 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt)
        struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
        struct ixp_ctx *ctx = crypto_ablkcipher_ctx(tfm);
        unsigned ivsize = crypto_ablkcipher_ivsize(tfm);
-       int ret = -ENOMEM;
        struct ix_sa_dir *dir;
        struct crypt_ctl *crypt;
-       unsigned int nbytes = req->nbytes, nents;
+       unsigned int nbytes = req->nbytes;
        enum dma_data_direction src_direction = DMA_BIDIRECTIONAL;
        struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req);
+       struct buffer_desc src_hook;
        gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
                                GFP_KERNEL : GFP_ATOMIC;
 
@@ -878,7 +844,7 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt)
 
        crypt = get_crypt_desc();
        if (!crypt)
-               return ret;
+               return -ENOMEM;
 
        crypt->data.ablk_req = req;
        crypt->crypto_ctx = dir->npe_ctx_phys;
@@ -891,53 +857,41 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt)
        BUG_ON(ivsize && !req->info);
        memcpy(crypt->iv, req->info, ivsize);
        if (req->src != req->dst) {
+               struct buffer_desc dst_hook;
                crypt->mode |= NPE_OP_NOT_IN_PLACE;
-               nents = count_sg(req->dst, nbytes);
                /* This was never tested by Intel
                 * for more than one dst buffer, I think. */
-               BUG_ON(nents != 1);
-               req_ctx->dst_nents = nents;
-               dma_map_sg(dev, req->dst, nents, DMA_FROM_DEVICE);
-               req_ctx->dst = dma_pool_alloc(buffer_pool, flags,&crypt->dst_buf);
-               if (!req_ctx->dst)
-                       goto unmap_sg_dest;
-               req_ctx->dst->phys_addr = 0;
-               if (!chainup_buffers(req->dst, nbytes, req_ctx->dst, flags))
+               BUG_ON(req->dst->length < nbytes);
+               req_ctx->dst = NULL;
+               if (!chainup_buffers(dev, req->dst, nbytes, &dst_hook,
+                                       flags, DMA_FROM_DEVICE))
                        goto free_buf_dest;
                src_direction = DMA_TO_DEVICE;
+               req_ctx->dst = dst_hook.next;
+               crypt->dst_buf = dst_hook.phys_next;
        } else {
                req_ctx->dst = NULL;
-               req_ctx->dst_nents = 0;
        }
-       nents = count_sg(req->src, nbytes);
-       req_ctx->src_nents = nents;
-       dma_map_sg(dev, req->src, nents, src_direction);
-
-       req_ctx->src = dma_pool_alloc(buffer_pool, flags, &crypt->src_buf);
-       if (!req_ctx->src)
-               goto unmap_sg_src;
-       req_ctx->src->phys_addr = 0;
-       if (!chainup_buffers(req->src, nbytes, req_ctx->src, flags))
+       req_ctx->src = NULL;
+       if (!chainup_buffers(dev, req->src, nbytes, &src_hook,
+                               flags, src_direction))
                goto free_buf_src;
 
+       req_ctx->src = src_hook.next;
+       crypt->src_buf = src_hook.phys_next;
        crypt->ctl_flags |= CTL_FLAG_PERFORM_ABLK;
        qmgr_put_entry(SEND_QID, crypt_virt2phys(crypt));
        BUG_ON(qmgr_stat_overflow(SEND_QID));
        return -EINPROGRESS;
 
 free_buf_src:
-       free_buf_chain(req_ctx->src, crypt->src_buf);
-unmap_sg_src:
-       dma_unmap_sg(dev, req->src, req_ctx->src_nents, src_direction);
+       free_buf_chain(dev, req_ctx->src, crypt->src_buf);
 free_buf_dest:
        if (req->src != req->dst) {
-               free_buf_chain(req_ctx->dst, crypt->dst_buf);
-unmap_sg_dest:
-               dma_unmap_sg(dev, req->src, req_ctx->dst_nents,
-                       DMA_FROM_DEVICE);
+               free_buf_chain(dev, req_ctx->dst, crypt->dst_buf);
        }
        crypt->ctl_flags = CTL_FLAG_UNUSED;
-       return ret;
+       return -ENOMEM;
 }
 
 static int ablk_encrypt(struct ablkcipher_request *req)
@@ -985,7 +939,7 @@ static int hmac_inconsistent(struct scatterlist *sg, unsigned start,
                        break;
 
                offset += sg->length;
-               sg = sg_next(sg);
+               sg = scatterwalk_sg_next(sg);
        }
        return (start + nbytes > offset + sg->length);
 }
@@ -997,11 +951,10 @@ static int aead_perform(struct aead_request *req, int encrypt,
        struct ixp_ctx *ctx = crypto_aead_ctx(tfm);
        unsigned ivsize = crypto_aead_ivsize(tfm);
        unsigned authsize = crypto_aead_authsize(tfm);
-       int ret = -ENOMEM;
        struct ix_sa_dir *dir;
        struct crypt_ctl *crypt;
-       unsigned int cryptlen, nents;
-       struct buffer_desc *buf;
+       unsigned int cryptlen;
+       struct buffer_desc *buf, src_hook;
        struct aead_ctx *req_ctx = aead_request_ctx(req);
        gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
                                GFP_KERNEL : GFP_ATOMIC;
@@ -1022,7 +975,7 @@ static int aead_perform(struct aead_request *req, int encrypt,
        }
        crypt = get_crypt_desc();
        if (!crypt)
-               return ret;
+               return -ENOMEM;
 
        crypt->data.aead_req = req;
        crypt->crypto_ctx = dir->npe_ctx_phys;
@@ -1041,31 +994,27 @@ static int aead_perform(struct aead_request *req, int encrypt,
                BUG(); /* -ENOTSUP because of my lazyness */
        }
 
-       req_ctx->buffer = dma_pool_alloc(buffer_pool, flags, &crypt->src_buf);
-       if (!req_ctx->buffer)
-               goto out;
-       req_ctx->buffer->phys_addr = 0;
        /* ASSOC data */
-       nents = count_sg(req->assoc, req->assoclen);
-       req_ctx->assoc_nents = nents;
-       dma_map_sg(dev, req->assoc, nents, DMA_TO_DEVICE);
-       buf = chainup_buffers(req->assoc, req->assoclen, req_ctx->buffer,flags);
+       buf = chainup_buffers(dev, req->assoc, req->assoclen, &src_hook,
+               flags, DMA_TO_DEVICE);
+       req_ctx->buffer = src_hook.next;
+       crypt->src_buf = src_hook.phys_next;
        if (!buf)
-               goto unmap_sg_assoc;
+               goto out;
        /* IV */
        sg_init_table(&req_ctx->ivlist, 1);
        sg_set_buf(&req_ctx->ivlist, iv, ivsize);
-       dma_map_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL);
-       buf = chainup_buffers(&req_ctx->ivlist, ivsize, buf, flags);
+       buf = chainup_buffers(dev, &req_ctx->ivlist, ivsize, buf, flags,
+                       DMA_BIDIRECTIONAL);
        if (!buf)
-               goto unmap_sg_iv;
+               goto free_chain;
        if (unlikely(hmac_inconsistent(req->src, cryptlen, authsize))) {
                /* The 12 hmac bytes are scattered,
                 * we need to copy them into a safe buffer */
                req_ctx->hmac_virt = dma_pool_alloc(buffer_pool, flags,
                                &crypt->icv_rev_aes);
                if (unlikely(!req_ctx->hmac_virt))
-                       goto unmap_sg_iv;
+                       goto free_chain;
                if (!encrypt) {
                        scatterwalk_map_and_copy(req_ctx->hmac_virt,
                                req->src, cryptlen, authsize, 0);
@@ -1075,33 +1024,28 @@ static int aead_perform(struct aead_request *req, int encrypt,
                req_ctx->hmac_virt = NULL;
        }
        /* Crypt */
-       nents = count_sg(req->src, cryptlen + authsize);
-       req_ctx->src_nents = nents;
-       dma_map_sg(dev, req->src, nents, DMA_BIDIRECTIONAL);
-       buf = chainup_buffers(req->src, cryptlen + authsize, buf, flags);
+       buf = chainup_buffers(dev, req->src, cryptlen + authsize, buf, flags,
+                       DMA_BIDIRECTIONAL);
        if (!buf)
-               goto unmap_sg_src;
+               goto free_hmac_virt;
        if (!req_ctx->hmac_virt) {
                crypt->icv_rev_aes = buf->phys_addr + buf->buf_len - authsize;
        }
+
        crypt->ctl_flags |= CTL_FLAG_PERFORM_AEAD;
        qmgr_put_entry(SEND_QID, crypt_virt2phys(crypt));
        BUG_ON(qmgr_stat_overflow(SEND_QID));
        return -EINPROGRESS;
-unmap_sg_src:
-       dma_unmap_sg(dev, req->src, req_ctx->src_nents, DMA_BIDIRECTIONAL);
+free_hmac_virt:
        if (req_ctx->hmac_virt) {
                dma_pool_free(buffer_pool, req_ctx->hmac_virt,
                                crypt->icv_rev_aes);
        }
-unmap_sg_iv:
-       dma_unmap_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL);
-unmap_sg_assoc:
-       dma_unmap_sg(dev, req->assoc, req_ctx->assoc_nents, DMA_TO_DEVICE);
-       free_buf_chain(req_ctx->buffer, crypt->src_buf);
+free_chain:
+       free_buf_chain(dev, req_ctx->buffer, crypt->src_buf);
 out:
        crypt->ctl_flags = CTL_FLAG_UNUSED;
-       return ret;
+       return -ENOMEM;
 }
 
 static int aead_setup(struct crypto_aead *tfm, unsigned int authsize)
index 48ea59e796722cea45e2421830355f349c22c939..3b3c01b6f1ee34333d0f57f558f25560f76b2d44 100644 (file)
@@ -98,6 +98,17 @@ config NET_DMA
          Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise
          say N.
 
+config ASYNC_TX_DMA
+       bool "Async_tx: Offload support for the async_tx api"
+       depends on DMA_ENGINE
+       help
+         This allows the async_tx api to take advantage of offload engines for
+         memcpy, memset, xor, and raid6 p+q operations.  If your platform has
+         a dma engine that can perform raid operations and you have enabled
+         MD_RAID456 say Y.
+
+         If unsure, say N.
+
 config DMATEST
        tristate "DMA Test client"
        depends on DMA_ENGINE
index 280a9d263eb3c4271d1cff6e55448cae0a93b387..92438e9dacc35887bd2d8b493ea3d93ac3bc20f7 100644 (file)
@@ -507,6 +507,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v
                         * published in the general-purpose allocator
                         */
                        dma_cap_set(DMA_PRIVATE, device->cap_mask);
+                       device->privatecnt++;
                        err = dma_chan_get(chan);
 
                        if (err == -ENODEV) {
@@ -518,6 +519,8 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v
                                       dma_chan_name(chan), err);
                        else
                                break;
+                       if (--device->privatecnt == 0)
+                               dma_cap_clear(DMA_PRIVATE, device->cap_mask);
                        chan->private = NULL;
                        chan = NULL;
                }
@@ -537,6 +540,9 @@ void dma_release_channel(struct dma_chan *chan)
        WARN_ONCE(chan->client_count != 1,
                  "chan reference count %d != 1\n", chan->client_count);
        dma_chan_put(chan);
+       /* drop PRIVATE cap enabled by __dma_request_channel() */
+       if (--chan->device->privatecnt == 0)
+               dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask);
        chan->private = NULL;
        mutex_unlock(&dma_list_mutex);
 }
@@ -602,6 +608,24 @@ void dmaengine_put(void)
 }
 EXPORT_SYMBOL(dmaengine_put);
 
+static int get_dma_id(struct dma_device *device)
+{
+       int rc;
+
+ idr_retry:
+       if (!idr_pre_get(&dma_idr, GFP_KERNEL))
+               return -ENOMEM;
+       mutex_lock(&dma_list_mutex);
+       rc = idr_get_new(&dma_idr, NULL, &device->dev_id);
+       mutex_unlock(&dma_list_mutex);
+       if (rc == -EAGAIN)
+               goto idr_retry;
+       else if (rc != 0)
+               return rc;
+
+       return 0;
+}
+
 /**
  * dma_async_device_register - registers DMA devices found
  * @device: &dma_device
@@ -640,27 +664,25 @@ int dma_async_device_register(struct dma_device *device)
        idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL);
        if (!idr_ref)
                return -ENOMEM;
-       atomic_set(idr_ref, 0);
- idr_retry:
-       if (!idr_pre_get(&dma_idr, GFP_KERNEL))
-               return -ENOMEM;
-       mutex_lock(&dma_list_mutex);
-       rc = idr_get_new(&dma_idr, NULL, &device->dev_id);
-       mutex_unlock(&dma_list_mutex);
-       if (rc == -EAGAIN)
-               goto idr_retry;
-       else if (rc != 0)
+       rc = get_dma_id(device);
+       if (rc != 0) {
+               kfree(idr_ref);
                return rc;
+       }
+
+       atomic_set(idr_ref, 0);
 
        /* represent channels in sysfs. Probably want devs too */
        list_for_each_entry(chan, &device->channels, device_node) {
+               rc = -ENOMEM;
                chan->local = alloc_percpu(typeof(*chan->local));
                if (chan->local == NULL)
-                       continue;
+                       goto err_out;
                chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL);
                if (chan->dev == NULL) {
                        free_percpu(chan->local);
-                       continue;
+                       chan->local = NULL;
+                       goto err_out;
                }
 
                chan->chan_id = chancnt++;
@@ -677,6 +699,8 @@ int dma_async_device_register(struct dma_device *device)
                if (rc) {
                        free_percpu(chan->local);
                        chan->local = NULL;
+                       kfree(chan->dev);
+                       atomic_dec(idr_ref);
                        goto err_out;
                }
                chan->client_count = 0;
@@ -701,12 +725,23 @@ int dma_async_device_register(struct dma_device *device)
                        }
                }
        list_add_tail_rcu(&device->global_node, &dma_device_list);
+       if (dma_has_cap(DMA_PRIVATE, device->cap_mask))
+               device->privatecnt++;   /* Always private */
        dma_channel_rebalance();
        mutex_unlock(&dma_list_mutex);
 
        return 0;
 
 err_out:
+       /* if we never registered a channel just release the idr */
+       if (atomic_read(idr_ref) == 0) {
+               mutex_lock(&dma_list_mutex);
+               idr_remove(&dma_idr, device->dev_id);
+               mutex_unlock(&dma_list_mutex);
+               kfree(idr_ref);
+               return rc;
+       }
+
        list_for_each_entry(chan, &device->channels, device_node) {
                if (chan->local == NULL)
                        continue;
@@ -893,6 +928,7 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx,
 {
        tx->chan = chan;
        spin_lock_init(&tx->lock);
+       INIT_LIST_HEAD(&tx->tx_list);
 }
 EXPORT_SYMBOL(dma_async_tx_descriptor_init);
 
index e190d8b30700c40858fceb83afa05f49d9ce8d2e..a27c0fb1bc11f17b6b8c56bdc383aee5a710b051 100644 (file)
@@ -38,6 +38,11 @@ module_param(max_channels, uint, S_IRUGO);
 MODULE_PARM_DESC(max_channels,
                "Maximum number of channels to use (default: all)");
 
+static unsigned int xor_sources = 3;
+module_param(xor_sources, uint, S_IRUGO);
+MODULE_PARM_DESC(xor_sources,
+               "Number of xor source buffers (default: 3)");
+
 /*
  * Initialization patterns. All bytes in the source buffer has bit 7
  * set, all bytes in the destination buffer has bit 7 cleared.
@@ -59,8 +64,9 @@ struct dmatest_thread {
        struct list_head        node;
        struct task_struct      *task;
        struct dma_chan         *chan;
-       u8                      *srcbuf;
-       u8                      *dstbuf;
+       u8                      **srcs;
+       u8                      **dsts;
+       enum dma_transaction_type type;
 };
 
 struct dmatest_chan {
@@ -98,30 +104,37 @@ static unsigned long dmatest_random(void)
        return buf;
 }
 
-static void dmatest_init_srcbuf(u8 *buf, unsigned int start, unsigned int len)
+static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len)
 {
        unsigned int i;
-
-       for (i = 0; i < start; i++)
-               buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
-       for ( ; i < start + len; i++)
-               buf[i] = PATTERN_SRC | PATTERN_COPY
-                       | (~i & PATTERN_COUNT_MASK);;
-       for ( ; i < test_buf_size; i++)
-               buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
+       u8 *buf;
+
+       for (; (buf = *bufs); bufs++) {
+               for (i = 0; i < start; i++)
+                       buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
+               for ( ; i < start + len; i++)
+                       buf[i] = PATTERN_SRC | PATTERN_COPY
+                               | (~i & PATTERN_COUNT_MASK);;
+               for ( ; i < test_buf_size; i++)
+                       buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
+               buf++;
+       }
 }
 
-static void dmatest_init_dstbuf(u8 *buf, unsigned int start, unsigned int len)
+static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len)
 {
        unsigned int i;
-
-       for (i = 0; i < start; i++)
-               buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
-       for ( ; i < start + len; i++)
-               buf[i] = PATTERN_DST | PATTERN_OVERWRITE
-                       | (~i & PATTERN_COUNT_MASK);
-       for ( ; i < test_buf_size; i++)
-               buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
+       u8 *buf;
+
+       for (; (buf = *bufs); bufs++) {
+               for (i = 0; i < start; i++)
+                       buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
+               for ( ; i < start + len; i++)
+                       buf[i] = PATTERN_DST | PATTERN_OVERWRITE
+                               | (~i & PATTERN_COUNT_MASK);
+               for ( ; i < test_buf_size; i++)
+                       buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK);
+       }
 }
 
 static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index,
@@ -150,23 +163,30 @@ static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index,
                                thread_name, index, expected, actual);
 }
 
-static unsigned int dmatest_verify(u8 *buf, unsigned int start,
+static unsigned int dmatest_verify(u8 **bufs, unsigned int start,
                unsigned int end, unsigned int counter, u8 pattern,
                bool is_srcbuf)
 {
        unsigned int i;
        unsigned int error_count = 0;
        u8 actual;
-
-       for (i = start; i < end; i++) {
-               actual = buf[i];
-               if (actual != (pattern | (~counter & PATTERN_COUNT_MASK))) {
-                       if (error_count < 32)
-                               dmatest_mismatch(actual, pattern, i, counter,
-                                               is_srcbuf);
-                       error_count++;
+       u8 expected;
+       u8 *buf;
+       unsigned int counter_orig = counter;
+
+       for (; (buf = *bufs); bufs++) {
+               counter = counter_orig;
+               for (i = start; i < end; i++) {
+                       actual = buf[i];
+                       expected = pattern | (~counter & PATTERN_COUNT_MASK);
+                       if (actual != expected) {
+                               if (error_count < 32)
+                                       dmatest_mismatch(actual, pattern, i,
+                                                        counter, is_srcbuf);
+                               error_count++;
+                       }
+                       counter++;
                }
-               counter++;
        }
 
        if (error_count > 32)
@@ -176,12 +196,17 @@ static unsigned int dmatest_verify(u8 *buf, unsigned int start,
        return error_count;
 }
 
+static void dmatest_callback(void *completion)
+{
+       complete(completion);
+}
+
 /*
  * This function repeatedly tests DMA transfers of various lengths and
- * offsets until it is told to exit by kthread_stop(). There may be
- * multiple threads running this function in parallel for a single
- * channel, and there may be multiple channels being tested in
- * parallel.
+ * offsets for a given operation type until it is told to exit by
+ * kthread_stop(). There may be multiple threads running this function
+ * in parallel for a single channel, and there may be multiple channels
+ * being tested in parallel.
  *
  * Before each test, the source and destination buffer is initialized
  * with a known pattern. This pattern is different depending on
@@ -201,25 +226,57 @@ static int dmatest_func(void *data)
        unsigned int            total_tests = 0;
        dma_cookie_t            cookie;
        enum dma_status         status;
+       enum dma_ctrl_flags     flags;
        int                     ret;
+       int                     src_cnt;
+       int                     dst_cnt;
+       int                     i;
 
        thread_name = current->comm;
 
        ret = -ENOMEM;
-       thread->srcbuf = kmalloc(test_buf_size, GFP_KERNEL);
-       if (!thread->srcbuf)
-               goto err_srcbuf;
-       thread->dstbuf = kmalloc(test_buf_size, GFP_KERNEL);
-       if (!thread->dstbuf)
-               goto err_dstbuf;
 
        smp_rmb();
        chan = thread->chan;
+       if (thread->type == DMA_MEMCPY)
+               src_cnt = dst_cnt = 1;
+       else if (thread->type == DMA_XOR) {
+               src_cnt = xor_sources | 1; /* force odd to ensure dst = src */
+               dst_cnt = 1;
+       } else
+               goto err_srcs;
+
+       thread->srcs = kcalloc(src_cnt+1, sizeof(u8 *), GFP_KERNEL);
+       if (!thread->srcs)
+               goto err_srcs;
+       for (i = 0; i < src_cnt; i++) {
+               thread->srcs[i] = kmalloc(test_buf_size, GFP_KERNEL);
+               if (!thread->srcs[i])
+                       goto err_srcbuf;
+       }
+       thread->srcs[i] = NULL;
+
+       thread->dsts = kcalloc(dst_cnt+1, sizeof(u8 *), GFP_KERNEL);
+       if (!thread->dsts)
+               goto err_dsts;
+       for (i = 0; i < dst_cnt; i++) {
+               thread->dsts[i] = kmalloc(test_buf_size, GFP_KERNEL);
+               if (!thread->dsts[i])
+                       goto err_dstbuf;
+       }
+       thread->dsts[i] = NULL;
+
+       set_user_nice(current, 10);
+
+       flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP | DMA_PREP_INTERRUPT;
 
        while (!kthread_should_stop()) {
                struct dma_device *dev = chan->device;
-               struct dma_async_tx_descriptor *tx;
-               dma_addr_t dma_src, dma_dest;
+               struct dma_async_tx_descriptor *tx = NULL;
+               dma_addr_t dma_srcs[src_cnt];
+               dma_addr_t dma_dsts[dst_cnt];
+               struct completion cmp;
+               unsigned long tmo = msecs_to_jiffies(3000);
 
                total_tests++;
 
@@ -227,22 +284,41 @@ static int dmatest_func(void *data)
                src_off = dmatest_random() % (test_buf_size - len + 1);
                dst_off = dmatest_random() % (test_buf_size - len + 1);
 
-               dmatest_init_srcbuf(thread->srcbuf, src_off, len);
-               dmatest_init_dstbuf(thread->dstbuf, dst_off, len);
+               dmatest_init_srcs(thread->srcs, src_off, len);
+               dmatest_init_dsts(thread->dsts, dst_off, len);
 
-               dma_src = dma_map_single(dev->dev, thread->srcbuf + src_off,
-                               len, DMA_TO_DEVICE);
+               for (i = 0; i < src_cnt; i++) {
+                       u8 *buf = thread->srcs[i] + src_off;
+
+                       dma_srcs[i] = dma_map_single(dev->dev, buf, len,
+                                                    DMA_TO_DEVICE);
+               }
                /* map with DMA_BIDIRECTIONAL to force writeback/invalidate */
-               dma_dest = dma_map_single(dev->dev, thread->dstbuf,
-                               test_buf_size, DMA_BIDIRECTIONAL);
+               for (i = 0; i < dst_cnt; i++) {
+                       dma_dsts[i] = dma_map_single(dev->dev, thread->dsts[i],
+                                                    test_buf_size,
+                                                    DMA_BIDIRECTIONAL);
+               }
+
+               if (thread->type == DMA_MEMCPY)
+                       tx = dev->device_prep_dma_memcpy(chan,
+                                                        dma_dsts[0] + dst_off,
+                                                        dma_srcs[0], len,
+                                                        flags);
+               else if (thread->type == DMA_XOR)
+                       tx = dev->device_prep_dma_xor(chan,
+                                                     dma_dsts[0] + dst_off,
+                                                     dma_srcs, xor_sources,
+                                                     len, flags);
 
-               tx = dev->device_prep_dma_memcpy(chan, dma_dest + dst_off,
-                               dma_src, len,
-                               DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP);
                if (!tx) {
-                       dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE);
-                       dma_unmap_single(dev->dev, dma_dest,
-                                       test_buf_size, DMA_BIDIRECTIONAL);
+                       for (i = 0; i < src_cnt; i++)
+                               dma_unmap_single(dev->dev, dma_srcs[i], len,
+                                                DMA_TO_DEVICE);
+                       for (i = 0; i < dst_cnt; i++)
+                               dma_unmap_single(dev->dev, dma_dsts[i],
+                                                test_buf_size,
+                                                DMA_BIDIRECTIONAL);
                        pr_warning("%s: #%u: prep error with src_off=0x%x "
                                        "dst_off=0x%x len=0x%x\n",
                                        thread_name, total_tests - 1,
@@ -251,7 +327,10 @@ static int dmatest_func(void *data)
                        failed_tests++;
                        continue;
                }
-               tx->callback = NULL;
+
+               init_completion(&cmp);
+               tx->callback = dmatest_callback;
+               tx->callback_param = &cmp;
                cookie = tx->tx_submit(tx);
 
                if (dma_submit_error(cookie)) {
@@ -263,44 +342,50 @@ static int dmatest_func(void *data)
                        failed_tests++;
                        continue;
                }
-               dma_async_memcpy_issue_pending(chan);
+               dma_async_issue_pending(chan);
 
-               do {
-                       msleep(1);
-                       status = dma_async_memcpy_complete(
-                                       chan, cookie, NULL, NULL);
-               } while (status == DMA_IN_PROGRESS);
+               tmo = wait_for_completion_timeout(&cmp, tmo);
+               status = dma_async_is_tx_complete(chan, cookie, NULL, NULL);
 
-               if (status == DMA_ERROR) {
-                       pr_warning("%s: #%u: error during copy\n",
-                                       thread_name, total_tests - 1);
+               if (tmo == 0) {
+                       pr_warning("%s: #%u: test timed out\n",
+                                  thread_name, total_tests - 1);
+                       failed_tests++;
+                       continue;
+               } else if (status != DMA_SUCCESS) {
+                       pr_warning("%s: #%u: got completion callback,"
+                                  " but status is \'%s\'\n",
+                                  thread_name, total_tests - 1,
+                                  status == DMA_ERROR ? "error" : "in progress");
                        failed_tests++;
                        continue;
                }
+
                /* Unmap by myself (see DMA_COMPL_SKIP_DEST_UNMAP above) */
-               dma_unmap_single(dev->dev, dma_dest,
-                               test_buf_size, DMA_BIDIRECTIONAL);
+               for (i = 0; i < dst_cnt; i++)
+                       dma_unmap_single(dev->dev, dma_dsts[i], test_buf_size,
+                                        DMA_BIDIRECTIONAL);
 
                error_count = 0;
 
                pr_debug("%s: verifying source buffer...\n", thread_name);
-               error_count += dmatest_verify(thread->srcbuf, 0, src_off,
+               error_count += dmatest_verify(thread->srcs, 0, src_off,
                                0, PATTERN_SRC, true);
-               error_count += dmatest_verify(thread->srcbuf, src_off,
+               error_count += dmatest_verify(thread->srcs, src_off,
                                src_off + len, src_off,
                                PATTERN_SRC | PATTERN_COPY, true);
-               error_count += dmatest_verify(thread->srcbuf, src_off + len,
+               error_count += dmatest_verify(thread->srcs, src_off + len,
                                test_buf_size, src_off + len,
                                PATTERN_SRC, true);
 
                pr_debug("%s: verifying dest buffer...\n",
                                thread->task->comm);
-               error_count += dmatest_verify(thread->dstbuf, 0, dst_off,
+               error_count += dmatest_verify(thread->dsts, 0, dst_off,
                                0, PATTERN_DST, false);
-               error_count += dmatest_verify(thread->dstbuf, dst_off,
+               error_count += dmatest_verify(thread->dsts, dst_off,
                                dst_off + len, src_off,
                                PATTERN_SRC | PATTERN_COPY, false);
-               error_count += dmatest_verify(thread->dstbuf, dst_off + len,
+               error_count += dmatest_verify(thread->dsts, dst_off + len,
                                test_buf_size, dst_off + len,
                                PATTERN_DST, false);
 
@@ -319,10 +404,16 @@ static int dmatest_func(void *data)
        }
 
        ret = 0;
-       kfree(thread->dstbuf);
+       for (i = 0; thread->dsts[i]; i++)
+               kfree(thread->dsts[i]);
 err_dstbuf:
-       kfree(thread->srcbuf);
+       kfree(thread->dsts);
+err_dsts:
+       for (i = 0; thread->srcs[i]; i++)
+               kfree(thread->srcs[i]);
 err_srcbuf:
+       kfree(thread->srcs);
+err_srcs:
        pr_notice("%s: terminating after %u tests, %u failures (status %d)\n",
                        thread_name, total_tests, failed_tests, ret);
        return ret;
@@ -344,35 +435,36 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc)
        kfree(dtc);
 }
 
-static int dmatest_add_channel(struct dma_chan *chan)
+static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_type type)
 {
-       struct dmatest_chan     *dtc;
-       struct dmatest_thread   *thread;
-       unsigned int            i;
-
-       dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL);
-       if (!dtc) {
-               pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan));
-               return -ENOMEM;
-       }
+       struct dmatest_thread *thread;
+       struct dma_chan *chan = dtc->chan;
+       char *op;
+       unsigned int i;
 
-       dtc->chan = chan;
-       INIT_LIST_HEAD(&dtc->threads);
+       if (type == DMA_MEMCPY)
+               op = "copy";
+       else if (type == DMA_XOR)
+               op = "xor";
+       else
+               return -EINVAL;
 
        for (i = 0; i < threads_per_chan; i++) {
                thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL);
                if (!thread) {
-                       pr_warning("dmatest: No memory for %s-test%u\n",
-                                  dma_chan_name(chan), i);
+                       pr_warning("dmatest: No memory for %s-%s%u\n",
+                                  dma_chan_name(chan), op, i);
+
                        break;
                }
                thread->chan = dtc->chan;
+               thread->type = type;
                smp_wmb();
-               thread->task = kthread_run(dmatest_func, thread, "%s-test%u",
-                               dma_chan_name(chan), i);
+               thread->task = kthread_run(dmatest_func, thread, "%s-%s%u",
+                               dma_chan_name(chan), op, i);
                if (IS_ERR(thread->task)) {
-                       pr_warning("dmatest: Failed to run thread %s-test%u\n",
-                                       dma_chan_name(chan), i);
+                       pr_warning("dmatest: Failed to run thread %s-%s%u\n",
+                                       dma_chan_name(chan), op, i);
                        kfree(thread);
                        break;
                }
@@ -382,7 +474,36 @@ static int dmatest_add_channel(struct dma_chan *chan)
                list_add_tail(&thread->node, &dtc->threads);
        }
 
-       pr_info("dmatest: Started %u threads using %s\n", i, dma_chan_name(chan));
+       return i;
+}
+
+static int dmatest_add_channel(struct dma_chan *chan)
+{
+       struct dmatest_chan     *dtc;
+       struct dma_device       *dma_dev = chan->device;
+       unsigned int            thread_count = 0;
+       unsigned int            cnt;
+
+       dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL);
+       if (!dtc) {
+               pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan));
+               return -ENOMEM;
+       }
+
+       dtc->chan = chan;
+       INIT_LIST_HEAD(&dtc->threads);
+
+       if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) {
+               cnt = dmatest_add_threads(dtc, DMA_MEMCPY);
+               thread_count += cnt > 0 ? cnt : 0;
+       }
+       if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
+               cnt = dmatest_add_threads(dtc, DMA_XOR);
+               thread_count += cnt > 0 ? cnt : 0;
+       }
+
+       pr_info("dmatest: Started %u threads using %s\n",
+               thread_count, dma_chan_name(chan));
 
        list_add_tail(&dtc->node, &dmatest_channels);
        nr_channels++;
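
Editor's note: the dmatest hunks above replace the old msleep()-based polling loop with a completion that the new dmatest_callback() fires when the transfer finishes. Below is a minimal, self-contained sketch of that pattern; it is not part of the patch. my_done_callback() and my_submit_and_wait() are hypothetical names, while the dmaengine calls (tx_submit, dma_async_issue_pending, wait_for_completion_timeout, dma_async_is_tx_complete) are the ones used in the diff above.

#include <linux/completion.h>
#include <linux/dmaengine.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static void my_done_callback(void *arg)
{
	complete(arg);			/* runs in the DMAC's callback context */
}

static int my_submit_and_wait(struct dma_chan *chan,
			      struct dma_async_tx_descriptor *tx)
{
	struct completion cmp;
	dma_cookie_t cookie;
	unsigned long tmo = msecs_to_jiffies(3000);

	init_completion(&cmp);
	tx->callback = my_done_callback;
	tx->callback_param = &cmp;

	cookie = tx->tx_submit(tx);
	if (dma_submit_error(cookie))
		return -EIO;

	dma_async_issue_pending(chan);		/* kick the hardware */

	if (!wait_for_completion_timeout(&cmp, tmo))
		return -ETIMEDOUT;		/* no callback within 3 s */

	if (dma_async_is_tx_complete(chan, cookie, NULL, NULL) != DMA_SUCCESS)
		return -EIO;			/* callback fired, but status is bad */

	return 0;
}

Waking on the callback instead of polling keeps the test thread asleep until the DMAC raises its interrupt, and the 3-second timeout turns a lost interrupt into a reported test failure instead of a hang.
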
index 20ad3d26bec2d5330ac40bd8b437475009e0d8f0..98c9a847bf51c27a8671cd3947690458e0896746 100644 (file)
@@ -363,6 +363,82 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc)
        dwc_descriptor_complete(dwc, bad_desc);
 }
 
+/* --------------------- Cyclic DMA API extensions -------------------- */
+
+inline dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan)
+{
+       struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
+       return channel_readl(dwc, SAR);
+}
+EXPORT_SYMBOL(dw_dma_get_src_addr);
+
+inline dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan)
+{
+       struct dw_dma_chan *dwc = to_dw_dma_chan(chan);
+       return channel_readl(dwc, DAR);
+}
+EXPORT_SYMBOL(dw_dma_get_dst_addr);
+
+/* called with dwc->lock held and all DMAC interrupts disabled */
+static void dwc_handle_cyclic(struct dw_dma *dw, struct dw_dma_chan *dwc,
+               u32 status_block, u32 status_err, u32 status_xfer)
+{
+       if (status_block & dwc->mask) {
+               void (*callback)(void *param);
+               void *callback_param;
+
+               dev_vdbg(chan2dev(&dwc->chan), "new cyclic period llp 0x%08x\n",
+                               channel_readl(dwc, LLP));
+               dma_writel(dw, CLEAR.BLOCK, dwc->mask);
+
+               callback = dwc->cdesc->period_callback;
+               callback_param = dwc->cdesc->period_callback_param;
+               if (callback) {
+                       spin_unlock(&dwc->lock);
+                       callback(callback_param);
+                       spin_lock(&dwc->lock);
+               }
+       }
+
+       /*
+        * Error and transfer complete are highly unlikely, and will most
+        * likely be due to a configuration error by the user.
+        */
+       if (unlikely(status_err & dwc->mask) ||
+                       unlikely(status_xfer & dwc->mask)) {
+               int i;
+
+               dev_err(chan2dev(&dwc->chan), "cyclic DMA unexpected %s "
+                               "interrupt, stopping DMA transfer\n",
+                               status_xfer ? "xfer" : "error");
+               dev_err(chan2dev(&dwc->chan),
+                       "  SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n",
+                       channel_readl(dwc, SAR),
+                       channel_readl(dwc, DAR),
+                       channel_readl(dwc, LLP),
+                       channel_readl(dwc, CTL_HI),
+                       channel_readl(dwc, CTL_LO));
+
+               channel_clear_bit(dw, CH_EN, dwc->mask);
+               while (dma_readl(dw, CH_EN) & dwc->mask)
+                       cpu_relax();
+
+               /* make sure DMA does not restart by loading a new list */
+               channel_writel(dwc, LLP, 0);
+               channel_writel(dwc, CTL_LO, 0);
+               channel_writel(dwc, CTL_HI, 0);
+
+               dma_writel(dw, CLEAR.BLOCK, dwc->mask);
+               dma_writel(dw, CLEAR.ERROR, dwc->mask);
+               dma_writel(dw, CLEAR.XFER, dwc->mask);
+
+               for (i = 0; i < dwc->cdesc->periods; i++)
+                       dwc_dump_lli(dwc, &dwc->cdesc->desc[i]->lli);
+       }
+}
+
+/* ------------------------------------------------------------------------- */
+
 static void dw_dma_tasklet(unsigned long data)
 {
        struct dw_dma *dw = (struct dw_dma *)data;
@@ -382,7 +458,10 @@ static void dw_dma_tasklet(unsigned long data)
        for (i = 0; i < dw->dma.chancnt; i++) {
                dwc = &dw->chan[i];
                spin_lock(&dwc->lock);
-               if (status_err & (1 << i))
+               if (test_bit(DW_DMA_IS_CYCLIC, &dwc->flags))
+                       dwc_handle_cyclic(dw, dwc, status_block, status_err,
+                                       status_xfer);
+               else if (status_err & (1 << i))
                        dwc_handle_error(dw, dwc);
                else if ((status_block | status_xfer) & (1 << i))
                        dwc_scan_descriptors(dw, dwc);
@@ -826,7 +905,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
                dma_async_tx_descriptor_init(&desc->txd, chan);
                desc->txd.tx_submit = dwc_tx_submit;
                desc->txd.flags = DMA_CTRL_ACK;
-               INIT_LIST_HEAD(&desc->txd.tx_list);
                desc->txd.phys = dma_map_single(chan2parent(chan), &desc->lli,
                                sizeof(desc->lli), DMA_TO_DEVICE);
                dwc_desc_put(dwc, desc);
@@ -884,6 +962,257 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
        dev_vdbg(chan2dev(chan), "free_chan_resources done\n");
 }
 
+/* --------------------- Cyclic DMA API extensions -------------------- */
+
+/**
+ * dw_dma_cyclic_start - start the cyclic DMA transfer
+ * @chan: the DMA channel to start
+ *
+ * Must be called with soft interrupts disabled. Returns zero on success or
+ * -errno on failure.
+ */
+int dw_dma_cyclic_start(struct dma_chan *chan)
+{
+       struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
+       struct dw_dma           *dw = to_dw_dma(dwc->chan.device);
+
+       if (!test_bit(DW_DMA_IS_CYCLIC, &dwc->flags)) {
+               dev_err(chan2dev(&dwc->chan), "missing prep for cyclic DMA\n");
+               return -ENODEV;
+       }
+
+       spin_lock(&dwc->lock);
+
+       /* assert channel is idle */
+       if (dma_readl(dw, CH_EN) & dwc->mask) {
+               dev_err(chan2dev(&dwc->chan),
+                       "BUG: Attempted to start non-idle channel\n");
+               dev_err(chan2dev(&dwc->chan),
+                       "  SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n",
+                       channel_readl(dwc, SAR),
+                       channel_readl(dwc, DAR),
+                       channel_readl(dwc, LLP),
+                       channel_readl(dwc, CTL_HI),
+                       channel_readl(dwc, CTL_LO));
+               spin_unlock(&dwc->lock);
+               return -EBUSY;
+       }
+
+       dma_writel(dw, CLEAR.BLOCK, dwc->mask);
+       dma_writel(dw, CLEAR.ERROR, dwc->mask);
+       dma_writel(dw, CLEAR.XFER, dwc->mask);
+
+       /* setup DMAC channel registers */
+       channel_writel(dwc, LLP, dwc->cdesc->desc[0]->txd.phys);
+       channel_writel(dwc, CTL_LO, DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
+       channel_writel(dwc, CTL_HI, 0);
+
+       channel_set_bit(dw, CH_EN, dwc->mask);
+
+       spin_unlock(&dwc->lock);
+
+       return 0;
+}
+EXPORT_SYMBOL(dw_dma_cyclic_start);
+
+/**
+ * dw_dma_cyclic_stop - stop the cyclic DMA transfer
+ * @chan: the DMA channel to stop
+ *
+ * Must be called with soft interrupts disabled.
+ */
+void dw_dma_cyclic_stop(struct dma_chan *chan)
+{
+       struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
+       struct dw_dma           *dw = to_dw_dma(dwc->chan.device);
+
+       spin_lock(&dwc->lock);
+
+       channel_clear_bit(dw, CH_EN, dwc->mask);
+       while (dma_readl(dw, CH_EN) & dwc->mask)
+               cpu_relax();
+
+       spin_unlock(&dwc->lock);
+}
+EXPORT_SYMBOL(dw_dma_cyclic_stop);
+
+/**
+ * dw_dma_cyclic_prep - prepare the cyclic DMA transfer
+ * @chan: the DMA channel to prepare
+ * @buf_addr: physical DMA address where the buffer starts
+ * @buf_len: total number of bytes for the entire buffer
+ * @period_len: number of bytes for each period
+ * @direction: transfer direction, to or from device
+ *
+ * Must be called before trying to start the transfer. Returns a valid struct
+ * dw_cyclic_desc if successful or an ERR_PTR(-errno) if not successful.
+ */
+struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
+               dma_addr_t buf_addr, size_t buf_len, size_t period_len,
+               enum dma_data_direction direction)
+{
+       struct dw_dma_chan              *dwc = to_dw_dma_chan(chan);
+       struct dw_cyclic_desc           *cdesc;
+       struct dw_cyclic_desc           *retval = NULL;
+       struct dw_desc                  *desc;
+       struct dw_desc                  *last = NULL;
+       struct dw_dma_slave             *dws = chan->private;
+       unsigned long                   was_cyclic;
+       unsigned int                    reg_width;
+       unsigned int                    periods;
+       unsigned int                    i;
+
+       spin_lock_bh(&dwc->lock);
+       if (!list_empty(&dwc->queue) || !list_empty(&dwc->active_list)) {
+               spin_unlock_bh(&dwc->lock);
+               dev_dbg(chan2dev(&dwc->chan),
+                               "queue and/or active list are not empty\n");
+               return ERR_PTR(-EBUSY);
+       }
+
+       was_cyclic = test_and_set_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
+       spin_unlock_bh(&dwc->lock);
+       if (was_cyclic) {
+               dev_dbg(chan2dev(&dwc->chan),
+                               "channel already prepared for cyclic DMA\n");
+               return ERR_PTR(-EBUSY);
+       }
+
+       retval = ERR_PTR(-EINVAL);
+       reg_width = dws->reg_width;
+       periods = buf_len / period_len;
+
+       /* Check for too big/unaligned periods and unaligned DMA buffer. */
+       if (period_len > (DWC_MAX_COUNT << reg_width))
+               goto out_err;
+       if (unlikely(period_len & ((1 << reg_width) - 1)))
+               goto out_err;
+       if (unlikely(buf_addr & ((1 << reg_width) - 1)))
+               goto out_err;
+       if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE))))
+               goto out_err;
+
+       retval = ERR_PTR(-ENOMEM);
+
+       if (periods > NR_DESCS_PER_CHANNEL)
+               goto out_err;
+
+       cdesc = kzalloc(sizeof(struct dw_cyclic_desc), GFP_KERNEL);
+       if (!cdesc)
+               goto out_err;
+
+       cdesc->desc = kzalloc(sizeof(struct dw_desc *) * periods, GFP_KERNEL);
+       if (!cdesc->desc)
+               goto out_err_alloc;
+
+       for (i = 0; i < periods; i++) {
+               desc = dwc_desc_get(dwc);
+               if (!desc)
+                       goto out_err_desc_get;
+
+               switch (direction) {
+               case DMA_TO_DEVICE:
+                       desc->lli.dar = dws->tx_reg;
+                       desc->lli.sar = buf_addr + (period_len * i);
+                       desc->lli.ctllo = (DWC_DEFAULT_CTLLO
+                                       | DWC_CTLL_DST_WIDTH(reg_width)
+                                       | DWC_CTLL_SRC_WIDTH(reg_width)
+                                       | DWC_CTLL_DST_FIX
+                                       | DWC_CTLL_SRC_INC
+                                       | DWC_CTLL_FC_M2P
+                                       | DWC_CTLL_INT_EN);
+                       break;
+               case DMA_FROM_DEVICE:
+                       desc->lli.dar = buf_addr + (period_len * i);
+                       desc->lli.sar = dws->rx_reg;
+                       desc->lli.ctllo = (DWC_DEFAULT_CTLLO
+                                       | DWC_CTLL_SRC_WIDTH(reg_width)
+                                       | DWC_CTLL_DST_WIDTH(reg_width)
+                                       | DWC_CTLL_DST_INC
+                                       | DWC_CTLL_SRC_FIX
+                                       | DWC_CTLL_FC_P2M
+                                       | DWC_CTLL_INT_EN);
+                       break;
+               default:
+                       break;
+               }
+
+               desc->lli.ctlhi = (period_len >> reg_width);
+               cdesc->desc[i] = desc;
+
+               if (last) {
+                       last->lli.llp = desc->txd.phys;
+                       dma_sync_single_for_device(chan2parent(chan),
+                                       last->txd.phys, sizeof(last->lli),
+                                       DMA_TO_DEVICE);
+               }
+
+               last = desc;
+       }
+
+       /* let's make a cyclic list */
+       last->lli.llp = cdesc->desc[0]->txd.phys;
+       dma_sync_single_for_device(chan2parent(chan), last->txd.phys,
+                       sizeof(last->lli), DMA_TO_DEVICE);
+
+       dev_dbg(chan2dev(&dwc->chan), "cyclic prepared buf 0x%08x len %zu "
+                       "period %zu periods %d\n", buf_addr, buf_len,
+                       period_len, periods);
+
+       cdesc->periods = periods;
+       dwc->cdesc = cdesc;
+
+       return cdesc;
+
+out_err_desc_get:
+       while (i--)
+               dwc_desc_put(dwc, cdesc->desc[i]);
+out_err_alloc:
+       kfree(cdesc);
+out_err:
+       clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
+       return (struct dw_cyclic_desc *)retval;
+}
+EXPORT_SYMBOL(dw_dma_cyclic_prep);
+
+/**
+ * dw_dma_cyclic_free - free a prepared cyclic DMA transfer
+ * @chan: the DMA channel to free
+ */
+void dw_dma_cyclic_free(struct dma_chan *chan)
+{
+       struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
+       struct dw_dma           *dw = to_dw_dma(dwc->chan.device);
+       struct dw_cyclic_desc   *cdesc = dwc->cdesc;
+       int                     i;
+
+       dev_dbg(chan2dev(&dwc->chan), "cyclic free\n");
+
+       if (!cdesc)
+               return;
+
+       spin_lock_bh(&dwc->lock);
+
+       channel_clear_bit(dw, CH_EN, dwc->mask);
+       while (dma_readl(dw, CH_EN) & dwc->mask)
+               cpu_relax();
+
+       dma_writel(dw, CLEAR.BLOCK, dwc->mask);
+       dma_writel(dw, CLEAR.ERROR, dwc->mask);
+       dma_writel(dw, CLEAR.XFER, dwc->mask);
+
+       spin_unlock_bh(&dwc->lock);
+
+       for (i = 0; i < cdesc->periods; i++)
+               dwc_desc_put(dwc, cdesc->desc[i]);
+
+       kfree(cdesc->desc);
+       kfree(cdesc);
+
+       clear_bit(DW_DMA_IS_CYCLIC, &dwc->flags);
+}
+EXPORT_SYMBOL(dw_dma_cyclic_free);
+
 /*----------------------------------------------------------------------*/
 
 static void dw_dma_off(struct dw_dma *dw)
index b252b202c5cf1888f65149d3e22deee3d9cfeef0..13a580767031a3aac04046ffd563ef1d83f053b0 100644 (file)
@@ -126,6 +126,10 @@ struct dw_dma_regs {
 
 #define DW_REGLEN              0x400
 
+enum dw_dmac_flags {
+       DW_DMA_IS_CYCLIC = 0,
+};
+
 struct dw_dma_chan {
        struct dma_chan         chan;
        void __iomem            *ch_regs;
@@ -134,10 +138,12 @@ struct dw_dma_chan {
        spinlock_t              lock;
 
        /* these other elements are all protected by lock */
+       unsigned long           flags;
        dma_cookie_t            completed;
        struct list_head        active_list;
        struct list_head        queue;
        struct list_head        free_list;
+       struct dw_cyclic_desc   *cdesc;
 
        unsigned int            descs_allocated;
 };
@@ -158,7 +164,6 @@ static inline struct dw_dma_chan *to_dw_dma_chan(struct dma_chan *chan)
        return container_of(chan, struct dw_dma_chan, chan);
 }
 
-
 struct dw_dma {
        struct dma_device       dma;
        void __iomem            *regs;
index 86d6da47f558765736149344b2c595b13a289f3f..da8a8ed9e411008d68e14820444695176caa9318 100644 (file)
@@ -354,7 +354,6 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(
                dma_async_tx_descriptor_init(&desc_sw->async_tx,
                                                &fsl_chan->common);
                desc_sw->async_tx.tx_submit = fsl_dma_tx_submit;
-               INIT_LIST_HEAD(&desc_sw->async_tx.tx_list);
                desc_sw->async_tx.phys = pdesc;
        }
 
index 5905cd36bcd23b43b86dcc532a1a70cc5ce5ce58..e4fc33c1c32f89fc99711ca2c759f15e79eeaa62 100644 (file)
@@ -693,7 +693,6 @@ static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
                desc_sw->async_tx.tx_submit = ioat2_tx_submit;
                break;
        }
-       INIT_LIST_HEAD(&desc_sw->async_tx.tx_list);
 
        desc_sw->hw = desc;
        desc_sw->async_tx.phys = phys;
index 16adbe61cfb2cd60efd1a01246be3f9c9f4eac43..2f052265122f62e2681bbdb0dfbbd558cef24713 100644 (file)
@@ -498,7 +498,6 @@ static int iop_adma_alloc_chan_resources(struct dma_chan *chan)
                slot->async_tx.tx_submit = iop_adma_tx_submit;
                INIT_LIST_HEAD(&slot->chain_node);
                INIT_LIST_HEAD(&slot->slot_node);
-               INIT_LIST_HEAD(&slot->async_tx.tx_list);
                hw_desc = (char *) iop_chan->device->dma_desc_pool;
                slot->async_tx.phys =
                        (dma_addr_t) &hw_desc[idx * IOP_ADMA_SLOT_SIZE];
index da781d1078951f9164dcf5c1da1706f376b1a21d..e202a6ce55735bf3d6fdb456f718fade1301ba32 100644 (file)
@@ -28,6 +28,9 @@
 #define FS_VF_IN_VALID 0x00000002
 #define FS_ENC_IN_VALID        0x00000001
 
+static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan,
+                              bool wait_for_stop);
+
 /*
  * There can be only one, we could allocate it dynamically, but then we'd have
  * to add an extra parameter to some functions, and use something as ugly as
@@ -107,7 +110,7 @@ static uint32_t bytes_per_pixel(enum pixel_fmt fmt)
        }
 }
 
-/* Enable / disable direct write to memory by the Camera Sensor Interface */
+/* Enable direct write to memory by the Camera Sensor Interface */
 static void ipu_ic_enable_task(struct ipu *ipu, enum ipu_channel channel)
 {
        uint32_t ic_conf, mask;
@@ -126,6 +129,7 @@ static void ipu_ic_enable_task(struct ipu *ipu, enum ipu_channel channel)
        idmac_write_icreg(ipu, ic_conf, IC_CONF);
 }
 
+/* Called under spin_lock_irqsave(&ipu_data.lock) */
 static void ipu_ic_disable_task(struct ipu *ipu, enum ipu_channel channel)
 {
        uint32_t ic_conf, mask;
@@ -422,7 +426,7 @@ static void ipu_ch_param_set_size(union chan_param_mem *params,
                break;
        default:
                dev_err(ipu_data.dev,
-                       "mxc ipu: unimplemented pixel format %d\n", pixel_fmt);
+                       "mx3 ipu: unimplemented pixel format %d\n", pixel_fmt);
                break;
        }
 
@@ -433,20 +437,20 @@ static void ipu_ch_param_set_burst_size(union chan_param_mem *params,
                                        uint16_t burst_pixels)
 {
        params->pp.npb = burst_pixels - 1;
-};
+}
 
 static void ipu_ch_param_set_buffer(union chan_param_mem *params,
                                    dma_addr_t buf0, dma_addr_t buf1)
 {
        params->pp.eba0 = buf0;
        params->pp.eba1 = buf1;
-};
+}
 
 static void ipu_ch_param_set_rotation(union chan_param_mem *params,
                                      enum ipu_rotate_mode rotate)
 {
        params->pp.bam = rotate;
-};
+}
 
 static void ipu_write_param_mem(uint32_t addr, uint32_t *data,
                                uint32_t num_words)
@@ -571,7 +575,7 @@ static uint32_t dma_param_addr(uint32_t dma_ch)
 {
        /* Channel Parameter Memory */
        return 0x10000 | (dma_ch << 4);
-};
+}
 
 static void ipu_channel_set_priority(struct ipu *ipu, enum ipu_channel channel,
                                     bool prio)
@@ -611,7 +615,8 @@ static uint32_t ipu_channel_conf_mask(enum ipu_channel channel)
 
 /**
  * ipu_enable_channel() - enable an IPU channel.
- * @channel:   channel ID.
+ * @idmac:     IPU DMAC context.
+ * @ichan:     IDMAC channel.
  * @return:    0 on success or negative error code on failure.
  */
 static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan)
@@ -649,7 +654,7 @@ static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan)
 
 /**
  * ipu_init_channel_buffer() - initialize a buffer for logical IPU channel.
- * @channel:   channel ID.
+ * @ichan:     IDMAC channel.
  * @pixel_fmt: pixel format of buffer. Pixel format is a FOURCC ASCII code.
  * @width:     width of buffer in pixels.
  * @height:    height of buffer in pixels.
@@ -687,7 +692,7 @@ static int ipu_init_channel_buffer(struct idmac_channel *ichan,
        }
 
        /* IC channel's stride must be a multiple of 8 pixels */
-       if ((channel <= 13) && (stride % 8)) {
+       if ((channel <= IDMAC_IC_13) && (stride % 8)) {
                dev_err(ipu->dev, "Stride must be 8 pixel multiple\n");
                return -EINVAL;
        }
@@ -752,7 +757,7 @@ static void ipu_select_buffer(enum ipu_channel channel, int buffer_n)
 
 /**
  * ipu_update_channel_buffer() - update physical address of a channel buffer.
- * @channel:   channel ID.
+ * @ichan:     IDMAC channel.
  * @buffer_n:  buffer number to update.
  *             0 or 1 are the only valid values.
  * @phyaddr:   buffer physical address.
@@ -760,9 +765,10 @@ static void ipu_select_buffer(enum ipu_channel channel, int buffer_n)
  *              function will fail if the buffer is set to ready.
  */
 /* Called under spin_lock(_irqsave)(&ichan->lock) */
-static int ipu_update_channel_buffer(enum ipu_channel channel,
+static int ipu_update_channel_buffer(struct idmac_channel *ichan,
                                     int buffer_n, dma_addr_t phyaddr)
 {
+       enum ipu_channel channel = ichan->dma_chan.chan_id;
        uint32_t reg;
        unsigned long flags;
 
@@ -771,8 +777,8 @@ static int ipu_update_channel_buffer(enum ipu_channel channel,
        if (buffer_n == 0) {
                reg = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY);
                if (reg & (1UL << channel)) {
-                       spin_unlock_irqrestore(&ipu_data.lock, flags);
-                       return -EACCES;
+                       ipu_ic_disable_task(&ipu_data, channel);
+                       ichan->status = IPU_CHANNEL_READY;
                }
 
                /* 44.3.3.1.9 - Row Number 1 (WORD1, offset 0) */
@@ -782,8 +788,8 @@ static int ipu_update_channel_buffer(enum ipu_channel channel,
        } else {
                reg = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY);
                if (reg & (1UL << channel)) {
-                       spin_unlock_irqrestore(&ipu_data.lock, flags);
-                       return -EACCES;
+                       ipu_ic_disable_task(&ipu_data, channel);
+                       ichan->status = IPU_CHANNEL_READY;
                }
 
                /* Check if double-buffering is already enabled */
@@ -804,6 +810,39 @@ static int ipu_update_channel_buffer(enum ipu_channel channel,
        return 0;
 }
 
+/* Called under spin_lock_irqsave(&ichan->lock) */
+static int ipu_submit_buffer(struct idmac_channel *ichan,
+       struct idmac_tx_desc *desc, struct scatterlist *sg, int buf_idx)
+{
+       unsigned int chan_id = ichan->dma_chan.chan_id;
+       struct device *dev = &ichan->dma_chan.dev->device;
+       int ret;
+
+       if (async_tx_test_ack(&desc->txd))
+               return -EINTR;
+
+       /*
+        * On first invocation this shouldn't be necessary, the call to
+        * ipu_init_channel_buffer() above will set addresses for us, so we
+        * could make it conditional on status >= IPU_CHANNEL_ENABLED, but
+        * doing it again shouldn't hurt either.
+        */
+       ret = ipu_update_channel_buffer(ichan, buf_idx,
+                                       sg_dma_address(sg));
+
+       if (ret < 0) {
+               dev_err(dev, "Updating sg %p on channel 0x%x buffer %d failed!\n",
+                       sg, chan_id, buf_idx);
+               return ret;
+       }
+
+       ipu_select_buffer(chan_id, buf_idx);
+       dev_dbg(dev, "Updated sg %p on channel 0x%x buffer %d\n",
+               sg, chan_id, buf_idx);
+
+       return 0;
+}
+
 /* Called under spin_lock_irqsave(&ichan->lock) */
 static int ipu_submit_channel_buffers(struct idmac_channel *ichan,
                                      struct idmac_tx_desc *desc)
@@ -815,20 +854,10 @@ static int ipu_submit_channel_buffers(struct idmac_channel *ichan,
                if (!ichan->sg[i]) {
                        ichan->sg[i] = sg;
 
-                       /*
-                        * On first invocation this shouldn't be necessary, the
-                        * call to ipu_init_channel_buffer() above will set
-                        * addresses for us, so we could make it conditional
-                        * on status >= IPU_CHANNEL_ENABLED, but doing it again
-                        * shouldn't hurt either.
-                        */
-                       ret = ipu_update_channel_buffer(ichan->dma_chan.chan_id, i,
-                                                       sg_dma_address(sg));
+                       ret = ipu_submit_buffer(ichan, desc, sg, i);
                        if (ret < 0)
                                return ret;
 
-                       ipu_select_buffer(ichan->dma_chan.chan_id, i);
-
                        sg = sg_next(sg);
                }
        }
@@ -842,19 +871,22 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx)
        struct idmac_channel *ichan = to_idmac_chan(tx->chan);
        struct idmac *idmac = to_idmac(tx->chan->device);
        struct ipu *ipu = to_ipu(idmac);
+       struct device *dev = &ichan->dma_chan.dev->device;
        dma_cookie_t cookie;
        unsigned long flags;
+       int ret;
 
        /* Sanity check */
        if (!list_empty(&desc->list)) {
                /* The descriptor doesn't belong to client */
-               dev_err(&ichan->dma_chan.dev->device,
-                       "Descriptor %p not prepared!\n", tx);
+               dev_err(dev, "Descriptor %p not prepared!\n", tx);
                return -EBUSY;
        }
 
        mutex_lock(&ichan->chan_mutex);
 
+       async_tx_clear_ack(tx);
+
        if (ichan->status < IPU_CHANNEL_READY) {
                struct idmac_video_param *video = &ichan->params.video;
                /*
@@ -878,16 +910,7 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx)
                        goto out;
        }
 
-       /* ipu->lock can be taken under ichan->lock, but not v.v. */
-       spin_lock_irqsave(&ichan->lock, flags);
-
-       /* submit_buffers() atomically verifies and fills empty sg slots */
-       cookie = ipu_submit_channel_buffers(ichan, desc);
-
-       spin_unlock_irqrestore(&ichan->lock, flags);
-
-       if (cookie < 0)
-               goto out;
+       dev_dbg(dev, "Submitting sg %p\n", &desc->sg[0]);
 
        cookie = ichan->dma_chan.cookie;
 
@@ -897,24 +920,40 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx)
        /* from dmaengine.h: "last cookie value returned to client" */
        ichan->dma_chan.cookie = cookie;
        tx->cookie = cookie;
+
+       /* ipu->lock can be taken under ichan->lock, but not v.v. */
        spin_lock_irqsave(&ichan->lock, flags);
+
        list_add_tail(&desc->list, &ichan->queue);
+       /* submit_buffers() atomically verifies and fills empty sg slots */
+       ret = ipu_submit_channel_buffers(ichan, desc);
+
        spin_unlock_irqrestore(&ichan->lock, flags);
 
+       if (ret < 0) {
+               cookie = ret;
+               goto dequeue;
+       }
+
        if (ichan->status < IPU_CHANNEL_ENABLED) {
-               int ret = ipu_enable_channel(idmac, ichan);
+               ret = ipu_enable_channel(idmac, ichan);
                if (ret < 0) {
                        cookie = ret;
-                       spin_lock_irqsave(&ichan->lock, flags);
-                       list_del_init(&desc->list);
-                       spin_unlock_irqrestore(&ichan->lock, flags);
-                       tx->cookie = cookie;
-                       ichan->dma_chan.cookie = cookie;
+                       goto dequeue;
                }
        }
 
        dump_idmac_reg(ipu);
 
+dequeue:
+       if (cookie < 0) {
+               spin_lock_irqsave(&ichan->lock, flags);
+               list_del_init(&desc->list);
+               spin_unlock_irqrestore(&ichan->lock, flags);
+               tx->cookie = cookie;
+               ichan->dma_chan.cookie = cookie;
+       }
+
 out:
        mutex_unlock(&ichan->chan_mutex);
 
@@ -944,8 +983,6 @@ static int idmac_desc_alloc(struct idmac_channel *ichan, int n)
                memset(txd, 0, sizeof(*txd));
                dma_async_tx_descriptor_init(txd, &ichan->dma_chan);
                txd->tx_submit          = idmac_tx_submit;
-               txd->chan               = &ichan->dma_chan;
-               INIT_LIST_HEAD(&txd->tx_list);
 
                list_add(&desc->list, &ichan->free_list);
 
@@ -1161,6 +1198,24 @@ static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan,
        return 0;
 }
 
+static struct scatterlist *idmac_sg_next(struct idmac_channel *ichan,
+       struct idmac_tx_desc **desc, struct scatterlist *sg)
+{
+       struct scatterlist *sgnew = sg ? sg_next(sg) : NULL;
+
+       if (sgnew)
+               /* next sg-element in this list */
+               return sgnew;
+
+       if ((*desc)->list.next == &ichan->queue)
+               /* No more descriptors on the queue */
+               return NULL;
+
+       /* Fetch next descriptor */
+       *desc = list_entry((*desc)->list.next, struct idmac_tx_desc, list);
+       return (*desc)->sg;
+}
+
 /*
  * We have several possibilities here:
  * current BUF         next BUF
@@ -1176,23 +1231,46 @@ static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan,
 static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 {
        struct idmac_channel *ichan = dev_id;
+       struct device *dev = &ichan->dma_chan.dev->device;
        unsigned int chan_id = ichan->dma_chan.chan_id;
        struct scatterlist **sg, *sgnext, *sgnew = NULL;
        /* Next transfer descriptor */
-       struct idmac_tx_desc *desc = NULL, *descnew;
+       struct idmac_tx_desc *desc, *descnew;
        dma_async_tx_callback callback;
        void *callback_param;
        bool done = false;
-       u32     ready0 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY),
-               ready1 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY),
-               curbuf = idmac_read_ipureg(&ipu_data, IPU_CHA_CUR_BUF);
+       u32 ready0, ready1, curbuf, err;
+       unsigned long flags;
 
        /* IDMAC has cleared the respective BUFx_RDY bit, we manage the buffer */
 
-       pr_debug("IDMAC irq %d\n", irq);
+       dev_dbg(dev, "IDMAC irq %d, buf %d\n", irq, ichan->active_buffer);
+
+       spin_lock_irqsave(&ipu_data.lock, flags);
+
+       ready0  = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY);
+       ready1  = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY);
+       curbuf  = idmac_read_ipureg(&ipu_data, IPU_CHA_CUR_BUF);
+       err     = idmac_read_ipureg(&ipu_data, IPU_INT_STAT_4);
+
+       if (err & (1 << chan_id)) {
+               idmac_write_ipureg(&ipu_data, 1 << chan_id, IPU_INT_STAT_4);
+               spin_unlock_irqrestore(&ipu_data.lock, flags);
+               /*
+                * Doing this
+                * ichan->sg[0] = ichan->sg[1] = NULL;
+                * you can force channel re-enable on the next tx_submit(), but
+                * this is dirty - think about descriptors with multiple
+                * sg elements.
+                */
+               dev_warn(dev, "NFB4EOF on channel %d, ready %x, %x, cur %x\n",
+                        chan_id, ready0, ready1, curbuf);
+               return IRQ_HANDLED;
+       }
+       spin_unlock_irqrestore(&ipu_data.lock, flags);
+
        /* Other interrupts do not interfere with this channel */
        spin_lock(&ichan->lock);
-
        if (unlikely(chan_id != IDMAC_SDC_0 && chan_id != IDMAC_SDC_1 &&
                     ((curbuf >> chan_id) & 1) == ichan->active_buffer)) {
                int i = 100;
@@ -1207,19 +1285,23 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 
                if (!i) {
                        spin_unlock(&ichan->lock);
-                       dev_dbg(ichan->dma_chan.device->dev,
+                       dev_dbg(dev,
                                "IRQ on active buffer on channel %x, active "
                                "%d, ready %x, %x, current %x!\n", chan_id,
                                ichan->active_buffer, ready0, ready1, curbuf);
                        return IRQ_NONE;
-               }
+               } else
+                       dev_dbg(dev,
+                               "Buffer deactivated on channel %x, active "
+                               "%d, ready %x, %x, current %x, rest %d!\n", chan_id,
+                               ichan->active_buffer, ready0, ready1, curbuf, i);
        }
 
        if (unlikely((ichan->active_buffer && (ready1 >> chan_id) & 1) ||
                     (!ichan->active_buffer && (ready0 >> chan_id) & 1)
                     )) {
                spin_unlock(&ichan->lock);
-               dev_dbg(ichan->dma_chan.device->dev,
+               dev_dbg(dev,
                        "IRQ with active buffer still ready on channel %x, "
                        "active %d, ready %x, %x!\n", chan_id,
                        ichan->active_buffer, ready0, ready1);
@@ -1227,8 +1309,9 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
        }
 
        if (unlikely(list_empty(&ichan->queue))) {
+               ichan->sg[ichan->active_buffer] = NULL;
                spin_unlock(&ichan->lock);
-               dev_err(ichan->dma_chan.device->dev,
+               dev_err(dev,
                        "IRQ without queued buffers on channel %x, active %d, "
                        "ready %x, %x!\n", chan_id,
                        ichan->active_buffer, ready0, ready1);
@@ -1243,40 +1326,44 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
        sg = &ichan->sg[ichan->active_buffer];
        sgnext = ichan->sg[!ichan->active_buffer];
 
+       if (!*sg) {
+               spin_unlock(&ichan->lock);
+               return IRQ_HANDLED;
+       }
+
+       desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list);
+       descnew = desc;
+
+       dev_dbg(dev, "IDMAC irq %d, dma 0x%08x, next dma 0x%08x, current %d, curbuf 0x%08x\n",
+               irq, sg_dma_address(*sg), sgnext ? sg_dma_address(sgnext) : 0, ichan->active_buffer, curbuf);
+
+       /* Find the descriptor of sgnext */
+       sgnew = idmac_sg_next(ichan, &descnew, *sg);
+       if (sgnext != sgnew)
+               dev_err(dev, "Submitted buffer %p, next buffer %p\n", sgnext, sgnew);
+
        /*
         * if sgnext == NULL sg must be the last element in a scatterlist and
         * queue must be empty
         */
        if (unlikely(!sgnext)) {
-               if (unlikely(sg_next(*sg))) {
-                       dev_err(ichan->dma_chan.device->dev,
-                               "Broken buffer-update locking on channel %x!\n",
-                               chan_id);
-                       /* We'll let the user catch up */
+               if (!WARN_ON(sg_next(*sg)))
+                       dev_dbg(dev, "Underrun on channel %x\n", chan_id);
+               ichan->sg[!ichan->active_buffer] = sgnew;
+
+               if (unlikely(sgnew)) {
+                       ipu_submit_buffer(ichan, descnew, sgnew, !ichan->active_buffer);
                } else {
-                       /* Underrun */
+                       spin_lock_irqsave(&ipu_data.lock, flags);
                        ipu_ic_disable_task(&ipu_data, chan_id);
-                       dev_dbg(ichan->dma_chan.device->dev,
-                               "Underrun on channel %x\n", chan_id);
+                       spin_unlock_irqrestore(&ipu_data.lock, flags);
                        ichan->status = IPU_CHANNEL_READY;
                        /* Continue to check for complete descriptor */
                }
        }
 
-       desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list);
-
-       /* First calculate and submit the next sg element */
-       if (likely(sgnext))
-               sgnew = sg_next(sgnext);
-
-       if (unlikely(!sgnew)) {
-               /* Start a new scatterlist, if any queued */
-               if (likely(desc->list.next != &ichan->queue)) {
-                       descnew = list_entry(desc->list.next,
-                                            struct idmac_tx_desc, list);
-                       sgnew = &descnew->sg[0];
-               }
-       }
+       /* Calculate and submit the next sg element */
+       sgnew = idmac_sg_next(ichan, &descnew, sgnew);
 
        if (unlikely(!sg_next(*sg)) || !sgnext) {
                /*
@@ -1289,17 +1376,13 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 
        *sg = sgnew;
 
-       if (likely(sgnew)) {
-               int ret;
-
-               ret = ipu_update_channel_buffer(chan_id, ichan->active_buffer,
-                                               sg_dma_address(*sg));
-               if (ret < 0)
-                       dev_err(ichan->dma_chan.device->dev,
-                               "Failed to update buffer on channel %x buffer %d!\n",
-                               chan_id, ichan->active_buffer);
-               else
-                       ipu_select_buffer(chan_id, ichan->active_buffer);
+       if (likely(sgnew) &&
+           ipu_submit_buffer(ichan, descnew, sgnew, ichan->active_buffer) < 0) {
+               callback = desc->txd.callback;
+               callback_param = desc->txd.callback_param;
+               spin_unlock(&ichan->lock);
+               callback(callback_param);
+               spin_lock(&ichan->lock);
        }
 
        /* Flip the active buffer - even if update above failed */
@@ -1327,13 +1410,20 @@ static void ipu_gc_tasklet(unsigned long arg)
                struct idmac_channel *ichan = ipu->channel + i;
                struct idmac_tx_desc *desc;
                unsigned long flags;
-               int j;
+               struct scatterlist *sg;
+               int j, k;
 
                for (j = 0; j < ichan->n_tx_desc; j++) {
                        desc = ichan->desc + j;
                        spin_lock_irqsave(&ichan->lock, flags);
                        if (async_tx_test_ack(&desc->txd)) {
                                list_move(&desc->list, &ichan->free_list);
+                               for_each_sg(desc->sg, sg, desc->sg_len, k) {
+                                       if (ichan->sg[0] == sg)
+                                               ichan->sg[0] = NULL;
+                                       else if (ichan->sg[1] == sg)
+                                               ichan->sg[1] = NULL;
+                               }
                                async_tx_clear_ack(&desc->txd);
                        }
                        spin_unlock_irqrestore(&ichan->lock, flags);
@@ -1341,13 +1431,7 @@ static void ipu_gc_tasklet(unsigned long arg)
        }
 }
 
-/*
- * At the time .device_alloc_chan_resources() method is called, we cannot know,
- * whether the client will accept the channel. Thus we must only check, if we
- * can satisfy client's request but the only real criterion to verify, whether
- * the client has accepted our offer is the client_count. That's why we have to
- * perform the rest of our allocation tasks on the first call to this function.
- */
+/* Allocate and initialise a transfer descriptor. */
 static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan,
                struct scatterlist *sgl, unsigned int sg_len,
                enum dma_data_direction direction, unsigned long tx_flags)
@@ -1358,8 +1442,8 @@ static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan
        unsigned long flags;
 
        /* We only can handle these three channels so far */
-       if (ichan->dma_chan.chan_id != IDMAC_SDC_0 && ichan->dma_chan.chan_id != IDMAC_SDC_1 &&
-           ichan->dma_chan.chan_id != IDMAC_IC_7)
+       if (chan->chan_id != IDMAC_SDC_0 && chan->chan_id != IDMAC_SDC_1 &&
+           chan->chan_id != IDMAC_IC_7)
                return NULL;
 
        if (direction != DMA_FROM_DEVICE && direction != DMA_TO_DEVICE) {
@@ -1400,7 +1484,7 @@ static void idmac_issue_pending(struct dma_chan *chan)
 
        /* This is not always needed, but doesn't hurt either */
        spin_lock_irqsave(&ipu->lock, flags);
-       ipu_select_buffer(ichan->dma_chan.chan_id, ichan->active_buffer);
+       ipu_select_buffer(chan->chan_id, ichan->active_buffer);
        spin_unlock_irqrestore(&ipu->lock, flags);
 
        /*
@@ -1432,8 +1516,7 @@ static void __idmac_terminate_all(struct dma_chan *chan)
                        struct idmac_tx_desc *desc = ichan->desc + i;
                        if (list_empty(&desc->list))
                                /* Descriptor was prepared, but not submitted */
-                               list_add(&desc->list,
-                                        &ichan->free_list);
+                               list_add(&desc->list, &ichan->free_list);
 
                        async_tx_clear_ack(&desc->txd);
                }
@@ -1458,6 +1541,28 @@ static void idmac_terminate_all(struct dma_chan *chan)
        mutex_unlock(&ichan->chan_mutex);
 }
 
+#ifdef DEBUG
+static irqreturn_t ic_sof_irq(int irq, void *dev_id)
+{
+       struct idmac_channel *ichan = dev_id;
+       printk(KERN_DEBUG "Got SOF IRQ %d on Channel %d\n",
+              irq, ichan->dma_chan.chan_id);
+       disable_irq(irq);
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t ic_eof_irq(int irq, void *dev_id)
+{
+       struct idmac_channel *ichan = dev_id;
+       printk(KERN_DEBUG "Got EOF IRQ %d on Channel %d\n",
+              irq, ichan->dma_chan.chan_id);
+       disable_irq(irq);
+       return IRQ_HANDLED;
+}
+
+static int ic_sof = -EINVAL, ic_eof = -EINVAL;
+#endif
+
 static int idmac_alloc_chan_resources(struct dma_chan *chan)
 {
        struct idmac_channel *ichan = to_idmac_chan(chan);
@@ -1471,31 +1576,49 @@ static int idmac_alloc_chan_resources(struct dma_chan *chan)
        chan->cookie            = 1;
        ichan->completed        = -ENXIO;
 
-       ret = ipu_irq_map(ichan->dma_chan.chan_id);
+       ret = ipu_irq_map(chan->chan_id);
        if (ret < 0)
                goto eimap;
 
        ichan->eof_irq = ret;
+
+       /*
+        * Important to first disable the channel, because maybe someone
+        * used it before us, e.g., the bootloader
+        */
+       ipu_disable_channel(idmac, ichan, true);
+
+       ret = ipu_init_channel(idmac, ichan);
+       if (ret < 0)
+               goto eichan;
+
        ret = request_irq(ichan->eof_irq, idmac_interrupt, 0,
                          ichan->eof_name, ichan);
        if (ret < 0)
                goto erirq;
 
-       ret = ipu_init_channel(idmac, ichan);
-       if (ret < 0)
-               goto eichan;
+#ifdef DEBUG
+       if (chan->chan_id == IDMAC_IC_7) {
+               ic_sof = ipu_irq_map(69);
+               if (ic_sof > 0)
+                       request_irq(ic_sof, ic_sof_irq, 0, "IC SOF", ichan);
+               ic_eof = ipu_irq_map(70);
+               if (ic_eof > 0)
+                       request_irq(ic_eof, ic_eof_irq, 0, "IC EOF", ichan);
+       }
+#endif
 
        ichan->status = IPU_CHANNEL_INITIALIZED;
 
-       dev_dbg(&ichan->dma_chan.dev->device, "Found channel 0x%x, irq %d\n",
-               ichan->dma_chan.chan_id, ichan->eof_irq);
+       dev_dbg(&chan->dev->device, "Found channel 0x%x, irq %d\n",
+               chan->chan_id, ichan->eof_irq);
 
        return ret;
 
-eichan:
-       free_irq(ichan->eof_irq, ichan);
 erirq:
-       ipu_irq_unmap(ichan->dma_chan.chan_id);
+       ipu_uninit_channel(idmac, ichan);
+eichan:
+       ipu_irq_unmap(chan->chan_id);
 eimap:
        return ret;
 }
@@ -1510,8 +1633,22 @@ static void idmac_free_chan_resources(struct dma_chan *chan)
        __idmac_terminate_all(chan);
 
        if (ichan->status > IPU_CHANNEL_FREE) {
+#ifdef DEBUG
+               if (chan->chan_id == IDMAC_IC_7) {
+                       if (ic_sof > 0) {
+                               free_irq(ic_sof, ichan);
+                               ipu_irq_unmap(69);
+                               ic_sof = -EINVAL;
+                       }
+                       if (ic_eof > 0) {
+                               free_irq(ic_eof, ichan);
+                               ipu_irq_unmap(70);
+                               ic_eof = -EINVAL;
+                       }
+               }
+#endif
                free_irq(ichan->eof_irq, ichan);
-               ipu_irq_unmap(ichan->dma_chan.chan_id);
+               ipu_irq_unmap(chan->chan_id);
        }
 
        ichan->status = IPU_CHANNEL_FREE;
@@ -1573,7 +1710,7 @@ static int __init ipu_idmac_init(struct ipu *ipu)
                dma_chan->device        = &idmac->dma;
                dma_chan->cookie        = 1;
                dma_chan->chan_id       = i;
-               list_add_tail(&ichan->dma_chan.device_node, &dma->channels);
+               list_add_tail(&dma_chan->device_node, &dma->channels);
        }
 
        idmac_write_icreg(ipu, 0x00000070, IDMAC_CONF);
@@ -1581,7 +1718,7 @@ static int __init ipu_idmac_init(struct ipu *ipu)
        return dma_async_device_register(&idmac->dma);
 }
 
-static void ipu_idmac_exit(struct ipu *ipu)
+static void __exit ipu_idmac_exit(struct ipu *ipu)
 {
        int i;
        struct idmac *idmac = &ipu->idmac;
@@ -1600,7 +1737,7 @@ static void ipu_idmac_exit(struct ipu *ipu)
  * IPU common probe / remove
  */
 
-static int ipu_probe(struct platform_device *pdev)
+static int __init ipu_probe(struct platform_device *pdev)
 {
        struct ipu_platform_data *pdata = pdev->dev.platform_data;
        struct resource *mem_ipu, *mem_ic;
@@ -1700,7 +1837,7 @@ err_noirq:
        return ret;
 }
 
-static int ipu_remove(struct platform_device *pdev)
+static int __exit ipu_remove(struct platform_device *pdev)
 {
        struct ipu *ipu = platform_get_drvdata(pdev);
 
@@ -1725,7 +1862,7 @@ static struct platform_driver ipu_platform_driver = {
                .name   = "ipu-core",
                .owner  = THIS_MODULE,
        },
-       .remove         = ipu_remove,
+       .remove         = __exit_p(ipu_remove),
 };
 
 static int __init ipu_init(void)
index 83f532cc767f7db111ed6f6e3cbb0640ebbb6871..dd8ebc75b667ad0089a1152fe7dd416a8ad786f6 100644 (file)
@@ -352,7 +352,7 @@ static struct irq_chip ipu_irq_chip = {
 };
 
 /* Install the IRQ handler */
-int ipu_irq_attach_irq(struct ipu *ipu, struct platform_device *dev)
+int __init ipu_irq_attach_irq(struct ipu *ipu, struct platform_device *dev)
 {
        struct ipu_platform_data *pdata = dev->dev.platform_data;
        unsigned int irq, irq_base, i;
index cb7f26fb9f188ce594081069bd61c6e99c957930..ddab94f512247d600a1488a44d27a832f8ee9d3c 100644 (file)
@@ -632,7 +632,6 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan)
                slot->async_tx.tx_submit = mv_xor_tx_submit;
                INIT_LIST_HEAD(&slot->chain_node);
                INIT_LIST_HEAD(&slot->slot_node);
-               INIT_LIST_HEAD(&slot->async_tx.tx_list);
                hw_desc = (char *) mv_chan->device->dma_desc_pool;
                slot->async_tx.phys =
                        (dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE];
index 2281b5098e95455cd1a6d60af7354f7728888185..36e0675be9f72fe5793687c85c3ed0d7db67efae 100644 (file)
@@ -121,6 +121,7 @@ config MD_RAID10
 config MD_RAID456
        tristate "RAID-4/RAID-5/RAID-6 mode"
        depends on BLK_DEV_MD
+       select MD_RAID6_PQ
        select ASYNC_MEMCPY
        select ASYNC_XOR
        ---help---
@@ -151,34 +152,8 @@ config MD_RAID456
 
          If unsure, say Y.
 
-config MD_RAID5_RESHAPE
-       bool "Support adding drives to a raid-5 array"
-       depends on MD_RAID456
-       default y
-       ---help---
-         A RAID-5 set can be expanded by adding extra drives. This
-         requires "restriping" the array which means (almost) every
-         block must be written to a different place.
-
-          This option allows such restriping to be done while the array
-         is online.
-
-         You will need mdadm version 2.4.1 or later to use this
-         feature safely.  During the early stage of reshape there is
-         a critical section where live data is being over-written.  A
-         crash during this time needs extra care for recovery.  The
-         newer mdadm takes a copy of the data in the critical section
-         and will restore it, if necessary, after a crash.
-
-         The mdadm usage is e.g.
-              mdadm --grow /dev/md1 --raid-disks=6
-         to grow '/dev/md1' to having 6 disks.
-
-         Note: The array can only be expanded, not contracted.
-         There should be enough spares already present to make the new
-         array workable.
-
-         If unsure, say Y.
+config MD_RAID6_PQ
+       tristate
 
 config MD_MULTIPATH
        tristate "Multipath I/O support"
index 72880b7e28d9c464655c471a1d4bb85d1df0fab5..45cc5951d9287030df73363a20659482aa55414d 100644 (file)
@@ -2,20 +2,21 @@
 # Makefile for the kernel software RAID and LVM drivers.
 #
 
-dm-mod-objs    := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
+dm-mod-y       += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
                   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
-dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+dm-multipath-y += dm-path-selector.o dm-mpath.o
+dm-snapshot-y  += dm-snap.o dm-exception-store.o dm-snap-transient.o \
                    dm-snap-persistent.o
-dm-mirror-objs := dm-raid1.o
-md-mod-objs     := md.o bitmap.o
-raid456-objs   := raid5.o raid6algos.o raid6recov.o raid6tables.o \
+dm-mirror-y    += dm-raid1.o
+md-mod-y       += md.o bitmap.o
+raid456-y      += raid5.o
+raid6_pq-y     += raid6algos.o raid6recov.o raid6tables.o \
                   raid6int1.o raid6int2.o raid6int4.o \
                   raid6int8.o raid6int16.o raid6int32.o \
                   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
                   raid6altivec8.o \
                   raid6mmx.o raid6sse1.o raid6sse2.o
-hostprogs-y    := mktables
+hostprogs-y    += mktables
 
 # Note: link order is important.  All raid personalities
 # must come before md.o, as they each initialise
@@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR)               += linear.o
 obj-$(CONFIG_MD_RAID0)         += raid0.o
 obj-$(CONFIG_MD_RAID1)         += raid1.o
 obj-$(CONFIG_MD_RAID10)                += raid10.o
+obj-$(CONFIG_MD_RAID6_PQ)      += raid6_pq.o
 obj-$(CONFIG_MD_RAID456)       += raid456.o
 obj-$(CONFIG_MD_MULTIPATH)     += multipath.o
 obj-$(CONFIG_MD_FAULTY)                += faulty.o
index 719943763391263c7a4ab98ad0201244ce1d3d62..f8a9f7ab2cb8ac71884b87fb512a281db3c86789 100644 (file)
@@ -16,6 +16,7 @@
  * wait if count gets too high, wake when it drops to half.
  */
 
+#include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
@@ -26,8 +27,8 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include "md.h"
+#include "bitmap.h"
 
 /* debug macros */
 
@@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat
        unsigned char *mappage;
 
        if (page >= bitmap->pages) {
-               printk(KERN_ALERT
-                       "%s: invalid bitmap page request: %lu (> %lu)\n",
-                       bmname(bitmap), page, bitmap->pages-1);
+               /* This can happen if bitmap_start_sync goes beyond
+                * end-of-device while looking for a whole page.
+                * It is harmless.
+                */
                return -EINVAL;
        }
 
@@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
        list_for_each_continue_rcu(pos, &mddev->disks) {
                rdev = list_entry(pos, mdk_rdev_t, same_set);
                if (rdev->raid_disk >= 0 &&
-                   test_bit(In_sync, &rdev->flags) &&
                    !test_bit(Faulty, &rdev->flags)) {
                        /* this is a usable device */
                        atomic_inc(&rdev->nr_pending);
@@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
                                    + size/512 > 0)
                                        /* bitmap runs in to metadata */
                                        goto bad_alignment;
-                               if (rdev->data_offset + mddev->size*2
+                               if (rdev->data_offset + mddev->dev_sectors
                                    > rdev->sb_start + bitmap->offset)
                                        /* data runs in to bitmap */
                                        goto bad_alignment;
@@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
        else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
                 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
                reason = "unrecognized superblock version";
-       else if (chunksize < PAGE_SIZE)
+       else if (chunksize < 512)
                reason = "bitmap chunksize too small";
        else if ((1 << ffz(~chunksize)) != chunksize)
                reason = "bitmap chunksize not a power of 2";
@@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
                PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
                  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
        }
+       if (bitmap->mddev->degraded)
+               /* Never clear bits or update events_cleared when degraded */
+               success = 0;
 
        while (sectors) {
                int blocks;
@@ -1345,8 +1349,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
        }
 }
 
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
-                       int degraded)
+static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+                              int degraded)
 {
        bitmap_counter_t *bmc;
        int rv;
@@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
        return rv;
 }
 
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+                     int degraded)
+{
+       /* bitmap_start_sync must always report on multiples of whole
+        * pages, otherwise resync (which is very PAGE_SIZE based) will
+        * get confused.
+        * So call __bitmap_start_sync repeatedly (if needed) until at
+        * least PAGE_SIZE>>9 blocks are covered.
+        * Return the 'or' of the results.
+        */
+       int rv = 0;
+       int blocks1;
+
+       *blocks = 0;
+       while (*blocks < (PAGE_SIZE>>9)) {
+               rv |= __bitmap_start_sync(bitmap, offset,
+                                         &blocks1, degraded);
+               offset += blocks1;
+               *blocks += blocks1;
+       }
+       return rv;
+}
+
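The new wrapper above only ever answers in whole-page units. A minimal user-space sketch of the same accumulation loop (with __bitmap_start_sync() replaced by a made-up stub and a hard-coded 4 KiB page) shows how sub-page answers are OR'd together until PAGE_SIZE>>9 = 8 blocks are covered:

/* User-space sketch only: fake_start_sync() stands in for
 * __bitmap_start_sync() and pretends each call covers 2 blocks,
 * with only the range starting at block 4 needing sync. */
#include <stdio.h>

#define FAKE_PAGE_SIZE 4096UL   /* assume 4 KiB pages */

static int fake_start_sync(unsigned long offset, int *blocks)
{
        *blocks = 2;
        return offset == 4;
}

int main(void)
{
        unsigned long offset = 0;
        int blocks = 0, blocks1, rv = 0;

        while (blocks < (int)(FAKE_PAGE_SIZE >> 9)) {
                rv |= fake_start_sync(offset, &blocks1);
                offset += blocks1;
                blocks += blocks1;
        }
        printf("reported %d blocks, sync needed: %d\n", blocks, rv);
        return 0;
}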
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
 {
        bitmap_counter_t *bmc;
@@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
        wait_event(bitmap->mddev->recovery_wait,
                   atomic_read(&bitmap->mddev->recovery_active) == 0);
 
+       bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+       set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
        sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
        s = 0;
        while (s < sector && s < bitmap->mddev->resync_max_sectors) {
index d4509be0fe67f78ecb91c0a5981d28ab9edd31cc..345098b4ca77ac77400c00e7598f2398bec04a06 100644 (file)
@@ -52,6 +52,16 @@ static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
        bl->tail = bio;
 }
 
+static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio)
+{
+       bio->bi_next = bl->head;
+
+       bl->head = bio;
+
+       if (!bl->tail)
+               bl->tail = bio;
+}
+
 static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
 {
        if (!bl2->head)
index d3ec217847d68c9ca2b6d20ca4e9d2eb9cefc3f3..3a8cfa2645c72f6539170f2ab2d3242bb4a6fa58 100644 (file)
  * functions in this file help the target record and restore the
  * original bio state.
  */
+
+struct dm_bio_vec_details {
+#if PAGE_SIZE < 65536
+       __u16 bv_len;
+       __u16 bv_offset;
+#else
+       unsigned bv_len;
+       unsigned bv_offset;
+#endif
+};
+
 struct dm_bio_details {
        sector_t bi_sector;
        struct block_device *bi_bdev;
        unsigned int bi_size;
        unsigned short bi_idx;
        unsigned long bi_flags;
+       struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
 };
 
 static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
 {
+       unsigned i;
+
        bd->bi_sector = bio->bi_sector;
        bd->bi_bdev = bio->bi_bdev;
        bd->bi_size = bio->bi_size;
        bd->bi_idx = bio->bi_idx;
        bd->bi_flags = bio->bi_flags;
+
+       for (i = 0; i < bio->bi_vcnt; i++) {
+               bd->bi_io_vec[i].bv_len = bio->bi_io_vec[i].bv_len;
+               bd->bi_io_vec[i].bv_offset = bio->bi_io_vec[i].bv_offset;
+       }
 }
 
 static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
 {
+       unsigned i;
+
        bio->bi_sector = bd->bi_sector;
        bio->bi_bdev = bd->bi_bdev;
        bio->bi_size = bd->bi_size;
        bio->bi_idx = bd->bi_idx;
        bio->bi_flags = bd->bi_flags;
+
+       for (i = 0; i < bio->bi_vcnt; i++) {
+               bio->bi_io_vec[i].bv_len = bd->bi_io_vec[i].bv_len;
+               bio->bi_io_vec[i].bv_offset = bd->bi_io_vec[i].bv_offset;
+       }
 }
 
 #endif
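The __u16 fields in dm_bio_vec_details above rely on bv_len/bv_offset never exceeding PAGE_SIZE, so on 4 KiB-page systems each saved vector shrinks from 8 to 4 bytes across the BIO_MAX_PAGES entries recorded per bio. A small stand-alone comparison (user-space, struct names made up for illustration):

/* User-space sketch comparing the two layouts selected by the
 * PAGE_SIZE < 65536 test above; names are illustrative only. */
#include <stdio.h>
#include <stdint.h>

struct vec_details_u16  { uint16_t bv_len; uint16_t bv_offset; };
struct vec_details_uint { unsigned bv_len; unsigned bv_offset; };

int main(void)
{
        printf("u16 layout: %zu bytes/vector, unsigned layout: %zu bytes/vector\n",
               sizeof(struct vec_details_u16),
               sizeof(struct vec_details_uint));
        return 0;
}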
index bfefd079a9557b24c4d9d4f91646c1d5f2661942..53394e863c749db9444ab4ae4780b06a61afb8f1 100644 (file)
@@ -1156,8 +1156,7 @@ bad_ivmode:
        crypto_free_ablkcipher(tfm);
 bad_cipher:
        /* Must zero key material before freeing */
-       memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
-       kfree(cc);
+       kzfree(cc);
        return -EINVAL;
 }
 
@@ -1183,8 +1182,7 @@ static void crypt_dtr(struct dm_target *ti)
        dm_put_device(ti, cc->dev);
 
        /* Must zero key material before freeing */
-       memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
-       kfree(cc);
+       kzfree(cc);
 }
 
 static int crypt_map(struct dm_target *ti, struct bio *bio,
index dccbfb0e010fafc7b3470f06de29132b18124e57..a2e26c24214150cac23e92acf1391a212d60271b 100644 (file)
@@ -7,6 +7,7 @@
 
 #include "dm-exception-store.h"
 
+#include <linux/ctype.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 
 #define DM_MSG_PREFIX "snapshot exception stores"
 
+static LIST_HEAD(_exception_store_types);
+static DEFINE_SPINLOCK(_lock);
+
+static struct dm_exception_store_type *__find_exception_store_type(const char *name)
+{
+       struct dm_exception_store_type *type;
+
+       list_for_each_entry(type, &_exception_store_types, list)
+               if (!strcmp(name, type->name))
+                       return type;
+
+       return NULL;
+}
+
+static struct dm_exception_store_type *_get_exception_store_type(const char *name)
+{
+       struct dm_exception_store_type *type;
+
+       spin_lock(&_lock);
+
+       type = __find_exception_store_type(name);
+
+       if (type && !try_module_get(type->module))
+               type = NULL;
+
+       spin_unlock(&_lock);
+
+       return type;
+}
+
+/*
+ * get_type
+ * @type_name
+ *
+ * Attempt to retrieve the dm_exception_store_type by name.  If not already
+ * available, attempt to load the appropriate module.
+ *
+ * Exstore modules are named "dm-exstore-" followed by the 'type_name'.
+ * Modules may contain multiple types.
+ * This function will first try the module "dm-exstore-<type_name>",
+ * then truncate 'type_name' at the last '-' and try again.
+ *
+ * For example, if type_name was "clustered-shared", it would search
+ * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'.
+ *
+ * 'dm-exception-store-<type_name>' is too long a name, which is why
+ * the files containing exception store implementations are named
+ * 'dm-exstore-<type_name>'.
+ * If you want your module to be autoloaded, you must follow this
+ * naming convention.
+ *
+ * Returns: dm_exception_store_type* on success, NULL on failure
+ */
+static struct dm_exception_store_type *get_type(const char *type_name)
+{
+       char *p, *type_name_dup;
+       struct dm_exception_store_type *type;
+
+       type = _get_exception_store_type(type_name);
+       if (type)
+               return type;
+
+       type_name_dup = kstrdup(type_name, GFP_KERNEL);
+       if (!type_name_dup) {
+               DMERR("No memory left to attempt load for \"%s\"", type_name);
+               return NULL;
+       }
+
+       while (request_module("dm-exstore-%s", type_name_dup) ||
+              !(type = _get_exception_store_type(type_name))) {
+               p = strrchr(type_name_dup, '-');
+               if (!p)
+                       break;
+               p[0] = '\0';
+       }
+
+       if (!type)
+               DMWARN("Module for exstore type \"%s\" not found.", type_name);
+
+       kfree(type_name_dup);
+
+       return type;
+}
+
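The request_module() fallback described in the comment above can be seen in isolation with a user-space sketch of just the name-truncation loop; the type name "clustered-shared" is the comment's own example, and printf() stands in for request_module() and the registry lookup:

/* User-space sketch of get_type()'s module-name fallback only. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char name[] = "clustered-shared";   /* example from the comment */
        char *p;

        do {
                printf("would try module dm-exstore-%s\n", name);
                p = strrchr(name, '-');
                if (p)
                        *p = '\0';
        } while (p);

        return 0;
}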
+static void put_type(struct dm_exception_store_type *type)
+{
+       spin_lock(&_lock);
+       module_put(type->module);
+       spin_unlock(&_lock);
+}
+
+int dm_exception_store_type_register(struct dm_exception_store_type *type)
+{
+       int r = 0;
+
+       spin_lock(&_lock);
+       if (!__find_exception_store_type(type->name))
+               list_add(&type->list, &_exception_store_types);
+       else
+               r = -EEXIST;
+       spin_unlock(&_lock);
+
+       return r;
+}
+EXPORT_SYMBOL(dm_exception_store_type_register);
+
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type)
+{
+       spin_lock(&_lock);
+
+       if (!__find_exception_store_type(type->name)) {
+               spin_unlock(&_lock);
+               return -EINVAL;
+       }
+
+       list_del(&type->list);
+
+       spin_unlock(&_lock);
+
+       return 0;
+}
+EXPORT_SYMBOL(dm_exception_store_type_unregister);
+
+/*
+ * Round a number up to the nearest 'size' boundary.  size must
+ * be a power of 2.
+ */
+static ulong round_up(ulong n, ulong size)
+{
+       size--;
+       return (n + size) & ~size;
+}
+
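round_up() above uses the usual mask trick for power-of-two boundaries: after size--, the low bits form a mask, and adding then clearing that mask rounds n up. A quick stand-alone check with arbitrarily chosen values:

/* Same idiom as round_up() above; size must be a power of 2. */
#include <stdio.h>

static unsigned long round_up_pow2(unsigned long n, unsigned long size)
{
        size--;
        return (n + size) & ~size;
}

int main(void)
{
        /* PAGE_SIZE >> 9 is 8 sectors on a 4 KiB-page system */
        printf("%lu %lu %lu\n",
               round_up_pow2(5, 8),    /* -> 8  */
               round_up_pow2(16, 8),   /* -> 16 */
               round_up_pow2(17, 8));  /* -> 24 */
        return 0;
}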
+static int set_chunk_size(struct dm_exception_store *store,
+                         const char *chunk_size_arg, char **error)
+{
+       unsigned long chunk_size_ulong;
+       char *value;
+
+       chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10);
+       if (*chunk_size_arg == '\0' || *value != '\0') {
+               *error = "Invalid chunk size";
+               return -EINVAL;
+       }
+
+       if (!chunk_size_ulong) {
+               store->chunk_size = store->chunk_mask = store->chunk_shift = 0;
+               return 0;
+       }
+
+       /*
+        * Chunk size must be a multiple of page size.  Silently
+        * round up if it's not.
+        */
+       chunk_size_ulong = round_up(chunk_size_ulong, PAGE_SIZE >> 9);
+
+       /* Check chunk_size is a power of 2 */
+       if (!is_power_of_2(chunk_size_ulong)) {
+               *error = "Chunk size is not a power of 2";
+               return -EINVAL;
+       }
+
+       /* Validate the chunk size against the device block size */
+       if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) {
+               *error = "Chunk size is not a multiple of device blocksize";
+               return -EINVAL;
+       }
+
+       store->chunk_size = chunk_size_ulong;
+       store->chunk_mask = chunk_size_ulong - 1;
+       store->chunk_shift = ffs(chunk_size_ulong) - 1;
+
+       return 0;
+}
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+                             unsigned *args_used,
+                             struct dm_exception_store **store)
+{
+       int r = 0;
+       struct dm_exception_store_type *type;
+       struct dm_exception_store *tmp_store;
+       char persistent;
+
+       if (argc < 3) {
+               ti->error = "Insufficient exception store arguments";
+               return -EINVAL;
+       }
+
+       tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL);
+       if (!tmp_store) {
+               ti->error = "Exception store allocation failed";
+               return -ENOMEM;
+       }
+
+       persistent = toupper(*argv[1]);
+       if (persistent != 'P' && persistent != 'N') {
+               ti->error = "Persistent flag is not P or N";
+               r = -EINVAL;
+               goto bad_type;
+       }
+
+       type = get_type(argv[1]);
+       if (!type) {
+               ti->error = "Exception store type not recognised";
+               r = -EINVAL;
+               goto bad_type;
+       }
+
+       tmp_store->type = type;
+       tmp_store->ti = ti;
+
+       r = dm_get_device(ti, argv[0], 0, 0,
+                         FMODE_READ | FMODE_WRITE, &tmp_store->cow);
+       if (r) {
+               ti->error = "Cannot get COW device";
+               goto bad_cow;
+       }
+
+       r = set_chunk_size(tmp_store, argv[2], &ti->error);
+       if (r)
+               goto bad_ctr;
+
+       r = type->ctr(tmp_store, 0, NULL);
+       if (r) {
+               ti->error = "Exception store type constructor failed";
+               goto bad_ctr;
+       }
+
+       *args_used = 3;
+       *store = tmp_store;
+       return 0;
+
+bad_ctr:
+       dm_put_device(ti, tmp_store->cow);
+bad_cow:
+       put_type(type);
+bad_type:
+       kfree(tmp_store);
+       return r;
+}
+EXPORT_SYMBOL(dm_exception_store_create);
+
+void dm_exception_store_destroy(struct dm_exception_store *store)
+{
+       store->type->dtr(store);
+       dm_put_device(store->ti, store->cow);
+       put_type(store->type);
+       kfree(store);
+}
+EXPORT_SYMBOL(dm_exception_store_destroy);
+
 int dm_exception_store_init(void)
 {
        int r;
index bb9f33d5daa2169127d96fd9ebfcbe41d1cc819b..0a2e6e7f67b3c5b6ac4c4cbd7dc2ca6811eaa30c 100644 (file)
@@ -37,11 +37,18 @@ struct dm_snap_exception {
  * Abstraction to handle the meta/layout of exception stores (the
  * COW device).
  */
-struct dm_exception_store {
+struct dm_exception_store;
+struct dm_exception_store_type {
+       const char *name;
+       struct module *module;
+
+       int (*ctr) (struct dm_exception_store *store,
+                   unsigned argc, char **argv);
+
        /*
         * Destroys this object when you've finished with it.
         */
-       void (*destroy) (struct dm_exception_store *store);
+       void (*dtr) (struct dm_exception_store *store);
 
        /*
         * The target shouldn't read the COW device until this is
@@ -72,8 +79,9 @@ struct dm_exception_store {
         */
        void (*drop_snapshot) (struct dm_exception_store *store);
 
-       int (*status) (struct dm_exception_store *store, status_type_t status,
-                      char *result, unsigned int maxlen);
+       unsigned (*status) (struct dm_exception_store *store,
+                           status_type_t status, char *result,
+                           unsigned maxlen);
 
        /*
         * Return how full the snapshot is.
@@ -82,7 +90,21 @@ struct dm_exception_store {
                               sector_t *numerator,
                               sector_t *denominator);
 
-       struct dm_snapshot *snap;
+       /* For internal device-mapper use only. */
+       struct list_head list;
+};
+
+struct dm_exception_store {
+       struct dm_exception_store_type *type;
+       struct dm_target *ti;
+
+       struct dm_dev *cow;
+
+       /* Size of data blocks saved - must be a power of 2 */
+       chunk_t chunk_size;
+       chunk_t chunk_mask;
+       chunk_t chunk_shift;
+
        void *context;
 };
 
@@ -129,6 +151,28 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
 
 #  endif
 
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(struct block_device *bdev)
+{
+       return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
+                                     sector_t sector)
+{
+       return (sector & ~store->chunk_mask) >> store->chunk_shift;
+}
+
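sector_to_chunk() above, together with the chunk_size/chunk_mask/chunk_shift trio filled in by set_chunk_size(), reduces the sector-to-chunk mapping to a mask and a shift. A stand-alone illustration with a hypothetical 16-sector chunk size:

/* User-space sketch of the chunk arithmetic; 16 sectors is an
 * arbitrary, power-of-2 example value. */
#include <stdio.h>

int main(void)
{
        unsigned long chunk_size = 16;                  /* sectors */
        unsigned long chunk_mask = chunk_size - 1;
        unsigned chunk_shift = __builtin_ffsl(chunk_size) - 1;   /* 4 */
        unsigned long sector = 37;

        unsigned long chunk = (sector & ~chunk_mask) >> chunk_shift;
        printf("sector %lu -> chunk %lu (chunk starts at sector %lu)\n",
               sector, chunk, chunk << chunk_shift);
        return 0;
}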
+int dm_exception_store_type_register(struct dm_exception_store_type *type);
+int dm_exception_store_type_unregister(struct dm_exception_store_type *type);
+
+int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
+                             unsigned *args_used,
+                             struct dm_exception_store **store);
+void dm_exception_store_destroy(struct dm_exception_store *store);
+
 int dm_exception_store_init(void);
 void dm_exception_store_exit(void);
 
@@ -141,8 +185,4 @@ void dm_persistent_snapshot_exit(void);
 int dm_transient_snapshot_init(void);
 void dm_transient_snapshot_exit(void);
 
-int dm_create_persistent(struct dm_exception_store *store);
-
-int dm_create_transient(struct dm_exception_store *store);
-
 #endif /* _LINUX_DM_EXCEPTION_STORE */
index 36e2b5e46a6b644f0e9902b57b29c0666541d733..e73aabd61cd78abdbc63996704c177e6aea399d1 100644 (file)
@@ -370,16 +370,13 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
        while (1) {
                set_current_state(TASK_UNINTERRUPTIBLE);
 
-               if (!atomic_read(&io.count) || signal_pending(current))
+               if (!atomic_read(&io.count))
                        break;
 
                io_schedule();
        }
        set_current_state(TASK_RUNNING);
 
-       if (atomic_read(&io.count))
-               return -EINTR;
-
        if (error_bits)
                *error_bits = io.error_bits;
 
index 737961f275c196f9b6aa41957dd08c72457c597b..be233bc4d91787a2729ef8ca740a946ddc782332 100644 (file)
 
 #define DM_MSG_PREFIX "dirty region log"
 
-struct dm_dirty_log_internal {
-       struct dm_dirty_log_type *type;
-
-       struct list_head list;
-       long use;
-};
-
 static LIST_HEAD(_log_types);
 static DEFINE_SPINLOCK(_lock);
 
-static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name)
+static struct dm_dirty_log_type *__find_dirty_log_type(const char *name)
 {
-       struct dm_dirty_log_internal *log_type;
+       struct dm_dirty_log_type *log_type;
 
        list_for_each_entry(log_type, &_log_types, list)
-               if (!strcmp(name, log_type->type->name))
+               if (!strcmp(name, log_type->name))
                        return log_type;
 
        return NULL;
 }
 
-static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
+static struct dm_dirty_log_type *_get_dirty_log_type(const char *name)
 {
-       struct dm_dirty_log_internal *log_type;
+       struct dm_dirty_log_type *log_type;
 
        spin_lock(&_lock);
 
        log_type = __find_dirty_log_type(name);
-       if (log_type) {
-               if (!log_type->use && !try_module_get(log_type->type->module))
-                       log_type = NULL;
-               else
-                       log_type->use++;
-       }
+       if (log_type && !try_module_get(log_type->module))
+               log_type = NULL;
 
        spin_unlock(&_lock);
 
@@ -76,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
 static struct dm_dirty_log_type *get_type(const char *type_name)
 {
        char *p, *type_name_dup;
-       struct dm_dirty_log_internal *log_type;
+       struct dm_dirty_log_type *log_type;
 
        if (!type_name)
                return NULL;
 
        log_type = _get_dirty_log_type(type_name);
        if (log_type)
-               return log_type->type;
+               return log_type;
 
        type_name_dup = kstrdup(type_name, GFP_KERNEL);
        if (!type_name_dup) {
@@ -105,56 +94,33 @@ static struct dm_dirty_log_type *get_type(const char *type_name)
 
        kfree(type_name_dup);
 
-       return log_type ? log_type->type : NULL;
+       return log_type;
 }
 
 static void put_type(struct dm_dirty_log_type *type)
 {
-       struct dm_dirty_log_internal *log_type;
-
        if (!type)
                return;
 
        spin_lock(&_lock);
-       log_type = __find_dirty_log_type(type->name);
-       if (!log_type)
+       if (!__find_dirty_log_type(type->name))
                goto out;
 
-       if (!--log_type->use)
-               module_put(type->module);
-
-       BUG_ON(log_type->use < 0);
+       module_put(type->module);
 
 out:
        spin_unlock(&_lock);
 }
 
-static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type)
-{
-       struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type),
-                                                        GFP_KERNEL);
-
-       if (log_type)
-               log_type->type = type;
-
-       return log_type;
-}
-
 int dm_dirty_log_type_register(struct dm_dirty_log_type *type)
 {
-       struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type);
        int r = 0;
 
-       if (!log_type)
-               return -ENOMEM;
-
        spin_lock(&_lock);
        if (!__find_dirty_log_type(type->name))
-               list_add(&log_type->list, &_log_types);
-       else {
-               kfree(log_type);
+               list_add(&type->list, &_log_types);
+       else
                r = -EEXIST;
-       }
        spin_unlock(&_lock);
 
        return r;
@@ -163,25 +129,16 @@ EXPORT_SYMBOL(dm_dirty_log_type_register);
 
 int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
 {
-       struct dm_dirty_log_internal *log_type;
-
        spin_lock(&_lock);
 
-       log_type = __find_dirty_log_type(type->name);
-       if (!log_type) {
+       if (!__find_dirty_log_type(type->name)) {
                spin_unlock(&_lock);
                return -EINVAL;
        }
 
-       if (log_type->use) {
-               spin_unlock(&_lock);
-               return -ETXTBSY;
-       }
-
-       list_del(&log_type->list);
+       list_del(&type->list);
 
        spin_unlock(&_lock);
-       kfree(log_type);
 
        return 0;
 }
index 96ea226155b10e3dbb560b3faa8c2211ec11f41f..42c04f04a0c4c84e28bb1cd05e56a19370d5e565 100644 (file)
@@ -17,9 +17,7 @@
 
 struct ps_internal {
        struct path_selector_type pst;
-
        struct list_head list;
-       long use;
 };
 
 #define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
@@ -45,12 +43,8 @@ static struct ps_internal *get_path_selector(const char *name)
 
        down_read(&_ps_lock);
        psi = __find_path_selector_type(name);
-       if (psi) {
-               if ((psi->use == 0) && !try_module_get(psi->pst.module))
-                       psi = NULL;
-               else
-                       psi->use++;
-       }
+       if (psi && !try_module_get(psi->pst.module))
+               psi = NULL;
        up_read(&_ps_lock);
 
        return psi;
@@ -84,11 +78,7 @@ void dm_put_path_selector(struct path_selector_type *pst)
        if (!psi)
                goto out;
 
-       if (--psi->use == 0)
-               module_put(psi->pst.module);
-
-       BUG_ON(psi->use < 0);
-
+       module_put(psi->pst.module);
 out:
        up_read(&_ps_lock);
 }
@@ -136,11 +126,6 @@ int dm_unregister_path_selector(struct path_selector_type *pst)
                return -EINVAL;
        }
 
-       if (psi->use) {
-               up_write(&_ps_lock);
-               return -ETXTBSY;
-       }
-
        list_del(&psi->list);
 
        up_write(&_ps_lock);
index 4d6bc101962e1965a1b98e0cc1e87363be4bcc3a..536ef0bef154e507aae4f738c01c52e7d166d798 100644 (file)
@@ -145,6 +145,8 @@ struct dm_raid1_read_record {
        struct dm_bio_details details;
 };
 
+static struct kmem_cache *_dm_raid1_read_record_cache;
+
 /*
  * Every mirror should look like this one.
  */
@@ -586,6 +588,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
        int state;
        struct bio *bio;
        struct bio_list sync, nosync, recover, *this_list = NULL;
+       struct bio_list requeue;
+       struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
+       region_t region;
 
        if (!writes->head)
                return;
@@ -596,10 +601,18 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
        bio_list_init(&sync);
        bio_list_init(&nosync);
        bio_list_init(&recover);
+       bio_list_init(&requeue);
 
        while ((bio = bio_list_pop(writes))) {
-               state = dm_rh_get_state(ms->rh,
-                                       dm_rh_bio_to_region(ms->rh, bio), 1);
+               region = dm_rh_bio_to_region(ms->rh, bio);
+
+               if (log->type->is_remote_recovering &&
+                   log->type->is_remote_recovering(log, region)) {
+                       bio_list_add(&requeue, bio);
+                       continue;
+               }
+
+               state = dm_rh_get_state(ms->rh, region, 1);
                switch (state) {
                case DM_RH_CLEAN:
                case DM_RH_DIRTY:
@@ -618,6 +631,16 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
                bio_list_add(this_list, bio);
        }
 
+       /*
+        * Add bios that are delayed due to remote recovery
+        * back on to the write queue
+        */
+       if (unlikely(requeue.head)) {
+               spin_lock_irq(&ms->lock);
+               bio_list_merge(&ms->writes, &requeue);
+               spin_unlock_irq(&ms->lock);
+       }
+
        /*
         * Increment the pending counts for any regions that will
         * be written to (writes to recover regions are going to
@@ -764,9 +787,9 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
        atomic_set(&ms->suspend, 0);
        atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
 
-       len = sizeof(struct dm_raid1_read_record);
-       ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
-                                                          len);
+       ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
+                                               _dm_raid1_read_record_cache);
+
        if (!ms->read_record_pool) {
                ti->error = "Error creating mirror read_record_pool";
                kfree(ms);
@@ -1279,16 +1302,31 @@ static int __init dm_mirror_init(void)
 {
        int r;
 
+       _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
+       if (!_dm_raid1_read_record_cache) {
+               DMERR("Can't allocate dm_raid1_read_record cache");
+               r = -ENOMEM;
+               goto bad_cache;
+       }
+
        r = dm_register_target(&mirror_target);
-       if (r < 0)
+       if (r < 0) {
                DMERR("Failed to register mirror target");
+               goto bad_target;
+       }
+
+       return 0;
 
+bad_target:
+       kmem_cache_destroy(_dm_raid1_read_record_cache);
+bad_cache:
        return r;
 }
 
 static void __exit dm_mirror_exit(void)
 {
        dm_unregister_target(&mirror_target);
+       kmem_cache_destroy(_dm_raid1_read_record_cache);
 }
 
 /* Module hooks */
index 936b34e0959fdd5cdeaa6eddc51dee888fb05447..e75c6dd76a9adfb3b374aa9cc6d48a2db14596be 100644 (file)
@@ -6,7 +6,6 @@
  */
 
 #include "dm-exception-store.h"
-#include "dm-snap.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -89,7 +88,7 @@ struct commit_callback {
  * The top level structure for a persistent exception store.
  */
 struct pstore {
-       struct dm_snapshot *snap;       /* up pointer to my snapshot */
+       struct dm_exception_store *store;
        int version;
        int valid;
        uint32_t exceptions_per_area;
@@ -141,7 +140,7 @@ static int alloc_area(struct pstore *ps)
        int r = -ENOMEM;
        size_t len;
 
-       len = ps->snap->chunk_size << SECTOR_SHIFT;
+       len = ps->store->chunk_size << SECTOR_SHIFT;
 
        /*
         * Allocate the chunk_size block of memory that will hold
@@ -163,9 +162,12 @@ static int alloc_area(struct pstore *ps)
 
 static void free_area(struct pstore *ps)
 {
-       vfree(ps->area);
+       if (ps->area)
+               vfree(ps->area);
        ps->area = NULL;
-       vfree(ps->zero_area);
+
+       if (ps->zero_area)
+               vfree(ps->zero_area);
        ps->zero_area = NULL;
 }
 
@@ -189,9 +191,9 @@ static void do_metadata(struct work_struct *work)
 static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
 {
        struct dm_io_region where = {
-               .bdev = ps->snap->cow->bdev,
-               .sector = ps->snap->chunk_size * chunk,
-               .count = ps->snap->chunk_size,
+               .bdev = ps->store->cow->bdev,
+               .sector = ps->store->chunk_size * chunk,
+               .count = ps->store->chunk_size,
        };
        struct dm_io_request io_req = {
                .bi_rw = rw,
@@ -247,15 +249,15 @@ static int area_io(struct pstore *ps, int rw)
 
 static void zero_memory_area(struct pstore *ps)
 {
-       memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+       memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 }
 
 static int zero_disk_area(struct pstore *ps, chunk_t area)
 {
        struct dm_io_region where = {
-               .bdev = ps->snap->cow->bdev,
-               .sector = ps->snap->chunk_size * area_location(ps, area),
-               .count = ps->snap->chunk_size,
+               .bdev = ps->store->cow->bdev,
+               .sector = ps->store->chunk_size * area_location(ps, area),
+               .count = ps->store->chunk_size,
        };
        struct dm_io_request io_req = {
                .bi_rw = WRITE,
@@ -278,15 +280,15 @@ static int read_header(struct pstore *ps, int *new_snapshot)
        /*
         * Use default chunk size (or hardsect_size, if larger) if none supplied
         */
-       if (!ps->snap->chunk_size) {
-               ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-                   bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
-               ps->snap->chunk_mask = ps->snap->chunk_size - 1;
-               ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
+       if (!ps->store->chunk_size) {
+               ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
+                   bdev_hardsect_size(ps->store->cow->bdev) >> 9);
+               ps->store->chunk_mask = ps->store->chunk_size - 1;
+               ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
                chunk_size_supplied = 0;
        }
 
-       ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
+       ps->io_client = dm_io_client_create(sectors_to_pages(ps->store->
                                                             chunk_size));
        if (IS_ERR(ps->io_client))
                return PTR_ERR(ps->io_client);
@@ -317,22 +319,22 @@ static int read_header(struct pstore *ps, int *new_snapshot)
        ps->version = le32_to_cpu(dh->version);
        chunk_size = le32_to_cpu(dh->chunk_size);
 
-       if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
+       if (!chunk_size_supplied || ps->store->chunk_size == chunk_size)
                return 0;
 
        DMWARN("chunk size %llu in device metadata overrides "
               "table chunk size of %llu.",
               (unsigned long long)chunk_size,
-              (unsigned long long)ps->snap->chunk_size);
+              (unsigned long long)ps->store->chunk_size);
 
        /* We had a bogus chunk_size. Fix stuff up. */
        free_area(ps);
 
-       ps->snap->chunk_size = chunk_size;
-       ps->snap->chunk_mask = chunk_size - 1;
-       ps->snap->chunk_shift = ffs(chunk_size) - 1;
+       ps->store->chunk_size = chunk_size;
+       ps->store->chunk_mask = chunk_size - 1;
+       ps->store->chunk_shift = ffs(chunk_size) - 1;
 
-       r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
+       r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size),
                                ps->io_client);
        if (r)
                return r;
@@ -349,13 +351,13 @@ static int write_header(struct pstore *ps)
 {
        struct disk_header *dh;
 
-       memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
+       memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT);
 
        dh = (struct disk_header *) ps->area;
        dh->magic = cpu_to_le32(SNAP_MAGIC);
        dh->valid = cpu_to_le32(ps->valid);
        dh->version = cpu_to_le32(ps->version);
-       dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);
+       dh->chunk_size = cpu_to_le32(ps->store->chunk_size);
 
        return chunk_io(ps, 0, WRITE, 1);
 }
@@ -474,18 +476,25 @@ static struct pstore *get_info(struct dm_exception_store *store)
 static void persistent_fraction_full(struct dm_exception_store *store,
                                     sector_t *numerator, sector_t *denominator)
 {
-       *numerator = get_info(store)->next_free * store->snap->chunk_size;
-       *denominator = get_dev_size(store->snap->cow->bdev);
+       *numerator = get_info(store)->next_free * store->chunk_size;
+       *denominator = get_dev_size(store->cow->bdev);
 }
 
-static void persistent_destroy(struct dm_exception_store *store)
+static void persistent_dtr(struct dm_exception_store *store)
 {
        struct pstore *ps = get_info(store);
 
        destroy_workqueue(ps->metadata_wq);
-       dm_io_client_destroy(ps->io_client);
-       vfree(ps->callbacks);
+
+       /* Created in read_header */
+       if (ps->io_client)
+               dm_io_client_destroy(ps->io_client);
        free_area(ps);
+
+       /* Allocated in persistent_read_metadata */
+       if (ps->callbacks)
+               vfree(ps->callbacks);
+
        kfree(ps);
 }
 
@@ -507,7 +516,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
        /*
         * Now we know correct chunk_size, complete the initialisation.
         */
-       ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
+       ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
                                  sizeof(struct disk_exception);
        ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
                        sizeof(*ps->callbacks));
@@ -564,10 +573,10 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
        struct pstore *ps = get_info(store);
        uint32_t stride;
        chunk_t next_free;
-       sector_t size = get_dev_size(store->snap->cow->bdev);
+       sector_t size = get_dev_size(store->cow->bdev);
 
        /* Is there enough room ? */
-       if (size < ((ps->next_free + 1) * store->snap->chunk_size))
+       if (size < ((ps->next_free + 1) * store->chunk_size))
                return -ENOSPC;
 
        e->new_chunk = ps->next_free;
@@ -656,16 +665,17 @@ static void persistent_drop_snapshot(struct dm_exception_store *store)
                DMWARN("write header failed");
 }
 
-int dm_create_persistent(struct dm_exception_store *store)
+static int persistent_ctr(struct dm_exception_store *store,
+                         unsigned argc, char **argv)
 {
        struct pstore *ps;
 
        /* allocate the pstore */
-       ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+       ps = kzalloc(sizeof(*ps), GFP_KERNEL);
        if (!ps)
                return -ENOMEM;
 
-       ps->snap = store->snap;
+       ps->store = store;
        ps->valid = 1;
        ps->version = SNAPSHOT_DISK_VERSION;
        ps->area = NULL;
@@ -683,22 +693,77 @@ int dm_create_persistent(struct dm_exception_store *store)
                return -ENOMEM;
        }
 
-       store->destroy = persistent_destroy;
-       store->read_metadata = persistent_read_metadata;
-       store->prepare_exception = persistent_prepare_exception;
-       store->commit_exception = persistent_commit_exception;
-       store->drop_snapshot = persistent_drop_snapshot;
-       store->fraction_full = persistent_fraction_full;
        store->context = ps;
 
        return 0;
 }
 
+static unsigned persistent_status(struct dm_exception_store *store,
+                                 status_type_t status, char *result,
+                                 unsigned maxlen)
+{
+       unsigned sz = 0;
+
+       switch (status) {
+       case STATUSTYPE_INFO:
+               break;
+       case STATUSTYPE_TABLE:
+               DMEMIT(" %s P %llu", store->cow->name,
+                      (unsigned long long)store->chunk_size);
+       }
+
+       return sz;
+}
+
+static struct dm_exception_store_type _persistent_type = {
+       .name = "persistent",
+       .module = THIS_MODULE,
+       .ctr = persistent_ctr,
+       .dtr = persistent_dtr,
+       .read_metadata = persistent_read_metadata,
+       .prepare_exception = persistent_prepare_exception,
+       .commit_exception = persistent_commit_exception,
+       .drop_snapshot = persistent_drop_snapshot,
+       .fraction_full = persistent_fraction_full,
+       .status = persistent_status,
+};
+
+static struct dm_exception_store_type _persistent_compat_type = {
+       .name = "P",
+       .module = THIS_MODULE,
+       .ctr = persistent_ctr,
+       .dtr = persistent_dtr,
+       .read_metadata = persistent_read_metadata,
+       .prepare_exception = persistent_prepare_exception,
+       .commit_exception = persistent_commit_exception,
+       .drop_snapshot = persistent_drop_snapshot,
+       .fraction_full = persistent_fraction_full,
+       .status = persistent_status,
+};
+
 int dm_persistent_snapshot_init(void)
 {
-       return 0;
+       int r;
+
+       r = dm_exception_store_type_register(&_persistent_type);
+       if (r) {
+               DMERR("Unable to register persistent exception store type");
+               return r;
+       }
+
+       r = dm_exception_store_type_register(&_persistent_compat_type);
+       if (r) {
+               DMERR("Unable to register old-style persistent exception "
+                     "store type");
+               dm_exception_store_type_unregister(&_persistent_type);
+               return r;
+       }
+
+       return r;
 }
 
 void dm_persistent_snapshot_exit(void)
 {
+       dm_exception_store_type_unregister(&_persistent_type);
+       dm_exception_store_type_unregister(&_persistent_compat_type);
 }
index 7f6e2e6dcb0ddcfa78ba071af928dac0f5401682..cde5aa558e6d77b7d1130777762f8aec84c86ab1 100644 (file)
@@ -6,7 +6,6 @@
  */
 
 #include "dm-exception-store.h"
-#include "dm-snap.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
@@ -23,7 +22,7 @@ struct transient_c {
        sector_t next_free;
 };
 
-static void transient_destroy(struct dm_exception_store *store)
+static void transient_dtr(struct dm_exception_store *store)
 {
        kfree(store->context);
 }
@@ -39,14 +38,14 @@ static int transient_read_metadata(struct dm_exception_store *store,
 static int transient_prepare_exception(struct dm_exception_store *store,
                                       struct dm_snap_exception *e)
 {
-       struct transient_c *tc = (struct transient_c *) store->context;
-       sector_t size = get_dev_size(store->snap->cow->bdev);
+       struct transient_c *tc = store->context;
+       sector_t size = get_dev_size(store->cow->bdev);
 
-       if (size < (tc->next_free + store->snap->chunk_size))
+       if (size < (tc->next_free + store->chunk_size))
                return -1;
 
-       e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
-       tc->next_free += store->snap->chunk_size;
+       e->new_chunk = sector_to_chunk(store, tc->next_free);
+       tc->next_free += store->chunk_size;
 
        return 0;
 }
@@ -64,20 +63,14 @@ static void transient_fraction_full(struct dm_exception_store *store,
                                    sector_t *numerator, sector_t *denominator)
 {
        *numerator = ((struct transient_c *) store->context)->next_free;
-       *denominator = get_dev_size(store->snap->cow->bdev);
+       *denominator = get_dev_size(store->cow->bdev);
 }
 
-int dm_create_transient(struct dm_exception_store *store)
+static int transient_ctr(struct dm_exception_store *store,
+                        unsigned argc, char **argv)
 {
        struct transient_c *tc;
 
-       store->destroy = transient_destroy;
-       store->read_metadata = transient_read_metadata;
-       store->prepare_exception = transient_prepare_exception;
-       store->commit_exception = transient_commit_exception;
-       store->drop_snapshot = NULL;
-       store->fraction_full = transient_fraction_full;
-
        tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
        if (!tc)
                return -ENOMEM;
@@ -88,11 +81,70 @@ int dm_create_transient(struct dm_exception_store *store)
        return 0;
 }
 
+static unsigned transient_status(struct dm_exception_store *store,
+                                status_type_t status, char *result,
+                                unsigned maxlen)
+{
+       unsigned sz = 0;
+
+       switch (status) {
+       case STATUSTYPE_INFO:
+               break;
+       case STATUSTYPE_TABLE:
+               DMEMIT(" %s N %llu", store->cow->name,
+                      (unsigned long long)store->chunk_size);
+       }
+
+       return sz;
+}
+
+static struct dm_exception_store_type _transient_type = {
+       .name = "transient",
+       .module = THIS_MODULE,
+       .ctr = transient_ctr,
+       .dtr = transient_dtr,
+       .read_metadata = transient_read_metadata,
+       .prepare_exception = transient_prepare_exception,
+       .commit_exception = transient_commit_exception,
+       .fraction_full = transient_fraction_full,
+       .status = transient_status,
+};
+
+static struct dm_exception_store_type _transient_compat_type = {
+       .name = "N",
+       .module = THIS_MODULE,
+       .ctr = transient_ctr,
+       .dtr = transient_dtr,
+       .read_metadata = transient_read_metadata,
+       .prepare_exception = transient_prepare_exception,
+       .commit_exception = transient_commit_exception,
+       .fraction_full = transient_fraction_full,
+       .status = transient_status,
+};
+
 int dm_transient_snapshot_init(void)
 {
-       return 0;
+       int r;
+
+       r = dm_exception_store_type_register(&_transient_type);
+       if (r) {
+               DMWARN("Unable to register transient exception store type");
+               return r;
+       }
+
+       r = dm_exception_store_type_register(&_transient_compat_type);
+       if (r) {
+               DMWARN("Unable to register old-style transient "
+                      "exception store type");
+               dm_exception_store_type_unregister(&_transient_type);
+               return r;
+       }
+
+       return r;
 }
 
 void dm_transient_snapshot_exit(void)
 {
+       dm_exception_store_type_unregister(&_transient_type);
+       dm_exception_store_type_unregister(&_transient_compat_type);
 }
index 65ff82ff124e29a3c7cfedf276df89646fcbc2ac..981a0413068f8028c90e7da039cf5d48fcc27b9b 100644 (file)
@@ -7,7 +7,6 @@
  */
 
 #include <linux/blkdev.h>
-#include <linux/ctype.h>
 #include <linux/device-mapper.h>
 #include <linux/delay.h>
 #include <linux/fs.h>
@@ -20,9 +19,9 @@
 #include <linux/vmalloc.h>
 #include <linux/log2.h>
 #include <linux/dm-kcopyd.h>
+#include <linux/workqueue.h>
 
 #include "dm-exception-store.h"
-#include "dm-snap.h"
 #include "dm-bio-list.h"
 
 #define DM_MSG_PREFIX "snapshots"
  */
 #define MIN_IOS 256
 
+#define DM_TRACKED_CHUNK_HASH_SIZE     16
+#define DM_TRACKED_CHUNK_HASH(x)       ((unsigned long)(x) & \
+                                        (DM_TRACKED_CHUNK_HASH_SIZE - 1))
+
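DM_TRACKED_CHUNK_HASH() above simply keeps the low bits of the chunk number, so tracked chunks spread across the 16 buckets and nearby chunks land in different buckets. A stand-alone illustration of the masking:

/* User-space sketch of the tracked-chunk hash above. */
#include <stdio.h>

#define TRACKED_CHUNK_HASH_SIZE 16
#define TRACKED_CHUNK_HASH(x)   ((unsigned long)(x) & \
                                 (TRACKED_CHUNK_HASH_SIZE - 1))

int main(void)
{
        printf("%lu %lu %lu\n",
               TRACKED_CHUNK_HASH(69),   /* -> 5 */
               TRACKED_CHUNK_HASH(16),   /* -> 0 */
               TRACKED_CHUNK_HASH(3));   /* -> 3 */
        return 0;
}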
+struct exception_table {
+       uint32_t hash_mask;
+       unsigned hash_shift;
+       struct list_head *table;
+};
+
+struct dm_snapshot {
+       struct rw_semaphore lock;
+
+       struct dm_dev *origin;
+
+       /* List of snapshots per Origin */
+       struct list_head list;
+
+       /* You can't use a snapshot if this is 0 (e.g. if full) */
+       int valid;
+
+       /* Origin writes don't trigger exceptions until this is set */
+       int active;
+
+       mempool_t *pending_pool;
+
+       atomic_t pending_exceptions_count;
+
+       struct exception_table pending;
+       struct exception_table complete;
+
+       /*
+        * pe_lock protects all pending_exception operations and access
+        * as well as the snapshot_bios list.
+        */
+       spinlock_t pe_lock;
+
+       /* The on disk metadata handler */
+       struct dm_exception_store *store;
+
+       struct dm_kcopyd_client *kcopyd_client;
+
+       /* Queue of snapshot writes for ksnapd to flush */
+       struct bio_list queued_bios;
+       struct work_struct queued_bios_work;
+
+       /* Chunks with outstanding reads */
+       mempool_t *tracked_chunk_pool;
+       spinlock_t tracked_chunk_lock;
+       struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
+};
+
 static struct workqueue_struct *ksnapd;
 static void flush_queued_bios(struct work_struct *work);
 
+static sector_t chunk_to_sector(struct dm_exception_store *store,
+                               chunk_t chunk)
+{
+       return chunk << store->chunk_shift;
+}
+
+static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
+{
+       /*
+        * There is only ever one instance of a particular block
+        * device so we can compare pointers safely.
+        */
+       return lhs == rhs;
+}
+
 struct dm_snap_pending_exception {
        struct dm_snap_exception e;
 
@@ -476,11 +542,11 @@ static int init_hash_tables(struct dm_snapshot *s)
         * Calculate based on the size of the original volume or
         * the COW volume...
         */
-       cow_dev_size = get_dev_size(s->cow->bdev);
+       cow_dev_size = get_dev_size(s->store->cow->bdev);
        origin_dev_size = get_dev_size(s->origin->bdev);
        max_buckets = calc_max_buckets();
 
-       hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
+       hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
        hash_size = min(hash_size, max_buckets);
 
        hash_size = rounddown_pow_of_two(hash_size);
@@ -504,58 +570,6 @@ static int init_hash_tables(struct dm_snapshot *s)
        return 0;
 }
 
-/*
- * Round a number up to the nearest 'size' boundary.  size must
- * be a power of 2.
- */
-static ulong round_up(ulong n, ulong size)
-{
-       size--;
-       return (n + size) & ~size;
-}
-
-static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg,
-                         char **error)
-{
-       unsigned long chunk_size;
-       char *value;
-
-       chunk_size = simple_strtoul(chunk_size_arg, &value, 10);
-       if (*chunk_size_arg == '\0' || *value != '\0') {
-               *error = "Invalid chunk size";
-               return -EINVAL;
-       }
-
-       if (!chunk_size) {
-               s->chunk_size = s->chunk_mask = s->chunk_shift = 0;
-               return 0;
-       }
-
-       /*
-        * Chunk size must be multiple of page size.  Silently
-        * round up if it's not.
-        */
-       chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
-
-       /* Check chunk_size is a power of 2 */
-       if (!is_power_of_2(chunk_size)) {
-               *error = "Chunk size is not a power of 2";
-               return -EINVAL;
-       }
-
-       /* Validate the chunk size against the device block size */
-       if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) {
-               *error = "Chunk size is not a multiple of device blocksize";
-               return -EINVAL;
-       }
-
-       s->chunk_size = chunk_size;
-       s->chunk_mask = chunk_size - 1;
-       s->chunk_shift = ffs(chunk_size) - 1;
-
-       return 0;
-}
-
 /*
  * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
  */
@@ -564,91 +578,68 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        struct dm_snapshot *s;
        int i;
        int r = -EINVAL;
-       char persistent;
        char *origin_path;
-       char *cow_path;
+       struct dm_exception_store *store;
+       unsigned args_used;
 
        if (argc != 4) {
                ti->error = "requires exactly 4 arguments";
                r = -EINVAL;
-               goto bad1;
+               goto bad_args;
        }
 
        origin_path = argv[0];
-       cow_path = argv[1];
-       persistent = toupper(*argv[2]);
+       argv++;
+       argc--;
 
-       if (persistent != 'P' && persistent != 'N') {
-               ti->error = "Persistent flag is not P or N";
+       r = dm_exception_store_create(ti, argc, argv, &args_used, &store);
+       if (r) {
+               ti->error = "Couldn't create exception store";
                r = -EINVAL;
-               goto bad1;
+               goto bad_args;
        }
 
+       argv += args_used;
+       argc -= args_used;
+
        s = kmalloc(sizeof(*s), GFP_KERNEL);
-       if (s == NULL) {
+       if (!s) {
                ti->error = "Cannot allocate snapshot context private "
                    "structure";
                r = -ENOMEM;
-               goto bad1;
+               goto bad_snap;
        }
 
        r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
        if (r) {
                ti->error = "Cannot get origin device";
-               goto bad2;
-       }
-
-       r = dm_get_device(ti, cow_path, 0, 0,
-                         FMODE_READ | FMODE_WRITE, &s->cow);
-       if (r) {
-               dm_put_device(ti, s->origin);
-               ti->error = "Cannot get COW device";
-               goto bad2;
+               goto bad_origin;
        }
 
-       r = set_chunk_size(s, argv[3], &ti->error);
-       if (r)
-               goto bad3;
-
-       s->type = persistent;
-
+       s->store = store;
        s->valid = 1;
        s->active = 0;
        atomic_set(&s->pending_exceptions_count, 0);
        init_rwsem(&s->lock);
        spin_lock_init(&s->pe_lock);
-       s->ti = ti;
 
        /* Allocate hash table for COW data */
        if (init_hash_tables(s)) {
                ti->error = "Unable to allocate hash table space";
                r = -ENOMEM;
-               goto bad3;
-       }
-
-       s->store.snap = s;
-
-       if (persistent == 'P')
-               r = dm_create_persistent(&s->store);
-       else
-               r = dm_create_transient(&s->store);
-
-       if (r) {
-               ti->error = "Couldn't create exception store";
-               r = -EINVAL;
-               goto bad4;
+               goto bad_hash_tables;
        }
 
        r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
        if (r) {
                ti->error = "Could not create kcopyd client";
-               goto bad5;
+               goto bad_kcopyd;
        }
 
        s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
        if (!s->pending_pool) {
                ti->error = "Could not allocate mempool for pending exceptions";
-               goto bad6;
+               goto bad_pending_pool;
        }
 
        s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS,
@@ -665,7 +656,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        spin_lock_init(&s->tracked_chunk_lock);
 
        /* Metadata must only be loaded into one table at once */
-       r = s->store.read_metadata(&s->store, dm_add_exception, (void *)s);
+       r = s->store->type->read_metadata(s->store, dm_add_exception,
+                                         (void *)s);
        if (r < 0) {
                ti->error = "Failed to read snapshot metadata";
                goto bad_load_and_register;
@@ -686,34 +678,33 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        }
 
        ti->private = s;
-       ti->split_io = s->chunk_size;
+       ti->split_io = s->store->chunk_size;
 
        return 0;
 
- bad_load_and_register:
+bad_load_and_register:
        mempool_destroy(s->tracked_chunk_pool);
 
- bad_tracked_chunk_pool:
+bad_tracked_chunk_pool:
        mempool_destroy(s->pending_pool);
 
- bad6:
+bad_pending_pool:
        dm_kcopyd_client_destroy(s->kcopyd_client);
 
- bad5:
-       s->store.destroy(&s->store);
-
- bad4:
+bad_kcopyd:
        exit_exception_table(&s->pending, pending_cache);
        exit_exception_table(&s->complete, exception_cache);
 
- bad3:
-       dm_put_device(ti, s->cow);
+bad_hash_tables:
        dm_put_device(ti, s->origin);
 
- bad2:
+bad_origin:
        kfree(s);
 
- bad1:
+bad_snap:
+       dm_exception_store_destroy(store);
+
+bad_args:
        return r;
 }
 
@@ -724,8 +715,6 @@ static void __free_exceptions(struct dm_snapshot *s)
 
        exit_exception_table(&s->pending, pending_cache);
        exit_exception_table(&s->complete, exception_cache);
-
-       s->store.destroy(&s->store);
 }
 
 static void snapshot_dtr(struct dm_target *ti)
@@ -761,7 +750,8 @@ static void snapshot_dtr(struct dm_target *ti)
        mempool_destroy(s->pending_pool);
 
        dm_put_device(ti, s->origin);
-       dm_put_device(ti, s->cow);
+
+       dm_exception_store_destroy(s->store);
 
        kfree(s);
 }
@@ -820,12 +810,12 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
        else if (err == -ENOMEM)
                DMERR("Invalidating snapshot: Unable to allocate exception.");
 
-       if (s->store.drop_snapshot)
-               s->store.drop_snapshot(&s->store);
+       if (s->store->type->drop_snapshot)
+               s->store->type->drop_snapshot(s->store);
 
        s->valid = 0;
 
-       dm_table_event(s->ti->table);
+       dm_table_event(s->store->ti->table);
 }
 
 static void get_pending_exception(struct dm_snap_pending_exception *pe)
@@ -943,8 +933,8 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
 
        else
                /* Update the metadata if we are persistent */
-               s->store.commit_exception(&s->store, &pe->e, commit_callback,
-                                         pe);
+               s->store->type->commit_exception(s->store, &pe->e,
+                                                commit_callback, pe);
 }
 
 /*
@@ -960,11 +950,11 @@ static void start_copy(struct dm_snap_pending_exception *pe)
        dev_size = get_dev_size(bdev);
 
        src.bdev = bdev;
-       src.sector = chunk_to_sector(s, pe->e.old_chunk);
-       src.count = min(s->chunk_size, dev_size - src.sector);
+       src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
+       src.count = min(s->store->chunk_size, dev_size - src.sector);
 
-       dest.bdev = s->cow->bdev;
-       dest.sector = chunk_to_sector(s, pe->e.new_chunk);
+       dest.bdev = s->store->cow->bdev;
+       dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
        dest.count = src.count;
 
        /* Hand over to kcopyd */
@@ -972,6 +962,17 @@ static void start_copy(struct dm_snap_pending_exception *pe)
                    &src, 1, &dest, 0, copy_callback, pe);
 }
 
+static struct dm_snap_pending_exception *
+__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
+{
+       struct dm_snap_exception *e = lookup_exception(&s->pending, chunk);
+
+       if (!e)
+               return NULL;
+
+       return container_of(e, struct dm_snap_pending_exception, e);
+}
+
 /*
  * Looks to see if this snapshot already has a pending exception
  * for this chunk, otherwise it allocates a new one and inserts
@@ -981,40 +982,15 @@ static void start_copy(struct dm_snap_pending_exception *pe)
  * this.
  */
 static struct dm_snap_pending_exception *
-__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
+__find_pending_exception(struct dm_snapshot *s,
+                        struct dm_snap_pending_exception *pe, chunk_t chunk)
 {
-       struct dm_snap_exception *e;
-       struct dm_snap_pending_exception *pe;
-       chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
+       struct dm_snap_pending_exception *pe2;
 
-       /*
-        * Is there a pending exception for this already ?
-        */
-       e = lookup_exception(&s->pending, chunk);
-       if (e) {
-               /* cast the exception to a pending exception */
-               pe = container_of(e, struct dm_snap_pending_exception, e);
-               goto out;
-       }
-
-       /*
-        * Create a new pending exception, we don't want
-        * to hold the lock while we do this.
-        */
-       up_write(&s->lock);
-       pe = alloc_pending_exception(s);
-       down_write(&s->lock);
-
-       if (!s->valid) {
-               free_pending_exception(pe);
-               return NULL;
-       }
-
-       e = lookup_exception(&s->pending, chunk);
-       if (e) {
+       pe2 = __lookup_pending_exception(s, chunk);
+       if (pe2) {
                free_pending_exception(pe);
-               pe = container_of(e, struct dm_snap_pending_exception, e);
-               goto out;
+               return pe2;
        }
 
        pe->e.old_chunk = chunk;
@@ -1024,7 +1000,7 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
        atomic_set(&pe->ref_count, 0);
        pe->started = 0;
 
-       if (s->store.prepare_exception(&s->store, &pe->e)) {
+       if (s->store->type->prepare_exception(s->store, &pe->e)) {
                free_pending_exception(pe);
                return NULL;
        }
@@ -1032,17 +1008,18 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
        get_pending_exception(pe);
        insert_exception(&s->pending, &pe->e);
 
- out:
        return pe;
 }
 
 static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
                            struct bio *bio, chunk_t chunk)
 {
-       bio->bi_bdev = s->cow->bdev;
-       bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) +
-                        (chunk - e->old_chunk)) +
-                        (bio->bi_sector & s->chunk_mask);
+       bio->bi_bdev = s->store->cow->bdev;
+       bio->bi_sector = chunk_to_sector(s->store,
+                                        dm_chunk_number(e->new_chunk) +
+                                        (chunk - e->old_chunk)) +
+                                        (bio->bi_sector &
+                                         s->store->chunk_mask);
 }
 
 static int snapshot_map(struct dm_target *ti, struct bio *bio,
@@ -1054,7 +1031,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
        chunk_t chunk;
        struct dm_snap_pending_exception *pe = NULL;
 
-       chunk = sector_to_chunk(s, bio->bi_sector);
+       chunk = sector_to_chunk(s->store, bio->bi_sector);
 
        /* Full snapshots are not usable */
        /* To get here the table must be live so s->active is always set. */
@@ -1083,11 +1060,31 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
         * writeable.
         */
        if (bio_rw(bio) == WRITE) {
-               pe = __find_pending_exception(s, bio);
+               pe = __lookup_pending_exception(s, chunk);
                if (!pe) {
-                       __invalidate_snapshot(s, -ENOMEM);
-                       r = -EIO;
-                       goto out_unlock;
+                       up_write(&s->lock);
+                       pe = alloc_pending_exception(s);
+                       down_write(&s->lock);
+
+                       if (!s->valid) {
+                               free_pending_exception(pe);
+                               r = -EIO;
+                               goto out_unlock;
+                       }
+
+                       e = lookup_exception(&s->complete, chunk);
+                       if (e) {
+                               free_pending_exception(pe);
+                               remap_exception(s, e, bio, chunk);
+                               goto out_unlock;
+                       }
+
+                       pe = __find_pending_exception(s, pe, chunk);
+                       if (!pe) {
+                               __invalidate_snapshot(s, -ENOMEM);
+                               r = -EIO;
+                               goto out_unlock;
+                       }
                }
 
                remap_exception(s, &pe->e, bio, chunk);
@@ -1137,24 +1134,25 @@ static void snapshot_resume(struct dm_target *ti)
 static int snapshot_status(struct dm_target *ti, status_type_t type,
                           char *result, unsigned int maxlen)
 {
+       unsigned sz = 0;
        struct dm_snapshot *snap = ti->private;
 
        switch (type) {
        case STATUSTYPE_INFO:
                if (!snap->valid)
-                       snprintf(result, maxlen, "Invalid");
+                       DMEMIT("Invalid");
                else {
-                       if (snap->store.fraction_full) {
+                       if (snap->store->type->fraction_full) {
                                sector_t numerator, denominator;
-                               snap->store.fraction_full(&snap->store,
-                                                         &numerator,
-                                                         &denominator);
-                               snprintf(result, maxlen, "%llu/%llu",
-                                       (unsigned long long)numerator,
-                                       (unsigned long long)denominator);
+                               snap->store->type->fraction_full(snap->store,
+                                                                &numerator,
+                                                                &denominator);
+                               DMEMIT("%llu/%llu",
+                                      (unsigned long long)numerator,
+                                      (unsigned long long)denominator);
                        }
                        else
-                               snprintf(result, maxlen, "Unknown");
+                               DMEMIT("Unknown");
                }
                break;
 
@@ -1164,10 +1162,9 @@ static int snapshot_status(struct dm_target *ti, status_type_t type,
                 * to make private copies if the output is to
                 * make sense.
                 */
-               snprintf(result, maxlen, "%s %s %c %llu",
-                        snap->origin->name, snap->cow->name,
-                        snap->type,
-                        (unsigned long long)snap->chunk_size);
+               DMEMIT("%s", snap->origin->name);
+               snap->store->type->status(snap->store, type, result + sz,
+                                         maxlen - sz);
                break;
        }
 
@@ -1196,14 +1193,14 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
                        goto next_snapshot;
 
                /* Nothing to do if writing beyond end of snapshot */
-               if (bio->bi_sector >= dm_table_get_size(snap->ti->table))
+               if (bio->bi_sector >= dm_table_get_size(snap->store->ti->table))
                        goto next_snapshot;
 
                /*
                 * Remember, different snapshots can have
                 * different chunk sizes.
                 */
-               chunk = sector_to_chunk(snap, bio->bi_sector);
+               chunk = sector_to_chunk(snap->store, bio->bi_sector);
 
                /*
                 * Check exception table to see if block
@@ -1217,10 +1214,28 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
                if (e)
                        goto next_snapshot;
 
-               pe = __find_pending_exception(snap, bio);
+               pe = __lookup_pending_exception(snap, chunk);
                if (!pe) {
-                       __invalidate_snapshot(snap, -ENOMEM);
-                       goto next_snapshot;
+                       up_write(&snap->lock);
+                       pe = alloc_pending_exception(snap);
+                       down_write(&snap->lock);
+
+                       if (!snap->valid) {
+                               free_pending_exception(pe);
+                               goto next_snapshot;
+                       }
+
+                       e = lookup_exception(&snap->complete, chunk);
+                       if (e) {
+                               free_pending_exception(pe);
+                               goto next_snapshot;
+                       }
+
+                       pe = __find_pending_exception(snap, pe, chunk);
+                       if (!pe) {
+                               __invalidate_snapshot(snap, -ENOMEM);
+                               goto next_snapshot;
+                       }
                }
 
                if (!primary_pe) {
@@ -1360,7 +1375,8 @@ static void origin_resume(struct dm_target *ti)
        o = __lookup_origin(dev->bdev);
        if (o)
                list_for_each_entry (snap, &o->snapshots, list)
-                       chunk_size = min_not_zero(chunk_size, snap->chunk_size);
+                       chunk_size = min_not_zero(chunk_size,
+                                                 snap->store->chunk_size);
        up_read(&_origins_lock);
 
        ti->split_io = chunk_size;
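[Illustrative sketch, not part of the patch] The snapshot_map() and __origin_write() hunks above stop allocating the pending exception while s->lock is held: they look it up, drop the lock, allocate, retake the lock and re-check before inserting. A simplified user-space analogue of that unlock/allocate/relock/re-check pattern, using pthreads and a hypothetical singly linked table in place of the kernel's exception hash:

#include <pthread.h>
#include <stdlib.h>

struct exception {
        long chunk;
        struct exception *next;
};

static struct exception *pending;                  /* hypothetical pending table */
static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

static struct exception *lookup_locked(long chunk) /* caller holds lock */
{
        struct exception *e;

        for (e = pending; e; e = e->next)
                if (e->chunk == chunk)
                        return e;
        return NULL;
}

static struct exception *find_or_create(long chunk)
{
        struct exception *e, *pe;

        pthread_rwlock_wrlock(&lock);
        e = lookup_locked(chunk);
        pthread_rwlock_unlock(&lock);
        if (e)
                return e;

        /* Allocate without holding the lock, as the patch now does. */
        pe = malloc(sizeof(*pe));
        if (!pe)
                return NULL;

        pthread_rwlock_wrlock(&lock);
        e = lookup_locked(chunk);       /* re-check: another thread may have won */
        if (e) {
                free(pe);               /* lost the race, drop our copy */
        } else {
                pe->chunk = chunk;
                pe->next = pending;
                pending = pe;
                e = pe;
        }
        pthread_rwlock_unlock(&lock);
        return e;
}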
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
deleted file mode 100644 (file)
index d9e62b4..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
- *
- * This file is released under the GPL.
- */
-
-#ifndef DM_SNAPSHOT_H
-#define DM_SNAPSHOT_H
-
-#include <linux/device-mapper.h>
-#include "dm-exception-store.h"
-#include "dm-bio-list.h"
-#include <linux/blkdev.h>
-#include <linux/workqueue.h>
-
-struct exception_table {
-       uint32_t hash_mask;
-       unsigned hash_shift;
-       struct list_head *table;
-};
-
-#define DM_TRACKED_CHUNK_HASH_SIZE     16
-#define DM_TRACKED_CHUNK_HASH(x)       ((unsigned long)(x) & \
-                                        (DM_TRACKED_CHUNK_HASH_SIZE - 1))
-
-struct dm_snapshot {
-       struct rw_semaphore lock;
-       struct dm_target *ti;
-
-       struct dm_dev *origin;
-       struct dm_dev *cow;
-
-       /* List of snapshots per Origin */
-       struct list_head list;
-
-       /* Size of data blocks saved - must be a power of 2 */
-       chunk_t chunk_size;
-       chunk_t chunk_mask;
-       chunk_t chunk_shift;
-
-       /* You can't use a snapshot if this is 0 (e.g. if full) */
-       int valid;
-
-       /* Origin writes don't trigger exceptions until this is set */
-       int active;
-
-       /* Used for display of table */
-       char type;
-
-       mempool_t *pending_pool;
-
-       atomic_t pending_exceptions_count;
-
-       struct exception_table pending;
-       struct exception_table complete;
-
-       /*
-        * pe_lock protects all pending_exception operations and access
-        * as well as the snapshot_bios list.
-        */
-       spinlock_t pe_lock;
-
-       /* The on disk metadata handler */
-       struct dm_exception_store store;
-
-       struct dm_kcopyd_client *kcopyd_client;
-
-       /* Queue of snapshot writes for ksnapd to flush */
-       struct bio_list queued_bios;
-       struct work_struct queued_bios_work;
-
-       /* Chunks with outstanding reads */
-       mempool_t *tracked_chunk_pool;
-       spinlock_t tracked_chunk_lock;
-       struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
-};
-
-/*
- * Return the number of sectors in the device.
- */
-static inline sector_t get_dev_size(struct block_device *bdev)
-{
-       return bdev->bd_inode->i_size >> SECTOR_SHIFT;
-}
-
-static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
-{
-       return (sector & ~s->chunk_mask) >> s->chunk_shift;
-}
-
-static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
-{
-       return chunk << s->chunk_shift;
-}
-
-static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs)
-{
-       /*
-        * There is only ever one instance of a particular block
-        * device so we can compare pointers safely.
-        */
-       return lhs == rhs;
-}
-
-#endif
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2fd66c30f7f8b02fd699f7b3c5cab52bd6f4d89d..e8361b191b9b223baef941bfc14212dc052d127f 100644 (file)
@@ -399,28 +399,30 @@ static int check_device_area(struct dm_dev_internal *dd, sector_t start,
 }
 
 /*
- * This upgrades the mode on an already open dm_dev.  Being
+ * This upgrades the mode on an already open dm_dev, being
  * careful to leave things as they were if we fail to reopen the
- * device.
+ * device and not to touch the existing bdev field in case
+ * it is accessed concurrently inside dm_table_any_congested().
  */
 static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
                        struct mapped_device *md)
 {
        int r;
-       struct dm_dev_internal dd_copy;
-       dev_t dev = dd->dm_dev.bdev->bd_dev;
+       struct dm_dev_internal dd_new, dd_old;
 
-       dd_copy = *dd;
+       dd_new = dd_old = *dd;
+
+       dd_new.dm_dev.mode |= new_mode;
+       dd_new.dm_dev.bdev = NULL;
+
+       r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
+       if (r)
+               return r;
 
        dd->dm_dev.mode |= new_mode;
-       dd->dm_dev.bdev = NULL;
-       r = open_dev(dd, dev, md);
-       if (!r)
-               close_dev(&dd_copy, md);
-       else
-               *dd = dd_copy;
+       close_dev(&dd_old, md);
 
-       return r;
+       return 0;
 }
 
 /*
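[Illustrative sketch, not part of the patch] The upgrade_mode() rewrite above opens the new device handle before touching the one that is already installed, so a failed reopen leaves everything as it was and concurrent readers of dd->dm_dev.bdev never see a NULL pointer. A small stand-alone sketch of the same open-new, swap, then close-old idea, using plain file descriptors rather than the dm device API (upgrade_fd() and its arguments are hypothetical):

#include <fcntl.h>
#include <unistd.h>

static int upgrade_fd(int *fdp, const char *path)
{
        int newfd, oldfd;

        newfd = open(path, O_RDWR);
        if (newfd < 0)
                return -1;      /* failure leaves *fdp exactly as it was */

        oldfd = *fdp;
        *fdp = newfd;           /* install the new handle first ... */
        close(oldfd);           /* ... then drop the old one */
        return 0;
}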
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 7decf10006e41cf9b3372544fa60eff54d195923..04feccf2a997947968260029419e7813cb222553 100644 (file)
 
 #define DM_MSG_PREFIX "target"
 
-struct tt_internal {
-       struct target_type tt;
-
-       struct list_head list;
-       long use;
-};
-
 static LIST_HEAD(_targets);
 static DECLARE_RWSEM(_lock);
 
 #define DM_MOD_NAME_SIZE 32
 
-static inline struct tt_internal *__find_target_type(const char *name)
+static inline struct target_type *__find_target_type(const char *name)
 {
-       struct tt_internal *ti;
+       struct target_type *tt;
 
-       list_for_each_entry (ti, &_targets, list)
-               if (!strcmp(name, ti->tt.name))
-                       return ti;
+       list_for_each_entry(tt, &_targets, list)
+               if (!strcmp(name, tt->name))
+                       return tt;
 
        return NULL;
 }
 
-static struct tt_internal *get_target_type(const char *name)
+static struct target_type *get_target_type(const char *name)
 {
-       struct tt_internal *ti;
+       struct target_type *tt;
 
        down_read(&_lock);
 
-       ti = __find_target_type(name);
-       if (ti) {
-               if ((ti->use == 0) && !try_module_get(ti->tt.module))
-                       ti = NULL;
-               else
-                       ti->use++;
-       }
+       tt = __find_target_type(name);
+       if (tt && !try_module_get(tt->module))
+               tt = NULL;
 
        up_read(&_lock);
-       return ti;
+       return tt;
 }
 
 static void load_module(const char *name)
@@ -62,92 +51,59 @@ static void load_module(const char *name)
 
 struct target_type *dm_get_target_type(const char *name)
 {
-       struct tt_internal *ti = get_target_type(name);
+       struct target_type *tt = get_target_type(name);
 
-       if (!ti) {
+       if (!tt) {
                load_module(name);
-               ti = get_target_type(name);
+               tt = get_target_type(name);
        }
 
-       return ti ? &ti->tt : NULL;
+       return tt;
 }
 
-void dm_put_target_type(struct target_type *t)
+void dm_put_target_type(struct target_type *tt)
 {
-       struct tt_internal *ti = (struct tt_internal *) t;
-
        down_read(&_lock);
-       if (--ti->use == 0)
-               module_put(ti->tt.module);
-
-       BUG_ON(ti->use < 0);
+       module_put(tt->module);
        up_read(&_lock);
-
-       return;
-}
-
-static struct tt_internal *alloc_target(struct target_type *t)
-{
-       struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL);
-
-       if (ti)
-               ti->tt = *t;
-
-       return ti;
 }
 
-
 int dm_target_iterate(void (*iter_func)(struct target_type *tt,
                                        void *param), void *param)
 {
-       struct tt_internal *ti;
+       struct target_type *tt;
 
        down_read(&_lock);
-       list_for_each_entry (ti, &_targets, list)
-               iter_func(&ti->tt, param);
+       list_for_each_entry(tt, &_targets, list)
+               iter_func(tt, param);
        up_read(&_lock);
 
        return 0;
 }
 
-int dm_register_target(struct target_type *t)
+int dm_register_target(struct target_type *tt)
 {
        int rv = 0;
-       struct tt_internal *ti = alloc_target(t);
-
-       if (!ti)
-               return -ENOMEM;
 
        down_write(&_lock);
-       if (__find_target_type(t->name))
+       if (__find_target_type(tt->name))
                rv = -EEXIST;
        else
-               list_add(&ti->list, &_targets);
+               list_add(&tt->list, &_targets);
 
        up_write(&_lock);
-       if (rv)
-               kfree(ti);
        return rv;
 }
 
-void dm_unregister_target(struct target_type *t)
+void dm_unregister_target(struct target_type *tt)
 {
-       struct tt_internal *ti;
-
        down_write(&_lock);
-       if (!(ti = __find_target_type(t->name))) {
-               DMCRIT("Unregistering unrecognised target: %s", t->name);
-               BUG();
-       }
-
-       if (ti->use) {
-               DMCRIT("Attempt to unregister target still in use: %s",
-                      t->name);
+       if (!__find_target_type(tt->name)) {
+               DMCRIT("Unregistering unrecognised target: %s", tt->name);
                BUG();
        }
 
-       list_del(&ti->list);
-       kfree(ti);
+       list_del(&tt->list);
 
        up_write(&_lock);
 }
@@ -156,17 +112,17 @@ void dm_unregister_target(struct target_type *t)
  * io-err: always fails an io, useful for bringing
  * up LVs that have holes in them.
  */
-static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
+static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args)
 {
        return 0;
 }
 
-static void io_err_dtr(struct dm_target *ti)
+static void io_err_dtr(struct dm_target *tt)
 {
        /* empty */
 }
 
-static int io_err_map(struct dm_target *ti, struct bio *bio,
+static int io_err_map(struct dm_target *tt, struct bio *bio,
                      union map_info *map_context)
 {
        return -EIO;
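[Illustrative sketch, not part of the patch] The dm-target.c changes above drop the tt_internal wrapper entirely: the list node now lives inside struct target_type itself, so registration needs no allocation and can no longer fail with -ENOMEM, and module reference counting replaces the hand-rolled use count. A rough user-space sketch of registering objects through such an embedded (intrusive) list node; target_like, find_target() and register_target() are made-up names, not dm APIs:

#include <pthread.h>
#include <string.h>

struct target_like {
        const char *name;
        struct target_like *next;       /* list node embedded in the object itself */
};

static struct target_like *targets;
static pthread_rwlock_t reg_lock = PTHREAD_RWLOCK_INITIALIZER;

static struct target_like *find_target(const char *name) /* caller holds reg_lock */
{
        struct target_like *t;

        for (t = targets; t; t = t->next)
                if (!strcmp(t->name, name))
                        return t;
        return NULL;
}

static int register_target(struct target_like *t)
{
        int rv = 0;

        pthread_rwlock_wrlock(&reg_lock);
        if (find_target(t->name)) {
                rv = -1;                /* -EEXIST in the kernel version */
        } else {
                t->next = targets;      /* no separate wrapper allocation needed */
                targets = t;
        }
        pthread_rwlock_unlock(&reg_lock);
        return rv;
}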
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8d40f27cce894e4a3c489a7d84d321fb39b363d6..788ba96a6256aaed6de8625d306cc4d5e33a82f9 100644 (file)
@@ -99,19 +99,9 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 /*
  * Work processed by per-device workqueue.
  */
-struct dm_wq_req {
-       enum {
-               DM_WQ_FLUSH_DEFERRED,
-       } type;
-       struct work_struct work;
-       struct mapped_device *md;
-       void *context;
-};
-
 struct mapped_device {
        struct rw_semaphore io_lock;
        struct mutex suspend_lock;
-       spinlock_t pushback_lock;
        rwlock_t map_lock;
        atomic_t holders;
        atomic_t open_count;
@@ -129,8 +119,9 @@ struct mapped_device {
         */
        atomic_t pending;
        wait_queue_head_t wait;
+       struct work_struct work;
        struct bio_list deferred;
-       struct bio_list pushback;
+       spinlock_t deferred_lock;
 
        /*
         * Processing queue (flush/barriers)
@@ -453,7 +444,9 @@ static int queue_io(struct mapped_device *md, struct bio *bio)
                return 1;
        }
 
+       spin_lock_irq(&md->deferred_lock);
        bio_list_add(&md->deferred, bio);
+       spin_unlock_irq(&md->deferred_lock);
 
        up_write(&md->io_lock);
        return 0;               /* deferred successfully */
@@ -537,16 +530,14 @@ static void dec_pending(struct dm_io *io, int error)
                if (io->error == DM_ENDIO_REQUEUE) {
                        /*
                         * Target requested pushing back the I/O.
-                        * This must be handled before the sleeper on
-                        * suspend queue merges the pushback list.
                         */
-                       spin_lock_irqsave(&md->pushback_lock, flags);
+                       spin_lock_irqsave(&md->deferred_lock, flags);
                        if (__noflush_suspending(md))
-                               bio_list_add(&md->pushback, io->bio);
+                               bio_list_add(&md->deferred, io->bio);
                        else
                                /* noflush suspend was interrupted. */
                                io->error = -EIO;
-                       spin_unlock_irqrestore(&md->pushback_lock, flags);
+                       spin_unlock_irqrestore(&md->deferred_lock, flags);
                }
 
                end_io_acct(io);
@@ -834,20 +825,22 @@ static int __clone_and_map(struct clone_info *ci)
 }
 
 /*
- * Split the bio into several clones.
+ * Split the bio into several clones and submit it to targets.
  */
-static int __split_bio(struct mapped_device *md, struct bio *bio)
+static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 {
        struct clone_info ci;
        int error = 0;
 
        ci.map = dm_get_table(md);
-       if (unlikely(!ci.map))
-               return -EIO;
+       if (unlikely(!ci.map)) {
+               bio_io_error(bio);
+               return;
+       }
        if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
                dm_table_put(ci.map);
                bio_endio(bio, -EOPNOTSUPP);
-               return 0;
+               return;
        }
        ci.md = md;
        ci.bio = bio;
@@ -867,8 +860,6 @@ static int __split_bio(struct mapped_device *md, struct bio *bio)
        /* drop the extra reference count */
        dec_pending(ci.io, error);
        dm_table_put(ci.map);
-
-       return 0;
 }
 /*-----------------------------------------------------------------
  * CRUD END
@@ -959,8 +950,9 @@ static int dm_request(struct request_queue *q, struct bio *bio)
                down_read(&md->io_lock);
        }
 
-       r = __split_bio(md, bio);
+       __split_and_process_bio(md, bio);
        up_read(&md->io_lock);
+       return 0;
 
 out_req:
        if (r < 0)
@@ -1074,6 +1066,8 @@ out:
 
 static struct block_device_operations dm_blk_dops;
 
+static void dm_wq_work(struct work_struct *work);
+
 /*
  * Allocate and initialise a blank device with a given minor.
  */
@@ -1101,7 +1095,7 @@ static struct mapped_device *alloc_dev(int minor)
 
        init_rwsem(&md->io_lock);
        mutex_init(&md->suspend_lock);
-       spin_lock_init(&md->pushback_lock);
+       spin_lock_init(&md->deferred_lock);
        rwlock_init(&md->map_lock);
        atomic_set(&md->holders, 1);
        atomic_set(&md->open_count, 0);
@@ -1118,6 +1112,7 @@ static struct mapped_device *alloc_dev(int minor)
        md->queue->backing_dev_info.congested_fn = dm_any_congested;
        md->queue->backing_dev_info.congested_data = md;
        blk_queue_make_request(md->queue, dm_request);
+       blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
        md->queue->unplug_fn = dm_unplug_all;
        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
@@ -1140,6 +1135,7 @@ static struct mapped_device *alloc_dev(int minor)
 
        atomic_set(&md->pending, 0);
        init_waitqueue_head(&md->wait);
+       INIT_WORK(&md->work, dm_wq_work);
        init_waitqueue_head(&md->eventq);
 
        md->disk->major = _major;
@@ -1379,18 +1375,24 @@ void dm_put(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_put);
 
-static int dm_wait_for_completion(struct mapped_device *md)
+static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 {
        int r = 0;
+       DECLARE_WAITQUEUE(wait, current);
+
+       dm_unplug_all(md->queue);
+
+       add_wait_queue(&md->wait, &wait);
 
        while (1) {
-               set_current_state(TASK_INTERRUPTIBLE);
+               set_current_state(interruptible);
 
                smp_mb();
                if (!atomic_read(&md->pending))
                        break;
 
-               if (signal_pending(current)) {
+               if (interruptible == TASK_INTERRUPTIBLE &&
+                   signal_pending(current)) {
                        r = -EINTR;
                        break;
                }
@@ -1399,67 +1401,40 @@ static int dm_wait_for_completion(struct mapped_device *md)
        }
        set_current_state(TASK_RUNNING);
 
+       remove_wait_queue(&md->wait, &wait);
+
        return r;
 }
 
 /*
  * Process the deferred bios
  */
-static void __flush_deferred_io(struct mapped_device *md)
+static void dm_wq_work(struct work_struct *work)
 {
+       struct mapped_device *md = container_of(work, struct mapped_device,
+                                               work);
        struct bio *c;
 
-       while ((c = bio_list_pop(&md->deferred))) {
-               if (__split_bio(md, c))
-                       bio_io_error(c);
-       }
-
-       clear_bit(DMF_BLOCK_IO, &md->flags);
-}
+       down_write(&md->io_lock);
 
-static void __merge_pushback_list(struct mapped_device *md)
-{
-       unsigned long flags;
+next_bio:
+       spin_lock_irq(&md->deferred_lock);
+       c = bio_list_pop(&md->deferred);
+       spin_unlock_irq(&md->deferred_lock);
 
-       spin_lock_irqsave(&md->pushback_lock, flags);
-       clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-       bio_list_merge_head(&md->deferred, &md->pushback);
-       bio_list_init(&md->pushback);
-       spin_unlock_irqrestore(&md->pushback_lock, flags);
-}
+       if (c) {
+               __split_and_process_bio(md, c);
+               goto next_bio;
+       }
 
-static void dm_wq_work(struct work_struct *work)
-{
-       struct dm_wq_req *req = container_of(work, struct dm_wq_req, work);
-       struct mapped_device *md = req->md;
+       clear_bit(DMF_BLOCK_IO, &md->flags);
 
-       down_write(&md->io_lock);
-       switch (req->type) {
-       case DM_WQ_FLUSH_DEFERRED:
-               __flush_deferred_io(md);
-               break;
-       default:
-               DMERR("dm_wq_work: unrecognised work type %d", req->type);
-               BUG();
-       }
        up_write(&md->io_lock);
 }
 
-static void dm_wq_queue(struct mapped_device *md, int type, void *context,
-                       struct dm_wq_req *req)
-{
-       req->type = type;
-       req->md = md;
-       req->context = context;
-       INIT_WORK(&req->work, dm_wq_work);
-       queue_work(md->wq, &req->work);
-}
-
-static void dm_queue_flush(struct mapped_device *md, int type, void *context)
+static void dm_queue_flush(struct mapped_device *md)
 {
-       struct dm_wq_req req;
-
-       dm_wq_queue(md, type, context, &req);
+       queue_work(md->wq, &md->work);
        flush_workqueue(md->wq);
 }
 
@@ -1534,7 +1509,6 @@ static void unlock_fs(struct mapped_device *md)
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
        struct dm_table *map = NULL;
-       DECLARE_WAITQUEUE(wait, current);
        int r = 0;
        int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
        int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
@@ -1584,28 +1558,22 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        down_write(&md->io_lock);
        set_bit(DMF_BLOCK_IO, &md->flags);
 
-       add_wait_queue(&md->wait, &wait);
        up_write(&md->io_lock);
 
-       /* unplug */
-       if (map)
-               dm_table_unplug_all(map);
-
        /*
         * Wait for the already-mapped ios to complete.
         */
-       r = dm_wait_for_completion(md);
+       r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
 
        down_write(&md->io_lock);
-       remove_wait_queue(&md->wait, &wait);
 
        if (noflush)
-               __merge_pushback_list(md);
+               clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
        up_write(&md->io_lock);
 
        /* were we interrupted ? */
        if (r < 0) {
-               dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+               dm_queue_flush(md);
 
                unlock_fs(md);
                goto out; /* pushback list is already flushed, so skip flush */
@@ -1645,7 +1613,7 @@ int dm_resume(struct mapped_device *md)
        if (r)
                goto out;
 
-       dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+       dm_queue_flush(md);
 
        unlock_fs(md);
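[Illustrative sketch, not part of the patch] In dm.c above, the per-request dm_wq_req allocation and the separate pushback list are gone: each mapped_device carries a single persistent work item, deferred bios live on one list protected by deferred_lock, and the work function pops entries one at a time under the lock and processes them outside it. A simplified pthread-based sketch of that drain loop (bio_like and process() are stand-ins, not dm APIs):

#include <pthread.h>
#include <stdlib.h>

struct bio_like {
        struct bio_like *next;
};

static struct bio_like *deferred;                /* deferred list head */
static pthread_mutex_t deferred_lock = PTHREAD_MUTEX_INITIALIZER;

static void process(struct bio_like *b)          /* stand-in for __split_and_process_bio() */
{
        free(b);
}

/* One long-lived work function: pop a deferred entry under the lock,
 * handle it outside the lock, stop when the list is empty. */
static void drain_deferred(void)
{
        struct bio_like *b;

        for (;;) {
                pthread_mutex_lock(&deferred_lock);
                b = deferred;
                if (b)
                        deferred = b->next;
                pthread_mutex_unlock(&deferred_lock);

                if (!b)
                        break;
                process(b);
        }
}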
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 20194e000c5afcda9e10965d17c945b0df5d7171..b48397c0abbd44b253c37ce56db3647f7513eb9f 100644 (file)
@@ -60,7 +60,7 @@ int dm_table_barrier_ok(struct dm_table *t);
 int dm_target_init(void);
 void dm_target_exit(void);
 struct target_type *dm_get_target_type(const char *name);
-void dm_put_target_type(struct target_type *t);
+void dm_put_target_type(struct target_type *tt);
 int dm_target_iterate(void (*iter_func)(struct target_type *tt,
                                        void *param), void *param);
 
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 86d9adf90e791857efdf674117168398b40735a1..8695809b24b05f049c13a3dc667bf2a02d9b4ba6 100644 (file)
 #define        ModeShift       5
 
 #define MaxFault       50
-#include <linux/raid/md.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include <linux/seq_file.h>
 
 
 static void faulty_fail(struct bio *bio, int error)
@@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
        return 0;
 }
 
+static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+       WARN_ONCE(raid_disks,
+                 "%s does not support generic reshape\n", __func__);
+
+       if (sectors == 0)
+               return mddev->dev_sectors;
+
+       return sectors;
+}
+
 static int run(mddev_t *mddev)
 {
        mdk_rdev_t *rdev;
@@ -298,7 +312,7 @@ static int run(mddev_t *mddev)
        list_for_each_entry(rdev, &mddev->disks, same_set)
                conf->rdev = rdev;
 
-       mddev->array_sectors = mddev->size * 2;
+       md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
        mddev->private = conf;
 
        reconfig(mddev, mddev->layout, -1);
@@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality =
        .stop           = stop,
        .status         = status,
        .reconfig       = reconfig,
+       .size           = faulty_size,
 };
 
 static int __init raid_init(void)
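[Illustrative sketch, not part of the patch] faulty.c above gains a .size method for the new md personality size hook: passing 0 sectors means "report the current size", any other value is returned unchanged, and a request that implies a reshape only triggers a warning because this personality cannot reshape. A tiny sketch of that convention (array_like and size_hook are made-up names, not md structures):

#include <stdint.h>
#include <stdio.h>

struct array_like {
        uint64_t dev_sectors;           /* current per-device size in sectors */
};

/* In the spirit of faulty_size()/linear_size(): 0 means "keep the current
 * size", anything else is used as-is, reshape requests just warn. */
static uint64_t size_hook(const struct array_like *a, uint64_t sectors,
                          int raid_disks)
{
        if (raid_disks)
                fprintf(stderr, "size_hook: reshape not supported\n");
        return sectors ? sectors : a->dev_sectors;
}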
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 09658b218474a3a8f676995f02e59fdfa10693b0..7a36e38393a1e9ff24defd03c756138a9f5da903 100644 (file)
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
 */
 
-#include <linux/raid/linear.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "linear.h"
 
 /*
  * find which device holds a particular offset 
@@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits)
        return ret;
 }
 
+static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+       linear_conf_t *conf = mddev_to_conf(mddev);
+
+       WARN_ONCE(sectors || raid_disks,
+                 "%s does not support generic reshape\n", __func__);
+
+       return conf->array_sectors;
+}
+
 static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 {
        linear_conf_t *conf;
@@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
                    mddev->queue->max_sectors > (PAGE_SIZE>>9))
                        blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-               disk->num_sectors = rdev->size * 2;
-               conf->array_sectors += rdev->size * 2;
+               disk->num_sectors = rdev->sectors;
+               conf->array_sectors += rdev->sectors;
 
                cnt++;
        }
@@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev)
        if (!conf)
                return 1;
        mddev->private = conf;
-       mddev->array_sectors = conf->array_sectors;
+       md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 
        blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
        mddev->queue->unplug_fn = linear_unplug;
@@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
        newconf->prev = mddev_to_conf(mddev);
        mddev->private = newconf;
        mddev->raid_disks++;
-       mddev->array_sectors = newconf->array_sectors;
+       md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
        set_capacity(mddev->gendisk, mddev->array_sectors);
        return 0;
 }
@@ -381,6 +395,7 @@ static struct mdk_personality linear_personality =
        .stop           = linear_stop,
        .status         = linear_status,
        .hot_add_disk   = linear_add,
+       .size           = linear_size,
 };
 
 static int __init linear_init (void)
diff --git a/include/linux/raid/linear.h b/drivers/md/linear.h
similarity index 95%
rename from include/linux/raid/linear.h
rename to drivers/md/linear.h
index f38b9c586afbc38c2a3187ac479735b14ad5fffb..bf8179587f950b2ace4eba54a73abaf4960463ab 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _LINEAR_H
 #define _LINEAR_H
 
-#include <linux/raid/md.h>
-
 struct dev_info {
        mdk_rdev_t      *rdev;
        sector_t        num_sectors;
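[Illustrative sketch, not part of the patch] The md.c hunks below replace rdev->size (counted in 1 KiB blocks) with rdev->sectors (512-byte sectors), and sysfs input is converted by the new strict_blocks_to_sectors(), which doubles the block count and rejects values whose doubling would overflow. A stand-alone arithmetic sketch of that conversion (blocks_to_sectors() here is an illustrative analogue, not the kernel function):

#include <stdint.h>
#include <stdio.h>

/* 1 KiB block count -> 512-byte sector count, rejecting inputs whose
 * doubling would overflow 64 bits. */
static int blocks_to_sectors(unsigned long long blocks, uint64_t *sectors)
{
        if (blocks & (1ULL << 63))
                return -1;              /* blocks * 2 would overflow */
        *sectors = blocks * 2;
        return 0;
}

int main(void)
{
        uint64_t s;

        if (blocks_to_sectors(4096, &s) == 0)
                printf("4096 KiB blocks = %llu sectors\n", (unsigned long long)s);
        return 0;
}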
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a307f87eb90ee361ea6c36b2cbdd94c8886dfdec..ed5727c089a9403b2ad10bb7e581beae5b0d36ac 100644 (file)
@@ -33,9 +33,9 @@
 */
 
 #include <linux/kthread.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
 #include <linux/reboot.h>
 #include <linux/file.h>
 #include <linux/delay.h>
-
-#define MAJOR_NR MD_MAJOR
-
-/* 63 partitions with the alternate major number (mdp) */
-#define MdpMinorShift 6
+#include <linux/raid/md_p.h>
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include "bitmap.h"
 
 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
@@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
                )
 
 
-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
-       bio_io_error(bio);
-       return 0;
+       mddev_t *mddev = q->queuedata;
+       int rv;
+       if (mddev == NULL || mddev->pers == NULL) {
+               bio_io_error(bio);
+               return 0;
+       }
+       rcu_read_lock();
+       if (mddev->suspended) {
+               DEFINE_WAIT(__wait);
+               for (;;) {
+                       prepare_to_wait(&mddev->sb_wait, &__wait,
+                                       TASK_UNINTERRUPTIBLE);
+                       if (!mddev->suspended)
+                               break;
+                       rcu_read_unlock();
+                       schedule();
+                       rcu_read_lock();
+               }
+               finish_wait(&mddev->sb_wait, &__wait);
+       }
+       atomic_inc(&mddev->active_io);
+       rcu_read_unlock();
+       rv = mddev->pers->make_request(q, bio);
+       if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+               wake_up(&mddev->sb_wait);
+
+       return rv;
+}
+
+static void mddev_suspend(mddev_t *mddev)
+{
+       BUG_ON(mddev->suspended);
+       mddev->suspended = 1;
+       synchronize_rcu();
+       wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+       mddev->pers->quiesce(mddev, 1);
+       md_unregister_thread(mddev->thread);
+       mddev->thread = NULL;
+       /* we now know that no code is executing in the personality module,
+        * except possibly the tail end of a ->bi_end_io function, but that
+        * is certain to complete before the module has a chance to get
+        * unloaded
+        */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+       mddev->suspended = 0;
+       wake_up(&mddev->sb_wait);
+       mddev->pers->quiesce(mddev, 0);
 }
 
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
        atomic_inc(&mddev->active);
@@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
        atomic_set(&new->openers, 0);
+       atomic_set(&new->active_io, 0);
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        init_waitqueue_head(&new->recovery_wait);
@@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev)
        return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }
 
+static inline int mddev_is_locked(mddev_t *mddev)
+{
+       return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_trylock(mddev_t * mddev)
 {
        return mutex_trylock(&mddev->reconfig_mutex);
@@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
                rdev->sb_loaded = 0;
                rdev->sb_page = NULL;
                rdev->sb_start = 0;
-               rdev->size = 0;
+               rdev->sectors = 0;
        }
 }
 
@@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
                else 
                        ret = 0;
        }
-       rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
+       rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);
 
-       if (rdev->size < sb->size && sb->level > 1)
+       if (rdev->sectors < sb->size * 2 && sb->level > 1)
                /* "this cannot possibly happen" ... */
                ret = -EINVAL;
 
@@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->clevel[0] = 0;
                mddev->layout = sb->layout;
                mddev->raid_disks = sb->raid_disks;
-               mddev->size = sb->size;
+               mddev->dev_sectors = sb->size * 2;
                mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
        sb->ctime = mddev->ctime;
        sb->level = mddev->level;
-       sb->size  = mddev->size;
+       sb->size = mddev->dev_sectors / 2;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
        sb->not_persistent = 0;
@@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 static unsigned long long
 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
-       if (num_sectors && num_sectors < rdev->mddev->size * 2)
+       if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
                return 0; /* component must fit device */
        if (rdev->mddev->bitmap_offset)
                return 0; /* can't move bitmap */
@@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
                        ret = 0;
        }
        if (minor_version)
-               rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+               rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+                       le64_to_cpu(sb->data_offset);
        else
-               rdev->size = rdev->sb_start / 2;
-       if (rdev->size < le64_to_cpu(sb->data_size)/2)
+               rdev->sectors = rdev->sb_start;
+       if (rdev->sectors < le64_to_cpu(sb->data_size))
                return -EINVAL;
-       rdev->size = le64_to_cpu(sb->data_size)/2;
+       rdev->sectors = le64_to_cpu(sb->data_size);
        if (le32_to_cpu(sb->chunksize))
-               rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+               rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
 
-       if (le64_to_cpu(sb->size) > rdev->size*2)
+       if (le64_to_cpu(sb->size) > rdev->sectors)
                return -EINVAL;
        return ret;
 }
@@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->clevel[0] = 0;
                mddev->layout = le32_to_cpu(sb->layout);
                mddev->raid_disks = le32_to_cpu(sb->raid_disks);
-               mddev->size = le64_to_cpu(sb->size)/2;
+               mddev->dev_sectors = le64_to_cpu(sb->size);
                mddev->events = ev1;
                mddev->bitmap_offset = 0;
                mddev->default_bitmap_offset = 1024 >> 9;
@@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
 
        sb->raid_disks = cpu_to_le32(mddev->raid_disks);
-       sb->size = cpu_to_le64(mddev->size<<1);
+       sb->size = cpu_to_le64(mddev->dev_sectors);
 
        if (mddev->bitmap && mddev->bitmap_file == NULL) {
                sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
@@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        }
 
        if (rdev->raid_disk >= 0 &&
-           !test_bit(In_sync, &rdev->flags) &&
-           rdev->recovery_offset > 0) {
-               sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
-               sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+           !test_bit(In_sync, &rdev->flags)) {
+               if (mddev->curr_resync_completed > rdev->recovery_offset)
+                       rdev->recovery_offset = mddev->curr_resync_completed;
+               if (rdev->recovery_offset > 0) {
+                       sb->feature_map |=
+                               cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+                       sb->recovery_offset =
+                               cpu_to_le64(rdev->recovery_offset);
+               }
        }
 
        if (mddev->reshape_position != MaxSector) {
@@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
        struct mdp_superblock_1 *sb;
        sector_t max_sectors;
-       if (num_sectors && num_sectors < rdev->mddev->size * 2)
+       if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
                return 0; /* component must fit device */
        if (rdev->sb_start < rdev->data_offset) {
                /* minor versions 1 and 2; superblock before data */
@@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
                sector_t sb_start;
                sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
                sb_start &= ~(sector_t)(4*2 - 1);
-               max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+               max_sectors = rdev->sectors + sb_start - rdev->sb_start;
                if (!num_sectors || num_sectors > max_sectors)
                        num_sectors = max_sectors;
                rdev->sb_start = sb_start;
@@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 
 static LIST_HEAD(pending_raid_disks);
 
+static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+       struct mdk_personality *pers = mddev->pers;
+       struct gendisk *disk = mddev->gendisk;
+       struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
+       struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+
+       /* Data integrity passthrough not supported on RAID 4, 5 and 6 */
+       if (pers && pers->level >= 4 && pers->level <= 6)
+               return;
+
+       /* If rdev is integrity capable, register profile for mddev */
+       if (!bi_mddev && bi_rdev) {
+               if (blk_integrity_register(disk, bi_rdev))
+                       printk(KERN_ERR "%s: %s Could not register integrity!\n",
+                              __func__, disk->disk_name);
+               else
+                       printk(KERN_NOTICE "Enabling data integrity on %s\n",
+                              disk->disk_name);
+               return;
+       }
+
+       /* Check that mddev and rdev have matching profiles */
+       if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
+               printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
+                      disk->disk_name, rdev->bdev->bd_disk->disk_name);
+               printk(KERN_NOTICE "Disabling data integrity on %s\n",
+                      disk->disk_name);
+               blk_integrity_unregister(disk);
+       }
+}
+
 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
        char b[BDEVNAME_SIZE];
@@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
        if (find_rdev(mddev, rdev->bdev->bd_dev))
                return -EEXIST;
 
-       /* make sure rdev->size exceeds mddev->size */
-       if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+       /* make sure rdev->sectors exceeds mddev->dev_sectors */
+       if (rdev->sectors && (mddev->dev_sectors == 0 ||
+                       rdev->sectors < mddev->dev_sectors)) {
                if (mddev->pers) {
                        /* Cannot change size, so fail
                         * If mddev->level <= 0, then we don't care
@@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
                        if (mddev->level > 0)
                                return -ENOSPC;
                } else
-                       mddev->size = rdev->size;
+                       mddev->dev_sectors = rdev->sectors;
        }
 
        /* Verify rdev->desc_nr is unique.
@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
        /* May as well allow recovery to be retried once */
        mddev->recovery_disabled = 0;
+
+       md_integrity_check(rdev, mddev);
        return 0;
 
  fail:
@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 static void print_rdev(mdk_rdev_t *rdev, int major_version)
 {
        char b[BDEVNAME_SIZE];
-       printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
-               bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+       printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
+               bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
                rdev->desc_nr);
        if (rdev->sb_loaded) {
@@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                return -EINVAL;
        if (rdev->mddev->pers && rdev->raid_disk >= 0)
                return -EBUSY;
-       if (rdev->size && rdev->mddev->external)
+       if (rdev->sectors && rdev->mddev->external)
                /* Must set offset before size, so overlap checks
                 * can be sane */
                return -EBUSY;
@@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
 {
-       return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+       return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }
 
 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
@@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
        return 1;
 }
 
+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+       unsigned long long blocks;
+       sector_t new;
+
+       if (strict_strtoull(buf, 10, &blocks) < 0)
+               return -EINVAL;
+
+       if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+               return -EINVAL; /* sector conversion overflow */
+
+       new = blocks * 2;
+       if (new != blocks * 2)
+               return -EINVAL; /* unsigned long long to sector_t overflow */
+
+       *sectors = new;
+       return 0;
+}
+
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-       unsigned long long size;
-       unsigned long long oldsize = rdev->size;
        mddev_t *my_mddev = rdev->mddev;
+       sector_t oldsectors = rdev->sectors;
+       sector_t sectors;
 
-       if (strict_strtoull(buf, 10, &size) < 0)
+       if (strict_blocks_to_sectors(buf, &sectors) < 0)
                return -EINVAL;
        if (my_mddev->pers && rdev->raid_disk >= 0) {
                if (my_mddev->persistent) {
-                       size = super_types[my_mddev->major_version].
-                               rdev_size_change(rdev, size * 2);
-                       if (!size)
+                       sectors = super_types[my_mddev->major_version].
+                               rdev_size_change(rdev, sectors);
+                       if (!sectors)
                                return -EBUSY;
-               } else if (!size) {
-                       size = (rdev->bdev->bd_inode->i_size >> 10);
-                       size -= rdev->data_offset/2;
-               }
+               } else if (!sectors)
+                       sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+                               rdev->data_offset;
        }
-       if (size < my_mddev->size)
+       if (sectors < my_mddev->dev_sectors)
                return -EINVAL; /* component must fit device */
 
-       rdev->size = size;
-       if (size > oldsize && my_mddev->external) {
+       rdev->sectors = sectors;
+       if (sectors > oldsectors && my_mddev->external) {
                /* need to check that all other rdevs with the same ->bdev
                 * do not overlap.  We need to unlock the mddev to avoid
-                * a deadlock.  We have already changed rdev->size, and if
+                * a deadlock.  We have already changed rdev->sectors, and if
                 * we have to change it back, we will have the lock again.
                 */
                mddev_t *mddev;
@@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                                if (test_bit(AllReserved, &rdev2->flags) ||
                                    (rdev->bdev == rdev2->bdev &&
                                     rdev != rdev2 &&
-                                    overlaps(rdev->data_offset, rdev->size * 2,
+                                    overlaps(rdev->data_offset, rdev->sectors,
                                              rdev2->data_offset,
-                                             rdev2->size * 2))) {
+                                             rdev2->sectors))) {
                                        overlap = 1;
                                        break;
                                }
@@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                if (overlap) {
                        /* Someone else could have slipped in a size
                         * change here, but doing so is just silly.
-                        * We put oldsize back because we *know* it is
+                        * We put oldsectors back because we *know* it is
                         * safe, and trust userspace not to race with
                         * itself
                         */
-                       rdev->size = oldsize;
+                       rdev->sectors = oldsectors;
                        return -EBUSY;
                }
        }
@@ -2547,18 +2667,101 @@ level_show(mddev_t *mddev, char *page)
 static ssize_t
 level_store(mddev_t *mddev, const char *buf, size_t len)
 {
+       char level[16];
        ssize_t rv = len;
-       if (mddev->pers)
+       struct mdk_personality *pers;
+       void *priv;
+
+       if (mddev->pers == NULL) {
+               if (len == 0)
+                       return 0;
+               if (len >= sizeof(mddev->clevel))
+                       return -ENOSPC;
+               strncpy(mddev->clevel, buf, len);
+               if (mddev->clevel[len-1] == '\n')
+                       len--;
+               mddev->clevel[len] = 0;
+               mddev->level = LEVEL_NONE;
+               return rv;
+       }
+
+       /* request to change the personality.  Need to ensure:
+        *  - array is not engaged in resync/recovery/reshape
+        *  - old personality can be suspended
+        *  - new personality will access other array.
+        */
+
+       if (mddev->sync_thread || mddev->reshape_position != MaxSector)
                return -EBUSY;
-       if (len == 0)
-               return 0;
-       if (len >= sizeof(mddev->clevel))
-               return -ENOSPC;
-       strncpy(mddev->clevel, buf, len);
-       if (mddev->clevel[len-1] == '\n')
+
+       if (!mddev->pers->quiesce) {
+               printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+                      mdname(mddev), mddev->pers->name);
+               return -EINVAL;
+       }
+
+       /* Now find the new personality */
+       if (len == 0 || len >= sizeof(level))
+               return -EINVAL;
+       strncpy(level, buf, len);
+       if (level[len-1] == '\n')
                len--;
-       mddev->clevel[len] = 0;
-       mddev->level = LEVEL_NONE;
+       level[len] = 0;
+
+       request_module("md-%s", level);
+       spin_lock(&pers_lock);
+       pers = find_pers(LEVEL_NONE, level);
+       if (!pers || !try_module_get(pers->owner)) {
+               spin_unlock(&pers_lock);
+               printk(KERN_WARNING "md: personality %s not loaded\n", level);
+               return -EINVAL;
+       }
+       spin_unlock(&pers_lock);
+
+       if (pers == mddev->pers) {
+               /* Nothing to do! */
+               module_put(pers->owner);
+               return rv;
+       }
+       if (!pers->takeover) {
+               module_put(pers->owner);
+               printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+                      mdname(mddev), level);
+               return -EINVAL;
+       }
+
+       /* ->takeover must set new_* and/or delta_disks
+        * if it succeeds, and may set them when it fails.
+        */
+       priv = pers->takeover(mddev);
+       if (IS_ERR(priv)) {
+               mddev->new_level = mddev->level;
+               mddev->new_layout = mddev->layout;
+               mddev->new_chunk = mddev->chunk_size;
+               mddev->raid_disks -= mddev->delta_disks;
+               mddev->delta_disks = 0;
+               module_put(pers->owner);
+               printk(KERN_WARNING "md: %s: %s would not accept array\n",
+                      mdname(mddev), level);
+               return PTR_ERR(priv);
+       }
+
+       /* Looks like we have a winner */
+       mddev_suspend(mddev);
+       mddev->pers->stop(mddev);
+       module_put(mddev->pers->owner);
+       mddev->pers = pers;
+       mddev->private = priv;
+       strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+       mddev->level = mddev->new_level;
+       mddev->layout = mddev->new_layout;
+       mddev->chunk_size = mddev->new_chunk;
+       mddev->delta_disks = 0;
+       pers->run(mddev);
+       mddev_resume(mddev);
+       set_bit(MD_CHANGE_DEVS, &mddev->flags);
+       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+       md_wakeup_thread(mddev->thread);
        return rv;
 }
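
(When the array is not yet running, level_store() above simply records the requested level string; the bounded copy and trailing-newline strip are the usual sysfs store pattern, repeated for the local buffer used in the takeover path. A small self-contained sketch of that parsing step, illustrative only:

    #include <stdio.h>
    #include <string.h>

    /* Copy a sysfs-style input ("raid5\n") into a fixed buffer, dropping one
     * trailing newline and rejecting anything that will not fit. */
    static int parse_level(const char *buf, size_t len, char *level, size_t level_sz)
    {
            if (len == 0 || len >= level_sz)
                    return -1;
            strncpy(level, buf, len);
            if (level[len - 1] == '\n')
                    len--;
            level[len] = '\0';
            return 0;
    }

    int main(void)
    {
            char level[16];

            if (parse_level("raid5\n", 6, level, sizeof(level)) == 0)
                    printf("level=\"%s\"\n", level);        /* level="raid5" */
            return 0;
    }
)
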
 
@@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
        if (!*buf || (*e && *e != '\n'))
                return -EINVAL;
 
-       if (mddev->pers)
-               return -EBUSY;
-       if (mddev->reshape_position != MaxSector)
+       if (mddev->pers) {
+               int err;
+               if (mddev->pers->reconfig == NULL)
+                       return -EBUSY;
+               err = mddev->pers->reconfig(mddev, n, -1);
+               if (err)
+                       return err;
+       } else {
                mddev->new_layout = n;
-       else
-               mddev->layout = n;
+               if (mddev->reshape_position == MaxSector)
+                       mddev->layout = n;
+       }
        return len;
 }
 static struct md_sysfs_entry md_layout =
@@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page)
 static ssize_t
 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 {
-       /* can only set chunk_size if array is not yet active */
        char *e;
        unsigned long n = simple_strtoul(buf, &e, 10);
 
        if (!*buf || (*e && *e != '\n'))
                return -EINVAL;
 
-       if (mddev->pers)
-               return -EBUSY;
-       else if (mddev->reshape_position != MaxSector)
+       if (mddev->pers) {
+               int err;
+               if (mddev->pers->reconfig == NULL)
+                       return -EBUSY;
+               err = mddev->pers->reconfig(mddev, -1, n);
+               if (err)
+                       return err;
+       } else {
                mddev->new_chunk = n;
-       else
-               mddev->chunk_size = n;
+               if (mddev->reshape_position == MaxSector)
+                       mddev->chunk_size = n;
+       }
        return len;
 }
 static struct md_sysfs_entry md_chunk_size =
@@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 static ssize_t
 resync_start_show(mddev_t *mddev, char *page)
 {
+       if (mddev->recovery_cp == MaxSector)
+               return sprintf(page, "none\n");
        return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
 }
 
@@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page)
        else {
                if (list_empty(&mddev->disks) &&
                    mddev->raid_disks == 0 &&
-                   mddev->size == 0)
+                   mddev->dev_sectors == 0)
                        st = clear;
                else
                        st = inactive;
@@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
 static ssize_t
 size_show(mddev_t *mddev, char *page)
 {
-       return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+       return sprintf(page, "%llu\n",
+               (unsigned long long)mddev->dev_sectors / 2);
 }
 
 static int update_size(mddev_t *mddev, sector_t num_sectors);
@@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
         * not increase it (except from 0).
         * If array is active, we can try an on-line resize
         */
-       char *e;
-       int err = 0;
-       unsigned long long size = simple_strtoull(buf, &e, 10);
-       if (!*buf || *buf == '\n' ||
-           (*e && *e != '\n'))
-               return -EINVAL;
+       sector_t sectors;
+       int err = strict_blocks_to_sectors(buf, &sectors);
 
+       if (err < 0)
+               return err;
        if (mddev->pers) {
-               err = update_size(mddev, size * 2);
+               err = update_size(mddev, sectors);
                md_update_sb(mddev, 1);
        } else {
-               if (mddev->size == 0 ||
-                   mddev->size > size)
-                       mddev->size = size;
+               if (mddev->dev_sectors == 0 ||
+                   mddev->dev_sectors > sectors)
+                       mddev->dev_sectors = sectors;
                else
                        err = -ENOSPC;
        }
@@ -3251,6 +3466,8 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
        unsigned long resync, dt, db;
+       if (mddev->curr_resync == 0)
+               return sprintf(page, "none\n");
        resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
        dt = (jiffies - mddev->resync_mark) / HZ;
        if (!dt) dt++;
@@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
 {
-       unsigned long max_blocks, resync;
+       unsigned long max_sectors, resync;
 
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-               max_blocks = mddev->resync_max_sectors;
+               max_sectors = mddev->resync_max_sectors;
        else
-               max_blocks = mddev->size << 1;
+               max_sectors = mddev->dev_sectors;
 
        resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
-       return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+       return sprintf(page, "%lu / %lu\n", resync, max_sectors);
 }
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position =
 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
        reshape_position_store);
 
+static ssize_t
+array_size_show(mddev_t *mddev, char *page)
+{
+       if (mddev->external_size)
+               return sprintf(page, "%llu\n",
+                              (unsigned long long)mddev->array_sectors/2);
+       else
+               return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       sector_t sectors;
+
+       if (strncmp(buf, "default", 7) == 0) {
+               if (mddev->pers)
+                       sectors = mddev->pers->size(mddev, 0, 0);
+               else
+                       sectors = mddev->array_sectors;
+
+               mddev->external_size = 0;
+       } else {
+               if (strict_blocks_to_sectors(buf, &sectors) < 0)
+                       return -EINVAL;
+               if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+                       return -EINVAL;
+
+               mddev->external_size = 1;
+       }
+
+       mddev->array_sectors = sectors;
+       set_capacity(mddev->gendisk, mddev->array_sectors);
+       if (mddev->pers) {
+               struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
+
+               if (bdev) {
+                       mutex_lock(&bdev->bd_inode->i_mutex);
+                       i_size_write(bdev->bd_inode,
+                                    (loff_t)mddev->array_sectors << 9);
+                       mutex_unlock(&bdev->bd_inode->i_mutex);
+                       bdput(bdev);
+               }
+       }
+
+       return len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+       array_size_store);
 
 static struct attribute *md_default_attrs[] = {
        &md_level.attr,
@@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = {
        &md_safe_delay.attr,
        &md_array_state.attr,
        &md_reshape_position.attr,
+       &md_array_size.attr,
        NULL,
 };
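
(The new array_size attribute lets userspace either pin the exported size, setting external_size so md stops recomputing array_sectors on its own, or write "default" to hand control back to the personality's ->size() calculation. A toy model of that policy with invented structure and field names:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint64_t sector_t;

    struct array {
            sector_t array_sectors;         /* size exported to the block layer */
            sector_t default_sectors;       /* what the personality would pick */
            int external_size;              /* 1 = userspace pinned the size */
    };

    /* "default" returns control to the personality; a number (in 1K blocks)
     * pins the exported size, but never beyond what the personality offers. */
    static int set_array_size(struct array *a, const char *buf)
    {
            unsigned long long kb;

            if (strncmp(buf, "default", 7) == 0) {
                    a->external_size = 0;
                    a->array_sectors = a->default_sectors;
                    return 0;
            }
            if (sscanf(buf, "%llu", &kb) != 1 || kb * 2 > a->default_sectors)
                    return -1;
            a->external_size = 1;
            a->array_sectors = kb * 2;
            return 0;
    }

    int main(void)
    {
            struct array a = { 0, 2097152, 0 };             /* personality offers 1 GiB */

            set_array_size(&a, "524288\n");                 /* pin to 512 MiB */
            printf("%llu sectors, external=%d\n",
                   (unsigned long long)a.array_sectors, a.external_size);
            set_array_size(&a, "default\n");
            printf("%llu sectors, external=%d\n",
                   (unsigned long long)a.array_sectors, a.external_size);
            return 0;
    }
)
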
 
@@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name)
                mddev_put(mddev);
                return -ENOMEM;
        }
+       mddev->queue->queuedata = mddev;
+
        /* Can be unlocked because the queue is new: no concurrency */
        queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
 
-       blk_queue_make_request(mddev->queue, md_fail_request);
+       blk_queue_make_request(mddev->queue, md_make_request);
 
        disk = alloc_disk(1 << shift);
        if (!disk) {
@@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev)
                list_for_each_entry(rdev, &mddev->disks, same_set) {
                        if (test_bit(Faulty, &rdev->flags))
                                continue;
-                       if (rdev->size < chunk_size / 1024) {
+                       if (rdev->sectors < chunk_size / 512) {
                                printk(KERN_WARNING
                                        "md: Dev %s smaller than chunk_size:"
-                                       " %lluk < %dk\n",
+                                       " %llu < %d\n",
                                        bdevname(rdev->bdev,b),
-                                       (unsigned long long)rdev->size,
-                                       chunk_size / 1024);
+                                       (unsigned long long)rdev->sectors,
+                                       chunk_size / 512);
                                return -EINVAL;
                        }
                }
@@ -3761,11 +4032,11 @@ static int do_md_run(mddev_t * mddev)
 
                /* perform some consistency tests on the device.
                 * We don't want the data to overlap the metadata,
-                * Internal Bitmap issues has handled elsewhere.
+                * Internal Bitmap issues have been handled elsewhere.
                 */
                if (rdev->data_offset < rdev->sb_start) {
-                       if (mddev->size &&
-                           rdev->data_offset + mddev->size*2
+                       if (mddev->dev_sectors &&
+                           rdev->data_offset + mddev->dev_sectors
                            > rdev->sb_start) {
                                printk("md: %s: data overlaps metadata\n",
                                       mdname(mddev));
@@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev)
        }
        mddev->pers = pers;
        spin_unlock(&pers_lock);
-       mddev->level = pers->level;
+       if (mddev->level != pers->level) {
+               mddev->level = pers->level;
+               mddev->new_level = pers->level;
+       }
        strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 
+       if (pers->level >= 4 && pers->level <= 6)
+               /* Cannot support integrity (yet) */
+               blk_integrity_unregister(mddev->gendisk);
+
        if (mddev->reshape_position != MaxSector &&
            pers->start_reshape == NULL) {
                /* This personality cannot handle reshaping... */
@@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev)
        }
 
        mddev->recovery = 0;
-       mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+       /* may be over-ridden by personality */
+       mddev->resync_max_sectors = mddev->dev_sectors;
+
        mddev->barriers_work = 1;
        mddev->ok_start_degraded = start_dirty_degraded;
 
@@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev)
        err = mddev->pers->run(mddev);
        if (err)
                printk(KERN_ERR "md: pers->run() failed ...\n");
-       else if (mddev->pers->sync_request) {
+       else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+               WARN_ONCE(!mddev->external_size, "%s: default size too small,"
+                         " but 'external_size' not in effect?\n", __func__);
+               printk(KERN_ERR
+                      "md: invalid array_size %llu > default size %llu\n",
+                      (unsigned long long)mddev->array_sectors / 2,
+                      (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+               err = -EINVAL;
+               mddev->pers->stop(mddev);
+       }
+       if (err == 0 && mddev->pers->sync_request) {
                err = bitmap_create(mddev);
                if (err) {
                        printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev)
 
        set_capacity(disk, mddev->array_sectors);
 
-       /* If we call blk_queue_make_request here, it will
-        * re-initialise max_sectors etc which may have been
-        * refined inside -> run.  So just set the bits we need to set.
-        * Most initialisation happended when we called
-        * blk_queue_make_request(..., md_fail_request)
-        * earlier.
-        */
-       mddev->queue->queuedata = mddev;
-       mddev->queue->make_request_fn = mddev->pers->make_request;
-
        /* If there is a partially-recovered drive we need to
         * start recovery here.  If we leave it to md_check_recovery,
         * it will remove the drives and not do the right thing
@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                        md_super_wait(mddev);
                        if (mddev->ro)
                                set_disk_ro(disk, 0);
-                       blk_queue_make_request(mddev->queue, md_fail_request);
+
                        mddev->pers->stop(mddev);
                        mddev->queue->merge_bvec_fn = NULL;
                        mddev->queue->unplug_fn = NULL;
@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                export_array(mddev);
 
                mddev->array_sectors = 0;
-               mddev->size = 0;
+               mddev->external_size = 0;
+               mddev->dev_sectors = 0;
                mddev->raid_disks = 0;
                mddev->recovery_cp = 0;
                mddev->resync_min = 0;
@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                printk(KERN_INFO "md: %s switched to read-only mode.\n",
                        mdname(mddev));
        err = 0;
+       blk_integrity_unregister(disk);
        md_new_event(mddev);
        sysfs_notify_dirent(mddev->sysfs_state);
 out:
@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
        info.patch_version = MD_PATCHLEVEL_VERSION;
        info.ctime         = mddev->ctime;
        info.level         = mddev->level;
-       info.size          = mddev->size;
-       if (info.size != mddev->size) /* overflow */
+       info.size          = mddev->dev_sectors / 2;
+       if (info.size != mddev->dev_sectors / 2) /* overflow */
                info.size = -1;
        info.nr_disks      = nr;
        info.raid_disks    = mddev->raid_disks;
@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                clear_bit(In_sync, &rdev->flags); /* just to be sure */
                if (info->state & (1<<MD_DISK_WRITEMOSTLY))
                        set_bit(WriteMostly, &rdev->flags);
+               else
+                       clear_bit(WriteMostly, &rdev->flags);
 
                rdev->raid_disk = -1;
                err = bind_rdev_to_array(rdev, mddev);
@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                        rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
                } else 
                        rdev->sb_start = calc_dev_sboffset(rdev->bdev);
-               rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+               rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
 
                err = bind_rdev_to_array(rdev, mddev);
                if (err) {
@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
        else
                rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
-       rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
+       rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
 
        if (test_bit(Faulty, &rdev->flags)) {
                printk(KERN_WARNING 
@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 
        mddev->level         = info->level;
        mddev->clevel[0]     = 0;
-       mddev->size          = info->size;
+       mddev->dev_sectors   = 2 * (sector_t)info->size;
        mddev->raid_disks    = info->raid_disks;
        /* don't set md_minor, it is determined by which /dev/md* was
         * openned
@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
        return 0;
 }
 
+void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
+{
+       WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
+
+       if (mddev->external_size)
+               return;
+
+       mddev->array_sectors = array_sectors;
+}
+EXPORT_SYMBOL(md_set_array_sectors);
+
 static int update_size(mddev_t *mddev, sector_t num_sectors)
 {
        mdk_rdev_t *rdev;
@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
                 */
                return -EBUSY;
        list_for_each_entry(rdev, &mddev->disks, same_set) {
-               sector_t avail;
-               avail = rdev->size * 2;
+               sector_t avail = rdev->sectors;
 
                if (fit && (num_sectors == 0 || num_sectors > avail))
                        num_sectors = avail;
@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
                )
                return -EINVAL;
        /* Check there is only one change */
-       if (info->size >= 0 && mddev->size != info->size) cnt++;
-       if (mddev->raid_disks != info->raid_disks) cnt++;
-       if (mddev->layout != info->layout) cnt++;
-       if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
-       if (cnt == 0) return 0;
-       if (cnt > 1) return -EINVAL;
+       if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
+               cnt++;
+       if (mddev->raid_disks != info->raid_disks)
+               cnt++;
+       if (mddev->layout != info->layout)
+               cnt++;
+       if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
+               cnt++;
+       if (cnt == 0)
+               return 0;
+       if (cnt > 1)
+               return -EINVAL;
 
        if (mddev->layout != info->layout) {
                /* Change layout
@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
                else
                        return mddev->pers->reconfig(mddev, info->layout, -1);
        }
-       if (info->size >= 0 && mddev->size != info->size)
+       if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
                rv = update_size(mddev, (sector_t)info->size * 2);
 
        if (mddev->raid_disks    != info->raid_disks)
@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 
 void md_unregister_thread(mdk_thread_t *thread)
 {
+       if (!thread)
+               return;
        dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 
        kthread_stop(thread->tsk);
@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
                max_blocks = mddev->resync_max_sectors >> 1;
        else
-               max_blocks = mddev->size;
+               max_blocks = mddev->dev_sectors / 2;
 
        /*
         * Should not happen.
@@ -5537,7 +5839,7 @@ struct mdstat_info {
 static int md_seq_show(struct seq_file *seq, void *v)
 {
        mddev_t *mddev = v;
-       sector_t size;
+       sector_t sectors;
        mdk_rdev_t *rdev;
        struct mdstat_info *mi = seq->private;
        struct bitmap *bitmap;
@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
                        seq_printf(seq, " %s", mddev->pers->name);
                }
 
-               size = 0;
+               sectors = 0;
                list_for_each_entry(rdev, &mddev->disks, same_set) {
                        char b[BDEVNAME_SIZE];
                        seq_printf(seq, " %s[%d]",
@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
                                continue;
                        } else if (rdev->raid_disk < 0)
                                seq_printf(seq, "(S)"); /* spare */
-                       size += rdev->size;
+                       sectors += rdev->sectors;
                }
 
                if (!list_empty(&mddev->disks)) {
@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
                                           mddev->array_sectors / 2);
                        else
                                seq_printf(seq, "\n      %llu blocks",
-                                          (unsigned long long)size);
+                                          (unsigned long long)sectors / 2);
                }
                if (mddev->persistent) {
                        if (mddev->major_version != 0 ||
@@ -5722,19 +6024,19 @@ int unregister_md_personality(struct mdk_personality *p)
        return 0;
 }
 
-static int is_mddev_idle(mddev_t *mddev)
+static int is_mddev_idle(mddev_t *mddev, int init)
 {
        mdk_rdev_t * rdev;
        int idle;
-       long curr_events;
+       int curr_events;
 
        idle = 1;
        rcu_read_lock();
        rdev_for_each_rcu(rdev, mddev) {
                struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-               curr_events = part_stat_read(&disk->part0, sectors[0]) +
-                               part_stat_read(&disk->part0, sectors[1]) -
-                               atomic_read(&disk->sync_io);
+               curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+                             (int)part_stat_read(&disk->part0, sectors[1]) -
+                             atomic_read(&disk->sync_io);
                /* sync IO will cause sync_io to increase before the disk_stats
                 * as sync_io is counted when a request starts, and
                 * disk_stats is counted when it completes.
@@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev)
                 * always make curr_events less than last_events.
                 *
                 */
-               if (curr_events - rdev->last_events > 4096) {
+               if (init || curr_events - rdev->last_events > 64) {
                        rdev->last_events = curr_events;
                        idle = 0;
                }
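
(Switching curr_events and last_events from long to int is what keeps this comparison meaningful when the underlying I/O counters wrap: as long as two samples are taken less than 2^31 events apart, the signed difference of the truncated values is still the real, small delta. A quick demonstration, assuming the usual two's-complement conversion from unsigned to int:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t before = 0xfffffff0u;          /* counter just about to wrap */
            uint32_t after  = before + 100;         /* wrapped past zero, now 84 */

            /* 84 - (-16) == 100 on two's-complement machines, despite the wrap */
            int delta = (int)after - (int)before;

            printf("delta = %d\n", delta);          /* prints 100 */
            return 0;
    }
)
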
@@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev)
                        j = mddev->recovery_cp;
 
        } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-               max_sectors = mddev->size << 1;
+               max_sectors = mddev->dev_sectors;
        else {
                /* recovery follows the physical size of devices */
-               max_sectors = mddev->size << 1;
+               max_sectors = mddev->dev_sectors;
                j = MaxSector;
                list_for_each_entry(rdev, &mddev->disks, same_set)
                        if (rdev->raid_disk >= 0 &&
@@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev)
               "(but not more than %d KB/sec) for %s.\n",
               speed_max(mddev), desc);
 
-       is_mddev_idle(mddev); /* this also initializes IO event counters */
+       is_mddev_idle(mddev, 1); /* this initializes IO event counters */
 
        io_sectors = 0;
        for (m = 0; m < SYNC_MARKS; m++) {
@@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev)
                }
                if (kthread_should_stop())
                        goto interrupted;
+
+               if (mddev->curr_resync > mddev->curr_resync_completed &&
+                   (mddev->curr_resync - mddev->curr_resync_completed)
+                   > (max_sectors >> 4)) {
+                       /* time to update curr_resync_completed */
+                       blk_unplug(mddev->queue);
+                       wait_event(mddev->recovery_wait,
+                                  atomic_read(&mddev->recovery_active) == 0);
+                       mddev->curr_resync_completed =
+                               mddev->curr_resync;
+                       set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+               }
                sectors = mddev->pers->sync_request(mddev, j, &skipped,
                                                  currspeed < speed_min(mddev));
                if (sectors == 0) {
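
(The new block above records curr_resync_completed only after resync has advanced by more than max_sectors >> 4 since the last checkpoint, so the pause-and-update cycle stays cheap relative to the resync itself. A standalone illustration of that throttling rule:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    int main(void)
    {
            sector_t max_sectors = 1000000, completed = 0, cur;

            /* take a checkpoint only once at least 1/16th of the device has
             * passed since the previous one */
            for (cur = 0; cur <= max_sectors; cur += 37000) {
                    if (cur > completed && cur - completed > (max_sectors >> 4)) {
                            completed = cur;        /* kernel also sets MD_CHANGE_CLEAN */
                            printf("checkpoint at %llu\n", (unsigned long long)cur);
                    }
            }
            return 0;
    }
)
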
@@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev)
 
                if (currspeed > speed_min(mddev)) {
                        if ((currspeed > speed_max(mddev)) ||
-                                       !is_mddev_idle(mddev)) {
+                                       !is_mddev_idle(mddev, 0)) {
                                msleep(500);
                                goto repeat;
                        }
@@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev)
        mdk_rdev_t *rdev;
        int spares = 0;
 
+       mddev->curr_resync_completed = 0;
+
        list_for_each_entry(rdev, &mddev->disks, same_set)
                if (rdev->raid_disk >= 0 &&
                    !test_bit(Blocked, &rdev->flags) &&
@@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev)
                                        sysfs_notify(&mddev->kobj, NULL,
                                                     "degraded");
                        }
+                       if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+                           mddev->pers->finish_reshape)
+                               mddev->pers->finish_reshape(mddev);
                        md_update_sb(mddev, 1);
 
                        /* if array is no-longer degraded, then any saved_raid_disk
@@ -6470,13 +6789,13 @@ static void md_geninit(void)
 
 static int __init md_init(void)
 {
-       if (register_blkdev(MAJOR_NR, "md"))
+       if (register_blkdev(MD_MAJOR, "md"))
                return -1;
        if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
-               unregister_blkdev(MAJOR_NR, "md");
+               unregister_blkdev(MD_MAJOR, "md");
                return -1;
        }
-       blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE,
+       blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);
        blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
                            md_probe, NULL, NULL);
@@ -6562,10 +6881,10 @@ static __exit void md_exit(void)
        mddev_t *mddev;
        struct list_head *tmp;
 
-       blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS);
+       blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
        blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
 
-       unregister_blkdev(MAJOR_NR,"md");
+       unregister_blkdev(MD_MAJOR,"md");
        unregister_blkdev(mdp_major, "mdp");
        unregister_reboot_notifier(&md_notifier);
        unregister_sysctl_table(raid_table_header);
similarity index 83%
rename from include/linux/raid/md_k.h
rename to drivers/md/md.h
index 9743e4dbc9188031b0c180584df4c5e6ba898513..e9b7f54c24d653f68daacd28cc0c386252669627 100644 (file)
 #ifndef _MD_K_H
 #define _MD_K_H
 
-/* and dm-bio-list.h is not under include/linux because.... ??? */
-#include "../../../drivers/md/dm-bio-list.h"
-
 #ifdef CONFIG_BLOCK
 
-#define        LEVEL_MULTIPATH         (-4)
-#define        LEVEL_LINEAR            (-1)
-#define        LEVEL_FAULTY            (-5)
-
-/* we need a value for 'no level specified' and 0
- * means 'raid0', so we need something else.  This is
- * for internal use only
- */
-#define        LEVEL_NONE              (-1000000)
-
 #define MaxSector (~(sector_t)0)
 
 typedef struct mddev_s mddev_t;
@@ -49,9 +36,9 @@ struct mdk_rdev_s
 {
        struct list_head same_set;      /* RAID devices within the same set */
 
-       sector_t size;                  /* Device size (in blocks) */
+       sector_t sectors;               /* Device size (in 512-byte sectors) */
        mddev_t *mddev;                 /* RAID array if running */
-       long last_events;               /* IO event timestamp */
+       int last_events;                /* IO event timestamp */
 
        struct block_device *bdev;      /* block device handle */
 
@@ -132,6 +119,8 @@ struct mddev_s
 #define MD_CHANGE_CLEAN 1      /* transition to or from 'clean' */
 #define MD_CHANGE_PENDING 2    /* superblock update in progress */
 
+       int                             suspended;
+       atomic_t                        active_io;
        int                             ro;
 
        struct gendisk                  *gendisk;
@@ -155,8 +144,11 @@ struct mddev_s
        char                            clevel[16];
        int                             raid_disks;
        int                             max_disks;
-       sector_t                        size; /* used size of component devices */
+       sector_t                        dev_sectors;    /* used size of
+                                                        * component devices */
        sector_t                        array_sectors; /* exported array size */
+       int                             external_size; /* size managed
+                                                       * externally */
        __u64                           events;
 
        char                            uuid[16];
@@ -172,6 +164,13 @@ struct mddev_s
        struct mdk_thread_s             *thread;        /* management thread */
        struct mdk_thread_s             *sync_thread;   /* doing resync or reconstruct */
        sector_t                        curr_resync;    /* last block scheduled */
+       /* As resync requests can complete out of order, we cannot easily track
+        * how much resync has been completed.  So we occasionally pause until
+        * everything completes, then set curr_resync_completed to curr_resync.
+        * As such it may be well behind the real resync mark, but it is a value
+        * we are certain of.
+        */
+       sector_t                        curr_resync_completed;
        unsigned long                   resync_mark;    /* a recent timestamp */
        sector_t                        resync_mark_cnt;/* blocks written at resync_mark */
        sector_t                        curr_mark_cnt; /* blocks scheduled now */
@@ -315,8 +314,10 @@ struct mdk_personality
        int (*spare_active) (mddev_t *mddev);
        sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
        int (*resize) (mddev_t *mddev, sector_t sectors);
+       sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
        int (*check_reshape) (mddev_t *mddev);
        int (*start_reshape) (mddev_t *mddev);
+       void (*finish_reshape) (mddev_t *mddev);
        int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
        /* quiesce moves between quiescence states
         * 0 - fully active
@@ -324,6 +325,16 @@ struct mdk_personality
         * others - reserved
         */
        void (*quiesce) (mddev_t *mddev, int state);
+       /* takeover is used to transition an array from one
+        * personality to another.  The new personality must be able
+        * to handle the data in the current layout.
+        * e.g. 2drive raid1 -> 2drive raid5
+        *      ndrive raid5 -> degraded n+1drive raid6 with special layout
+        * If the takeover succeeds, a new 'private' structure is returned.
+        * This needs to be installed and then ->run used to activate the
+        * array.
+        */
+       void *(*takeover) (mddev_t *mddev);
 };
 
 
@@ -400,3 +411,26 @@ static inline void safe_put_page(struct page *p)
 #endif /* CONFIG_BLOCK */
 #endif
 
+
+extern int register_md_personality(struct mdk_personality *p);
+extern int unregister_md_personality(struct mdk_personality *p);
+extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
+                               mddev_t *mddev, const char *name);
+extern void md_unregister_thread(mdk_thread_t *thread);
+extern void md_wakeup_thread(mdk_thread_t *thread);
+extern void md_check_recovery(mddev_t *mddev);
+extern void md_write_start(mddev_t *mddev, struct bio *bi);
+extern void md_write_end(mddev_t *mddev);
+extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
+extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
+
+extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+                          sector_t sector, int size, struct page *page);
+extern void md_super_wait(mddev_t *mddev);
+extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+                       struct page *page, int rw);
+extern void md_do_sync(mddev_t *mddev);
+extern void md_new_event(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
+extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
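
(The takeover hook documented in the mdk_personality comment above follows a simple handshake: the new personality inspects the array and returns a private configuration or an error, and only after that succeeds does the core suspend the array, stop the old personality and install the new one. A toy userspace model of that flow, with invented types standing in for the kernel structures:

    #include <stdio.h>
    #include <stdlib.h>

    struct array { int raid_disks; int level; void *priv; };

    struct personality {
            const char *name;
            int level;
            void *(*takeover)(struct array *a);
    };

    static void *raid5_takeover(struct array *a)
    {
            if (a->level != 1 || a->raid_disks != 2)
                    return NULL;            /* only 2-drive raid1 -> 2-drive raid5 here */
            return malloc(64);              /* stand-in for the new private config */
    }

    static struct personality raid5 = { "raid5", 5, raid5_takeover };

    int main(void)
    {
            struct array a = { 2, 1, NULL };
            void *priv = raid5.takeover(&a);

            if (!priv) {
                    fprintf(stderr, "raid5 would not accept array\n");
                    return 1;
            }
            /* suspend, stop the old personality, then install the new one */
            a.level = raid5.level;
            a.priv = priv;
            printf("now running %s\n", raid5.name);
            free(priv);
            return 0;
    }
)
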
index b61d5767aae7c61c3960a09e16c26becb3a1712f..3b1500843bbac2dfa85daaeb85058480294080d4 100644 (file)
@@ -59,7 +59,7 @@ int main(int argc, char *argv[])
        uint8_t v;
        uint8_t exptbl[256], invtbl[256];
 
-       printf("#include \"raid6.h\"\n");
+       printf("#include <linux/raid/pq.h>\n");
 
        /* Compute multiplication table */
        printf("\nconst u8  __attribute__((aligned(256)))\n"
@@ -76,6 +76,9 @@ int main(int argc, char *argv[])
                printf("\t},\n");
        }
        printf("};\n");
+       printf("#ifdef __KERNEL__\n");
+       printf("EXPORT_SYMBOL(raid6_gfmul);\n");
+       printf("#endif\n");
 
        /* Compute power-of-2 table (exponent) */
        v = 1;
@@ -92,6 +95,9 @@ int main(int argc, char *argv[])
                }
        }
        printf("};\n");
+       printf("#ifdef __KERNEL__\n");
+       printf("EXPORT_SYMBOL(raid6_gfexp);\n");
+       printf("#endif\n");
 
        /* Compute inverse table x^-1 == x^254 */
        printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -104,6 +110,9 @@ int main(int argc, char *argv[])
                }
        }
        printf("};\n");
+       printf("#ifdef __KERNEL__\n");
+       printf("EXPORT_SYMBOL(raid6_gfinv);\n");
+       printf("#endif\n");
 
        /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
        printf("\nconst u8 __attribute__((aligned(256)))\n"
@@ -115,6 +124,9 @@ int main(int argc, char *argv[])
                               (j == 7) ? '\n' : ' ');
        }
        printf("};\n");
+       printf("#ifdef __KERNEL__\n");
+       printf("EXPORT_SYMBOL(raid6_gfexi);\n");
+       printf("#endif\n");
 
        return 0;
 }
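
(mktables.c emits the raid6_gfmul/raid6_gfexp tables as C source; the arithmetic behind those tables is GF(2^8) multiplication with the RAID-6 reduction polynomial x^8 + x^4 + x^3 + x^2 + 1, i.e. 0x11d. A standalone sketch of that multiply, loosely modelled on the generator's own helper:

    #include <stdint.h>
    #include <stdio.h>

    /* Shift-and-add multiplication in GF(2^8), reducing modulo 0x11d whenever
     * the running product overflows eight bits. */
    static uint8_t gfmul(uint8_t a, uint8_t b)
    {
            uint8_t v = 0;

            while (b) {
                    if (b & 1)
                            v ^= a;
                    a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
                    b >>= 1;
            }
            return v;
    }

    int main(void)
    {
            /* 2 is the generator used for the Q syndrome: 2 * 0x80 reduces to 0x1d */
            printf("0x%02x\n", gfmul(2, 0x80));
            printf("0x%02x\n", gfmul(0x53, 0xca));  /* a less trivial product */
            return 0;
    }
)
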
index f6d08f2416716f7207fe49aba23d154a5eb41fae..41ced0cbe823c7275cc2f79172cc9913385fcd2f 100644 (file)
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include <linux/raid/multipath.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "multipath.h"
 
 #define MAX_WORK_PER_DISK 128
 
@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev)
        spin_unlock_irqrestore(&conf->device_lock, flags);
 }
 
+static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+       WARN_ONCE(sectors || raid_disks,
+                 "%s does not support generic reshape\n", __func__);
+
+       return mddev->dev_sectors;
+}
+
 static int multipath_run (mddev_t *mddev)
 {
        multipath_conf_t *conf;
@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev)
        /*
         * Ok, everything is just fine now
         */
-       mddev->array_sectors = mddev->size * 2;
+       md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
 
        mddev->queue->unplug_fn = multipath_unplug;
        mddev->queue->backing_dev_info.congested_fn = multipath_congested;
@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality =
        .error_handler  = multipath_error,
        .hot_add_disk   = multipath_add_disk,
        .hot_remove_disk= multipath_remove_disk,
+       .size           = multipath_size,
 };
 
 static int __init multipath_init (void)
similarity index 96%
rename from include/linux/raid/multipath.h
rename to drivers/md/multipath.h
index 6f53fc177a47359b1490ba5b999e9d2b34e7bb2e..6fa70b400cdae6347fa07dedc2c80a5c16cabc90 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _MULTIPATH_H
 #define _MULTIPATH_H
 
-#include <linux/raid/md.h>
-
 struct multipath_info {
        mdk_rdev_t      *rdev;
 };
index c605ba8055863d2d0ede52e9fe7e9c4e374bd9d3..c08d7559be5531fb01bedb55e4663a0e40092607 100644 (file)
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
 */
 
-#include <linux/raid/raid0.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid0.h"
 
 static void raid0_unplug(struct request_queue *q)
 {
@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev)
                list_for_each_entry(rdev2, &mddev->disks, same_set) {
                        printk(KERN_INFO "raid0:   comparing %s(%llu)",
                               bdevname(rdev1->bdev,b),
-                              (unsigned long long)rdev1->size);
+                              (unsigned long long)rdev1->sectors);
                        printk(KERN_INFO " with %s(%llu)\n",
                               bdevname(rdev2->bdev,b),
-                              (unsigned long long)rdev2->size);
+                              (unsigned long long)rdev2->sectors);
                        if (rdev2 == rdev1) {
                                printk(KERN_INFO "raid0:   END\n");
                                break;
                        }
-                       if (rdev2->size == rdev1->size)
-                       {
+                       if (rdev2->sectors == rdev1->sectors) {
                                /*
                                 * Not unique, don't count it as a new
                                 * group
@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev)
                    mddev->queue->max_sectors > (PAGE_SIZE>>9))
                        blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-               if (!smallest || (rdev1->size <smallest->size))
+               if (!smallest || (rdev1->sectors < smallest->sectors))
                        smallest = rdev1;
                cnt++;
        }
@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev)
                goto abort;
        }
        zone->nb_dev = cnt;
-       zone->sectors = smallest->size * cnt * 2;
+       zone->sectors = smallest->sectors * cnt;
        zone->zone_start = 0;
 
-       current_start = smallest->size * 2;
+       current_start = smallest->sectors;
        curr_zone_start = zone->sectors;
 
        /* now do the other zones */
@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev)
                        rdev = conf->strip_zone[0].dev[j];
                        printk(KERN_INFO "raid0: checking %s ...",
                                bdevname(rdev->bdev, b));
-                       if (rdev->size > current_start / 2) {
-                               printk(KERN_INFO " contained as device %d\n",
-                                       c);
-                               zone->dev[c] = rdev;
-                               c++;
-                               if (!smallest || (rdev->size <smallest->size)) {
-                                       smallest = rdev;
-                                       printk(KERN_INFO "  (%llu) is smallest!.\n",
-                                               (unsigned long long)rdev->size);
-                               }
-                       } else
+                       if (rdev->sectors <= current_start) {
                                printk(KERN_INFO " nope.\n");
+                               continue;
+                       }
+                       printk(KERN_INFO " contained as device %d\n", c);
+                       zone->dev[c] = rdev;
+                       c++;
+                       if (!smallest || rdev->sectors < smallest->sectors) {
+                               smallest = rdev;
+                               printk(KERN_INFO "  (%llu) is smallest!.\n",
+                                       (unsigned long long)rdev->sectors);
+                       }
                }
 
                zone->nb_dev = c;
-               zone->sectors = (smallest->size * 2 - current_start) * c;
+               zone->sectors = (smallest->sectors - current_start) * c;
                printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
                        zone->nb_dev, (unsigned long long)zone->sectors);
 
                zone->zone_start = curr_zone_start;
                curr_zone_start += zone->sectors;
 
-               current_start = smallest->size * 2;
+               current_start = smallest->sectors;
                printk(KERN_INFO "raid0: current zone start: %llu\n",
                        (unsigned long long)current_start);
        }
@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q,
                return max;
 }
 
+static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+       sector_t array_sectors = 0;
+       mdk_rdev_t *rdev;
+
+       WARN_ONCE(sectors || raid_disks,
+                 "%s does not support generic reshape\n", __func__);
+
+       list_for_each_entry(rdev, &mddev->disks, same_set)
+               array_sectors += rdev->sectors;
+
+       return array_sectors;
+}
+
 static int raid0_run (mddev_t *mddev)
 {
        unsigned  cur=0, i=0, nb_zone;
        s64 sectors;
        raid0_conf_t *conf;
-       mdk_rdev_t *rdev;
 
        if (mddev->chunk_size == 0) {
                printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev)
                goto out_free_conf;
 
        /* calculate array device size */
-       mddev->array_sectors = 0;
-       list_for_each_entry(rdev, &mddev->disks, same_set)
-               mddev->array_sectors += rdev->size * 2;
+       md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
 
        printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
                (unsigned long long)mddev->array_sectors);
        printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
                (unsigned long long)conf->spacing);
        {
-               sector_t s = mddev->array_sectors;
+               sector_t s = raid0_size(mddev, 0, 0);
                sector_t space = conf->spacing;
                int round;
                conf->sector_shift = 0;
@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality=
        .run            = raid0_run,
        .stop           = raid0_stop,
        .status         = raid0_status,
+       .size           = raid0_size,
 };
 
 static int __init raid0_init (void)
similarity index 96%
rename from include/linux/raid/raid0.h
rename to drivers/md/raid0.h
index fd42aa87c39186791d1a06481fecb6ddd7c03326..824b12eb1d4f50086e8a4dba48d711563362640c 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _RAID0_H
 #define _RAID0_H
 
-#include <linux/raid/md.h>
-
 struct strip_zone
 {
        sector_t zone_start;    /* Zone offset in md_dev (in sectors) */
index e2466425d9cad798edf40858183404408b319e3d..b4f4badc0068991290515b1281cdaddd898893ce 100644 (file)
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include "dm-bio-list.h"
 #include <linux/delay.h>
-#include <linux/raid/raid1.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "dm-bio-list.h"
+#include "raid1.h"
+#include "bitmap.h"
 
 #define DEBUG 0
 #if DEBUG
@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                        return 0;
        }
 
-       max_sector = mddev->size << 1;
+       max_sector = mddev->dev_sectors;
        if (sector_nr >= max_sector) {
                /* If we aborted, we need to abort the
                 * sync on the 'current' bitmap chunk (there will
@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
        return nr_sectors;
 }
 
+static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+       if (sectors)
+               return sectors;
+
+       return mddev->dev_sectors;
+}
+
 static int run(mddev_t *mddev)
 {
        conf_t *conf;
@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev)
        /*
         * Ok, everything is just fine now
         */
-       mddev->array_sectors = mddev->size * 2;
+       md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
        mddev->queue->unplug_fn = raid1_unplug;
        mddev->queue->backing_dev_info.congested_fn = raid1_congested;
@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev)
                /* need to kick something here to make sure I/O goes? */
        }
 
+       raise_barrier(conf);
+       lower_barrier(conf);
+
        md_unregister_thread(mddev->thread);
        mddev->thread = NULL;
        blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
         * any io in the removed space completes, but it hardly seems
         * worth it.
         */
-       mddev->array_sectors = sectors;
+       md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
+       if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+               return -EINVAL;
        set_capacity(mddev->gendisk, mddev->array_sectors);
        mddev->changed = 1;
-       if (mddev->array_sectors / 2 > mddev->size &&
+       if (sectors > mddev->dev_sectors &&
            mddev->recovery_cp == MaxSector) {
-               mddev->recovery_cp = mddev->size << 1;
+               mddev->recovery_cp = mddev->dev_sectors;
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        }
-       mddev->size = mddev->array_sectors / 2;
+       mddev->dev_sectors = sectors;
        mddev->resync_max_sectors = sectors;
        return 0;
 }
@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality =
        .spare_active   = raid1_spare_active,
        .sync_request   = sync_request,
        .resize         = raid1_resize,
+       .size           = raid1_size,
        .check_reshape  = raid1_reshape,
        .quiesce        = raid1_quiesce,
 };
similarity index 99%
rename from include/linux/raid/raid1.h
rename to drivers/md/raid1.h
index 0a9ba7c3302e2393e881244983a3397807e29dfb..1620eea3d57c50231729c04c98e6996c086b00c5 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
-#include <linux/raid/md.h>
-
 typedef struct mirror_info mirror_info_t;
 
 struct mirror_info {
index 7301631abe0453a4791dec55ba0eff84912876c3..e293d92641acc2b61c5767d3b343702fd65a509b 100644 (file)
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include "dm-bio-list.h"
 #include <linux/delay.h>
-#include <linux/raid/raid10.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "dm-bio-list.h"
+#include "raid10.h"
+#include "bitmap.h"
 
 /*
  * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                        return 0;
 
  skipped:
-       max_sector = mddev->size << 1;
+       max_sector = mddev->dev_sectors;
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
                max_sector = mddev->resync_max_sectors;
        if (sector_nr >= max_sector) {
@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
        goto skipped;
 }
 
+static sector_t
+raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+       sector_t size;
+       conf_t *conf = mddev_to_conf(mddev);
+
+       if (!raid_disks)
+               raid_disks = mddev->raid_disks;
+       if (!sectors)
+               sectors = mddev->dev_sectors;
+
+       size = sectors >> conf->chunk_shift;
+       sector_div(size, conf->far_copies);
+       size = size * raid_disks;
+       sector_div(size, conf->near_copies);
+
+       return size << conf->chunk_shift;
+}
+
 static int run(mddev_t *mddev)
 {
        conf_t *conf;
@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev)
        conf->far_offset = fo;
        conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
        conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
-       size = mddev->size >> (conf->chunk_shift-1);
+       size = mddev->dev_sectors >> conf->chunk_shift;
        sector_div(size, fc);
        size = size * conf->raid_disks;
        sector_div(size, nc);
@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev)
         */
        stride += conf->raid_disks - 1;
        sector_div(stride, conf->raid_disks);
-       mddev->size = stride  << (conf->chunk_shift-1);
+       mddev->dev_sectors = stride << conf->chunk_shift;
 
        if (fo)
                stride = 1;
@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev)
        /*
         * Ok, everything is just fine now
         */
-       mddev->array_sectors = size << conf->chunk_shift;
-       mddev->resync_max_sectors = size << conf->chunk_shift;
+       md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
+       mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
 
        mddev->queue->unplug_fn = raid10_unplug;
        mddev->queue->backing_dev_info.congested_fn = raid10_congested;
@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev)
 {
        conf_t *conf = mddev_to_conf(mddev);
 
+       raise_barrier(conf, 0);
+       lower_barrier(conf);
+
        md_unregister_thread(mddev->thread);
        mddev->thread = NULL;
        blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality =
        .spare_active   = raid10_spare_active,
        .sync_request   = sync_request,
        .quiesce        = raid10_quiesce,
+       .size           = raid10_size,
 };
 
 static int __init raid_init(void)
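
(The new raid10_size() computes usable capacity from the layout geometry: whole chunks per device, divided among the far copies, summed over all disks, then divided by the near copies. A userspace version of the same arithmetic, with plain division standing in for sector_div():

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    static sector_t raid10_capacity(sector_t dev_sectors, int raid_disks,
                                    int chunk_shift, int near_copies, int far_copies)
    {
            sector_t size = dev_sectors >> chunk_shift;     /* whole chunks per device */

            size /= far_copies;
            size *= raid_disks;
            size /= near_copies;
            return size << chunk_shift;
    }

    int main(void)
    {
            /* 4 disks of 1 GiB, 64K chunks (128 sectors, shift 7), near=2, far=1 */
            sector_t cap = raid10_capacity(2097152, 4, 7, 2, 1);

            printf("%llu sectors (%llu MiB)\n",
                   (unsigned long long)cap, (unsigned long long)cap / 2048);   /* 2048 MiB */
            return 0;
    }

For the classic near-2 layout this reproduces the expected result: half of the total raw capacity.)
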
similarity index 99%
rename from include/linux/raid/raid10.h
rename to drivers/md/raid10.h
index e9091cfeb286c2a9b0d30dc57e6748933bb3a9ee..244dbe507a54fa893a6731f57c56de1dca2a3b70 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _RAID10_H
 #define _RAID10_H
 
-#include <linux/raid/md.h>
-
 typedef struct mirror_info mirror_info_t;
 
 struct mirror_info {
index a5ba080d303b93bb3a4d764ea621f4bf9bc191d0..3bbc6d647044c6b6d782427a1e3e35a146bc0751 100644 (file)
  * miss any bits.
  */
 
+#include <linux/blkdev.h>
 #include <linux/kthread.h>
-#include "raid6.h"
-
-#include <linux/raid/bitmap.h>
+#include <linux/raid/pq.h>
 #include <linux/async_tx.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid5.h"
+#include "bitmap.h"
 
 /*
  * Stripe cache
 
 #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
 
-#if !RAID6_USE_EMPTY_ZERO_PAGE
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-#endif
-
 /*
  * We maintain a biased count of active stripes in the bottom 16 bits of
  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
        bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
 }
 
+/* Find first data disk in a raid6 stripe */
+static inline int raid6_d0(struct stripe_head *sh)
+{
+       if (sh->ddf_layout)
+               /* ddf always start from first device */
+               return 0;
+       /* md starts just after Q block */
+       if (sh->qd_idx == sh->disks - 1)
+               return 0;
+       else
+               return sh->qd_idx + 1;
+}
 static inline int raid6_next_disk(int disk, int raid_disks)
 {
        disk++;
        return (disk < raid_disks) ? disk : 0;
 }
 
+/* When walking through the disks in a raid5, starting at raid6_d0,
+ * We need to map each disk to a 'slot', where the data disks are slot
+ * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
+ * is raid_disks-1.  This helper does that mapping.
+ */
+static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
+                            int *count, int syndrome_disks)
+{
+       int slot;
+
+       if (idx == sh->pd_idx)
+               return syndrome_disks;
+       if (idx == sh->qd_idx)
+               return syndrome_disks + 1;
+       slot = (*count)++;
+       return slot;
+}
+
 static void return_io(struct bio *return_bi)
 {
        struct bio *bi = return_bi;
@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
                }
        }
 }
+
 static void release_stripe(struct stripe_head *sh)
 {
        raid5_conf_t *conf = sh->raid_conf;
@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num)
        return 0;
 }
 
-static void raid5_build_block(struct stripe_head *sh, int i);
+static void raid5_build_block(struct stripe_head *sh, int i, int previous);
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+                           struct stripe_head *sh);
 
-static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks)
+static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 {
        raid5_conf_t *conf = sh->raid_conf;
        int i;
@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
 
        remove_hash(sh);
 
+       sh->generation = conf->generation - previous;
+       sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
        sh->sector = sector;
-       sh->pd_idx = pd_idx;
+       stripe_set_idx(sector, conf, previous, sh);
        sh->state = 0;
 
-       sh->disks = disks;
 
        for (i = sh->disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
@@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
                        BUG();
                }
                dev->flags = 0;
-               raid5_build_block(sh, i);
+               raid5_build_block(sh, i, previous);
        }
        insert_hash(conf, sh);
 }
 
-static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
+                                        short generation)
 {
        struct stripe_head *sh;
        struct hlist_node *hn;
@@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
        CHECK_DEVLOCK();
        pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
        hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
-               if (sh->sector == sector && sh->disks == disks)
+               if (sh->sector == sector && sh->generation == generation)
                        return sh;
        pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
        return NULL;
@@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks,
-                                            int pd_idx, int noblock)
+static struct stripe_head *
+get_active_stripe(raid5_conf_t *conf, sector_t sector,
+                 int previous, int noblock)
 {
        struct stripe_head *sh;
 
@@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
                wait_event_lock_irq(conf->wait_for_stripe,
                                    conf->quiesce == 0,
                                    conf->device_lock, /* nothing */);
-               sh = __find_stripe(conf, sector, disks);
+               sh = __find_stripe(conf, sector, conf->generation - previous);
                if (!sh) {
                        if (!conf->inactive_blocked)
                                sh = get_free_stripe(conf);
@@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
                                        );
                                conf->inactive_blocked = 0;
                        } else
-                               init_stripe(sh, sector, pd_idx, disks);
+                               init_stripe(sh, sector, previous);
                } else {
                        if (atomic_read(&sh->count)) {
-                         BUG_ON(!list_empty(&sh->lru));
+                               BUG_ON(!list_empty(&sh->lru)
+                                   && !test_bit(STRIPE_EXPANDING, &sh->state));
                        } else {
                                if (!test_bit(STRIPE_HANDLE, &sh->state))
                                        atomic_inc(&conf->active_stripes);
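
[Editor's note] With these hunks a cached stripe is identified by (sector, generation) instead of (sector, disks): init_stripe() stamps sh->generation = conf->generation - previous, and get_active_stripe() passes the same value into __find_stripe(), so old-geometry and new-geometry stripes for the same sector cannot be confused while a reshape bumps conf->generation. A small standalone sketch of just that arithmetic (names invented):

#include <stdio.h>

/* During a reshape conf->generation is bumped; stripes laid out in the
 * pre-reshape geometry carry the previous generation number.            */
static short stripe_generation(short conf_generation, int previous)
{
    return conf_generation - previous;          /* previous is 0 or 1 */
}

int main(void)
{
    short gen = 5;                               /* conf->generation mid-reshape */
    short old_gen = stripe_generation(gen, 1);   /* old-geometry stripe          */
    short new_gen = stripe_generation(gen, 0);   /* new-geometry stripe          */

    /* Same sector, different generation: the cache keeps them apart. */
    printf("old=%d new=%d distinct=%d\n", old_gen, new_gen, old_gen != new_gen);
    return 0;
}
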
@@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num)
        struct kmem_cache *sc;
        int devs = conf->raid_disks;
 
-       sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev));
-       sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev));
+       sprintf(conf->cache_name[0],
+               "raid%d-%s", conf->level, mdname(conf->mddev));
+       sprintf(conf->cache_name[1],
+               "raid%d-%s-alt", conf->level, mdname(conf->mddev));
        conf->active_name = 0;
        sc = kmem_cache_create(conf->cache_name[conf->active_name],
                               sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
@@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num)
        return 0;
 }
 
-#ifdef CONFIG_MD_RAID5_RESHAPE
 static int resize_stripes(raid5_conf_t *conf, int newsize)
 {
        /* Make all the stripes able to hold 'newsize' devices.
@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
        conf->pool_size = newsize;
        return err;
 }
-#endif
 
 static int drop_one_stripe(raid5_conf_t *conf)
 {
@@ -1066,7 +1101,7 @@ static void shrink_stripes(raid5_conf_t *conf)
 
 static void raid5_end_read_request(struct bio * bi, int error)
 {
-       struct stripe_head *sh = bi->bi_private;
+       struct stripe_head *sh = bi->bi_private;
        raid5_conf_t *conf = sh->raid_conf;
        int disks = sh->disks, i;
        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1148,7 +1183,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
 
 static void raid5_end_write_request(struct bio *bi, int error)
 {
-       struct stripe_head *sh = bi->bi_private;
+       struct stripe_head *sh = bi->bi_private;
        raid5_conf_t *conf = sh->raid_conf;
        int disks = sh->disks, i;
        int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
 }
 
 
-static sector_t compute_blocknr(struct stripe_head *sh, int i);
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
        
-static void raid5_build_block(struct stripe_head *sh, int i)
+static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
        struct r5dev *dev = &sh->dev[i];
 
@@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i)
        dev->req.bi_private = sh;
 
        dev->flags = 0;
-       dev->sector = compute_blocknr(sh, i);
+       dev->sector = compute_blocknr(sh, i, previous);
 }
 
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
@@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
  * Input: a 'big' sector number,
  * Output: index of the data and parity disk, and the sector # in them.
  */
-static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
-                       unsigned int data_disks, unsigned int * dd_idx,
-                       unsigned int * pd_idx, raid5_conf_t *conf)
+static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
+                                    int previous, int *dd_idx,
+                                    struct stripe_head *sh)
 {
        long stripe;
        unsigned long chunk_number;
        unsigned int chunk_offset;
+       int pd_idx, qd_idx;
+       int ddf_layout = 0;
        sector_t new_sector;
-       int sectors_per_chunk = conf->chunk_size >> 9;
+       int algorithm = previous ? conf->prev_algo
+                                : conf->algorithm;
+       int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+                                        : (conf->chunk_size >> 9);
+       int raid_disks = previous ? conf->previous_raid_disks
+                                 : conf->raid_disks;
+       int data_disks = raid_disks - conf->max_degraded;
 
        /* First compute the information on this sector */
 
@@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
        /*
         * Select the parity disk based on the user selected algorithm.
         */
+       pd_idx = qd_idx = ~0;
        switch(conf->level) {
        case 4:
-               *pd_idx = data_disks;
+               pd_idx = data_disks;
                break;
        case 5:
-               switch (conf->algorithm) {
+               switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
-                       *pd_idx = data_disks - stripe % raid_disks;
-                       if (*dd_idx >= *pd_idx)
+                       pd_idx = data_disks - stripe % raid_disks;
+                       if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
-                       *pd_idx = stripe % raid_disks;
-                       if (*dd_idx >= *pd_idx)
+                       pd_idx = stripe % raid_disks;
+                       if (*dd_idx >= pd_idx)
                                (*dd_idx)++;
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
-                       *pd_idx = data_disks - stripe % raid_disks;
-                       *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+                       pd_idx = data_disks - stripe % raid_disks;
+                       *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
-                       *pd_idx = stripe % raid_disks;
-                       *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+                       pd_idx = stripe % raid_disks;
+                       *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+                       break;
+               case ALGORITHM_PARITY_0:
+                       pd_idx = 0;
+                       (*dd_idx)++;
+                       break;
+               case ALGORITHM_PARITY_N:
+                       pd_idx = data_disks;
                        break;
                default:
                        printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-                               conf->algorithm);
+                               algorithm);
+                       BUG();
                }
                break;
        case 6:
 
-               /**** FIX THIS ****/
-               switch (conf->algorithm) {
+               switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
-                       *pd_idx = raid_disks - 1 - (stripe % raid_disks);
-                       if (*pd_idx == raid_disks-1)
-                               (*dd_idx)++;    /* Q D D D P */
-                       else if (*dd_idx >= *pd_idx)
+                       pd_idx = raid_disks - 1 - (stripe % raid_disks);
+                       qd_idx = pd_idx + 1;
+                       if (pd_idx == raid_disks-1) {
+                               (*dd_idx)++;    /* Q D D D P */
+                               qd_idx = 0;
+                       } else if (*dd_idx >= pd_idx)
                                (*dd_idx) += 2; /* D D P Q D */
                        break;
                case ALGORITHM_RIGHT_ASYMMETRIC:
-                       *pd_idx = stripe % raid_disks;
-                       if (*pd_idx == raid_disks-1)
-                               (*dd_idx)++;    /* Q D D D P */
-                       else if (*dd_idx >= *pd_idx)
+                       pd_idx = stripe % raid_disks;
+                       qd_idx = pd_idx + 1;
+                       if (pd_idx == raid_disks-1) {
+                               (*dd_idx)++;    /* Q D D D P */
+                               qd_idx = 0;
+                       } else if (*dd_idx >= pd_idx)
                                (*dd_idx) += 2; /* D D P Q D */
                        break;
                case ALGORITHM_LEFT_SYMMETRIC:
-                       *pd_idx = raid_disks - 1 - (stripe % raid_disks);
-                       *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+                       pd_idx = raid_disks - 1 - (stripe % raid_disks);
+                       qd_idx = (pd_idx + 1) % raid_disks;
+                       *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
                        break;
                case ALGORITHM_RIGHT_SYMMETRIC:
-                       *pd_idx = stripe % raid_disks;
-                       *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+                       pd_idx = stripe % raid_disks;
+                       qd_idx = (pd_idx + 1) % raid_disks;
+                       *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
+                       break;
+
+               case ALGORITHM_PARITY_0:
+                       pd_idx = 0;
+                       qd_idx = 1;
+                       (*dd_idx) += 2;
+                       break;
+               case ALGORITHM_PARITY_N:
+                       pd_idx = data_disks;
+                       qd_idx = data_disks + 1;
                        break;
+
+               case ALGORITHM_ROTATING_ZERO_RESTART:
+                       /* Exactly the same as RIGHT_ASYMMETRIC, but the
+                        * order of blocks for computing Q is different.
+                        */
+                       pd_idx = stripe % raid_disks;
+                       qd_idx = pd_idx + 1;
+                       if (pd_idx == raid_disks-1) {
+                               (*dd_idx)++;    /* Q D D D P */
+                               qd_idx = 0;
+                       } else if (*dd_idx >= pd_idx)
+                               (*dd_idx) += 2; /* D D P Q D */
+                       ddf_layout = 1;
+                       break;
+
+               case ALGORITHM_ROTATING_N_RESTART:
+                       /* Same as left_asymmetric, but the first stripe is
+                        * D D D P Q  rather than
+                        * Q D D D P
+                        */
+                       pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
+                       qd_idx = pd_idx + 1;
+                       if (pd_idx == raid_disks-1) {
+                               (*dd_idx)++;    /* Q D D D P */
+                               qd_idx = 0;
+                       } else if (*dd_idx >= pd_idx)
+                               (*dd_idx) += 2; /* D D P Q D */
+                       ddf_layout = 1;
+                       break;
+
+               case ALGORITHM_ROTATING_N_CONTINUE:
+                       /* Same as left_symmetric but Q is before P */
+                       pd_idx = raid_disks - 1 - (stripe % raid_disks);
+                       qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
+                       *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+                       ddf_layout = 1;
+                       break;
+
+               case ALGORITHM_LEFT_ASYMMETRIC_6:
+                       /* RAID5 left_asymmetric, with Q on last device */
+                       pd_idx = data_disks - stripe % (raid_disks-1);
+                       if (*dd_idx >= pd_idx)
+                               (*dd_idx)++;
+                       qd_idx = raid_disks - 1;
+                       break;
+
+               case ALGORITHM_RIGHT_ASYMMETRIC_6:
+                       pd_idx = stripe % (raid_disks-1);
+                       if (*dd_idx >= pd_idx)
+                               (*dd_idx)++;
+                       qd_idx = raid_disks - 1;
+                       break;
+
+               case ALGORITHM_LEFT_SYMMETRIC_6:
+                       pd_idx = data_disks - stripe % (raid_disks-1);
+                       *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+                       qd_idx = raid_disks - 1;
+                       break;
+
+               case ALGORITHM_RIGHT_SYMMETRIC_6:
+                       pd_idx = stripe % (raid_disks-1);
+                       *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+                       qd_idx = raid_disks - 1;
+                       break;
+
+               case ALGORITHM_PARITY_0_6:
+                       pd_idx = 0;
+                       (*dd_idx)++;
+                       qd_idx = raid_disks - 1;
+                       break;
+
+
                default:
                        printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-                              conf->algorithm);
+                              algorithm);
+                       BUG();
                }
                break;
        }
 
+       if (sh) {
+               sh->pd_idx = pd_idx;
+               sh->qd_idx = qd_idx;
+               sh->ddf_layout = ddf_layout;
+       }
        /*
         * Finally, compute the new sector number
         */
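
[Editor's note] The enlarged switch above now places P and Q for the DDF "rotating" layouts and the RAID5-compatible "*_6" layouts as well as the four classic ones, and records pd_idx/qd_idx/ddf_layout in the stripe_head instead of returning them through pointers. As a worked illustration, here is a standalone sketch of how parity rotates from stripe to stripe; it reproduces only the LEFT_SYMMETRIC cases of the switch above, which remains authoritative.

#include <stdio.h>

/* RAID5 LEFT_SYMMETRIC: parity rotates backwards, data wraps just after P. */
static void raid5_left_symmetric(long stripe, int raid_disks,
                                 int *dd_idx, int *pd_idx)
{
    int data_disks = raid_disks - 1;

    *pd_idx = data_disks - stripe % raid_disks;
    *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
}

/* RAID6 LEFT_SYMMETRIC: Q immediately follows P, data wraps just after Q. */
static void raid6_left_symmetric(long stripe, int raid_disks,
                                 int *dd_idx, int *pd_idx, int *qd_idx)
{
    *pd_idx = raid_disks - 1 - (stripe % raid_disks);
    *qd_idx = (*pd_idx + 1) % raid_disks;
    *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
}

int main(void)
{
    int raid_disks = 5;
    long stripe;

    printf("RAID5 LEFT_SYMMETRIC, %d devices:\n", raid_disks);
    for (stripe = 0; stripe < raid_disks; stripe++) {
        int d0 = 0, pd;

        raid5_left_symmetric(stripe, raid_disks, &d0, &pd);
        printf("  stripe %ld: P=%d, logical block 0 on device %d\n",
               stripe, pd, d0);
    }

    printf("RAID6 LEFT_SYMMETRIC, %d devices:\n", raid_disks);
    for (stripe = 0; stripe < raid_disks; stripe++) {
        int d0 = 0, pd, qd;

        raid6_left_symmetric(stripe, raid_disks, &d0, &pd, &qd);
        printf("  stripe %ld: P=%d, Q=%d, logical block 0 on device %d\n",
               stripe, pd, qd, d0);
    }
    return 0;
}
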
@@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 }
 
 
-static sector_t compute_blocknr(struct stripe_head *sh, int i)
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 {
        raid5_conf_t *conf = sh->raid_conf;
        int raid_disks = sh->disks;
        int data_disks = raid_disks - conf->max_degraded;
        sector_t new_sector = sh->sector, check;
-       int sectors_per_chunk = conf->chunk_size >> 9;
+       int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+                                        : (conf->chunk_size >> 9);
+       int algorithm = previous ? conf->prev_algo
+                                : conf->algorithm;
        sector_t stripe;
        int chunk_offset;
-       int chunk_number, dummy1, dummy2, dd_idx = i;
+       int chunk_number, dummy1, dd_idx = i;
        sector_t r_sector;
+       struct stripe_head sh2;
 
 
        chunk_offset = sector_div(new_sector, sectors_per_chunk);
@@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
        switch(conf->level) {
        case 4: break;
        case 5:
-               switch (conf->algorithm) {
+               switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                case ALGORITHM_RIGHT_ASYMMETRIC:
                        if (i > sh->pd_idx)
@@ -1363,19 +1512,27 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
                                i += raid_disks;
                        i -= (sh->pd_idx + 1);
                        break;
+               case ALGORITHM_PARITY_0:
+                       i -= 1;
+                       break;
+               case ALGORITHM_PARITY_N:
+                       break;
                default:
                        printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-                              conf->algorithm);
+                              algorithm);
+                       BUG();
                }
                break;
        case 6:
-               if (i == raid6_next_disk(sh->pd_idx, raid_disks))
+               if (i == sh->qd_idx)
                        return 0; /* It is the Q disk */
-               switch (conf->algorithm) {
+               switch (algorithm) {
                case ALGORITHM_LEFT_ASYMMETRIC:
                case ALGORITHM_RIGHT_ASYMMETRIC:
-                       if (sh->pd_idx == raid_disks-1)
-                               i--;    /* Q D D D P */
+               case ALGORITHM_ROTATING_ZERO_RESTART:
+               case ALGORITHM_ROTATING_N_RESTART:
+                       if (sh->pd_idx == raid_disks-1)
+                               i--;    /* Q D D D P */
                        else if (i > sh->pd_idx)
                                i -= 2; /* D D P Q D */
                        break;
@@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
                                i -= (sh->pd_idx + 2);
                        }
                        break;
+               case ALGORITHM_PARITY_0:
+                       i -= 2;
+                       break;
+               case ALGORITHM_PARITY_N:
+                       break;
+               case ALGORITHM_ROTATING_N_CONTINUE:
+                       if (sh->pd_idx == 0)
+                               i--;    /* P D D D Q */
+                       else if (i > sh->pd_idx)
+                               i -= 2; /* D D Q P D */
+                       break;
+               case ALGORITHM_LEFT_ASYMMETRIC_6:
+               case ALGORITHM_RIGHT_ASYMMETRIC_6:
+                       if (i > sh->pd_idx)
+                               i--;
+                       break;
+               case ALGORITHM_LEFT_SYMMETRIC_6:
+               case ALGORITHM_RIGHT_SYMMETRIC_6:
+                       if (i < sh->pd_idx)
+                               i += data_disks + 1;
+                       i -= (sh->pd_idx + 1);
+                       break;
+               case ALGORITHM_PARITY_0_6:
+                       i -= 1;
+                       break;
                default:
                        printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-                              conf->algorithm);
+                              algorithm);
+                       BUG();
                }
                break;
        }
@@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
        chunk_number = stripe * data_disks + i;
        r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
 
-       check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
-       if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+       check = raid5_compute_sector(conf, r_sector,
+                                    previous, &dummy1, &sh2);
+       if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
+               || sh2.qd_idx != sh->qd_idx) {
                printk(KERN_ERR "compute_blocknr: map not correct\n");
                return 0;
        }
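
[Editor's note] compute_blocknr() is the inverse of raid5_compute_sector(), and the hunk above keeps the old self-check: the reconstructed array sector is mapped forward again and the sector, pd_idx and (now) qd_idx must all come back unchanged, otherwise "map not correct" is reported. A toy round-trip of that idea for one simple case (RAID5 LEFT_ASYMMETRIC, 1-sector chunks; constants and names invented):

#include <stdio.h>

#define RAID_DISKS 5
#define DATA_DISKS (RAID_DISKS - 1)

/* Forward map: array sector -> (stripe, physical data disk, parity disk). */
static long to_stripe(long r_sector, int *dd_idx, int *pd_idx)
{
    long stripe = r_sector / DATA_DISKS;

    *dd_idx = r_sector % DATA_DISKS;
    *pd_idx = DATA_DISKS - stripe % RAID_DISKS;
    if (*dd_idx >= *pd_idx)
        (*dd_idx)++;                      /* skip over the parity device */
    return stripe;
}

/* Inverse map, in the spirit of compute_blocknr(): physical disk -> sector. */
static long to_sector(long stripe, int i, int pd_idx)
{
    if (i > pd_idx)
        i--;                              /* undo the skip over parity */
    return stripe * DATA_DISKS + i;
}

int main(void)
{
    for (long r = 0; r < 40; r++) {
        int dd, pd;
        long stripe = to_stripe(r, &dd, &pd);
        long back = to_sector(stripe, dd, pd);

        if (back != r) {                  /* the same "map not correct" check */
            fprintf(stderr, "round trip failed at %ld\n", r);
            return 1;
        }
    }
    printf("forward/inverse mapping agrees for the first 40 sectors\n");
    return 0;
}
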
@@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio,
 
 static void compute_parity6(struct stripe_head *sh, int method)
 {
-       raid6_conf_t *conf = sh->raid_conf;
-       int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+       raid5_conf_t *conf = sh->raid_conf;
+       int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+       int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
        struct bio *chosen;
        /**** FIX THIS: This could be very bad if disks is close to 256 ****/
-       void *ptrs[disks];
+       void *ptrs[syndrome_disks+2];
 
-       qd_idx = raid6_next_disk(pd_idx, disks);
-       d0_idx = raid6_next_disk(qd_idx, disks);
+       pd_idx = sh->pd_idx;
+       qd_idx = sh->qd_idx;
+       d0_idx = raid6_d0(sh);
 
        pr_debug("compute_parity, stripe %llu, method %d\n",
                (unsigned long long)sh->sector, method);
@@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method)
                        set_bit(R5_UPTODATE, &sh->dev[i].flags);
                }
 
-//     switch(method) {
-//     case RECONSTRUCT_WRITE:
-//     case CHECK_PARITY:
-//     case UPDATE_PARITY:
-               /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
-               /* FIX: Is this ordering of drives even remotely optimal? */
-               count = 0;
-               i = d0_idx;
-               do {
-                       ptrs[count++] = page_address(sh->dev[i].page);
-                       if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-                               printk("block %d/%d not uptodate on parity calc\n", i,count);
-                       i = raid6_next_disk(i, disks);
-               } while ( i != d0_idx );
-//             break;
-//     }
-
-       raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
+       /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
+
+       for (i = 0; i < disks; i++)
+               ptrs[i] = (void *)raid6_empty_zero_page;
+
+       count = 0;
+       i = d0_idx;
+       do {
+               int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+               ptrs[slot] = page_address(sh->dev[i].page);
+               if (slot < syndrome_disks &&
+                   !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
+                       printk(KERN_ERR "block %d/%d not uptodate "
+                              "on parity calc\n", i, count);
+                       BUG();
+               }
+
+               i = raid6_next_disk(i, disks);
+       } while (i != d0_idx);
+       BUG_ON(count != syndrome_disks);
+
+       raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
 
        switch(method) {
        case RECONSTRUCT_WRITE:
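
[Editor's note] compute_parity6() now fills the syndrome pointer table by slot rather than by raw device order, seeding every slot with raid6_empty_zero_page first so that layouts whose syndrome spans all devices (DDF) see zeroes for slots that have no real device behind them. A hedged user-space sketch of building such a slot-ordered table, with a plain XOR standing in for the real gen_syndrome() math:

#include <stdio.h>
#include <string.h>

#define STRIPE_SIZE     16        /* toy "page" size            */
#define SYNDROME_DISKS  4         /* data slots 0..3, P=4, Q=5  */

static const unsigned char zero_page[STRIPE_SIZE];   /* stands in for raid6_empty_zero_page */

int main(void)
{
    unsigned char data[2][STRIPE_SIZE];   /* only two real data devices present */
    unsigned char p[STRIPE_SIZE], q[STRIPE_SIZE];
    const unsigned char *ptrs[SYNDROME_DISKS + 2];
    int slot, i;

    memset(data[0], 0xaa, STRIPE_SIZE);
    memset(data[1], 0x55, STRIPE_SIZE);

    /* 1. every slot starts out pointing at the shared zero page ...    */
    for (slot = 0; slot < SYNDROME_DISKS + 2; slot++)
        ptrs[slot] = zero_page;

    /* 2. ... then slots that have a real device are filled in, and     */
    /*    P and Q always occupy the last two slots.                     */
    ptrs[0] = data[0];
    ptrs[1] = data[1];
    ptrs[SYNDROME_DISKS]     = p;
    ptrs[SYNDROME_DISKS + 1] = q;

    /* 3. a plain XOR stands in for raid6_call.gen_syndrome() here: the */
    /*    zero-page slots contribute nothing, exactly as intended.      */
    memset(p, 0, STRIPE_SIZE);
    for (slot = 0; slot < SYNDROME_DISKS; slot++)
        for (i = 0; i < STRIPE_SIZE; i++)
            p[i] ^= ptrs[slot][i];

    printf("P[0] = %#x (0xaa ^ 0x55 = 0xff)\n", p[0]);
    return 0;
}
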
@@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
        int i, count, disks = sh->disks;
        void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-       int pd_idx = sh->pd_idx;
-       int qd_idx = raid6_next_disk(pd_idx, disks);
+       int qd_idx = sh->qd_idx;
 
        pr_debug("compute_block_1, stripe %llu, idx %d\n",
                (unsigned long long)sh->sector, dd_idx);
@@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 {
        int i, count, disks = sh->disks;
-       int pd_idx = sh->pd_idx;
-       int qd_idx = raid6_next_disk(pd_idx, disks);
-       int d0_idx = raid6_next_disk(qd_idx, disks);
-       int faila, failb;
+       int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+       int d0_idx = raid6_d0(sh);
+       int faila = -1, failb = -1;
+       /**** FIX THIS: This could be very bad if disks is close to 256 ****/
+       void *ptrs[syndrome_disks+2];
 
-       /* faila and failb are disk numbers relative to d0_idx */
-       /* pd_idx become disks-2 and qd_idx become disks-1 */
-       faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
-       failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
+       for (i = 0; i < disks ; i++)
+               ptrs[i] = (void *)raid6_empty_zero_page;
+       count = 0;
+       i = d0_idx;
+       do {
+               int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+               ptrs[slot] = page_address(sh->dev[i].page);
+
+               if (i == dd_idx1)
+                       faila = slot;
+               if (i == dd_idx2)
+                       failb = slot;
+               i = raid6_next_disk(i, disks);
+       } while (i != d0_idx);
+       BUG_ON(count != syndrome_disks);
 
        BUG_ON(faila == failb);
        if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
 
        pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
-              (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
+                (unsigned long long)sh->sector, dd_idx1, dd_idx2,
+                faila, failb);
 
-       if ( failb == disks-1 ) {
+       if (failb == syndrome_disks+1) {
                /* Q disk is one of the missing disks */
-               if ( faila == disks-2 ) {
+               if (faila == syndrome_disks) {
                        /* Missing P+Q, just recompute */
                        compute_parity6(sh, UPDATE_PARITY);
                        return;
                } else {
                        /* We're missing D+Q; recompute D from P */
-                       compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
+                       compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
+                                            dd_idx2 : dd_idx1),
+                                       0);
                        compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
                        return;
                }
        }
 
-       /* We're missing D+P or D+D; build pointer table */
-       {
-               /**** FIX THIS: This could be very bad if disks is close to 256 ****/
-               void *ptrs[disks];
-
-               count = 0;
-               i = d0_idx;
-               do {
-                       ptrs[count++] = page_address(sh->dev[i].page);
-                       i = raid6_next_disk(i, disks);
-                       if (i != dd_idx1 && i != dd_idx2 &&
-                           !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-                               printk("compute_2 with missing block %d/%d\n", count, i);
-               } while ( i != d0_idx );
-
-               if ( failb == disks-2 ) {
-                       /* We're missing D+P. */
-                       raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
-               } else {
-                       /* We're missing D+D. */
-                       raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
-               }
-
-               /* Both the above update both missing blocks */
-               set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
-               set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
+       /* We're missing D+P or D+D; */
+       if (failb == syndrome_disks) {
+               /* We're missing D+P. */
+               raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
+       } else {
+               /* We're missing D+D. */
+               raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
+                                 ptrs);
        }
+
+       /* Both the above update both missing blocks */
+       set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
+       set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
 }
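
[Editor's note] compute_block_2() now classifies the two failed devices by syndrome slot: slot N+1 is Q, slot N is P, anything lower is data, and the four combinations select between recomputing both parities, rebuilding D from P, raid6_datap_recov() and raid6_2data_recov(). A small sketch of that decision alone (slot numbers only; the actual recovery math stays in lib/raid6):

#include <stdio.h>

/* Classify a double failure by syndrome slot, as compute_block_2() does:
 * data slots are 0..N-1, P is slot N, Q is slot N+1 (faila < failb).    */
static const char *classify(int faila, int failb, int syndrome_disks)
{
    if (failb == syndrome_disks + 1) {
        if (faila == syndrome_disks)
            return "P+Q missing: recompute both parities";
        return "D+Q missing: rebuild D from P, then recompute Q";
    }
    if (failb == syndrome_disks)
        return "D+P missing: raid6_datap_recov()";
    return "D+D missing: raid6_2data_recov()";
}

int main(void)
{
    int n = 4;   /* 4 data slots, P = slot 4, Q = slot 5 */

    printf("%s\n", classify(4, 5, n));
    printf("%s\n", classify(2, 5, n));
    printf("%s\n", classify(1, 4, n));
    printf("%s\n", classify(0, 3, n));
    return 0;
}
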
 
 static void
@@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p)
                memcmp(a, a+4, STRIPE_SIZE-4)==0);
 }
 
-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+                           struct stripe_head *sh)
 {
-       int sectors_per_chunk = conf->chunk_size >> 9;
-       int pd_idx, dd_idx;
+       int sectors_per_chunk =
+               previous ? (conf->prev_chunk >> 9)
+                        : (conf->chunk_size >> 9);
+       int dd_idx;
        int chunk_offset = sector_div(stripe, sectors_per_chunk);
+       int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 
-       raid5_compute_sector(stripe * (disks - conf->max_degraded)
+       raid5_compute_sector(conf,
+                            stripe * (disks - conf->max_degraded)
                             *sectors_per_chunk + chunk_offset,
-                            disks, disks - conf->max_degraded,
-                            &dd_idx, &pd_idx, conf);
-       return pd_idx;
+                            previous,
+                            &dd_idx, sh);
 }
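
[Editor's note] stripe_set_idx() replaces stripe_to_pdidx(): instead of returning a parity index it reconstructs the first logical sector of the stripe, feeds it back through raid5_compute_sector() and lets that call fill in pd_idx/qd_idx on the stripe_head. A toy version of the same trick for a single layout (RAID5 LEFT_ASYMMETRIC; constants and the function name are invented):

#include <stdio.h>

#define RAID_DISKS 5
#define DATA_DISKS (RAID_DISKS - 1)
#define SECTORS_PER_CHUNK 8

/* To learn where P lives for a given stripe address, map the stripe's
 * first logical sector through the forward mapping and keep only the
 * parity index it computes.                                             */
static int parity_disk_of(unsigned long long stripe_sector)
{
    unsigned long long chunk_number, stripe;
    unsigned int chunk_offset = stripe_sector % SECTORS_PER_CHUNK;
    unsigned long long logical =
        (stripe_sector / SECTORS_PER_CHUNK) * DATA_DISKS * SECTORS_PER_CHUNK
        + chunk_offset;

    chunk_number = logical / SECTORS_PER_CHUNK;
    stripe = chunk_number / DATA_DISKS;
    return DATA_DISKS - (int)(stripe % RAID_DISKS);   /* LEFT_ASYMMETRIC */
}

int main(void)
{
    for (unsigned long long s = 0; s < 5 * SECTORS_PER_CHUNK; s += SECTORS_PER_CHUNK)
        printf("stripe sector %llu -> P on device %d\n", s, parity_disk_of(s));
    return 0;
}
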
 
 static void
@@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf,
                struct r6_state *r6s, int disks)
 {
        int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
-       int qd_idx = r6s->qd_idx;
+       int qd_idx = sh->qd_idx;
        for (i = disks; i--; ) {
                struct r5dev *dev = &sh->dev[i];
                /* Would I have to read this buffer for reconstruct_write */
@@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
        int update_p = 0, update_q = 0;
        struct r5dev *dev;
        int pd_idx = sh->pd_idx;
-       int qd_idx = r6s->qd_idx;
+       int qd_idx = sh->qd_idx;
 
        set_bit(STRIPE_HANDLE, &sh->state);
 
@@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
        struct dma_async_tx_descriptor *tx = NULL;
        clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
        for (i = 0; i < sh->disks; i++)
-               if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
-                       int dd_idx, pd_idx, j;
+               if (i != sh->pd_idx && i != sh->qd_idx) {
+                       int dd_idx, j;
                        struct stripe_head *sh2;
 
-                       sector_t bn = compute_blocknr(sh, i);
-                       sector_t s = raid5_compute_sector(bn, conf->raid_disks,
-                                               conf->raid_disks -
-                                               conf->max_degraded, &dd_idx,
-                                               &pd_idx, conf);
-                       sh2 = get_active_stripe(conf, s, conf->raid_disks,
-                                               pd_idx, 1);
+                       sector_t bn = compute_blocknr(sh, i, 1);
+                       sector_t s = raid5_compute_sector(conf, bn, 0,
+                                                         &dd_idx, NULL);
+                       sh2 = get_active_stripe(conf, s, 0, 1);
                        if (sh2 == NULL)
                                /* so far only the early blocks of this stripe
                                 * have been requested.  When later blocks
@@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
                        set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
                        for (j = 0; j < conf->raid_disks; j++)
                                if (j != sh2->pd_idx &&
-                                   (!r6s || j != raid6_next_disk(sh2->pd_idx,
-                                                                sh2->disks)) &&
+                                   (!r6s || j != sh2->qd_idx) &&
                                    !test_bit(R5_Expanded, &sh2->dev[j].flags))
                                        break;
                        if (j == conf->raid_disks) {
@@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh)
 
        /* Finish reconstruct operations initiated by the expansion process */
        if (sh->reconstruct_state == reconstruct_state_result) {
+               struct stripe_head *sh2
+                       = get_active_stripe(conf, sh->sector, 1, 1);
+               if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+                       /* sh cannot be written until sh2 has been read.
+                        * so arrange for sh to be delayed a little
+                        */
+                       set_bit(STRIPE_DELAYED, &sh->state);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+                                             &sh2->state))
+                               atomic_inc(&conf->preread_active_stripes);
+                       release_stripe(sh2);
+                       goto unlock;
+               }
+               if (sh2)
+                       release_stripe(sh2);
+
                sh->reconstruct_state = reconstruct_state_idle;
                clear_bit(STRIPE_EXPANDING, &sh->state);
                for (i = conf->raid_disks; i--; ) {
@@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh)
            !sh->reconstruct_state) {
                /* Need to write out all blocks after computing parity */
                sh->disks = conf->raid_disks;
-               sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
-                       conf->raid_disks);
+               stripe_set_idx(sh->sector, conf, 0, sh);
                schedule_reconstruction5(sh, &s, 1, 1);
        } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
                clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh)
 
 static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
-       raid6_conf_t *conf = sh->raid_conf;
+       raid5_conf_t *conf = sh->raid_conf;
        int disks = sh->disks;
        struct bio *return_bi = NULL;
-       int i, pd_idx = sh->pd_idx;
+       int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
        struct stripe_head_state s;
        struct r6_state r6s;
        struct r5dev *dev, *pdev, *qdev;
        mdk_rdev_t *blocked_rdev = NULL;
 
-       r6s.qd_idx = raid6_next_disk(pd_idx, disks);
        pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
                "pd_idx=%d, qd_idx=%d\n",
               (unsigned long long)sh->sector, sh->state,
-              atomic_read(&sh->count), pd_idx, r6s.qd_idx);
+              atomic_read(&sh->count), pd_idx, qd_idx);
        memset(&s, 0, sizeof(s));
 
        spin_lock(&sh->lock);
@@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
        pdev = &sh->dev[pd_idx];
        r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
                || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
-       qdev = &sh->dev[r6s.qd_idx];
-       r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
-               || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
+       qdev = &sh->dev[qd_idx];
+       r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
+               || (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
 
        if ( s.written &&
             ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
@@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                }
 
        if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+               struct stripe_head *sh2
+                       = get_active_stripe(conf, sh->sector, 1, 1);
+               if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+                       /* sh cannot be written until sh2 has been read.
+                        * so arrange for sh to be delayed a little
+                        */
+                       set_bit(STRIPE_DELAYED, &sh->state);
+                       set_bit(STRIPE_HANDLE, &sh->state);
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+                                             &sh2->state))
+                               atomic_inc(&conf->preread_active_stripes);
+                       release_stripe(sh2);
+                       goto unlock;
+               }
+               if (sh2)
+                       release_stripe(sh2);
+
                /* Need to write out all blocks after computing P&Q */
                sh->disks = conf->raid_disks;
-               sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
-                                            conf->raid_disks);
+               stripe_set_idx(sh->sector, conf, 0, sh);
                compute_parity6(sh, RECONSTRUCT_WRITE);
                for (i = conf->raid_disks ; i-- ;  ) {
                        set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q,
        if ((bvm->bi_rw & 1) == WRITE)
                return biovec->bv_len; /* always allow writes to be mergeable */
 
+       if (mddev->new_chunk < mddev->chunk_size)
+               chunk_sectors = mddev->new_chunk >> 9;
        max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
        if (max < 0) max = 0;
        if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
        unsigned int chunk_sectors = mddev->chunk_size >> 9;
        unsigned int bio_sectors = bio->bi_size >> 9;
 
+       if (mddev->new_chunk < mddev->chunk_size)
+               chunk_sectors = mddev->new_chunk >> 9;
        return  chunk_sectors >=
                ((sector & (chunk_sectors - 1)) + bio_sectors);
 }
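
[Editor's note] Both of the hunks above clamp the chunk size used for merge and alignment decisions to the smaller of the old and new chunk sizes, so that while a reshape is changing the chunk size no merged or aligned read can straddle a chunk boundary in either geometry. A hedged sketch of the boundary test with that substitution:

#include <stdio.h>

/* Is [sector, sector + bio_sectors) wholly inside one chunk?  During a
 * chunk-size reshape the smaller of the two chunk sizes is the safe one. */
static int fits_one_chunk(unsigned long long sector, unsigned int bio_sectors,
                          unsigned int chunk_size, unsigned int new_chunk)
{
    unsigned int chunk_sectors = chunk_size >> 9;

    if (new_chunk < chunk_size)
        chunk_sectors = new_chunk >> 9;
    return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors);
}

int main(void)
{
    /* old chunk 128K, shrinking to 64K: a 96K read from sector 0 fits the
     * old chunk but not the new one, so it is rejected during the reshape. */
    printf("%d\n", fits_one_chunk(0, 192, 128 * 1024, 128 * 1024)); /* 1 */
    printf("%d\n", fits_one_chunk(0, 192, 128 * 1024,  64 * 1024)); /* 0 */
    return 0;
}
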
@@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
        mddev_t *mddev = q->queuedata;
        raid5_conf_t *conf = mddev_to_conf(mddev);
-       const unsigned int raid_disks = conf->raid_disks;
-       const unsigned int data_disks = raid_disks - conf->max_degraded;
-       unsigned int dd_idx, pd_idx;
+       unsigned int dd_idx;
        struct bio* align_bi;
        mdk_rdev_t *rdev;
 
@@ -3266,7 +3492,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
                return 0;
        }
        /*
-        * use bio_clone to make a copy of the bio
+        * use bio_clone to make a copy of the bio
         */
        align_bi = bio_clone(raid_bio, GFP_NOIO);
        if (!align_bi)
@@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
        /*
         *      compute position
         */
-       align_bi->bi_sector =  raid5_compute_sector(raid_bio->bi_sector,
-                                       raid_disks,
-                                       data_disks,
-                                       &dd_idx,
-                                       &pd_idx,
-                                       conf);
+       align_bi->bi_sector =  raid5_compute_sector(conf, raid_bio->bi_sector,
+                                                   0,
+                                                   &dd_idx, NULL);
 
        rcu_read_lock();
        rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 {
        mddev_t *mddev = q->queuedata;
        raid5_conf_t *conf = mddev_to_conf(mddev);
-       unsigned int dd_idx, pd_idx;
+       int dd_idx;
        sector_t new_sector;
        sector_t logical_sector, last_sector;
        struct stripe_head *sh;
@@ -3400,7 +3623,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
        if (rw == READ &&
             mddev->reshape_position == MaxSector &&
             chunk_aligned_read(q,bi))
-               return 0;
+               return 0;
 
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        last_sector = bi->bi_sector + (bi->bi_size>>9);
@@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi)
        for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
                DEFINE_WAIT(w);
                int disks, data_disks;
+               int previous;
 
        retry:
+               previous = 0;
+               disks = conf->raid_disks;
                prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-               if (likely(conf->expand_progress == MaxSector))
-                       disks = conf->raid_disks;
-               else {
-                       /* spinlock is needed as expand_progress may be
+               if (unlikely(conf->reshape_progress != MaxSector)) {
+                       /* spinlock is needed as reshape_progress may be
                         * 64bit on a 32bit platform, and so it might be
                         * possible to see a half-updated value
-                        * Ofcourse expand_progress could change after
+                        * Of course reshape_progress could change after
                         * the lock is dropped, so once we get a reference
                         * to the stripe that we think it is, we will have
                         * to check again.
                         */
                        spin_lock_irq(&conf->device_lock);
-                       disks = conf->raid_disks;
-                       if (logical_sector >= conf->expand_progress)
+                       if (mddev->delta_disks < 0
+                           ? logical_sector < conf->reshape_progress
+                           : logical_sector >= conf->reshape_progress) {
                                disks = conf->previous_raid_disks;
-                       else {
-                               if (logical_sector >= conf->expand_lo) {
+                               previous = 1;
+                       } else {
+                               if (mddev->delta_disks < 0
+                                   ? logical_sector < conf->reshape_safe
+                                   : logical_sector >= conf->reshape_safe) {
                                        spin_unlock_irq(&conf->device_lock);
                                        schedule();
                                        goto retry;
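
[Editor's note] The hunk above decides per bio whether the old or the new geometry applies: a growing array reshapes from low to high addresses, so sectors at or beyond reshape_progress are still in the old layout, while a shrinking array reshapes backwards and the comparison flips. A small sketch of that predicate (the function name is invented):

#include <stdio.h>

/* Should this logical sector still be mapped with the pre-reshape geometry? */
static int use_previous_geometry(int delta_disks,
                                 unsigned long long logical_sector,
                                 unsigned long long reshape_progress)
{
    if (delta_disks < 0)
        return logical_sector < reshape_progress;   /* shrinking: runs backwards */
    return logical_sector >= reshape_progress;      /* growing: runs forwards    */
}

int main(void)
{
    unsigned long long progress = 1 << 20;

    printf("grow,   sector below progress  -> previous=%d\n",
           use_previous_geometry(+2, progress - 1, progress));
    printf("grow,   sector beyond progress -> previous=%d\n",
           use_previous_geometry(+2, progress + 1, progress));
    printf("shrink, sector below progress  -> previous=%d\n",
           use_previous_geometry(-2, progress - 1, progress));
    return 0;
}
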
@@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi)
                }
                data_disks = disks - conf->max_degraded;
 
-               new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
-                                                 &dd_idx, &pd_idx, conf);
+               new_sector = raid5_compute_sector(conf, logical_sector,
+                                                 previous,
+                                                 &dd_idx, NULL);
                pr_debug("raid5: make_request, sector %llu logical %llu\n",
                        (unsigned long long)new_sector, 
                        (unsigned long long)logical_sector);
 
-               sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
+               sh = get_active_stripe(conf, new_sector, previous,
+                                      (bi->bi_rw&RWA_MASK));
                if (sh) {
-                       if (unlikely(conf->expand_progress != MaxSector)) {
+                       if (unlikely(previous)) {
                                /* expansion might have moved on while waiting for a
                                 * stripe, so we must do the range check again.
                                 * Expansion could still move past after this
@@ -3458,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
                                 */
                                int must_retry = 0;
                                spin_lock_irq(&conf->device_lock);
-                               if (logical_sector <  conf->expand_progress &&
-                                   disks == conf->previous_raid_disks)
+                               if (mddev->delta_disks < 0
+                                   ? logical_sector >= conf->reshape_progress
+                                   : logical_sector < conf->reshape_progress)
                                        /* mismatch, need to try again */
                                        must_retry = 1;
                                spin_unlock_irq(&conf->device_lock);
@@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
        return 0;
 }
 
+static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
+
 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
 {
        /* reshaping is quite different to recovery/resync so it is
@@ -3527,61 +3760,118 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
         */
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
        struct stripe_head *sh;
-       int pd_idx;
        sector_t first_sector, last_sector;
        int raid_disks = conf->previous_raid_disks;
        int data_disks = raid_disks - conf->max_degraded;
        int new_data_disks = conf->raid_disks - conf->max_degraded;
        int i;
        int dd_idx;
-       sector_t writepos, safepos, gap;
-
-       if (sector_nr == 0 &&
-           conf->expand_progress != 0) {
-               /* restarting in the middle, skip the initial sectors */
-               sector_nr = conf->expand_progress;
+       sector_t writepos, readpos, safepos;
+       sector_t stripe_addr;
+       int reshape_sectors;
+       struct list_head stripes;
+
+       if (sector_nr == 0) {
+               /* If restarting in the middle, skip the initial sectors */
+               if (mddev->delta_disks < 0 &&
+                   conf->reshape_progress < raid5_size(mddev, 0, 0)) {
+                       sector_nr = raid5_size(mddev, 0, 0)
+                               - conf->reshape_progress;
+               } else if (mddev->delta_disks > 0 &&
+                          conf->reshape_progress > 0)
+                       sector_nr = conf->reshape_progress;
                sector_div(sector_nr, new_data_disks);
-               *skipped = 1;
-               return sector_nr;
+               if (sector_nr) {
+                       *skipped = 1;
+                       return sector_nr;
+               }
        }
 
+       /* We need to process a full chunk at a time.
+        * If old and new chunk sizes differ, we need to process the
+        * largest of these
+        */
+       if (mddev->new_chunk > mddev->chunk_size)
+               reshape_sectors = mddev->new_chunk / 512;
+       else
+               reshape_sectors = mddev->chunk_size / 512;
+
        /* we update the metadata when there is more than 3Meg
         * in the block range (that is rather arbitrary, should
         * probably be time based) or when the data about to be
         * copied would over-write the source of the data at
         * the front of the range.
-        * i.e. one new_stripe forward from expand_progress new_maps
-        * to after where expand_lo old_maps to
+        * i.e. one new_stripe along from reshape_progress new_maps
+        * to after where reshape_safe old_maps to
         */
-       writepos = conf->expand_progress +
-               conf->chunk_size/512*(new_data_disks);
+       writepos = conf->reshape_progress;
        sector_div(writepos, new_data_disks);
-       safepos = conf->expand_lo;
+       readpos = conf->reshape_progress;
+       sector_div(readpos, data_disks);
+       safepos = conf->reshape_safe;
        sector_div(safepos, data_disks);
-       gap = conf->expand_progress - conf->expand_lo;
+       if (mddev->delta_disks < 0) {
+               writepos -= reshape_sectors;
+               readpos += reshape_sectors;
+               safepos += reshape_sectors;
+       } else {
+               writepos += reshape_sectors;
+               readpos -= reshape_sectors;
+               safepos -= reshape_sectors;
+       }
 
-       if (writepos >= safepos ||
-           gap > (new_data_disks)*3000*2 /*3Meg*/) {
+       /* 'writepos' is the most advanced device address we might write.
+        * 'readpos' is the least advanced device address we might read.
+        * 'safepos' is the least address recorded in the metadata as having
+        *     been reshaped.
+        * If 'readpos' is behind 'writepos', then there is no way that we can
+        * ensure safety in the face of a crash - that must be done by userspace
+        * making a backup of the data.  So in that case there is no particular
+        * rush to update metadata.
+        * Otherwise if 'safepos' is behind 'writepos', then we really need to
+        * update the metadata to advance 'safepos' to match 'readpos' so that
+        * we can be safe in the event of a crash.
+        * So we insist on updating metadata if safepos is behind writepos and
+        * readpos is beyond writepos.
+        * In any case, update the metadata every 10 seconds.
+        * Maybe that number should be configurable, but I'm not sure it is
+        * worth it.... maybe it could be a multiple of safemode_delay???
+        */
+       if ((mddev->delta_disks < 0
+            ? (safepos > writepos && readpos < writepos)
+            : (safepos < writepos && readpos > writepos)) ||
+           time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
                /* Cannot proceed until we've updated the superblock... */
                wait_event(conf->wait_for_overlap,
                           atomic_read(&conf->reshape_stripes)==0);
-               mddev->reshape_position = conf->expand_progress;
+               mddev->reshape_position = conf->reshape_progress;
+               conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait, mddev->flags == 0 ||
                           kthread_should_stop());
                spin_lock_irq(&conf->device_lock);
-               conf->expand_lo = mddev->reshape_position;
+               conf->reshape_safe = mddev->reshape_position;
                spin_unlock_irq(&conf->device_lock);
                wake_up(&conf->wait_for_overlap);
        }
 
-       for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+       if (mddev->delta_disks < 0) {
+               BUG_ON(conf->reshape_progress == 0);
+               stripe_addr = writepos;
+               BUG_ON((mddev->dev_sectors &
+                       ~((sector_t)reshape_sectors - 1))
+                      - reshape_sectors - stripe_addr
+                      != sector_nr);
+       } else {
+               BUG_ON(writepos != sector_nr + reshape_sectors);
+               stripe_addr = sector_nr;
+       }
+       INIT_LIST_HEAD(&stripes);
+       for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
                int j;
                int skipped = 0;
-               pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
-               sh = get_active_stripe(conf, sector_nr+i,
-                                      conf->raid_disks, pd_idx, 0);
+               sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
                set_bit(STRIPE_EXPANDING, &sh->state);
                atomic_inc(&conf->reshape_stripes);
                /* If any of this stripe is beyond the end of the old
@@ -3592,10 +3882,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                        if (j == sh->pd_idx)
                                continue;
                        if (conf->level == 6 &&
-                           j == raid6_next_disk(sh->pd_idx, sh->disks))
+                           j == sh->qd_idx)
                                continue;
-                       s = compute_blocknr(sh, j);
-                       if (s < mddev->array_sectors) {
+                       s = compute_blocknr(sh, j, 0);
+                       if (s < raid5_size(mddev, 0, 0)) {
                                skipped = 1;
                                continue;
                        }
@@ -3607,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
                        set_bit(STRIPE_EXPAND_READY, &sh->state);
                        set_bit(STRIPE_HANDLE, &sh->state);
                }
-               release_stripe(sh);
+               list_add(&sh->lru, &stripes);
        }
        spin_lock_irq(&conf->device_lock);
-       conf->expand_progress = (sector_nr + i) * new_data_disks;
+       if (mddev->delta_disks < 0)
+               conf->reshape_progress -= reshape_sectors * new_data_disks;
+       else
+               conf->reshape_progress += reshape_sectors * new_data_disks;
        spin_unlock_irq(&conf->device_lock);
        /* Ok, those stripe are ready. We can start scheduling
         * reads on the source stripes.
@@ -3618,46 +3911,50 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
         * block on the destination stripes.
         */
        first_sector =
-               raid5_compute_sector(sector_nr*(new_data_disks),
-                                    raid_disks, data_disks,
-                                    &dd_idx, &pd_idx, conf);
+               raid5_compute_sector(conf, stripe_addr*(new_data_disks),
+                                    1, &dd_idx, NULL);
        last_sector =
-               raid5_compute_sector((sector_nr+conf->chunk_size/512)
-                                    *(new_data_disks) -1,
-                                    raid_disks, data_disks,
-                                    &dd_idx, &pd_idx, conf);
-       if (last_sector >= (mddev->size<<1))
-               last_sector = (mddev->size<<1)-1;
+               raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
+                                           *(new_data_disks) - 1),
+                                    1, &dd_idx, NULL);
+       if (last_sector >= mddev->dev_sectors)
+               last_sector = mddev->dev_sectors - 1;
        while (first_sector <= last_sector) {
-               pd_idx = stripe_to_pdidx(first_sector, conf,
-                                        conf->previous_raid_disks);
-               sh = get_active_stripe(conf, first_sector,
-                                      conf->previous_raid_disks, pd_idx, 0);
+               sh = get_active_stripe(conf, first_sector, 1, 0);
                set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
                set_bit(STRIPE_HANDLE, &sh->state);
                release_stripe(sh);
                first_sector += STRIPE_SECTORS;
        }
+       /* Now that the sources are clearly marked, we can release
+        * the destination stripes
+        */
+       while (!list_empty(&stripes)) {
+               sh = list_entry(stripes.next, struct stripe_head, lru);
+               list_del_init(&sh->lru);
+               release_stripe(sh);
+       }
        /* If this takes us to the resync_max point where we have to pause,
         * then we need to write out the superblock.
         */
-       sector_nr += conf->chunk_size>>9;
+       sector_nr += reshape_sectors;
        if (sector_nr >= mddev->resync_max) {
                /* Cannot proceed until we've updated the superblock... */
                wait_event(conf->wait_for_overlap,
                           atomic_read(&conf->reshape_stripes) == 0);
-               mddev->reshape_position = conf->expand_progress;
+               mddev->reshape_position = conf->reshape_progress;
+               conf->reshape_checkpoint = jiffies;
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
                md_wakeup_thread(mddev->thread);
                wait_event(mddev->sb_wait,
                           !test_bit(MD_CHANGE_DEVS, &mddev->flags)
                           || kthread_should_stop());
                spin_lock_irq(&conf->device_lock);
-               conf->expand_lo = mddev->reshape_position;
+               conf->reshape_safe = mddev->reshape_position;
                spin_unlock_irq(&conf->device_lock);
                wake_up(&conf->wait_for_overlap);
        }
-       return conf->chunk_size>>9;
+       return reshape_sectors;
 }
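
Editor's note: the hunks above replace the old expand_progress/expand_lo pair with reshape_progress/reshape_safe; the progress counter now moves in array sectors and can move downward when the array is shrinking (delta_disks < 0). A minimal userspace sketch of the per-pass update, with assumed numbers standing in for the values the driver computes:

/* Illustrative sketch only -- mirrors the bookkeeping, not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long long reshape_progress = 10240; /* assumed current position */
	unsigned long long reshape_sectors = 128;    /* one 64K chunk per device */
	int new_data_disks = 4;                      /* data disks in new geometry */
	int delta_disks = 1;                         /* growing, so progress moves up */

	if (delta_disks < 0)
		reshape_progress -= reshape_sectors * new_data_disks;
	else
		reshape_progress += reshape_sectors * new_data_disks;

	printf("reshape_progress: %llu array sectors\n", reshape_progress);
	return 0;
}
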
 
 /* FIXME go_faster isn't used */
@@ -3665,9 +3962,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 {
        raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
        struct stripe_head *sh;
-       int pd_idx;
-       int raid_disks = conf->raid_disks;
-       sector_t max_sector = mddev->size << 1;
+       sector_t max_sector = mddev->dev_sectors;
        int sync_blocks;
        int still_degraded = 0;
        int i;
@@ -3675,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
        if (sector_nr >= max_sector) {
                /* just being told to finish up .. nothing much to do */
                unplug_slaves(mddev);
+
                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
                        end_reshape(conf);
                        return 0;
@@ -3705,7 +4001,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
         */
        if (mddev->degraded >= conf->max_degraded &&
            test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-               sector_t rv = (mddev->size << 1) - sector_nr;
+               sector_t rv = mddev->dev_sectors - sector_nr;
                *skipped = 1;
                return rv;
        }
@@ -3721,10 +4017,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
        bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 
-       pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
-       sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
+       sh = get_active_stripe(conf, sector_nr, 0, 1);
        if (sh == NULL) {
-               sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
+               sh = get_active_stripe(conf, sector_nr, 0, 0);
                /* make sure we don't swamp the stripe cache if someone else
                 * is trying to get access
                 */
@@ -3766,19 +4061,15 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
         * there will be only one 'dd_idx' and we only need one call to raid5_compute_sector.
         */
        struct stripe_head *sh;
-       int dd_idx, pd_idx;
+       int dd_idx;
        sector_t sector, logical_sector, last_sector;
        int scnt = 0;
        int remaining;
        int handled = 0;
 
        logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-       sector = raid5_compute_sector(  logical_sector,
-                                       conf->raid_disks,
-                                       conf->raid_disks - conf->max_degraded,
-                                       &dd_idx,
-                                       &pd_idx,
-                                       conf);
+       sector = raid5_compute_sector(conf, logical_sector,
+                                     0, &dd_idx, NULL);
        last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
 
        for (; logical_sector < last_sector;
@@ -3790,7 +4081,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                        /* already done this stripe */
                        continue;
 
-               sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+               sh = get_active_stripe(conf, sector, 0, 1);
 
                if (!sh) {
                        /* failed to get a stripe - must wait */
@@ -3992,89 +4283,69 @@ static struct attribute_group raid5_attrs_group = {
        .attrs = raid5_attrs,
 };
 
-static int run(mddev_t *mddev)
+static sector_t
+raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+
+       if (!sectors)
+               sectors = mddev->dev_sectors;
+       if (!raid_disks) {
+               /* size is defined by the smallest of previous and new size */
+               if (conf->raid_disks < conf->previous_raid_disks)
+                       raid_disks = conf->raid_disks;
+               else
+                       raid_disks = conf->previous_raid_disks;
+       }
+
+       sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+       sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
+       return sectors * (raid_disks - conf->max_degraded);
+}
+
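
Editor's note: raid5_size() above rounds the per-device size down to a multiple of both the current and the new chunk size before multiplying by the data-disk count. A hedged userspace sketch of the same arithmetic with assumed sizes:

/* Illustrative only: the masking works because chunk sizes are powers of two. */
#include <stdio.h>

int main(void)
{
	unsigned long long dev_sectors = 1000005;      /* assumed per-device sectors */
	unsigned long long chunk = 65536 / 512;        /* 64K chunk -> 128 sectors */
	unsigned long long new_chunk = 131072 / 512;   /* 128K chunk -> 256 sectors */
	int raid_disks = 5, max_degraded = 1;          /* raid5: one parity device */

	dev_sectors &= ~(chunk - 1);
	dev_sectors &= ~(new_chunk - 1);
	printf("usable array size: %llu sectors\n",
	       dev_sectors * (raid_disks - max_degraded));
	return 0;
}
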
+static raid5_conf_t *setup_conf(mddev_t *mddev)
 {
        raid5_conf_t *conf;
        int raid_disk, memory;
        mdk_rdev_t *rdev;
        struct disk_info *disk;
-       int working_disks = 0;
 
-       if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) {
+       if (mddev->new_level != 5
+           && mddev->new_level != 4
+           && mddev->new_level != 6) {
                printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
-                      mdname(mddev), mddev->level);
-               return -EIO;
+                      mdname(mddev), mddev->new_level);
+               return ERR_PTR(-EIO);
        }
-
-       if (mddev->chunk_size < PAGE_SIZE) {
-               printk(KERN_ERR "md/raid5: chunk_size must be at least "
-                      "PAGE_SIZE but %d < %ld\n",
-                      mddev->chunk_size, PAGE_SIZE);
-               return -EINVAL;
+       if ((mddev->new_level == 5
+            && !algorithm_valid_raid5(mddev->new_layout)) ||
+           (mddev->new_level == 6
+            && !algorithm_valid_raid6(mddev->new_layout))) {
+               printk(KERN_ERR "raid5: %s: layout %d not supported\n",
+                      mdname(mddev), mddev->new_layout);
+               return ERR_PTR(-EIO);
        }
-
-       if (mddev->reshape_position != MaxSector) {
-               /* Check that we can continue the reshape.
-                * Currently only disks can change, it must
-                * increase, and we must be past the point where
-                * a stripe over-writes itself
-                */
-               sector_t here_new, here_old;
-               int old_disks;
-               int max_degraded = (mddev->level == 5 ? 1 : 2);
-
-               if (mddev->new_level != mddev->level ||
-                   mddev->new_layout != mddev->layout ||
-                   mddev->new_chunk != mddev->chunk_size) {
-                       printk(KERN_ERR "raid5: %s: unsupported reshape "
-                              "required - aborting.\n",
-                              mdname(mddev));
-                       return -EINVAL;
-               }
-               if (mddev->delta_disks <= 0) {
-                       printk(KERN_ERR "raid5: %s: unsupported reshape "
-                              "(reduce disks) required - aborting.\n",
-                              mdname(mddev));
-                       return -EINVAL;
-               }
-               old_disks = mddev->raid_disks - mddev->delta_disks;
-               /* reshape_position must be on a new-stripe boundary, and one
-                * further up in new geometry must map after here in old
-                * geometry.
-                */
-               here_new = mddev->reshape_position;
-               if (sector_div(here_new, (mddev->chunk_size>>9)*
-                              (mddev->raid_disks - max_degraded))) {
-                       printk(KERN_ERR "raid5: reshape_position not "
-                              "on a stripe boundary\n");
-                       return -EINVAL;
-               }
-               /* here_new is the stripe we will write to */
-               here_old = mddev->reshape_position;
-               sector_div(here_old, (mddev->chunk_size>>9)*
-                          (old_disks-max_degraded));
-               /* here_old is the first stripe that we might need to read
-                * from */
-               if (here_new >= here_old) {
-                       /* Reading from the same stripe as writing to - bad */
-                       printk(KERN_ERR "raid5: reshape_position too early for "
-                              "auto-recovery - aborting.\n");
-                       return -EINVAL;
-               }
-               printk(KERN_INFO "raid5: reshape will continue\n");
-               /* OK, we should be able to continue; */
+       if (mddev->new_level == 6 && mddev->raid_disks < 4) {
+               printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
+                      mdname(mddev), mddev->raid_disks);
+               return ERR_PTR(-EINVAL);
        }
 
+       if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
+               printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
+                       mddev->new_chunk, mdname(mddev));
+               return ERR_PTR(-EINVAL);
+       }
 
-       mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL);
-       if ((conf = mddev->private) == NULL)
+       conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
+       if (conf == NULL)
                goto abort;
-       if (mddev->reshape_position == MaxSector) {
-               conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
-       } else {
-               conf->raid_disks = mddev->raid_disks;
+
+       conf->raid_disks = mddev->raid_disks;
+       if (mddev->reshape_position == MaxSector)
+               conf->previous_raid_disks = mddev->raid_disks;
+       else
                conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
-       }
 
        conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
                              GFP_KERNEL);
@@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev)
        if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
                goto abort;
 
-       if (mddev->level == 6) {
+       if (mddev->new_level == 6) {
                conf->spare_page = alloc_page(GFP_KERNEL);
                if (!conf->spare_page)
                        goto abort;
        }
        spin_lock_init(&conf->device_lock);
-       mddev->queue->queue_lock = &conf->device_lock;
        init_waitqueue_head(&conf->wait_for_stripe);
        init_waitqueue_head(&conf->wait_for_overlap);
        INIT_LIST_HEAD(&conf->handle_list);
@@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev)
                        printk(KERN_INFO "raid5: device %s operational as raid"
                                " disk %d\n", bdevname(rdev->bdev,b),
                                raid_disk);
-                       working_disks++;
                } else
                        /* Cannot rely on bitmap to complete recovery */
                        conf->fullsync = 1;
        }
 
-       /*
-        * 0 for a fully functional array, 1 or 2 for a degraded array.
-        */
-       mddev->degraded = conf->raid_disks - working_disks;
-       conf->mddev = mddev;
-       conf->chunk_size = mddev->chunk_size;
-       conf->level = mddev->level;
+       conf->chunk_size = mddev->new_chunk;
+       conf->level = mddev->new_level;
        if (conf->level == 6)
                conf->max_degraded = 2;
        else
                conf->max_degraded = 1;
-       conf->algorithm = mddev->layout;
+       conf->algorithm = mddev->new_layout;
        conf->max_nr_stripes = NR_STRIPES;
-       conf->expand_progress = mddev->reshape_position;
-
-       /* device size must be a multiple of chunk size */
-       mddev->size &= ~(mddev->chunk_size/1024 -1);
-       mddev->resync_max_sectors = mddev->size << 1;
+       conf->reshape_progress = mddev->reshape_position;
+       if (conf->reshape_progress != MaxSector) {
+               conf->prev_chunk = mddev->chunk_size;
+               conf->prev_algo = mddev->layout;
+       }
 
-       if (conf->level == 6 && conf->raid_disks < 4) {
-               printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
-                      mdname(mddev), conf->raid_disks);
+       memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+                conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
+       if (grow_stripes(conf, conf->max_nr_stripes)) {
+               printk(KERN_ERR
+                       "raid5: couldn't allocate %dkB for buffers\n", memory);
                goto abort;
-       }
-       if (!conf->chunk_size || conf->chunk_size % 4) {
-               printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
-                       conf->chunk_size, mdname(mddev));
+       } else
+               printk(KERN_INFO "raid5: allocated %dkB for %s\n",
+                       memory, mdname(mddev));
+
+       conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
+       if (!conf->thread) {
+               printk(KERN_ERR
+                      "raid5: couldn't allocate thread for %s\n",
+                      mdname(mddev));
                goto abort;
        }
-       if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
-               printk(KERN_ERR 
-                       "raid5: unsupported parity algorithm %d for %s\n",
-                       conf->algorithm, mdname(mddev));
-               goto abort;
+
+       return conf;
+
+ abort:
+       if (conf) {
+               shrink_stripes(conf);
+               safe_put_page(conf->spare_page);
+               kfree(conf->disks);
+               kfree(conf->stripe_hashtbl);
+               kfree(conf);
+               return ERR_PTR(-EIO);
+       } else
+               return ERR_PTR(-ENOMEM);
+}
+
+static int run(mddev_t *mddev)
+{
+       raid5_conf_t *conf;
+       int working_disks = 0;
+       mdk_rdev_t *rdev;
+
+       if (mddev->reshape_position != MaxSector) {
+               /* Check that we can continue the reshape.
+                * Currently only disks can change, it must
+                * increase, and we must be past the point where
+                * a stripe over-writes itself
+                */
+               sector_t here_new, here_old;
+               int old_disks;
+               int max_degraded = (mddev->level == 6 ? 2 : 1);
+
+               if (mddev->new_level != mddev->level) {
+                       printk(KERN_ERR "raid5: %s: unsupported reshape "
+                              "required - aborting.\n",
+                              mdname(mddev));
+                       return -EINVAL;
+               }
+               old_disks = mddev->raid_disks - mddev->delta_disks;
+               /* reshape_position must be on a new-stripe boundary, and one
+                * further up in new geometry must map after here in old
+                * geometry.
+                */
+               here_new = mddev->reshape_position;
+               if (sector_div(here_new, (mddev->new_chunk>>9)*
+                              (mddev->raid_disks - max_degraded))) {
+                       printk(KERN_ERR "raid5: reshape_position not "
+                              "on a stripe boundary\n");
+                       return -EINVAL;
+               }
+               /* here_new is the stripe we will write to */
+               here_old = mddev->reshape_position;
+               sector_div(here_old, (mddev->chunk_size>>9)*
+                          (old_disks-max_degraded));
+               /* here_old is the first stripe that we might need to read
+                * from */
+               if (here_new >= here_old) {
+                       /* Reading from the same stripe as writing to - bad */
+                       printk(KERN_ERR "raid5: reshape_position too early for "
+                              "auto-recovery - aborting.\n");
+                       return -EINVAL;
+               }
+               printk(KERN_INFO "raid5: reshape will continue\n");
+               /* OK, we should be able to continue; */
+       } else {
+               BUG_ON(mddev->level != mddev->new_level);
+               BUG_ON(mddev->layout != mddev->new_layout);
+               BUG_ON(mddev->chunk_size != mddev->new_chunk);
+               BUG_ON(mddev->delta_disks != 0);
        }
+
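
Editor's note: the resume check above divides reshape_position by the stripe width (chunk sectors times data disks) of the new and of the old geometry; the reshape may only continue if the next stripe to be written (new geometry) still lies strictly below the first stripe that might need to be read (old geometry). A hedged sketch with assumed numbers, taking the chunk size as unchanged across the reshape:

/* Illustrative check mirroring the reshape-resume test (userspace sketch). */
#include <stdio.h>

int main(void)
{
	unsigned long long reshape_position = 1310720; /* assumed, in sectors */
	unsigned long long chunk_sectors = 128;        /* 64K chunk */
	int raid_disks = 6, delta_disks = 1, max_degraded = 1;
	int old_disks = raid_disks - delta_disks;

	unsigned long long here_new =
		reshape_position / (chunk_sectors * (raid_disks - max_degraded));
	unsigned long long here_old =
		reshape_position / (chunk_sectors * (old_disks - max_degraded));

	if (reshape_position % (chunk_sectors * (raid_disks - max_degraded)))
		printf("not on a stripe boundary\n");
	else if (here_new >= here_old)
		printf("too early for auto-recovery\n");
	else
		printf("reshape can continue (new stripe %llu < old stripe %llu)\n",
		       here_new, here_old);
	return 0;
}
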
+       if (mddev->private == NULL)
+               conf = setup_conf(mddev);
+       else
+               conf = mddev->private;
+
+       if (IS_ERR(conf))
+               return PTR_ERR(conf);
+
+       mddev->thread = conf->thread;
+       conf->thread = NULL;
+       mddev->private = conf;
+
+       /*
+        * 0 for a fully functional array, 1 or 2 for a degraded array.
+        */
+       list_for_each_entry(rdev, &mddev->disks, same_set)
+               if (rdev->raid_disk >= 0 &&
+                   test_bit(In_sync, &rdev->flags))
+                       working_disks++;
+
+       mddev->degraded = conf->raid_disks - working_disks;
+
        if (mddev->degraded > conf->max_degraded) {
                printk(KERN_ERR "raid5: not enough operational devices for %s"
                        " (%d/%d failed)\n",
@@ -4169,6 +4526,10 @@ static int run(mddev_t *mddev)
                goto abort;
        }
 
+       /* device size must be a multiple of chunk size */
+       mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
+       mddev->resync_max_sectors = mddev->dev_sectors;
+
        if (mddev->degraded > 0 &&
            mddev->recovery_cp != MaxSector) {
                if (mddev->ok_start_degraded)
@@ -4184,43 +4545,22 @@ static int run(mddev_t *mddev)
                }
        }
 
-       {
-               mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
-               if (!mddev->thread) {
-                       printk(KERN_ERR 
-                               "raid5: couldn't allocate thread for %s\n",
-                               mdname(mddev));
-                       goto abort;
-               }
-       }
-       memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
-                conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
-       if (grow_stripes(conf, conf->max_nr_stripes)) {
-               printk(KERN_ERR 
-                       "raid5: couldn't allocate %dkB for buffers\n", memory);
-               shrink_stripes(conf);
-               md_unregister_thread(mddev->thread);
-               goto abort;
-       } else
-               printk(KERN_INFO "raid5: allocated %dkB for %s\n",
-                       memory, mdname(mddev));
-
        if (mddev->degraded == 0)
                printk("raid5: raid level %d set %s active with %d out of %d"
-                       " devices, algorithm %d\n", conf->level, mdname(mddev), 
-                       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
-                       conf->algorithm);
+                      " devices, algorithm %d\n", conf->level, mdname(mddev),
+                      mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+                      mddev->new_layout);
        else
                printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
                        " out of %d devices, algorithm %d\n", conf->level,
                        mdname(mddev), mddev->raid_disks - mddev->degraded,
-                       mddev->raid_disks, conf->algorithm);
+                       mddev->raid_disks, mddev->new_layout);
 
        print_raid5_conf(conf);
 
-       if (conf->expand_progress != MaxSector) {
+       if (conf->reshape_progress != MaxSector) {
                printk("...ok start reshape thread\n");
-               conf->expand_lo = conf->expand_progress;
+               conf->reshape_safe = conf->reshape_progress;
                atomic_set(&conf->reshape_stripes, 0);
                clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
                clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4247,18 +4587,22 @@ static int run(mddev_t *mddev)
                       "raid5: failed to create sysfs attributes for %s\n",
                       mdname(mddev));
 
+       mddev->queue->queue_lock = &conf->device_lock;
+
        mddev->queue->unplug_fn = raid5_unplug_device;
        mddev->queue->backing_dev_info.congested_data = mddev;
        mddev->queue->backing_dev_info.congested_fn = raid5_congested;
 
-       mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
-                                           conf->max_degraded);
+       md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
        blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
 
        return 0;
 abort:
+       md_unregister_thread(mddev->thread);
+       mddev->thread = NULL;
        if (conf) {
+               shrink_stripes(conf);
                print_raid5_conf(conf);
                safe_put_page(conf->spare_page);
                kfree(conf->disks);
@@ -4396,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
        print_raid5_conf(conf);
        rdev = p->rdev;
        if (rdev) {
+               if (number >= conf->raid_disks &&
+                   conf->reshape_progress == MaxSector)
+                       clear_bit(In_sync, &rdev->flags);
+
                if (test_bit(In_sync, &rdev->flags) ||
                    atomic_read(&rdev->nr_pending)) {
                        err = -EBUSY;
@@ -4405,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
                 * isn't possible.
                 */
                if (!test_bit(Faulty, &rdev->flags) &&
-                   mddev->degraded <= conf->max_degraded) {
+                   mddev->degraded <= conf->max_degraded &&
+                   number < conf->raid_disks) {
                        err = -EBUSY;
                        goto abort;
                }
@@ -4472,36 +4821,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
         * any io in the removed space completes, but it hardly seems
         * worth it.
         */
-       raid5_conf_t *conf = mddev_to_conf(mddev);
-
        sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-       mddev->array_sectors = sectors * (mddev->raid_disks
-                                         - conf->max_degraded);
+       md_set_array_sectors(mddev, raid5_size(mddev, sectors,
+                                              mddev->raid_disks));
+       if (mddev->array_sectors >
+           raid5_size(mddev, sectors, mddev->raid_disks))
+               return -EINVAL;
        set_capacity(mddev->gendisk, mddev->array_sectors);
        mddev->changed = 1;
-       if (sectors/2  > mddev->size && mddev->recovery_cp == MaxSector) {
-               mddev->recovery_cp = mddev->size << 1;
+       if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+               mddev->recovery_cp = mddev->dev_sectors;
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        }
-       mddev->size = sectors /2;
+       mddev->dev_sectors = sectors;
        mddev->resync_max_sectors = sectors;
        return 0;
 }
 
-#ifdef CONFIG_MD_RAID5_RESHAPE
 static int raid5_check_reshape(mddev_t *mddev)
 {
        raid5_conf_t *conf = mddev_to_conf(mddev);
-       int err;
 
-       if (mddev->delta_disks < 0 ||
-           mddev->new_level != mddev->level)
-               return -EINVAL; /* Cannot shrink array or change level yet */
-       if (mddev->delta_disks == 0)
-               return 0; /* nothing to do */
+       if (mddev->delta_disks == 0 &&
+           mddev->new_layout == mddev->layout &&
+           mddev->new_chunk == mddev->chunk_size)
+               return -EINVAL; /* nothing to do */
        if (mddev->bitmap)
                /* Cannot grow a bitmap yet */
                return -EBUSY;
+       if (mddev->degraded > conf->max_degraded)
+               return -EINVAL;
+       if (mddev->delta_disks < 0) {
+               /* We might be able to shrink, but the devices must
+                * be made bigger first.
+                * For raid6, 4 devices is the minimum.
+                * Otherwise 2 is the minimum.
+                */
+               int min = 2;
+               if (mddev->level == 6)
+                       min = 4;
+               if (mddev->raid_disks + mddev->delta_disks < min)
+                       return -EINVAL;
+       }
 
        /* Can only proceed if there are plenty of stripe_heads.
         * We need a minimum of one full stripe, and for sensible progress
@@ -4514,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev)
        if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
            (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
                printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-                      (mddev->chunk_size / STRIPE_SIZE)*4);
+                      (max(mddev->chunk_size, mddev->new_chunk)
+                       / STRIPE_SIZE)*4);
                return -ENOSPC;
        }
 
-       err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
-       if (err)
-               return err;
-
-       if (mddev->degraded > conf->max_degraded)
-               return -EINVAL;
-       /* looks like we might be able to manage this */
-       return 0;
+       return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
 }
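
Editor's note: the stripe-cache check above requires at least four stripe_heads for every STRIPE_SIZE slice of the larger of the old and new chunk sizes. For example, with a 64 KiB chunk and a 4 KiB STRIPE_SIZE (one page on most configurations), a chunk spans 16 stripes, so at least 64 stripe_heads must be available before the reshape is allowed to proceed.
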
 
 static int raid5_start_reshape(mddev_t *mddev)
@@ -4550,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev)
                 */
                return -EINVAL;
 
+       /* Refuse to reduce size of the array.  Any reductions in
+        * array size must be through explicit setting of array_size
+        * attribute.
+        */
+       if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
+           < mddev->array_sectors) {
+               printk(KERN_ERR "md: %s: array size must be reduced "
+                      "before number of disks\n", mdname(mddev));
+               return -EINVAL;
+       }
+
        atomic_set(&conf->reshape_stripes, 0);
        spin_lock_irq(&conf->device_lock);
        conf->previous_raid_disks = conf->raid_disks;
        conf->raid_disks += mddev->delta_disks;
-       conf->expand_progress = 0;
-       conf->expand_lo = 0;
+       conf->prev_chunk = conf->chunk_size;
+       conf->chunk_size = mddev->new_chunk;
+       conf->prev_algo = conf->algorithm;
+       conf->algorithm = mddev->new_layout;
+       if (mddev->delta_disks < 0)
+               conf->reshape_progress = raid5_size(mddev, 0, 0);
+       else
+               conf->reshape_progress = 0;
+       conf->reshape_safe = conf->reshape_progress;
+       conf->generation++;
        spin_unlock_irq(&conf->device_lock);
 
        /* Add some new drives, as many as will fit.
@@ -4580,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev)
                                break;
                }
 
-       spin_lock_irqsave(&conf->device_lock, flags);
-       mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices;
-       spin_unlock_irqrestore(&conf->device_lock, flags);
+       if (mddev->delta_disks > 0) {
+               spin_lock_irqsave(&conf->device_lock, flags);
+               mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
+                       - added_devices;
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+       }
        mddev->raid_disks = conf->raid_disks;
        mddev->reshape_position = 0;
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -4597,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev)
                mddev->recovery = 0;
                spin_lock_irq(&conf->device_lock);
                mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
-               conf->expand_progress = MaxSector;
+               conf->reshape_progress = MaxSector;
                spin_unlock_irq(&conf->device_lock);
                return -EAGAIN;
        }
+       conf->reshape_checkpoint = jiffies;
        md_wakeup_thread(mddev->sync_thread);
        md_new_event(mddev);
        return 0;
 }
-#endif
 
+/* This is called from the reshape thread and should make any
+ * changes needed in 'conf'
+ */
 static void end_reshape(raid5_conf_t *conf)
 {
-       struct block_device *bdev;
 
        if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-               conf->mddev->array_sectors = 2 * conf->mddev->size *
-                       (conf->raid_disks - conf->max_degraded);
-               set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
-               conf->mddev->changed = 1;
-
-               bdev = bdget_disk(conf->mddev->gendisk, 0);
-               if (bdev) {
-                       mutex_lock(&bdev->bd_inode->i_mutex);
-                       i_size_write(bdev->bd_inode,
-                                    (loff_t)conf->mddev->array_sectors << 9);
-                       mutex_unlock(&bdev->bd_inode->i_mutex);
-                       bdput(bdev);
-               }
+
                spin_lock_irq(&conf->device_lock);
-               conf->expand_progress = MaxSector;
+               conf->previous_raid_disks = conf->raid_disks;
+               conf->reshape_progress = MaxSector;
                spin_unlock_irq(&conf->device_lock);
-               conf->mddev->reshape_position = MaxSector;
+               wake_up(&conf->wait_for_overlap);
 
                /* read-ahead size must cover two whole stripes, which is
                 * 2 * (datadisks) * chunksize, where datadisks = raid_disks - max_degraded
                 */
                {
-                       int data_disks = conf->previous_raid_disks - conf->max_degraded;
-                       int stripe = data_disks *
-                               (conf->mddev->chunk_size / PAGE_SIZE);
+                       int data_disks = conf->raid_disks - conf->max_degraded;
+                       int stripe = data_disks * (conf->chunk_size
+                                                  / PAGE_SIZE);
                        if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
                                conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
                }
        }
 }
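
Editor's note: as the comment above says, read-ahead has to cover two full stripes of the new geometry. As a worked example with assumed values, four data disks and a 64 KiB chunk give stripe = 4 * (65536 / 4096) = 64 pages, so ra_pages is raised to at least 128 pages (512 KiB) once the reshape finishes.
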
 
+/* This is called from the raid5d thread with mddev_lock held.
+ * It makes config changes to the device.
+ */
+static void raid5_finish_reshape(mddev_t *mddev)
+{
+       struct block_device *bdev;
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+
+       if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+
+               if (mddev->delta_disks > 0) {
+                       md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
+                       set_capacity(mddev->gendisk, mddev->array_sectors);
+                       mddev->changed = 1;
+
+                       bdev = bdget_disk(mddev->gendisk, 0);
+                       if (bdev) {
+                               mutex_lock(&bdev->bd_inode->i_mutex);
+                               i_size_write(bdev->bd_inode,
+                                            (loff_t)mddev->array_sectors << 9);
+                               mutex_unlock(&bdev->bd_inode->i_mutex);
+                               bdput(bdev);
+                       }
+               } else {
+                       int d;
+                       mddev->degraded = conf->raid_disks;
+                       for (d = 0; d < conf->raid_disks ; d++)
+                               if (conf->disks[d].rdev &&
+                                   test_bit(In_sync,
+                                            &conf->disks[d].rdev->flags))
+                                       mddev->degraded--;
+                       for (d = conf->raid_disks ;
+                            d < conf->raid_disks - mddev->delta_disks;
+                            d++)
+                               raid5_remove_disk(mddev, d);
+               }
+               mddev->layout = conf->algorithm;
+               mddev->chunk_size = conf->chunk_size;
+               mddev->reshape_position = MaxSector;
+               mddev->delta_disks = 0;
+       }
+}
+
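
Editor's note: when the array was shrunk (delta_disks < 0), raid5_finish_reshape() above recounts the degraded devices and then removes the slots that are no longer part of the array. For instance, shrinking a six-disk raid5 to five (delta_disks = -1) leaves conf->raid_disks at 5, so the removal loop runs for d = 5 only and detaches the one now-unused device.
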
 static void raid5_quiesce(mddev_t *mddev, int state)
 {
        raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4672,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state)
        }
 }
 
+
+static void *raid5_takeover_raid1(mddev_t *mddev)
+{
+       int chunksect;
+
+       if (mddev->raid_disks != 2 ||
+           mddev->degraded > 1)
+               return ERR_PTR(-EINVAL);
+
+       /* Should check if there are write-behind devices? */
+
+       chunksect = 64*2; /* 64K by default */
+
+       /* The array must be an exact multiple of chunksize */
+       while (chunksect && (mddev->array_sectors & (chunksect-1)))
+               chunksect >>= 1;
+
+       if ((chunksect<<9) < STRIPE_SIZE)
+               /* array size does not allow a suitable chunk size */
+               return ERR_PTR(-EINVAL);
+
+       mddev->new_level = 5;
+       mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
+       mddev->new_chunk = chunksect << 9;
+
+       return setup_conf(mddev);
+}
+
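
Editor's note: raid5_takeover_raid1() above picks the largest power-of-two chunk, starting at 64 KiB, that divides the existing array size, and gives up if that falls below STRIPE_SIZE. For example, a hypothetical two-disk raid1 of 1,000,000 sectors is not a multiple of 128 sectors but is a multiple of 64, so the takeover would settle on a 32 KiB chunk, which is still at least one page.
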
+static void *raid5_takeover_raid6(mddev_t *mddev)
+{
+       int new_layout;
+
+       switch (mddev->layout) {
+       case ALGORITHM_LEFT_ASYMMETRIC_6:
+               new_layout = ALGORITHM_LEFT_ASYMMETRIC;
+               break;
+       case ALGORITHM_RIGHT_ASYMMETRIC_6:
+               new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
+               break;
+       case ALGORITHM_LEFT_SYMMETRIC_6:
+               new_layout = ALGORITHM_LEFT_SYMMETRIC;
+               break;
+       case ALGORITHM_RIGHT_SYMMETRIC_6:
+               new_layout = ALGORITHM_RIGHT_SYMMETRIC;
+               break;
+       case ALGORITHM_PARITY_0_6:
+               new_layout = ALGORITHM_PARITY_0;
+               break;
+       case ALGORITHM_PARITY_N:
+               new_layout = ALGORITHM_PARITY_N;
+               break;
+       default:
+               return ERR_PTR(-EINVAL);
+       }
+       mddev->new_level = 5;
+       mddev->new_layout = new_layout;
+       mddev->delta_disks = -1;
+       mddev->raid_disks -= 1;
+       return setup_conf(mddev);
+}
+
+
+static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+       /* For a 2-drive array, the layout and chunk size can be changed
+        * immediately as no restriping is needed.
+        * For larger arrays we record the new value - after validation
+        * to be used by a reshape pass.
+        */
+       raid5_conf_t *conf = mddev_to_conf(mddev);
+
+       if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
+               return -EINVAL;
+       if (new_chunk > 0) {
+               if (new_chunk & (new_chunk-1))
+                       /* not a power of 2 */
+                       return -EINVAL;
+               if (new_chunk < PAGE_SIZE)
+                       return -EINVAL;
+               if (mddev->array_sectors & ((new_chunk>>9)-1))
+                       /* not factor of array size */
+                       return -EINVAL;
+       }
+
+       /* They look valid */
+
+       if (mddev->raid_disks == 2) {
+
+               if (new_layout >= 0) {
+                       conf->algorithm = new_layout;
+                       mddev->layout = mddev->new_layout = new_layout;
+               }
+               if (new_chunk > 0) {
+                       conf->chunk_size = new_chunk;
+                       mddev->chunk_size = mddev->new_chunk = new_chunk;
+               }
+               set_bit(MD_CHANGE_DEVS, &mddev->flags);
+               md_wakeup_thread(mddev->thread);
+       } else {
+               if (new_layout >= 0)
+                       mddev->new_layout = new_layout;
+               if (new_chunk > 0)
+                       mddev->new_chunk = new_chunk;
+       }
+       return 0;
+}
+
+static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+{
+       if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
+               return -EINVAL;
+       if (new_chunk > 0) {
+               if (new_chunk & (new_chunk-1))
+                       /* not a power of 2 */
+                       return -EINVAL;
+               if (new_chunk < PAGE_SIZE)
+                       return -EINVAL;
+               if (mddev->array_sectors & ((new_chunk>>9)-1))
+                       /* not factor of array size */
+                       return -EINVAL;
+       }
+
+       /* They look valid */
+
+       if (new_layout >= 0)
+               mddev->new_layout = new_layout;
+       if (new_chunk > 0)
+               mddev->new_chunk = new_chunk;
+
+       return 0;
+}
+
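
Editor's note: both reconfig paths apply the same sanity checks to a requested chunk size: it must be a power of two (new_chunk & (new_chunk - 1) is zero only when a single bit is set), at least PAGE_SIZE, and a divisor of the array size. As a worked example, 131072 (128 KiB) passes the power-of-two test, while 98304 (96 KiB) has two bits set and is rejected.
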
+static void *raid5_takeover(mddev_t *mddev)
+{
+       /* raid5 can take over:
+        *  raid0 - if all devices are the same - make it a raid4 layout
+        *  raid1 - if there are two drives.  We need to know the chunk size
+        *  raid4 - trivial - just use a raid4 layout.
+        *  raid6 - Providing it is a *_6 layout
+        *
+        * For now, just do raid1
+        */
+
+       if (mddev->level == 1)
+               return raid5_takeover_raid1(mddev);
+       if (mddev->level == 4) {
+               mddev->new_layout = ALGORITHM_PARITY_N;
+               mddev->new_level = 5;
+               return setup_conf(mddev);
+       }
+       if (mddev->level == 6)
+               return raid5_takeover_raid6(mddev);
+
+       return ERR_PTR(-EINVAL);
+}
+
+
+static struct mdk_personality raid5_personality;
+
+static void *raid6_takeover(mddev_t *mddev)
+{
+       /* Currently can only take over a raid5.  We map the
+        * personality to an equivalent raid6 personality
+        * with the Q block at the end.
+        */
+       int new_layout;
+
+       if (mddev->pers != &raid5_personality)
+               return ERR_PTR(-EINVAL);
+       if (mddev->degraded > 1)
+               return ERR_PTR(-EINVAL);
+       if (mddev->raid_disks > 253)
+               return ERR_PTR(-EINVAL);
+       if (mddev->raid_disks < 3)
+               return ERR_PTR(-EINVAL);
+
+       switch (mddev->layout) {
+       case ALGORITHM_LEFT_ASYMMETRIC:
+               new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
+               break;
+       case ALGORITHM_RIGHT_ASYMMETRIC:
+               new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
+               break;
+       case ALGORITHM_LEFT_SYMMETRIC:
+               new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
+               break;
+       case ALGORITHM_RIGHT_SYMMETRIC:
+               new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
+               break;
+       case ALGORITHM_PARITY_0:
+               new_layout = ALGORITHM_PARITY_0_6;
+               break;
+       case ALGORITHM_PARITY_N:
+               new_layout = ALGORITHM_PARITY_N;
+               break;
+       default:
+               return ERR_PTR(-EINVAL);
+       }
+       mddev->new_level = 6;
+       mddev->new_layout = new_layout;
+       mddev->delta_disks = 1;
+       mddev->raid_disks += 1;
+       return setup_conf(mddev);
+}
+
+
 static struct mdk_personality raid6_personality =
 {
        .name           = "raid6",
@@ -4687,11 +5304,13 @@ static struct mdk_personality raid6_personality =
        .spare_active   = raid5_spare_active,
        .sync_request   = sync_request,
        .resize         = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+       .size           = raid5_size,
        .check_reshape  = raid5_check_reshape,
        .start_reshape  = raid5_start_reshape,
-#endif
+       .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
+       .takeover       = raid6_takeover,
+       .reconfig       = raid6_reconfig,
 };
 static struct mdk_personality raid5_personality =
 {
@@ -4708,11 +5327,13 @@ static struct mdk_personality raid5_personality =
        .spare_active   = raid5_spare_active,
        .sync_request   = sync_request,
        .resize         = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+       .size           = raid5_size,
        .check_reshape  = raid5_check_reshape,
        .start_reshape  = raid5_start_reshape,
-#endif
+       .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
+       .takeover       = raid5_takeover,
+       .reconfig       = raid5_reconfig,
 };
 
 static struct mdk_personality raid4_personality =
@@ -4730,20 +5351,15 @@ static struct mdk_personality raid4_personality =
        .spare_active   = raid5_spare_active,
        .sync_request   = sync_request,
        .resize         = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+       .size           = raid5_size,
        .check_reshape  = raid5_check_reshape,
        .start_reshape  = raid5_start_reshape,
-#endif
+       .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
 };
 
 static int __init raid5_init(void)
 {
-       int e;
-
-       e = raid6_select_algo();
-       if ( e )
-               return e;
        register_md_personality(&raid6_personality);
        register_md_personality(&raid5_personality);
        register_md_personality(&raid4_personality);
similarity index 81%
rename from include/linux/raid/raid5.h
rename to drivers/md/raid5.h
index 3b2672792457ff08b991acb78e619f3daa3dab6e..52ba99954decf0ceee11bc30da2213a7c6619983 100644 (file)
@@ -1,7 +1,6 @@
 #ifndef _RAID5_H
 #define _RAID5_H
 
-#include <linux/raid/md.h>
 #include <linux/raid/xor.h>
 
 /*
@@ -197,15 +196,19 @@ enum reconstruct_states {
 
 struct stripe_head {
        struct hlist_node       hash;
-       struct list_head        lru;                    /* inactive_list or handle_list */
-       struct raid5_private_data       *raid_conf;
-       sector_t                sector;                 /* sector of this row */
-       int                     pd_idx;                 /* parity disk index */
-       unsigned long           state;                  /* state flags */
-       atomic_t                count;                  /* nr of active thread/requests */
+       struct list_head        lru;          /* inactive_list or handle_list */
+       struct raid5_private_data *raid_conf;
+       short                   generation;     /* increments with every
+                                                * reshape */
+       sector_t                sector;         /* sector of this row */
+       short                   pd_idx;         /* parity disk index */
+       short                   qd_idx;         /* 'Q' disk index for raid6 */
+       short                   ddf_layout;/* use DDF ordering to calculate Q */
+       unsigned long           state;          /* state flags */
+       atomic_t                count;        /* nr of active thread/requests */
        spinlock_t              lock;
        int                     bm_seq; /* sequence number for bitmap flushes */
-       int                     disks;                  /* disks in stripe */
+       int                     disks;          /* disks in stripe */
        enum check_states       check_state;
        enum reconstruct_states reconstruct_state;
        /* stripe_operations
@@ -238,7 +241,7 @@ struct stripe_head_state {
 
 /* r6_state - extra state data only relevant to r6 */
 struct r6_state {
-       int p_failed, q_failed, qd_idx, failed_num[2];
+       int p_failed, q_failed, failed_num[2];
 };
 
 /* Flags */
@@ -268,6 +271,8 @@ struct r6_state {
 #define READ_MODIFY_WRITE      2
 /* not a write method, but a compute_parity mode */
 #define        CHECK_PARITY            3
+/* Additional compute_parity mode -- updates the parity w/o LOCKING */
+#define UPDATE_PARITY          4
 
 /*
  * Stripe state
@@ -319,7 +324,7 @@ struct r6_state {
  * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
  * HANDLE gets cleared if stripe_handle leave nothing locked.
  */
+
 
 struct disk_info {
        mdk_rdev_t      *rdev;
@@ -334,12 +339,21 @@ struct raid5_private_data {
        int                     raid_disks;
        int                     max_nr_stripes;
 
-       /* used during an expand */
-       sector_t                expand_progress;        /* MaxSector when no expand happening */
-       sector_t                expand_lo; /* from here up to expand_progress it out-of-bounds
-                                           * as we haven't flushed the metadata yet
-                                           */
+       /* reshape_progress is the leading edge of a 'reshape'
+        * It has value MaxSector when no reshape is happening
+        * else it is the next sector to work on.
+        * else is it the next sector to work on.
+        */
+       sector_t                reshape_progress;
+       /* reshape_safe is the trailing edge of a reshape.  We know that
+        * before (or after) this address, all reshape has completed.
+        */
+       sector_t                reshape_safe;
        int                     previous_raid_disks;
+       int                     prev_chunk, prev_algo;
+       short                   generation; /* increments with every reshape */
+       unsigned long           reshape_checkpoint; /* Time we last updated
+                                                    * metadata */
 
        struct list_head        handle_list; /* stripes needing handling */
        struct list_head        hold_list; /* preread ready stripes */
@@ -385,6 +399,11 @@ struct raid5_private_data {
        int                     pool_size; /* number of disks in stripeheads in pool */
        spinlock_t              device_lock;
        struct disk_info        *disks;
+
+       /* When taking over an array from a different personality, we store
+        * the new thread here until we fully activate the array.
+        */
+       struct mdk_thread_s     *thread;
 };
 
 typedef struct raid5_private_data raid5_conf_t;
@@ -394,9 +413,62 @@ typedef struct raid5_private_data raid5_conf_t;
 /*
  * Our supported algorithms
  */
-#define ALGORITHM_LEFT_ASYMMETRIC      0
-#define ALGORITHM_RIGHT_ASYMMETRIC     1
-#define ALGORITHM_LEFT_SYMMETRIC       2
-#define ALGORITHM_RIGHT_SYMMETRIC      3
+#define ALGORITHM_LEFT_ASYMMETRIC      0 /* Rotating Parity N with Data Restart */
+#define ALGORITHM_RIGHT_ASYMMETRIC     1 /* Rotating Parity 0 with Data Restart */
+#define ALGORITHM_LEFT_SYMMETRIC       2 /* Rotating Parity N with Data Continuation */
+#define ALGORITHM_RIGHT_SYMMETRIC      3 /* Rotating Parity 0 with Data Continuation */
+
+/* Define non-rotating (raid4) algorithms.  These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0             4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N             5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, the order of data blocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART        8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART   9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE  10 /* DDF PRL=6 RLQ=3 */
+
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6    16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6   17
+#define ALGORITHM_LEFT_SYMMETRIC_6     18
+#define ALGORITHM_RIGHT_SYMMETRIC_6    19
+#define ALGORITHM_PARITY_0_6           20
+#define ALGORITHM_PARITY_N_6           ALGORITHM_PARITY_N
+
+static inline int algorithm_valid_raid5(int layout)
+{
+       return (layout >= 0) &&
+               (layout <= 5);
+}
+static inline int algorithm_valid_raid6(int layout)
+{
+       return (layout >= 0 && layout <= 5)
+               ||
+               (layout == 8 || layout == 10)
+               ||
+               (layout >= 16 && layout <= 20);
+}
 
+static inline int algorithm_is_DDF(int layout)
+{
+       return layout >= 8 && layout <= 10;
+}
 #endif
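
Editor's note: a quick worked example of the new helpers above: algorithm_valid_raid6(18) is true because ALGORITHM_LEFT_SYMMETRIC_6 falls in the 16-20 range, while algorithm_is_DDF(18) is false since only layouts 8-10 are DDF layouts.
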
index 21987e3dbe6c0ce4199fcc25f3e81506c337713f..866215ac7f2554f86733cc900f4b902b3f60bc76 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
  * Algorithm list and algorithm selection for RAID-6
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #ifndef __KERNEL__
 #include <sys/mman.h>
 #include <stdio.h>
+#else
+#if !RAID6_USE_EMPTY_ZERO_PAGE
+/* In .bss so it's zeroed */
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+EXPORT_SYMBOL(raid6_empty_zero_page);
+#endif
 #endif
 
 struct raid6_calls raid6_call;
+EXPORT_SYMBOL_GPL(raid6_call);
 
 /* Various routine sets */
 extern const struct raid6_calls raid6_intx1;
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = {
 #else
 /* Need more time to be stable in userspace */
 #define RAID6_TIME_JIFFIES_LG2 9
+#define time_before(x, y) ((x) < (y))
 #endif
 
 /* Try to pick the best algorithm */
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void)
 
        return best ? 0 : -EINVAL;
 }
+
+static void raid6_exit(void)
+{
+       do { } while (0);
+}
+
+subsys_initcall(raid6_select_algo);
+module_exit(raid6_exit);
+MODULE_LICENSE("GPL");
index b9afd35b8812cf46c1994a37b99f4e8935680614..699dfeee494459afdd1ba2ff3b7a7e90c063b12f 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -22,7 +22,7 @@
  * bracket this with preempt_disable/enable or in a lock)
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 #ifdef CONFIG_ALTIVEC
 
index ad004cee0e261c414e989fec6e9ae4e7a7884011..f9bf9cba357fd1202b6bef088a7e26b49fed6ab2 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
  * This file is postprocessed using unroll.pl
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 /*
  * This is the C data type to use
index d4e4a1bd70ad2f0841a8dd4036a167dfd1e9a3c0..e7f6c13132bfd12e2f145a337ed954659198d971 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
 
 #if defined(__i386__) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 /* Shared with raid6sse1.c */
index a8c4d9451bd901da3172447f7bd00536ceec794a..2609f00e0d61ed8c5b80347704c4bf6d9fbae29f 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
  * the syndrome.)
  */
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 /* Recover two failed data blocks. */
 void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
                p++; q++;
        }
 }
-
-
-
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
 
 /* Recover failure of one data block plus the P block */
 void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
                q++; dq++;
        }
 }
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
-
-#ifndef __KERNEL__             /* Testing only */
+#ifndef __KERNEL__
+/* Testing only */
 
 /* Recover two failed blocks. */
 void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
index 0666237276ff37283e06aacc8980340eada56840..b274dd5eab8f1a39aa3706e0d012bfeb8cef92f2 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -23,7 +23,7 @@
 
 #if defined(__i386__) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 /* Defined in raid6mmx.c */
index b034ad8680397e0ab62eff5d55ddf6f5820979eb..6ed6c6c0389f47aef453c8708defd7f01e4d1c8f 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
@@ -19,7 +19,7 @@
 
 #if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
 
-#include "raid6.h"
+#include <linux/raid/pq.h>
 #include "raid6x86.h"
 
 static const struct raid6_sse_constants {
index 78e0396adf2ac28ab8cc23f0427303b10d2207b2..58ffdf4f51619caa05253d5d42abb520ae5c8e20 100644 (file)
@@ -5,7 +5,7 @@
 
 CC      = gcc
 OPTFLAGS = -O2                 # Adjust as desired
-CFLAGS  = -I.. -g $(OPTFLAGS)
+CFLAGS  = -I.. -I ../../../include -g $(OPTFLAGS)
 LD      = ld
 PERL    = perl
 AR      = ar
index 559cc41b258566d4b0437241559156fbbc574721..7a930318b17d60c406bb1d421ba406c0f3bcb139 100644 (file)
@@ -17,7 +17,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#include "raid6.h"
+#include <linux/raid/pq.h>
 
 #define NDISKS         16      /* Including P and Q */
 
index 99fea7a70ca70ace27cca159e039b0ab318368ae..4c22c1568558b7c92dac1b456559d9981cc2ae3c 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
index 68826f1e36bccd057f938919083e6877731025c3..ec90e953adced9f0d2b3f3d4abe878d6b3511da9 100644 (file)
@@ -592,11 +592,9 @@ add_children(struct twl4030_platform_data *pdata, unsigned long features)
 
        /* maybe add LDOs that are omitted on cost-reduced parts */
        if (twl_has_regulator() && !(features & TPS_SUBSET)) {
-               /*
                child = add_regulator(TWL4030_REG_VPLL2, pdata->vpll2);
                if (IS_ERR(child))
                        return PTR_ERR(child);
-               */
 
                child = add_regulator(TWL4030_REG_VMMC2, pdata->vmmc2);
                if (IS_ERR(child))
index df6ce4a06cf37f42c11935ce7f4e2f5d92409758..1445ea8f10a61bf14df5b7abd25a3850c8b78693 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/leds.h>
 #include <linux/scatterlist.h>
 #include <linux/log2.h>
+#include <linux/regulator/consumer.h>
 
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
@@ -523,6 +524,105 @@ u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max)
 }
 EXPORT_SYMBOL(mmc_vddrange_to_ocrmask);
 
+#ifdef CONFIG_REGULATOR
+
+/**
+ * mmc_regulator_get_ocrmask - return mask of supported voltages
+ * @supply: regulator to use
+ *
+ * This returns either a negative errno, or a mask of voltages that
+ * can be provided to MMC/SD/SDIO devices using the specified voltage
+ * regulator.  This would normally be called before registering the
+ * MMC host adapter.
+ */
+int mmc_regulator_get_ocrmask(struct regulator *supply)
+{
+       int                     result = 0;
+       int                     count;
+       int                     i;
+
+       count = regulator_count_voltages(supply);
+       if (count < 0)
+               return count;
+
+       for (i = 0; i < count; i++) {
+               int             vdd_uV;
+               int             vdd_mV;
+
+               vdd_uV = regulator_list_voltage(supply, i);
+               if (vdd_uV <= 0)
+                       continue;
+
+               vdd_mV = vdd_uV / 1000;
+               result |= mmc_vddrange_to_ocrmask(vdd_mV, vdd_mV);
+       }
+
+       return result;
+}
+EXPORT_SYMBOL(mmc_regulator_get_ocrmask);
+
+/**
+ * mmc_regulator_set_ocr - set regulator to match host->ios voltage
+ * @vdd_bit: zero for power off, else a bit number (host->ios.vdd)
+ * @supply: regulator to use
+ *
+ * Returns zero on success, else negative errno.
+ *
+ * MMC host drivers may use this to enable or disable a regulator using
+ * a particular supply voltage.  This would normally be called from the
+ * set_ios() method.
+ */
+int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit)
+{
+       int                     result = 0;
+       int                     min_uV, max_uV;
+       int                     enabled;
+
+       enabled = regulator_is_enabled(supply);
+       if (enabled < 0)
+               return enabled;
+
+       if (vdd_bit) {
+               int             tmp;
+               int             voltage;
+
+               /* REVISIT mmc_vddrange_to_ocrmask() may have set some
+                * bits this regulator doesn't quite support ... don't
+                * be too picky, most cards and regulators are OK with
+                * a 0.1V range goof (it's a small error percentage).
+                */
+               tmp = vdd_bit - ilog2(MMC_VDD_165_195);
+               if (tmp == 0) {
+                       min_uV = 1650 * 1000;
+                       max_uV = 1950 * 1000;
+               } else {
+                       min_uV = 1900 * 1000 + tmp * 100 * 1000;
+                       max_uV = min_uV + 100 * 1000;
+               }
+
+               /* avoid needless changes to this voltage; the regulator
+                * might not allow this operation
+                */
+               voltage = regulator_get_voltage(supply);
+               if (voltage < 0)
+                       result = voltage;
+               else if (voltage < min_uV || voltage > max_uV)
+                       result = regulator_set_voltage(supply, min_uV, max_uV);
+               else
+                       result = 0;
+
+               if (result == 0 && !enabled)
+                       result = regulator_enable(supply);
+       } else if (enabled) {
+               result = regulator_disable(supply);
+       }
+
+       return result;
+}
+EXPORT_SYMBOL(mmc_regulator_set_ocr);
+
+#endif
+
 /*
  * Mask off any voltages we don't support and select
  * the lowest voltage
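
A hedged sketch of how an MMC host driver might consume the two helpers added above. mmc_regulator_get_ocrmask(), mmc_regulator_set_ocr() and ios->vdd are as documented in the hunk; the "vmmc" supply name, the example_* functions and the way the regulator pointer is passed around are illustrative only.

#include <linux/err.h>
#include <linux/regulator/consumer.h>
#include <linux/mmc/host.h>

/* probe time: let the regulator decide which VDD windows to advertise */
static int example_attach_vmmc(struct device *dev, struct mmc_host *mmc,
			       struct regulator **supply)
{
	int ocr;

	*supply = regulator_get(dev, "vmmc");	/* supply name is illustrative */
	if (IS_ERR(*supply))
		return PTR_ERR(*supply);

	ocr = mmc_regulator_get_ocrmask(*supply);
	if (ocr < 0)
		return ocr;

	mmc->ocr_avail = ocr;	/* advertise only what the supply can deliver */
	return 0;
}

/* set_ios(): follow the core's requested VDD bit; vdd == 0 powers off */
static void example_set_power(struct regulator *supply, struct mmc_ios *ios)
{
	mmc_regulator_set_ocr(supply, ios->vdd);
}
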
index e9026cb1c5b2248962e2280628373f33f0f43e2b..572d32fdf38a0d8b421b094e931978e54b633745 100644 (file)
@@ -117,7 +117,7 @@ static int __init pxa2xx_flash_probe(struct platform_device *pdev)
        return 0;
 }
 
-static int __exit pxa2xx_flash_remove(struct platform_device *dev)
+static int __devexit pxa2xx_flash_remove(struct platform_device *dev)
 {
        struct pxa2xx_flash_info *info = platform_get_drvdata(dev);
 
index 7931133526c48dd57438e9bfb2cda2ef6e784824..9ca21098b146c73e54e0facbf101e657e21ddaa0 100644 (file)
@@ -81,7 +81,7 @@ static int __init asp_init_chip(struct parisc_device *dev)
        asp.hpa = ASP_INTERRUPT_ADDR;
 
        printk(KERN_INFO "%s version %d at 0x%lx found.\n", 
-               asp.name, asp.version, dev->hpa.start);
+               asp.name, asp.version, (unsigned long)dev->hpa.start);
 
        /* the IRQ ASP should use */
        ret = -EBUSY;
index cd4dd7ed2c06b60fd471319fc6ef78fca8a91184..5d610cbcfe80cf294a676c38a18bce5dbdd9af4b 100644 (file)
@@ -406,8 +406,6 @@ resource_found:
        }
        ioc->avg_search[ioc->avg_idx++] = cr_start;
        ioc->avg_idx &= CCIO_SEARCH_SAMPLE - 1;
-#endif
-#ifdef CCIO_COLLECT_STATS
        ioc->used_pages += pages_needed;
 #endif
        /* 
@@ -453,10 +451,10 @@ ccio_free_range(struct ioc *ioc, dma_addr_t iova, unsigned long pages_mapped)
                unsigned long mask = ~(~0UL >> pages_mapped);
                CCIO_FREE_MAPPINGS(ioc, res_idx, mask, 8);
 #else
-               CCIO_FREE_MAPPINGS(ioc, res_idx, 0xff, 8);
+               CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffUL, 8);
 #endif
        } else if(pages_mapped <= 16) {
-               CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffff, 16);
+               CCIO_FREE_MAPPINGS(ioc, res_idx, 0xffffUL, 16);
        } else if(pages_mapped <= 32) {
                CCIO_FREE_MAPPINGS(ioc, res_idx, ~(unsigned int)0, 32);
 #ifdef __LP64__
@@ -1028,8 +1026,10 @@ static int ccio_proc_info(struct seq_file *m, void *p)
 
        while (ioc != NULL) {
                unsigned int total_pages = ioc->res_size << 3;
+#ifdef CCIO_COLLECT_STATS
                unsigned long avg = 0, min, max;
                int j;
+#endif
 
                len += seq_printf(m, "%s\n", ioc->name);
                
@@ -1060,8 +1060,7 @@ static int ccio_proc_info(struct seq_file *m, void *p)
                avg /= CCIO_SEARCH_SAMPLE;
                len += seq_printf(m, "  Bitmap search : %ld/%ld/%ld (min/avg/max CPU Cycles)\n",
                                  min, avg, max);
-#endif
-#ifdef CCIO_COLLECT_STATS
+
                len += seq_printf(m, "pci_map_single(): %8ld calls  %8ld pages (avg %d/1000)\n",
                                  ioc->msingle_calls, ioc->msingle_pages,
                                  (int)((ioc->msingle_pages * 1000)/ioc->msingle_calls));
@@ -1400,7 +1399,7 @@ ccio_init_resource(struct resource *res, char *name, void __iomem *ioaddr)
        result = insert_resource(&iomem_resource, res);
        if (result < 0) {
                printk(KERN_ERR "%s() failed to claim CCIO bus address space (%08lx,%08lx)\n", 
-                       __func__, res->start, res->end);
+                       __func__, (unsigned long)res->start, (unsigned long)res->end);
        }
 }
 
@@ -1551,7 +1550,8 @@ static int __init ccio_probe(struct parisc_device *dev)
 
        ioc->name = dev->id.hversion == U2_IOA_RUNWAY ? "U2" : "UTurn";
 
-       printk(KERN_INFO "Found %s at 0x%lx\n", ioc->name, dev->hpa.start);
+       printk(KERN_INFO "Found %s at 0x%lx\n", ioc->name,
+               (unsigned long)dev->hpa.start);
 
        for (i = 0; i < ioc_count; i++) {
                ioc_p = &(*ioc_p)->next;
index bb5a1c9597cb50d814c927ed2a1447ac03495f49..52ae0b1d470ccd24ee39a32fd04d29ac86722d44 100644 (file)
@@ -819,7 +819,9 @@ dino_bridge_init(struct dino_device *dino_dev, const char *name)
 
                result = ccio_request_resource(dino_dev->hba.dev, &res[i]);
                if (result < 0) {
-                       printk(KERN_ERR "%s: failed to claim PCI Bus address space %d (0x%lx-0x%lx)!\n", name, i, res[i].start, res[i].end);
+                       printk(KERN_ERR "%s: failed to claim PCI Bus address "
+                              "space %d (0x%lx-0x%lx)!\n", name, i,
+                              (unsigned long)res[i].start, (unsigned long)res[i].end);
                        return result;
                }
        }
@@ -899,7 +901,8 @@ static int __init dino_common_init(struct parisc_device *dev,
        if (request_resource(&ioport_resource, res) < 0) {
                printk(KERN_ERR "%s: request I/O Port region failed "
                       "0x%lx/%lx (hpa 0x%p)\n",
-                      name, res->start, res->end, dino_dev->hba.base_addr);
+                      name, (unsigned long)res->start, (unsigned long)res->end,
+                      dino_dev->hba.base_addr);
                return 1;
        }
 
index 7891db50c483bc6e6def22133ae78e17cc97da50..f415fdd9a88599296a9c5195854c25192b8fbc0b 100644 (file)
@@ -314,7 +314,7 @@ static int __init eisa_probe(struct parisc_device *dev)
        char *name = is_mongoose(dev) ? "Mongoose" : "Wax";
 
        printk(KERN_INFO "%s EISA Adapter found at 0x%08lx\n", 
-               name, dev->hpa.start);
+               name, (unsigned long)dev->hpa.start);
 
        eisa_dev.hba.dev = dev;
        eisa_dev.hba.iommu = ccio_get_iommu(dev);
index 6d8aae003f6c19c9fd733766b897d082ccb7a657..c709ecc2b7f71626622f9c6f834b4c2160e374b0 100644 (file)
@@ -98,7 +98,7 @@ static int configure_memory(const unsigned char *buf,
                        res->start = mem_parent->start + get_24(buf+len+2);
                        res->end = res->start + get_16(buf+len+5)*1024;
                        res->flags = IORESOURCE_MEM;
-                       printk("memory %lx-%lx ", res->start, res->end);
+                       printk("memory %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end);
                        result = request_resource(mem_parent, res);
                        if (result < 0) {
                                printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n");
@@ -188,7 +188,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent,
                        res->start = get_16(buf+len+1);
                        res->end = get_16(buf+len+1)+(c&HPEE_PORT_SIZE_MASK)+1;
                        res->flags = IORESOURCE_IO;
-                       printk("ioports %lx-%lx ", res->start, res->end);
+                       printk("ioports %lx-%lx ", (unsigned long)res->start, (unsigned long)res->end);
                        result = request_resource(io_parent, res);
                        if (result < 0) {
                                printk("\n" KERN_ERR "EISA Enumerator: failed to claim EISA Bus address space!\n");
index 501aaf1f253f28c4af811d747d1d0461f790133d..73348c4047e98d249a9a576b87f2e8d44edaf623 100644 (file)
@@ -714,7 +714,7 @@ static void iosapic_set_affinity_irq(unsigned int irq,
        if (dest_cpu < 0)
                return;
 
-       irq_desc[irq].affinity = cpumask_of_cpu(dest_cpu);
+       cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu));
        vi->txn_addr = txn_affinity_addr(irq, dest_cpu);
 
        spin_lock_irqsave(&iosapic_lock, flags);
index 454b6532e40998cce76aa3eeb7ed8fdaa901748a..9581d3619450d609ca69e305d4fcad7148f9ab93 100644 (file)
@@ -3,7 +3,7 @@
  *
  *      (c) Copyright 2000 Red Hat Software
  *      (c) Copyright 2000 Helge Deller <hdeller@redhat.com>
- *      (c) Copyright 2001-2005 Helge Deller <deller@gmx.de>
+ *      (c) Copyright 2001-2009 Helge Deller <deller@gmx.de>
  *      (c) Copyright 2001 Randolph Chung <tausq@debian.org>
  *
  *      This program is free software; you can redistribute it and/or modify
@@ -243,13 +243,11 @@ static int __init led_create_procfs(void)
 
        proc_pdc_root = proc_mkdir("pdc", 0);
        if (!proc_pdc_root) return -1;
-       proc_pdc_root->owner = THIS_MODULE;
        ent = create_proc_entry("led", S_IFREG|S_IRUGO|S_IWUSR, proc_pdc_root);
        if (!ent) return -1;
        ent->data = (void *)LED_NOLCD; /* LED */
        ent->read_proc = led_proc_read;
        ent->write_proc = led_proc_write;
-       ent->owner = THIS_MODULE;
 
        if (led_type == LED_HASLCD)
        {
@@ -258,7 +256,6 @@ static int __init led_create_procfs(void)
                ent->data = (void *)LED_HASLCD; /* LCD */
                ent->read_proc = led_proc_read;
                ent->write_proc = led_proc_write;
-               ent->owner = THIS_MODULE;
        }
 
        return 0;
@@ -463,9 +460,20 @@ static void led_work_func (struct work_struct *unused)
        if (likely(led_lanrxtx))  currentleds |= led_get_net_activity();
        if (likely(led_diskio))   currentleds |= led_get_diskio_activity();
 
-       /* blink all LEDs twice a second if we got an Oops (HPMC) */
-       if (unlikely(oops_in_progress)) 
-               currentleds = (count_HZ<=(HZ/2)) ? 0 : 0xff;
+       /* blink LEDs if we got an Oops (HPMC) */
+       if (unlikely(oops_in_progress)) {
+               if (boot_cpu_data.cpu_type >= pcxl2) {
+                       /* newer machines don't have loadavg. LEDs, so we
+                        * let all LEDs blink twice per second instead */
+                       currentleds = (count_HZ <= (HZ/2)) ? 0 : 0xff;
+               } else {
+                       /* old machines: blink loadavg. LEDs twice per second */
+                       if (count_HZ <= (HZ/2))
+                               currentleds &= ~(LED4|LED5|LED6|LED7);
+                       else
+                               currentleds |= (LED4|LED5|LED6|LED7);
+               }
+       }
 
        if (currentleds != lastleds)
        {
@@ -511,7 +519,7 @@ static int led_halt(struct notifier_block *nb, unsigned long event, void *buf)
        
        /* Cancel the work item and delete the queue */
        if (led_wq) {
-               cancel_rearming_delayed_workqueue(led_wq, &led_task);
+               cancel_delayed_work_sync(&led_task);
                destroy_workqueue(led_wq);
                led_wq = NULL;
        }
@@ -630,7 +638,7 @@ int lcd_print( const char *str )
        
        /* temporarily disable the led work task */
        if (led_wq)
-               cancel_rearming_delayed_workqueue(led_wq, &led_task);
+               cancel_delayed_work_sync(&led_task);
 
        /* copy display string to buffer for procfs */
        strlcpy(lcd_text, str, sizeof(lcd_text));
index 9dbd5066acafee0594b7d20938822d6f7a3f3dbd..23e56a564e05056a208ac41d98828d32987865b7 100644 (file)
@@ -164,7 +164,8 @@ static inline void context_clear_entry(struct context_entry *context)
  * 1: writable
  * 2-6: reserved
  * 7: super page
- * 8-11: available
+ * 8-10: available
+ * 11: snoop behavior
  * 12-63: Host physcial address
  */
 struct dma_pte {
@@ -186,6 +187,11 @@ static inline void dma_set_pte_writable(struct dma_pte *pte)
        pte->val |= DMA_PTE_WRITE;
 }
 
+static inline void dma_set_pte_snp(struct dma_pte *pte)
+{
+       pte->val |= DMA_PTE_SNP;
+}
+
 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 {
        pte->val = (pte->val & ~3) | (prot & 3);
@@ -231,6 +237,7 @@ struct dmar_domain {
        int             flags;          /* flags to find out type of domain */
 
        int             iommu_coherency;/* indicate coherency of iommu access */
+       int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
@@ -421,7 +428,6 @@ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
        return g_iommus[iommu_id];
 }
 
-/* "Coherency" capability may be different across iommus */
 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 {
        int i;
@@ -438,6 +444,29 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain)
        }
 }
 
+static void domain_update_iommu_snooping(struct dmar_domain *domain)
+{
+       int i;
+
+       domain->iommu_snooping = 1;
+
+       i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
+       for (; i < g_num_of_iommus; ) {
+               if (!ecap_sc_support(g_iommus[i]->ecap)) {
+                       domain->iommu_snooping = 0;
+                       break;
+               }
+               i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
+       }
+}
+
+/* Some capabilities may be different across iommus */
+static void domain_update_iommu_cap(struct dmar_domain *domain)
+{
+       domain_update_iommu_coherency(domain);
+       domain_update_iommu_snooping(domain);
+}
+
 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
 {
        struct dmar_drhd_unit *drhd = NULL;
@@ -689,15 +718,17 @@ static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
 {
        int addr_width = agaw_to_width(domain->agaw);
+       int npages;
 
        start &= (((u64)1) << addr_width) - 1;
        end &= (((u64)1) << addr_width) - 1;
        /* in case it's partial page */
        start = PAGE_ALIGN(start);
        end &= PAGE_MASK;
+       npages = (end - start) / VTD_PAGE_SIZE;
 
        /* we don't need lock here, nobody else touches the iova range */
-       while (start < end) {
+       while (npages--) {
                dma_pte_clear_one(domain, start);
                start += VTD_PAGE_SIZE;
        }
@@ -1241,6 +1272,11 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
        else
                domain->iommu_coherency = 0;
 
+       if (ecap_sc_support(iommu->ecap))
+               domain->iommu_snooping = 1;
+       else
+               domain->iommu_snooping = 0;
+
        domain->iommu_count = 1;
 
        /* always allocate the top pgd */
@@ -1369,7 +1405,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
        spin_lock_irqsave(&domain->iommu_lock, flags);
        if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
                domain->iommu_count++;
-               domain_update_iommu_coherency(domain);
+               domain_update_iommu_cap(domain);
        }
        spin_unlock_irqrestore(&domain->iommu_lock, flags);
        return 0;
@@ -1469,6 +1505,8 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
                BUG_ON(dma_pte_addr(pte));
                dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
                dma_set_pte_prot(pte, prot);
+               if (prot & DMA_PTE_SNP)
+                       dma_set_pte_snp(pte);
                domain_flush_cache(domain, pte, sizeof(*pte));
                start_pfn++;
                index++;
@@ -2119,7 +2157,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
 error:
        if (iova)
                __free_iova(&domain->iovad, iova);
-       printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
+       printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
                pci_name(pdev), size, (unsigned long long)paddr, dir);
        return 0;
 }
@@ -2218,7 +2256,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
        start_addr = iova->pfn_lo << PAGE_SHIFT;
        size = aligned_size((u64)dev_addr, size);
 
-       pr_debug("Device %s unmapping: %lx@%llx\n",
+       pr_debug("Device %s unmapping: %zx@%llx\n",
                pci_name(pdev), size, (unsigned long long)start_addr);
 
        /*  clear the whole page */
@@ -2282,8 +2320,6 @@ static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
        free_pages((unsigned long)vaddr, order);
 }
 
-#define SG_ENT_VIRT_ADDRESS(sg)        (sg_virt((sg)))
-
 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
                           int nelems, enum dma_data_direction dir,
                           struct dma_attrs *attrs)
@@ -2294,7 +2330,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
        unsigned long start_addr;
        struct iova *iova;
        size_t size = 0;
-       void *addr;
+       phys_addr_t addr;
        struct scatterlist *sg;
        struct intel_iommu *iommu;
 
@@ -2310,7 +2346,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
        if (!iova)
                return;
        for_each_sg(sglist, sg, nelems, i) {
-               addr = SG_ENT_VIRT_ADDRESS(sg);
+               addr = page_to_phys(sg_page(sg)) + sg->offset;
                size += aligned_size((u64)addr, sg->length);
        }
 
@@ -2337,7 +2373,7 @@ static int intel_nontranslate_map_sg(struct device *hddev,
 
        for_each_sg(sglist, sg, nelems, i) {
                BUG_ON(!sg_page(sg));
-               sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
+               sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
                sg->dma_length = sg->length;
        }
        return nelems;
@@ -2346,7 +2382,7 @@ static int intel_nontranslate_map_sg(struct device *hddev,
 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
                        enum dma_data_direction dir, struct dma_attrs *attrs)
 {
-       void *addr;
+       phys_addr_t addr;
        int i;
        struct pci_dev *pdev = to_pci_dev(hwdev);
        struct dmar_domain *domain;
@@ -2370,8 +2406,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
        iommu = domain_get_iommu(domain);
 
        for_each_sg(sglist, sg, nelems, i) {
-               addr = SG_ENT_VIRT_ADDRESS(sg);
-               addr = (void *)virt_to_phys(addr);
+               addr = page_to_phys(sg_page(sg)) + sg->offset;
                size += aligned_size((u64)addr, sg->length);
        }
 
@@ -2394,8 +2429,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
        start_addr = iova->pfn_lo << PAGE_SHIFT;
        offset = 0;
        for_each_sg(sglist, sg, nelems, i) {
-               addr = SG_ENT_VIRT_ADDRESS(sg);
-               addr = (void *)virt_to_phys(addr);
+               addr = page_to_phys(sg_page(sg)) + sg->offset;
                size = aligned_size((u64)addr, sg->length);
                ret = domain_page_mapping(domain, start_addr + offset,
                        ((u64)addr) & PAGE_MASK,
@@ -2628,6 +2662,33 @@ static int vm_domain_add_dev_info(struct dmar_domain *domain,
        return 0;
 }
 
+static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
+                                          struct pci_dev *pdev)
+{
+       struct pci_dev *tmp, *parent;
+
+       if (!iommu || !pdev)
+               return;
+
+       /* dependent device detach */
+       tmp = pci_find_upstream_pcie_bridge(pdev);
+       /* Secondary interface's bus number and devfn 0 */
+       if (tmp) {
+               parent = pdev->bus->self;
+               while (parent != tmp) {
+                       iommu_detach_dev(iommu, parent->bus->number,
+                               parent->devfn);
+                       parent = parent->bus->self;
+               }
+               if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
+                       iommu_detach_dev(iommu,
+                               tmp->subordinate->number, 0);
+               else /* this is a legacy PCI bridge */
+                       iommu_detach_dev(iommu,
+                               tmp->bus->number, tmp->devfn);
+       }
+}
+
 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
                                          struct pci_dev *pdev)
 {
@@ -2653,6 +2714,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
                        spin_unlock_irqrestore(&device_domain_lock, flags);
 
                        iommu_detach_dev(iommu, info->bus, info->devfn);
+                       iommu_detach_dependent_devices(iommu, pdev);
                        free_devinfo_mem(info);
 
                        spin_lock_irqsave(&device_domain_lock, flags);
@@ -2676,7 +2738,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
                spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
                clear_bit(iommu->seq_id, &domain->iommu_bmp);
                domain->iommu_count--;
-               domain_update_iommu_coherency(domain);
+               domain_update_iommu_cap(domain);
                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
        }
 
@@ -2702,15 +2764,16 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
 
                iommu = device_to_iommu(info->bus, info->devfn);
                iommu_detach_dev(iommu, info->bus, info->devfn);
+               iommu_detach_dependent_devices(iommu, info->dev);
 
                /* clear this iommu in iommu_bmp, update iommu count
-                * and coherency
+                * and capabilities
                 */
                spin_lock_irqsave(&domain->iommu_lock, flags2);
                if (test_and_clear_bit(iommu->seq_id,
                                       &domain->iommu_bmp)) {
                        domain->iommu_count--;
-                       domain_update_iommu_coherency(domain);
+                       domain_update_iommu_cap(domain);
                }
                spin_unlock_irqrestore(&domain->iommu_lock, flags2);
 
@@ -2933,6 +2996,8 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
                prot |= DMA_PTE_READ;
        if (iommu_prot & IOMMU_WRITE)
                prot |= DMA_PTE_WRITE;
+       if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
+               prot |= DMA_PTE_SNP;
 
        max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
        if (dmar_domain->max_addr < max_addr) {
@@ -2986,6 +3051,17 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
        return phys;
 }
 
+static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
+                                     unsigned long cap)
+{
+       struct dmar_domain *dmar_domain = domain->priv;
+
+       if (cap == IOMMU_CAP_CACHE_COHERENCY)
+               return dmar_domain->iommu_snooping;
+
+       return 0;
+}
+
 static struct iommu_ops intel_iommu_ops = {
        .domain_init    = intel_iommu_domain_init,
        .domain_destroy = intel_iommu_domain_destroy,
@@ -2994,6 +3070,7 @@ static struct iommu_ops intel_iommu_ops = {
        .map            = intel_iommu_map_range,
        .unmap          = intel_iommu_unmap_range,
        .iova_to_phys   = intel_iommu_iova_to_phys,
+       .domain_has_cap = intel_iommu_domain_has_cap,
 };
 
 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
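
A hedged sketch of the consumer side of the new snooping capability. It assumes the generic IOMMU API wrappers of this period, iommu_domain_has_cap() and iommu_map_range(), which dispatch to the .domain_has_cap and .map operations registered above; the domain, addresses and function name are placeholders. IOMMU_READ, IOMMU_WRITE, IOMMU_CACHE and IOMMU_CAP_CACHE_COHERENCY are the flags used in the hunks above.

#include <linux/iommu.h>

/* map a region, asking for a snooped mapping only when the hardware can */
static int example_map_coherent(struct iommu_domain *domain,
				unsigned long iova, phys_addr_t paddr,
				size_t size)
{
	int prot = IOMMU_READ | IOMMU_WRITE;

	if (iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY))
		prot |= IOMMU_CACHE;	/* lets the VT-d map path set DMA_PTE_SNP */

	return iommu_map_range(domain, iova, paddr, size, prot);
}
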
index 4ed64d8e95e709dd432686486c963df413ea189b..5143a760153b9626aaf15df274e795732951ed80 100644 (file)
@@ -63,7 +63,7 @@ static void cmx255_pcmcia_socket_state(struct soc_pcmcia_socket *skt,
                                       struct pcmcia_state *state)
 {
        int cd = skt->nr ? GPIO_PCMCIA_S1_CD_VALID : GPIO_PCMCIA_S0_CD_VALID;
-       int rdy = skt->nr ? GPIO_PCMCIA_S0_RDYINT : GPIO_PCMCIA_S1_RDYINT;
+       int rdy = skt->nr ? GPIO_PCMCIA_S1_RDYINT : GPIO_PCMCIA_S0_RDYINT;
 
        state->detect = !gpio_get_value(cd);
        state->ready  = !!gpio_get_value(rdy);
index e7e0cf102d6dcacc00272bd299c5366dbfc4f527..e58c0ce65aa6d4f6cf750feba06591df4d4ca33c 100644 (file)
@@ -29,8 +29,12 @@ config REGULATOR_DEBUG
          Say yes here to enable debugging support.
 
 config REGULATOR_FIXED_VOLTAGE
-       tristate
+       tristate "Fixed voltage regulator support"
        default n
+       help
+         This driver provides support for fixed voltage regulators,
+         useful for systems which use a combination of software
+         managed regulators and simple non-configurable regulators.
 
 config REGULATOR_VIRTUAL_CONSUMER
        tristate "Virtual regulator consumer support"
@@ -52,6 +56,13 @@ config REGULATOR_BQ24022
          charging select between 100 mA and 500 mA charging current
          limit.
 
+config REGULATOR_TWL4030
+       bool "TI TWL4030/TWL5030/TPS695x0 PMIC"
+       depends on TWL4030_CORE
+       help
+         This driver supports the voltage regulators provided by
+         this family of companion chips.
+
 config REGULATOR_WM8350
        tristate "Wolfson Microelectroncis WM8350 AudioPlus PMIC"
        depends on MFD_WM8350
index 61b30c6ddecc1cf86b43b694a9939300ddbf941c..bac133afc061473f8cd74662fe9a7af091bba239 100644 (file)
@@ -8,6 +8,7 @@ obj-$(CONFIG_REGULATOR_FIXED_VOLTAGE) += fixed.o
 obj-$(CONFIG_REGULATOR_VIRTUAL_CONSUMER) += virtual.o
 
 obj-$(CONFIG_REGULATOR_BQ24022) += bq24022.o
+obj-$(CONFIG_REGULATOR_TWL4030) += twl4030-regulator.o
 obj-$(CONFIG_REGULATOR_WM8350) += wm8350-regulator.o
 obj-$(CONFIG_REGULATOR_WM8400) += wm8400-regulator.o
 obj-$(CONFIG_REGULATOR_DA903X) += da903x.o
index c175e38a4cd56477e364ade9e5b4295b0b76142b..7ecb820ceebc5b21568ef80c07b019041df7a2d6 100644 (file)
@@ -105,7 +105,8 @@ static int __init bq24022_probe(struct platform_device *pdev)
        ret = gpio_direction_output(pdata->gpio_iset2, 0);
        ret = gpio_direction_output(pdata->gpio_nce, 1);
 
-       bq24022 = regulator_register(&bq24022_desc, &pdev->dev, pdata);
+       bq24022 = regulator_register(&bq24022_desc, &pdev->dev,
+                                    pdata->init_data, pdata);
        if (IS_ERR(bq24022)) {
                dev_dbg(&pdev->dev, "couldn't register regulator\n");
                ret = PTR_ERR(bq24022);
index f511a406fcaac4fca414bb954124d3e11f55cc86..01f7702a805dc462f29d59110c1c76bb9985c480 100644 (file)
 static DEFINE_MUTEX(regulator_list_mutex);
 static LIST_HEAD(regulator_list);
 static LIST_HEAD(regulator_map_list);
-
-/*
- * struct regulator_dev
- *
- * Voltage / Current regulator class device. One for each regulator.
- */
-struct regulator_dev {
-       struct regulator_desc *desc;
-       int use_count;
-
-       /* lists we belong to */
-       struct list_head list; /* list of all regulators */
-       struct list_head slist; /* list of supplied regulators */
-
-       /* lists we own */
-       struct list_head consumer_list; /* consumers we supply */
-       struct list_head supply_list; /* regulators we supply */
-
-       struct blocking_notifier_head notifier;
-       struct mutex mutex; /* consumer lock */
-       struct module *owner;
-       struct device dev;
-       struct regulation_constraints *constraints;
-       struct regulator_dev *supply;   /* for tree */
-
-       void *reg_data;         /* regulator_dev data */
-};
+static int has_full_constraints;
 
 /*
  * struct regulator_map
@@ -79,7 +53,6 @@ struct regulator {
        int uA_load;
        int min_uV;
        int max_uV;
-       int enabled; /* count of client enables */
        char *supply_name;
        struct device_attribute dev_attr;
        struct regulator_dev *rdev;
@@ -312,6 +285,47 @@ static ssize_t regulator_state_show(struct device *dev,
 }
 static DEVICE_ATTR(state, 0444, regulator_state_show, NULL);
 
+static ssize_t regulator_status_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct regulator_dev *rdev = dev_get_drvdata(dev);
+       int status;
+       char *label;
+
+       status = rdev->desc->ops->get_status(rdev);
+       if (status < 0)
+               return status;
+
+       switch (status) {
+       case REGULATOR_STATUS_OFF:
+               label = "off";
+               break;
+       case REGULATOR_STATUS_ON:
+               label = "on";
+               break;
+       case REGULATOR_STATUS_ERROR:
+               label = "error";
+               break;
+       case REGULATOR_STATUS_FAST:
+               label = "fast";
+               break;
+       case REGULATOR_STATUS_NORMAL:
+               label = "normal";
+               break;
+       case REGULATOR_STATUS_IDLE:
+               label = "idle";
+               break;
+       case REGULATOR_STATUS_STANDBY:
+               label = "standby";
+               break;
+       default:
+               return -ERANGE;
+       }
+
+       return sprintf(buf, "%s\n", label);
+}
+static DEVICE_ATTR(status, 0444, regulator_status_show, NULL);
+
 static ssize_t regulator_min_uA_show(struct device *dev,
                                    struct device_attribute *attr, char *buf)
 {
@@ -678,6 +692,73 @@ static int set_machine_constraints(struct regulator_dev *rdev,
        else
                name = "regulator";
 
+       /* constrain machine-level voltage specs to fit
+        * the actual range supported by this regulator.
+        */
+       if (ops->list_voltage && rdev->desc->n_voltages) {
+               int     count = rdev->desc->n_voltages;
+               int     i;
+               int     min_uV = INT_MAX;
+               int     max_uV = INT_MIN;
+               int     cmin = constraints->min_uV;
+               int     cmax = constraints->max_uV;
+
+               /* it's safe to autoconfigure fixed-voltage supplies */
+               if (count == 1 && !cmin) {
+                       cmin = INT_MIN;
+                       cmax = INT_MAX;
+               }
+
+               /* voltage constraints are optional */
+               if ((cmin == 0) && (cmax == 0))
+                       goto out;
+
+               /* else require explicit machine-level constraints */
+               if (cmin <= 0 || cmax <= 0 || cmax < cmin) {
+                       pr_err("%s: %s '%s' voltage constraints\n",
+                                      __func__, "invalid", name);
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               /* initial: [cmin..cmax] valid, [min_uV..max_uV] not */
+               for (i = 0; i < count; i++) {
+                       int     value;
+
+                       value = ops->list_voltage(rdev, i);
+                       if (value <= 0)
+                               continue;
+
+                       /* maybe adjust [min_uV..max_uV] */
+                       if (value >= cmin && value < min_uV)
+                               min_uV = value;
+                       if (value <= cmax && value > max_uV)
+                               max_uV = value;
+               }
+
+               /* final: [min_uV..max_uV] valid iff constraints valid */
+               if (max_uV < min_uV) {
+                       pr_err("%s: %s '%s' voltage constraints\n",
+                                      __func__, "unsupportable", name);
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               /* use regulator's subset of machine constraints */
+               if (constraints->min_uV < min_uV) {
+                       pr_debug("%s: override '%s' %s, %d -> %d\n",
+                                      __func__, name, "min_uV",
+                                       constraints->min_uV, min_uV);
+                       constraints->min_uV = min_uV;
+               }
+               if (constraints->max_uV > max_uV) {
+                       pr_debug("%s: override '%s' %s, %d -> %d\n",
+                                      __func__, name, "max_uV",
+                                       constraints->max_uV, max_uV);
+                       constraints->max_uV = max_uV;
+               }
+       }
+
        rdev->constraints = constraints;
 
        /* do we need to apply the constraint voltage */
@@ -695,10 +776,6 @@ static int set_machine_constraints(struct regulator_dev *rdev,
                        }
        }
 
-       /* are we enabled at boot time by firmware / bootloader */
-       if (rdev->constraints->boot_on)
-               rdev->use_count = 1;
-
        /* do we need to setup our suspend state */
        if (constraints->initial_state) {
                ret = suspend_prepare(rdev, constraints->initial_state);
@@ -710,11 +787,27 @@ static int set_machine_constraints(struct regulator_dev *rdev,
                }
        }
 
-       /* if always_on is set then turn the regulator on if it's not
-        * already on. */
-       if (constraints->always_on && ops->enable &&
-           ((ops->is_enabled && !ops->is_enabled(rdev)) ||
-            (!ops->is_enabled && !constraints->boot_on))) {
+       if (constraints->initial_mode) {
+               if (!ops->set_mode) {
+                       printk(KERN_ERR "%s: no set_mode operation for %s\n",
+                              __func__, name);
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               ret = ops->set_mode(rdev, constraints->initial_mode);
+               if (ret < 0) {
+                       printk(KERN_ERR
+                              "%s: failed to set initial mode for %s: %d\n",
+                              __func__, name, ret);
+                       goto out;
+               }
+       }
+
+       /* If the constraints say the regulator should be on at this point
+        * and we have control then make sure it is enabled.
+        */
+       if ((constraints->always_on || constraints->boot_on) && ops->enable) {
                ret = ops->enable(rdev);
                if (ret < 0) {
                        printk(KERN_ERR "%s: failed to enable %s\n",
@@ -817,6 +910,19 @@ static void unset_consumer_device_supply(struct regulator_dev *rdev,
        }
 }
 
+static void unset_regulator_supplies(struct regulator_dev *rdev)
+{
+       struct regulator_map *node, *n;
+
+       list_for_each_entry_safe(node, n, &regulator_map_list, list) {
+               if (rdev == node->regulator) {
+                       list_del(&node->list);
+                       kfree(node);
+                       return;
+               }
+       }
+}
+
 #define REG_STR_SIZE   32
 
 static struct regulator *create_regulator(struct regulator_dev *rdev,
@@ -898,9 +1004,12 @@ overflow_err:
  * @id: Supply name or regulator ID.
  *
  * Returns a struct regulator corresponding to the regulator producer,
- * or IS_ERR() condition containing errno.  Use of supply names
- * configured via regulator_set_device_supply() is strongly
- * encouraged.
+ * or IS_ERR() condition containing errno.
+ *
+ * Use of supply names configured via regulator_set_device_supply() is
+ * strongly encouraged.  It is recommended that the supply name used
+ * should match the name used for the supply and/or the relevant
+ * device pins in the datasheet.
  */
 struct regulator *regulator_get(struct device *dev, const char *id)
 {
@@ -922,8 +1031,6 @@ struct regulator *regulator_get(struct device *dev, const char *id)
                        goto found;
                }
        }
-       printk(KERN_ERR "regulator: Unable to get requested regulator: %s\n",
-              id);
        mutex_unlock(&regulator_list_mutex);
        return regulator;
 
@@ -961,10 +1068,6 @@ void regulator_put(struct regulator *regulator)
        mutex_lock(&regulator_list_mutex);
        rdev = regulator->rdev;
 
-       if (WARN(regulator->enabled, "Releasing supply %s while enabled\n",
-                              regulator->supply_name))
-               _regulator_disable(rdev);
-
        /* remove any sysfs entries */
        if (regulator->dev) {
                sysfs_remove_link(&rdev->dev.kobj, regulator->supply_name);
@@ -1039,12 +1142,7 @@ int regulator_enable(struct regulator *regulator)
        int ret = 0;
 
        mutex_lock(&rdev->mutex);
-       if (regulator->enabled == 0)
-               ret = _regulator_enable(rdev);
-       else if (regulator->enabled < 0)
-               ret = -EIO;
-       if (ret == 0)
-               regulator->enabled++;
+       ret = _regulator_enable(rdev);
        mutex_unlock(&rdev->mutex);
        return ret;
 }
@@ -1055,6 +1153,11 @@ static int _regulator_disable(struct regulator_dev *rdev)
 {
        int ret = 0;
 
+       if (WARN(rdev->use_count <= 0,
+                       "unbalanced disables for %s\n",
+                       rdev->desc->name))
+               return -EIO;
+
        /* are we the last user and permitted to disable ? */
        if (rdev->use_count == 1 && !rdev->constraints->always_on) {
 
@@ -1103,16 +1206,7 @@ int regulator_disable(struct regulator *regulator)
        int ret = 0;
 
        mutex_lock(&rdev->mutex);
-       if (regulator->enabled == 1) {
-               ret = _regulator_disable(rdev);
-               if (ret == 0)
-                       regulator->uA_load = 0;
-       } else if (WARN(regulator->enabled <= 0,
-                       "unbalanced disables for supply %s\n",
-                       regulator->supply_name))
-               ret = -EIO;
-       if (ret == 0)
-               regulator->enabled--;
+       ret = _regulator_disable(rdev);
        mutex_unlock(&rdev->mutex);
        return ret;
 }
@@ -1159,7 +1253,6 @@ int regulator_force_disable(struct regulator *regulator)
        int ret;
 
        mutex_lock(&regulator->rdev->mutex);
-       regulator->enabled = 0;
        regulator->uA_load = 0;
        ret = _regulator_force_disable(regulator->rdev);
        mutex_unlock(&regulator->rdev->mutex);
@@ -1203,6 +1296,56 @@ int regulator_is_enabled(struct regulator *regulator)
 }
 EXPORT_SYMBOL_GPL(regulator_is_enabled);
 
+/**
+ * regulator_count_voltages - count regulator_list_voltage() selectors
+ * @regulator: regulator source
+ *
+ * Returns number of selectors, or negative errno.  Selectors are
+ * numbered starting at zero, and typically correspond to bitfields
+ * in hardware registers.
+ */
+int regulator_count_voltages(struct regulator *regulator)
+{
+       struct regulator_dev    *rdev = regulator->rdev;
+
+       return rdev->desc->n_voltages ? : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(regulator_count_voltages);
+
+/**
+ * regulator_list_voltage - enumerate supported voltages
+ * @regulator: regulator source
+ * @selector: identify voltage to list
+ * Context: can sleep
+ *
+ * Returns a voltage that can be passed to @regulator_set_voltage(),
+ * zero if this selector code can't be used on this sytem, or a
+ * negative errno.
+ */
+int regulator_list_voltage(struct regulator *regulator, unsigned selector)
+{
+       struct regulator_dev    *rdev = regulator->rdev;
+       struct regulator_ops    *ops = rdev->desc->ops;
+       int                     ret;
+
+       if (!ops->list_voltage || selector >= rdev->desc->n_voltages)
+               return -EINVAL;
+
+       mutex_lock(&rdev->mutex);
+       ret = ops->list_voltage(rdev, selector);
+       mutex_unlock(&rdev->mutex);
+
+       if (ret > 0) {
+               if (ret < rdev->constraints->min_uV)
+                       ret = 0;
+               else if (ret > rdev->constraints->max_uV)
+                       ret = 0;
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(regulator_list_voltage);
+
 /**
  * regulator_set_voltage - set regulator output voltage
  * @regulator: regulator source
@@ -1243,6 +1386,7 @@ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV)
        ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV);
 
 out:
+       _notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, NULL);
        mutex_unlock(&rdev->mutex);
        return ret;
 }
@@ -1543,20 +1687,23 @@ int regulator_unregister_notifier(struct regulator *regulator,
 }
 EXPORT_SYMBOL_GPL(regulator_unregister_notifier);
 
-/* notify regulator consumers and downstream regulator consumers */
+/* notify regulator consumers and downstream regulator consumers.
+ * Note mutex must be held by caller.
+ */
 static void _notifier_call_chain(struct regulator_dev *rdev,
                                  unsigned long event, void *data)
 {
        struct regulator_dev *_rdev;
 
        /* call rdev chain first */
-       mutex_lock(&rdev->mutex);
        blocking_notifier_call_chain(&rdev->notifier, event, NULL);
-       mutex_unlock(&rdev->mutex);
 
        /* now notify regulator we supply */
-       list_for_each_entry(_rdev, &rdev->supply_list, slist)
-               _notifier_call_chain(_rdev, event, data);
+       list_for_each_entry(_rdev, &rdev->supply_list, slist) {
+         mutex_lock(&_rdev->mutex);
+         _notifier_call_chain(_rdev, event, data);
+         mutex_unlock(&_rdev->mutex);
+       }
 }
 
 /**
@@ -1703,6 +1850,7 @@ EXPORT_SYMBOL_GPL(regulator_bulk_free);
  *
  * Called by regulator drivers to notify clients a regulator event has
  * occurred. We also notify regulator clients downstream.
+ * Note lock must be held by caller.
  */
 int regulator_notifier_call_chain(struct regulator_dev *rdev,
                                  unsigned long event, void *data)
@@ -1744,6 +1892,11 @@ static int add_regulator_attributes(struct regulator_dev *rdev)
                if (status < 0)
                        return status;
        }
+       if (ops->get_status) {
+               status = device_create_file(dev, &dev_attr_status);
+               if (status < 0)
+                       return status;
+       }
 
        /* some attributes are type-specific */
        if (rdev->desc->type == REGULATOR_CURRENT) {
@@ -1828,17 +1981,18 @@ static int add_regulator_attributes(struct regulator_dev *rdev)
  * regulator_register - register regulator
  * @regulator_desc: regulator to register
  * @dev: struct device for the regulator
+ * @init_data: platform provided init data, passed through by driver
  * @driver_data: private regulator data
  *
  * Called by regulator drivers to register a regulator.
  * Returns 0 on success.
  */
 struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
-       struct device *dev, void *driver_data)
+       struct device *dev, struct regulator_init_data *init_data,
+       void *driver_data)
 {
        static atomic_t regulator_no = ATOMIC_INIT(0);
        struct regulator_dev *rdev;
-       struct regulator_init_data *init_data = dev->platform_data;
        int ret, i;
 
        if (regulator_desc == NULL)
@@ -1945,6 +2099,7 @@ void regulator_unregister(struct regulator_dev *rdev)
                return;
 
        mutex_lock(&regulator_list_mutex);
+       unset_regulator_supplies(rdev);
        list_del(&rdev->list);
        if (rdev->supply)
                sysfs_remove_link(&rdev->dev.kobj, "supply");
@@ -1988,6 +2143,23 @@ out:
 }
 EXPORT_SYMBOL_GPL(regulator_suspend_prepare);
 
+/**
+ * regulator_has_full_constraints - the system has fully specified constraints
+ *
+ * Calling this function will cause the regulator API to disable all
+ * regulators which have a zero use count and don't have an always_on
+ * constraint in a late_initcall.
+ *
+ * The intention is that this will become the default behaviour in a
+ * future kernel release so users are encouraged to use this facility
+ * now.
+ */
+void regulator_has_full_constraints(void)
+{
+       has_full_constraints = 1;
+}
+EXPORT_SYMBOL_GPL(regulator_has_full_constraints);
+
 /**
  * rdev_get_drvdata - get rdev regulator driver data
  * @rdev: regulator
@@ -2055,3 +2227,77 @@ static int __init regulator_init(void)
 
 /* init early to allow our consumers to complete system booting */
 core_initcall(regulator_init);
+
+static int __init regulator_init_complete(void)
+{
+       struct regulator_dev *rdev;
+       struct regulator_ops *ops;
+       struct regulation_constraints *c;
+       int enabled, ret;
+       const char *name;
+
+       mutex_lock(&regulator_list_mutex);
+
+       /* If we have a full configuration then disable any regulators
+        * which are not in use or always_on.  This will become the
+        * default behaviour in the future.
+        */
+       list_for_each_entry(rdev, &regulator_list, list) {
+               ops = rdev->desc->ops;
+               c = rdev->constraints;
+
+               if (c->name)
+                       name = c->name;
+               else if (rdev->desc->name)
+                       name = rdev->desc->name;
+               else
+                       name = "regulator";
+
+               if (!ops->disable || c->always_on)
+                       continue;
+
+               mutex_lock(&rdev->mutex);
+
+               if (rdev->use_count)
+                       goto unlock;
+
+               /* If we can't read the status assume it's on. */
+               if (ops->is_enabled)
+                       enabled = ops->is_enabled(rdev);
+               else
+                       enabled = 1;
+
+               if (!enabled)
+                       goto unlock;
+
+               if (has_full_constraints) {
+                       /* We log since this may kill the system if it
+                        * goes wrong. */
+                       printk(KERN_INFO "%s: disabling %s\n",
+                              __func__, name);
+                       ret = ops->disable(rdev);
+                       if (ret != 0) {
+                               printk(KERN_ERR
+                                      "%s: couldn't disable %s: %d\n",
+                                      __func__, name, ret);
+                       }
+               } else {
+                       /* The intention is that in future we will
+                        * assume that full constraints are provided
+                        * so warn even if we aren't going to do
+                        * anything here.
+                        */
+                       printk(KERN_WARNING
+                              "%s: incomplete constraints, leaving %s on\n",
+                              __func__, name);
+               }
+
+unlock:
+               mutex_unlock(&rdev->mutex);
+       }
+
+       mutex_unlock(&regulator_list_mutex);
+
+       return 0;
+}
+late_initcall(regulator_init_complete);
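
A hedged consumer-side sketch of the API added above: it walks regulator_count_voltages() and regulator_list_voltage() to pick a supported voltage before calling regulator_set_voltage(). The "vio" supply name and the 1.8 V to 3.3 V window are illustrative. On the machine side, board code that has described every supply can additionally call regulator_has_full_constraints() so that regulator_init_complete() is allowed to switch off unused regulators.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/regulator/consumer.h>

/* pick the lowest voltage the supply supports inside an illustrative window */
static int example_pick_voltage(struct device *dev)
{
	struct regulator *reg;
	int i, count, best = -EINVAL;

	reg = regulator_get(dev, "vio");	/* supply name is illustrative */
	if (IS_ERR(reg))
		return PTR_ERR(reg);

	count = regulator_count_voltages(reg);	/* a negative errno skips the loop */
	for (i = 0; i < count; i++) {
		int uV = regulator_list_voltage(reg, i);

		if (uV >= 1800000 && uV <= 3300000 && (best < 0 || uV < best))
			best = uV;
	}

	if (best > 0)
		best = regulator_set_voltage(reg, best, best);

	regulator_put(reg);
	return best;
}
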
index fe77730a7edb73442b088a4e325dd70d4c2a6366..72b15495183cfd290289fd9256babe296bbbf263 100644 (file)
@@ -471,7 +471,8 @@ static int __devinit da903x_regulator_probe(struct platform_device *pdev)
        if (ri->desc.id == DA9030_ID_LDO1 || ri->desc.id == DA9030_ID_LDO15)
                ri->desc.ops = &da9030_regulator_ldo1_15_ops;
 
-       rdev = regulator_register(&ri->desc, &pdev->dev, ri);
+       rdev = regulator_register(&ri->desc, &pdev->dev,
+                                 pdev->dev.platform_data, ri);
        if (IS_ERR(rdev)) {
                dev_err(&pdev->dev, "failed to register regulator %s\n",
                                ri->desc.name);
index d31db3e14913fb1d1281c6a4eea49bf145242b76..23d554628a76128e6786bb9666a67c0179e4ec32 100644 (file)
@@ -73,7 +73,8 @@ static int regulator_fixed_voltage_probe(struct platform_device *pdev)
 
        drvdata->microvolts = config->microvolts;
 
-       drvdata->dev = regulator_register(&drvdata->desc, drvdata);
+       drvdata->dev = regulator_register(&drvdata->desc, &pdev->dev,
+                                         config->init_data, drvdata);
        if (IS_ERR(drvdata->dev)) {
                ret = PTR_ERR(drvdata->dev);
                goto err_name;
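
The bq24022, da903x and fixed-voltage updates above, like the pcf50633 one that follows, all apply the same pattern forced by the new regulator_register() prototype: the regulator_init_data is handed over explicitly instead of being read from dev->platform_data inside the core. A minimal, hedged probe() sketch of that pattern; example_desc and the function name are illustrative, the prototype is the one documented in the core.c hunk above.

#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/regulator/driver.h>

static struct regulator_desc example_desc;	/* filled in by a real driver */

static int example_regulator_probe(struct platform_device *pdev)
{
	struct regulator_dev *rdev;

	/* init data now travels as an explicit argument */
	rdev = regulator_register(&example_desc, &pdev->dev,
				  pdev->dev.platform_data, NULL);
	if (IS_ERR(rdev))
		return PTR_ERR(rdev);

	platform_set_drvdata(pdev, rdev);
	return 0;
}
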
index 4cc85ec6e1208b25abf0ed4121726002739adbc2..cd761d85c8fdbc39e42a6cf3d67864744c64c76b 100644 (file)
@@ -284,7 +284,8 @@ static int __devinit pcf50633_regulator_probe(struct platform_device *pdev)
        /* Already set by core driver */
        pcf = platform_get_drvdata(pdev);
 
-       rdev = regulator_register(&regulators[pdev->id], &pdev->dev, pcf);
+       rdev = regulator_register(&regulators[pdev->id], &pdev->dev,
+                                 pdev->dev.platform_data, pcf);
        if (IS_ERR(rdev))
                return PTR_ERR(rdev);
 
diff --git a/drivers/regulator/twl4030-regulator.c b/drivers/regulator/twl4030-regulator.c
new file mode 100644 (file)
index 0000000..e2032fb
--- /dev/null
@@ -0,0 +1,500 @@
+/*
+ * twl4030-regulator.c -- support regulators in twl4030 family chips
+ *
+ * Copyright (C) 2008 David Brownell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
+#include <linux/i2c/twl4030.h>
+
+
+/*
+ * The TWL4030/TW5030/TPS659x0 family chips include power management, a
+ * USB OTG transceiver, an RTC, ADC, PWM, and lots more.  Some versions
+ * include an audio codec, battery charger, and more voltage regulators.
+ * These chips are often used in OMAP-based systems.
+ *
+ * This driver implements software-based resource control for various
+ * voltage regulators.  This is usually augmented with state machine
+ * based control.
+ */
+
+struct twlreg_info {
+       /* start of regulator's PM_RECEIVER control register bank */
+       u8                      base;
+
+       /* twl4030 resource ID, for resource control state machine */
+       u8                      id;
+
+       /* voltage in mV = table[VSEL]; table_len must be a power-of-two */
+       u8                      table_len;
+       const u16               *table;
+
+       /* chip constraints on regulator behavior */
+       u16                     min_mV;
+
+       /* used by regulator core */
+       struct regulator_desc   desc;
+};
+
+
+/* LDO control registers ... offset is from the base of its register bank.
+ * The first three registers of all power resource banks help hardware to
+ * manage the various resource groups.
+ */
+#define VREG_GRP               0
+#define VREG_TYPE              1
+#define VREG_REMAP             2
+#define VREG_DEDICATED         3       /* LDO control */
+
+
+static inline int
+twl4030reg_read(struct twlreg_info *info, unsigned offset)
+{
+       u8 value;
+       int status;
+
+       status = twl4030_i2c_read_u8(TWL4030_MODULE_PM_RECEIVER,
+                       &value, info->base + offset);
+       return (status < 0) ? status : value;
+}
+
+static inline int
+twl4030reg_write(struct twlreg_info *info, unsigned offset, u8 value)
+{
+       return twl4030_i2c_write_u8(TWL4030_MODULE_PM_RECEIVER,
+                       value, info->base + offset);
+}
+
+/*----------------------------------------------------------------------*/
+
+/* generic power resource operations, which work on all regulators */
+
+static int twl4030reg_grp(struct regulator_dev *rdev)
+{
+       return twl4030reg_read(rdev_get_drvdata(rdev), VREG_GRP);
+}
+
+/*
+ * Enable/disable regulators by joining/leaving the P1 (processor) group.
+ * We assume nobody else is updating the DEV_GRP registers.
+ */
+
+#define P3_GRP         BIT(7)          /* "peripherals" */
+#define P2_GRP         BIT(6)          /* secondary processor, modem, etc */
+#define P1_GRP         BIT(5)          /* CPU/Linux */
+
+static int twl4030reg_is_enabled(struct regulator_dev *rdev)
+{
+       int     state = twl4030reg_grp(rdev);
+
+       if (state < 0)
+               return state;
+
+       return (state & P1_GRP) != 0;
+}
+
+static int twl4030reg_enable(struct regulator_dev *rdev)
+{
+       struct twlreg_info      *info = rdev_get_drvdata(rdev);
+       int                     grp;
+
+       grp = twl4030reg_read(info, VREG_GRP);
+       if (grp < 0)
+               return grp;
+
+       grp |= P1_GRP;
+       return twl4030reg_write(info, VREG_GRP, grp);
+}
+
+static int twl4030reg_disable(struct regulator_dev *rdev)
+{
+       struct twlreg_info      *info = rdev_get_drvdata(rdev);
+       int                     grp;
+
+       grp = twl4030reg_read(info, VREG_GRP);
+       if (grp < 0)
+               return grp;
+
+       grp &= ~P1_GRP;
+       return twl4030reg_write(info, VREG_GRP, grp);
+}
+
+static int twl4030reg_get_status(struct regulator_dev *rdev)
+{
+       int     state = twl4030reg_grp(rdev);
+
+       if (state < 0)
+               return state;
+       state &= 0x0f;
+
+       /* assume state != WARM_RESET; we'd not be running...  */
+       if (!state)
+               return REGULATOR_STATUS_OFF;
+       return (state & BIT(3))
+               ? REGULATOR_STATUS_NORMAL
+               : REGULATOR_STATUS_STANDBY;
+}
+
+static int twl4030reg_set_mode(struct regulator_dev *rdev, unsigned mode)
+{
+       struct twlreg_info      *info = rdev_get_drvdata(rdev);
+       unsigned                message;
+       int                     status;
+
+       /* We can only set the mode through state machine commands... */
+       switch (mode) {
+       case REGULATOR_MODE_NORMAL:
+               message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_ACTIVE);
+               break;
+       case REGULATOR_MODE_STANDBY:
+               message = MSG_SINGULAR(DEV_GRP_P1, info->id, RES_STATE_SLEEP);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       /* Ensure the resource is associated with some group */
+       status = twl4030reg_grp(rdev);
+       if (status < 0)
+               return status;
+       if (!(status & (P3_GRP | P2_GRP | P1_GRP)))
+               return -EACCES;
+
+       status = twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER,
+                       message >> 8, 0x15 /* PB_WORD_MSB */ );
+       if (status < 0)
+               return status;
+
+       return twl4030_i2c_write_u8(TWL4030_MODULE_PM_MASTER,
+                       message, 0x16 /* PB_WORD_LSB */ );
+}
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Support for adjustable-voltage LDOs uses a four-bit (or smaller) voltage
+ * select field in the LDO's control register.  We use tables indexed by VSEL
+ * to record voltages in millivolts.  (Accuracy is about three percent.)
+ *
+ * Note that VSEL values for VAUX2 changed in twl5030 and newer silicon;
+ * currently handled by listing two slightly different VAUX2 regulators,
+ * only one of which will be configured.
+ *
+ * VSEL values documented as "TI cannot support these values" are flagged
+ * in these tables as UNSUP() values; we normally won't assign them.
+ *
+ * VAUX3 at 3V is incorrectly listed in some TI manuals as unsupported.
+ * TI are revising the twl5030/tps659x0 specs to support that 3.0V setting.
+ */
+#ifdef CONFIG_TWL4030_ALLOW_UNSUPPORTED
+#define UNSUP_MASK     0x0000
+#else
+#define UNSUP_MASK     0x8000
+#endif
+
+#define UNSUP(x)       (UNSUP_MASK | (x))
+#define IS_UNSUP(x)    (UNSUP_MASK & (x))
+#define LDO_MV(x)      (~UNSUP_MASK & (x))
+
+
+static const u16 VAUX1_VSEL_table[] = {
+       UNSUP(1500), UNSUP(1800), 2500, 2800,
+       3000, 3000, 3000, 3000,
+};
+static const u16 VAUX2_4030_VSEL_table[] = {
+       UNSUP(1000), UNSUP(1000), UNSUP(1200), 1300,
+       1500, 1800, UNSUP(1850), 2500,
+       UNSUP(2600), 2800, UNSUP(2850), UNSUP(3000),
+       UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150),
+};
+static const u16 VAUX2_VSEL_table[] = {
+       1700, 1700, 1900, 1300,
+       1500, 1800, 2000, 2500,
+       2100, 2800, 2200, 2300,
+       2400, 2400, 2400, 2400,
+};
+static const u16 VAUX3_VSEL_table[] = {
+       1500, 1800, 2500, 2800,
+       3000, 3000, 3000, 3000,
+};
+static const u16 VAUX4_VSEL_table[] = {
+       700, 1000, 1200, UNSUP(1300),
+       1500, 1800, UNSUP(1850), 2500,
+       UNSUP(2600), 2800, UNSUP(2850), UNSUP(3000),
+       UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150),
+};
+static const u16 VMMC1_VSEL_table[] = {
+       1850, 2850, 3000, 3150,
+};
+static const u16 VMMC2_VSEL_table[] = {
+       UNSUP(1000), UNSUP(1000), UNSUP(1200), UNSUP(1300),
+       UNSUP(1500), UNSUP(1800), 1850, UNSUP(2500),
+       2600, 2800, 2850, 3000,
+       3150, 3150, 3150, 3150,
+};
+static const u16 VPLL1_VSEL_table[] = {
+       1000, 1200, 1300, 1800,
+       UNSUP(2800), UNSUP(3000), UNSUP(3000), UNSUP(3000),
+};
+static const u16 VPLL2_VSEL_table[] = {
+       700, 1000, 1200, 1300,
+       UNSUP(1500), 1800, UNSUP(1850), UNSUP(2500),
+       UNSUP(2600), UNSUP(2800), UNSUP(2850), UNSUP(3000),
+       UNSUP(3150), UNSUP(3150), UNSUP(3150), UNSUP(3150),
+};
+static const u16 VSIM_VSEL_table[] = {
+       UNSUP(1000), UNSUP(1200), UNSUP(1300), 1800,
+       2800, 3000, 3000, 3000,
+};
+static const u16 VDAC_VSEL_table[] = {
+       1200, 1300, 1800, 1800,
+};
+
+
+static int twl4030ldo_list_voltage(struct regulator_dev *rdev, unsigned index)
+{
+       struct twlreg_info      *info = rdev_get_drvdata(rdev);
+       int                     mV = info->table[index];
+
+       return IS_UNSUP(mV) ? 0 : (LDO_MV(mV) * 1000);
+}
+
+static int
+twl4030ldo_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV)
+{
+       struct twlreg_info      *info = rdev_get_drvdata(rdev);
+       int                     vsel;
+
+       for (vsel = 0; vsel < info->table_len; vsel++) {
+               int mV = info->table[vsel];
+               int uV;
+
+               if (IS_UNSUP(mV))
+                       continue;
+               uV = LDO_MV(mV) * 1000;
+
+               /* REVISIT for VAUX2, first match may not be best/lowest */
+
+               /* use the first in-range value */
+               if (min_uV <= uV && uV <= max_uV)
+                       return twl4030reg_write(info, VREG_DEDICATED, vsel);
+       }
+
+       return -EDOM;
+}
+
+static int twl4030ldo_get_voltage(struct regulator_dev *rdev)
+{
+       struct twlreg_info      *info = rdev_get_drvdata(rdev);
+       int                     vsel = twl4030reg_read(info, VREG_DEDICATED);
+
+       if (vsel < 0)
+               return vsel;
+
+       vsel &= info->table_len - 1;
+       return LDO_MV(info->table[vsel]) * 1000;
+}
+
+static struct regulator_ops twl4030ldo_ops = {
+       .list_voltage   = twl4030ldo_list_voltage,
+
+       .set_voltage    = twl4030ldo_set_voltage,
+       .get_voltage    = twl4030ldo_get_voltage,
+
+       .enable         = twl4030reg_enable,
+       .disable        = twl4030reg_disable,
+       .is_enabled     = twl4030reg_is_enabled,
+
+       .set_mode       = twl4030reg_set_mode,
+
+       .get_status     = twl4030reg_get_status,
+};
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * Fixed voltage LDOs don't have a VSEL field to update.
+ */
+static int twl4030fixed_list_voltage(struct regulator_dev *rdev, unsigned index)
+{
+       struct twlreg_info      *info = rdev_get_drvdata(rdev);
+
+       return info->min_mV * 1000;
+}
+
+static int twl4030fixed_get_voltage(struct regulator_dev *rdev)
+{
+       struct twlreg_info      *info = rdev_get_drvdata(rdev);
+
+       return info->min_mV * 1000;
+}
+
+static struct regulator_ops twl4030fixed_ops = {
+       .list_voltage   = twl4030fixed_list_voltage,
+
+       .get_voltage    = twl4030fixed_get_voltage,
+
+       .enable         = twl4030reg_enable,
+       .disable        = twl4030reg_disable,
+       .is_enabled     = twl4030reg_is_enabled,
+
+       .set_mode       = twl4030reg_set_mode,
+
+       .get_status     = twl4030reg_get_status,
+};
+
+/*----------------------------------------------------------------------*/
+
+#define TWL_ADJUSTABLE_LDO(label, offset, num) { \
+       .base = offset, \
+       .id = num, \
+       .table_len = ARRAY_SIZE(label##_VSEL_table), \
+       .table = label##_VSEL_table, \
+       .desc = { \
+               .name = #label, \
+               .id = TWL4030_REG_##label, \
+               .n_voltages = ARRAY_SIZE(label##_VSEL_table), \
+               .ops = &twl4030ldo_ops, \
+               .type = REGULATOR_VOLTAGE, \
+               .owner = THIS_MODULE, \
+               }, \
+       }
+
+#define TWL_FIXED_LDO(label, offset, mVolts, num) { \
+       .base = offset, \
+       .id = num, \
+       .min_mV = mVolts, \
+       .desc = { \
+               .name = #label, \
+               .id = TWL4030_REG_##label, \
+               .n_voltages = 1, \
+               .ops = &twl4030fixed_ops, \
+               .type = REGULATOR_VOLTAGE, \
+               .owner = THIS_MODULE, \
+               }, \
+       }
+
+/*
+ * We list regulators here if systems need some level of
+ * software control over them after boot.
+ */
+static struct twlreg_info twl4030_regs[] = {
+       TWL_ADJUSTABLE_LDO(VAUX1, 0x17, 1),
+       TWL_ADJUSTABLE_LDO(VAUX2_4030, 0x1b, 2),
+       TWL_ADJUSTABLE_LDO(VAUX2, 0x1b, 2),
+       TWL_ADJUSTABLE_LDO(VAUX3, 0x1f, 3),
+       TWL_ADJUSTABLE_LDO(VAUX4, 0x23, 4),
+       TWL_ADJUSTABLE_LDO(VMMC1, 0x27, 5),
+       TWL_ADJUSTABLE_LDO(VMMC2, 0x2b, 6),
+       /*
+       TWL_ADJUSTABLE_LDO(VPLL1, 0x2f, 7),
+       */
+       TWL_ADJUSTABLE_LDO(VPLL2, 0x33, 8),
+       TWL_ADJUSTABLE_LDO(VSIM, 0x37, 9),
+       TWL_ADJUSTABLE_LDO(VDAC, 0x3b, 10),
+       /*
+       TWL_ADJUSTABLE_LDO(VINTANA1, 0x3f, 11),
+       TWL_ADJUSTABLE_LDO(VINTANA2, 0x43, 12),
+       TWL_ADJUSTABLE_LDO(VINTDIG, 0x47, 13),
+       TWL_SMPS(VIO, 0x4b, 14),
+       TWL_SMPS(VDD1, 0x55, 15),
+       TWL_SMPS(VDD2, 0x63, 16),
+        */
+       TWL_FIXED_LDO(VUSB1V5, 0x71, 1500, 17),
+       TWL_FIXED_LDO(VUSB1V8, 0x74, 1800, 18),
+       TWL_FIXED_LDO(VUSB3V1, 0x77, 3100, 19),
+       /* VUSBCP is managed *only* by the USB subchip */
+};
+
+static int twl4030reg_probe(struct platform_device *pdev)
+{
+       int                             i;
+       struct twlreg_info              *info;
+       struct regulator_init_data      *initdata;
+       struct regulation_constraints   *c;
+       struct regulator_dev            *rdev;
+
+       for (i = 0, info = NULL; i < ARRAY_SIZE(twl4030_regs); i++) {
+               if (twl4030_regs[i].desc.id != pdev->id)
+                       continue;
+               info = twl4030_regs + i;
+               break;
+       }
+       if (!info)
+               return -ENODEV;
+
+       initdata = pdev->dev.platform_data;
+       if (!initdata)
+               return -EINVAL;
+
+       /* Constrain board-specific capabilities according to what
+        * this driver and the chip itself can actually do.
+        */
+       c = &initdata->constraints;
+       c->valid_modes_mask &= REGULATOR_MODE_NORMAL | REGULATOR_MODE_STANDBY;
+       c->valid_ops_mask &= REGULATOR_CHANGE_VOLTAGE
+                               | REGULATOR_CHANGE_MODE
+                               | REGULATOR_CHANGE_STATUS;
+
+       rdev = regulator_register(&info->desc, &pdev->dev, initdata, info);
+       if (IS_ERR(rdev)) {
+               dev_err(&pdev->dev, "can't register %s, %ld\n",
+                               info->desc.name, PTR_ERR(rdev));
+               return PTR_ERR(rdev);
+       }
+       platform_set_drvdata(pdev, rdev);
+
+       /* NOTE:  many regulators support short-circuit IRQs (presentable
+        * as REGULATOR_OVER_CURRENT notifications?) configured via:
+        *  - SC_CONFIG
+        *  - SC_DETECT1 (vintana2, vmmc1/2, vaux1/2/3/4)
+        *  - SC_DETECT2 (vusb, vdac, vio, vdd1/2, vpll2)
+        *  - IT_CONFIG
+        */
+
+       return 0;
+}
+
+static int __devexit twl4030reg_remove(struct platform_device *pdev)
+{
+       regulator_unregister(platform_get_drvdata(pdev));
+       return 0;
+}
+
+MODULE_ALIAS("platform:twl4030_reg");
+
+static struct platform_driver twl4030reg_driver = {
+       .probe          = twl4030reg_probe,
+       .remove         = __devexit_p(twl4030reg_remove),
+       /* NOTE: short name, to work around driver model truncation of
+        * "twl4030_regulator.12" (and friends) to "twl4030_regulator.1".
+        */
+       .driver.name    = "twl4030_reg",
+       .driver.owner   = THIS_MODULE,
+};
+
+static int __init twl4030reg_init(void)
+{
+       return platform_driver_register(&twl4030reg_driver);
+}
+subsys_initcall(twl4030reg_init);
+
+static void __exit twl4030reg_exit(void)
+{
+       platform_driver_unregister(&twl4030reg_driver);
+}
+module_exit(twl4030reg_exit)
+
+MODULE_DESCRIPTION("TWL4030 regulator driver");
+MODULE_LICENSE("GPL");
index 5ddb464b1c3f1a661efb669e36f898d063b194ad..3d08348584e1798e7d0c03a73c363a6c8d40a48a 100644 (file)
@@ -226,13 +226,17 @@ static ssize_t set_mode(struct device *dev, struct device_attribute *attr,
        unsigned int mode;
        int ret;
 
-       if (strncmp(buf, "fast", strlen("fast")) == 0)
+       /*
+        * sysfs_streq() doesn't need the \n's, but we add them so the strings
+        * will be shared with show_mode(), above.
+        */
+       if (sysfs_streq(buf, "fast\n") == 0)
                mode = REGULATOR_MODE_FAST;
-       else if (strncmp(buf, "normal", strlen("normal")) == 0)
+       else if (sysfs_streq(buf, "normal\n") == 0)
                mode = REGULATOR_MODE_NORMAL;
-       else if (strncmp(buf, "idle", strlen("idle")) == 0)
+       else if (sysfs_streq(buf, "idle\n") == 0)
                mode = REGULATOR_MODE_IDLE;
-       else if (strncmp(buf, "standby", strlen("standby")) == 0)
+       else if (sysfs_streq(buf, "standby\n") == 0)
                mode = REGULATOR_MODE_STANDBY;
        else {
                dev_err(dev, "Configuring invalid mode\n");
@@ -256,7 +260,7 @@ static DEVICE_ATTR(min_microamps, 0666, show_min_uA, set_min_uA);
 static DEVICE_ATTR(max_microamps, 0666, show_max_uA, set_max_uA);
 static DEVICE_ATTR(mode, 0666, show_mode, set_mode);
 
-struct device_attribute *attributes[] = {
+static struct device_attribute *attributes[] = {
        &dev_attr_min_microvolts,
        &dev_attr_max_microvolts,
        &dev_attr_min_microamps,
index 5056e23e441471f658276c802300d5a8aba3355f..771eca1066b5916378c0620387b3ecf8dae3342a 100644 (file)
@@ -24,6 +24,9 @@
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
 
+/* Maximum value possible for VSEL */
+#define WM8350_DCDC_MAX_VSEL 0x66
+
 /* Microamps */
 static const int isink_cur[] = {
        4,
@@ -385,6 +388,14 @@ static int wm8350_dcdc_get_voltage(struct regulator_dev *rdev)
        return wm8350_dcdc_val_to_mvolts(val) * 1000;
 }
 
+static int wm8350_dcdc_list_voltage(struct regulator_dev *rdev,
+                                   unsigned selector)
+{
+       if (selector > WM8350_DCDC_MAX_VSEL)
+               return -EINVAL;
+       return wm8350_dcdc_val_to_mvolts(selector) * 1000;
+}
+
 static int wm8350_dcdc_set_suspend_voltage(struct regulator_dev *rdev, int uV)
 {
        struct wm8350 *wm8350 = rdev_get_drvdata(rdev);
@@ -775,6 +786,14 @@ static int wm8350_ldo_get_voltage(struct regulator_dev *rdev)
        return wm8350_ldo_val_to_mvolts(val) * 1000;
 }
 
+static int wm8350_ldo_list_voltage(struct regulator_dev *rdev,
+                                   unsigned selector)
+{
+       if (selector > WM8350_LDO1_VSEL_MASK)
+               return -EINVAL;
+       return wm8350_ldo_val_to_mvolts(selector) * 1000;
+}
+
 int wm8350_dcdc_set_slot(struct wm8350 *wm8350, int dcdc, u16 start,
                         u16 stop, u16 fault)
 {
@@ -1031,18 +1050,30 @@ static unsigned int wm8350_dcdc_get_mode(struct regulator_dev *rdev)
        int dcdc = rdev_get_id(rdev);
        u16 mask, sleep, active, force;
        int mode = REGULATOR_MODE_NORMAL;
+       int reg;
 
-       if (dcdc < WM8350_DCDC_1 || dcdc > WM8350_DCDC_6)
-               return -EINVAL;
-
-       if (dcdc == WM8350_DCDC_2 || dcdc == WM8350_DCDC_5)
+       switch (dcdc) {
+       case WM8350_DCDC_1:
+               reg = WM8350_DCDC1_FORCE_PWM;
+               break;
+       case WM8350_DCDC_3:
+               reg = WM8350_DCDC3_FORCE_PWM;
+               break;
+       case WM8350_DCDC_4:
+               reg = WM8350_DCDC4_FORCE_PWM;
+               break;
+       case WM8350_DCDC_6:
+               reg = WM8350_DCDC6_FORCE_PWM;
+               break;
+       default:
                return -EINVAL;
+       }
 
        mask = 1 << (dcdc - WM8350_DCDC_1);
        active = wm8350_reg_read(wm8350, WM8350_DCDC_ACTIVE_OPTIONS) & mask;
+       force = wm8350_reg_read(wm8350, reg) & WM8350_DCDC1_FORCE_PWM_ENA;
        sleep = wm8350_reg_read(wm8350, WM8350_DCDC_SLEEP_OPTIONS) & mask;
-       force = wm8350_reg_read(wm8350, WM8350_DCDC1_FORCE_PWM)
-           & WM8350_DCDC1_FORCE_PWM_ENA;
+
        dev_dbg(wm8350->dev, "mask %x active %x sleep %x force %x",
                mask, active, sleep, force);
 
@@ -1150,6 +1181,7 @@ static int wm8350_ldo_is_enabled(struct regulator_dev *rdev)
 static struct regulator_ops wm8350_dcdc_ops = {
        .set_voltage = wm8350_dcdc_set_voltage,
        .get_voltage = wm8350_dcdc_get_voltage,
+       .list_voltage = wm8350_dcdc_list_voltage,
        .enable = wm8350_dcdc_enable,
        .disable = wm8350_dcdc_disable,
        .get_mode = wm8350_dcdc_get_mode,
@@ -1173,6 +1205,7 @@ static struct regulator_ops wm8350_dcdc2_5_ops = {
 static struct regulator_ops wm8350_ldo_ops = {
        .set_voltage = wm8350_ldo_set_voltage,
        .get_voltage = wm8350_ldo_get_voltage,
+       .list_voltage = wm8350_ldo_list_voltage,
        .enable = wm8350_ldo_enable,
        .disable = wm8350_ldo_disable,
        .is_enabled = wm8350_ldo_is_enabled,
@@ -1197,6 +1230,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
                .ops = &wm8350_dcdc_ops,
                .irq = WM8350_IRQ_UV_DC1,
                .type = REGULATOR_VOLTAGE,
+               .n_voltages = WM8350_DCDC_MAX_VSEL + 1,
                .owner = THIS_MODULE,
        },
        {
@@ -1213,6 +1247,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
                .ops = &wm8350_dcdc_ops,
                .irq = WM8350_IRQ_UV_DC3,
                .type = REGULATOR_VOLTAGE,
+               .n_voltages = WM8350_DCDC_MAX_VSEL + 1,
                .owner = THIS_MODULE,
        },
        {
@@ -1221,6 +1256,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
                .ops = &wm8350_dcdc_ops,
                .irq = WM8350_IRQ_UV_DC4,
                .type = REGULATOR_VOLTAGE,
+               .n_voltages = WM8350_DCDC_MAX_VSEL + 1,
                .owner = THIS_MODULE,
        },
        {
@@ -1237,6 +1273,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
                .ops = &wm8350_dcdc_ops,
                .irq = WM8350_IRQ_UV_DC6,
                .type = REGULATOR_VOLTAGE,
+               .n_voltages = WM8350_DCDC_MAX_VSEL + 1,
                .owner = THIS_MODULE,
        },
        {
@@ -1245,6 +1282,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
                .ops = &wm8350_ldo_ops,
                .irq = WM8350_IRQ_UV_LDO1,
                .type = REGULATOR_VOLTAGE,
+               .n_voltages = WM8350_LDO1_VSEL_MASK + 1,
                .owner = THIS_MODULE,
        },
        {
@@ -1253,6 +1291,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
                .ops = &wm8350_ldo_ops,
                .irq = WM8350_IRQ_UV_LDO2,
                .type = REGULATOR_VOLTAGE,
+               .n_voltages = WM8350_LDO2_VSEL_MASK + 1,
                .owner = THIS_MODULE,
        },
        {
@@ -1261,6 +1300,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
                .ops = &wm8350_ldo_ops,
                .irq = WM8350_IRQ_UV_LDO3,
                .type = REGULATOR_VOLTAGE,
+               .n_voltages = WM8350_LDO3_VSEL_MASK + 1,
                .owner = THIS_MODULE,
        },
        {
@@ -1269,6 +1309,7 @@ static struct regulator_desc wm8350_reg[NUM_WM8350_REGULATORS] = {
                .ops = &wm8350_ldo_ops,
                .irq = WM8350_IRQ_UV_LDO4,
                .type = REGULATOR_VOLTAGE,
+               .n_voltages = WM8350_LDO4_VSEL_MASK + 1,
                .owner = THIS_MODULE,
        },
        {
@@ -1293,6 +1334,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data)
 {
        struct regulator_dev *rdev = (struct regulator_dev *)data;
 
+       mutex_lock(&rdev->mutex);
        if (irq == WM8350_IRQ_CS1 || irq == WM8350_IRQ_CS2)
                regulator_notifier_call_chain(rdev,
                                              REGULATOR_EVENT_REGULATION_OUT,
@@ -1301,6 +1343,7 @@ static void pmic_uv_handler(struct wm8350 *wm8350, int irq, void *data)
                regulator_notifier_call_chain(rdev,
                                              REGULATOR_EVENT_UNDER_VOLTAGE,
                                              wm8350);
+       mutex_unlock(&rdev->mutex);
 }
 
 static int wm8350_regulator_probe(struct platform_device *pdev)
@@ -1333,9 +1376,9 @@ static int wm8350_regulator_probe(struct platform_device *pdev)
                break;
        }
 
-
        /* register regulator */
        rdev = regulator_register(&wm8350_reg[pdev->id], &pdev->dev,
+                                 pdev->dev.platform_data,
                                  dev_get_drvdata(&pdev->dev));
        if (IS_ERR(rdev)) {
                dev_err(&pdev->dev, "failed to register %s\n",
index 56e23d44ba591383696ec9382a1a3c4b05ae32a9..157426029071fd72e5506c34dbeaecaf9ca9b619 100644 (file)
@@ -43,6 +43,18 @@ static int wm8400_ldo_disable(struct regulator_dev *dev)
                               WM8400_LDO1_ENA, 0);
 }
 
+static int wm8400_ldo_list_voltage(struct regulator_dev *dev,
+                                  unsigned selector)
+{
+       if (selector > WM8400_LDO1_VSEL_MASK)
+               return -EINVAL;
+
+       if (selector < 15)
+               return 900000 + (selector * 50000);
+       else
+               return 1600000 + ((selector - 14) * 100000);
+}
+
 static int wm8400_ldo_get_voltage(struct regulator_dev *dev)
 {
        struct wm8400 *wm8400 = rdev_get_drvdata(dev);
@@ -51,10 +63,7 @@ static int wm8400_ldo_get_voltage(struct regulator_dev *dev)
        val = wm8400_reg_read(wm8400, WM8400_LDO1_CONTROL + rdev_get_id(dev));
        val &= WM8400_LDO1_VSEL_MASK;
 
-       if (val < 15)
-               return 900000 + (val * 50000);
-       else
-               return 1600000 + ((val - 14) * 100000);
+       return wm8400_ldo_list_voltage(dev, val);
 }
 
 static int wm8400_ldo_set_voltage(struct regulator_dev *dev,
@@ -92,6 +101,7 @@ static struct regulator_ops wm8400_ldo_ops = {
        .is_enabled = wm8400_ldo_is_enabled,
        .enable = wm8400_ldo_enable,
        .disable = wm8400_ldo_disable,
+       .list_voltage = wm8400_ldo_list_voltage,
        .get_voltage = wm8400_ldo_get_voltage,
        .set_voltage = wm8400_ldo_set_voltage,
 };
@@ -124,6 +134,15 @@ static int wm8400_dcdc_disable(struct regulator_dev *dev)
                               WM8400_DC1_ENA, 0);
 }
 
+static int wm8400_dcdc_list_voltage(struct regulator_dev *dev,
+                                   unsigned selector)
+{
+       if (selector > WM8400_DC1_VSEL_MASK)
+               return -EINVAL;
+
+       return 850000 + (selector * 25000);
+}
+
 static int wm8400_dcdc_get_voltage(struct regulator_dev *dev)
 {
        struct wm8400 *wm8400 = rdev_get_drvdata(dev);
@@ -237,6 +256,7 @@ static struct regulator_ops wm8400_dcdc_ops = {
        .is_enabled = wm8400_dcdc_is_enabled,
        .enable = wm8400_dcdc_enable,
        .disable = wm8400_dcdc_disable,
+       .list_voltage = wm8400_dcdc_list_voltage,
        .get_voltage = wm8400_dcdc_get_voltage,
        .set_voltage = wm8400_dcdc_set_voltage,
        .get_mode = wm8400_dcdc_get_mode,
@@ -249,6 +269,7 @@ static struct regulator_desc regulators[] = {
                .name = "LDO1",
                .id = WM8400_LDO1,
                .ops = &wm8400_ldo_ops,
+               .n_voltages = WM8400_LDO1_VSEL_MASK + 1,
                .type = REGULATOR_VOLTAGE,
                .owner = THIS_MODULE,
        },
@@ -256,6 +277,7 @@ static struct regulator_desc regulators[] = {
                .name = "LDO2",
                .id = WM8400_LDO2,
                .ops = &wm8400_ldo_ops,
+               .n_voltages = WM8400_LDO2_VSEL_MASK + 1,
                .type = REGULATOR_VOLTAGE,
                .owner = THIS_MODULE,
        },
@@ -263,6 +285,7 @@ static struct regulator_desc regulators[] = {
                .name = "LDO3",
                .id = WM8400_LDO3,
                .ops = &wm8400_ldo_ops,
+               .n_voltages = WM8400_LDO3_VSEL_MASK + 1,
                .type = REGULATOR_VOLTAGE,
                .owner = THIS_MODULE,
        },
@@ -270,6 +293,7 @@ static struct regulator_desc regulators[] = {
                .name = "LDO4",
                .id = WM8400_LDO4,
                .ops = &wm8400_ldo_ops,
+               .n_voltages = WM8400_LDO4_VSEL_MASK + 1,
                .type = REGULATOR_VOLTAGE,
                .owner = THIS_MODULE,
        },
@@ -277,6 +301,7 @@ static struct regulator_desc regulators[] = {
                .name = "DCDC1",
                .id = WM8400_DCDC1,
                .ops = &wm8400_dcdc_ops,
+               .n_voltages = WM8400_DC1_VSEL_MASK + 1,
                .type = REGULATOR_VOLTAGE,
                .owner = THIS_MODULE,
        },
@@ -284,6 +309,7 @@ static struct regulator_desc regulators[] = {
                .name = "DCDC2",
                .id = WM8400_DCDC2,
                .ops = &wm8400_dcdc_ops,
+               .n_voltages = WM8400_DC2_VSEL_MASK + 1,
                .type = REGULATOR_VOLTAGE,
                .owner = THIS_MODULE,
        },
@@ -294,7 +320,7 @@ static int __devinit wm8400_regulator_probe(struct platform_device *pdev)
        struct regulator_dev *rdev;
 
        rdev = regulator_register(&regulators[pdev->id], &pdev->dev,
-               pdev->dev.driver_data);
+               pdev->dev.platform_data, pdev->dev.driver_data);
 
        if (IS_ERR(rdev))
                return PTR_ERR(rdev);
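A recurring change in these hunks (twl4030, wm8350, wm8400) is that regulator_register() now takes the board's struct regulator_init_data directly, typically supplied as platform_data on the regulator's platform device. A hedged sketch of what a board file might provide for the TWL4030 VMMC1 LDO handled earlier (names and values are illustrative; in practice the TWL4030 MFD core usually instantiates these child devices):

    #include <linux/init.h>
    #include <linux/platform_device.h>
    #include <linux/regulator/machine.h>
    #include <linux/i2c/twl4030.h>

    /* hypothetical board constraints for the VMMC1 LDO */
    static struct regulator_init_data board_vmmc1_data = {
    	.constraints = {
    		.min_uV			= 1850000,
    		.max_uV			= 3150000,
    		.valid_modes_mask	= REGULATOR_MODE_NORMAL
    					| REGULATOR_MODE_STANDBY,
    		.valid_ops_mask		= REGULATOR_CHANGE_VOLTAGE
    					| REGULATOR_CHANGE_MODE
    					| REGULATOR_CHANGE_STATUS,
    	},
    };

    static struct platform_device board_vmmc1_dev = {
    	.name	= "twl4030_reg",
    	.id	= TWL4030_REG_VMMC1,	/* matched against desc.id in probe */
    	.dev	= {
    		.platform_data = &board_vmmc1_data,
    	},
    };

    static int __init board_regulators_init(void)
    {
    	return platform_device_register(&board_vmmc1_dev);
    }
    device_initcall(board_regulators_init);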
index 56002f7d26bdf349612b0500e31d12e5cac24b49..ffe34a12f446d22ad8ad9a0963e15a2cef54f4a4 100644 (file)
@@ -688,22 +688,16 @@ config RTC_DRV_RS5C313
        help
          If you say yes here you get support for the Ricoh RS5C313 RTC chips.
 
-config RTC_DRV_PARISC
-       tristate "PA-RISC firmware RTC support"
-       depends on PARISC
-       help
-         Say Y or M here to enable RTC support on PA-RISC systems using
-         firmware calls. If you do not know what you are doing, you should
+config RTC_DRV_GENERIC
+       tristate "Generic RTC support"
+       # Please consider writing a new RTC driver instead of using the generic
+       # RTC abstraction
+       depends on PARISC || M68K || PPC
+       help
+         Say Y or M here to enable RTC support on systems using the generic
+         RTC abstraction. If you do not know what you are doing, you should
          just say Y.
 
-config RTC_DRV_PPC
-       tristate "PowerPC machine dependent RTC support"
-       depends on PPC
-       help
-        The PowerPC kernel has machine-specific functions for accessing
-        the RTC. This exposes that functionality through the generic RTC
-        class.
-
 config RTC_DRV_PXA
        tristate "PXA27x/PXA3xx"
        depends on ARCH_PXA
@@ -747,4 +741,13 @@ config RTC_DRV_MV
          This driver can also be built as a module. If so, the module
          will be called rtc-mv.
 
+config RTC_DRV_PS3
+       tristate "PS3 RTC"
+       depends on PPC_PS3
+       help
+         If you say yes here you will get support for the RTC on PS3.
+
+         This driver can also be built as a module. If so, the module
+         will be called rtc-ps3.
+
 endif # RTC_CLASS
index e7b09986d26ee84b961068a0051d6ab8c7094916..6c0639a14f09446d09de01f42a8a8a74314a95c5 100644 (file)
@@ -56,8 +56,7 @@ obj-$(CONFIG_RTC_DRV_PCF8563) += rtc-pcf8563.o
 obj-$(CONFIG_RTC_DRV_PCF8583)  += rtc-pcf8583.o
 obj-$(CONFIG_RTC_DRV_PL030)    += rtc-pl030.o
 obj-$(CONFIG_RTC_DRV_PL031)    += rtc-pl031.o
-obj-$(CONFIG_RTC_DRV_PARISC)   += rtc-parisc.o
-obj-$(CONFIG_RTC_DRV_PPC)      += rtc-ppc.o
+obj-$(CONFIG_RTC_DRV_GENERIC)  += rtc-generic.o
 obj-$(CONFIG_RTC_DRV_PXA)      += rtc-pxa.o
 obj-$(CONFIG_RTC_DRV_R9701)    += rtc-r9701.o
 obj-$(CONFIG_RTC_DRV_RS5C313)  += rtc-rs5c313.o
@@ -77,3 +76,4 @@ obj-$(CONFIG_RTC_DRV_VR41XX)  += rtc-vr41xx.o
 obj-$(CONFIG_RTC_DRV_WM8350)   += rtc-wm8350.o
 obj-$(CONFIG_RTC_DRV_X1205)    += rtc-x1205.o
 obj-$(CONFIG_RTC_DRV_PCF50633) += rtc-pcf50633.o
+obj-$(CONFIG_RTC_DRV_PS3)      += rtc-ps3.o
diff --git a/drivers/rtc/rtc-generic.c b/drivers/rtc/rtc-generic.c
new file mode 100644 (file)
index 0000000..9832200
--- /dev/null
@@ -0,0 +1,84 @@
+/* rtc-generic: RTC driver using the generic RTC abstraction
+ *
+ * Copyright (C) 2008 Kyle McMartin <kyle@mcmartin.ca>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+#include <asm/rtc.h>
+
+static int generic_get_time(struct device *dev, struct rtc_time *tm)
+{
+       unsigned int ret = get_rtc_time(tm);
+
+       if (ret & RTC_BATT_BAD)
+               return -EOPNOTSUPP;
+
+       return rtc_valid_tm(tm);
+}
+
+static int generic_set_time(struct device *dev, struct rtc_time *tm)
+{
+       if (set_rtc_time(tm) < 0)
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+static const struct rtc_class_ops generic_rtc_ops = {
+       .read_time = generic_get_time,
+       .set_time = generic_set_time,
+};
+
+static int __init generic_rtc_probe(struct platform_device *dev)
+{
+       struct rtc_device *rtc;
+
+       rtc = rtc_device_register("rtc-generic", &dev->dev, &generic_rtc_ops,
+                                 THIS_MODULE);
+       if (IS_ERR(rtc))
+               return PTR_ERR(rtc);
+
+       platform_set_drvdata(dev, rtc);
+
+       return 0;
+}
+
+static int __exit generic_rtc_remove(struct platform_device *dev)
+{
+       struct rtc_device *rtc = platform_get_drvdata(dev);
+
+       rtc_device_unregister(rtc);
+
+       return 0;
+}
+
+static struct platform_driver generic_rtc_driver = {
+       .driver = {
+               .name = "rtc-generic",
+               .owner = THIS_MODULE,
+       },
+       .remove = __exit_p(generic_rtc_remove),
+};
+
+static int __init generic_rtc_init(void)
+{
+       return platform_driver_probe(&generic_rtc_driver, generic_rtc_probe);
+}
+
+static void __exit generic_rtc_fini(void)
+{
+       platform_driver_unregister(&generic_rtc_driver);
+}
+
+module_init(generic_rtc_init);
+module_exit(generic_rtc_fini);
+
+MODULE_AUTHOR("Kyle McMartin <kyle@mcmartin.ca>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Generic RTC driver");
+MODULE_ALIAS("platform:rtc-generic");
diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c
deleted file mode 100644 (file)
index b966f56..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-/* rtc-parisc: RTC for HP PA-RISC firmware
- *
- * Copyright (C) 2008 Kyle McMartin <kyle@mcmartin.ca>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/platform_device.h>
-#include <linux/rtc.h>
-
-#include <asm/rtc.h>
-
-static int parisc_get_time(struct device *dev, struct rtc_time *tm)
-{
-       unsigned long ret;
-
-       ret = get_rtc_time(tm);
-
-       if (ret & RTC_BATT_BAD)
-               return -EOPNOTSUPP;
-
-       return rtc_valid_tm(tm);
-}
-
-static int parisc_set_time(struct device *dev, struct rtc_time *tm)
-{
-       if (set_rtc_time(tm) < 0)
-               return -EOPNOTSUPP;
-
-       return 0;
-}
-
-static const struct rtc_class_ops parisc_rtc_ops = {
-       .read_time = parisc_get_time,
-       .set_time = parisc_set_time,
-};
-
-static int __init parisc_rtc_probe(struct platform_device *dev)
-{
-       struct rtc_device *rtc;
-
-       rtc = rtc_device_register("rtc-parisc", &dev->dev, &parisc_rtc_ops,
-                                 THIS_MODULE);
-       if (IS_ERR(rtc))
-               return PTR_ERR(rtc);
-
-       platform_set_drvdata(dev, rtc);
-
-       return 0;
-}
-
-static int __exit parisc_rtc_remove(struct platform_device *dev)
-{
-       struct rtc_device *rtc = platform_get_drvdata(dev);
-
-       rtc_device_unregister(rtc);
-
-       return 0;
-}
-
-static struct platform_driver parisc_rtc_driver = {
-       .driver = {
-               .name = "rtc-parisc",
-               .owner = THIS_MODULE,
-       },
-       .probe = parisc_rtc_probe,
-       .remove = __devexit_p(parisc_rtc_remove),
-};
-
-static int __init parisc_rtc_init(void)
-{
-       return platform_driver_probe(&parisc_rtc_driver, parisc_rtc_probe);
-}
-
-static void __exit parisc_rtc_fini(void)
-{
-       platform_driver_unregister(&parisc_rtc_driver);
-}
-
-module_init(parisc_rtc_init);
-module_exit(parisc_rtc_fini);
-
-MODULE_AUTHOR("Kyle McMartin <kyle@mcmartin.ca>");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("HP PA-RISC RTC driver");
diff --git a/drivers/rtc/rtc-ppc.c b/drivers/rtc/rtc-ppc.c
deleted file mode 100644 (file)
index c8e97e2..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * RTC driver for ppc_md RTC functions
- *
- * © 2007 Red Hat, Inc.
- *
- * Author: David Woodhouse <dwmw2@infradead.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/rtc.h>
-#include <linux/platform_device.h>
-#include <asm/machdep.h>
-
-static int ppc_rtc_read_time(struct device *dev, struct rtc_time *tm)
-{
-       ppc_md.get_rtc_time(tm);
-       return 0;
-}
-
-static int ppc_rtc_set_time(struct device *dev, struct rtc_time *tm)
-{
-       return ppc_md.set_rtc_time(tm);
-}
-
-static const struct rtc_class_ops ppc_rtc_ops = {
-       .set_time = ppc_rtc_set_time,
-       .read_time = ppc_rtc_read_time,
-};
-
-static struct rtc_device *rtc;
-static struct platform_device *ppc_rtc_pdev;
-
-static int __init ppc_rtc_init(void)
-{
-       if (!ppc_md.get_rtc_time || !ppc_md.set_rtc_time)
-               return -ENODEV;
-
-       ppc_rtc_pdev = platform_device_register_simple("ppc-rtc", 0, NULL, 0);
-       if (IS_ERR(ppc_rtc_pdev))
-               return PTR_ERR(ppc_rtc_pdev);
-
-       rtc = rtc_device_register("ppc_md", &ppc_rtc_pdev->dev,
-                                 &ppc_rtc_ops, THIS_MODULE);
-       if (IS_ERR(rtc)) {
-               platform_device_unregister(ppc_rtc_pdev);
-               return PTR_ERR(rtc);
-       }
-
-       return 0;
-}
-
-static void __exit ppc_rtc_exit(void)
-{
-       rtc_device_unregister(rtc);
-       platform_device_unregister(ppc_rtc_pdev);
-}
-
-module_init(ppc_rtc_init);
-module_exit(ppc_rtc_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>");
-MODULE_DESCRIPTION("Generic RTC class driver for PowerPC");
diff --git a/drivers/rtc/rtc-ps3.c b/drivers/rtc/rtc-ps3.c
new file mode 100644 (file)
index 0000000..968133c
--- /dev/null
@@ -0,0 +1,104 @@
+/*
+ * PS3 RTC Driver
+ *
+ * Copyright 2009 Sony Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+#include <asm/lv1call.h>
+#include <asm/ps3.h>
+
+
+static u64 read_rtc(void)
+{
+       int result;
+       u64 rtc_val;
+       u64 tb_val;
+
+       result = lv1_get_rtc(&rtc_val, &tb_val);
+       BUG_ON(result);
+
+       return rtc_val;
+}
+
+static int ps3_get_time(struct device *dev, struct rtc_time *tm)
+{
+       rtc_time_to_tm(read_rtc() + ps3_os_area_get_rtc_diff(), tm);
+       return rtc_valid_tm(tm);
+}
+
+static int ps3_set_time(struct device *dev, struct rtc_time *tm)
+{
+       unsigned long now;
+
+       rtc_tm_to_time(tm, &now);
+       ps3_os_area_set_rtc_diff(now - read_rtc());
+       return 0;
+}
+
+static const struct rtc_class_ops ps3_rtc_ops = {
+       .read_time = ps3_get_time,
+       .set_time = ps3_set_time,
+};
+
+static int __init ps3_rtc_probe(struct platform_device *dev)
+{
+       struct rtc_device *rtc;
+
+       rtc = rtc_device_register("rtc-ps3", &dev->dev, &ps3_rtc_ops,
+                                 THIS_MODULE);
+       if (IS_ERR(rtc))
+               return PTR_ERR(rtc);
+
+       platform_set_drvdata(dev, rtc);
+       return 0;
+}
+
+static int __exit ps3_rtc_remove(struct platform_device *dev)
+{
+       rtc_device_unregister(platform_get_drvdata(dev));
+       return 0;
+}
+
+static struct platform_driver ps3_rtc_driver = {
+       .driver = {
+               .name = "rtc-ps3",
+               .owner = THIS_MODULE,
+       },
+       .remove = __exit_p(ps3_rtc_remove),
+};
+
+static int __init ps3_rtc_init(void)
+{
+       return platform_driver_probe(&ps3_rtc_driver, ps3_rtc_probe);
+}
+
+static void __exit ps3_rtc_fini(void)
+{
+       platform_driver_unregister(&ps3_rtc_driver);
+}
+
+module_init(ps3_rtc_init);
+module_exit(ps3_rtc_fini);
+
+MODULE_AUTHOR("Sony Corporation");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ps3 RTC driver");
+MODULE_ALIAS("platform:rtc-ps3");
index 56841fe5f4839bd70d3e917bf32317ada965154e..0eefb07bebaf73708b7934588b54811bcc4615be 100644 (file)
@@ -513,7 +513,7 @@ static int __init mcf_console_setup(struct console *co, char *options)
        int parity = 'n';
        int flow = 'n';
 
-       if ((co->index >= 0) && (co->index <= MCF_MAXPORTS))
+       if ((co->index < 0) || (co->index >= MCF_MAXPORTS))
                co->index = 0;
        port = &mcf_ports[co->index].port;
        if (port->membase == 0)
index 882c57b399f7d94944ac93011807e4380bda6df6..fdba2f69d4c9ed2d390c69638f37e328960aab2b 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/scatterlist.h>
 
@@ -328,7 +329,7 @@ struct isd200_config {
 
 struct isd200_info {
        struct inquiry_data InquiryData;
-       struct hd_driveid *id;
+       u16 *id;
        struct isd200_config ConfigData;
        unsigned char *RegsBuf;
        unsigned char ATARegs[8];
@@ -419,19 +420,19 @@ static void isd200_build_sense(struct us_data *us, struct scsi_cmnd *srb)
                buf->Flags = UNIT_ATTENTION;
                buf->AdditionalSenseCode = 0;
                buf->AdditionalSenseCodeQualifier = 0;
-       } else if(error & MCR_ERR) {
+       } else if (error & ATA_MCR) {
                buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID;
                buf->AdditionalSenseLength = 0xb;
                buf->Flags =  UNIT_ATTENTION;
                buf->AdditionalSenseCode = 0;
                buf->AdditionalSenseCodeQualifier = 0;
-       } else if(error & TRK0_ERR) {
+       } else if (error & ATA_TRK0NF) {
                buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID;
                buf->AdditionalSenseLength = 0xb;
                buf->Flags =  NOT_READY;
                buf->AdditionalSenseCode = 0;
                buf->AdditionalSenseCodeQualifier = 0;
-       } else if(error & ECC_ERR) {
+       } else if (error & ATA_UNC) {
                buf->ErrorCode = 0x70 | SENSE_ERRCODE_VALID;
                buf->AdditionalSenseLength = 0xb;
                buf->Flags =  DATA_PROTECT;
@@ -547,16 +548,16 @@ static int isd200_action( struct us_data *us, int action,
                ata.generic.ActionSelect = ACTION_SELECT_1|ACTION_SELECT_5;
                ata.generic.RegisterSelect = REG_DEVICE_HEAD | REG_COMMAND;
                ata.write.DeviceHeadByte = info->DeviceHead;
-               ata.write.CommandByte = WIN_SRST;
+               ata.write.CommandByte = ATA_CMD_DEV_RESET;
                isd200_set_srb(info, DMA_NONE, NULL, 0);
                break;
 
        case ACTION_IDENTIFY:
                US_DEBUGP("   isd200_action(IDENTIFY)\n");
                ata.generic.RegisterSelect = REG_COMMAND;
-               ata.write.CommandByte = WIN_IDENTIFY;
+               ata.write.CommandByte = ATA_CMD_ID_ATA;
                isd200_set_srb(info, DMA_FROM_DEVICE, info->id,
-                                               sizeof(struct hd_driveid));
+                               ATA_ID_WORDS * 2);
                break;
 
        default:
@@ -944,22 +945,22 @@ static int isd200_try_enum(struct us_data *us, unsigned char master_slave,
                        break;
 
                if (!detect) {
-                       if (regs[ATA_REG_STATUS_OFFSET] & BUSY_STAT) {
+                       if (regs[ATA_REG_STATUS_OFFSET] & ATA_BUSY) {
                                US_DEBUGP("   %s status is still BSY, try again...\n",mstr);
                        } else {
                                US_DEBUGP("   %s status !BSY, continue with next operation\n",mstr);
                                break;
                        }
                }
-               /* check for BUSY_STAT and */
-               /* WRERR_STAT (workaround ATA Zip drive) and */ 
-               /* ERR_STAT (workaround for Archos CD-ROM) */
+               /* check for ATA_BUSY and */
+               /* ATA_DF (workaround ATA Zip drive) and */
+               /* ATA_ERR (workaround for Archos CD-ROM) */
                else if (regs[ATA_REG_STATUS_OFFSET] &
-                        (BUSY_STAT | WRERR_STAT | ERR_STAT )) {
+                        (ATA_BUSY | ATA_DF | ATA_ERR)) {
                        US_DEBUGP("   Status indicates it is not ready, try again...\n");
                }
                /* check for DRDY, ATA devices set DRDY after SRST */
-               else if (regs[ATA_REG_STATUS_OFFSET] & READY_STAT) {
+               else if (regs[ATA_REG_STATUS_OFFSET] & ATA_DRDY) {
                        US_DEBUGP("   Identified ATA device\n");
                        info->DeviceFlags |= DF_ATA_DEVICE;
                        info->DeviceHead = master_slave;
@@ -1053,103 +1054,50 @@ static int isd200_manual_enum(struct us_data *us)
        return(retStatus);
 }
 
-static void isd200_fix_driveid (struct hd_driveid *id)
+static void isd200_fix_driveid(u16 *id)
 {
 #ifndef __LITTLE_ENDIAN
 # ifdef __BIG_ENDIAN
        int i;
-       u16 *stringcast;
-
-       id->config         = __le16_to_cpu(id->config);
-       id->cyls           = __le16_to_cpu(id->cyls);
-       id->reserved2      = __le16_to_cpu(id->reserved2);
-       id->heads          = __le16_to_cpu(id->heads);
-       id->track_bytes    = __le16_to_cpu(id->track_bytes);
-       id->sector_bytes   = __le16_to_cpu(id->sector_bytes);
-       id->sectors        = __le16_to_cpu(id->sectors);
-       id->vendor0        = __le16_to_cpu(id->vendor0);
-       id->vendor1        = __le16_to_cpu(id->vendor1);
-       id->vendor2        = __le16_to_cpu(id->vendor2);
-       stringcast = (u16 *)&id->serial_no[0];
-       for (i = 0; i < (20/2); i++)
-               stringcast[i] = __le16_to_cpu(stringcast[i]);
-       id->buf_type       = __le16_to_cpu(id->buf_type);
-       id->buf_size       = __le16_to_cpu(id->buf_size);
-       id->ecc_bytes      = __le16_to_cpu(id->ecc_bytes);
-       stringcast = (u16 *)&id->fw_rev[0];
-       for (i = 0; i < (8/2); i++)
-               stringcast[i] = __le16_to_cpu(stringcast[i]);
-       stringcast = (u16 *)&id->model[0];
-       for (i = 0; i < (40/2); i++)
-               stringcast[i] = __le16_to_cpu(stringcast[i]);
-       id->dword_io       = __le16_to_cpu(id->dword_io);
-       id->reserved50     = __le16_to_cpu(id->reserved50);
-       id->field_valid    = __le16_to_cpu(id->field_valid);
-       id->cur_cyls       = __le16_to_cpu(id->cur_cyls);
-       id->cur_heads      = __le16_to_cpu(id->cur_heads);
-       id->cur_sectors    = __le16_to_cpu(id->cur_sectors);
-       id->cur_capacity0  = __le16_to_cpu(id->cur_capacity0);
-       id->cur_capacity1  = __le16_to_cpu(id->cur_capacity1);
-       id->lba_capacity   = __le32_to_cpu(id->lba_capacity);
-       id->dma_1word      = __le16_to_cpu(id->dma_1word);
-       id->dma_mword      = __le16_to_cpu(id->dma_mword);
-       id->eide_pio_modes = __le16_to_cpu(id->eide_pio_modes);
-       id->eide_dma_min   = __le16_to_cpu(id->eide_dma_min);
-       id->eide_dma_time  = __le16_to_cpu(id->eide_dma_time);
-       id->eide_pio       = __le16_to_cpu(id->eide_pio);
-       id->eide_pio_iordy = __le16_to_cpu(id->eide_pio_iordy);
-       for (i = 0; i < 2; ++i)
-               id->words69_70[i] = __le16_to_cpu(id->words69_70[i]);
-       for (i = 0; i < 4; ++i)
-               id->words71_74[i] = __le16_to_cpu(id->words71_74[i]);
-       id->queue_depth    = __le16_to_cpu(id->queue_depth);
-       for (i = 0; i < 4; ++i)
-               id->words76_79[i] = __le16_to_cpu(id->words76_79[i]);
-       id->major_rev_num  = __le16_to_cpu(id->major_rev_num);
-       id->minor_rev_num  = __le16_to_cpu(id->minor_rev_num);
-       id->command_set_1  = __le16_to_cpu(id->command_set_1);
-       id->command_set_2  = __le16_to_cpu(id->command_set_2);
-       id->cfsse          = __le16_to_cpu(id->cfsse);
-       id->cfs_enable_1   = __le16_to_cpu(id->cfs_enable_1);
-       id->cfs_enable_2   = __le16_to_cpu(id->cfs_enable_2);
-       id->csf_default    = __le16_to_cpu(id->csf_default);
-       id->dma_ultra      = __le16_to_cpu(id->dma_ultra);
-       id->trseuc         = __le16_to_cpu(id->trseuc);
-       id->trsEuc         = __le16_to_cpu(id->trsEuc);
-       id->CurAPMvalues   = __le16_to_cpu(id->CurAPMvalues);
-       id->mprc           = __le16_to_cpu(id->mprc);
-       id->hw_config      = __le16_to_cpu(id->hw_config);
-       id->acoustic       = __le16_to_cpu(id->acoustic);
-       id->msrqs          = __le16_to_cpu(id->msrqs);
-       id->sxfert         = __le16_to_cpu(id->sxfert);
-       id->sal            = __le16_to_cpu(id->sal);
-       id->spg            = __le32_to_cpu(id->spg);
-       id->lba_capacity_2 = __le64_to_cpu(id->lba_capacity_2);
-       for (i = 0; i < 22; i++)
-               id->words104_125[i]   = __le16_to_cpu(id->words104_125[i]);
-       id->last_lun       = __le16_to_cpu(id->last_lun);
-       id->word127        = __le16_to_cpu(id->word127);
-       id->dlf            = __le16_to_cpu(id->dlf);
-       id->csfo           = __le16_to_cpu(id->csfo);
-       for (i = 0; i < 26; i++)
-               id->words130_155[i] = __le16_to_cpu(id->words130_155[i]);
-       id->word156        = __le16_to_cpu(id->word156);
-       for (i = 0; i < 3; i++)
-               id->words157_159[i] = __le16_to_cpu(id->words157_159[i]);
-       id->cfa_power      = __le16_to_cpu(id->cfa_power);
-       for (i = 0; i < 14; i++)
-               id->words161_175[i] = __le16_to_cpu(id->words161_175[i]);
-       for (i = 0; i < 31; i++)
-               id->words176_205[i] = __le16_to_cpu(id->words176_205[i]);
-       for (i = 0; i < 48; i++)
-               id->words206_254[i] = __le16_to_cpu(id->words206_254[i]);
-       id->integrity_word  = __le16_to_cpu(id->integrity_word);
+
+       for (i = 0; i < ATA_ID_WORDS; i++)
+               id[i] = __le16_to_cpu(id[i]);
 # else
 #  error "Please fix <asm/byteorder.h>"
 # endif
 #endif
 }
 
+static void isd200_dump_driveid(u16 *id)
+{
+       US_DEBUGP("   Identify Data Structure:\n");
+       US_DEBUGP("      config = 0x%x\n",        id[ATA_ID_CONFIG]);
+       US_DEBUGP("      cyls = 0x%x\n",          id[ATA_ID_CYLS]);
+       US_DEBUGP("      heads = 0x%x\n",         id[ATA_ID_HEADS]);
+       US_DEBUGP("      track_bytes = 0x%x\n",   id[4]);
+       US_DEBUGP("      sector_bytes = 0x%x\n",  id[5]);
+       US_DEBUGP("      sectors = 0x%x\n",       id[ATA_ID_SECTORS]);
+       US_DEBUGP("      serial_no[0] = 0x%x\n",  *(char *)&id[ATA_ID_SERNO]);
+       US_DEBUGP("      buf_type = 0x%x\n",      id[20]);
+       US_DEBUGP("      buf_size = 0x%x\n",      id[ATA_ID_BUF_SIZE]);
+       US_DEBUGP("      ecc_bytes = 0x%x\n",     id[22]);
+       US_DEBUGP("      fw_rev[0] = 0x%x\n",     *(char *)&id[ATA_ID_FW_REV]);
+       US_DEBUGP("      model[0] = 0x%x\n",      *(char *)&id[ATA_ID_PROD]);
+       US_DEBUGP("      max_multsect = 0x%x\n",  id[ATA_ID_MAX_MULTSECT] & 0xff);
+       US_DEBUGP("      dword_io = 0x%x\n",      id[ATA_ID_DWORD_IO]);
+       US_DEBUGP("      capability = 0x%x\n",    id[ATA_ID_CAPABILITY] >> 8);
+       US_DEBUGP("      tPIO = 0x%x\n",          id[ATA_ID_OLD_PIO_MODES] >> 8);
+       US_DEBUGP("      tDMA = 0x%x\n",          id[ATA_ID_OLD_DMA_MODES] >> 8);
+       US_DEBUGP("      field_valid = 0x%x\n",   id[ATA_ID_FIELD_VALID]);
+       US_DEBUGP("      cur_cyls = 0x%x\n",      id[ATA_ID_CUR_CYLS]);
+       US_DEBUGP("      cur_heads = 0x%x\n",     id[ATA_ID_CUR_HEADS]);
+       US_DEBUGP("      cur_sectors = 0x%x\n",   id[ATA_ID_CUR_SECTORS]);
+       US_DEBUGP("      cur_capacity = 0x%x\n",  ata_id_u32(id, 57));
+       US_DEBUGP("      multsect = 0x%x\n",      id[ATA_ID_MULTSECT] & 0xff);
+       US_DEBUGP("      lba_capacity = 0x%x\n",  ata_id_u32(id, ATA_ID_LBA_CAPACITY));
+       US_DEBUGP("      command_set_1 = 0x%x\n", id[ATA_ID_COMMAND_SET_1]);
+       US_DEBUGP("      command_set_2 = 0x%x\n", id[ATA_ID_COMMAND_SET_2]);
+}
 
 /**************************************************************************
  * isd200_get_inquiry_data
@@ -1163,7 +1111,7 @@ static int isd200_get_inquiry_data( struct us_data *us )
 {
        struct isd200_info *info = (struct isd200_info *)us->extra;
        int retStatus = ISD200_GOOD;
-       struct hd_driveid *id = info->id;
+       u16 *id = info->id;
 
        US_DEBUGP("Entering isd200_get_inquiry_data\n");
 
@@ -1180,8 +1128,7 @@ static int isd200_get_inquiry_data( struct us_data *us )
                        /* this must be an ATA device */
                        /* perform an ATA Command Identify */
                        transferStatus = isd200_action( us, ACTION_IDENTIFY,
-                                                       id, 
-                                                       sizeof(struct hd_driveid) );
+                                                       id, ATA_ID_WORDS * 2);
                        if (transferStatus != ISD200_TRANSPORT_GOOD) {
                                /* Error issuing ATA Command Identify */
                                US_DEBUGP("   Error issuing ATA Command Identify\n");
@@ -1191,35 +1138,9 @@ static int isd200_get_inquiry_data( struct us_data *us )
                                int i;
                                __be16 *src;
                                __u16 *dest;
-                               isd200_fix_driveid(id);
 
-                               US_DEBUGP("   Identify Data Structure:\n");
-                               US_DEBUGP("      config = 0x%x\n", id->config);
-                               US_DEBUGP("      cyls = 0x%x\n", id->cyls);
-                               US_DEBUGP("      heads = 0x%x\n", id->heads);
-                               US_DEBUGP("      track_bytes = 0x%x\n", id->track_bytes);
-                               US_DEBUGP("      sector_bytes = 0x%x\n", id->sector_bytes);
-                               US_DEBUGP("      sectors = 0x%x\n", id->sectors);
-                               US_DEBUGP("      serial_no[0] = 0x%x\n", id->serial_no[0]);
-                               US_DEBUGP("      buf_type = 0x%x\n", id->buf_type);
-                               US_DEBUGP("      buf_size = 0x%x\n", id->buf_size);
-                               US_DEBUGP("      ecc_bytes = 0x%x\n", id->ecc_bytes);
-                               US_DEBUGP("      fw_rev[0] = 0x%x\n", id->fw_rev[0]);
-                               US_DEBUGP("      model[0] = 0x%x\n", id->model[0]);
-                               US_DEBUGP("      max_multsect = 0x%x\n", id->max_multsect);
-                               US_DEBUGP("      dword_io = 0x%x\n", id->dword_io);
-                               US_DEBUGP("      capability = 0x%x\n", id->capability);
-                               US_DEBUGP("      tPIO = 0x%x\n", id->tPIO);
-                               US_DEBUGP("      tDMA = 0x%x\n", id->tDMA);
-                               US_DEBUGP("      field_valid = 0x%x\n", id->field_valid);
-                               US_DEBUGP("      cur_cyls = 0x%x\n", id->cur_cyls);
-                               US_DEBUGP("      cur_heads = 0x%x\n", id->cur_heads);
-                               US_DEBUGP("      cur_sectors = 0x%x\n", id->cur_sectors);
-                               US_DEBUGP("      cur_capacity = 0x%x\n", (id->cur_capacity1 << 16) + id->cur_capacity0 );
-                               US_DEBUGP("      multsect = 0x%x\n", id->multsect);
-                               US_DEBUGP("      lba_capacity = 0x%x\n", id->lba_capacity);
-                               US_DEBUGP("      command_set_1 = 0x%x\n", id->command_set_1);
-                               US_DEBUGP("      command_set_2 = 0x%x\n", id->command_set_2);
+                               isd200_fix_driveid(id);
+                               isd200_dump_driveid(id);
 
                                memset(&info->InquiryData, 0, sizeof(info->InquiryData));
 
@@ -1229,30 +1150,30 @@ static int isd200_get_inquiry_data( struct us_data *us )
                                /* The length must be at least 36 (5 + 31) */
                                info->InquiryData.AdditionalLength = 0x1F;
 
-                               if (id->command_set_1 & COMMANDSET_MEDIA_STATUS) {
+                               if (id[ATA_ID_COMMAND_SET_1] & COMMANDSET_MEDIA_STATUS) {
                                        /* set the removable bit */
                                        info->InquiryData.DeviceTypeModifier = DEVICE_REMOVABLE;
                                        info->DeviceFlags |= DF_REMOVABLE_MEDIA;
                                }
 
                                /* Fill in vendor identification fields */
-                               src = (__be16*)id->model;
+                               src = (__be16 *)&id[ATA_ID_PROD];
                                dest = (__u16*)info->InquiryData.VendorId;
                                for (i=0;i<4;i++)
                                        dest[i] = be16_to_cpu(src[i]);
 
-                               src = (__be16*)(id->model+8);
+                               src = (__be16 *)&id[ATA_ID_PROD + 8/2];
                                dest = (__u16*)info->InquiryData.ProductId;
                                for (i=0;i<8;i++)
                                        dest[i] = be16_to_cpu(src[i]);
 
-                               src = (__be16*)id->fw_rev;
+                               src = (__be16 *)&id[ATA_ID_FW_REV];
                                dest = (__u16*)info->InquiryData.ProductRevisionLevel;
                                for (i=0;i<2;i++)
                                        dest[i] = be16_to_cpu(src[i]);
 
                                /* determine if it supports Media Status Notification */
-                               if (id->command_set_2 & COMMANDSET_MEDIA_STATUS) {
+                               if (id[ATA_ID_COMMAND_SET_2] & COMMANDSET_MEDIA_STATUS) {
                                        US_DEBUGP("   Device supports Media Status Notification\n");
 
                                        /* Indicate that it is enabled, even though it is not
@@ -1301,7 +1222,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
                              union ata_cdb * ataCdb)
 {
        struct isd200_info *info = (struct isd200_info *)us->extra;
-       struct hd_driveid *id = info->id;
+       u16 *id = info->id;
        int sendToTransport = 1;
        unsigned char sectnum, head;
        unsigned short cylinder;
@@ -1369,13 +1290,12 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
 
                US_DEBUGP("   ATA OUT - SCSIOP_READ_CAPACITY\n");
 
-               if (id->capability & CAPABILITY_LBA ) {
-                       capacity = id->lba_capacity - 1;
-               } else {
-                       capacity = (id->heads *
-                                   id->cyls *
-                                   id->sectors) - 1;
-               }
+               if (ata_id_has_lba(id))
+                       capacity = ata_id_u32(id, ATA_ID_LBA_CAPACITY) - 1;
+               else
+                       capacity = (id[ATA_ID_HEADS] * id[ATA_ID_CYLS] *
+                                   id[ATA_ID_SECTORS]) - 1;
+
                readCapacityData.LogicalBlockAddress = cpu_to_be32(capacity);
                readCapacityData.BytesPerBlock = cpu_to_be32(0x200);
 
@@ -1392,16 +1312,16 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
                lba = be32_to_cpu(*(__be32 *)&srb->cmnd[2]);
                blockCount = (unsigned long)srb->cmnd[7]<<8 | (unsigned long)srb->cmnd[8];
 
-               if (id->capability & CAPABILITY_LBA) {
+               if (ata_id_has_lba(id)) {
                        sectnum = (unsigned char)(lba);
                        cylinder = (unsigned short)(lba>>8);
                        head = ATA_ADDRESS_DEVHEAD_LBA_MODE | (unsigned char)(lba>>24 & 0x0F);
                } else {
-                       sectnum = (unsigned char)((lba % id->sectors) + 1);
-                       cylinder = (unsigned short)(lba / (id->sectors *
-                                                          id->heads));
-                       head = (unsigned char)((lba / id->sectors) %
-                                              id->heads);
+                       sectnum = (u8)((lba % id[ATA_ID_SECTORS]) + 1);
+                       cylinder = (u16)(lba / (id[ATA_ID_SECTORS] *
+                                       id[ATA_ID_HEADS]));
+                       head = (u8)((lba / id[ATA_ID_SECTORS]) %
+                                       id[ATA_ID_HEADS]);
                }
                ataCdb->generic.SignatureByte0 = info->ConfigData.ATAMajorCommand;
                ataCdb->generic.SignatureByte1 = info->ConfigData.ATAMinorCommand;
@@ -1415,7 +1335,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
                ataCdb->write.CylinderHighByte = (unsigned char)(cylinder>>8);
                ataCdb->write.CylinderLowByte = (unsigned char)cylinder;
                ataCdb->write.DeviceHeadByte = (head | ATA_ADDRESS_DEVHEAD_STD);
-               ataCdb->write.CommandByte = WIN_READ;
+               ataCdb->write.CommandByte = ATA_CMD_PIO_READ;
                break;
 
        case WRITE_10:
@@ -1424,14 +1344,16 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
                lba = be32_to_cpu(*(__be32 *)&srb->cmnd[2]);
                blockCount = (unsigned long)srb->cmnd[7]<<8 | (unsigned long)srb->cmnd[8];
 
-               if (id->capability & CAPABILITY_LBA) {
+               if (ata_id_has_lba(id)) {
                        sectnum = (unsigned char)(lba);
                        cylinder = (unsigned short)(lba>>8);
                        head = ATA_ADDRESS_DEVHEAD_LBA_MODE | (unsigned char)(lba>>24 & 0x0F);
                } else {
-                       sectnum = (unsigned char)((lba % id->sectors) + 1);
-                       cylinder = (unsigned short)(lba / (id->sectors * id->heads));
-                       head = (unsigned char)((lba / id->sectors) % id->heads);
+                       sectnum = (u8)((lba % id[ATA_ID_SECTORS]) + 1);
+                       cylinder = (u16)(lba / (id[ATA_ID_SECTORS] *
+                                       id[ATA_ID_HEADS]));
+                       head = (u8)((lba / id[ATA_ID_SECTORS]) %
+                                       id[ATA_ID_HEADS]);
                }
                ataCdb->generic.SignatureByte0 = info->ConfigData.ATAMajorCommand;
                ataCdb->generic.SignatureByte1 = info->ConfigData.ATAMinorCommand;
@@ -1445,7 +1367,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
                ataCdb->write.CylinderHighByte = (unsigned char)(cylinder>>8);
                ataCdb->write.CylinderLowByte = (unsigned char)cylinder;
                ataCdb->write.DeviceHeadByte = (head | ATA_ADDRESS_DEVHEAD_STD);
-               ataCdb->write.CommandByte = WIN_WRITE;
+               ataCdb->write.CommandByte = ATA_CMD_PIO_WRITE;
                break;
 
        case ALLOW_MEDIUM_REMOVAL:
@@ -1459,7 +1381,7 @@ static int isd200_scsi_to_ata(struct scsi_cmnd *srb, struct us_data *us,
                        ataCdb->generic.TransferBlockSize = 1;
                        ataCdb->generic.RegisterSelect = REG_COMMAND;
                        ataCdb->write.CommandByte = (srb->cmnd[4] & 0x1) ?
-                               WIN_DOORLOCK : WIN_DOORUNLOCK;
+                               ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK;
                        isd200_srb_set_bufflen(srb, 0);
                } else {
                        US_DEBUGP("   Not removeable media, just report okay\n");
@@ -1539,8 +1461,7 @@ static int isd200_init_info(struct us_data *us)
        if (!info)
                retStatus = ISD200_ERROR;
        else {
-               info->id = (struct hd_driveid *)
-                               kzalloc(sizeof(struct hd_driveid), GFP_KERNEL);
+               info->id = kzalloc(ATA_ID_WORDS * 2, GFP_KERNEL);
                info->RegsBuf = (unsigned char *)
                                kmalloc(sizeof(info->ATARegs), GFP_KERNEL);
                info->srb.sense_buffer =
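
The isd200 hunks above drop the old struct hd_driveid layout in favour of the raw 16-bit IDENTIFY DEVICE word array from <linux/ata.h>, so fields are now picked out of numbered words (ATA_ID_PROD, ATA_ID_HEADS, ...) instead of named struct members. A rough kernel-context sketch of that access pattern, assuming id points at the 256-word buffer allocated in isd200_init_info() above (the helper name is illustrative, not part of the driver):

    /* Illustrative only: pulling capacity/geometry out of a raw ATA IDENTIFY
     * word array.  ATA_ID_* indices, ata_id_has_lba() and ata_id_u32() are
     * the <linux/ata.h> definitions used by the patched driver. */
    #include <linux/ata.h>

    static u32 example_identify_capacity(const u16 *id)
    {
            /* LBA-capable drives report a 32-bit sector count in words 60-61 */
            if (ata_id_has_lba(id))
                    return ata_id_u32(id, ATA_ID_LBA_CAPACITY);

            /* otherwise derive it from the CHS geometry words 1, 3 and 6 */
            return id[ATA_ID_CYLS] * id[ATA_ID_HEADS] * id[ATA_ID_SECTORS];
    }
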
index cef8b18ceaa367b56ffcf8145cb2f24af8de7389..86b203fc3c56af2b41e0118dd3cc3bddd8a002d4 100644 (file)
@@ -66,6 +66,13 @@ config GENERIC_ACL
        bool
        select FS_POSIX_ACL
 
+menu "Caches"
+
+source "fs/fscache/Kconfig"
+source "fs/cachefiles/Kconfig"
+
+endmenu
+
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
@@ -169,6 +176,8 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 
+source "fs/exofs/Kconfig"
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
index b5cd8e18dd9feb14317339cf5220d39102d48f0a..70b2aed87133328997e0754737f7ea49a499b21b 100644 (file)
@@ -63,6 +63,7 @@ obj-$(CONFIG_PROFILING)               += dcookies.o
 obj-$(CONFIG_DLM)              += dlm/
  
 # Do not add any filesystems before this line
+obj-$(CONFIG_FSCACHE)          += fscache/
 obj-$(CONFIG_REISERFS_FS)      += reiserfs/
 obj-$(CONFIG_EXT3_FS)          += ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT2_FS)          += ext2/
@@ -116,7 +117,9 @@ obj-$(CONFIG_AFS_FS)                += afs/
 obj-$(CONFIG_BEFS_FS)          += befs/
 obj-$(CONFIG_HOSTFS)           += hostfs/
 obj-$(CONFIG_HPPFS)            += hppfs/
+obj-$(CONFIG_CACHEFILES)       += cachefiles/
 obj-$(CONFIG_DEBUG_FS)         += debugfs/
 obj-$(CONFIG_OCFS2_FS)         += ocfs2/
 obj-$(CONFIG_BTRFS_FS)         += btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
+obj-$(CONFIG_EXOFS_FS)          += exofs/
index e7b522fe15e167c1ee53b0a6c0011b472e145a14..5c4e61d3c77250728076fb8940f0cd4cc27d7d72 100644 (file)
@@ -19,3 +19,11 @@ config AFS_DEBUG
          See <file:Documentation/filesystems/afs.txt> for more information.
 
          If unsure, say N.
+
+config AFS_FSCACHE
+       bool "Provide AFS client caching support (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
+       help
+         Say Y here if you want AFS data to be cached locally on disk through
+         the generic filesystem cache manager
index a66671082cfbb98d37277e73e3255c70d8415946..4f64b95d57bd1ff7af9c781eac506787dc3b4c9f 100644 (file)
@@ -2,7 +2,10 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
+afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o
+
 kafs-objs := \
+       $(afs-cache-y) \
        callback.o \
        cell.o \
        cmservice.o \
index de0d7de69edc4764b98f3d2ff098225e0e18e5fe..e2b1d3f165191444e96f3eebfbeb909aa07f5db1 100644 (file)
@@ -1,6 +1,6 @@
 /* AFS caching stuff
  *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-                                               const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-       .name                   = "cell_ix",
-       .data_size              = sizeof(struct afs_cache_cell),
-       .keys[0]                = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-       .match                  = afs_cell_cache_match,
-       .update                 = afs_cell_cache_update,
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+                                      void *buffer, uint16_t buflen);
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+                                      void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+                                                     const void *buffer,
+                                                     uint16_t buflen);
+
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t buflen);
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vlocation_cache_check_aux(
+       void *cookie_netfs_data, const void *buffer, uint16_t buflen);
+
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+                                        void *buffer, uint16_t buflen);
+
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+                                       void *buffer, uint16_t buflen);
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+                                    uint64_t *size);
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+                                       void *buffer, uint16_t buflen);
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+                                                      const void *buffer,
+                                                      uint16_t buflen);
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
+
+struct fscache_netfs afs_cache_netfs = {
+       .name                   = "afs",
+       .version                = 0,
+};
+
+struct fscache_cookie_def afs_cell_cache_index_def = {
+       .name           = "AFS.cell",
+       .type           = FSCACHE_COOKIE_TYPE_INDEX,
+       .get_key        = afs_cell_cache_get_key,
+       .get_aux        = afs_cell_cache_get_aux,
+       .check_aux      = afs_cell_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_vlocation_cache_index_def = {
+       .name                   = "AFS.vldb",
+       .type                   = FSCACHE_COOKIE_TYPE_INDEX,
+       .get_key                = afs_vlocation_cache_get_key,
+       .get_aux                = afs_vlocation_cache_get_aux,
+       .check_aux              = afs_vlocation_cache_check_aux,
+};
+
+struct fscache_cookie_def afs_volume_cache_index_def = {
+       .name           = "AFS.volume",
+       .type           = FSCACHE_COOKIE_TYPE_INDEX,
+       .get_key        = afs_volume_cache_get_key,
+};
+
+struct fscache_cookie_def afs_vnode_cache_index_def = {
+       .name                   = "AFS.vnode",
+       .type                   = FSCACHE_COOKIE_TYPE_DATAFILE,
+       .get_key                = afs_vnode_cache_get_key,
+       .get_attr               = afs_vnode_cache_get_attr,
+       .get_aux                = afs_vnode_cache_get_aux,
+       .check_aux              = afs_vnode_cache_check_aux,
+       .now_uncached           = afs_vnode_cache_now_uncached,
 };
-#endif
 
 /*
- * match a cell record obtained from the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-                                               const void *entry)
+static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
+                                      void *buffer, uint16_t bufmax)
 {
-       const struct afs_cache_cell *ccell = entry;
-       struct afs_cell *cell = target;
+       const struct afs_cell *cell = cookie_netfs_data;
+       uint16_t klen;
 
-       _enter("{%s},{%s}", ccell->name, cell->name);
+       _enter("%p,%p,%u", cell, buffer, bufmax);
 
-       if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-               _leave(" = SUCCESS");
-               return CACHEFS_MATCH_SUCCESS;
-       }
+       klen = strlen(cell->name);
+       if (klen > bufmax)
+               return 0;
 
-       _leave(" = FAILED");
-       return CACHEFS_MATCH_FAILED;
+       memcpy(buffer, cell->name, klen);
+       return klen;
 }
-#endif
 
 /*
- * update a cell record in the cache
+ * provide new auxiliary cache data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
+static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
+                                      void *buffer, uint16_t bufmax)
 {
-       struct afs_cache_cell *ccell = entry;
-       struct afs_cell *cell = source;
+       const struct afs_cell *cell = cookie_netfs_data;
+       uint16_t dlen;
 
-       _enter("%p,%p", source, entry);
+       _enter("%p,%p,%u", cell, buffer, bufmax);
 
-       strncpy(ccell->name, cell->name, sizeof(ccell->name));
+       dlen = cell->vl_naddrs * sizeof(cell->vl_addrs[0]);
+       dlen = min(dlen, bufmax);
+       dlen &= ~(sizeof(cell->vl_addrs[0]) - 1);
 
-       memcpy(ccell->vl_servers,
-              cell->vl_addrs,
-              min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+       memcpy(buffer, cell->vl_addrs, dlen);
+       return dlen;
+}
 
+/*
+ * check that the auxiliary data indicates that the entry is still valid
+ */
+static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
+                                                     const void *buffer,
+                                                     uint16_t buflen)
+{
+       _leave(" = OKAY");
+       return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-                                                    const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-       .name           = "vldb",
-       .data_size      = sizeof(struct afs_cache_vlocation),
-       .keys[0]        = { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-       .match          = afs_vlocation_cache_match,
-       .update         = afs_vlocation_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a VLDB record stored in the cache
- * - may also load target from entry
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-                                                    const void *entry)
+static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t bufmax)
 {
-       const struct afs_cache_vlocation *vldb = entry;
-       struct afs_vlocation *vlocation = target;
+       const struct afs_vlocation *vlocation = cookie_netfs_data;
+       uint16_t klen;
+
+       _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+       klen = strnlen(vlocation->vldb.name, sizeof(vlocation->vldb.name));
+       if (klen > bufmax)
+               return 0;
 
-       _enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+       memcpy(buffer, vlocation->vldb.name, klen);
 
-       if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-           ) {
-               if (!vlocation->valid ||
-                   vlocation->vldb.rtime == vldb->rtime
+       _leave(" = %u", klen);
+       return klen;
+}
+
+/*
+ * provide new auxiliary cache data
+ */
+static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t bufmax)
+{
+       const struct afs_vlocation *vlocation = cookie_netfs_data;
+       uint16_t dlen;
+
+       _enter("{%s},%p,%u", vlocation->vldb.name, buffer, bufmax);
+
+       dlen = sizeof(struct afs_cache_vlocation);
+       dlen -= offsetof(struct afs_cache_vlocation, nservers);
+       if (dlen > bufmax)
+               return 0;
+
+       memcpy(buffer, (uint8_t *)&vlocation->vldb.nservers, dlen);
+
+       _leave(" = %u", dlen);
+       return dlen;
+}
+
+/*
+ * check that the auxiliary data indicates that the entry is still valid
+ */
+static
+enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
+                                                   const void *buffer,
+                                                   uint16_t buflen)
+{
+       const struct afs_cache_vlocation *cvldb;
+       struct afs_vlocation *vlocation = cookie_netfs_data;
+       uint16_t dlen;
+
+       _enter("{%s},%p,%u", vlocation->vldb.name, buffer, buflen);
+
+       /* check the size of the data is what we're expecting */
+       dlen = sizeof(struct afs_cache_vlocation);
+       dlen -= offsetof(struct afs_cache_vlocation, nservers);
+       if (dlen != buflen)
+               return FSCACHE_CHECKAUX_OBSOLETE;
+
+       cvldb = container_of(buffer, struct afs_cache_vlocation, nservers);
+
+       /* if what's on disk is more valid than what's in memory, then use the
+        * VL record from the cache */
+       if (!vlocation->valid || vlocation->vldb.rtime == cvldb->rtime) {
+               memcpy((uint8_t *)&vlocation->vldb.nservers, buffer, dlen);
+               vlocation->valid = 1;
+               _leave(" = SUCCESS [c->m]");
+               return FSCACHE_CHECKAUX_OKAY;
+       }
+
+       /* need to update the cache if the cached info differs */
+       if (memcmp(&vlocation->vldb, buffer, dlen) != 0) {
+               /* delete if the volume IDs for this name differ */
+               if (memcmp(&vlocation->vldb.vid, &cvldb->vid,
+                          sizeof(cvldb->vid)) != 0
                    ) {
-                       vlocation->vldb = *vldb;
-                       vlocation->valid = 1;
-                       _leave(" = SUCCESS [c->m]");
-                       return CACHEFS_MATCH_SUCCESS;
-               } else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-                       /* delete if VIDs for this name differ */
-                       if (memcmp(&vlocation->vldb.vid,
-                                  &vldb->vid,
-                                  sizeof(vldb->vid)) != 0) {
-                               _leave(" = DELETE");
-                               return CACHEFS_MATCH_SUCCESS_DELETE;
-                       }
-
-                       _leave(" = UPDATE");
-                       return CACHEFS_MATCH_SUCCESS_UPDATE;
-               } else {
-                       _leave(" = SUCCESS");
-                       return CACHEFS_MATCH_SUCCESS;
+                       _leave(" = OBSOLETE");
+                       return FSCACHE_CHECKAUX_OBSOLETE;
                }
+
+               _leave(" = UPDATE");
+               return FSCACHE_CHECKAUX_NEEDS_UPDATE;
        }
 
-       _leave(" = FAILED");
-       return CACHEFS_MATCH_FAILED;
+       _leave(" = OKAY");
+       return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
+/*****************************************************************************/
 /*
- * update a VLDB record stored in the cache
+ * set the key for the volume index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
+static uint16_t afs_volume_cache_get_key(const void *cookie_netfs_data,
+                                       void *buffer, uint16_t bufmax)
 {
-       struct afs_cache_vlocation *vldb = entry;
-       struct afs_vlocation *vlocation = source;
+       const struct afs_volume *volume = cookie_netfs_data;
+       uint16_t klen;
+
+       _enter("{%u},%p,%u", volume->type, buffer, bufmax);
+
+       klen = sizeof(volume->type);
+       if (klen > bufmax)
+               return 0;
 
-       _enter("");
+       memcpy(buffer, &volume->type, sizeof(volume->type));
+
+       _leave(" = %u", klen);
+       return klen;
 
-       *vldb = vlocation->vldb;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-                                                 const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-       .name           = "volume",
-       .data_size      = sizeof(struct afs_cache_vhash),
-       .keys[0]        = { CACHEFS_INDEX_KEYS_BIN, 1 },
-       .keys[1]        = { CACHEFS_INDEX_KEYS_BIN, 1 },
-       .match          = afs_volume_cache_match,
-       .update         = afs_volume_cache_update,
-};
-#endif
 
+/*****************************************************************************/
 /*
- * match a volume hash record stored in the cache
+ * set the key for the index entry
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-                                                 const void *entry)
+static uint16_t afs_vnode_cache_get_key(const void *cookie_netfs_data,
+                                       void *buffer, uint16_t bufmax)
 {
-       const struct afs_cache_vhash *vhash = entry;
-       struct afs_volume *volume = target;
+       const struct afs_vnode *vnode = cookie_netfs_data;
+       uint16_t klen;
 
-       _enter("{%u},{%u}", volume->type, vhash->vtype);
+       _enter("{%x,%x,%llx},%p,%u",
+              vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+              buffer, bufmax);
 
-       if (volume->type == vhash->vtype) {
-               _leave(" = SUCCESS");
-               return CACHEFS_MATCH_SUCCESS;
-       }
+       klen = sizeof(vnode->fid.vnode);
+       if (klen > bufmax)
+               return 0;
+
+       memcpy(buffer, &vnode->fid.vnode, sizeof(vnode->fid.vnode));
 
-       _leave(" = FAILED");
-       return CACHEFS_MATCH_FAILED;
+       _leave(" = %u", klen);
+       return klen;
 }
-#endif
 
 /*
- * update a volume hash record stored in the cache
+ * provide updated file attributes
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
+static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
+                                    uint64_t *size)
 {
-       struct afs_cache_vhash *vhash = entry;
-       struct afs_volume *volume = source;
+       const struct afs_vnode *vnode = cookie_netfs_data;
 
-       _enter("");
+       _enter("{%x,%x,%llx},",
+              vnode->fid.vnode, vnode->fid.unique,
+              vnode->status.data_version);
 
-       vhash->vtype = volume->type;
+       *size = vnode->status.size;
 }
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-                                                const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vnode_cache_index_def = {
-       .name           = "vnode",
-       .data_size      = sizeof(struct afs_cache_vnode),
-       .keys[0]        = { CACHEFS_INDEX_KEYS_BIN, 4 },
-       .match          = afs_vnode_cache_match,
-       .update         = afs_vnode_cache_update,
-};
-#endif
 
 /*
- * match a vnode record stored in the cache
+ * provide new auxiliary cache data
+ */
+static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
+                                       void *buffer, uint16_t bufmax)
+{
+       const struct afs_vnode *vnode = cookie_netfs_data;
+       uint16_t dlen;
+
+       _enter("{%x,%x,%Lx},%p,%u",
+              vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+              buffer, bufmax);
+
+       dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+       if (dlen > bufmax)
+               return 0;
+
+       memcpy(buffer, &vnode->fid.unique, sizeof(vnode->fid.unique));
+       buffer += sizeof(vnode->fid.unique);
+       memcpy(buffer, &vnode->status.data_version,
+              sizeof(vnode->status.data_version));
+
+       _leave(" = %u", dlen);
+       return dlen;
+}
+
+/*
+ * check that the auxiliary data indicates that the entry is still valid
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-                                                const void *entry)
+static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
+                                                      const void *buffer,
+                                                      uint16_t buflen)
 {
-       const struct afs_cache_vnode *cvnode = entry;
-       struct afs_vnode *vnode = target;
-
-       _enter("{%x,%x,%Lx},{%x,%x,%Lx}",
-              vnode->fid.vnode,
-              vnode->fid.unique,
-              vnode->status.version,
-              cvnode->vnode_id,
-              cvnode->vnode_unique,
-              cvnode->data_version);
-
-       if (vnode->fid.vnode != cvnode->vnode_id) {
-               _leave(" = FAILED");
-               return CACHEFS_MATCH_FAILED;
+       struct afs_vnode *vnode = cookie_netfs_data;
+       uint16_t dlen;
+
+       _enter("{%x,%x,%llx},%p,%u",
+              vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version,
+              buffer, buflen);
+
+       /* check the size of the data is what we're expecting */
+       dlen = sizeof(vnode->fid.unique) + sizeof(vnode->status.data_version);
+       if (dlen != buflen) {
+               _leave(" = OBSOLETE [len %hx != %hx]", dlen, buflen);
+               return FSCACHE_CHECKAUX_OBSOLETE;
        }
 
-       if (vnode->fid.unique != cvnode->vnode_unique ||
-           vnode->status.version != cvnode->data_version) {
-               _leave(" = DELETE");
-               return CACHEFS_MATCH_SUCCESS_DELETE;
+       if (memcmp(buffer,
+                  &vnode->fid.unique,
+                  sizeof(vnode->fid.unique)
+                  ) != 0) {
+               unsigned unique;
+
+               memcpy(&unique, buffer, sizeof(unique));
+
+               _leave(" = OBSOLETE [uniq %x != %x]",
+                      unique, vnode->fid.unique);
+               return FSCACHE_CHECKAUX_OBSOLETE;
+       }
+
+       if (memcmp(buffer + sizeof(vnode->fid.unique),
+                  &vnode->status.data_version,
+                  sizeof(vnode->status.data_version)
+                  ) != 0) {
+               afs_dataversion_t version;
+
+               memcpy(&version, buffer + sizeof(vnode->fid.unique),
+                      sizeof(version));
+
+               _leave(" = OBSOLETE [vers %llx != %llx]",
+                      version, vnode->status.data_version);
+               return FSCACHE_CHECKAUX_OBSOLETE;
        }
 
        _leave(" = SUCCESS");
-       return CACHEFS_MATCH_SUCCESS;
+       return FSCACHE_CHECKAUX_OKAY;
 }
-#endif
 
 /*
- * update a vnode record stored in the cache
+ * indication that the cookie is no longer cached
+ * - this function is called when the backing store currently caching a cookie
+ *   is removed
+ * - the netfs should use this to clean up any markers indicating cached pages
+ * - this is mandatory for any object that may have data
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
 {
-       struct afs_cache_vnode *cvnode = entry;
-       struct afs_vnode *vnode = source;
+       struct afs_vnode *vnode = cookie_netfs_data;
+       struct pagevec pvec;
+       pgoff_t first;
+       int loop, nr_pages;
+
+       _enter("{%x,%x,%Lx}",
+              vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
+
+       pagevec_init(&pvec, 0);
+       first = 0;
+
+       for (;;) {
+               /* grab a bunch of pages to clean */
+               nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
+                                         first,
+                                         PAGEVEC_SIZE - pagevec_count(&pvec));
+               if (!nr_pages)
+                       break;
 
-       _enter("");
+               for (loop = 0; loop < nr_pages; loop++)
+                       ClearPageFsCache(pvec.pages[loop]);
+
+               first = pvec.pages[nr_pages - 1]->index + 1;
+
+               pvec.nr = nr_pages;
+               pagevec_release(&pvec);
+               cond_resched();
+       }
 
-       cvnode->vnode_id        = vnode->fid.vnode;
-       cvnode->vnode_unique    = vnode->fid.unique;
-       cvnode->data_version    = vnode->status.version;
+       _leave("");
 }
-#endif
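
The rewritten cache.c above swaps the old cachefs match/update callbacks for FS-Cache get_key/get_aux/check_aux hooks; for a vnode the auxiliary blob is simply the fid uniquifier followed by the data version, and validity checking is a byte-compare of those two fields. A standalone sketch of that pack-and-verify pattern, using a hypothetical struct and plain C types rather than the AFS definitions:

    /* Sketch of the aux-data pack/verify pattern behind the vnode cookie above.
     * The struct and names here are illustrative, not the AFS code. */
    #include <stdint.h>
    #include <string.h>

    struct example_vnode {
            uint32_t unique;        /* fid uniquifier */
            uint64_t data_version;  /* server data version */
    };

    /* get_aux: copy the two validity fields into the cache's aux buffer */
    static uint16_t example_get_aux(const struct example_vnode *v,
                                    void *buffer, uint16_t bufmax)
    {
            uint16_t dlen = sizeof(v->unique) + sizeof(v->data_version);

            if (dlen > bufmax)
                    return 0;
            memcpy(buffer, &v->unique, sizeof(v->unique));
            memcpy((char *)buffer + sizeof(v->unique), &v->data_version,
                   sizeof(v->data_version));
            return dlen;
    }

    /* check_aux: the cached copy is obsolete if either field has changed */
    static int example_check_aux(const struct example_vnode *v,
                                 const void *buffer, uint16_t buflen)
    {
            if (buflen != sizeof(v->unique) + sizeof(v->data_version))
                    return 0;               /* obsolete: wrong size */
            if (memcmp(buffer, &v->unique, sizeof(v->unique)) != 0)
                    return 0;               /* obsolete: uniquifier changed */
            if (memcmp((const char *)buffer + sizeof(v->unique),
                       &v->data_version, sizeof(v->data_version)) != 0)
                    return 0;               /* obsolete: data version changed */
            return 1;                       /* still valid */
    }
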
index 36a3642cf90ecdf7e9034f4c40b1b9b41805d989..5c4f6b499e90104a5c3184989090cf948a0b57fd 100644 (file)
@@ -1,6 +1,6 @@
 /* AFS local cache management interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,15 +9,4 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_CACHE_H
-#define AFS_CACHE_H
-
-#undef AFS_CACHING_SUPPORT
-
-#include <linux/mm.h>
-#ifdef AFS_CACHING_SUPPORT
-#include <linux/cachefs.h>
-#endif
-#include "types.h"
-
-#endif /* AFS_CACHE_H */
+#include <linux/fscache.h>
index 5e1df14e16b188d9a47501a28c48ce65a4be4c5b..e19c13f059ed5fa8ec0bcac9f0ce80377476b26c 100644 (file)
@@ -147,12 +147,11 @@ struct afs_cell *afs_cell_create(const char *name, char *vllist)
        if (ret < 0)
                goto error;
 
-#ifdef AFS_CACHING_SUPPORT
-       /* put it up for caching */
-       cachefs_acquire_cookie(afs_cache_netfs.primary_index,
-                              &afs_vlocation_cache_index_def,
-                              cell,
-                              &cell->cache);
+#ifdef CONFIG_AFS_FSCACHE
+       /* put it up for caching (this never returns an error) */
+       cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
+                                            &afs_cell_cache_index_def,
+                                            cell);
 #endif
 
        /* add to the cell lists */
@@ -362,10 +361,9 @@ static void afs_cell_destroy(struct afs_cell *cell)
        list_del_init(&cell->proc_link);
        up_write(&afs_proc_cells_sem);
 
-#ifdef AFS_CACHING_SUPPORT
-       cachefs_relinquish_cookie(cell->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_relinquish_cookie(cell->cache, 0);
 #endif
-
        key_put(cell->anonymous_key);
        kfree(cell);
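
cell.c now acquires an FS-Cache cookie against the netfs primary index when a cell is created and relinquishes it when the cell is destroyed. A minimal sketch of that cookie lifecycle for a hypothetical object type, using only the two calls that appear in this patch (my_object and my_index_def are stand-ins):

    /* Sketch: typical cookie lifecycle for an indexed object, as above. */
    #ifdef CONFIG_AFS_FSCACHE
    struct my_object {
            struct fscache_cookie *cache;
            /* ... other per-object state ... */
    };

    static void my_object_attach_cache(struct my_object *obj)
    {
            /* parent index cookie, cookie definition, netfs private data */
            obj->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
                                                &my_index_def, obj);
    }

    static void my_object_detach_cache(struct my_object *obj)
    {
            /* second argument: 0 = keep the cached data, 1 = retire it */
            fscache_relinquish_cookie(obj->cache, 0);
            obj->cache = NULL;
    }
    #endif
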
 
index a3901769a96c4231b3b6c0cf88b9d3e6746f7468..7a1d942ef68d469b778678482f880afac446bfa9 100644 (file)
@@ -23,6 +23,9 @@ static void afs_invalidatepage(struct page *page, unsigned long offset);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);
 
+static int afs_readpages(struct file *filp, struct address_space *mapping,
+                        struct list_head *pages, unsigned nr_pages);
+
 const struct file_operations afs_file_operations = {
        .open           = afs_open,
        .release        = afs_release,
@@ -46,6 +49,7 @@ const struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
        .readpage       = afs_readpage,
+       .readpages      = afs_readpages,
        .set_page_dirty = afs_set_page_dirty,
        .launder_page   = afs_launder_page,
        .releasepage    = afs_releasepage,
@@ -101,37 +105,18 @@ int afs_release(struct inode *inode, struct file *file)
 /*
  * deal with notification that a page was read from the cache
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_read_complete(void *cookie_data,
-                                      struct page *page,
-                                      void *data,
-                                      int error)
+static void afs_file_readpage_read_complete(struct page *page,
+                                           void *data,
+                                           int error)
 {
-       _enter("%p,%p,%p,%d", cookie_data, page, data, error);
+       _enter("%p,%p,%d", page, data, error);
 
-       if (error)
-               SetPageError(page);
-       else
+       /* if the read completes with an error, we just unlock the page and let
+        * the VM reissue the readpage */
+       if (!error)
                SetPageUptodate(page);
        unlock_page(page);
-
 }
-#endif
-
-/*
- * deal with notification that a page was written to the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_readpage_write_complete(void *cookie_data,
-                                       struct page *page,
-                                       void *data,
-                                       int error)
-{
-       _enter("%p,%p,%p,%d", cookie_data, page, data, error);
-
-       unlock_page(page);
-}
-#endif
 
 /*
  * AFS read page from file, directory or symlink
@@ -161,9 +146,9 @@ static int afs_readpage(struct file *file, struct page *page)
        if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
                goto error;
 
-#ifdef AFS_CACHING_SUPPORT
        /* is it cached? */
-       ret = cachefs_read_or_alloc_page(vnode->cache,
+#ifdef CONFIG_AFS_FSCACHE
+       ret = fscache_read_or_alloc_page(vnode->cache,
                                         page,
                                         afs_file_readpage_read_complete,
                                         NULL,
@@ -171,20 +156,21 @@ static int afs_readpage(struct file *file, struct page *page)
 #else
        ret = -ENOBUFS;
 #endif
-
        switch (ret) {
-               /* read BIO submitted and wb-journal entry found */
-       case 1:
-               BUG(); // TODO - handle wb-journal match
-
                /* read BIO submitted (page in cache) */
        case 0:
                break;
 
-               /* no page available in cache */
-       case -ENOBUFS:
+               /* page not yet cached */
        case -ENODATA:
+               _debug("cache said ENODATA");
+               goto go_on;
+
+               /* page will not be cached */
+       case -ENOBUFS:
+               _debug("cache said ENOBUFS");
        default:
+       go_on:
                offset = page->index << PAGE_CACHE_SHIFT;
                len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
@@ -198,27 +184,25 @@ static int afs_readpage(struct file *file, struct page *page)
                                set_bit(AFS_VNODE_DELETED, &vnode->flags);
                                ret = -ESTALE;
                        }
-#ifdef AFS_CACHING_SUPPORT
-                       cachefs_uncache_page(vnode->cache, page);
+
+#ifdef CONFIG_AFS_FSCACHE
+                       fscache_uncache_page(vnode->cache, page);
 #endif
+                       BUG_ON(PageFsCache(page));
                        goto error;
                }
 
                SetPageUptodate(page);
 
-#ifdef AFS_CACHING_SUPPORT
-               if (cachefs_write_page(vnode->cache,
-                                      page,
-                                      afs_file_readpage_write_complete,
-                                      NULL,
-                                      GFP_KERNEL) != 0
-                   ) {
-                       cachefs_uncache_page(vnode->cache, page);
-                       unlock_page(page);
+               /* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+               if (PageFsCache(page) &&
+                   fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+                       fscache_uncache_page(vnode->cache, page);
+                       BUG_ON(PageFsCache(page));
                }
-#else
-               unlock_page(page);
 #endif
+               unlock_page(page);
        }
 
        _leave(" = 0");
@@ -232,34 +216,59 @@ error:
 }
 
 /*
- * invalidate part or all of a page
+ * read a set of pages
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static int afs_readpages(struct file *file, struct address_space *mapping,
+                        struct list_head *pages, unsigned nr_pages)
 {
-       int ret = 1;
+       struct afs_vnode *vnode;
+       int ret = 0;
 
-       _enter("{%lu},%lu", page->index, offset);
+       _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
 
-       BUG_ON(!PageLocked(page));
+       vnode = AFS_FS_I(mapping->host);
+       if (vnode->flags & AFS_VNODE_DELETED) {
+               _leave(" = -ESTALE");
+               return -ESTALE;
+       }
 
-       if (PagePrivate(page)) {
-               /* We release buffers only if the entire page is being
-                * invalidated.
-                * The get_block cached value has been unconditionally
-                * invalidated, so real IO is not possible anymore.
-                */
-               if (offset == 0) {
-                       BUG_ON(!PageLocked(page));
-
-                       ret = 0;
-                       if (!PageWriteback(page))
-                               ret = page->mapping->a_ops->releasepage(page,
-                                                                       0);
-                       /* possibly should BUG_ON(!ret); - neilb */
-               }
+       /* attempt to read as many of the pages as possible */
+#ifdef CONFIG_AFS_FSCACHE
+       ret = fscache_read_or_alloc_pages(vnode->cache,
+                                         mapping,
+                                         pages,
+                                         &nr_pages,
+                                         afs_file_readpage_read_complete,
+                                         NULL,
+                                         mapping_gfp_mask(mapping));
+#else
+       ret = -ENOBUFS;
+#endif
+
+       switch (ret) {
+               /* all pages are being read from the cache */
+       case 0:
+               BUG_ON(!list_empty(pages));
+               BUG_ON(nr_pages != 0);
+               _leave(" = 0 [reading all]");
+               return 0;
+
+               /* there were pages that couldn't be read from the cache */
+       case -ENODATA:
+       case -ENOBUFS:
+               break;
+
+               /* other error */
+       default:
+               _leave(" = %d", ret);
+               return ret;
        }
 
-       _leave(" = %d", ret);
+       /* load the missing pages from the network */
+       ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+
+       _leave(" = %d [netting]", ret);
+       return ret;
 }
 
 /*
@@ -273,25 +282,82 @@ static int afs_launder_page(struct page *page)
 }
 
 /*
- * release a page and cleanup its private data
+ * invalidate part or all of a page
+ * - release a page and clean up its private data if offset is 0 (indicating
+ *   the entire page)
+ */
+static void afs_invalidatepage(struct page *page, unsigned long offset)
+{
+       struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
+
+       _enter("{%lu},%lu", page->index, offset);
+
+       BUG_ON(!PageLocked(page));
+
+       /* we clean up only if the entire page is being invalidated */
+       if (offset == 0) {
+#ifdef CONFIG_AFS_FSCACHE
+               if (PageFsCache(page)) {
+                       struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+                       fscache_wait_on_page_write(vnode->cache, page);
+                       fscache_uncache_page(vnode->cache, page);
+                       ClearPageFsCache(page);
+               }
+#endif
+
+               if (PagePrivate(page)) {
+                       if (wb && !PageWriteback(page)) {
+                               set_page_private(page, 0);
+                               afs_put_writeback(wb);
+                       }
+
+                       if (!page_private(page))
+                               ClearPagePrivate(page);
+               }
+       }
+
+       _leave("");
+}
+
+/*
+ * release a page and clean up its private state if it's not busy
+ * - return true if the page can now be released, false if not
  */
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+       struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
        struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-       struct afs_writeback *wb;
 
        _enter("{{%x:%u}[%lu],%lx},%x",
               vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
               gfp_flags);
 
+       /* deny if page is being written to the cache and the caller hasn't
+        * elected to wait */
+#ifdef CONFIG_AFS_FSCACHE
+       if (PageFsCache(page)) {
+               if (fscache_check_page_write(vnode->cache, page)) {
+                       if (!(gfp_flags & __GFP_WAIT)) {
+                               _leave(" = F [cache busy]");
+                               return 0;
+                       }
+                       fscache_wait_on_page_write(vnode->cache, page);
+               }
+
+               fscache_uncache_page(vnode->cache, page);
+               ClearPageFsCache(page);
+       }
+#endif
+
        if (PagePrivate(page)) {
-               wb = (struct afs_writeback *) page_private(page);
-               ASSERT(wb != NULL);
-               set_page_private(page, 0);
+               if (wb) {
+                       set_page_private(page, 0);
+                       afs_put_writeback(wb);
+               }
                ClearPagePrivate(page);
-               afs_put_writeback(wb);
        }
 
-       _leave(" = 0");
-       return 0;
+       /* indicate that the page can be released */
+       _leave(" = T");
+       return 1;
 }
index bb47217f6a18478abd4d35215f4fdda1135845a4..c048f06587512c93535e7ca1221d6d7b3ee77f02 100644 (file)
@@ -61,6 +61,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
                return -EBADMSG;
        }
 
+#ifdef CONFIG_AFS_FSCACHE
+       if (vnode->status.size != inode->i_size)
+               fscache_attr_changed(vnode->cache);
+#endif
+
        inode->i_nlink          = vnode->status.nlink;
        inode->i_uid            = vnode->status.owner;
        inode->i_gid            = 0;
@@ -149,15 +154,6 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
                return inode;
        }
 
-#ifdef AFS_CACHING_SUPPORT
-       /* set up caching before reading the status, as fetch-status reads the
-        * first page of symlinks to see if they're really mntpts */
-       cachefs_acquire_cookie(vnode->volume->cache,
-                              NULL,
-                              vnode,
-                              &vnode->cache);
-#endif
-
        if (!status) {
                /* it's a remotely extant inode */
                set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
@@ -183,6 +179,15 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
                }
        }
 
+       /* set up caching before mapping the status, as map-status reads the
+        * first page of symlinks to see if they're really mountpoints */
+       inode->i_size = vnode->status.size;
+#ifdef CONFIG_AFS_FSCACHE
+       vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
+                                             &afs_vnode_cache_index_def,
+                                             vnode);
+#endif
+
        ret = afs_inode_map_status(vnode, key);
        if (ret < 0)
                goto bad_inode;
@@ -196,6 +201,10 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 
        /* failure */
 bad_inode:
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_relinquish_cookie(vnode->cache, 0);
+       vnode->cache = NULL;
+#endif
        iget_failed(inode);
        _leave(" = %d [bad]", ret);
        return ERR_PTR(ret);
@@ -340,8 +349,8 @@ void afs_clear_inode(struct inode *inode)
        ASSERT(list_empty(&vnode->writebacks));
        ASSERT(!vnode->cb_promised);
 
-#ifdef AFS_CACHING_SUPPORT
-       cachefs_relinquish_cookie(vnode->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_relinquish_cookie(vnode->cache, 0);
        vnode->cache = NULL;
 #endif
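
inode.c also notifies the cache when the server-reported size no longer matches the inode, and acquires the vnode cookie before mapping the status (fetch-status may read the first page of a symlink). A tiny illustration of the size-change notification, assuming the cookie and the new size are already in hand:

    /* Illustration: tell FS-Cache the backing object's size attribute changed,
     * as afs_inode_map_status() now does above. */
    static void example_note_size_change(struct fscache_cookie *cookie,
                                         struct inode *inode, loff_t new_size)
    {
            if (new_size != i_size_read(inode))
                    fscache_attr_changed(cookie);
    }
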
 
index 67f259d99cd6f11afae3b9296b74f0f5bdde3065..106be66dafd2ca7edc9388dac55d8a218ab52dc6 100644 (file)
@@ -21,6 +21,7 @@
 
 #include "afs.h"
 #include "afs_vl.h"
+#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
@@ -193,8 +194,8 @@ struct afs_cell {
        struct key              *anonymous_key; /* anonymous user key for this cell */
        struct list_head        proc_link;      /* /proc cell list link */
        struct proc_dir_entry   *proc_dir;      /* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-       struct cachefs_cookie   *cache;         /* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+       struct fscache_cookie   *cache;         /* caching cookie */
 #endif
 
        /* server record management */
@@ -249,8 +250,8 @@ struct afs_vlocation {
        struct list_head        grave;          /* link in master graveyard list */
        struct list_head        update;         /* link in master update list */
        struct afs_cell         *cell;          /* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-       struct cachefs_cookie   *cache;         /* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+       struct fscache_cookie   *cache;         /* caching cookie */
 #endif
        struct afs_cache_vlocation vldb;        /* volume information DB record */
        struct afs_volume       *vols[3];       /* volume access record pointer (index by type) */
@@ -302,8 +303,8 @@ struct afs_volume {
        atomic_t                usage;
        struct afs_cell         *cell;          /* cell to which belongs (unrefd ptr) */
        struct afs_vlocation    *vlocation;     /* volume location */
-#ifdef AFS_CACHING_SUPPORT
-       struct cachefs_cookie   *cache;         /* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+       struct fscache_cookie   *cache;         /* caching cookie */
 #endif
        afs_volid_t             vid;            /* volume ID */
        afs_voltype_t           type;           /* type of volume */
@@ -333,8 +334,8 @@ struct afs_vnode {
        struct afs_server       *server;        /* server currently supplying this file */
        struct afs_fid          fid;            /* the file identifier for this inode */
        struct afs_file_status  status;         /* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-       struct cachefs_cookie   *cache;         /* caching cookie */
+#ifdef CONFIG_AFS_FSCACHE
+       struct fscache_cookie   *cache;         /* caching cookie */
 #endif
        struct afs_permits      *permits;       /* cache of permits so far obtained */
        struct mutex            permits_lock;   /* lock for altering permits list */
@@ -427,6 +428,22 @@ struct afs_uuid {
 };
 
 /*****************************************************************************/
+/*
+ * cache.c
+ */
+#ifdef CONFIG_AFS_FSCACHE
+extern struct fscache_netfs afs_cache_netfs;
+extern struct fscache_cookie_def afs_cell_cache_index_def;
+extern struct fscache_cookie_def afs_vlocation_cache_index_def;
+extern struct fscache_cookie_def afs_volume_cache_index_def;
+extern struct fscache_cookie_def afs_vnode_cache_index_def;
+#else
+#define afs_cell_cache_index_def       (*(struct fscache_cookie_def *) NULL)
+#define afs_vlocation_cache_index_def  (*(struct fscache_cookie_def *) NULL)
+#define afs_volume_cache_index_def     (*(struct fscache_cookie_def *) NULL)
+#define afs_vnode_cache_index_def      (*(struct fscache_cookie_def *) NULL)
+#endif
+
 /*
  * callback.c
  */
@@ -446,9 +463,6 @@ extern void afs_callback_update_kill(void);
  */
 extern struct rw_semaphore afs_proc_cells_sem;
 extern struct list_head afs_proc_cells;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_cache_cell_index_def;
-#endif
 
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
@@ -554,9 +568,6 @@ extern void afs_clear_inode(struct inode *);
  * main.c
  */
 extern struct afs_uuid afs_uuid;
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_netfs afs_cache_netfs;
-#endif
 
 /*
  * misc.c
@@ -637,10 +648,6 @@ extern int afs_get_MAC_address(u8 *, size_t);
 /*
  * vlclient.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
 extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
                                    const char *, struct afs_cache_vlocation *,
                                    const struct afs_wait_mode *);
@@ -664,12 +671,6 @@ extern void afs_vlocation_purge(void);
 /*
  * vnode.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
 static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
 {
        return container_of(inode, struct afs_vnode, vfs_inode);
@@ -711,10 +712,6 @@ extern int afs_vnode_release_lock(struct afs_vnode *, struct key *);
 /*
  * volume.c
  */
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
 #define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
 
 extern void afs_put_volume(struct afs_volume *);
index 2d3e5d4fb9f781b7b360fea9bf3c2896b0771504..66d54d348c55f2c232bd362f1b432ab242d4256b 100644 (file)
@@ -1,6 +1,6 @@
 /* AFS client file system
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -29,18 +29,6 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-#ifdef AFS_CACHING_SUPPORT
-static struct cachefs_netfs_operations afs_cache_ops = {
-       .get_page_cookie        = afs_cache_get_page_cookie,
-};
-
-struct cachefs_netfs afs_cache_netfs = {
-       .name                   = "afs",
-       .version                = 0,
-       .ops                    = &afs_cache_ops,
-};
-#endif
-
 struct afs_uuid afs_uuid;
 
 /*
@@ -104,10 +92,9 @@ static int __init afs_init(void)
        if (ret < 0)
                return ret;
 
-#ifdef AFS_CACHING_SUPPORT
+#ifdef CONFIG_AFS_FSCACHE
        /* we want to be able to cache */
-       ret = cachefs_register_netfs(&afs_cache_netfs,
-                                    &afs_cache_cell_index_def);
+       ret = fscache_register_netfs(&afs_cache_netfs);
        if (ret < 0)
                goto error_cache;
 #endif
@@ -142,8 +129,8 @@ error_fs:
 error_open_socket:
 error_vl_update_init:
 error_cell_init:
-#ifdef AFS_CACHING_SUPPORT
-       cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
        afs_callback_update_kill();
@@ -175,8 +162,8 @@ static void __exit afs_exit(void)
        afs_vlocation_purge();
        flush_scheduled_work();
        afs_cell_purge();
-#ifdef AFS_CACHING_SUPPORT
-       cachefs_unregister_netfs(&afs_cache_netfs);
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_unregister_netfs(&afs_cache_netfs);
 #endif
        afs_proc_cleanup();
        rcu_barrier();
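
With the old AFS_CACHING_SUPPORT plumbing gone, the module registers itself with FS-Cache once at init and unregisters on the error path and at exit. A stripped-down sketch of that registration pairing, showing only the two calls from the patch with the unrelated init steps elided:

    /* Sketch of netfs registration at module init/exit, mirroring afs_init()
     * and afs_exit() above; everything else is omitted. */
    static int __init example_init(void)
    {
    #ifdef CONFIG_AFS_FSCACHE
            int ret = fscache_register_netfs(&afs_cache_netfs);
            if (ret < 0)
                    return ret;     /* nothing else to unwind yet */
    #endif
            /* ... remaining subsystem init would go here ... */
            return 0;
    }

    static void __exit example_exit(void)
    {
            /* ... tear down everything that depends on the cache first ... */
    #ifdef CONFIG_AFS_FSCACHE
            fscache_unregister_netfs(&afs_cache_netfs);
    #endif
    }
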
index 78db4953a80047847b2d6098ca0c8bf4b30a76ef..2b9e2d03a3902d686940339ce349f88e9af51324 100644 (file)
@@ -173,9 +173,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
        if (PageError(page))
                goto error;
 
-       buf = kmap(page);
+       buf = kmap_atomic(page, KM_USER0);
        memcpy(devname, buf, size);
-       kunmap(page);
+       kunmap_atomic(buf, KM_USER0);
        page_cache_release(page);
        page = NULL;
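
mntpt.c switches from kmap() to the cheaper atomic mapping for its short copy out of a page; the constraint is that nothing may sleep between map and unmap. A minimal sketch of that pattern with the same KM_USER0 slot used above:

    /* Sketch of the kmap_atomic() copy pattern: map, memcpy, unmap.
     * No sleeping is allowed while the atomic mapping is held. */
    static void example_copy_from_page(struct page *page, void *dst, size_t len)
    {
            void *buf = kmap_atomic(page, KM_USER0);

            memcpy(dst, buf, len);
            kunmap_atomic(buf, KM_USER0);
    }
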
 
index 849fc3160cb5b372c2626545f926ce185c22762b..ec2a7431e458b52c89795f6ba9c01a2efe8aa254 100644 (file)
@@ -281,9 +281,8 @@ static void afs_vlocation_apply_update(struct afs_vlocation *vl,
 
        vl->vldb = *vldb;
 
-#ifdef AFS_CACHING_SUPPORT
-       /* update volume entry in local cache */
-       cachefs_update_cookie(vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_update_cookie(vl->cache);
 #endif
 }
 
@@ -304,11 +303,9 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
        memset(&vldb, 0, sizeof(vldb));
 
        /* see if we have an in-cache copy (will set vl->valid if there is) */
-#ifdef AFS_CACHING_SUPPORT
-       cachefs_acquire_cookie(cell->cache,
-                              &afs_volume_cache_index_def,
-                              vlocation,
-                              &vl->cache);
+#ifdef CONFIG_AFS_FSCACHE
+       vl->cache = fscache_acquire_cookie(vl->cell->cache,
+                                          &afs_vlocation_cache_index_def, vl);
 #endif
 
        if (vl->valid) {
@@ -420,6 +417,11 @@ fill_in_record:
        spin_unlock(&vl->lock);
        wake_up(&vl->waitq);
 
+       /* update volume entry in local cache */
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_update_cookie(vl->cache);
+#endif
+
        /* schedule for regular updates */
        afs_vlocation_queue_for_updates(vl);
        goto success;
@@ -465,7 +467,7 @@ found_in_memory:
        spin_unlock(&vl->lock);
 
 success:
-       _leave(" = %p",vl);
+       _leave(" = %p", vl);
        return vl;
 
 error_abandon:
@@ -523,10 +525,9 @@ static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
        _enter("%p", vl);
 
-#ifdef AFS_CACHING_SUPPORT
-       cachefs_relinquish_cookie(vl->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_relinquish_cookie(vl->cache, 0);
 #endif
-
        afs_put_cell(vl->cell);
        kfree(vl);
 }
index 8bab0e3437f9384507721cb68e93245c4c154ff6..a353e69e2391bc5f8bb3dd2970f19aeade9ea7e8 100644 (file)
@@ -124,13 +124,11 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
        }
 
        /* attach the cache and volume location */
-#ifdef AFS_CACHING_SUPPORT
-       cachefs_acquire_cookie(vlocation->cache,
-                              &afs_vnode_cache_index_def,
-                              volume,
-                              &volume->cache);
+#ifdef CONFIG_AFS_FSCACHE
+       volume->cache = fscache_acquire_cookie(vlocation->cache,
+                                              &afs_volume_cache_index_def,
+                                              volume);
 #endif
-
        afs_get_vlocation(vlocation);
        volume->vlocation = vlocation;
 
@@ -194,8 +192,8 @@ void afs_put_volume(struct afs_volume *volume)
        up_write(&vlocation->cell->vl_sem);
 
        /* finish cleaning up the volume */
-#ifdef AFS_CACHING_SUPPORT
-       cachefs_relinquish_cookie(volume->cache, 0);
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_relinquish_cookie(volume->cache, 0);
 #endif
        afs_put_vlocation(vlocation);
 
index 3fb36d433621dd2bb71edcc757dbe22390ff2640..c2e7a7ff008054ad63fb8ae33677b96b359bd90f 100644 (file)
@@ -780,3 +780,24 @@ int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
        _leave(" = %d", ret);
        return ret;
 }
+
+/*
+ * notification that a previously read-only page is about to become writable
+ * - if it returns an error, the caller will deliver a bus error signal
+ */
+int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+       struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host);
+
+       _enter("{{%x:%u}},{%lx}",
+              vnode->fid.vid, vnode->fid.vnode, page->index);
+
+       /* wait for the page to be written to the cache before we allow it to
+        * be modified */
+#ifdef CONFIG_AFS_FSCACHE
+       fscache_wait_on_page_write(vnode->cache, page);
+#endif
+
+       _leave(" = 0");
+       return 0;
+}
index c84ca1f5259a5408f538d8254f939677567924a4..51bfdfc8fcdac00b5f13599bed87922563a38f07 100644 (file)
@@ -20,7 +20,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/ftrace.h>
 #include "async-thread.h"
 
 #define WORK_QUEUED_BIT 0
@@ -195,6 +194,9 @@ again_locked:
                                if (!list_empty(&worker->pending))
                                        continue;
 
+                               if (kthread_should_stop())
+                                       break;
+
                                /* still no more work?, sleep for real */
                                spin_lock_irq(&worker->lock);
                                set_current_state(TASK_INTERRUPTIBLE);
@@ -208,7 +210,8 @@ again_locked:
                                worker->working = 0;
                                spin_unlock_irq(&worker->lock);
 
-                               schedule();
+                               if (!kthread_should_stop())
+                                       schedule();
                        }
                        __set_current_state(TASK_RUNNING);
                }
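
The two hunks above close a shutdown race in the btrfs worker threads: kthread_should_stop() is now re-checked once the pending list turns up empty and again right before schedule(), so a stop request that lands between those points cannot leave the thread asleep forever. The general idiom, sketched here with placeholder names (example_queue and example_run_one are assumptions, not btrfs code):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/list.h>

struct example_queue {
	spinlock_t lock;
	struct list_head pending;
};

static void example_run_one(struct example_queue *q)
{
	/* process one queued item; body omitted in this sketch */
}

static int example_worker(void *arg)
{
	struct example_queue *q = arg;

	while (!kthread_should_stop()) {
		spin_lock_irq(&q->lock);
		if (list_empty(&q->pending)) {
			set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&q->lock);
			/* re-check after setting the sleep state so a stop
			 * that raced with the list check does not strand us
			 * in schedule() */
			if (!kthread_should_stop())
				schedule();
			__set_current_state(TASK_RUNNING);
			continue;
		}
		spin_unlock_irq(&q->lock);
		example_run_one(q);
	}
	return 0;
}
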
index dbb7241246335373dde1fa8597744faec6748942..e5b2533b691a3040f108d517efea04ccfdcc6117 100644 (file)
@@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  * readahead one full node of leaves, finding things that are close
  * to the block in 'slot', and triggering ra on them.
  */
-static noinline void reada_for_search(struct btrfs_root *root,
-                                     struct btrfs_path *path,
-                                     int level, int slot, u64 objectid)
+static void reada_for_search(struct btrfs_root *root,
+                            struct btrfs_path *path,
+                            int level, int slot, u64 objectid)
 {
        struct extent_buffer *node;
        struct btrfs_disk_key disk_key;
@@ -1446,6 +1446,117 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
        }
 }
 
+/*
+ * helper function for btrfs_search_slot.  The goal is to find a block
+ * in cache without setting the path to blocking.  If we find the block
+ * we return zero and the path is unchanged.
+ *
+ * If we can't find the block, we set the path blocking and do some
+ * reada.  -EAGAIN is returned and the search must be repeated.
+ */
+static int
+read_block_for_search(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct btrfs_path *p,
+                      struct extent_buffer **eb_ret, int level, int slot,
+                      struct btrfs_key *key)
+{
+       u64 blocknr;
+       u64 gen;
+       u32 blocksize;
+       struct extent_buffer *b = *eb_ret;
+       struct extent_buffer *tmp;
+
+       blocknr = btrfs_node_blockptr(b, slot);
+       gen = btrfs_node_ptr_generation(b, slot);
+       blocksize = btrfs_level_size(root, level - 1);
+
+       tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+       if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+               *eb_ret = tmp;
+               return 0;
+       }
+
+       /*
+        * reduce lock contention at high levels
+        * of the btree by dropping locks before
+        * we read.
+        */
+       btrfs_release_path(NULL, p);
+       if (tmp)
+               free_extent_buffer(tmp);
+       if (p->reada)
+               reada_for_search(root, p, level, slot, key->objectid);
+
+       tmp = read_tree_block(root, blocknr, blocksize, gen);
+       if (tmp)
+               free_extent_buffer(tmp);
+       return -EAGAIN;
+}
+
+/*
+ * helper function for btrfs_search_slot.  This does all of the checks
+ * for node-level blocks and does any balancing required based on
+ * the ins_len.
+ *
+ * If no extra work was required, zero is returned.  If we had to
+ * drop the path, -EAGAIN is returned and btrfs_search_slot must
+ * start over
+ */
+static int
+setup_nodes_for_search(struct btrfs_trans_handle *trans,
+                      struct btrfs_root *root, struct btrfs_path *p,
+                      struct extent_buffer *b, int level, int ins_len)
+{
+       int ret;
+       if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
+           BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
+               int sret;
+
+               sret = reada_for_balance(root, p, level);
+               if (sret)
+                       goto again;
+
+               btrfs_set_path_blocking(p);
+               sret = split_node(trans, root, p, level);
+               btrfs_clear_path_blocking(p, NULL);
+
+               BUG_ON(sret > 0);
+               if (sret) {
+                       ret = sret;
+                       goto done;
+               }
+               b = p->nodes[level];
+       } else if (ins_len < 0 && btrfs_header_nritems(b) <
+                  BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+               int sret;
+
+               sret = reada_for_balance(root, p, level);
+               if (sret)
+                       goto again;
+
+               btrfs_set_path_blocking(p);
+               sret = balance_level(trans, root, p, level);
+               btrfs_clear_path_blocking(p, NULL);
+
+               if (sret) {
+                       ret = sret;
+                       goto done;
+               }
+               b = p->nodes[level];
+               if (!b) {
+                       btrfs_release_path(NULL, p);
+                       goto again;
+               }
+               BUG_ON(btrfs_header_nritems(b) == 1);
+       }
+       return 0;
+
+again:
+       ret = -EAGAIN;
+done:
+       return ret;
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
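
Both new helpers share one contract: returning 0 means the path is untouched and still locked, while -EAGAIN means the locks were dropped (typically after releasing the path and issuing readahead) and the whole descent must restart from the top, which is exactly how btrfs_search_slot consumes them further down in this patch. A condensed, self-contained sketch of that caller-side shape, with placeholder names rather than the real btrfs entry points:

#include <linux/errno.h>
#include <linux/types.h>

struct example_ctx {
	int attempts;
};

/* stand-in for read_block_for_search(): pretend the first attempt had to
 * drop locks and the retry then succeeds */
static int example_read_block(struct example_ctx *ctx)
{
	return ctx->attempts++ ? 0 : -EAGAIN;
}

static int example_descend(struct example_ctx *ctx)
{
	int ret;

again:
	ret = example_read_block(ctx);	/* may drop all locks and return -EAGAIN */
	if (ret == -EAGAIN)
		goto again;		/* locks are gone: restart the search */
	if (ret)
		return ret;		/* hard error */

	/* path still locked and positioned: keep walking down */
	return 0;
}
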
@@ -1464,16 +1575,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                      ins_len, int cow)
 {
        struct extent_buffer *b;
-       struct extent_buffer *tmp;
        int slot;
        int ret;
        int level;
-       int should_reada = p->reada;
        int lowest_unlock = 1;
-       int blocksize;
        u8 lowest_level = 0;
-       u64 blocknr;
-       u64 gen;
 
        lowest_level = p->lowest_level;
        WARN_ON(lowest_level && ins_len > 0);
@@ -1502,7 +1608,11 @@ again:
                if (cow) {
                        int wret;
 
-                       /* is a cow on this block not required */
+                       /*
+                        * if we don't really need to cow this block
+                        * then we don't want to set the path blocking,
+                        * so we test it here
+                        */
                        if (btrfs_header_generation(b) == trans->transid &&
                            btrfs_header_owner(b) == root->root_key.objectid &&
                            !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
@@ -1557,51 +1667,15 @@ cow_done:
                        if (ret && slot > 0)
                                slot -= 1;
                        p->slots[level] = slot;
-                       if ((p->search_for_split || ins_len > 0) &&
-                           btrfs_header_nritems(b) >=
-                           BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-                               int sret;
-
-                               sret = reada_for_balance(root, p, level);
-                               if (sret)
-                                       goto again;
-
-                               btrfs_set_path_blocking(p);
-                               sret = split_node(trans, root, p, level);
-                               btrfs_clear_path_blocking(p, NULL);
-
-                               BUG_ON(sret > 0);
-                               if (sret) {
-                                       ret = sret;
-                                       goto done;
-                               }
-                               b = p->nodes[level];
-                               slot = p->slots[level];
-                       } else if (ins_len < 0 &&
-                                  btrfs_header_nritems(b) <
-                                  BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
-                               int sret;
-
-                               sret = reada_for_balance(root, p, level);
-                               if (sret)
-                                       goto again;
-
-                               btrfs_set_path_blocking(p);
-                               sret = balance_level(trans, root, p, level);
-                               btrfs_clear_path_blocking(p, NULL);
+                       ret = setup_nodes_for_search(trans, root, p, b, level,
+                                                    ins_len);
+                       if (ret == -EAGAIN)
+                               goto again;
+                       else if (ret)
+                               goto done;
+                       b = p->nodes[level];
+                       slot = p->slots[level];
 
-                               if (sret) {
-                                       ret = sret;
-                                       goto done;
-                               }
-                               b = p->nodes[level];
-                               if (!b) {
-                                       btrfs_release_path(NULL, p);
-                                       goto again;
-                               }
-                               slot = p->slots[level];
-                               BUG_ON(btrfs_header_nritems(b) == 1);
-                       }
                        unlock_up(p, level, lowest_unlock);
 
                        /* this is only true while dropping a snapshot */
@@ -1610,44 +1684,11 @@ cow_done:
                                goto done;
                        }
 
-                       blocknr = btrfs_node_blockptr(b, slot);
-                       gen = btrfs_node_ptr_generation(b, slot);
-                       blocksize = btrfs_level_size(root, level - 1);
+                       ret = read_block_for_search(trans, root, p,
+                                                   &b, level, slot, key);
+                       if (ret == -EAGAIN)
+                               goto again;
 
-                       tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-                       if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-                               b = tmp;
-                       } else {
-                               /*
-                                * reduce lock contention at high levels
-                                * of the btree by dropping locks before
-                                * we read.
-                                */
-                               if (level > 0) {
-                                       btrfs_release_path(NULL, p);
-                                       if (tmp)
-                                               free_extent_buffer(tmp);
-                                       if (should_reada)
-                                               reada_for_search(root, p,
-                                                                level, slot,
-                                                                key->objectid);
-
-                                       tmp = read_tree_block(root, blocknr,
-                                                        blocksize, gen);
-                                       if (tmp)
-                                               free_extent_buffer(tmp);
-                                       goto again;
-                               } else {
-                                       btrfs_set_path_blocking(p);
-                                       if (tmp)
-                                               free_extent_buffer(tmp);
-                                       if (should_reada)
-                                               reada_for_search(root, p,
-                                                                level, slot,
-                                                                key->objectid);
-                                       b = read_node_slot(root, b, slot);
-                               }
-                       }
                        if (!p->skip_locking) {
                                int lret;
 
@@ -2116,8 +2157,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
        BUG_ON(!path->nodes[level]);
        lower = path->nodes[level];
        nritems = btrfs_header_nritems(lower);
-       if (slot > nritems)
-               BUG();
+       BUG_ON(slot > nritems);
        if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
                BUG();
        if (slot != nritems) {
@@ -4086,28 +4126,44 @@ next:
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
        int slot;
-       int level = 1;
+       int level;
        struct extent_buffer *c;
-       struct extent_buffer *next = NULL;
+       struct extent_buffer *next;
        struct btrfs_key key;
        u32 nritems;
        int ret;
+       int old_spinning = path->leave_spinning;
+       int force_blocking = 0;
 
        nritems = btrfs_header_nritems(path->nodes[0]);
        if (nritems == 0)
                return 1;
 
-       btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+       /*
+        * we take the blocks in an order that upsets lockdep.  Using
+        * blocking mode is the only way around it.
+        */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       force_blocking = 1;
+#endif
 
+       btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+again:
+       level = 1;
+       next = NULL;
        btrfs_release_path(root, path);
+
        path->keep_locks = 1;
+
+       if (!force_blocking)
+               path->leave_spinning = 1;
+
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        path->keep_locks = 0;
 
        if (ret < 0)
                return ret;
 
-       btrfs_set_path_blocking(path);
        nritems = btrfs_header_nritems(path->nodes[0]);
        /*
         * by releasing the path above we dropped all our locks.  A balance
@@ -4117,19 +4173,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
         */
        if (nritems > 0 && path->slots[0] < nritems - 1) {
                path->slots[0]++;
+               ret = 0;
                goto done;
        }
 
        while (level < BTRFS_MAX_LEVEL) {
-               if (!path->nodes[level])
-                       return 1;
+               if (!path->nodes[level]) {
+                       ret = 1;
+                       goto done;
+               }
 
                slot = path->slots[level] + 1;
                c = path->nodes[level];
                if (slot >= btrfs_header_nritems(c)) {
                        level++;
-                       if (level == BTRFS_MAX_LEVEL)
-                               return 1;
+                       if (level == BTRFS_MAX_LEVEL) {
+                               ret = 1;
+                               goto done;
+                       }
                        continue;
                }
 
@@ -4138,16 +4199,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
                        free_extent_buffer(next);
                }
 
-               /* the path was set to blocking above */
-               if (level == 1 && (path->locks[1] || path->skip_locking) &&
-                   path->reada)
-                       reada_for_search(root, path, level, slot, 0);
+               next = c;
+               ret = read_block_for_search(NULL, root, path, &next, level,
+                                           slot, &key);
+               if (ret == -EAGAIN)
+                       goto again;
 
-               next = read_node_slot(root, c, slot);
                if (!path->skip_locking) {
-                       btrfs_assert_tree_locked(c);
-                       btrfs_tree_lock(next);
-                       btrfs_set_lock_blocking(next);
+                       ret = btrfs_try_spin_lock(next);
+                       if (!ret) {
+                               btrfs_set_path_blocking(path);
+                               btrfs_tree_lock(next);
+                               if (!force_blocking)
+                                       btrfs_clear_path_blocking(path, next);
+                       }
+                       if (force_blocking)
+                               btrfs_set_lock_blocking(next);
                }
                break;
        }
@@ -4157,27 +4224,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
                c = path->nodes[level];
                if (path->locks[level])
                        btrfs_tree_unlock(c);
+
                free_extent_buffer(c);
                path->nodes[level] = next;
                path->slots[level] = 0;
                if (!path->skip_locking)
                        path->locks[level] = 1;
+
                if (!level)
                        break;
 
-               btrfs_set_path_blocking(path);
-               if (level == 1 && path->locks[1] && path->reada)
-                       reada_for_search(root, path, level, slot, 0);
-               next = read_node_slot(root, next, 0);
+               ret = read_block_for_search(NULL, root, path, &next, level,
+                                           0, &key);
+               if (ret == -EAGAIN)
+                       goto again;
+
                if (!path->skip_locking) {
                        btrfs_assert_tree_locked(path->nodes[level]);
-                       btrfs_tree_lock(next);
-                       btrfs_set_lock_blocking(next);
+                       ret = btrfs_try_spin_lock(next);
+                       if (!ret) {
+                               btrfs_set_path_blocking(path);
+                               btrfs_tree_lock(next);
+                               if (!force_blocking)
+                                       btrfs_clear_path_blocking(path, next);
+                       }
+                       if (force_blocking)
+                               btrfs_set_lock_blocking(next);
                }
        }
+       ret = 0;
 done:
        unlock_up(path, 0, 1);
-       return 0;
+       path->leave_spinning = old_spinning;
+       if (!old_spinning)
+               btrfs_set_path_blocking(path);
+
+       return ret;
 }
 
 /*
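
btrfs_next_leaf now takes neighbouring blocks with an opportunistic spin lock and only marks the whole path blocking when that fails (or unconditionally under CONFIG_DEBUG_LOCK_ALLOC, since the lock order here upsets lockdep), then restores the caller's leave_spinning setting on the way out. A generic, self-contained analogue of that try-then-block shape; btrfs uses its own extent-buffer locks (btrfs_try_spin_lock/btrfs_tree_lock), so the mutex here only illustrates the pattern.

#include <linux/mutex.h>

/* Try the cheap, non-sleeping acquisition first; pay for a sleeping
 * acquisition only when the lock is contended. */
static void example_take_lock(struct mutex *lock, bool *went_blocking)
{
	if (mutex_trylock(lock)) {
		*went_blocking = false;	/* fast path, no sleeping state needed */
		return;
	}
	*went_blocking = true;		/* record that we fell back */
	mutex_lock(lock);		/* contended path, may sleep */
}
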
index 9417713542a2a5867490d0308241f610832400ee..ad96495dedc52f79a05cdbccfea20383b871164f 100644 (file)
@@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 #define BTRFS_FT_MAX           9
 
 /*
- * the key defines the order in the tree, and so it also defines (optimal)
- * block layout.  objectid corresonds to the inode number.  The flags
- * tells us things about the object, and is a kind of stream selector.
- * so for a given inode, keys with flags of 1 might refer to the inode
- * data, flags of 2 may point to file data in the btree and flags == 3
- * may point to extents.
+ * The key defines the order in the tree, and so it also defines (optimal)
+ * block layout.
+ *
+ * objectid corresponds to the inode number.
+ *
+ * type tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with type of 1 might refer to the inode data,
+ * type of 2 may point to file data in the btree and type == 3 may point to
+ * extents.
  *
  * offset is the starting byte offset for this key in the stream.
  *
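
The reworked comment spells out the three parts of a btrfs key: objectid (the inode number), type (a stream selector), and offset (the byte offset within that stream). A small sketch of filling one in, assuming the declarations from this ctree.h; BTRFS_EXTENT_DATA_KEY is used only as an example type constant.

/* Sketch: the three-part key described above, filled in for one item. */
static void example_fill_key(struct btrfs_key *key, u64 ino, u64 file_pos)
{
	key->objectid = ino;				/* which object: the inode number */
	btrfs_set_key_type(key, BTRFS_EXTENT_DATA_KEY);	/* stream selector */
	key->offset = file_pos;				/* byte offset within that stream */
}
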
@@ -200,7 +203,7 @@ struct btrfs_dev_item {
 
        /*
         * starting byte of this partition on the device,
-        * to allowr for stripe alignment in the future
+        * to allow for stripe alignment in the future
         */
        __le64 start_offset;
 
@@ -633,18 +636,35 @@ struct btrfs_space_info {
        struct rw_semaphore groups_sem;
 };
 
-struct btrfs_free_space {
-       struct rb_node bytes_index;
-       struct rb_node offset_index;
-       u64 offset;
-       u64 bytes;
+/*
+ * free clusters are used to claim free space in relatively large chunks,
+ * allowing us to do less seeky writes.  They are used for all metadata
+ * allocations and data allocations in ssd mode.
+ */
+struct btrfs_free_cluster {
+       spinlock_t lock;
+       spinlock_t refill_lock;
+       struct rb_root root;
+
+       /* largest extent in this cluster */
+       u64 max_size;
+
+       /* first extent starting offset */
+       u64 window_start;
+
+       struct btrfs_block_group_cache *block_group;
+       /*
+        * when a cluster is allocated from a block group, we put the
+        * cluster onto a list in the block group so that it can
+        * be freed before the block group is freed.
+        */
+       struct list_head block_group_list;
 };
 
 struct btrfs_block_group_cache {
        struct btrfs_key key;
        struct btrfs_block_group_item item;
        spinlock_t lock;
-       struct mutex alloc_mutex;
        struct mutex cache_mutex;
        u64 pinned;
        u64 reserved;
@@ -656,6 +676,7 @@ struct btrfs_block_group_cache {
        struct btrfs_space_info *space_info;
 
        /* free space cache stuff */
+       spinlock_t tree_lock;
        struct rb_root free_space_bytes;
        struct rb_root free_space_offset;
 
@@ -667,6 +688,11 @@ struct btrfs_block_group_cache {
 
        /* usage count */
        atomic_t count;
+
+       /* List of struct btrfs_free_clusters for this block group.
+        * Today it will only have one thing on it, but that may change
+        */
+       struct list_head cluster_list;
 };
 
 struct btrfs_leaf_ref_tree {
@@ -728,7 +754,6 @@ struct btrfs_fs_info {
        struct mutex tree_log_mutex;
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
-       struct mutex pinned_mutex;
        struct mutex chunk_mutex;
        struct mutex drop_mutex;
        struct mutex volume_mutex;
@@ -839,8 +864,12 @@ struct btrfs_fs_info {
        spinlock_t delalloc_lock;
        spinlock_t new_trans_lock;
        u64 delalloc_bytes;
-       u64 last_alloc;
-       u64 last_data_alloc;
+
+       /* data_alloc_cluster is only used in ssd mode */
+       struct btrfs_free_cluster data_alloc_cluster;
+
+       /* all metadata allocations go through this cluster */
+       struct btrfs_free_cluster meta_alloc_cluster;
 
        spinlock_t ref_cache_lock;
        u64 total_ref_cache_size;
@@ -932,7 +961,6 @@ struct btrfs_root {
 };
 
 /*
-
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -963,7 +991,7 @@ struct btrfs_root {
 #define BTRFS_EXTENT_CSUM_KEY  128
 
 /*
- * root items point to tree roots.  There are typically in the root
+ * root items point to tree roots.  They are typically in the root
  * tree used by the super block to find all the other trees
  */
 #define BTRFS_ROOT_ITEM_KEY    132
@@ -1010,6 +1038,8 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SSD                        (1 << 3)
 #define BTRFS_MOUNT_DEGRADED           (1 << 4)
 #define BTRFS_MOUNT_COMPRESS           (1 << 5)
+#define BTRFS_MOUNT_NOTREELOG           (1 << 6)
+#define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
 
 #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
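
BTRFS_MOUNT_NOTREELOG and BTRFS_MOUNT_FLUSHONCOMMIT extend the existing mount-option bitmask, which is manipulated through the btrfs_set_opt/btrfs_clear_opt/btrfs_test_opt macros shown here. A hedged example of how a mount-option parser might flip the new bits; the function name and the info->mount_opt field reference are illustrative assumptions, the parser itself is not part of this hunk.

static void example_apply_new_opts(struct btrfs_fs_info *info,
				   int notreelog, int flushoncommit)
{
	if (notreelog)
		btrfs_set_opt(info->mount_opt, NOTREELOG);	/* skip the fsync tree log */
	else
		btrfs_clear_opt(info->mount_opt, NOTREELOG);

	if (flushoncommit)
		btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);	/* flush dirty data at commit */
	else
		btrfs_clear_opt(info->mount_opt, FLUSHONCOMMIT);
}
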
@@ -1748,6 +1778,7 @@ static inline struct dentry *fdentry(struct file *file)
 }
 
 /* extent-tree.c */
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
@@ -2174,21 +2205,4 @@ int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
-/* free-space-cache.c */
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-                        u64 bytenr, u64 size);
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
-                             u64 offset, u64 bytes);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-                           u64 bytenr, u64 size);
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
-                                u64 offset, u64 bytes);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
-                                  *block_group);
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
-                                              *block_group, u64 offset,
-                                              u64 bytes);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
-                          u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 #endif
index cbf7dc8ae3ec2e2b84eb70f65da7cce75de6afbd..d6c01c096a40be707627116382780eb74626649f 100644 (file)
@@ -18,7 +18,6 @@
 
 #include <linux/sched.h>
 #include <linux/sort.h>
-#include <linux/ftrace.h>
 #include "ctree.h"
 #include "delayed-ref.h"
 #include "transaction.h"
index 92d73929d3814b50024196eb5e8688d2ee330222..92caa8035f36f9beecc3c21551722a0eefdd911e 100644 (file)
@@ -38,6 +38,7 @@
 #include "locking.h"
 #include "ref-cache.h"
 #include "tree-log.h"
+#include "free-space-cache.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -1412,8 +1413,6 @@ static int bio_ready_for_csum(struct bio *bio)
 
        ret = extent_range_uptodate(io_tree, start + length,
                                    start + buf_len - 1);
-       if (ret == 1)
-               return ret;
        return ret;
 }
 
@@ -1647,12 +1646,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->drop_mutex);
-       mutex_init(&fs_info->pinned_mutex);
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
        mutex_init(&fs_info->cleaner_mutex);
        mutex_init(&fs_info->volume_mutex);
        mutex_init(&fs_info->tree_reloc_mutex);
+
+       btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+       btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
+
        init_waitqueue_head(&fs_info->transaction_throttle);
        init_waitqueue_head(&fs_info->transaction_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
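
open_ctree() now initializes the per-fs metadata and data clusters before the wait queues. The initializer itself is not visible in this hunk; given the btrfs_free_cluster fields introduced in ctree.h above, it presumably amounts to something like the following sketch (an inference, not the commit's code):

/* Presumed shape of btrfs_init_free_cluster(), inferred from the struct
 * btrfs_free_cluster fields added earlier in this commit. */
void example_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
	cluster->root.rb_node = NULL;		/* empty rb-tree of free extents */
	cluster->max_size = 0;
	cluster->window_start = 0;
	cluster->block_group = NULL;		/* not attached to a group yet */
	INIT_LIST_HEAD(&cluster->block_group_list);
}
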
index f5e7cae63d80f8828f711833b58600a8bce71df9..178df4c67de447e87514e90284a7ccc294dc8df1 100644 (file)
@@ -31,6 +31,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "free-space-cache.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -166,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
        u64 extent_start, extent_end, size;
        int ret;
 
-       mutex_lock(&info->pinned_mutex);
        while (start < end) {
                ret = find_first_extent_bit(&info->pinned_extents, start,
                                            &extent_start, &extent_end,
@@ -192,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }
-       mutex_unlock(&info->pinned_mutex);
 
        return 0;
 }
@@ -291,8 +290,8 @@ next:
                           block_group->key.objectid +
                           block_group->key.offset);
 
-       remove_sb_from_cache(root, block_group);
        block_group->cached = 1;
+       remove_sb_from_cache(root, block_group);
        ret = 0;
 err:
        btrfs_free_path(path);
@@ -326,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
        return cache;
 }
 
-static inline void put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
        if (atomic_dec_and_test(&cache->count))
                kfree(cache);
@@ -399,12 +398,12 @@ again:
                            div_factor(cache->key.offset, factor)) {
                                group_start = cache->key.objectid;
                                spin_unlock(&cache->lock);
-                               put_block_group(cache);
+                               btrfs_put_block_group(cache);
                                goto found;
                        }
                }
                spin_unlock(&cache->lock);
-               put_block_group(cache);
+               btrfs_put_block_group(cache);
                cond_resched();
        }
        if (!wrapped) {
@@ -1594,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
        if (!block_group || block_group->ro)
                readonly = 1;
        if (block_group)
-               put_block_group(block_group);
+               btrfs_put_block_group(block_group);
        return readonly;
 }
 
@@ -2018,7 +2017,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                                WARN_ON(ret);
                        }
                }
-               put_block_group(cache);
+               btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
        }
@@ -2035,7 +2034,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
                return 0;
 
        bytenr = cache->key.objectid;
-       put_block_group(cache);
+       btrfs_put_block_group(cache);
 
        return bytenr;
 }
@@ -2047,7 +2046,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
        if (pin) {
                set_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
@@ -2055,7 +2053,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                clear_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
        }
-       mutex_unlock(&root->fs_info->pinned_mutex);
 
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2081,7 +2078,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                        if (cache->cached)
                                btrfs_add_free_space(cache, bytenr, len);
                }
-               put_block_group(cache);
+               btrfs_put_block_group(cache);
                bytenr += len;
                num -= len;
        }
@@ -2112,7 +2109,7 @@ static int update_reserved_extents(struct btrfs_root *root,
                }
                spin_unlock(&cache->lock);
                spin_unlock(&cache->space_info->lock);
-               put_block_group(cache);
+               btrfs_put_block_group(cache);
                bytenr += len;
                num -= len;
        }
@@ -2127,7 +2124,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
        struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
        int ret;
 
-       mutex_lock(&root->fs_info->pinned_mutex);
        while (1) {
                ret = find_first_extent_bit(pinned_extents, last,
                                            &start, &end, EXTENT_DIRTY);
@@ -2136,7 +2132,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
                set_extent_dirty(copy, start, end, GFP_NOFS);
                last = end + 1;
        }
-       mutex_unlock(&root->fs_info->pinned_mutex);
        return 0;
 }
 
@@ -2149,7 +2144,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
        int ret;
 
        while (1) {
-               mutex_lock(&root->fs_info->pinned_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
@@ -2163,7 +2157,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
                cond_resched();
        }
-       mutex_unlock(&root->fs_info->pinned_mutex);
        return ret;
 }
 
@@ -2205,7 +2198,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
        free_extent_buffer(buf);
 pinit:
        btrfs_set_path_blocking(path);
-       mutex_lock(&root->fs_info->pinned_mutex);
        /* unlocks the pinned mutex */
        btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
@@ -2511,8 +2503,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
         */
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
            owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-               mutex_lock(&root->fs_info->pinned_mutex);
-
                /* unlocks the pinned mutex */
                btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
                update_reserved_extents(root, bytenr, num_bytes, 0);
@@ -2554,228 +2544,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 {
        int ret = 0;
        struct btrfs_root *root = orig_root->fs_info->extent_root;
-       u64 total_needed = num_bytes;
-       u64 *last_ptr = NULL;
-       u64 last_wanted = 0;
+       struct btrfs_free_cluster *last_ptr = NULL;
        struct btrfs_block_group_cache *block_group = NULL;
-       int chunk_alloc_done = 0;
        int empty_cluster = 2 * 1024 * 1024;
        int allowed_chunk_alloc = 0;
-       struct list_head *head = NULL, *cur = NULL;
-       int loop = 0;
-       int extra_loop = 0;
        struct btrfs_space_info *space_info;
+       int last_ptr_loop = 0;
+       int loop = 0;
 
        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
        ins->objectid = 0;
        ins->offset = 0;
 
+       space_info = __find_space_info(root->fs_info, data);
+
        if (orig_root->ref_cows || empty_size)
                allowed_chunk_alloc = 1;
 
        if (data & BTRFS_BLOCK_GROUP_METADATA) {
-               last_ptr = &root->fs_info->last_alloc;
+               last_ptr = &root->fs_info->meta_alloc_cluster;
                if (!btrfs_test_opt(root, SSD))
                        empty_cluster = 64 * 1024;
        }
 
-       if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
-               last_ptr = &root->fs_info->last_data_alloc;
+       if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+               last_ptr = &root->fs_info->data_alloc_cluster;
+       }
 
        if (last_ptr) {
-               if (*last_ptr) {
-                       hint_byte = *last_ptr;
-                       last_wanted = *last_ptr;
-               } else
-                       empty_size += empty_cluster;
-       } else {
-               empty_cluster = 0;
+               spin_lock(&last_ptr->lock);
+               if (last_ptr->block_group)
+                       hint_byte = last_ptr->window_start;
+               spin_unlock(&last_ptr->lock);
        }
+
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
 
-       if (last_wanted && search_start != last_wanted) {
-               last_wanted = 0;
-               empty_size += empty_cluster;
+       if (!last_ptr) {
+               empty_cluster = 0;
+               loop = 1;
        }
 
-       total_needed += empty_size;
-       block_group = btrfs_lookup_block_group(root->fs_info, search_start);
-       if (!block_group)
-               block_group = btrfs_lookup_first_block_group(root->fs_info,
-                                                            search_start);
-       space_info = __find_space_info(root->fs_info, data);
+       if (search_start == hint_byte) {
+               block_group = btrfs_lookup_block_group(root->fs_info,
+                                                      search_start);
+               if (block_group && block_group_bits(block_group, data)) {
+                       down_read(&space_info->groups_sem);
+                       goto have_block_group;
+               } else if (block_group) {
+                       btrfs_put_block_group(block_group);
+               }
+       }
 
+search:
        down_read(&space_info->groups_sem);
-       while (1) {
-               struct btrfs_free_space *free_space;
-               /*
-                * the only way this happens if our hint points to a block
-                * group thats not of the proper type, while looping this
-                * should never happen
-                */
-               if (empty_size)
-                       extra_loop = 1;
+       list_for_each_entry(block_group, &space_info->block_groups, list) {
+               u64 offset;
 
-               if (!block_group)
-                       goto new_group_no_lock;
+               atomic_inc(&block_group->count);
+               search_start = block_group->key.objectid;
 
+have_block_group:
                if (unlikely(!block_group->cached)) {
                        mutex_lock(&block_group->cache_mutex);
                        ret = cache_block_group(root, block_group);
                        mutex_unlock(&block_group->cache_mutex);
-                       if (ret)
+                       if (ret) {
+                               btrfs_put_block_group(block_group);
                                break;
+                       }
                }
 
-               mutex_lock(&block_group->alloc_mutex);
-               if (unlikely(!block_group_bits(block_group, data)))
-                       goto new_group;
-
                if (unlikely(block_group->ro))
-                       goto new_group;
+                       goto loop;
 
-               free_space = btrfs_find_free_space(block_group, search_start,
-                                                  total_needed);
-               if (free_space) {
-                       u64 start = block_group->key.objectid;
-                       u64 end = block_group->key.objectid +
-                               block_group->key.offset;
+               if (last_ptr) {
+                       /*
+                        * the refill lock keeps out other
+                        * people trying to start a new cluster
+                        */
+                       spin_lock(&last_ptr->refill_lock);
+                       offset = btrfs_alloc_from_cluster(block_group, last_ptr,
+                                                num_bytes, search_start);
+                       if (offset) {
+                               /* we have a block, we're done */
+                               spin_unlock(&last_ptr->refill_lock);
+                               goto checks;
+                       }
 
-                       search_start = stripe_align(root, free_space->offset);
+                       spin_lock(&last_ptr->lock);
+                       /*
+                        * whoops, this cluster doesn't actually point to
+                        * this block group.  Get a ref on the block
+                        * group it does point to and try again
+                        */
+                       if (!last_ptr_loop && last_ptr->block_group &&
+                           last_ptr->block_group != block_group) {
+
+                               btrfs_put_block_group(block_group);
+                               block_group = last_ptr->block_group;
+                               atomic_inc(&block_group->count);
+                               spin_unlock(&last_ptr->lock);
+                               spin_unlock(&last_ptr->refill_lock);
+
+                               last_ptr_loop = 1;
+                               search_start = block_group->key.objectid;
+                               goto have_block_group;
+                       }
+                       spin_unlock(&last_ptr->lock);
 
-                       /* move on to the next group */
-                       if (search_start + num_bytes >= search_end)
-                               goto new_group;
+                       /*
+                        * this cluster didn't work out, free it and
+                        * start over
+                        */
+                       btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
-                       /* move on to the next group */
-                       if (search_start + num_bytes > end)
-                               goto new_group;
+                       last_ptr_loop = 0;
 
-                       if (last_wanted && search_start != last_wanted) {
-                               total_needed += empty_cluster;
-                               empty_size += empty_cluster;
-                               last_wanted = 0;
+                       /* allocate a cluster in this block group */
+                       ret = btrfs_find_space_cluster(trans,
+                                              block_group, last_ptr,
+                                              offset, num_bytes,
+                                              empty_cluster + empty_size);
+                       if (ret == 0) {
                                /*
-                                * if search_start is still in this block group
-                                * then we just re-search this block group
+                                * now pull our allocation out of this
+                                * cluster
                                 */
-                               if (search_start >= start &&
-                                   search_start < end) {
-                                       mutex_unlock(&block_group->alloc_mutex);
-                                       continue;
+                               offset = btrfs_alloc_from_cluster(block_group,
+                                                 last_ptr, num_bytes,
+                                                 search_start);
+                               if (offset) {
+                                       /* we found one, proceed */
+                                       spin_unlock(&last_ptr->refill_lock);
+                                       goto checks;
                                }
-
-                               /* else we go to the next block group */
-                               goto new_group;
                        }
-
-                       if (exclude_nr > 0 &&
-                           (search_start + num_bytes > exclude_start &&
-                            search_start < exclude_start + exclude_nr)) {
-                               search_start = exclude_start + exclude_nr;
-                               /*
-                                * if search_start is still in this block group
-                                * then we just re-search this block group
-                                */
-                               if (search_start >= start &&
-                                   search_start < end) {
-                                       mutex_unlock(&block_group->alloc_mutex);
-                                       last_wanted = 0;
-                                       continue;
-                               }
-
-                               /* else we go to the next block group */
-                               goto new_group;
+                       /*
+                        * at this point we either didn't find a cluster
+                        * or we weren't able to allocate a block from our
+                        * cluster.  Free the cluster we've been trying
+                        * to use, and go to the next block group
+                        */
+                       if (loop < 2) {
+                               btrfs_return_cluster_to_free_space(NULL,
+                                                                  last_ptr);
+                               spin_unlock(&last_ptr->refill_lock);
+                               goto loop;
                        }
+                       spin_unlock(&last_ptr->refill_lock);
+               }
 
-                       ins->objectid = search_start;
-                       ins->offset = num_bytes;
+               offset = btrfs_find_space_for_alloc(block_group, search_start,
+                                                   num_bytes, empty_size);
+               if (!offset)
+                       goto loop;
+checks:
+               search_start = stripe_align(root, offset);
+
+               /* move on to the next group */
+               if (search_start + num_bytes >= search_end) {
+                       btrfs_add_free_space(block_group, offset, num_bytes);
+                       goto loop;
+               }
 
-                       btrfs_remove_free_space_lock(block_group, search_start,
-                                                    num_bytes);
-                       /* we are all good, lets return */
-                       mutex_unlock(&block_group->alloc_mutex);
-                       break;
+               /* move on to the next group */
+               if (search_start + num_bytes >
+                   block_group->key.objectid + block_group->key.offset) {
+                       btrfs_add_free_space(block_group, offset, num_bytes);
+                       goto loop;
                }
-new_group:
-               mutex_unlock(&block_group->alloc_mutex);
-               put_block_group(block_group);
-               block_group = NULL;
-new_group_no_lock:
-               /* don't try to compare new allocations against the
-                * last allocation any more
-                */
-               last_wanted = 0;
 
-               /*
-                * Here's how this works.
-                * loop == 0: we were searching a block group via a hint
-                *              and didn't find anything, so we start at
-                *              the head of the block groups and keep searching
-                * loop == 1: we're searching through all of the block groups
-                *              if we hit the head again we have searched
-                *              all of the block groups for this space and we
-                *              need to try and allocate, if we cant error out.
-                * loop == 2: we allocated more space and are looping through
-                *              all of the block groups again.
-                */
-               if (loop == 0) {
-                       head = &space_info->block_groups;
-                       cur = head->next;
-                       loop++;
-               } else if (loop == 1 && cur == head) {
-                       int keep_going;
-
-                       /* at this point we give up on the empty_size
-                        * allocations and just try to allocate the min
-                        * space.
-                        *
-                        * The extra_loop field was set if an empty_size
-                        * allocation was attempted above, and if this
-                        * is try we need to try the loop again without
-                        * the additional empty_size.
+               if (exclude_nr > 0 &&
+                   (search_start + num_bytes > exclude_start &&
+                    search_start < exclude_start + exclude_nr)) {
+                       search_start = exclude_start + exclude_nr;
+
+                       btrfs_add_free_space(block_group, offset, num_bytes);
+                       /*
+                        * if search_start is still in this block group
+                        * then we just re-search this block group
                         */
-                       total_needed -= empty_size;
-                       empty_size = 0;
-                       keep_going = extra_loop;
-                       loop++;
+                       if (search_start >= block_group->key.objectid &&
+                           search_start < (block_group->key.objectid +
+                                           block_group->key.offset))
+                               goto have_block_group;
+                       goto loop;
+               }
 
-                       if (allowed_chunk_alloc && !chunk_alloc_done) {
-                               up_read(&space_info->groups_sem);
-                               ret = do_chunk_alloc(trans, root, num_bytes +
-                                                    2 * 1024 * 1024, data, 1);
-                               down_read(&space_info->groups_sem);
-                               if (ret < 0)
-                                       goto loop_check;
-                               head = &space_info->block_groups;
-                               /*
-                                * we've allocated a new chunk, keep
-                                * trying
-                                */
-                               keep_going = 1;
-                               chunk_alloc_done = 1;
-                       } else if (!allowed_chunk_alloc) {
-                               space_info->force_alloc = 1;
-                       }
-loop_check:
-                       if (keep_going) {
-                               cur = head->next;
-                               extra_loop = 0;
-                       } else {
-                               break;
-                       }
-               } else if (cur == head) {
-                       break;
+               ins->objectid = search_start;
+               ins->offset = num_bytes;
+
+               if (offset < search_start)
+                       btrfs_add_free_space(block_group, offset,
+                                            search_start - offset);
+               BUG_ON(offset > search_start);
+
+               /* we are all good, lets return */
+               break;
+loop:
+               btrfs_put_block_group(block_group);
+       }
+       up_read(&space_info->groups_sem);
+
+       /* loop == 0, try to find a clustered alloc in every block group
+        * loop == 1, try again after forcing a chunk allocation
+        * loop == 2, set empty_size and empty_cluster to 0 and try again
+        */
+       if (!ins->objectid && loop < 3 &&
+           (empty_size || empty_cluster || allowed_chunk_alloc)) {
+               if (loop >= 2) {
+                       empty_size = 0;
+                       empty_cluster = 0;
                }
 
-               block_group = list_entry(cur, struct btrfs_block_group_cache,
-                                        list);
-               atomic_inc(&block_group->count);
+               if (allowed_chunk_alloc) {
+                       ret = do_chunk_alloc(trans, root, num_bytes +
+                                            2 * 1024 * 1024, data, 1);
+                       allowed_chunk_alloc = 0;
+               } else {
+                       space_info->force_alloc = 1;
+               }
 
-               search_start = block_group->key.objectid;
-               cur = cur->next;
+               if (loop < 3) {
+                       loop++;
+                       goto search;
+               }
+               ret = -ENOSPC;
+       } else if (!ins->objectid) {
+               ret = -ENOSPC;
        }
 
        /* we found what we needed */
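
find_free_extent is restructured around the new clusters: each block group is first probed through the cluster (under refill_lock), a fresh cluster is filled with btrfs_find_space_cluster() if needed, and only then does the code fall back to a plain btrfs_find_space_for_alloc(); the three `loop` passes widen the search by forcing a chunk allocation and finally dropping the empty_size/empty_cluster padding. A stripped-down, self-contained sketch of that widening retry; the example_* helpers are stubs standing in for the btrfs calls.

#include <linux/errno.h>
#include <linux/types.h>

static u64 example_scan_block_groups(u64 bytes) { return 0; }	/* stub */
static void example_alloc_chunk(u64 bytes) { }			/* stub */

static int example_alloc(u64 num_bytes, u64 *found, int allow_chunk_alloc)
{
	u64 empty_size = 2 * 1024 * 1024;	/* padding used on early passes */
	int loop = 0;

search:
	*found = example_scan_block_groups(num_bytes + empty_size);
	if (*found)
		return 0;

	if (loop < 3 && (empty_size || allow_chunk_alloc)) {
		if (loop >= 2)
			empty_size = 0;			/* last pass: minimum-size search */
		if (allow_chunk_alloc) {
			example_alloc_chunk(num_bytes);	/* widen the search space once */
			allow_chunk_alloc = 0;
		}
		loop++;
		goto search;
	}
	return -ENOSPC;
}
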
@@ -2783,21 +2782,10 @@ loop_check:
                if (!(data & BTRFS_BLOCK_GROUP_DATA))
                        trans->block_group = block_group->key.objectid;
 
-               if (last_ptr)
-                       *last_ptr = ins->objectid + ins->offset;
+               btrfs_put_block_group(block_group);
                ret = 0;
-       } else if (!ret) {
-               printk(KERN_ERR "btrfs searching for %llu bytes, "
-                      "num_bytes %llu, loop %d, allowed_alloc %d\n",
-                      (unsigned long long)total_needed,
-                      (unsigned long long)num_bytes,
-                      loop, allowed_chunk_alloc);
-               ret = -ENOSPC;
        }
-       if (block_group)
-               put_block_group(block_group);
 
-       up_read(&space_info->groups_sem);
        return ret;
 }
 
@@ -2902,7 +2890,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
        ret = btrfs_discard_extent(root, start, len);
 
        btrfs_add_free_space(cache, start, len);
-       put_block_group(cache);
+       btrfs_put_block_group(cache);
        update_reserved_extents(root, start, len, 0);
 
        return ret;
@@ -3040,7 +3028,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
        ret = btrfs_remove_free_space(block_group, ins->objectid,
                                      ins->offset);
        BUG_ON(ret);
-       put_block_group(block_group);
+       btrfs_put_block_group(block_group);
        ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
                                            ref_generation, owner, ins, 1);
        return ret;
@@ -5729,7 +5717,7 @@ next:
        WARN_ON(block_group->reserved > 0);
        WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
        spin_unlock(&block_group->lock);
-       put_block_group(block_group);
+       btrfs_put_block_group(block_group);
        ret = 0;
 out:
        btrfs_free_path(path);
@@ -5856,9 +5844,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
                atomic_set(&cache->count, 1);
                spin_lock_init(&cache->lock);
-               mutex_init(&cache->alloc_mutex);
+               spin_lock_init(&cache->tree_lock);
                mutex_init(&cache->cache_mutex);
                INIT_LIST_HEAD(&cache->list);
+               INIT_LIST_HEAD(&cache->cluster_list);
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
                                   sizeof(cache->item));
@@ -5912,9 +5901,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
-       mutex_init(&cache->alloc_mutex);
+       spin_lock_init(&cache->tree_lock);
        mutex_init(&cache->cache_mutex);
        INIT_LIST_HEAD(&cache->list);
+       INIT_LIST_HEAD(&cache->cluster_list);
 
        btrfs_set_block_group_used(&cache->item, bytes_used);
        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5974,8 +5964,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        spin_unlock(&block_group->space_info->lock);
        block_group->space_info->full = 0;
 
-       put_block_group(block_group);
-       put_block_group(block_group);
+       btrfs_put_block_group(block_group);
+       btrfs_put_block_group(block_group);
 
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
index 08085af089e27827eba49898146ebe63c6623b07..eb2bee8b7fbfb19fb37dc26cbe6fe01ec92ea3ef 100644 (file)
@@ -2884,25 +2884,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                disko = 0;
                flags = 0;
 
-               switch (em->block_start) {
-               case EXTENT_MAP_LAST_BYTE:
+               if (em->block_start == EXTENT_MAP_LAST_BYTE) {
                        end = 1;
                        flags |= FIEMAP_EXTENT_LAST;
-                       break;
-               case EXTENT_MAP_HOLE:
+               } else if (em->block_start == EXTENT_MAP_HOLE) {
                        flags |= FIEMAP_EXTENT_UNWRITTEN;
-                       break;
-               case EXTENT_MAP_INLINE:
+               } else if (em->block_start == EXTENT_MAP_INLINE) {
                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
                                  FIEMAP_EXTENT_NOT_ALIGNED);
-                       break;
-               case EXTENT_MAP_DELALLOC:
+               } else if (em->block_start == EXTENT_MAP_DELALLOC) {
                        flags |= (FIEMAP_EXTENT_DELALLOC |
                                  FIEMAP_EXTENT_UNKNOWN);
-                       break;
-               default:
+               } else {
                        disko = em->block_start;
-                       break;
                }
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
index 50da69da20cec141fca63ffba2a4e92b8f93f875..b187917b36fa8c19202b1e5049f3bbf9aaef451f 100644 (file)
@@ -234,7 +234,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
        rb = tree_insert(&tree->map, em->start, &em->rb_node);
        if (rb) {
                ret = -EEXIST;
-               free_extent_map(merge);
                goto out;
        }
        atomic_inc(&em->refs);
index d1e5f0e84c58c8733e90ad15d09453a31e114470..768b9523662df85274e402328c618c226be5ab77 100644 (file)
 
 #include <linux/sched.h>
 #include "ctree.h"
+#include "free-space-cache.h"
+#include "transaction.h"
+
+struct btrfs_free_space {
+       struct rb_node bytes_index;
+       struct rb_node offset_index;
+       u64 offset;
+       u64 bytes;
+};
 
 static int tree_insert_offset(struct rb_root *root, u64 offset,
                              struct rb_node *node)
@@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
 }
 
 /*
- * searches the tree for the given offset.  If contains is set we will return
- * the free space that contains the given offset.  If contains is not set we
- * will return the free space that starts at or after the given offset and is
- * at least bytes long.
+ * searches the tree for the given offset.
+ *
+ * fuzzy == 1: this is used for allocations where we are given a hint of where
+ * to look for free space.  Because the hint may not be completely on an offset
+ * mark, or the hint may no longer point to free space we need to fudge our
+ * results a bit.  So we look for free space starting at or after offset with at
+ * least bytes size.  We prefer to find as close to the given offset as we can.
+ * Also if the offset is within a free space range, then we will return the free
+ * space that contains the given offset, which means we can return a free space
+ * chunk with an offset before the provided offset.
+ *
+ * fuzzy == 0: this is just a normal tree search.  Give us the free space that
+ * starts at the given offset and is at least bytes in size, and if it's not there
+ * return NULL.
  */
 static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
                                                   u64 offset, u64 bytes,
-                                                  int contains)
+                                                  int fuzzy)
 {
        struct rb_node *n = root->rb_node;
        struct btrfs_free_space *entry, *ret = NULL;
@@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
                entry = rb_entry(n, struct btrfs_free_space, offset_index);
 
                if (offset < entry->offset) {
-                       if (!contains &&
+                       if (fuzzy &&
                            (!ret || entry->offset < ret->offset) &&
                            (bytes <= entry->bytes))
                                ret = entry;
                        n = n->rb_left;
                } else if (offset > entry->offset) {
-                       if ((entry->offset + entry->bytes - 1) >= offset &&
+                       if (fuzzy &&
+                           (entry->offset + entry->bytes - 1) >= offset &&
                            bytes <= entry->bytes) {
                                ret = entry;
                                break;
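The hunk above replaces the old "contains" flag with a "fuzzy" flag that controls both the walk-left candidate and the straddling-entry shortcut. A minimal userspace model of that lookup rule, with plain binary-tree nodes standing in for the kernel rbtree (a sketch with illustrative names, not code from the patch):

#include <stddef.h>
#include <stdint.h>

struct fs_node {                        /* stand-in for struct btrfs_free_space */
        struct fs_node *left, *right;
        uint64_t offset;
        uint64_t bytes;
};

/* fuzzy == 0: only an entry starting exactly at offset will do;
 * fuzzy == 1: also accept an entry straddling offset, or the closest
 * large-enough entry starting after it. */
static struct fs_node *search_offset(struct fs_node *n, uint64_t offset,
                                     uint64_t bytes, int fuzzy)
{
        struct fs_node *ret = NULL;

        while (n) {
                if (offset < n->offset) {
                        /* remember the closest large-enough entry after offset */
                        if (fuzzy && (!ret || n->offset < ret->offset) &&
                            bytes <= n->bytes)
                                ret = n;
                        n = n->left;
                } else if (offset > n->offset) {
                        /* an entry that straddles offset is good enough */
                        if (fuzzy && n->offset + n->bytes - 1 >= offset &&
                            bytes <= n->bytes)
                                return n;
                        n = n->right;
                } else {
                        return n;       /* exact start match */
                }
        }
        return ret;
}
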
@@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
        int ret = 0;
 
 
+       BUG_ON(!info->bytes);
        ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
                                 &info->offset_index);
        if (ret)
@@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
        return ret;
 }
 
-static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-                                 u64 offset, u64 bytes)
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+                        u64 offset, u64 bytes)
 {
        struct btrfs_free_space *right_info;
        struct btrfs_free_space *left_info;
        struct btrfs_free_space *info = NULL;
-       struct btrfs_free_space *alloc_info;
        int ret = 0;
 
-       alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
-       if (!alloc_info)
+       info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+       if (!info)
                return -ENOMEM;
 
+       info->offset = offset;
+       info->bytes = bytes;
+
+       spin_lock(&block_group->tree_lock);
+
        /*
         * first we want to see if there is free space adjacent to the range we
         * are adding, if there is remove that struct and add a new one to
         * cover the entire range
         */
        right_info = tree_search_offset(&block_group->free_space_offset,
-                                       offset+bytes, 0, 1);
+                                       offset+bytes, 0, 0);
        left_info = tree_search_offset(&block_group->free_space_offset,
                                       offset-1, 0, 1);
 
-       if (right_info && right_info->offset == offset+bytes) {
+       if (right_info) {
                unlink_free_space(block_group, right_info);
-               info = right_info;
-               info->offset = offset;
-               info->bytes += bytes;
-       } else if (right_info && right_info->offset != offset+bytes) {
-               printk(KERN_ERR "btrfs adding space in the middle of an "
-                      "existing free space area. existing: "
-                      "offset=%llu, bytes=%llu. new: offset=%llu, "
-                      "bytes=%llu\n", (unsigned long long)right_info->offset,
-                      (unsigned long long)right_info->bytes,
-                      (unsigned long long)offset,
-                      (unsigned long long)bytes);
-               BUG();
+               info->bytes += right_info->bytes;
+               kfree(right_info);
        }
 
-       if (left_info) {
+       if (left_info && left_info->offset + left_info->bytes == offset) {
                unlink_free_space(block_group, left_info);
-
-               if (unlikely((left_info->offset + left_info->bytes) !=
-                            offset)) {
-                       printk(KERN_ERR "btrfs free space to the left "
-                              "of new free space isn't "
-                              "quite right. existing: offset=%llu, "
-                              "bytes=%llu. new: offset=%llu, bytes=%llu\n",
-                              (unsigned long long)left_info->offset,
-                              (unsigned long long)left_info->bytes,
-                              (unsigned long long)offset,
-                              (unsigned long long)bytes);
-                       BUG();
-               }
-
-               if (info) {
-                       info->offset = left_info->offset;
-                       info->bytes += left_info->bytes;
-                       kfree(left_info);
-               } else {
-                       info = left_info;
-                       info->bytes += bytes;
-               }
+               info->offset = left_info->offset;
+               info->bytes += left_info->bytes;
+               kfree(left_info);
        }
 
-       if (info) {
-               ret = link_free_space(block_group, info);
-               if (!ret)
-                       info = NULL;
-               goto out;
-       }
-
-       info = alloc_info;
-       alloc_info = NULL;
-       info->offset = offset;
-       info->bytes = bytes;
-
        ret = link_free_space(block_group, info);
        if (ret)
                kfree(info);
-out:
+
+       spin_unlock(&block_group->tree_lock);
+
        if (ret) {
                printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
-               if (ret == -EEXIST)
-                       BUG();
+               BUG_ON(ret == -EEXIST);
        }
 
-       kfree(alloc_info);
-
        return ret;
 }
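The rewritten btrfs_add_free_space above allocates the new entry first, then absorbs an exact right neighbour (looked up with a non-fuzzy search at offset+bytes) and an exact left neighbour (one whose end equals offset) before linking the result. The merge step, restated as a small standalone helper with illustrative names (a sketch, not the patch's code):

#include <stddef.h>
#include <stdint.h>

struct span { uint64_t offset, bytes; };

/* coalesce a new free span with exact neighbours; either pointer may be NULL */
static struct span merge_free_span(struct span new_span,
                                   const struct span *left,
                                   const struct span *right)
{
        /* right neighbour starts exactly where the new span ends */
        if (right && right->offset == new_span.offset + new_span.bytes)
                new_span.bytes += right->bytes;

        /* left neighbour ends exactly where the new span starts */
        if (left && left->offset + left->bytes == new_span.offset) {
                new_span.offset = left->offset;
                new_span.bytes += left->bytes;
        }
        return new_span;
}

Compared with the removed code, overlap with an existing free-space area is no longer treated as a fatal BUG() on this path; only cleanly adjacent entries are merged.
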
 
-static int
-__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-                         u64 offset, u64 bytes)
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+                           u64 offset, u64 bytes)
 {
        struct btrfs_free_space *info;
        int ret = 0;
 
+       spin_lock(&block_group->tree_lock);
+
        info = tree_search_offset(&block_group->free_space_offset, offset, 0,
                                  1);
-
        if (info && info->offset == offset) {
                if (info->bytes < bytes) {
                        printk(KERN_ERR "Found free space at %llu, size %llu,"
@@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                               (unsigned long long)bytes);
                        WARN_ON(1);
                        ret = -EINVAL;
+                       spin_unlock(&block_group->tree_lock);
                        goto out;
                }
                unlink_free_space(block_group, info);
 
                if (info->bytes == bytes) {
                        kfree(info);
+                       spin_unlock(&block_group->tree_lock);
                        goto out;
                }
 
@@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                info->bytes -= bytes;
 
                ret = link_free_space(block_group, info);
+               spin_unlock(&block_group->tree_lock);
                BUG_ON(ret);
        } else if (info && info->offset < offset &&
                   info->offset + info->bytes >= offset + bytes) {
@@ -333,70 +319,33 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
                         */
                        kfree(info);
                }
-
+               spin_unlock(&block_group->tree_lock);
                /* step two, insert a new info struct to cover anything
                 * before the hole
                 */
-               ret = __btrfs_add_free_space(block_group, old_start,
-                                            offset - old_start);
+               ret = btrfs_add_free_space(block_group, old_start,
+                                          offset - old_start);
                BUG_ON(ret);
        } else {
+               spin_unlock(&block_group->tree_lock);
+               if (!info) {
+                       printk(KERN_ERR "couldn't find space %llu to free\n",
+                              (unsigned long long)offset);
+                       printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
+                              block_group->cached, block_group->key.objectid,
+                              block_group->key.offset);
+                       btrfs_dump_free_space(block_group, bytes);
+               } else if (info) {
+                       printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
+                              "but wanted offset=%llu bytes=%llu\n",
+                              info->offset, info->bytes, offset, bytes);
+               }
                WARN_ON(1);
        }
 out:
        return ret;
 }
 
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-                        u64 offset, u64 bytes)
-{
-       int ret;
-       struct btrfs_free_space *sp;
-
-       mutex_lock(&block_group->alloc_mutex);
-       ret = __btrfs_add_free_space(block_group, offset, bytes);
-       sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
-       BUG_ON(!sp);
-       mutex_unlock(&block_group->alloc_mutex);
-
-       return ret;
-}
-
-int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
-                             u64 offset, u64 bytes)
-{
-       int ret;
-       struct btrfs_free_space *sp;
-
-       ret = __btrfs_add_free_space(block_group, offset, bytes);
-       sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
-       BUG_ON(!sp);
-
-       return ret;
-}
-
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-                           u64 offset, u64 bytes)
-{
-       int ret = 0;
-
-       mutex_lock(&block_group->alloc_mutex);
-       ret = __btrfs_remove_free_space(block_group, offset, bytes);
-       mutex_unlock(&block_group->alloc_mutex);
-
-       return ret;
-}
-
-int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
-                                u64 offset, u64 bytes)
-{
-       int ret;
-
-       ret = __btrfs_remove_free_space(block_group, offset, bytes);
-
-       return ret;
-}
-
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
                           u64 bytes)
 {
@@ -408,6 +357,8 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                if (info->bytes >= bytes)
                        count++;
+               printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset,
+                      info->bytes);
        }
        printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
               "\n", count);
@@ -428,68 +379,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
        return ret;
 }
 
+/*
+ * for a given cluster, put all of its extents back into the free
+ * space cache.  If the block group passed doesn't match the block group
+ * pointed to by the cluster, someone else raced in and freed the
+ * cluster already.  In that case, we just return without changing anything
+ */
+static int
+__btrfs_return_cluster_to_free_space(
+                            struct btrfs_block_group_cache *block_group,
+                            struct btrfs_free_cluster *cluster)
+{
+       struct btrfs_free_space *entry;
+       struct rb_node *node;
+
+       spin_lock(&cluster->lock);
+       if (cluster->block_group != block_group)
+               goto out;
+
+       cluster->window_start = 0;
+       node = rb_first(&cluster->root);
+       while(node) {
+               entry = rb_entry(node, struct btrfs_free_space, offset_index);
+               node = rb_next(&entry->offset_index);
+               rb_erase(&entry->offset_index, &cluster->root);
+               link_free_space(block_group, entry);
+       }
+       list_del_init(&cluster->block_group_list);
+
+       btrfs_put_block_group(cluster->block_group);
+       cluster->block_group = NULL;
+       cluster->root.rb_node = NULL;
+out:
+       spin_unlock(&cluster->lock);
+       return 0;
+}
+
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 {
        struct btrfs_free_space *info;
        struct rb_node *node;
+       struct btrfs_free_cluster *cluster;
+       struct btrfs_free_cluster *safe;
+
+       spin_lock(&block_group->tree_lock);
+
+       list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
+                                block_group_list) {
+
+               WARN_ON(cluster->block_group != block_group);
+               __btrfs_return_cluster_to_free_space(block_group, cluster);
+       }
 
-       mutex_lock(&block_group->alloc_mutex);
        while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
                info = rb_entry(node, struct btrfs_free_space, bytes_index);
                unlink_free_space(block_group, info);
                kfree(info);
                if (need_resched()) {
-                       mutex_unlock(&block_group->alloc_mutex);
+                       spin_unlock(&block_group->tree_lock);
                        cond_resched();
-                       mutex_lock(&block_group->alloc_mutex);
+                       spin_lock(&block_group->tree_lock);
                }
        }
-       mutex_unlock(&block_group->alloc_mutex);
+       spin_unlock(&block_group->tree_lock);
 }
 
-#if 0
-static struct btrfs_free_space *btrfs_find_free_space_offset(struct
-                                                     btrfs_block_group_cache
-                                                     *block_group, u64 offset,
-                                                     u64 bytes)
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+                              u64 offset, u64 bytes, u64 empty_size)
 {
-       struct btrfs_free_space *ret;
+       struct btrfs_free_space *entry = NULL;
+       u64 ret = 0;
 
-       mutex_lock(&block_group->alloc_mutex);
-       ret = tree_search_offset(&block_group->free_space_offset, offset,
-                                bytes, 0);
-       mutex_unlock(&block_group->alloc_mutex);
+       spin_lock(&block_group->tree_lock);
+       entry = tree_search_offset(&block_group->free_space_offset, offset,
+                                  bytes + empty_size, 1);
+       if (!entry)
+               entry = tree_search_bytes(&block_group->free_space_bytes,
+                                         offset, bytes + empty_size);
+       if (entry) {
+               unlink_free_space(block_group, entry);
+               ret = entry->offset;
+               entry->offset += bytes;
+               entry->bytes -= bytes;
+
+               if (!entry->bytes)
+                       kfree(entry);
+               else
+                       link_free_space(block_group, entry);
+       }
+       spin_unlock(&block_group->tree_lock);
 
        return ret;
 }
 
-static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
-                                                    btrfs_block_group_cache
-                                                    *block_group, u64 offset,
-                                                    u64 bytes)
+/*
+ * given a cluster, put all of its extents back into the free space
+ * cache.  If a block group is passed, this function will only free
+ * a cluster that belongs to the passed block group.
+ *
+ * Otherwise, it'll get a reference on the block group pointed to by the
+ * cluster and remove the cluster from it.
+ */
+int btrfs_return_cluster_to_free_space(
+                              struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_cluster *cluster)
 {
-       struct btrfs_free_space *ret;
+       int ret;
 
-       mutex_lock(&block_group->alloc_mutex);
+       /* first, get a safe pointer to the block group */
+       spin_lock(&cluster->lock);
+       if (!block_group) {
+               block_group = cluster->block_group;
+               if (!block_group) {
+                       spin_unlock(&cluster->lock);
+                       return 0;
+               }
+       } else if (cluster->block_group != block_group) {
+               /* someone else has already freed it, don't redo their work */
+               spin_unlock(&cluster->lock);
+               return 0;
+       }
+       atomic_inc(&block_group->count);
+       spin_unlock(&cluster->lock);
 
-       ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
-       mutex_unlock(&block_group->alloc_mutex);
+       /* now return any extents the cluster had on it */
+       spin_lock(&block_group->tree_lock);
+       ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+       spin_unlock(&block_group->tree_lock);
 
+       /* finally drop our ref */
+       btrfs_put_block_group(block_group);
        return ret;
 }
-#endif
 
-struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
-                                              *block_group, u64 offset,
-                                              u64 bytes)
+/*
+ * given a cluster, try to allocate 'bytes' from it, returns 0
+ * if it couldn't find anything suitably large, or a logical disk offset
+ * if things worked out
+ */
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+                            struct btrfs_free_cluster *cluster, u64 bytes,
+                            u64 min_start)
+{
+       struct btrfs_free_space *entry = NULL;
+       struct rb_node *node;
+       u64 ret = 0;
+
+       spin_lock(&cluster->lock);
+       if (bytes > cluster->max_size)
+               goto out;
+
+       if (cluster->block_group != block_group)
+               goto out;
+
+       node = rb_first(&cluster->root);
+       if (!node)
+               goto out;
+
+       entry = rb_entry(node, struct btrfs_free_space, offset_index);
+
+       while(1) {
+               if (entry->bytes < bytes || entry->offset < min_start) {
+                       struct rb_node *node;
+
+                       node = rb_next(&entry->offset_index);
+                       if (!node)
+                               break;
+                       entry = rb_entry(node, struct btrfs_free_space,
+                                        offset_index);
+                       continue;
+               }
+               ret = entry->offset;
+
+               entry->offset += bytes;
+               entry->bytes -= bytes;
+
+               if (entry->bytes == 0) {
+                       rb_erase(&entry->offset_index, &cluster->root);
+                       kfree(entry);
+               }
+               break;
+       }
+out:
+       spin_unlock(&cluster->lock);
+       return ret;
+}
+
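btrfs_alloc_from_cluster above scans the cluster's entries in offset order and carves the request out of the first entry that is both large enough and not below min_start, shrinking that entry in place. A compact userspace model of the carve step, with an array standing in for the cluster rbtree (illustrative names, not the patch's code):

#include <stddef.h>
#include <stdint.h>

struct span { uint64_t offset, bytes; };

/* returns the allocated start, or 0 if nothing in the cluster fits */
static uint64_t carve_from_cluster(struct span *entries, size_t n,
                                   uint64_t bytes, uint64_t min_start)
{
        for (size_t i = 0; i < n; i++) {
                if (entries[i].bytes < bytes || entries[i].offset < min_start)
                        continue;

                uint64_t start = entries[i].offset;

                entries[i].offset += bytes;     /* shrink the entry in place */
                entries[i].bytes  -= bytes;
                return start;
        }
        return 0;
}
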
+/*
+ * here we try to find a cluster of blocks in a block group.  The goal
+ * is to find at least bytes free and up to empty_size + bytes free.
+ * We might not find them all in one contiguous area.
+ *
+ * returns zero and sets up cluster if things worked out, otherwise
+ * it returns -enospc
+ */
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+                            struct btrfs_block_group_cache *block_group,
+                            struct btrfs_free_cluster *cluster,
+                            u64 offset, u64 bytes, u64 empty_size)
 {
-       struct btrfs_free_space *ret = NULL;
+       struct btrfs_free_space *entry = NULL;
+       struct rb_node *node;
+       struct btrfs_free_space *next;
+       struct btrfs_free_space *last;
+       u64 min_bytes;
+       u64 window_start;
+       u64 window_free;
+       u64 max_extent = 0;
+       int total_retries = 0;
+       int ret;
+
+       /* for metadata, allow allocates with more holes */
+       if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+               /*
+                * we want to do larger allocations when we are
+                * flushing out the delayed refs, it helps prevent
+                * making more work as we go along.
+                */
+               if (trans->transaction->delayed_refs.flushing)
+                       min_bytes = max(bytes, (bytes + empty_size) >> 1);
+               else
+                       min_bytes = max(bytes, (bytes + empty_size) >> 4);
+       } else
+               min_bytes = max(bytes, (bytes + empty_size) >> 2);
+
+       spin_lock(&block_group->tree_lock);
+       spin_lock(&cluster->lock);
+
+       /* someone already found a cluster, hooray */
+       if (cluster->block_group) {
+               ret = 0;
+               goto out;
+       }
+again:
+       min_bytes = min(min_bytes, bytes + empty_size);
+       entry = tree_search_bytes(&block_group->free_space_bytes,
+                                 offset, min_bytes);
+       if (!entry) {
+               ret = -ENOSPC;
+               goto out;
+       }
+       window_start = entry->offset;
+       window_free = entry->bytes;
+       last = entry;
+       max_extent = entry->bytes;
+
+       while(1) {
+               /* our window is just right, let's fill it */
+               if (window_free >= bytes + empty_size)
+                       break;
 
-       ret = tree_search_offset(&block_group->free_space_offset, offset,
-                                bytes, 0);
-       if (!ret)
-               ret = tree_search_bytes(&block_group->free_space_bytes,
-                                       offset, bytes);
+               node = rb_next(&last->offset_index);
+               if (!node) {
+                       ret = -ENOSPC;
+                       goto out;
+               }
+               next = rb_entry(node, struct btrfs_free_space, offset_index);
+
+               /*
+                * we haven't filled the empty size and the window is
+                * very large.  reset and try again
+                */
+               if (next->offset - window_start > (bytes + empty_size) * 2) {
+                       entry = next;
+                       window_start = entry->offset;
+                       window_free = entry->bytes;
+                       last = entry;
+                       max_extent = 0;
+                       total_retries++;
+                       if (total_retries % 256 == 0) {
+                               if (min_bytes >= (bytes + empty_size)) {
+                                       ret = -ENOSPC;
+                                       goto out;
+                               }
+                               /*
+                                * grow our allocation a bit, we're not having
+                                * much luck
+                                */
+                               min_bytes *= 2;
+                               goto again;
+                       }
+               } else {
+                       last = next;
+                       window_free += next->bytes;
+                       if (entry->bytes > max_extent)
+                               max_extent = entry->bytes;
+               }
+       }
+
+       cluster->window_start = entry->offset;
+
+       /*
+        * now we've found our entries, pull them out of the free space
+        * cache and put them into the cluster rbtree
+        *
+        * The cluster includes an rbtree, but only uses the offset index
+        * of each free space cache entry.
+        */
+       while(1) {
+               node = rb_next(&entry->offset_index);
+               unlink_free_space(block_group, entry);
+               ret = tree_insert_offset(&cluster->root, entry->offset,
+                                        &entry->offset_index);
+               BUG_ON(ret);
+
+               if (!node || entry == last)
+                       break;
+
+               entry = rb_entry(node, struct btrfs_free_space, offset_index);
+       }
+       ret = 0;
+       cluster->max_size = max_extent;
+       atomic_inc(&block_group->count);
+       list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
+       cluster->block_group = block_group;
+out:
+       spin_unlock(&cluster->lock);
+       spin_unlock(&block_group->tree_lock);
 
        return ret;
 }
+
+/*
+ * simple code to zero out a cluster
+ */
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
+{
+       spin_lock_init(&cluster->lock);
+       spin_lock_init(&cluster->refill_lock);
+       cluster->root.rb_node = NULL;
+       cluster->max_size = 0;
+       INIT_LIST_HEAD(&cluster->block_group_list);
+       cluster->block_group = NULL;
+}
+
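The window search in btrfs_find_space_cluster above can be read as a scan over free-space entries sorted by offset: keep adding entries while the window's free total is short of bytes + empty_size, and restart from the next entry whenever the gap from the window start grows past twice that goal. A rough userspace model of just that scan, leaving out the min_bytes growth and retry accounting (illustrative names, not the patch's code):

#include <stddef.h>
#include <stdint.h>

struct span { uint64_t offset, bytes; };

/*
 * Return the index of the first entry of a window whose summed free space
 * reaches 'goal' (bytes + empty_size), or -1 if no such window exists.
 * 'entries' must be sorted by offset.
 */
static long find_window(const struct span *entries, size_t n, uint64_t goal)
{
        size_t start = 0;

        while (start < n) {
                uint64_t window_start = entries[start].offset;
                uint64_t window_free = entries[start].bytes;
                size_t i = start + 1;

                while (window_free < goal) {
                        if (i == n)
                                return -1;
                        /* window has become too sparse: restart from here */
                        if (entries[i].offset - window_start > goal * 2) {
                                start = i;
                                break;
                        }
                        window_free += entries[i].bytes;
                        i++;
                }
                if (window_free >= goal)
                        return (long)start;
        }
        return -1;
}

Once a window is found, the kernel code pulls exactly those entries out of the block group's trees and inserts them into the cluster's private rbtree, so later allocations from the cluster only need cluster->lock rather than the shared tree_lock.
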
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
new file mode 100644 (file)
index 0000000..ab0bdc0
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_CACHE
+#define __BTRFS_FREE_SPACE_CACHE
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+                        u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+                           u64 bytenr, u64 size);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+                                  *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+                              u64 offset, u64 bytes, u64 empty_size);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+                          u64 bytes);
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+                            struct btrfs_block_group_cache *block_group,
+                            struct btrfs_free_cluster *cluster,
+                            u64 offset, u64 bytes, u64 empty_size);
+void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+                            struct btrfs_free_cluster *cluster, u64 bytes,
+                            u64 min_start);
+int btrfs_return_cluster_to_free_space(
+                              struct btrfs_block_group_cache *block_group,
+                              struct btrfs_free_cluster *cluster);
+#endif
index 06d8db5afb08ba8800dc776a64f421c1e9322780..a0d1dd492a58a57bdf23464340fd6312c192fca9 100644 (file)
@@ -3481,8 +3481,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
        if (dir) {
                ret = btrfs_set_inode_index(dir, index);
-               if (ret)
+               if (ret) {
+                       iput(inode);
                        return ERR_PTR(ret);
+               }
        }
        /*
         * index_cnt is ignored for everything but a dir,
@@ -3565,6 +3567,7 @@ fail:
        if (dir)
                BTRFS_I(dir)->index_cnt--;
        btrfs_free_path(path);
+       iput(inode);
        return ERR_PTR(ret);
 }
 
index a5310c0f41e24f089bfd78e3a2f0939d3957f717..1c36e5cd8f55495843631f7e5d3e6245d47684dd 100644 (file)
@@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
 
 /*
  * unfortunately, many of the places that currently set a lock to blocking
- * don't end up blocking for every long, and often they don't block
- * at all.  For a dbench 50 run, if we don't spin one the blocking bit
+ * don't end up blocking for very long, and often they don't block
+ * at all.  For a dbench 50 run, if we don't spin on the blocking bit
  * at all, the context switch rate can jump up to 400,000/sec or more.
  *
  * So, we're still stuck with this crummy spin on the blocking bit,
index 19a4daf03ccb6d7d8730d4d8c9602a24dc0c370b..9744af9d71e95afa3a66f9ac1b2188722318aad8 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/highmem.h>
 #include <linux/time.h>
 #include <linux/init.h>
+#include <linux/seq_file.h>
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
@@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb)
 enum {
        Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
        Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-       Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,
+       Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_notreelog,
+       Opt_flushoncommit, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +85,8 @@ static match_table_t tokens = {
        {Opt_compress, "compress"},
        {Opt_ssd, "ssd"},
        {Opt_noacl, "noacl"},
+       {Opt_notreelog, "notreelog"},
+       {Opt_flushoncommit, "flushoncommit"},
        {Opt_err, NULL},
 };
 
@@ -222,6 +226,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_noacl:
                        root->fs_info->sb->s_flags &= ~MS_POSIXACL;
                        break;
+               case Opt_notreelog:
+                       printk(KERN_INFO "btrfs: disabling tree log\n");
+                       btrfs_set_opt(info->mount_opt, NOTREELOG);
+                       break;
+               case Opt_flushoncommit:
+                       printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
+                       btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
+                       break;
                default:
                        break;
                }
@@ -363,9 +375,8 @@ fail_close:
 int btrfs_sync_fs(struct super_block *sb, int wait)
 {
        struct btrfs_trans_handle *trans;
-       struct btrfs_root *root;
+       struct btrfs_root *root = btrfs_sb(sb);
        int ret;
-       root = btrfs_sb(sb);
 
        if (sb->s_flags & MS_RDONLY)
                return 0;
@@ -385,6 +396,41 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        return ret;
 }
 
+static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+       struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
+       struct btrfs_fs_info *info = root->fs_info;
+
+       if (btrfs_test_opt(root, DEGRADED))
+               seq_puts(seq, ",degraded");
+       if (btrfs_test_opt(root, NODATASUM))
+               seq_puts(seq, ",nodatasum");
+       if (btrfs_test_opt(root, NODATACOW))
+               seq_puts(seq, ",nodatacow");
+       if (btrfs_test_opt(root, NOBARRIER))
+               seq_puts(seq, ",nobarrier");
+       if (info->max_extent != (u64)-1)
+               seq_printf(seq, ",max_extent=%llu", info->max_extent);
+       if (info->max_inline != 8192 * 1024)
+               seq_printf(seq, ",max_inline=%llu", info->max_inline);
+       if (info->alloc_start != 0)
+               seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
+       if (info->thread_pool_size !=  min_t(unsigned long,
+                                            num_online_cpus() + 2, 8))
+               seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
+       if (btrfs_test_opt(root, COMPRESS))
+               seq_puts(seq, ",compress");
+       if (btrfs_test_opt(root, SSD))
+               seq_puts(seq, ",ssd");
+       if (btrfs_test_opt(root, NOTREELOG))
+               seq_puts(seq, ",no-treelog");
+       if (btrfs_test_opt(root, FLUSHONCOMMIT))
+               seq_puts(seq, ",flush-on-commit");
+       if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
+               seq_puts(seq, ",noacl");
+       return 0;
+}
+
 static void btrfs_write_super(struct super_block *sb)
 {
        sb->s_dirt = 0;
@@ -630,7 +676,7 @@ static struct super_operations btrfs_super_ops = {
        .put_super      = btrfs_put_super,
        .write_super    = btrfs_write_super,
        .sync_fs        = btrfs_sync_fs,
-       .show_options   = generic_show_options,
+       .show_options   = btrfs_show_options,
        .write_inode    = btrfs_write_inode,
        .dirty_inode    = btrfs_dirty_inode,
        .alloc_inode    = btrfs_alloc_inode,
index 664782c6a2dfbf6e65628f09fe2ff8552335f016..2869b3361eb6be3cf82e60ba58ce1d9191360c46 100644 (file)
@@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root)
                                             GFP_NOFS);
                BUG_ON(!cur_trans);
                root->fs_info->generation++;
-               root->fs_info->last_alloc = 0;
-               root->fs_info->last_data_alloc = 0;
                cur_trans->num_writers = 1;
                cur_trans->num_joined = 0;
                cur_trans->transid = root->fs_info->generation;
@@ -974,6 +972,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        int ret;
        int should_grow = 0;
        unsigned long now = get_seconds();
+       int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
 
        btrfs_run_ordered_operations(root, 0);
 
@@ -1053,7 +1052,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
                mutex_unlock(&root->fs_info->trans_mutex);
 
-               if (snap_pending) {
+               if (flush_on_commit || snap_pending) {
+                       if (flush_on_commit)
+                               btrfs_start_delalloc_inodes(root);
                        ret = btrfs_wait_ordered_extents(root, 1);
                        BUG_ON(ret);
                }
index fc9b87a7975bd38f76d7e22c84cfe010e3aafa42..25f20ea11f2789db90f796ecfd52aeae87c02910 100644 (file)
@@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log,
                              struct extent_buffer *eb,
                              struct walk_control *wc, u64 gen)
 {
-       if (wc->pin) {
-               mutex_lock(&log->fs_info->pinned_mutex);
+       if (wc->pin)
                btrfs_update_pinned_extents(log->fs_info->extent_root,
                                            eb->start, eb->len, 1);
-       }
 
        if (btrfs_buffer_uptodate(eb, gen)) {
                if (wc->write)
@@ -1224,8 +1222,7 @@ insert:
        ret = insert_one_name(trans, root, path, key->objectid, key->offset,
                              name, name_len, log_type, &log_key);
 
-       if (ret && ret != -ENOENT)
-               BUG();
+       BUG_ON(ret && ret != -ENOENT);
        goto out;
 }
 
@@ -2900,6 +2897,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
        sb = inode->i_sb;
 
+       if (btrfs_test_opt(root, NOTREELOG)) {
+               ret = 1;
+               goto end_no_trans;
+       }
+
        if (root->fs_info->last_trans_log_full_commit >
            root->fs_info->last_trans_committed) {
                ret = 1;
index dd06e18e5aac7518191e02791073345e6a0f51cd..e0913e4697284673b4c00cd58bf911fc44fe4019 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
+#include <linux/iocontext.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -145,8 +146,9 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        int again = 0;
        unsigned long num_run = 0;
        unsigned long limit;
+       unsigned long last_waited = 0;
 
-       bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+       bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
@@ -207,7 +209,32 @@ loop_lock:
                if (pending && bdi_write_congested(bdi) && num_run > 16 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct bio *old_head;
+                       struct io_context *ioc;
 
+                       ioc = current->io_context;
+
+                       /*
+                        * the main goal here is that we don't want to
+                        * block if we're going to be able to submit
+                        * more requests without blocking.
+                        *
+                        * This code does two great things, it pokes into
+                        * the elevator code from a filesystem _and_
+                        * it makes assumptions about how batching works.
+                        */
+                       if (ioc && ioc->nr_batch_requests > 0 &&
+                           time_before(jiffies, ioc->last_waited + HZ/50UL) &&
+                           (last_waited == 0 ||
+                            ioc->last_waited == last_waited)) {
+                               /*
+                                * we want to go through our batch of
+                                * requests and stop.  So, we copy out
+                                * the ioc->last_waited time and test
+                                * against it before looping
+                                */
+                               last_waited = ioc->last_waited;
+                               continue;
+                       }
                        spin_lock(&device->io_lock);
 
                        old_head = device->pending_bios;
@@ -231,6 +258,18 @@ loop_lock:
        if (device->pending_bios)
                goto loop_lock;
        spin_unlock(&device->io_lock);
+
+       /*
+        * IO has already been through a long path to get here.  Checksumming,
+        * async helper threads, perhaps compression.  We've done a pretty
+        * good job of collecting a batch of IO and should just unplug
+        * the device right away.
+        *
+        * This will help anyone who is waiting on the IO, they might have
+        * already unplugged, but managed to do so before the bio they
+        * cared about found its way down here.
+        */
+       blk_run_backing_dev(bdi, NULL);
 done:
        return 0;
 }
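The io_context check added above boils down to one predicate: keep draining the current batch (and skip the requeue) while the submitting task's io_context still has batch requests left and was last seen waiting within roughly the past 20 ms (HZ/50), provided its last_waited stamp has not moved since we cached it (or has not been cached yet). A standalone restatement of that predicate, with a plain struct standing in for the kernel's struct io_context (a sketch, not the patch's code):

#include <stdbool.h>

/* fields mirrored from struct io_context for illustration only */
struct ioc_model {
        int nr_batch_requests;
        unsigned long last_waited;      /* jiffies */
};

/* time_before(a, b) without <linux/jiffies.h> */
static bool time_before_ul(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;
}

/* true: keep submitting from our batch; false: requeue and let others in */
static bool keep_batching(const struct ioc_model *ioc, unsigned long now,
                          unsigned long hz, unsigned long cached_last_waited)
{
        return ioc &&
               ioc->nr_batch_requests > 0 &&
               time_before_ul(now, ioc->last_waited + hz / 50) &&
               (cached_last_waited == 0 ||
                ioc->last_waited == cached_last_waited);
}
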
index 86c44e9ae1101e2332dadad41b6def15932ae96c..2185de72ff7dad59154f5a568624be89ab7e8b97 100644 (file)
@@ -76,7 +76,7 @@ struct btrfs_device {
 struct btrfs_fs_devices {
        u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
 
-       /* the device with this id has the most recent coyp of the super */
+       /* the device with this id has the most recent copy of the super */
        u64 latest_devid;
        u64 latest_trans;
        u64 num_devices;
index c2fa1be4923d19885dab161d45a53588ff296ac5..5d55a896ff78f2a79ab9bc2dee72dae77ddb93eb 100644 (file)
@@ -1595,6 +1595,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
        struct buffer_head *bh, *head;
        const unsigned blocksize = 1 << inode->i_blkbits;
        int nr_underway = 0;
+       int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
        BUG_ON(!PageLocked(page));
 
@@ -1686,7 +1687,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
-                       submit_bh(WRITE, bh);
+                       submit_bh(write_op, bh);
                        nr_underway++;
                }
                bh = next;
@@ -1740,7 +1741,7 @@ recover:
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
-                       submit_bh(WRITE, bh);
+                       submit_bh(write_op, bh);
                        nr_underway++;
                }
                bh = next;
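Both hunks in __block_write_full_page now submit buffers with an op chosen once at the top of the function: WRITE_SYNC for data-integrity writeback (WB_SYNC_ALL), plain WRITE otherwise, so the block layer can treat integrity writes as synchronous. The selection itself is a one-line mode test; the constants below are illustrative stand-ins, not the kernel's definitions:

enum wb_sync_mode_model { MODEL_WB_SYNC_NONE, MODEL_WB_SYNC_ALL };
enum { MODEL_WRITE, MODEL_WRITE_SYNC };

static int pick_write_op(enum wb_sync_mode_model mode)
{
        /* integrity writeback gets the sync hint; background writeback
         * stays asynchronous */
        return mode == MODEL_WB_SYNC_ALL ? MODEL_WRITE_SYNC : MODEL_WRITE;
}
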
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
new file mode 100644 (file)
index 0000000..80e9c61
--- /dev/null
@@ -0,0 +1,39 @@
+
+config CACHEFILES
+       tristate "Filesystem caching on files"
+       depends on FSCACHE && BLOCK
+       help
+         This permits use of a mounted filesystem as a cache for other
+         filesystems - primarily networking filesystems - thus allowing fast
+         local disk to enhance the speed of slower devices.
+
+         See Documentation/filesystems/caching/cachefiles.txt for more
+         information.
+
+config CACHEFILES_DEBUG
+       bool "Debug CacheFiles"
+       depends on CACHEFILES
+       help
+         This permits debugging to be dynamically enabled in the filesystem
+         caching on files module.  If this is set, the debugging output may be
+         enabled by setting bits in /sys/modules/cachefiles/parameter/debug or
+         by including a debugging specifier in /etc/cachefilesd.conf.
+
+config CACHEFILES_HISTOGRAM
+       bool "Gather latency information on CacheFiles"
+       depends on CACHEFILES && PROC_FS
+       help
+
+         This option causes latency information to be gathered on CacheFiles
+         operation and exported through file:
+
+               /proc/fs/cachefiles/histogram
+
+         The generation of this histogram adds a certain amount of overhead to
+         execution as there are a number of points at which data is gathered,
+         and on a multi-CPU system these may be on cachelines that keep
+         bouncing between CPUs.  On the other hand, the histogram may be
+         useful for debugging purposes.  Saying 'N' here is recommended.
+
+         See Documentation/filesystems/caching/cachefiles.txt for more
+         information.
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
new file mode 100644 (file)
index 0000000..32cbab0
--- /dev/null
@@ -0,0 +1,18 @@
+#
+# Makefile for caching in a mounted filesystem
+#
+
+cachefiles-y := \
+       bind.o \
+       daemon.o \
+       interface.o \
+       key.o \
+       main.o \
+       namei.o \
+       rdwr.o \
+       security.o \
+       xattr.o
+
+cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o
+
+obj-$(CONFIG_CACHEFILES) := cachefiles.o
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
new file mode 100644 (file)
index 0000000..3797e00
--- /dev/null
@@ -0,0 +1,286 @@
+/* Bind and unbind a cache from the filesystem backing it
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/ctype.h>
+#include "internal.h"
+
+static int cachefiles_daemon_add_cache(struct cachefiles_cache *caches);
+
+/*
+ * bind a directory as a cache
+ */
+int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
+{
+       _enter("{%u,%u,%u,%u,%u,%u},%s",
+              cache->frun_percent,
+              cache->fcull_percent,
+              cache->fstop_percent,
+              cache->brun_percent,
+              cache->bcull_percent,
+              cache->bstop_percent,
+              args);
+
+       /* start by checking things over */
+       ASSERT(cache->fstop_percent >= 0 &&
+              cache->fstop_percent < cache->fcull_percent &&
+              cache->fcull_percent < cache->frun_percent &&
+              cache->frun_percent  < 100);
+
+       ASSERT(cache->bstop_percent >= 0 &&
+              cache->bstop_percent < cache->bcull_percent &&
+              cache->bcull_percent < cache->brun_percent &&
+              cache->brun_percent  < 100);
+
+       if (*args) {
+               kerror("'bind' command doesn't take an argument");
+               return -EINVAL;
+       }
+
+       if (!cache->rootdirname) {
+               kerror("No cache directory specified");
+               return -EINVAL;
+       }
+
+       /* don't permit already bound caches to be re-bound */
+       if (test_bit(CACHEFILES_READY, &cache->flags)) {
+               kerror("Cache already bound");
+               return -EBUSY;
+       }
+
+       /* make sure we have copies of the tag and dirname strings */
+       if (!cache->tag) {
+               /* the tag string is released by the fops->release()
+                * function, so we don't release it on error here */
+               cache->tag = kstrdup("CacheFiles", GFP_KERNEL);
+               if (!cache->tag)
+                       return -ENOMEM;
+       }
+
+       /* add the cache */
+       return cachefiles_daemon_add_cache(cache);
+}
+
+/*
+ * add a cache
+ */
+static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
+{
+       struct cachefiles_object *fsdef;
+       struct nameidata nd;
+       struct kstatfs stats;
+       struct dentry *graveyard, *cachedir, *root;
+       const struct cred *saved_cred;
+       int ret;
+
+       _enter("");
+
+       /* we want to work under the module's security ID */
+       ret = cachefiles_get_security_ID(cache);
+       if (ret < 0)
+               return ret;
+
+       cachefiles_begin_secure(cache, &saved_cred);
+
+       /* allocate the root index object */
+       ret = -ENOMEM;
+
+       fsdef = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+       if (!fsdef)
+               goto error_root_object;
+
+       ASSERTCMP(fsdef->backer, ==, NULL);
+
+       atomic_set(&fsdef->usage, 1);
+       fsdef->type = FSCACHE_COOKIE_TYPE_INDEX;
+
+       _debug("- fsdef %p", fsdef);
+
+       /* look up the directory at the root of the cache */
+       memset(&nd, 0, sizeof(nd));
+
+       ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
+       if (ret < 0)
+               goto error_open_root;
+
+       cache->mnt = mntget(nd.path.mnt);
+       root = dget(nd.path.dentry);
+       path_put(&nd.path);
+
+       /* check parameters */
+       ret = -EOPNOTSUPP;
+       if (!root->d_inode ||
+           !root->d_inode->i_op ||
+           !root->d_inode->i_op->lookup ||
+           !root->d_inode->i_op->mkdir ||
+           !root->d_inode->i_op->setxattr ||
+           !root->d_inode->i_op->getxattr ||
+           !root->d_sb ||
+           !root->d_sb->s_op ||
+           !root->d_sb->s_op->statfs ||
+           !root->d_sb->s_op->sync_fs)
+               goto error_unsupported;
+
+       ret = -EROFS;
+       if (root->d_sb->s_flags & MS_RDONLY)
+               goto error_unsupported;
+
+       /* determine the security of the on-disk cache as this governs
+        * security ID of files we create */
+       ret = cachefiles_determine_cache_security(cache, root, &saved_cred);
+       if (ret < 0)
+               goto error_unsupported;
+
+       /* get the cache size and blocksize */
+       ret = vfs_statfs(root, &stats);
+       if (ret < 0)
+               goto error_unsupported;
+
+       ret = -ERANGE;
+       if (stats.f_bsize <= 0)
+               goto error_unsupported;
+
+       ret = -EOPNOTSUPP;
+       if (stats.f_bsize > PAGE_SIZE)
+               goto error_unsupported;
+
+       cache->bsize = stats.f_bsize;
+       cache->bshift = 0;
+       if (stats.f_bsize < PAGE_SIZE)
+               cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize);
+
+       _debug("blksize %u (shift %u)",
+              cache->bsize, cache->bshift);
+
+       _debug("size %llu, avail %llu",
+              (unsigned long long) stats.f_blocks,
+              (unsigned long long) stats.f_bavail);
+
+       /* set up caching limits */
+       do_div(stats.f_files, 100);
+       cache->fstop = stats.f_files * cache->fstop_percent;
+       cache->fcull = stats.f_files * cache->fcull_percent;
+       cache->frun  = stats.f_files * cache->frun_percent;
+
+       _debug("limits {%llu,%llu,%llu} files",
+              (unsigned long long) cache->frun,
+              (unsigned long long) cache->fcull,
+              (unsigned long long) cache->fstop);
+
+       stats.f_blocks >>= cache->bshift;
+       do_div(stats.f_blocks, 100);
+       cache->bstop = stats.f_blocks * cache->bstop_percent;
+       cache->bcull = stats.f_blocks * cache->bcull_percent;
+       cache->brun  = stats.f_blocks * cache->brun_percent;
+
+       _debug("limits {%llu,%llu,%llu} blocks",
+              (unsigned long long) cache->brun,
+              (unsigned long long) cache->bcull,
+              (unsigned long long) cache->bstop);
+
+       /* get the cache directory and check its type */
+       cachedir = cachefiles_get_directory(cache, root, "cache");
+       if (IS_ERR(cachedir)) {
+               ret = PTR_ERR(cachedir);
+               goto error_unsupported;
+       }
+
+       fsdef->dentry = cachedir;
+       fsdef->fscache.cookie = NULL;
+
+       ret = cachefiles_check_object_type(fsdef);
+       if (ret < 0)
+               goto error_unsupported;
+
+       /* get the graveyard directory */
+       graveyard = cachefiles_get_directory(cache, root, "graveyard");
+       if (IS_ERR(graveyard)) {
+               ret = PTR_ERR(graveyard);
+               goto error_unsupported;
+       }
+
+       cache->graveyard = graveyard;
+
+       /* publish the cache */
+       fscache_init_cache(&cache->cache,
+                          &cachefiles_cache_ops,
+                          "%s",
+                          fsdef->dentry->d_sb->s_id);
+
+       fscache_object_init(&fsdef->fscache, NULL, &cache->cache);
+
+       ret = fscache_add_cache(&cache->cache, &fsdef->fscache, cache->tag);
+       if (ret < 0)
+               goto error_add_cache;
+
+       /* done */
+       set_bit(CACHEFILES_READY, &cache->flags);
+       dput(root);
+
+       printk(KERN_INFO "CacheFiles:"
+              " File cache on %s registered\n",
+              cache->cache.identifier);
+
+       /* check how much space the cache has */
+       cachefiles_has_space(cache, 0, 0);
+       cachefiles_end_secure(cache, saved_cred);
+       return 0;
+
+error_add_cache:
+       dput(cache->graveyard);
+       cache->graveyard = NULL;
+error_unsupported:
+       mntput(cache->mnt);
+       cache->mnt = NULL;
+       dput(fsdef->dentry);
+       fsdef->dentry = NULL;
+       dput(root);
+error_open_root:
+       kmem_cache_free(cachefiles_object_jar, fsdef);
+error_root_object:
+       cachefiles_end_secure(cache, saved_cred);
+       kerror("Failed to register: %d", ret);
+       return ret;
+}
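cachefiles_daemon_add_cache turns the configured percentages into absolute limits by first dividing the statfs totals by 100 with do_div (they are 64-bit values) and then scaling by each percentage; files and blocks get the same treatment, with the block count first normalised by bshift. The same arithmetic as a small standalone helper, with plain integers in place of the kstatfs fields (illustrative, not the patch's code):

#include <stdint.h>

struct cache_limits { uint64_t stop, cull, run; };

/* total: f_files, or f_blocks >> bshift; percentages as set via the daemon */
static struct cache_limits compute_limits(uint64_t total, unsigned stop_pct,
                                          unsigned cull_pct, unsigned run_pct)
{
        struct cache_limits l;

        total /= 100;                   /* mirrors do_div(total, 100) */
        l.stop = total * stop_pct;      /* below this, refuse new cache objects */
        l.cull = total * cull_pct;      /* below this, start culling            */
        l.run  = total * run_pct;       /* above this, stop culling             */
        return l;
}
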
+
+/*
+ * unbind a cache on fd release
+ */
+void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
+{
+       _enter("");
+
+       if (test_bit(CACHEFILES_READY, &cache->flags)) {
+               printk(KERN_INFO "CacheFiles:"
+                      " File cache on %s unregistering\n",
+                      cache->cache.identifier);
+
+               fscache_withdraw_cache(&cache->cache);
+       }
+
+       dput(cache->graveyard);
+       mntput(cache->mnt);
+
+       kfree(cache->rootdirname);
+       kfree(cache->secctx);
+       kfree(cache->tag);
+
+       _leave("");
+}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
new file mode 100644 (file)
index 0000000..4618516
--- /dev/null
@@ -0,0 +1,755 @@
+/* Daemon interface
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/ctype.h>
+#include <linux/fs_struct.h>
+#include "internal.h"
+
+static int cachefiles_daemon_open(struct inode *, struct file *);
+static int cachefiles_daemon_release(struct inode *, struct file *);
+static ssize_t cachefiles_daemon_read(struct file *, char __user *, size_t,
+                                     loff_t *);
+static ssize_t cachefiles_daemon_write(struct file *, const char __user *,
+                                      size_t, loff_t *);
+static unsigned int cachefiles_daemon_poll(struct file *,
+                                          struct poll_table_struct *);
+static int cachefiles_daemon_frun(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_fcull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_fstop(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_brun(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bcull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_bstop(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_cull(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_debug(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_dir(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_inuse(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_secctx(struct cachefiles_cache *, char *);
+static int cachefiles_daemon_tag(struct cachefiles_cache *, char *);
+
+static unsigned long cachefiles_open;
+
+const struct file_operations cachefiles_daemon_fops = {
+       .owner          = THIS_MODULE,
+       .open           = cachefiles_daemon_open,
+       .release        = cachefiles_daemon_release,
+       .read           = cachefiles_daemon_read,
+       .write          = cachefiles_daemon_write,
+       .poll           = cachefiles_daemon_poll,
+};
+
+struct cachefiles_daemon_cmd {
+       char name[8];
+       int (*handler)(struct cachefiles_cache *cache, char *args);
+};
+
+static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
+       { "bind",       cachefiles_daemon_bind          },
+       { "brun",       cachefiles_daemon_brun          },
+       { "bcull",      cachefiles_daemon_bcull         },
+       { "bstop",      cachefiles_daemon_bstop         },
+       { "cull",       cachefiles_daemon_cull          },
+       { "debug",      cachefiles_daemon_debug         },
+       { "dir",        cachefiles_daemon_dir           },
+       { "frun",       cachefiles_daemon_frun          },
+       { "fcull",      cachefiles_daemon_fcull         },
+       { "fstop",      cachefiles_daemon_fstop         },
+       { "inuse",      cachefiles_daemon_inuse         },
+       { "secctx",     cachefiles_daemon_secctx        },
+       { "tag",        cachefiles_daemon_tag           },
+       { "",           NULL                            }
+};
+
+
+/*
+ * do various checks
+ */
+static int cachefiles_daemon_open(struct inode *inode, struct file *file)
+{
+       struct cachefiles_cache *cache;
+
+       _enter("");
+
+       /* only the superuser may do this */
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* the cachefiles device may only be open once at a time */
+       if (xchg(&cachefiles_open, 1) == 1)
+               return -EBUSY;
+
+       /* allocate a cache record */
+       cache = kzalloc(sizeof(struct cachefiles_cache), GFP_KERNEL);
+       if (!cache) {
+               cachefiles_open = 0;
+               return -ENOMEM;
+       }
+
+       mutex_init(&cache->daemon_mutex);
+       cache->active_nodes = RB_ROOT;
+       rwlock_init(&cache->active_lock);
+       init_waitqueue_head(&cache->daemon_pollwq);
+
+       /* set default caching limits
+        * - limit at 1% free space and/or free files
+        * - cull below 5% free space and/or free files
+        * - cease culling above 7% free space and/or free files
+        */
+       cache->frun_percent = 7;
+       cache->fcull_percent = 5;
+       cache->fstop_percent = 1;
+       cache->brun_percent = 7;
+       cache->bcull_percent = 5;
+       cache->bstop_percent = 1;
+
+       file->private_data = cache;
+       cache->cachefilesd = file;
+       return 0;
+}
+
+/*
+ * release a cache
+ */
+static int cachefiles_daemon_release(struct inode *inode, struct file *file)
+{
+       struct cachefiles_cache *cache = file->private_data;
+
+       _enter("");
+
+       ASSERT(cache);
+
+       set_bit(CACHEFILES_DEAD, &cache->flags);
+
+       cachefiles_daemon_unbind(cache);
+
+       ASSERT(!cache->active_nodes.rb_node);
+
+       /* clean up the control file interface */
+       cache->cachefilesd = NULL;
+       file->private_data = NULL;
+       cachefiles_open = 0;
+
+       kfree(cache);
+
+       _leave("");
+       return 0;
+}
+
+/*
+ * read the cache state
+ */
+static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
+                                     size_t buflen, loff_t *pos)
+{
+       struct cachefiles_cache *cache = file->private_data;
+       char buffer[256];
+       int n;
+
+       //_enter(",,%zu,", buflen);
+
+       if (!test_bit(CACHEFILES_READY, &cache->flags))
+               return 0;
+
+       /* check how much space the cache has */
+       cachefiles_has_space(cache, 0, 0);
+
+       /* summarise */
+       clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
+
+       n = snprintf(buffer, sizeof(buffer),
+                    "cull=%c"
+                    " frun=%llx"
+                    " fcull=%llx"
+                    " fstop=%llx"
+                    " brun=%llx"
+                    " bcull=%llx"
+                    " bstop=%llx",
+                    test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
+                    (unsigned long long) cache->frun,
+                    (unsigned long long) cache->fcull,
+                    (unsigned long long) cache->fstop,
+                    (unsigned long long) cache->brun,
+                    (unsigned long long) cache->bcull,
+                    (unsigned long long) cache->bstop
+                    );
+
+       if (n > buflen)
+               return -EMSGSIZE;
+
+       if (copy_to_user(_buffer, buffer, n) != 0)
+               return -EFAULT;
+
+       return n;
+}
+
+/*
+ * command the cache
+ */
+static ssize_t cachefiles_daemon_write(struct file *file,
+                                      const char __user *_data,
+                                      size_t datalen,
+                                      loff_t *pos)
+{
+       const struct cachefiles_daemon_cmd *cmd;
+       struct cachefiles_cache *cache = file->private_data;
+       ssize_t ret;
+       char *data, *args, *cp;
+
+       //_enter(",,%zu,", datalen);
+
+       ASSERT(cache);
+
+       if (test_bit(CACHEFILES_DEAD, &cache->flags))
+               return -EIO;
+
+       if (datalen < 0 || datalen > PAGE_SIZE - 1)
+               return -EOPNOTSUPP;
+
+       /* drag the command string into the kernel so we can parse it */
+       data = kmalloc(datalen + 1, GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       ret = -EFAULT;
+       if (copy_from_user(data, _data, datalen) != 0)
+               goto error;
+
+       data[datalen] = '\0';
+
+       ret = -EINVAL;
+       if (memchr(data, '\0', datalen))
+               goto error;
+
+       /* strip any newline */
+       cp = memchr(data, '\n', datalen);
+       if (cp) {
+               if (cp == data)
+                       goto error;
+
+               *cp = '\0';
+       }
+
+       /* parse the command */
+       ret = -EOPNOTSUPP;
+
+       for (args = data; *args; args++)
+               if (isspace(*args))
+                       break;
+       if (*args) {
+               if (args == data)
+                       goto error;
+               *args = '\0';
+               for (args++; isspace(*args); args++)
+                       continue;
+       }
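+       /* at this point 'data' holds just the command word and 'args' points
+        * at its argument string (or at the trailing NUL if there was none) */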
+
+       /* run the appropriate command handler */
+       for (cmd = cachefiles_daemon_cmds; cmd->name[0]; cmd++)
+               if (strcmp(cmd->name, data) == 0)
+                       goto found_command;
+
+error:
+       kfree(data);
+       //_leave(" = %zd", ret);
+       return ret;
+
+found_command:
+       mutex_lock(&cache->daemon_mutex);
+
+       ret = -EIO;
+       if (!test_bit(CACHEFILES_DEAD, &cache->flags))
+               ret = cmd->handler(cache, args);
+
+       mutex_unlock(&cache->daemon_mutex);
+
+       if (ret == 0)
+               ret = datalen;
+       goto error;
+}
+
+/*
+ * poll for culling state
+ * - use POLLOUT to indicate culling state
+ */
+static unsigned int cachefiles_daemon_poll(struct file *file,
+                                          struct poll_table_struct *poll)
+{
+       struct cachefiles_cache *cache = file->private_data;
+       unsigned int mask;
+
+       poll_wait(file, &cache->daemon_pollwq, poll);
+       mask = 0;
+
+       if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
+               mask |= POLLIN;
+
+       if (test_bit(CACHEFILES_CULLING, &cache->flags))
+               mask |= POLLOUT;
+
+       return mask;
+}
+
+/*
+ * give a range error for cache space constraints
+ * - can be tail-called
+ */
+static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
+                                        char *args)
+{
+       kerror("Free space limits must be in range"
+              " 0%%<=stop<cull<run<100%%");
+
+       return -EINVAL;
+}
+
+/*
+ * set the percentage of files at which to stop culling
+ * - command: "frun <N>%"
+ */
+static int cachefiles_daemon_frun(struct cachefiles_cache *cache, char *args)
+{
+       unsigned long frun;
+
+       _enter(",%s", args);
+
+       if (!*args)
+               return -EINVAL;
+
+       frun = simple_strtoul(args, &args, 10);
+       if (args[0] != '%' || args[1] != '\0')
+               return -EINVAL;
+
+       if (frun <= cache->fcull_percent || frun >= 100)
+               return cachefiles_daemon_range_error(cache, args);
+
+       cache->frun_percent = frun;
+       return 0;
+}
+
+/*
+ * set the percentage of files at which to start culling
+ * - command: "fcull <N>%"
+ */
+static int cachefiles_daemon_fcull(struct cachefiles_cache *cache, char *args)
+{
+       unsigned long fcull;
+
+       _enter(",%s", args);
+
+       if (!*args)
+               return -EINVAL;
+
+       fcull = simple_strtoul(args, &args, 10);
+       if (args[0] != '%' || args[1] != '\0')
+               return -EINVAL;
+
+       if (fcull <= cache->fstop_percent || fcull >= cache->frun_percent)
+               return cachefiles_daemon_range_error(cache, args);
+
+       cache->fcull_percent = fcull;
+       return 0;
+}
+
+/*
+ * set the percentage of files at which to stop allocating
+ * - command: "fstop <N>%"
+ */
+static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args)
+{
+       unsigned long fstop;
+
+       _enter(",%s", args);
+
+       if (!*args)
+               return -EINVAL;
+
+       fstop = simple_strtoul(args, &args, 10);
+       if (args[0] != '%' || args[1] != '\0')
+               return -EINVAL;
+
+       if (fstop < 0 || fstop >= cache->fcull_percent)
+               return cachefiles_daemon_range_error(cache, args);
+
+       cache->fstop_percent = fstop;
+       return 0;
+}
+
+/*
+ * set the percentage of blocks at which to stop culling
+ * - command: "brun <N>%"
+ */
+static int cachefiles_daemon_brun(struct cachefiles_cache *cache, char *args)
+{
+       unsigned long brun;
+
+       _enter(",%s", args);
+
+       if (!*args)
+               return -EINVAL;
+
+       brun = simple_strtoul(args, &args, 10);
+       if (args[0] != '%' || args[1] != '\0')
+               return -EINVAL;
+
+       if (brun <= cache->bcull_percent || brun >= 100)
+               return cachefiles_daemon_range_error(cache, args);
+
+       cache->brun_percent = brun;
+       return 0;
+}
+
+/*
+ * set the percentage of blocks at which to start culling
+ * - command: "bcull <N>%"
+ */
+static int cachefiles_daemon_bcull(struct cachefiles_cache *cache, char *args)
+{
+       unsigned long bcull;
+
+       _enter(",%s", args);
+
+       if (!*args)
+               return -EINVAL;
+
+       bcull = simple_strtoul(args, &args, 10);
+       if (args[0] != '%' || args[1] != '\0')
+               return -EINVAL;
+
+       if (bcull <= cache->bstop_percent || bcull >= cache->brun_percent)
+               return cachefiles_daemon_range_error(cache, args);
+
+       cache->bcull_percent = bcull;
+       return 0;
+}
+
+/*
+ * set the percentage of blocks at which to stop allocating
+ * - command: "bstop <N>%"
+ */
+static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args)
+{
+       unsigned long bstop;
+
+       _enter(",%s", args);
+
+       if (!*args)
+               return -EINVAL;
+
+       bstop = simple_strtoul(args, &args, 10);
+       if (args[0] != '%' || args[1] != '\0')
+               return -EINVAL;
+
+       if (bstop < 0 || bstop >= cache->bcull_percent)
+               return cachefiles_daemon_range_error(cache, args);
+
+       cache->bstop_percent = bstop;
+       return 0;
+}
+
+/*
+ * set the cache directory
+ * - command: "dir <name>"
+ */
+static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
+{
+       char *dir;
+
+       _enter(",%s", args);
+
+       if (!*args) {
+               kerror("Empty directory specified");
+               return -EINVAL;
+       }
+
+       if (cache->rootdirname) {
+               kerror("Second cache directory specified");
+               return -EEXIST;
+       }
+
+       dir = kstrdup(args, GFP_KERNEL);
+       if (!dir)
+               return -ENOMEM;
+
+       cache->rootdirname = dir;
+       return 0;
+}
+
+/*
+ * set the cache security context
+ * - command: "secctx <ctx>"
+ */
+static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
+{
+       char *secctx;
+
+       _enter(",%s", args);
+
+       if (!*args) {
+               kerror("Empty security context specified");
+               return -EINVAL;
+       }
+
+       if (cache->secctx) {
+               kerror("Second security context specified");
+               return -EINVAL;
+       }
+
+       secctx = kstrdup(args, GFP_KERNEL);
+       if (!secctx)
+               return -ENOMEM;
+
+       cache->secctx = secctx;
+       return 0;
+}
+
+/*
+ * set the cache tag
+ * - command: "tag <name>"
+ */
+static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
+{
+       char *tag;
+
+       _enter(",%s", args);
+
+       if (!*args) {
+               kerror("Empty tag specified");
+               return -EINVAL;
+       }
+
+       if (cache->tag)
+               return -EEXIST;
+
+       tag = kstrdup(args, GFP_KERNEL);
+       if (!tag)
+               return -ENOMEM;
+
+       cache->tag = tag;
+       return 0;
+}
+
+/*
+ * request that a node in the cache be culled from the current working directory
+ * - command: "cull <name>"
+ */
+static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
+{
+       struct fs_struct *fs;
+       struct dentry *dir;
+       const struct cred *saved_cred;
+       int ret;
+
+       _enter(",%s", args);
+
+       if (strchr(args, '/'))
+               goto inval;
+
+       if (!test_bit(CACHEFILES_READY, &cache->flags)) {
+               kerror("cull applied to unready cache");
+               return -EIO;
+       }
+
+       if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+               kerror("cull applied to dead cache");
+               return -EIO;
+       }
+
+       /* extract the directory dentry from the cwd */
+       fs = current->fs;
+       read_lock(&fs->lock);
+       dir = dget(fs->pwd.dentry);
+       read_unlock(&fs->lock);
+
+       if (!S_ISDIR(dir->d_inode->i_mode))
+               goto notdir;
+
+       cachefiles_begin_secure(cache, &saved_cred);
+       ret = cachefiles_cull(cache, dir, args);
+       cachefiles_end_secure(cache, saved_cred);
+
+       dput(dir);
+       _leave(" = %d", ret);
+       return ret;
+
+notdir:
+       dput(dir);
+       kerror("cull command requires dirfd to be a directory");
+       return -ENOTDIR;
+
+inval:
+       kerror("cull command requires dirfd and filename");
+       return -EINVAL;
+}
+
+/*
+ * set debugging mode
+ * - command: "debug <mask>"
+ */
+static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
+{
+       unsigned long mask;
+
+       _enter(",%s", args);
+
+       mask = simple_strtoul(args, &args, 0);
+       if (args[0] != '\0')
+               goto inval;
+
+       cachefiles_debug = mask;
+       _leave(" = 0");
+       return 0;
+
+inval:
+       kerror("debug command requires mask");
+       return -EINVAL;
+}
+
+/*
+ * find out whether an object in the current working directory is in use or not
+ * - command: "inuse <name>"
+ */
+static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
+{
+       struct fs_struct *fs;
+       struct dentry *dir;
+       const struct cred *saved_cred;
+       int ret;
+
+       //_enter(",%s", args);
+
+       if (strchr(args, '/'))
+               goto inval;
+
+       if (!test_bit(CACHEFILES_READY, &cache->flags)) {
+               kerror("inuse applied to unready cache");
+               return -EIO;
+       }
+
+       if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+               kerror("inuse applied to dead cache");
+               return -EIO;
+       }
+
+       /* extract the directory dentry from the cwd */
+       fs = current->fs;
+       read_lock(&fs->lock);
+       dir = dget(fs->pwd.dentry);
+       read_unlock(&fs->lock);
+
+       if (!S_ISDIR(dir->d_inode->i_mode))
+               goto notdir;
+
+       cachefiles_begin_secure(cache, &saved_cred);
+       ret = cachefiles_check_in_use(cache, dir, args);
+       cachefiles_end_secure(cache, saved_cred);
+
+       dput(dir);
+       //_leave(" = %d", ret);
+       return ret;
+
+notdir:
+       dput(dir);
+       kerror("inuse command requires dirfd to be a directory");
+       return -ENOTDIR;
+
+inval:
+       kerror("inuse command requires dirfd and filename");
+       return -EINVAL;
+}
+
+/*
+ * see if we have space for a number of pages and/or a number of files in the
+ * cache
+ */
+int cachefiles_has_space(struct cachefiles_cache *cache,
+                        unsigned fnr, unsigned bnr)
+{
+       struct kstatfs stats;
+       int ret;
+
+       //_enter("{%llu,%llu,%llu,%llu,%llu,%llu},%u,%u",
+       //       (unsigned long long) cache->frun,
+       //       (unsigned long long) cache->fcull,
+       //       (unsigned long long) cache->fstop,
+       //       (unsigned long long) cache->brun,
+       //       (unsigned long long) cache->bcull,
+       //       (unsigned long long) cache->bstop,
+       //       fnr, bnr);
+
+       /* find out how many pages of blockdev are available */
+       memset(&stats, 0, sizeof(stats));
+
+       ret = vfs_statfs(cache->mnt->mnt_root, &stats);
+       if (ret < 0) {
+               if (ret == -EIO)
+                       cachefiles_io_error(cache, "statfs failed");
+               _leave(" = %d", ret);
+               return ret;
+       }
+
+       stats.f_bavail >>= cache->bshift;
+
+       //_debug("avail %llu,%llu",
+       //       (unsigned long long) stats.f_ffree,
+       //       (unsigned long long) stats.f_bavail);
+
+       /* see if there is sufficient space */
+       if (stats.f_ffree > fnr)
+               stats.f_ffree -= fnr;
+       else
+               stats.f_ffree = 0;
+
+       if (stats.f_bavail > bnr)
+               stats.f_bavail -= bnr;
+       else
+               stats.f_bavail = 0;
+
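+       /* compare the projected free counts against the limits: below the
+        * stop limits the request is refused outright, below the cull limits
+        * it succeeds but culling is asked for */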
+       ret = -ENOBUFS;
+       if (stats.f_ffree < cache->fstop ||
+           stats.f_bavail < cache->bstop)
+               goto begin_cull;
+
+       ret = 0;
+       if (stats.f_ffree < cache->fcull ||
+           stats.f_bavail < cache->bcull)
+               goto begin_cull;
+
+       if (test_bit(CACHEFILES_CULLING, &cache->flags) &&
+           stats.f_ffree >= cache->frun &&
+           stats.f_bavail >= cache->brun &&
+           test_and_clear_bit(CACHEFILES_CULLING, &cache->flags)
+           ) {
+               _debug("cease culling");
+               cachefiles_state_changed(cache);
+       }
+
+       //_leave(" = 0");
+       return 0;
+
+begin_cull:
+       if (!test_and_set_bit(CACHEFILES_CULLING, &cache->flags)) {
+               _debug("### CULL CACHE ###");
+               cachefiles_state_changed(cache);
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
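
For context only (this is not part of the commit): the control interface above is line-oriented, one command per write() to the misc device registered in fs/cachefiles/main.c. A rough userspace sketch of how a manager such as cachefilesd might configure and bind a cache is shown below; it assumes the device node appears as /dev/cachefiles, that /var/fscache is a prepared cache directory, and that the caller has CAP_SYS_ADMIN. Error handling is abbreviated and the actual culling loop is omitted.

/* illustrative sketch only - not part of this patch */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int cmd(int fd, const char *s)
{
        /* each command is issued as a single write(); the handler's return
         * value comes back as the write() result */
        return write(fd, s, strlen(s)) < 0 ? -1 : 0;
}

int main(void)
{
        struct pollfd p;
        int fd;

        fd = open("/dev/cachefiles", O_RDWR);
        if (fd < 0) {
                perror("open /dev/cachefiles");
                return 1;
        }

        cmd(fd, "dir /var/fscache");    /* hypothetical cache root */
        cmd(fd, "tag mycache");
        cmd(fd, "brun 10%");            /* cease culling above 10% free blocks */
        cmd(fd, "bcull 7%");            /* start culling below 7% */
        cmd(fd, "bstop 3%");            /* stop allocating below 3% */
        cmd(fd, "bind");                /* bring the cache online */

        /* POLLOUT is raised while the kernel wants objects culled */
        p.fd = fd;
        p.events = POLLOUT | POLLIN;
        poll(&p, 1, -1);

        close(fd);
        return 0;
}
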
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
new file mode 100644 (file)
index 0000000..1e96234
--- /dev/null
@@ -0,0 +1,449 @@
+/* FS-Cache interface to CacheFiles
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
+
+struct cachefiles_lookup_data {
+       struct cachefiles_xattr *auxdata;       /* auxiliary data */
+       char                    *key;           /* key path */
+};
+
+static int cachefiles_attr_changed(struct fscache_object *_object);
+
+/*
+ * allocate an object record for a cookie lookup and prepare the lookup data
+ */
+static struct fscache_object *cachefiles_alloc_object(
+       struct fscache_cache *_cache,
+       struct fscache_cookie *cookie)
+{
+       struct cachefiles_lookup_data *lookup_data;
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+       struct cachefiles_xattr *auxdata;
+       unsigned keylen, auxlen;
+       void *buffer;
+       char *key;
+
+       cache = container_of(_cache, struct cachefiles_cache, cache);
+
+       _enter("{%s},%p,", cache->cache.identifier, cookie);
+
+       lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
+       if (!lookup_data)
+               goto nomem_lookup_data;
+
+       /* create a new object record and a temporary leaf image */
+       object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+       if (!object)
+               goto nomem_object;
+
+       ASSERTCMP(object->backer, ==, NULL);
+
+       BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+       atomic_set(&object->usage, 1);
+
+       fscache_object_init(&object->fscache, cookie, &cache->cache);
+
+       object->type = cookie->def->type;
+
+       /* get hold of the raw key
+        * - stick the length on the front and leave space on the back for the
+        *   encoder
+        */
+       buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
+       if (!buffer)
+               goto nomem_buffer;
+
+       keylen = cookie->def->get_key(cookie->netfs_data, buffer + 2, 512);
+       ASSERTCMP(keylen, <, 512);
+
+       *(uint16_t *)buffer = keylen;
+       ((char *)buffer)[keylen + 2] = 0;
+       ((char *)buffer)[keylen + 3] = 0;
+       ((char *)buffer)[keylen + 4] = 0;
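+       /* the trailing NULs pad the buffer so that cachefiles_cook_key() can
+        * consume it in whole 3-byte groups without reading past the key */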
+
+       /* turn the raw key into something we can work with as a filename */
+       key = cachefiles_cook_key(buffer, keylen + 2, object->type);
+       if (!key)
+               goto nomem_key;
+
+       /* get hold of the auxiliary data and prepend the object type */
+       auxdata = buffer;
+       auxlen = 0;
+       if (cookie->def->get_aux) {
+               auxlen = cookie->def->get_aux(cookie->netfs_data,
+                                             auxdata->data, 511);
+               ASSERTCMP(auxlen, <, 511);
+       }
+
+       auxdata->len = auxlen + 1;
+       auxdata->type = cookie->def->type;
+
+       lookup_data->auxdata = auxdata;
+       lookup_data->key = key;
+       object->lookup_data = lookup_data;
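+       /* the lookup data is discarded again in cachefiles_lookup_complete(),
+        * or in cachefiles_put_object() if the lookup never completes */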
+
+       _leave(" = %p [%p]", &object->fscache, lookup_data);
+       return &object->fscache;
+
+nomem_key:
+       kfree(buffer);
+nomem_buffer:
+       BUG_ON(test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+       kmem_cache_free(cachefiles_object_jar, object);
+       fscache_object_destroyed(&cache->cache);
+nomem_object:
+       kfree(lookup_data);
+nomem_lookup_data:
+       _leave(" = -ENOMEM");
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * attempt to look up the nominated node in this cache
+ */
+static void cachefiles_lookup_object(struct fscache_object *_object)
+{
+       struct cachefiles_lookup_data *lookup_data;
+       struct cachefiles_object *parent, *object;
+       struct cachefiles_cache *cache;
+       const struct cred *saved_cred;
+       int ret;
+
+       _enter("{OBJ%x}", _object->debug_id);
+
+       cache = container_of(_object->cache, struct cachefiles_cache, cache);
+       parent = container_of(_object->parent,
+                             struct cachefiles_object, fscache);
+       object = container_of(_object, struct cachefiles_object, fscache);
+       lookup_data = object->lookup_data;
+
+       ASSERTCMP(lookup_data, !=, NULL);
+
+       /* look up the key, creating any missing bits */
+       cachefiles_begin_secure(cache, &saved_cred);
+       ret = cachefiles_walk_to_object(parent, object,
+                                       lookup_data->key,
+                                       lookup_data->auxdata);
+       cachefiles_end_secure(cache, saved_cred);
+
+       /* polish off by setting the attributes of non-index files */
+       if (ret == 0 &&
+           object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
+               cachefiles_attr_changed(&object->fscache);
+
+       if (ret < 0) {
+               printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
+                      ret);
+               fscache_object_lookup_error(&object->fscache);
+       }
+
+       _leave(" [%d]", ret);
+}
+
+/*
+ * indication of lookup completion
+ */
+static void cachefiles_lookup_complete(struct fscache_object *_object)
+{
+       struct cachefiles_object *object;
+
+       object = container_of(_object, struct cachefiles_object, fscache);
+
+       _enter("{OBJ%x,%p}", object->fscache.debug_id, object->lookup_data);
+
+       if (object->lookup_data) {
+               kfree(object->lookup_data->key);
+               kfree(object->lookup_data->auxdata);
+               kfree(object->lookup_data);
+               object->lookup_data = NULL;
+       }
+}
+
+/*
+ * increment the usage count on an inode object (may fail if unmounting)
+ */
+static
+struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
+{
+       struct cachefiles_object *object =
+               container_of(_object, struct cachefiles_object, fscache);
+
+       _enter("{OBJ%x,%d}", _object->debug_id, atomic_read(&object->usage));
+
+#ifdef CACHEFILES_DEBUG_SLAB
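+       /* 0x6b is the slab POISON_FREE pattern - trap use of a freed object */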
+       ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+       atomic_inc(&object->usage);
+       return &object->fscache;
+}
+
+/*
+ * update the auxiliary data for an object on disk
+ */
+static void cachefiles_update_object(struct fscache_object *_object)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_xattr *auxdata;
+       struct cachefiles_cache *cache;
+       struct fscache_cookie *cookie;
+       const struct cred *saved_cred;
+       unsigned auxlen;
+
+       _enter("{OBJ%x}", _object->debug_id);
+
+       object = container_of(_object, struct cachefiles_object, fscache);
+       cache = container_of(object->fscache.cache, struct cachefiles_cache,
+                            cache);
+       cookie = object->fscache.cookie;
+
+       if (!cookie->def->get_aux) {
+               _leave(" [no aux]");
+               return;
+       }
+
+       auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
+       if (!auxdata) {
+               _leave(" [nomem]");
+               return;
+       }
+
+       auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
+       ASSERTCMP(auxlen, <, 511);
+
+       auxdata->len = auxlen + 1;
+       auxdata->type = cookie->def->type;
+
+       cachefiles_begin_secure(cache, &saved_cred);
+       cachefiles_update_object_xattr(object, auxdata);
+       cachefiles_end_secure(cache, saved_cred);
+       kfree(auxdata);
+       _leave("");
+}
+
+/*
+ * discard the resources pinned by an object and effect retirement if
+ * requested
+ */
+static void cachefiles_drop_object(struct fscache_object *_object)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+       const struct cred *saved_cred;
+
+       ASSERT(_object);
+
+       object = container_of(_object, struct cachefiles_object, fscache);
+
+       _enter("{OBJ%x,%d}",
+              object->fscache.debug_id, atomic_read(&object->usage));
+
+       cache = container_of(object->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+#ifdef CACHEFILES_DEBUG_SLAB
+       ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+       /* delete retired objects */
+       if (object->fscache.state == FSCACHE_OBJECT_RECYCLING &&
+           _object != cache->cache.fsdef
+           ) {
+               _debug("- retire object OBJ%x", object->fscache.debug_id);
+               cachefiles_begin_secure(cache, &saved_cred);
+               cachefiles_delete_object(cache, object);
+               cachefiles_end_secure(cache, saved_cred);
+       }
+
+       /* close the filesystem stuff attached to the object */
+       if (object->backer != object->dentry)
+               dput(object->backer);
+       object->backer = NULL;
+
+       /* note that the object is now inactive */
+       if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
+               write_lock(&cache->active_lock);
+               if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
+                                       &object->flags))
+                       BUG();
+               rb_erase(&object->active_node, &cache->active_nodes);
+               wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+               write_unlock(&cache->active_lock);
+       }
+
+       dput(object->dentry);
+       object->dentry = NULL;
+
+       _leave("");
+}
+
+/*
+ * dispose of a reference to an object
+ */
+static void cachefiles_put_object(struct fscache_object *_object)
+{
+       struct cachefiles_object *object;
+       struct fscache_cache *cache;
+
+       ASSERT(_object);
+
+       object = container_of(_object, struct cachefiles_object, fscache);
+
+       _enter("{OBJ%x,%d}",
+              object->fscache.debug_id, atomic_read(&object->usage));
+
+#ifdef CACHEFILES_DEBUG_SLAB
+       ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
+#endif
+
+       ASSERTIFCMP(object->fscache.parent,
+                   object->fscache.parent->n_children, >, 0);
+
+       if (atomic_dec_and_test(&object->usage)) {
+               _debug("- kill object OBJ%x", object->fscache.debug_id);
+
+               ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags));
+               ASSERTCMP(object->fscache.parent, ==, NULL);
+               ASSERTCMP(object->backer, ==, NULL);
+               ASSERTCMP(object->dentry, ==, NULL);
+               ASSERTCMP(object->fscache.n_ops, ==, 0);
+               ASSERTCMP(object->fscache.n_children, ==, 0);
+
+               if (object->lookup_data) {
+                       kfree(object->lookup_data->key);
+                       kfree(object->lookup_data->auxdata);
+                       kfree(object->lookup_data);
+                       object->lookup_data = NULL;
+               }
+
+               cache = object->fscache.cache;
+               kmem_cache_free(cachefiles_object_jar, object);
+               fscache_object_destroyed(cache);
+       }
+
+       _leave("");
+}
+
+/*
+ * sync a cache
+ */
+static void cachefiles_sync_cache(struct fscache_cache *_cache)
+{
+       struct cachefiles_cache *cache;
+       const struct cred *saved_cred;
+       int ret;
+
+       _enter("%p", _cache);
+
+       cache = container_of(_cache, struct cachefiles_cache, cache);
+
+       /* make sure all pages pinned by operations on behalf of the netfs are
+        * written to disc */
+       cachefiles_begin_secure(cache, &saved_cred);
+       ret = fsync_super(cache->mnt->mnt_sb);
+       cachefiles_end_secure(cache, saved_cred);
+
+       if (ret == -EIO)
+               cachefiles_io_error(cache,
+                                   "Attempt to sync backing fs superblock"
+                                   " returned error %d",
+                                   ret);
+}
+
+/*
+ * notification that the attributes on an object have changed
+ * - called with reads/writes excluded by FS-Cache
+ */
+static int cachefiles_attr_changed(struct fscache_object *_object)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+       const struct cred *saved_cred;
+       struct iattr newattrs;
+       uint64_t ni_size;
+       loff_t oi_size;
+       int ret;
+
+       _object->cookie->def->get_attr(_object->cookie->netfs_data, &ni_size);
+
+       _enter("{OBJ%x},[%llu]",
+              _object->debug_id, (unsigned long long) ni_size);
+
+       object = container_of(_object, struct cachefiles_object, fscache);
+       cache = container_of(object->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+       if (ni_size == object->i_size)
+               return 0;
+
+       if (!object->backer)
+               return -ENOBUFS;
+
+       ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+       fscache_set_store_limit(&object->fscache, ni_size);
+
+       oi_size = i_size_read(object->backer->d_inode);
+       if (oi_size == ni_size)
+               return 0;
+
+       newattrs.ia_size = ni_size;
+       newattrs.ia_valid = ATTR_SIZE;
+
+       cachefiles_begin_secure(cache, &saved_cred);
+       mutex_lock(&object->backer->d_inode->i_mutex);
+       ret = notify_change(object->backer, &newattrs);
+       mutex_unlock(&object->backer->d_inode->i_mutex);
+       cachefiles_end_secure(cache, saved_cred);
+
+       if (ret == -EIO) {
+               fscache_set_store_limit(&object->fscache, 0);
+               cachefiles_io_error_obj(object, "Size set failed");
+               ret = -ENOBUFS;
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * dissociate a cache from all the pages it was backing
+ */
+static void cachefiles_dissociate_pages(struct fscache_cache *cache)
+{
+       _enter("");
+}
+
+const struct fscache_cache_ops cachefiles_cache_ops = {
+       .name                   = "cachefiles",
+       .alloc_object           = cachefiles_alloc_object,
+       .lookup_object          = cachefiles_lookup_object,
+       .lookup_complete        = cachefiles_lookup_complete,
+       .grab_object            = cachefiles_grab_object,
+       .update_object          = cachefiles_update_object,
+       .drop_object            = cachefiles_drop_object,
+       .put_object             = cachefiles_put_object,
+       .sync_cache             = cachefiles_sync_cache,
+       .attr_changed           = cachefiles_attr_changed,
+       .read_or_alloc_page     = cachefiles_read_or_alloc_page,
+       .read_or_alloc_pages    = cachefiles_read_or_alloc_pages,
+       .allocate_page          = cachefiles_allocate_page,
+       .allocate_pages         = cachefiles_allocate_pages,
+       .write_page             = cachefiles_write_page,
+       .uncache_page           = cachefiles_uncache_page,
+       .dissociate_pages       = cachefiles_dissociate_pages,
+};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
new file mode 100644 (file)
index 0000000..19218e1
--- /dev/null
@@ -0,0 +1,360 @@
+/* General netfs cache on cache files internal defs
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fscache-cache.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+
+struct cachefiles_cache;
+struct cachefiles_object;
+
+extern unsigned cachefiles_debug;
+#define CACHEFILES_DEBUG_KENTER        1
+#define CACHEFILES_DEBUG_KLEAVE        2
+#define CACHEFILES_DEBUG_KDEBUG        4
+
+/*
+ * node records
+ */
+struct cachefiles_object {
+       struct fscache_object           fscache;        /* fscache handle */
+       struct cachefiles_lookup_data   *lookup_data;   /* cached lookup data */
+       struct dentry                   *dentry;        /* the file/dir representing this object */
+       struct dentry                   *backer;        /* backing file */
+       loff_t                          i_size;         /* object size */
+       unsigned long                   flags;
+#define CACHEFILES_OBJECT_ACTIVE       0               /* T if marked active */
+       atomic_t                        usage;          /* object usage count */
+       uint8_t                         type;           /* object type */
+       uint8_t                         new;            /* T if object new */
+       spinlock_t                      work_lock;
+       struct rb_node                  active_node;    /* link in active tree (dentry is key) */
+};
+
+extern struct kmem_cache *cachefiles_object_jar;
+
+/*
+ * Cache files cache definition
+ */
+struct cachefiles_cache {
+       struct fscache_cache            cache;          /* FS-Cache record */
+       struct vfsmount                 *mnt;           /* mountpoint holding the cache */
+       struct dentry                   *graveyard;     /* directory into which dead objects go */
+       struct file                     *cachefilesd;   /* manager daemon handle */
+       const struct cred               *cache_cred;    /* security override for accessing cache */
+       struct mutex                    daemon_mutex;   /* command serialisation mutex */
+       wait_queue_head_t               daemon_pollwq;  /* poll waitqueue for daemon */
+       struct rb_root                  active_nodes;   /* active nodes (can't be culled) */
+       rwlock_t                        active_lock;    /* lock for active_nodes */
+       atomic_t                        gravecounter;   /* graveyard uniquifier */
+       unsigned                        frun_percent;   /* when to stop culling (% files) */
+       unsigned                        fcull_percent;  /* when to start culling (% files) */
+       unsigned                        fstop_percent;  /* when to stop allocating (% files) */
+       unsigned                        brun_percent;   /* when to stop culling (% blocks) */
+       unsigned                        bcull_percent;  /* when to start culling (% blocks) */
+       unsigned                        bstop_percent;  /* when to stop allocating (% blocks) */
+       unsigned                        bsize;          /* cache's block size */
+       unsigned                        bshift;         /* min(ilog2(PAGE_SIZE / bsize), 0) */
+       uint64_t                        frun;           /* when to stop culling */
+       uint64_t                        fcull;          /* when to start culling */
+       uint64_t                        fstop;          /* when to stop allocating */
+       sector_t                        brun;           /* when to stop culling */
+       sector_t                        bcull;          /* when to start culling */
+       sector_t                        bstop;          /* when to stop allocating */
+       unsigned long                   flags;
+#define CACHEFILES_READY               0       /* T if cache prepared */
+#define CACHEFILES_DEAD                        1       /* T if cache dead */
+#define CACHEFILES_CULLING             2       /* T if cull engaged */
+#define CACHEFILES_STATE_CHANGED       3       /* T if state changed (poll trigger) */
+       char                            *rootdirname;   /* name of cache root directory */
+       char                            *secctx;        /* LSM security context */
+       char                            *tag;           /* cache binding tag */
+};
+
+/*
+ * backing file read tracking
+ */
+struct cachefiles_one_read {
+       wait_queue_t                    monitor;        /* link into monitored waitqueue */
+       struct page                     *back_page;     /* backing file page we're waiting for */
+       struct page                     *netfs_page;    /* netfs page we're going to fill */
+       struct fscache_retrieval        *op;            /* retrieval op covering this */
+       struct list_head                op_link;        /* link in op's todo list */
+};
+
+/*
+ * backing file write tracking
+ */
+struct cachefiles_one_write {
+       struct page                     *netfs_page;    /* netfs page to copy */
+       struct cachefiles_object        *object;
+       struct list_head                obj_link;       /* link in object's lists */
+       fscache_rw_complete_t           end_io_func;
+       void                            *context;
+};
+
+/*
+ * auxiliary data xattr buffer
+ */
+struct cachefiles_xattr {
+       uint16_t                        len;
+       uint8_t                         type;
+       uint8_t                         data[];
+};
+
+/*
+ * note change of state for daemon
+ */
+static inline void cachefiles_state_changed(struct cachefiles_cache *cache)
+{
+       set_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
+       wake_up_all(&cache->daemon_pollwq);
+}
+
+/*
+ * cf-bind.c
+ */
+extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args);
+extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache);
+
+/*
+ * cf-daemon.c
+ */
+extern const struct file_operations cachefiles_daemon_fops;
+
+extern int cachefiles_has_space(struct cachefiles_cache *cache,
+                               unsigned fnr, unsigned bnr);
+
+/*
+ * cf-interface.c
+ */
+extern const struct fscache_cache_ops cachefiles_cache_ops;
+
+/*
+ * cf-key.c
+ */
+extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
+
+/*
+ * cf-namei.c
+ */
+extern int cachefiles_delete_object(struct cachefiles_cache *cache,
+                                   struct cachefiles_object *object);
+extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
+                                    struct cachefiles_object *object,
+                                    const char *key,
+                                    struct cachefiles_xattr *auxdata);
+extern struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+                                              struct dentry *dir,
+                                              const char *name);
+
+extern int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
+                          char *filename);
+
+extern int cachefiles_check_in_use(struct cachefiles_cache *cache,
+                                  struct dentry *dir, char *filename);
+
+/*
+ * cf-proc.c
+ */
+#ifdef CONFIG_CACHEFILES_HISTOGRAM
+extern atomic_t cachefiles_lookup_histogram[HZ];
+extern atomic_t cachefiles_mkdir_histogram[HZ];
+extern atomic_t cachefiles_create_histogram[HZ];
+
+extern int __init cachefiles_proc_init(void);
+extern void cachefiles_proc_cleanup(void);
+static inline
+void cachefiles_hist(atomic_t histogram[], unsigned long start_jif)
+{
+       unsigned long jif = jiffies - start_jif;
+       if (jif >= HZ)
+               jif = HZ - 1;
+       atomic_inc(&histogram[jif]);
+}
+
+#else
+#define cachefiles_proc_init()         (0)
+#define cachefiles_proc_cleanup()      do {} while (0)
+#define cachefiles_hist(hist, start_jif) do {} while (0)
+#endif
+
+/*
+ * cf-rdwr.c
+ */
+extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *,
+                                        struct page *, gfp_t);
+extern int cachefiles_read_or_alloc_pages(struct fscache_retrieval *,
+                                         struct list_head *, unsigned *,
+                                         gfp_t);
+extern int cachefiles_allocate_page(struct fscache_retrieval *, struct page *,
+                                   gfp_t);
+extern int cachefiles_allocate_pages(struct fscache_retrieval *,
+                                    struct list_head *, unsigned *, gfp_t);
+extern int cachefiles_write_page(struct fscache_storage *, struct page *);
+extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
+
+/*
+ * cf-security.c
+ */
+extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
+extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
+                                              struct dentry *root,
+                                              const struct cred **_saved_cred);
+
+static inline void cachefiles_begin_secure(struct cachefiles_cache *cache,
+                                          const struct cred **_saved_cred)
+{
+       *_saved_cred = override_creds(cache->cache_cred);
+}
+
+static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
+                                        const struct cred *saved_cred)
+{
+       revert_creds(saved_cred);
+}
+
+/*
+ * cf-xattr.c
+ */
+extern int cachefiles_check_object_type(struct cachefiles_object *object);
+extern int cachefiles_set_object_xattr(struct cachefiles_object *object,
+                                      struct cachefiles_xattr *auxdata);
+extern int cachefiles_update_object_xattr(struct cachefiles_object *object,
+                                         struct cachefiles_xattr *auxdata);
+extern int cachefiles_check_object_xattr(struct cachefiles_object *object,
+                                        struct cachefiles_xattr *auxdata);
+extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+                                         struct dentry *dentry);
+
+
+/*
+ * error handling
+ */
+#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
+
+#define cachefiles_io_error(___cache, FMT, ...)                \
+do {                                                   \
+       kerror("I/O Error: " FMT, ##__VA_ARGS__);       \
+       fscache_io_error(&(___cache)->cache);           \
+       set_bit(CACHEFILES_DEAD, &(___cache)->flags);   \
+} while (0)
+
+#define cachefiles_io_error_obj(object, FMT, ...)                      \
+do {                                                                   \
+       struct cachefiles_cache *___cache;                              \
+                                                                       \
+       ___cache = container_of((object)->fscache.cache,                \
+                               struct cachefiles_cache, cache);        \
+       cachefiles_io_error(___cache, FMT, ##__VA_ARGS__);              \
+} while (0)
+
+
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+       printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline void _dbprintk(const char *fmt, ...)
+       __attribute__((format(printf, 1, 2)));
+static inline void _dbprintk(const char *fmt, ...)
+{
+}
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+
+#if defined(__KDEBUG)
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_CACHEFILES_DEBUG)
+#define _enter(FMT, ...)                               \
+do {                                                   \
+       if (cachefiles_debug & CACHEFILES_DEBUG_KENTER) \
+               kenter(FMT, ##__VA_ARGS__);             \
+} while (0)
+
+#define _leave(FMT, ...)                               \
+do {                                                   \
+       if (cachefiles_debug & CACHEFILES_DEBUG_KLEAVE) \
+               kleave(FMT, ##__VA_ARGS__);             \
+} while (0)
+
+#define _debug(FMT, ...)                               \
+do {                                                   \
+       if (cachefiles_debug & CACHEFILES_DEBUG_KDEBUG) \
+               kdebug(FMT, ##__VA_ARGS__);             \
+} while (0)
+
+#else
+#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#endif
+
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X)                                                      \
+do {                                                                   \
+       if (unlikely(!(X))) {                                           \
+               printk(KERN_ERR "\n");                                  \
+               printk(KERN_ERR "CacheFiles: Assertion failed\n");      \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y)                                            \
+do {                                                                   \
+       if (unlikely(!((X) OP (Y)))) {                                  \
+               printk(KERN_ERR "\n");                                  \
+               printk(KERN_ERR "CacheFiles: Assertion failed\n");      \
+               printk(KERN_ERR "%lx " #OP " %lx is false\n",           \
+                      (unsigned long)(X), (unsigned long)(Y));         \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+#define ASSERTIF(C, X)                                                 \
+do {                                                                   \
+       if (unlikely((C) && !(X))) {                                    \
+               printk(KERN_ERR "\n");                                  \
+               printk(KERN_ERR "CacheFiles: Assertion failed\n");      \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y)                                       \
+do {                                                                   \
+       if (unlikely((C) && !((X) OP (Y)))) {                           \
+               printk(KERN_ERR "\n");                                  \
+               printk(KERN_ERR "CacheFiles: Assertion failed\n");      \
+               printk(KERN_ERR "%lx " #OP " %lx is false\n",           \
+                      (unsigned long)(X), (unsigned long)(Y));         \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+#else
+
+#define ASSERT(X)                      do {} while (0)
+#define ASSERTCMP(X, OP, Y)            do {} while (0)
+#define ASSERTIF(C, X)                 do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y)       do {} while (0)
+
+#endif
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
new file mode 100644 (file)
index 0000000..81b8b2b
--- /dev/null
@@ -0,0 +1,159 @@
+/* Key to pathname encoder
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include "internal.h"
+
+static const char cachefiles_charmap[64] =
+       "0123456789"                    /* 0 - 9 */
+       "abcdefghijklmnopqrstuvwxyz"    /* 10 - 35 */
+       "ABCDEFGHIJKLMNOPQRSTUVWXYZ"    /* 36 - 61 */
+       "_-"                            /* 62 - 63 */
+       ;
+
+static const char cachefiles_filecharmap[256] = {
+       /* we skip space and tab and control chars */
+       [33 ... 46] = 1,                /* '!' -> '.' */
+       /* we skip '/' as it's significant to pathwalk */
+       [48 ... 127] = 1,               /* '0' -> '~' */
+};
+
+/*
+ * turn the raw key into something cooked
+ * - the raw key should include the length in the two bytes at the front
+ * - the key may be up to 514 bytes in length (including the length word)
+ *   - "base64" encode the strange keys, mapping 3 bytes of raw to four of
+ *     cooked
+ *   - need to cut the cooked key into 252 char lengths (189 raw bytes)
+ */
+char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
+{
+       unsigned char csum, ch;
+       unsigned int acc;
+       char *key;
+       int loop, len, max, seg, mark, print;
+
+       _enter(",%d", keylen);
+
+       BUG_ON(keylen < 2 || keylen > 514);
+
+       csum = raw[0] + raw[1];
+       print = 1;
+       for (loop = 2; loop < keylen; loop++) {
+               ch = raw[loop];
+               csum += ch;
+               print &= cachefiles_filecharmap[ch];
+       }
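+       /* 'print' remains set only if every key byte maps to a character that
+        * is safe to use directly in a filename */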
+
+       if (print) {
+               /* if the path is usable ASCII, then we render it directly */
+               max = keylen - 2;
+               max += 2;       /* two base64'd length chars on the front */
+               max += 5;       /* @checksum/M */
+               max += 3 * 2;   /* maximum number of segment dividers (".../M")
+                                * is ((514 + 251) / 252) = 3
+                                */
+               max += 1;       /* NUL on end */
+       } else {
+               /* calculate the maximum length of the cooked key */
+               keylen = (keylen + 2) / 3;
+
+               max = keylen * 4;
+               max += 5;       /* @checksum/M */
+               max += 3 * 2;   /* maximum number of segment dividers (".../M")
+                                * is ((514 + 188) / 189) = 3
+                                */
+               max += 1;       /* NUL on end */
+       }
+
+       max += 1;       /* 2nd NUL on end */
+
+       _debug("max: %d", max);
+
+       key = kmalloc(max, GFP_KERNEL);
+       if (!key)
+               return NULL;
+
+       len = 0;
+
+       /* build the cooked key */
+       sprintf(key, "@%02x%c+", (unsigned) csum, 0);
+       len = 5;
+       mark = len - 1;
+
+       if (print) {
+               acc = *(uint16_t *) raw;
+               raw += 2;
+
+               key[len + 1] = cachefiles_charmap[acc & 63];
+               acc >>= 6;
+               key[len] = cachefiles_charmap[acc & 63];
+               len += 2;
+
+               seg = 250;
+               for (loop = keylen; loop > 0; loop--) {
+                       if (seg <= 0) {
+                               key[len++] = '\0';
+                               mark = len;
+                               key[len++] = '+';
+                               seg = 252;
+                       }
+
+                       key[len++] = *raw++;
+                       ASSERT(len < max);
+               }
+
+               switch (type) {
+               case FSCACHE_COOKIE_TYPE_INDEX:         type = 'I';     break;
+               case FSCACHE_COOKIE_TYPE_DATAFILE:      type = 'D';     break;
+               default:                                type = 'S';     break;
+               }
+       } else {
+               seg = 252;
+               for (loop = keylen; loop > 0; loop--) {
+                       if (seg <= 0) {
+                               key[len++] = '\0';
+                               mark = len;
+                               key[len++] = '+';
+                               seg = 252;
+                       }
+
+                       acc = *raw++;
+                       acc |= *raw++ << 8;
+                       acc |= *raw++ << 16;
+
+                       _debug("acc: %06x", acc);
+
+                       key[len++] = cachefiles_charmap[acc & 63];
+                       acc >>= 6;
+                       key[len++] = cachefiles_charmap[acc & 63];
+                       acc >>= 6;
+                       key[len++] = cachefiles_charmap[acc & 63];
+                       acc >>= 6;
+                       key[len++] = cachefiles_charmap[acc & 63];
+
+                       ASSERT(len < max);
+               }
+
+               switch (type) {
+               case FSCACHE_COOKIE_TYPE_INDEX:         type = 'J';     break;
+               case FSCACHE_COOKIE_TYPE_DATAFILE:      type = 'E';     break;
+               default:                                type = 'T';     break;
+               }
+       }
+
+       key[mark] = type;
+       key[len++] = 0;
+       key[len] = 0;
+
+       _leave(" = %p %d", key, len);
+       return key;
+}
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
new file mode 100644 (file)
index 0000000..4bfa8cf
--- /dev/null
@@ -0,0 +1,106 @@
+/* Network filesystem caching backend to use cache files on a premounted
+ * filesystem
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/sysctl.h>
+#include <linux/miscdevice.h>
+#include "internal.h"
+
+unsigned cachefiles_debug;
+module_param_named(debug, cachefiles_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cachefiles_debug, "CacheFiles debugging mask");
+
+MODULE_DESCRIPTION("Mounted-filesystem based cache");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+struct kmem_cache *cachefiles_object_jar;
+
+static struct miscdevice cachefiles_dev = {
+       .minor  = MISC_DYNAMIC_MINOR,
+       .name   = "cachefiles",
+       .fops   = &cachefiles_daemon_fops,
+};
+
+static void cachefiles_object_init_once(void *_object)
+{
+       struct cachefiles_object *object = _object;
+
+       memset(object, 0, sizeof(*object));
+       spin_lock_init(&object->work_lock);
+}
+
+/*
+ * initialise the fs caching module
+ */
+static int __init cachefiles_init(void)
+{
+       int ret;
+
+       ret = misc_register(&cachefiles_dev);
+       if (ret < 0)
+               goto error_dev;
+
+       /* create an object jar */
+       ret = -ENOMEM;
+       cachefiles_object_jar =
+               kmem_cache_create("cachefiles_object_jar",
+                                 sizeof(struct cachefiles_object),
+                                 0,
+                                 SLAB_HWCACHE_ALIGN,
+                                 cachefiles_object_init_once);
+       if (!cachefiles_object_jar) {
+               printk(KERN_NOTICE
+                      "CacheFiles: Failed to allocate an object jar\n");
+               goto error_object_jar;
+       }
+
+       ret = cachefiles_proc_init();
+       if (ret < 0)
+               goto error_proc;
+
+       printk(KERN_INFO "CacheFiles: Loaded\n");
+       return 0;
+
+error_proc:
+       kmem_cache_destroy(cachefiles_object_jar);
+error_object_jar:
+       misc_deregister(&cachefiles_dev);
+error_dev:
+       kerror("failed to register: %d", ret);
+       return ret;
+}
+
+fs_initcall(cachefiles_init);
+
+/*
+ * clean up on module removal
+ */
+static void __exit cachefiles_exit(void)
+{
+       printk(KERN_INFO "CacheFiles: Unloading\n");
+
+       cachefiles_proc_cleanup();
+       kmem_cache_destroy(cachefiles_object_jar);
+       misc_deregister(&cachefiles_dev);
+}
+
+module_exit(cachefiles_exit);
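
Because the debug mask is exported with module_param_named() and mode S_IWUSR | S_IRUGO, it appears as a world-readable, root-writable sysfs file once the module is loaded. A small userspace sketch that reads it, assuming the standard /sys/module/<module>/parameters/<name> layout (the path and variable names below are assumptions of the sketch):

#include <stdio.h>

int main(void)
{
	/* module_param_named(debug, ...) above exposes the mask here */
	FILE *f = fopen("/sys/module/cachefiles/parameters/debug", "r");
	unsigned int mask;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%u", &mask) == 1)
		printf("cachefiles debug mask: %#x\n", mask);
	fclose(f);
	return 0;
}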
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
new file mode 100644 (file)
index 0000000..4ce818a
--- /dev/null
@@ -0,0 +1,771 @@
+/* CacheFiles path walking and related routines
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/xattr.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include "internal.h"
+
+static int cachefiles_wait_bit(void *flags)
+{
+       schedule();
+       return 0;
+}
+
+/*
+ * record the fact that an object is now active
+ */
+static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
+                                         struct cachefiles_object *object)
+{
+       struct cachefiles_object *xobject;
+       struct rb_node **_p, *_parent = NULL;
+       struct dentry *dentry;
+
+       _enter(",%p", object);
+
+try_again:
+       write_lock(&cache->active_lock);
+
+       if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
+               BUG();
+
+       dentry = object->dentry;
+       _p = &cache->active_nodes.rb_node;
+       while (*_p) {
+               _parent = *_p;
+               xobject = rb_entry(_parent,
+                                  struct cachefiles_object, active_node);
+
+               ASSERT(xobject != object);
+
+               if (xobject->dentry > dentry)
+                       _p = &(*_p)->rb_left;
+               else if (xobject->dentry < dentry)
+                       _p = &(*_p)->rb_right;
+               else
+                       goto wait_for_old_object;
+       }
+
+       rb_link_node(&object->active_node, _parent, _p);
+       rb_insert_color(&object->active_node, &cache->active_nodes);
+
+       write_unlock(&cache->active_lock);
+       _leave("");
+       return;
+
+       /* an old object from a previous incarnation is hogging the slot - we
+        * need to wait for it to be destroyed */
+wait_for_old_object:
+       if (xobject->fscache.state < FSCACHE_OBJECT_DYING) {
+               printk(KERN_ERR "\n");
+               printk(KERN_ERR "CacheFiles: Error:"
+                      " Unexpected object collision\n");
+               printk(KERN_ERR "xobject: OBJ%x\n",
+                      xobject->fscache.debug_id);
+               printk(KERN_ERR "xobjstate=%s\n",
+                      fscache_object_states[xobject->fscache.state]);
+               printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
+               printk(KERN_ERR "xobjevent=%lx [%lx]\n",
+                      xobject->fscache.events, xobject->fscache.event_mask);
+               printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
+                      xobject->fscache.n_ops, xobject->fscache.n_in_progress,
+                      xobject->fscache.n_exclusive);
+               printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
+                      xobject->fscache.cookie,
+                      xobject->fscache.cookie->parent,
+                      xobject->fscache.cookie->netfs_data,
+                      xobject->fscache.cookie->flags);
+               printk(KERN_ERR "xparent=%p\n",
+                      xobject->fscache.parent);
+               printk(KERN_ERR "object: OBJ%x\n",
+                      object->fscache.debug_id);
+               printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
+                      object->fscache.cookie,
+                      object->fscache.cookie->parent,
+                      object->fscache.cookie->netfs_data,
+                      object->fscache.cookie->flags);
+               printk(KERN_ERR "parent=%p\n",
+                      object->fscache.parent);
+               BUG();
+       }
+       atomic_inc(&xobject->usage);
+       write_unlock(&cache->active_lock);
+
+       _debug(">>> wait");
+       wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE,
+                   cachefiles_wait_bit, TASK_UNINTERRUPTIBLE);
+       _debug("<<< waited");
+
+       cache->cache.ops->put_object(&xobject->fscache);
+       goto try_again;
+}
+
+/*
+ * delete an object representation from the cache
+ * - file backed objects are unlinked
+ * - directory backed objects are stuffed into the graveyard for userspace to
+ *   delete
+ * - unlocks the directory mutex
+ */
+static int cachefiles_bury_object(struct cachefiles_cache *cache,
+                                 struct dentry *dir,
+                                 struct dentry *rep)
+{
+       struct dentry *grave, *trap;
+       char nbuffer[8 + 8 + 1];
+       int ret;
+
+       _enter(",'%*.*s','%*.*s'",
+              dir->d_name.len, dir->d_name.len, dir->d_name.name,
+              rep->d_name.len, rep->d_name.len, rep->d_name.name);
+
+       /* non-directories can just be unlinked */
+       if (!S_ISDIR(rep->d_inode->i_mode)) {
+               _debug("unlink stale object");
+               ret = vfs_unlink(dir->d_inode, rep);
+
+               mutex_unlock(&dir->d_inode->i_mutex);
+
+               if (ret == -EIO)
+                       cachefiles_io_error(cache, "Unlink failed");
+
+               _leave(" = %d", ret);
+               return ret;
+       }
+
+       /* directories have to be moved to the graveyard */
+       _debug("move stale object to graveyard");
+       mutex_unlock(&dir->d_inode->i_mutex);
+
+try_again:
+       /* first step is to make up a grave dentry in the graveyard */
+       sprintf(nbuffer, "%08x%08x",
+               (uint32_t) get_seconds(),
+               (uint32_t) atomic_inc_return(&cache->gravecounter));
+
+       /* do the multiway lock magic */
+       trap = lock_rename(cache->graveyard, dir);
+
+       /* do some checks before getting the grave dentry */
+       if (rep->d_parent != dir) {
+               /* the entry was probably culled when we dropped the parent dir
+                * lock */
+               unlock_rename(cache->graveyard, dir);
+               _leave(" = 0 [culled?]");
+               return 0;
+       }
+
+       if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) {
+               unlock_rename(cache->graveyard, dir);
+               cachefiles_io_error(cache, "Graveyard no longer a directory");
+               return -EIO;
+       }
+
+       if (trap == rep) {
+               unlock_rename(cache->graveyard, dir);
+               cachefiles_io_error(cache, "May not make directory loop");
+               return -EIO;
+       }
+
+       if (d_mountpoint(rep)) {
+               unlock_rename(cache->graveyard, dir);
+               cachefiles_io_error(cache, "Mountpoint in cache");
+               return -EIO;
+       }
+
+       grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
+       if (IS_ERR(grave)) {
+               unlock_rename(cache->graveyard, dir);
+
+               if (PTR_ERR(grave) == -ENOMEM) {
+                       _leave(" = -ENOMEM");
+                       return -ENOMEM;
+               }
+
+               cachefiles_io_error(cache, "Lookup error %ld",
+                                   PTR_ERR(grave));
+               return -EIO;
+       }
+
+       if (grave->d_inode) {
+               unlock_rename(cache->graveyard, dir);
+               dput(grave);
+               grave = NULL;
+               cond_resched();
+               goto try_again;
+       }
+
+       if (d_mountpoint(grave)) {
+               unlock_rename(cache->graveyard, dir);
+               dput(grave);
+               cachefiles_io_error(cache, "Mountpoint in graveyard");
+               return -EIO;
+       }
+
+       /* target should not be an ancestor of source */
+       if (trap == grave) {
+               unlock_rename(cache->graveyard, dir);
+               dput(grave);
+               cachefiles_io_error(cache, "May not make directory loop");
+               return -EIO;
+       }
+
+       /* attempt the rename */
+       ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
+       if (ret != 0 && ret != -ENOMEM)
+               cachefiles_io_error(cache, "Rename failed with error %d", ret);
+
+       unlock_rename(cache->graveyard, dir);
+       dput(grave);
+       _leave(" = 0");
+       return 0;
+}
+
+/*
+ * delete an object representation from the cache
+ */
+int cachefiles_delete_object(struct cachefiles_cache *cache,
+                            struct cachefiles_object *object)
+{
+       struct dentry *dir;
+       int ret;
+
+       _enter(",{%p}", object->dentry);
+
+       ASSERT(object->dentry);
+       ASSERT(object->dentry->d_inode);
+       ASSERT(object->dentry->d_parent);
+
+       dir = dget_parent(object->dentry);
+
+       mutex_lock(&dir->d_inode->i_mutex);
+       ret = cachefiles_bury_object(cache, dir, object->dentry);
+
+       dput(dir);
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * walk from the parent object to the child object through the backing
+ * filesystem, creating directories as we go
+ */
+int cachefiles_walk_to_object(struct cachefiles_object *parent,
+                             struct cachefiles_object *object,
+                             const char *key,
+                             struct cachefiles_xattr *auxdata)
+{
+       struct cachefiles_cache *cache;
+       struct dentry *dir, *next = NULL;
+       unsigned long start;
+       const char *name;
+       int ret, nlen;
+
+       _enter("{%p},,%s,", parent->dentry, key);
+
+       cache = container_of(parent->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+       ASSERT(parent->dentry);
+       ASSERT(parent->dentry->d_inode);
+
+       if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) {
+               // TODO: convert file to dir
+               _leave("looking up in a non-directory");
+               return -ENOBUFS;
+       }
+
+       dir = dget(parent->dentry);
+
+advance:
+       /* attempt to transit the first directory component */
+       name = key;
+       nlen = strlen(key);
+
+       /* key ends in a double NUL */
+       key = key + nlen + 1;
+       if (!*key)
+               key = NULL;
+
+lookup_again:
+       /* search the current directory for the element name */
+       _debug("lookup '%s'", name);
+
+       mutex_lock(&dir->d_inode->i_mutex);
+
+       start = jiffies;
+       next = lookup_one_len(name, dir, nlen);
+       cachefiles_hist(cachefiles_lookup_histogram, start);
+       if (IS_ERR(next))
+               goto lookup_error;
+
+       _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
+
+       if (!key)
+               object->new = !next->d_inode;
+
+       /* if this element of the path doesn't exist, then the lookup phase
+        * failed, and we can release any readers in the certain knowledge that
+        * there's nothing for them to actually read */
+       if (!next->d_inode)
+               fscache_object_lookup_negative(&object->fscache);
+
+       /* we need to create the object if it's negative */
+       if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
+               /* index objects and intervening tree levels must be subdirs */
+               if (!next->d_inode) {
+                       ret = cachefiles_has_space(cache, 1, 0);
+                       if (ret < 0)
+                               goto create_error;
+
+                       start = jiffies;
+                       ret = vfs_mkdir(dir->d_inode, next, 0);
+                       cachefiles_hist(cachefiles_mkdir_histogram, start);
+                       if (ret < 0)
+                               goto create_error;
+
+                       ASSERT(next->d_inode);
+
+                       _debug("mkdir -> %p{%p{ino=%lu}}",
+                              next, next->d_inode, next->d_inode->i_ino);
+
+               } else if (!S_ISDIR(next->d_inode->i_mode)) {
+                       kerror("inode %lu is not a directory",
+                              next->d_inode->i_ino);
+                       ret = -ENOBUFS;
+                       goto error;
+               }
+
+       } else {
+               /* non-index objects start out life as files */
+               if (!next->d_inode) {
+                       ret = cachefiles_has_space(cache, 1, 0);
+                       if (ret < 0)
+                               goto create_error;
+
+                       start = jiffies;
+                       ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
+                       cachefiles_hist(cachefiles_create_histogram, start);
+                       if (ret < 0)
+                               goto create_error;
+
+                       ASSERT(next->d_inode);
+
+                       _debug("create -> %p{%p{ino=%lu}}",
+                              next, next->d_inode, next->d_inode->i_ino);
+
+               } else if (!S_ISDIR(next->d_inode->i_mode) &&
+                          !S_ISREG(next->d_inode->i_mode)
+                          ) {
+                       kerror("inode %lu is not a file or directory",
+                              next->d_inode->i_ino);
+                       ret = -ENOBUFS;
+                       goto error;
+               }
+       }
+
+       /* process the next component */
+       if (key) {
+               _debug("advance");
+               mutex_unlock(&dir->d_inode->i_mutex);
+               dput(dir);
+               dir = next;
+               next = NULL;
+               goto advance;
+       }
+
+       /* we've found the object we were looking for */
+       object->dentry = next;
+
+       /* if we've found that the terminal object exists, then we need to
+        * check its attributes and delete it if it's out of date */
+       if (!object->new) {
+               _debug("validate '%*.*s'",
+                      next->d_name.len, next->d_name.len, next->d_name.name);
+
+               ret = cachefiles_check_object_xattr(object, auxdata);
+               if (ret == -ESTALE) {
+                       /* delete the object (the deleter drops the directory
+                        * mutex) */
+                       object->dentry = NULL;
+
+                       ret = cachefiles_bury_object(cache, dir, next);
+                       dput(next);
+                       next = NULL;
+
+                       if (ret < 0)
+                               goto delete_error;
+
+                       _debug("redo lookup");
+                       goto lookup_again;
+               }
+       }
+
+       /* note that we're now using this object */
+       cachefiles_mark_object_active(cache, object);
+
+       mutex_unlock(&dir->d_inode->i_mutex);
+       dput(dir);
+       dir = NULL;
+
+       _debug("=== OBTAINED_OBJECT ===");
+
+       if (object->new) {
+               /* attach data to a newly constructed terminal object */
+               ret = cachefiles_set_object_xattr(object, auxdata);
+               if (ret < 0)
+                       goto check_error;
+       } else {
+               /* always update the atime on an object we've just looked up
+                * (this is used to keep track of culling, and atimes are only
+                * updated by read, write and readdir but not lookup or
+                * open) */
+               touch_atime(cache->mnt, next);
+       }
+
+       /* open a file interface onto a data file */
+       if (object->type != FSCACHE_COOKIE_TYPE_INDEX) {
+               if (S_ISREG(object->dentry->d_inode->i_mode)) {
+                       const struct address_space_operations *aops;
+
+                       ret = -EPERM;
+                       aops = object->dentry->d_inode->i_mapping->a_ops;
+                       if (!aops->bmap)
+                               goto check_error;
+
+                       object->backer = object->dentry;
+               } else {
+                       BUG(); // TODO: open file in data-class subdir
+               }
+       }
+
+       object->new = 0;
+       fscache_obtained_object(&object->fscache);
+
+       _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
+       return 0;
+
+create_error:
+       _debug("create error %d", ret);
+       if (ret == -EIO)
+               cachefiles_io_error(cache, "Create/mkdir failed");
+       goto error;
+
+check_error:
+       _debug("check error %d", ret);
+       write_lock(&cache->active_lock);
+       rb_erase(&object->active_node, &cache->active_nodes);
+       clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+       wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+       write_unlock(&cache->active_lock);
+
+       dput(object->dentry);
+       object->dentry = NULL;
+       goto error_out;
+
+delete_error:
+       _debug("delete error %d", ret);
+       goto error_out2;
+
+lookup_error:
+       _debug("lookup error %ld", PTR_ERR(next));
+       ret = PTR_ERR(next);
+       if (ret == -EIO)
+               cachefiles_io_error(cache, "Lookup failed");
+       next = NULL;
+error:
+       mutex_unlock(&dir->d_inode->i_mutex);
+       dput(next);
+error_out2:
+       dput(dir);
+error_out:
+       if (ret == -ENOSPC)
+               ret = -ENOBUFS;
+
+       _leave(" = error %d", -ret);
+       return ret;
+}
+
+/*
+ * get a subdirectory
+ */
+struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
+                                       struct dentry *dir,
+                                       const char *dirname)
+{
+       struct dentry *subdir;
+       unsigned long start;
+       int ret;
+
+       _enter(",,%s", dirname);
+
+       /* search the current directory for the element name */
+       mutex_lock(&dir->d_inode->i_mutex);
+
+       start = jiffies;
+       subdir = lookup_one_len(dirname, dir, strlen(dirname));
+       cachefiles_hist(cachefiles_lookup_histogram, start);
+       if (IS_ERR(subdir)) {
+               if (PTR_ERR(subdir) == -ENOMEM)
+                       goto nomem_d_alloc;
+               goto lookup_error;
+       }
+
+       _debug("subdir -> %p %s",
+              subdir, subdir->d_inode ? "positive" : "negative");
+
+       /* we need to create the subdir if it doesn't exist yet */
+       if (!subdir->d_inode) {
+               ret = cachefiles_has_space(cache, 1, 0);
+               if (ret < 0)
+                       goto mkdir_error;
+
+               _debug("attempt mkdir");
+
+               ret = vfs_mkdir(dir->d_inode, subdir, 0700);
+               if (ret < 0)
+                       goto mkdir_error;
+
+               ASSERT(subdir->d_inode);
+
+               _debug("mkdir -> %p{%p{ino=%lu}}",
+                      subdir,
+                      subdir->d_inode,
+                      subdir->d_inode->i_ino);
+       }
+
+       mutex_unlock(&dir->d_inode->i_mutex);
+
+       /* we need to make sure the subdir is a directory */
+       ASSERT(subdir->d_inode);
+
+       if (!S_ISDIR(subdir->d_inode->i_mode)) {
+               kerror("%s is not a directory", dirname);
+               ret = -EIO;
+               goto check_error;
+       }
+
+       ret = -EPERM;
+       if (!subdir->d_inode->i_op ||
+           !subdir->d_inode->i_op->setxattr ||
+           !subdir->d_inode->i_op->getxattr ||
+           !subdir->d_inode->i_op->lookup ||
+           !subdir->d_inode->i_op->mkdir ||
+           !subdir->d_inode->i_op->create ||
+           !subdir->d_inode->i_op->rename ||
+           !subdir->d_inode->i_op->rmdir ||
+           !subdir->d_inode->i_op->unlink)
+               goto check_error;
+
+       _leave(" = [%lu]", subdir->d_inode->i_ino);
+       return subdir;
+
+check_error:
+       dput(subdir);
+       _leave(" = %d [check]", ret);
+       return ERR_PTR(ret);
+
+mkdir_error:
+       mutex_unlock(&dir->d_inode->i_mutex);
+       dput(subdir);
+       kerror("mkdir %s failed with error %d", dirname, ret);
+       return ERR_PTR(ret);
+
+lookup_error:
+       mutex_unlock(&dir->d_inode->i_mutex);
+       ret = PTR_ERR(subdir);
+       kerror("Lookup %s failed with error %d", dirname, ret);
+       return ERR_PTR(ret);
+
+nomem_d_alloc:
+       mutex_unlock(&dir->d_inode->i_mutex);
+       _leave(" = -ENOMEM");
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * find out if an object is in use or not
+ * - if it finds the object and it's not in use:
+ *   - returns a pointer to the object and a reference on it
+ *   - returns with the directory locked
+ */
+static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
+                                             struct dentry *dir,
+                                             char *filename)
+{
+       struct cachefiles_object *object;
+       struct rb_node *_n;
+       struct dentry *victim;
+       unsigned long start;
+       int ret;
+
+       //_enter(",%*.*s/,%s",
+       //       dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+       /* look up the victim */
+       mutex_lock_nested(&dir->d_inode->i_mutex, 1);
+
+       start = jiffies;
+       victim = lookup_one_len(filename, dir, strlen(filename));
+       cachefiles_hist(cachefiles_lookup_histogram, start);
+       if (IS_ERR(victim))
+               goto lookup_error;
+
+       //_debug("victim -> %p %s",
+       //       victim, victim->d_inode ? "positive" : "negative");
+
+       /* if the object is no longer there then we probably retired the object
+        * at the netfs's request whilst the cull was in progress
+        */
+       if (!victim->d_inode) {
+               mutex_unlock(&dir->d_inode->i_mutex);
+               dput(victim);
+               _leave(" = -ENOENT [absent]");
+               return ERR_PTR(-ENOENT);
+       }
+
+       /* check to see if we're using this object */
+       read_lock(&cache->active_lock);
+
+       _n = cache->active_nodes.rb_node;
+
+       while (_n) {
+               object = rb_entry(_n, struct cachefiles_object, active_node);
+
+               if (object->dentry > victim)
+                       _n = _n->rb_left;
+               else if (object->dentry < victim)
+                       _n = _n->rb_right;
+               else
+                       goto object_in_use;
+       }
+
+       read_unlock(&cache->active_lock);
+
+       //_leave(" = %p", victim);
+       return victim;
+
+object_in_use:
+       read_unlock(&cache->active_lock);
+       mutex_unlock(&dir->d_inode->i_mutex);
+       dput(victim);
+       //_leave(" = -EBUSY [in use]");
+       return ERR_PTR(-EBUSY);
+
+lookup_error:
+       mutex_unlock(&dir->d_inode->i_mutex);
+       ret = PTR_ERR(victim);
+       if (ret == -ENOENT) {
+               /* file or dir now absent - probably retired by netfs */
+               _leave(" = -ESTALE [absent]");
+               return ERR_PTR(-ESTALE);
+       }
+
+       if (ret == -EIO) {
+               cachefiles_io_error(cache, "Lookup failed");
+       } else if (ret != -ENOMEM) {
+               kerror("Internal error: %d", ret);
+               ret = -EIO;
+       }
+
+       _leave(" = %d", ret);
+       return ERR_PTR(ret);
+}
+
+/*
+ * cull an object if it's not in use
+ * - called only by cache manager daemon
+ */
+int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
+                   char *filename)
+{
+       struct dentry *victim;
+       int ret;
+
+       _enter(",%*.*s/,%s",
+              dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+       victim = cachefiles_check_active(cache, dir, filename);
+       if (IS_ERR(victim))
+               return PTR_ERR(victim);
+
+       _debug("victim -> %p %s",
+              victim, victim->d_inode ? "positive" : "negative");
+
+       /* okay... the victim is not being used so we can cull it
+        * - start by marking it as stale
+        */
+       _debug("victim is cullable");
+
+       ret = cachefiles_remove_object_xattr(cache, victim);
+       if (ret < 0)
+               goto error_unlock;
+
+       /*  actually remove the victim (drops the dir mutex) */
+       _debug("bury");
+
+       ret = cachefiles_bury_object(cache, dir, victim);
+       if (ret < 0)
+               goto error;
+
+       dput(victim);
+       _leave(" = 0");
+       return 0;
+
+error_unlock:
+       mutex_unlock(&dir->d_inode->i_mutex);
+error:
+       dput(victim);
+       if (ret == -ENOENT) {
+               /* file or dir now absent - probably retired by netfs */
+               _leave(" = -ESTALE [absent]");
+               return -ESTALE;
+       }
+
+       if (ret != -ENOMEM) {
+               kerror("Internal error: %d", ret);
+               ret = -EIO;
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * find out if an object is in use or not
+ * - called only by cache manager daemon
+ * - returns -EBUSY or 0 to indicate whether an object is in use or not
+ */
+int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
+                           char *filename)
+{
+       struct dentry *victim;
+
+       //_enter(",%*.*s/,%s",
+       //       dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
+
+       victim = cachefiles_check_active(cache, dir, filename);
+       if (IS_ERR(victim))
+               return PTR_ERR(victim);
+
+       mutex_unlock(&dir->d_inode->i_mutex);
+       dput(victim);
+       //_leave(" = 0");
+       return 0;
+}
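
cachefiles_walk_to_object() above consumes the cooked key one NUL-separated component at a time, descending a directory level per component and stopping at the terminating double NUL. A standalone sketch of that walk (walk_key() and the sample components are illustrative, not kernel code):

#include <stdio.h>
#include <string.h>

/* step through a key whose components are NUL-separated and which is
 * terminated by a double NUL, as the advance loop above does */
static void walk_key(const char *key)
{
	while (*key) {
		size_t nlen = strlen(key);

		printf("component '%s' (%zu chars)\n", key, nlen);
		key += nlen + 1;	/* skip the component and its NUL */
	}
}

int main(void)
{
	/* two components; the explicit trailing \0 plus the literal's
	 * own terminator form the double NUL */
	walk_key("I@4a\0Es0gb_2\0");
	return 0;
}

Each intermediate component must resolve to (or be created as) a directory, while the final component becomes either a subdirectory (index objects) or a regular file (data objects), matching the two branches of the walk above.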
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
new file mode 100644 (file)
index 0000000..eccd339
--- /dev/null
@@ -0,0 +1,134 @@
+/* CacheFiles statistics
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+atomic_t cachefiles_lookup_histogram[HZ];
+atomic_t cachefiles_mkdir_histogram[HZ];
+atomic_t cachefiles_create_histogram[HZ];
+
+/*
+ * display the latency histogram
+ */
+static int cachefiles_histogram_show(struct seq_file *m, void *v)
+{
+       unsigned long index;
+       unsigned x, y, z, t;
+
+       switch ((unsigned long) v) {
+       case 1:
+               seq_puts(m, "JIFS  SECS  LOOKUPS   MKDIRS    CREATES\n");
+               return 0;
+       case 2:
+               seq_puts(m, "===== ===== ========= ========= =========\n");
+               return 0;
+       default:
+               index = (unsigned long) v - 3;
+               x = atomic_read(&cachefiles_lookup_histogram[index]);
+               y = atomic_read(&cachefiles_mkdir_histogram[index]);
+               z = atomic_read(&cachefiles_create_histogram[index]);
+               if (x == 0 && y == 0 && z == 0)
+                       return 0;
+
+               t = (index * 1000) / HZ;
+
+               seq_printf(m, "%4lu  0.%03u %9u %9u %9u\n", index, t, x, y, z);
+               return 0;
+       }
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos)
+{
+       if ((unsigned long long)*_pos >= HZ + 2)
+               return NULL;
+       if (*_pos == 0)
+               *_pos = 1;
+       return (void *)(unsigned long) *_pos;
+}
+
+/*
+ * move to the next line
+ */
+static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       (*pos)++;
+       return (unsigned long long)*pos > HZ + 2 ?
+               NULL : (void *)(unsigned long) *pos;
+}
+
+/*
+ * clean up after reading
+ */
+static void cachefiles_histogram_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations cachefiles_histogram_ops = {
+       .start          = cachefiles_histogram_start,
+       .stop           = cachefiles_histogram_stop,
+       .next           = cachefiles_histogram_next,
+       .show           = cachefiles_histogram_show,
+};
+
+/*
+ * open "/proc/fs/cachefiles/XXX", which provides statistics summaries
+ */
+static int cachefiles_histogram_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &cachefiles_histogram_ops);
+}
+
+static const struct file_operations cachefiles_histogram_fops = {
+       .owner          = THIS_MODULE,
+       .open           = cachefiles_histogram_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
+/*
+ * initialise the /proc/fs/cachefiles/ directory
+ */
+int __init cachefiles_proc_init(void)
+{
+       _enter("");
+
+       if (!proc_mkdir("fs/cachefiles", NULL))
+               goto error_dir;
+
+       if (!proc_create("fs/cachefiles/histogram", S_IFREG | 0444, NULL,
+                        &cachefiles_histogram_fops))
+               goto error_histogram;
+
+       _leave(" = 0");
+       return 0;
+
+error_histogram:
+       remove_proc_entry("fs/cachefiles", NULL);
+error_dir:
+       _leave(" = -ENOMEM");
+       return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/cachefiles/ directory
+ */
+void cachefiles_proc_cleanup(void)
+{
+       remove_proc_entry("fs/cachefiles/histogram", NULL);
+       remove_proc_entry("fs/cachefiles", NULL);
+}
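
Each histogram slot counts operations whose latency fell in that jiffy bucket, and the show routine converts the bucket index to seconds as index * 1000 / HZ. A userspace sketch of the same bucketing and display, assuming an HZ of 250 (hist_add() and its clamping are hypothetical stand-ins; the real cachefiles_hist() helper lives in internal.h and is not shown in this patch):

#include <stdio.h>

#define HZ 250			/* assumed tick rate for this sketch */

static unsigned long histogram[HZ];

/* bucket an elapsed time measured in jiffies, clamping to the last slot */
static void hist_add(unsigned long start, unsigned long now)
{
	unsigned long delta = now - start;

	if (delta >= HZ)
		delta = HZ - 1;
	histogram[delta]++;
}

int main(void)
{
	unsigned int i;

	hist_add(1000, 1003);	/* two 3-jiffy lookups */
	hist_add(2000, 2003);
	hist_add(3000, 3010);	/* one 10-jiffy lookup */

	printf("JIFS  SECS  COUNT\n");
	for (i = 0; i < HZ; i++)
		if (histogram[i])
			printf("%4u  0.%03lu %5lu\n",
			       i, (unsigned long)i * 1000 / HZ, histogram[i]);
	return 0;
}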
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
new file mode 100644 (file)
index 0000000..a69787e
--- /dev/null
@@ -0,0 +1,879 @@
+/* Storage object read/write
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/mount.h>
+#include <linux/file.h>
+#include "internal.h"
+
+/*
+ * detect wake up events generated by the unlocking of pages in which we're
+ * interested
+ * - we use this to detect read completion of backing pages
+ * - the caller holds the waitqueue lock
+ */
+static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
+                                 int sync, void *_key)
+{
+       struct cachefiles_one_read *monitor =
+               container_of(wait, struct cachefiles_one_read, monitor);
+       struct cachefiles_object *object;
+       struct wait_bit_key *key = _key;
+       struct page *page = wait->private;
+
+       ASSERT(key);
+
+       _enter("{%lu},%u,%d,{%p,%u}",
+              monitor->netfs_page->index, mode, sync,
+              key->flags, key->bit_nr);
+
+       if (key->flags != &page->flags ||
+           key->bit_nr != PG_locked)
+               return 0;
+
+       _debug("--- monitor %p %lx ---", page, page->flags);
+
+       if (!PageUptodate(page) && !PageError(page))
+               dump_stack();
+
+       /* remove from the waitqueue */
+       list_del(&wait->task_list);
+
+       /* move onto the action list and queue for FS-Cache thread pool */
+       ASSERT(monitor->op);
+
+       object = container_of(monitor->op->op.object,
+                             struct cachefiles_object, fscache);
+
+       spin_lock(&object->work_lock);
+       list_add_tail(&monitor->op_link, &monitor->op->to_do);
+       spin_unlock(&object->work_lock);
+
+       fscache_enqueue_retrieval(monitor->op);
+       return 0;
+}
+
+/*
+ * copy data from backing pages to netfs pages to complete a read operation
+ * - driven by FS-Cache's thread pool
+ */
+static void cachefiles_read_copier(struct fscache_operation *_op)
+{
+       struct cachefiles_one_read *monitor;
+       struct cachefiles_object *object;
+       struct fscache_retrieval *op;
+       struct pagevec pagevec;
+       int error, max;
+
+       op = container_of(_op, struct fscache_retrieval, op);
+       object = container_of(op->op.object,
+                             struct cachefiles_object, fscache);
+
+       _enter("{ino=%lu}", object->backer->d_inode->i_ino);
+
+       pagevec_init(&pagevec, 0);
+
+       max = 8;
+       spin_lock_irq(&object->work_lock);
+
+       while (!list_empty(&op->to_do)) {
+               monitor = list_entry(op->to_do.next,
+                                    struct cachefiles_one_read, op_link);
+               list_del(&monitor->op_link);
+
+               spin_unlock_irq(&object->work_lock);
+
+               _debug("- copy {%lu}", monitor->back_page->index);
+
+               error = -EIO;
+               if (PageUptodate(monitor->back_page)) {
+                       copy_highpage(monitor->netfs_page, monitor->back_page);
+
+                       pagevec_add(&pagevec, monitor->netfs_page);
+                       fscache_mark_pages_cached(monitor->op, &pagevec);
+                       error = 0;
+               }
+
+               if (error)
+                       cachefiles_io_error_obj(
+                               object,
+                               "Readpage failed on backing file %lx",
+                               (unsigned long) monitor->back_page->flags);
+
+               page_cache_release(monitor->back_page);
+
+               fscache_end_io(op, monitor->netfs_page, error);
+               page_cache_release(monitor->netfs_page);
+               fscache_put_retrieval(op);
+               kfree(monitor);
+
+               /* let the thread pool have some air occasionally */
+               max--;
+               if (max < 0 || need_resched()) {
+                       if (!list_empty(&op->to_do))
+                               fscache_enqueue_retrieval(op);
+                       _leave(" [maxed out]");
+                       return;
+               }
+
+               spin_lock_irq(&object->work_lock);
+       }
+
+       spin_unlock_irq(&object->work_lock);
+       _leave("");
+}
+
+/*
+ * read the corresponding page to the given set from the backing file
+ * - an uncertain page is simply discarded, to be tried again another time
+ */
+static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
+                                           struct fscache_retrieval *op,
+                                           struct page *netpage,
+                                           struct pagevec *pagevec)
+{
+       struct cachefiles_one_read *monitor;
+       struct address_space *bmapping;
+       struct page *newpage, *backpage;
+       int ret;
+
+       _enter("");
+
+       pagevec_reinit(pagevec);
+
+       _debug("read back %p{%lu,%d}",
+              netpage, netpage->index, page_count(netpage));
+
+       monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+       if (!monitor)
+               goto nomem;
+
+       monitor->netfs_page = netpage;
+       monitor->op = fscache_get_retrieval(op);
+
+       init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
+
+       /* attempt to get hold of the backing page */
+       bmapping = object->backer->d_inode->i_mapping;
+       newpage = NULL;
+
+       for (;;) {
+               backpage = find_get_page(bmapping, netpage->index);
+               if (backpage)
+                       goto backing_page_already_present;
+
+               if (!newpage) {
+                       newpage = page_cache_alloc_cold(bmapping);
+                       if (!newpage)
+                               goto nomem_monitor;
+               }
+
+               ret = add_to_page_cache(newpage, bmapping,
+                                       netpage->index, GFP_KERNEL);
+               if (ret == 0)
+                       goto installed_new_backing_page;
+               if (ret != -EEXIST)
+                       goto nomem_page;
+       }
+
+       /* we've installed a new backing page, so now we need to add it
+        * to the LRU list and start it reading */
+installed_new_backing_page:
+       _debug("- new %p", newpage);
+
+       backpage = newpage;
+       newpage = NULL;
+
+       page_cache_get(backpage);
+       pagevec_add(pagevec, backpage);
+       __pagevec_lru_add_file(pagevec);
+
+read_backing_page:
+       ret = bmapping->a_ops->readpage(NULL, backpage);
+       if (ret < 0)
+               goto read_error;
+
+       /* set the monitor to transfer the data across */
+monitor_backing_page:
+       _debug("- monitor add");
+
+       /* install the monitor */
+       page_cache_get(monitor->netfs_page);
+       page_cache_get(backpage);
+       monitor->back_page = backpage;
+       monitor->monitor.private = backpage;
+       add_page_wait_queue(backpage, &monitor->monitor);
+       monitor = NULL;
+
+       /* but the page may have been read before the monitor was installed, so
+        * the monitor may miss the event - so we have to ensure that we do get
+        * one in such a case */
+       if (trylock_page(backpage)) {
+               _debug("jumpstart %p {%lx}", backpage, backpage->flags);
+               unlock_page(backpage);
+       }
+       goto success;
+
+       /* if the backing page is already present, it can be in one of
+        * three states: read in progress, read failed or read okay */
+backing_page_already_present:
+       _debug("- present");
+
+       if (newpage) {
+               page_cache_release(newpage);
+               newpage = NULL;
+       }
+
+       if (PageError(backpage))
+               goto io_error;
+
+       if (PageUptodate(backpage))
+               goto backing_page_already_uptodate;
+
+       if (!trylock_page(backpage))
+               goto monitor_backing_page;
+       _debug("read %p {%lx}", backpage, backpage->flags);
+       goto read_backing_page;
+
+       /* the backing page is already up to date, attach the netfs
+        * page to the pagecache and LRU and copy the data across */
+backing_page_already_uptodate:
+       _debug("- uptodate");
+
+       pagevec_add(pagevec, netpage);
+       fscache_mark_pages_cached(op, pagevec);
+
+       copy_highpage(netpage, backpage);
+       fscache_end_io(op, netpage, 0);
+
+success:
+       _debug("success");
+       ret = 0;
+
+out:
+       if (backpage)
+               page_cache_release(backpage);
+       if (monitor) {
+               fscache_put_retrieval(monitor->op);
+               kfree(monitor);
+       }
+       _leave(" = %d", ret);
+       return ret;
+
+read_error:
+       _debug("read error %d", ret);
+       if (ret == -ENOMEM)
+               goto out;
+io_error:
+       cachefiles_io_error_obj(object, "Page read error on backing file");
+       ret = -ENOBUFS;
+       goto out;
+
+nomem_page:
+       page_cache_release(newpage);
+nomem_monitor:
+       fscache_put_retrieval(monitor->op);
+       kfree(monitor);
+nomem:
+       _leave(" = -ENOMEM");
+       return -ENOMEM;
+}
+
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - if the page is backed by a block in the cache:
+ *   - a read will be started which will call the callback on completion
+ *   - 0 will be returned
+ * - else if the page is unbacked:
+ *   - the metadata will be retained
+ *   - -ENODATA will be returned
+ */
+int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
+                                 struct page *page,
+                                 gfp_t gfp)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+       struct pagevec pagevec;
+       struct inode *inode;
+       sector_t block0, block;
+       unsigned shift;
+       int ret;
+
+       object = container_of(op->op.object,
+                             struct cachefiles_object, fscache);
+       cache = container_of(object->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+       _enter("{%p},{%lx},,,", object, page->index);
+
+       if (!object->backer)
+               return -ENOBUFS;
+
+       inode = object->backer->d_inode;
+       ASSERT(S_ISREG(inode->i_mode));
+       ASSERT(inode->i_mapping->a_ops->bmap);
+       ASSERT(inode->i_mapping->a_ops->readpages);
+
+       /* calculate the shift required to use bmap */
+       if (inode->i_sb->s_blocksize > PAGE_SIZE)
+               return -ENOBUFS;
+
+       shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+
+       op->op.flags = FSCACHE_OP_FAST;
+       op->op.processor = cachefiles_read_copier;
+
+       pagevec_init(&pagevec, 0);
+
+       /* we assume the absence or presence of the first block is a good
+        * enough indication for the page as a whole
+        * - TODO: don't use bmap() for this as it is _not_ actually good
+        *   enough for this as it doesn't indicate errors, but it's all we've
+        *   got for the moment
+        */
+       block0 = page->index;
+       block0 <<= shift;
+
+       block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
+       _debug("%llx -> %llx",
+              (unsigned long long) block0,
+              (unsigned long long) block);
+
+       if (block) {
+               /* submit the apparently valid page to the backing fs to be
+                * read from disk */
+               ret = cachefiles_read_backing_file_one(object, op, page,
+                                                      &pagevec);
+       } else if (cachefiles_has_space(cache, 0, 1) == 0) {
+               /* there's space in the cache we can use */
+               pagevec_add(&pagevec, page);
+               fscache_mark_pages_cached(op, &pagevec);
+               ret = -ENODATA;
+       } else {
+               ret = -ENOBUFS;
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * read the corresponding pages to the given set from the backing file
+ * - any uncertain pages are simply discarded, to be tried again another time
+ */
+static int cachefiles_read_backing_file(struct cachefiles_object *object,
+                                       struct fscache_retrieval *op,
+                                       struct list_head *list,
+                                       struct pagevec *mark_pvec)
+{
+       struct cachefiles_one_read *monitor = NULL;
+       struct address_space *bmapping = object->backer->d_inode->i_mapping;
+       struct pagevec lru_pvec;
+       struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
+       int ret = 0;
+
+       _enter("");
+
+       pagevec_init(&lru_pvec, 0);
+
+       list_for_each_entry_safe(netpage, _n, list, lru) {
+               list_del(&netpage->lru);
+
+               _debug("read back %p{%lu,%d}",
+                      netpage, netpage->index, page_count(netpage));
+
+               if (!monitor) {
+                       monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+                       if (!monitor)
+                               goto nomem;
+
+                       monitor->op = fscache_get_retrieval(op);
+                       init_waitqueue_func_entry(&monitor->monitor,
+                                                 cachefiles_read_waiter);
+               }
+
+               for (;;) {
+                       backpage = find_get_page(bmapping, netpage->index);
+                       if (backpage)
+                               goto backing_page_already_present;
+
+                       if (!newpage) {
+                               newpage = page_cache_alloc_cold(bmapping);
+                               if (!newpage)
+                                       goto nomem;
+                       }
+
+                       ret = add_to_page_cache(newpage, bmapping,
+                                               netpage->index, GFP_KERNEL);
+                       if (ret == 0)
+                               goto installed_new_backing_page;
+                       if (ret != -EEXIST)
+                               goto nomem;
+               }
+
+               /* we've installed a new backing page, so now we need to add it
+                * to the LRU list and start it reading */
+       installed_new_backing_page:
+               _debug("- new %p", newpage);
+
+               backpage = newpage;
+               newpage = NULL;
+
+               page_cache_get(backpage);
+               if (!pagevec_add(&lru_pvec, backpage))
+                       __pagevec_lru_add_file(&lru_pvec);
+
+       reread_backing_page:
+               ret = bmapping->a_ops->readpage(NULL, backpage);
+               if (ret < 0)
+                       goto read_error;
+
+               /* add the netfs page to the pagecache and LRU, and set the
+                * monitor to transfer the data across */
+       monitor_backing_page:
+               _debug("- monitor add");
+
+               ret = add_to_page_cache(netpage, op->mapping, netpage->index,
+                                       GFP_KERNEL);
+               if (ret < 0) {
+                       if (ret == -EEXIST) {
+                               page_cache_release(netpage);
+                               continue;
+                       }
+                       goto nomem;
+               }
+
+               page_cache_get(netpage);
+               if (!pagevec_add(&lru_pvec, netpage))
+                       __pagevec_lru_add_file(&lru_pvec);
+
+               /* install a monitor */
+               page_cache_get(netpage);
+               monitor->netfs_page = netpage;
+
+               page_cache_get(backpage);
+               monitor->back_page = backpage;
+               monitor->monitor.private = backpage;
+               add_page_wait_queue(backpage, &monitor->monitor);
+               monitor = NULL;
+
+               /* but the page may have been read before the monitor was
+                * installed, so the monitor may miss the event - so we have to
+                * ensure that we do get one in such a case */
+               if (trylock_page(backpage)) {
+                       _debug("2unlock %p {%lx}", backpage, backpage->flags);
+                       unlock_page(backpage);
+               }
+
+               page_cache_release(backpage);
+               backpage = NULL;
+
+               page_cache_release(netpage);
+               netpage = NULL;
+               continue;
+
+               /* if the backing page is already present, it can be in one of
+                * three states: read in progress, read failed or read okay */
+       backing_page_already_present:
+               _debug("- present %p", backpage);
+
+               if (PageError(backpage))
+                       goto io_error;
+
+               if (PageUptodate(backpage))
+                       goto backing_page_already_uptodate;
+
+               _debug("- not ready %p{%lx}", backpage, backpage->flags);
+
+               if (!trylock_page(backpage))
+                       goto monitor_backing_page;
+
+               if (PageError(backpage)) {
+                       _debug("error %lx", backpage->flags);
+                       unlock_page(backpage);
+                       goto io_error;
+               }
+
+               if (PageUptodate(backpage))
+                       goto backing_page_already_uptodate_unlock;
+
+               /* we've locked a page that's neither up to date nor erroneous,
+                * so we need to attempt to read it again */
+               goto reread_backing_page;
+
+               /* the backing page is already up to date, attach the netfs
+                * page to the pagecache and LRU and copy the data across */
+       backing_page_already_uptodate_unlock:
+               _debug("uptodate %lx", backpage->flags);
+               unlock_page(backpage);
+       backing_page_already_uptodate:
+               _debug("- uptodate");
+
+               ret = add_to_page_cache(netpage, op->mapping, netpage->index,
+                                       GFP_KERNEL);
+               if (ret < 0) {
+                       if (ret == -EEXIST) {
+                               page_cache_release(netpage);
+                               continue;
+                       }
+                       goto nomem;
+               }
+
+               copy_highpage(netpage, backpage);
+
+               page_cache_release(backpage);
+               backpage = NULL;
+
+               if (!pagevec_add(mark_pvec, netpage))
+                       fscache_mark_pages_cached(op, mark_pvec);
+
+               page_cache_get(netpage);
+               if (!pagevec_add(&lru_pvec, netpage))
+                       __pagevec_lru_add_file(&lru_pvec);
+
+               fscache_end_io(op, netpage, 0);
+               page_cache_release(netpage);
+               netpage = NULL;
+               continue;
+       }
+
+       netpage = NULL;
+
+       _debug("out");
+
+out:
+       /* tidy up */
+       pagevec_lru_add_file(&lru_pvec);
+
+       if (newpage)
+               page_cache_release(newpage);
+       if (netpage)
+               page_cache_release(netpage);
+       if (backpage)
+               page_cache_release(backpage);
+       if (monitor) {
+               fscache_put_retrieval(op);
+               kfree(monitor);
+       }
+
+       list_for_each_entry_safe(netpage, _n, list, lru) {
+               list_del(&netpage->lru);
+               page_cache_release(netpage);
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+
+nomem:
+       _debug("nomem");
+       ret = -ENOMEM;
+       goto out;
+
+read_error:
+       _debug("read error %d", ret);
+       if (ret == -ENOMEM)
+               goto out;
+io_error:
+       cachefiles_io_error_obj(object, "Page read error on backing file");
+       ret = -ENOBUFS;
+       goto out;
+}
+
+/*
+ * read a list of pages from the cache or allocate blocks in which to store
+ * them
+ */
+int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
+                                  struct list_head *pages,
+                                  unsigned *nr_pages,
+                                  gfp_t gfp)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+       struct list_head backpages;
+       struct pagevec pagevec;
+       struct inode *inode;
+       struct page *page, *_n;
+       unsigned shift, nrbackpages;
+       int ret, ret2, space;
+
+       object = container_of(op->op.object,
+                             struct cachefiles_object, fscache);
+       cache = container_of(object->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+       _enter("{OBJ%x,%d},,%d,,",
+              object->fscache.debug_id, atomic_read(&op->op.usage),
+              *nr_pages);
+
+       if (!object->backer)
+               return -ENOBUFS;
+
+       space = 1;
+       if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
+               space = 0;
+
+       inode = object->backer->d_inode;
+       ASSERT(S_ISREG(inode->i_mode));
+       ASSERT(inode->i_mapping->a_ops->bmap);
+       ASSERT(inode->i_mapping->a_ops->readpages);
+
+       /* calculate the shift required to use bmap */
+       if (inode->i_sb->s_blocksize > PAGE_SIZE)
+               return -ENOBUFS;
+
+       shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
+
+       pagevec_init(&pagevec, 0);
+
+       op->op.flags = FSCACHE_OP_FAST;
+       op->op.processor = cachefiles_read_copier;
+
+       INIT_LIST_HEAD(&backpages);
+       nrbackpages = 0;
+
+       ret = space ? -ENODATA : -ENOBUFS;
+       list_for_each_entry_safe(page, _n, pages, lru) {
+               sector_t block0, block;
+
+               /* we assume the absence or presence of the first block is a
+                * good enough indication for the page as a whole
+                * - TODO: don't use bmap() for this as it is _not_ actually
+                *   good enough for this as it doesn't indicate errors, but
+                *   it's all we've got for the moment
+                */
+               block0 = page->index;
+               block0 <<= shift;
+
+               block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
+                                                     block0);
+               _debug("%llx -> %llx",
+                      (unsigned long long) block0,
+                      (unsigned long long) block);
+
+               if (block) {
+                       /* we have data - add it to the list to give to the
+                        * backing fs */
+                       list_move(&page->lru, &backpages);
+                       (*nr_pages)--;
+                       nrbackpages++;
+               } else if (space && pagevec_add(&pagevec, page) == 0) {
+                       fscache_mark_pages_cached(op, &pagevec);
+                       ret = -ENODATA;
+               }
+       }
+
+       if (pagevec_count(&pagevec) > 0)
+               fscache_mark_pages_cached(op, &pagevec);
+
+       if (list_empty(pages))
+               ret = 0;
+
+       /* submit the apparently valid pages to the backing fs to be read from
+        * disk */
+       if (nrbackpages > 0) {
+               ret2 = cachefiles_read_backing_file(object, op, &backpages,
+                                                   &pagevec);
+               if (ret2 == -ENOMEM || ret2 == -EINTR)
+                       ret = ret2;
+       }
+
+       if (pagevec_count(&pagevec) > 0)
+               fscache_mark_pages_cached(op, &pagevec);
+
+       _leave(" = %d [nr=%u%s]",
+              ret, *nr_pages, list_empty(pages) ? " empty" : "");
+       return ret;
+}
+
+/*
+ * allocate a block in the cache in which to store a page
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if no buffers can be made available
+ * - returns -ENOBUFS if page is beyond EOF
+ * - otherwise:
+ *   - the metadata will be retained
+ *   - 0 will be returned
+ */
+int cachefiles_allocate_page(struct fscache_retrieval *op,
+                            struct page *page,
+                            gfp_t gfp)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+       struct pagevec pagevec;
+       int ret;
+
+       object = container_of(op->op.object,
+                             struct cachefiles_object, fscache);
+       cache = container_of(object->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+       _enter("%p,{%lx},", object, page->index);
+
+       ret = cachefiles_has_space(cache, 0, 1);
+       if (ret == 0) {
+               pagevec_init(&pagevec, 0);
+               pagevec_add(&pagevec, page);
+               fscache_mark_pages_cached(op, &pagevec);
+       } else {
+               ret = -ENOBUFS;
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * allocate blocks in the cache in which to store a set of pages
+ * - cache withdrawal is prevented by the caller
+ * - returns -EINTR if interrupted
+ * - returns -ENOMEM if ran out of memory
+ * - returns -ENOBUFS if some buffers couldn't be made available
+ * - returns -ENOBUFS if some pages are beyond EOF
+ * - otherwise:
+ *   - -ENODATA will be returned
+ * - metadata will be retained for any page marked
+ */
+int cachefiles_allocate_pages(struct fscache_retrieval *op,
+                             struct list_head *pages,
+                             unsigned *nr_pages,
+                             gfp_t gfp)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+       struct pagevec pagevec;
+       struct page *page;
+       int ret;
+
+       object = container_of(op->op.object,
+                             struct cachefiles_object, fscache);
+       cache = container_of(object->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+       _enter("%p,,,%d,", object, *nr_pages);
+
+       ret = cachefiles_has_space(cache, 0, *nr_pages);
+       if (ret == 0) {
+               pagevec_init(&pagevec, 0);
+
+               list_for_each_entry(page, pages, lru) {
+                       if (pagevec_add(&pagevec, page) == 0)
+                               fscache_mark_pages_cached(op, &pagevec);
+               }
+
+               if (pagevec_count(&pagevec) > 0)
+                       fscache_mark_pages_cached(op, &pagevec);
+               ret = -ENODATA;
+       } else {
+               ret = -ENOBUFS;
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * request a page be stored in the cache
+ * - cache withdrawal is prevented by the caller
+ * - this request may be ignored if there's no cache block available, in which
+ *   case -ENOBUFS will be returned
+ * - if the op is in progress, 0 will be returned
+ */
+int cachefiles_write_page(struct fscache_storage *op, struct page *page)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+       mm_segment_t old_fs;
+       struct file *file;
+       loff_t pos;
+       void *data;
+       int ret;
+
+       ASSERT(op != NULL);
+       ASSERT(page != NULL);
+
+       object = container_of(op->op.object,
+                             struct cachefiles_object, fscache);
+
+       _enter("%p,%p{%lx},,,", object, page, page->index);
+
+       if (!object->backer) {
+               _leave(" = -ENOBUFS");
+               return -ENOBUFS;
+       }
+
+       ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+       cache = container_of(object->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+       /* write the page to the backing filesystem and let it store it in its
+        * own time */
+       dget(object->backer);
+       mntget(cache->mnt);
+       file = dentry_open(object->backer, cache->mnt, O_RDWR,
+                          cache->cache_cred);
+       if (IS_ERR(file)) {
+               ret = PTR_ERR(file);
+       } else {
+               ret = -EIO;
+               if (file->f_op->write) {
+                       pos = (loff_t) page->index << PAGE_SHIFT;
+                       data = kmap(page);
+                       old_fs = get_fs();
+                       set_fs(KERNEL_DS);
+                       ret = file->f_op->write(
+                               file, (const void __user *) data, PAGE_SIZE,
+                               &pos);
+                       set_fs(old_fs);
+                       kunmap(page);
+                       if (ret != PAGE_SIZE)
+                               ret = -EIO;
+               }
+               fput(file);
+       }
+
+       if (ret < 0) {
+               if (ret == -EIO)
+                       cachefiles_io_error_obj(
+                               object, "Write page to backing file failed");
+               ret = -ENOBUFS;
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * detach a backing block from a page
+ * - cache withdrawal is prevented by the caller
+ */
+void cachefiles_uncache_page(struct fscache_object *_object, struct page *page)
+{
+       struct cachefiles_object *object;
+       struct cachefiles_cache *cache;
+
+       object = container_of(_object, struct cachefiles_object, fscache);
+       cache = container_of(object->fscache.cache,
+                            struct cachefiles_cache, cache);
+
+       _enter("%p,{%lu}", object, page->index);
+
+       spin_unlock(&object->fscache.cookie->lock);
+}
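The retrieval and allocation operations above share a return convention (summarised from the comments earlier in this file): 0 means the request was satisfied or submitted to the backing filesystem, -ENODATA means backing blocks were allocated but hold no data yet, and -ENOBUFS means the cache cannot service the request and the netfs should go to the server directly. A hypothetical caller-side sketch, assuming the cachefiles_read_or_alloc_page() entry point defined earlier in this file:

	ret = cachefiles_read_or_alloc_page(op, page, gfp);
	switch (ret) {
	case 0:		/* read submitted; data arrives via the backing fs */
		break;
	case -ENODATA:	/* block allocated but empty: fetch from the server,
			 * then store the result back into the cache */
		break;
	case -ENOBUFS:	/* no space or beyond EOF: bypass the cache entirely */
		break;
	}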
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
new file mode 100644 (file)
index 0000000..b5808cd
--- /dev/null
@@ -0,0 +1,116 @@
+/* CacheFiles security management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/cred.h>
+#include "internal.h"
+
+/*
+ * determine the security context within which we access the cache from within
+ * the kernel
+ */
+int cachefiles_get_security_ID(struct cachefiles_cache *cache)
+{
+       struct cred *new;
+       int ret;
+
+       _enter("{%s}", cache->secctx);
+
+       new = prepare_kernel_cred(current);
+       if (!new) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
+       if (cache->secctx) {
+               ret = set_security_override_from_ctx(new, cache->secctx);
+               if (ret < 0) {
+                       put_cred(new);
+                       printk(KERN_ERR "CacheFiles:"
+                              " Security denies permission to nominate"
+                              " security context: error %d\n",
+                              ret);
+                       goto error;
+               }
+       }
+
+       cache->cache_cred = new;
+       ret = 0;
+error:
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * see if mkdir and create can be performed in the root directory
+ */
+static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
+                                     struct dentry *root)
+{
+       int ret;
+
+       ret = security_inode_mkdir(root->d_inode, root, 0);
+       if (ret < 0) {
+               printk(KERN_ERR "CacheFiles:"
+                      " Security denies permission to make dirs: error %d",
+                      ret);
+               return ret;
+       }
+
+       ret = security_inode_create(root->d_inode, root, 0);
+       if (ret < 0)
+               printk(KERN_ERR "CacheFiles:"
+                      " Security denies permission to create files: error %d",
+                      ret);
+
+       return ret;
+}
+
+/*
+ * check the security details of the on-disk cache
+ * - must be called with security override in force
+ */
+int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
+                                       struct dentry *root,
+                                       const struct cred **_saved_cred)
+{
+       struct cred *new;
+       int ret;
+
+       _enter("");
+
+       /* duplicate the cache creds for COW (the override is currently in
+        * force, so we can use prepare_creds() to do this) */
+       new = prepare_creds();
+       if (!new)
+               return -ENOMEM;
+
+       cachefiles_end_secure(cache, *_saved_cred);
+
+       /* use the cache root dir's security context as the basis with
+        * which to create files */
+       ret = set_create_files_as(new, root->d_inode);
+       if (ret < 0) {
+               _leave(" = %d [cfa]", ret);
+               return ret;
+       }
+
+       put_cred(cache->cache_cred);
+       cache->cache_cred = new;
+
+       cachefiles_begin_secure(cache, _saved_cred);
+       ret = cachefiles_check_cache_dir(cache, root);
+
+       if (ret == -EOPNOTSUPP)
+               ret = 0;
+       _leave(" = %d", ret);
+       return ret;
+}
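The two helpers above only compute and install cache->cache_cred; the actual switch into that security context happens in cachefiles_begin_secure()/cachefiles_end_secure() (defined in internal.h, not part of this hunk), which are assumed to wrap override_creds()/revert_creds(). A minimal usage sketch:

	const struct cred *saved_cred;

	cachefiles_begin_secure(cache, &saved_cred);	/* act as cache->cache_cred */
	/* ... lookups, mkdir and create within the cache tree ... */
	cachefiles_end_secure(cache, saved_cred);	/* restore the caller's creds */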
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
new file mode 100644 (file)
index 0000000..f3e7a0b
--- /dev/null
@@ -0,0 +1,291 @@
+/* CacheFiles extended attribute management
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/quotaops.h>
+#include <linux/xattr.h>
+#include "internal.h"
+
+static const char cachefiles_xattr_cache[] =
+       XATTR_USER_PREFIX "CacheFiles.cache";
+
+/*
+ * check the type label on an object
+ * - done using xattrs
+ */
+int cachefiles_check_object_type(struct cachefiles_object *object)
+{
+       struct dentry *dentry = object->dentry;
+       char type[3], xtype[3];
+       int ret;
+
+       ASSERT(dentry);
+       ASSERT(dentry->d_inode);
+
+       if (!object->fscache.cookie)
+               strcpy(type, "C3");
+       else
+               snprintf(type, 3, "%02x", object->fscache.cookie->def->type);
+
+       _enter("%p{%s}", object, type);
+
+       /* attempt to install a type label directly */
+       ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2,
+                          XATTR_CREATE);
+       if (ret == 0) {
+               _debug("SET"); /* we succeeded */
+               goto error;
+       }
+
+       if (ret != -EEXIST) {
+               kerror("Can't set xattr on %*.*s [%lu] (err %d)",
+                      dentry->d_name.len, dentry->d_name.len,
+                      dentry->d_name.name, dentry->d_inode->i_ino,
+                      -ret);
+               goto error;
+       }
+
+       /* read the current type label */
+       ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3);
+       if (ret < 0) {
+               if (ret == -ERANGE)
+                       goto bad_type_length;
+
+               kerror("Can't read xattr on %*.*s [%lu] (err %d)",
+                      dentry->d_name.len, dentry->d_name.len,
+                      dentry->d_name.name, dentry->d_inode->i_ino,
+                      -ret);
+               goto error;
+       }
+
+       /* check the type is what we're expecting */
+       if (ret != 2)
+               goto bad_type_length;
+
+       if (xtype[0] != type[0] || xtype[1] != type[1])
+               goto bad_type;
+
+       ret = 0;
+
+error:
+       _leave(" = %d", ret);
+       return ret;
+
+bad_type_length:
+       kerror("Cache object %lu type xattr length incorrect",
+              dentry->d_inode->i_ino);
+       ret = -EIO;
+       goto error;
+
+bad_type:
+       xtype[2] = 0;
+       kerror("Cache object %*.*s [%lu] type %s not %s",
+              dentry->d_name.len, dentry->d_name.len,
+              dentry->d_name.name, dentry->d_inode->i_ino,
+              xtype, type);
+       ret = -EIO;
+       goto error;
+}
+
+/*
+ * set the state xattr on a cache file
+ */
+int cachefiles_set_object_xattr(struct cachefiles_object *object,
+                               struct cachefiles_xattr *auxdata)
+{
+       struct dentry *dentry = object->dentry;
+       int ret;
+
+       ASSERT(object->fscache.cookie);
+       ASSERT(dentry);
+
+       _enter("%p,#%d", object, auxdata->len);
+
+       /* attempt to install the cache metadata directly */
+       _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+
+       ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+                          &auxdata->type, auxdata->len,
+                          XATTR_CREATE);
+       if (ret < 0 && ret != -ENOMEM)
+               cachefiles_io_error_obj(
+                       object,
+                       "Failed to set xattr with error %d", ret);
+
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * update the state xattr on a cache file
+ */
+int cachefiles_update_object_xattr(struct cachefiles_object *object,
+                                  struct cachefiles_xattr *auxdata)
+{
+       struct dentry *dentry = object->dentry;
+       int ret;
+
+       ASSERT(object->fscache.cookie);
+       ASSERT(dentry);
+
+       _enter("%p,#%d", object, auxdata->len);
+
+       /* attempt to install the cache metadata directly */
+       _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len);
+
+       ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+                          &auxdata->type, auxdata->len,
+                          XATTR_REPLACE);
+       if (ret < 0 && ret != -ENOMEM)
+               cachefiles_io_error_obj(
+                       object,
+                       "Failed to update xattr with error %d", ret);
+
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * check the state xattr on a cache file
+ * - return -ESTALE if the object should be deleted
+ */
+int cachefiles_check_object_xattr(struct cachefiles_object *object,
+                                 struct cachefiles_xattr *auxdata)
+{
+       struct cachefiles_xattr *auxbuf;
+       struct dentry *dentry = object->dentry;
+       int ret;
+
+       _enter("%p,#%d", object, auxdata->len);
+
+       ASSERT(dentry);
+       ASSERT(dentry->d_inode);
+
+       auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+       if (!auxbuf) {
+               _leave(" = -ENOMEM");
+               return -ENOMEM;
+       }
+
+       /* read the current type label */
+       ret = vfs_getxattr(dentry, cachefiles_xattr_cache,
+                          &auxbuf->type, 512 + 1);
+       if (ret < 0) {
+               if (ret == -ENODATA)
+                       goto stale; /* no attribute - power went off
+                                    * mid-cull? */
+
+               if (ret == -ERANGE)
+                       goto bad_type_length;
+
+               cachefiles_io_error_obj(object,
+                                       "Can't read xattr on %lu (err %d)",
+                                       dentry->d_inode->i_ino, -ret);
+               goto error;
+       }
+
+       /* check the on-disk object */
+       if (ret < 1)
+               goto bad_type_length;
+
+       if (auxbuf->type != auxdata->type)
+               goto stale;
+
+       auxbuf->len = ret;
+
+       /* consult the netfs */
+       if (object->fscache.cookie->def->check_aux) {
+               enum fscache_checkaux result;
+               unsigned int dlen;
+
+               dlen = auxbuf->len - 1;
+
+               _debug("checkaux %s #%u",
+                      object->fscache.cookie->def->name, dlen);
+
+               result = fscache_check_aux(&object->fscache,
+                                          &auxbuf->data, dlen);
+
+               switch (result) {
+                       /* entry okay as is */
+               case FSCACHE_CHECKAUX_OKAY:
+                       goto okay;
+
+                       /* entry requires update */
+               case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+                       break;
+
+                       /* entry requires deletion */
+               case FSCACHE_CHECKAUX_OBSOLETE:
+                       goto stale;
+
+               default:
+                       BUG();
+               }
+
+               /* update the current label */
+               ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
+                                  &auxdata->type, auxdata->len,
+                                  XATTR_REPLACE);
+               if (ret < 0) {
+                       cachefiles_io_error_obj(object,
+                                               "Can't update xattr on %lu"
+                                               " (error %d)",
+                                               dentry->d_inode->i_ino, -ret);
+                       goto error;
+               }
+       }
+
+okay:
+       ret = 0;
+
+error:
+       kfree(auxbuf);
+       _leave(" = %d", ret);
+       return ret;
+
+bad_type_length:
+       kerror("Cache object %lu xattr length incorrect",
+              dentry->d_inode->i_ino);
+       ret = -EIO;
+       goto error;
+
+stale:
+       ret = -ESTALE;
+       goto error;
+}
+
+/*
+ * remove the object's xattr to mark it stale
+ */
+int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
+                                  struct dentry *dentry)
+{
+       int ret;
+
+       ret = vfs_removexattr(dentry, cachefiles_xattr_cache);
+       if (ret < 0) {
+               if (ret == -ENOENT || ret == -ENODATA)
+                       ret = 0;
+               else if (ret != -ENOMEM)
+                       cachefiles_io_error(cache,
+                                           "Can't remove xattr from %lu"
+                                           " (error %d)",
+                                           dentry->d_inode->i_ino, -ret);
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
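For reference, the type label checked by cachefiles_check_object_type() above is a two-character string stored in the "user.CacheFiles.cache" xattr (XATTR_USER_PREFIX is "user."): the cookie's def->type rendered as two hex digits, or the special value "C3" for objects that carry no cookie. A small userspace sketch of the encoding, with the cookie type value made up for illustration:

	#include <stdio.h>

	int main(void)
	{
		char type[3];
		unsigned int cookie_type = 0x01;	/* hypothetical def->type */

		snprintf(type, sizeof(type), "%02x", cookie_type);
		printf("user.CacheFiles.cache = \"%s\"\n", type);	/* "01" */
		printf("cookie-less object    = \"C3\"\n");
		return 0;
	}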
index ff786687e93b7289e74f0587c170ca5a2f1006c0..3e87ce443ea2c0fd39f32743d56d9cdcd89175d2 100644 (file)
@@ -23,7 +23,7 @@
 #include <linux/if.h>
 #include <linux/if_bridge.h>
 #include <linux/slab.h>
-#include <linux/raid/md.h>
+#include <linux/raid/md_u.h>
 #include <linux/kd.h>
 #include <linux/route.h>
 #include <linux/in6.h>
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644 (file)
index 0000000..1b2d4c6
--- /dev/null
@@ -0,0 +1,3 @@
+- Out-of-space may cause a severe problem if the object (and directory entry)
+  were written, but writing the inode attributes failed. If the filesystem is
+  then unmounted and remounted, the kernel can get into an endless loop doing a
+  readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644 (file)
index 0000000..cc2d22d
--- /dev/null
@@ -0,0 +1,16 @@
+#
+# Kbuild for the EXOFS module
+#
+# Copyright (C) 2008 Panasas Inc.  All rights reserved.
+#
+# Authors:
+#   Boaz Harrosh <bharrosh@panasas.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2
+#
+# Kbuild - Gets included from the Kernel's Makefile and build system
+#
+
+exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644 (file)
index 0000000..86194b2
--- /dev/null
@@ -0,0 +1,13 @@
+config EXOFS_FS
+       tristate "exofs: OSD based file system support"
+       depends on SCSI_OSD_ULD
+       help
+         EXOFS is a file system that uses an OSD storage device
+         as its backing storage.
+
+# Debugging-related stuff
+config EXOFS_DEBUG
+       bool "Enable debugging"
+       depends on EXOFS_FS
+       help
+         This option enables EXOFS debug prints.
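Given the dependency declared above, a kernel configuration that builds exofs as a module might contain the following fragment (illustrative only; SCSI_OSD_ULD itself has further SCSI dependencies not shown here):

	CONFIG_SCSI_OSD_ULD=m
	CONFIG_EXOFS_FS=m
	CONFIG_EXOFS_DEBUG=y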
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644 (file)
index 0000000..b1512c4
--- /dev/null
@@ -0,0 +1,184 @@
+/*
+ * common.h - Common definitions for both Kernel and user-mode utilities
+ *
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include <linux/types.h>
+
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_MIN_PID   0x10000        /* Smallest partition ID */
+#define EXOFS_OBJ_OFF  0x10000 /* offset for objects */
+#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
+#define EXOFS_ROOT_ID  0x10002 /* object ID for root directory */
+
+/* exofs Application specific page/attribute */
+# define EXOFS_APAGE_FS_DATA   (OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define EXOFS_ATTR_INODE_DATA 1
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number.  This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+       EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+                                       (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
+       EXOFS_MAX_ID     = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT 12
+#define EXOFS_BLKSIZE  (1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC      0x5DF5
+
+/*
+ * The file system control block - stored in an object's data (namely, the one
+ * with ID EXOFS_SUPER_ID).  This is where the in-memory superblock is stored
+ * on disk.  Right now it just has a magic value, which is basically a sanity
+ * check on our ability to communicate with the object store.
+ */
+struct exofs_fscb {
+       __le64  s_nextid;       /* Highest object ID used */
+       __le32  s_numfiles;     /* Number of files on fs */
+       __le16  s_magic;        /* Magic signature */
+       __le16  s_newfs;        /* Non-zero if this is a new fs */
+};
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA            5
+
+/*
+ * The file control block - stored in an object's attributes.  This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+       __le64  i_size;                 /* Size of the file */
+       __le16  i_mode;                 /* File mode */
+       __le16  i_links_count;          /* Links count */
+       __le32  i_uid;                  /* Owner Uid */
+       __le32  i_gid;                  /* Group Id */
+       __le32  i_atime;                /* Access time */
+       __le32  i_ctime;                /* Creation time */
+       __le32  i_mtime;                /* Modification time */
+       __le32  i_flags;                /* File flags (unused for now)*/
+       __le32  i_generation;           /* File version (for NFS) */
+       __le32  i_data[EXOFS_IDATA];    /* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE    sizeof(struct exofs_fcb)
+
+/* This is the Attribute the fcb is stored in */
+static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
+       EXOFS_APAGE_FS_DATA,
+       EXOFS_ATTR_INODE_DATA,
+       EXOFS_INO_ATTR_SIZE);
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN 255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+       __le64          inode_no;               /* inode number           */
+       __le16          rec_len;                /* directory entry length */
+       u8              name_len;               /* name length            */
+       u8              file_type;              /* umm...file type        */
+       char            name[EXOFS_NAME_LEN];   /* file name              */
+};
+
+enum {
+       EXOFS_FT_UNKNOWN,
+       EXOFS_FT_REG_FILE,
+       EXOFS_FT_DIR,
+       EXOFS_FT_CHRDEV,
+       EXOFS_FT_BLKDEV,
+       EXOFS_FT_FIFO,
+       EXOFS_FT_SOCK,
+       EXOFS_FT_SYMLINK,
+       EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD                  4
+#define EXOFS_DIR_ROUND                        (EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+       (((name_len) + offsetof(struct exofs_dir_entry, name)  + \
+         EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+/*************************
+ * function declarations *
+ *************************/
+/* osd.c                 */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+                          const struct osd_obj_id *obj);
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
+static inline int exofs_check_ok(struct osd_request *or)
+{
+       return exofs_check_ok_resid(or, NULL, NULL);
+}
+int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
+int exofs_async_op(struct osd_request *or,
+       osd_req_done_fn *async_done, void *caller_context, u8 *cred);
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
+
+int osd_req_read_kern(struct osd_request *or,
+       const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+int osd_req_write_kern(struct osd_request *or,
+       const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+#endif /*ifndef __EXOFS_COM_H__*/
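As a worked check of the EXOFS_DIR_REC_LEN() rounding above: the exofs_dir_entry layout puts 8 + 2 + 1 + 1 = 12 bytes before the name, and record lengths are padded up to a multiple of EXOFS_DIR_PAD (4). A small userspace sketch, assuming offsetof(struct exofs_dir_entry, name) == 12 as laid out above:

	#include <assert.h>

	#define EXOFS_DIR_PAD	4
	#define EXOFS_DIR_ROUND	(EXOFS_DIR_PAD - 1)
	#define EXOFS_DIR_REC_LEN(name_len) \
		(((name_len) + 12 + EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)

	int main(void)
	{
		assert(EXOFS_DIR_REC_LEN(1)  == 16);	/* "." entry */
		assert(EXOFS_DIR_REC_LEN(2)  == 16);	/* ".." entry */
		assert(EXOFS_DIR_REC_LEN(13) == 28);	/* 13-character name */
		return 0;
	}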
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644 (file)
index 0000000..65b0c8c
--- /dev/null
@@ -0,0 +1,672 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline unsigned exofs_chunk_size(struct inode *inode)
+{
+       return inode->i_sb->s_blocksize;
+}
+
+static inline void exofs_put_page(struct page *page)
+{
+       kunmap(page);
+       page_cache_release(page);
+}
+
+/* Accesses to dir's inode->i_size must be made under the inode lock */
+static inline unsigned long dir_pages(struct inode *inode)
+{
+       return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+       loff_t last_byte = inode->i_size;
+
+       last_byte -= page_nr << PAGE_CACHE_SHIFT;
+       if (last_byte > PAGE_CACHE_SIZE)
+               last_byte = PAGE_CACHE_SIZE;
+       return last_byte;
+}
+
+static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+       struct address_space *mapping = page->mapping;
+       struct inode *dir = mapping->host;
+       int err = 0;
+
+       dir->i_version++;
+
+       if (!PageUptodate(page))
+               SetPageUptodate(page);
+
+       if (pos+len > dir->i_size) {
+               i_size_write(dir, pos+len);
+               mark_inode_dirty(dir);
+       }
+       set_page_dirty(page);
+
+       if (IS_DIRSYNC(dir))
+               err = write_one_page(page, 1);
+       else
+               unlock_page(page);
+
+       return err;
+}
+
+static void exofs_check_page(struct page *page)
+{
+       struct inode *dir = page->mapping->host;
+       unsigned chunk_size = exofs_chunk_size(dir);
+       char *kaddr = page_address(page);
+       unsigned offs, rec_len;
+       unsigned limit = PAGE_CACHE_SIZE;
+       struct exofs_dir_entry *p;
+       char *error;
+
+       /* if the page is the last one in the directory */
+       if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+               limit = dir->i_size & ~PAGE_CACHE_MASK;
+               if (limit & (chunk_size - 1))
+                       goto Ebadsize;
+               if (!limit)
+                       goto out;
+       }
+       for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
+               p = (struct exofs_dir_entry *)(kaddr + offs);
+               rec_len = le16_to_cpu(p->rec_len);
+
+               if (rec_len < EXOFS_DIR_REC_LEN(1))
+                       goto Eshort;
+               if (rec_len & 3)
+                       goto Ealign;
+               if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
+                       goto Enamelen;
+               if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+                       goto Espan;
+       }
+       if (offs != limit)
+               goto Eend;
+out:
+       SetPageChecked(page);
+       return;
+
+Ebadsize:
+       EXOFS_ERR("ERROR [exofs_check_page]: "
+               "size of directory #%lu is not a multiple of chunk size",
+               dir->i_ino
+       );
+       goto fail;
+Eshort:
+       error = "rec_len is smaller than minimal";
+       goto bad_entry;
+Ealign:
+       error = "unaligned directory entry";
+       goto bad_entry;
+Enamelen:
+       error = "rec_len is too small for name_len";
+       goto bad_entry;
+Espan:
+       error = "directory entry across blocks";
+       goto bad_entry;
+bad_entry:
+       EXOFS_ERR(
+               "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
+               "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
+               dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+               _LLU(le64_to_cpu(p->inode_no)),
+               rec_len, p->name_len);
+       goto fail;
+Eend:
+       p = (struct exofs_dir_entry *)(kaddr + offs);
+       EXOFS_ERR("ERROR [exofs_check_page]: "
+               "entry in directory #%lu spans the page boundary: "
+               "offset=%lu, inode=%llu",
+               dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+               _LLU(le64_to_cpu(p->inode_no)));
+fail:
+       SetPageChecked(page);
+       SetPageError(page);
+}
+
+static struct page *exofs_get_page(struct inode *dir, unsigned long n)
+{
+       struct address_space *mapping = dir->i_mapping;
+       struct page *page = read_mapping_page(mapping, n, NULL);
+
+       if (!IS_ERR(page)) {
+               kmap(page);
+               if (!PageChecked(page))
+                       exofs_check_page(page);
+               if (PageError(page))
+                       goto fail;
+       }
+       return page;
+
+fail:
+       exofs_put_page(page);
+       return ERR_PTR(-EIO);
+}
+
+static inline int exofs_match(int len, const unsigned char *name,
+                                       struct exofs_dir_entry *de)
+{
+       if (len != de->name_len)
+               return 0;
+       if (!de->inode_no)
+               return 0;
+       return !memcmp(name, de->name, len);
+}
+
+static inline
+struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
+{
+       return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+}
+
+static inline unsigned
+exofs_validate_entry(char *base, unsigned offset, unsigned mask)
+{
+       struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
+       struct exofs_dir_entry *p =
+                       (struct exofs_dir_entry *)(base + (offset&mask));
+       while ((char *)p < (char *)de) {
+               if (p->rec_len == 0)
+                       break;
+               p = exofs_next_entry(p);
+       }
+       return (char *)p - base;
+}
+
+static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
+       [EXOFS_FT_UNKNOWN]      = DT_UNKNOWN,
+       [EXOFS_FT_REG_FILE]     = DT_REG,
+       [EXOFS_FT_DIR]          = DT_DIR,
+       [EXOFS_FT_CHRDEV]       = DT_CHR,
+       [EXOFS_FT_BLKDEV]       = DT_BLK,
+       [EXOFS_FT_FIFO]         = DT_FIFO,
+       [EXOFS_FT_SOCK]         = DT_SOCK,
+       [EXOFS_FT_SYMLINK]      = DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
+       [S_IFREG >> S_SHIFT]    = EXOFS_FT_REG_FILE,
+       [S_IFDIR >> S_SHIFT]    = EXOFS_FT_DIR,
+       [S_IFCHR >> S_SHIFT]    = EXOFS_FT_CHRDEV,
+       [S_IFBLK >> S_SHIFT]    = EXOFS_FT_BLKDEV,
+       [S_IFIFO >> S_SHIFT]    = EXOFS_FT_FIFO,
+       [S_IFSOCK >> S_SHIFT]   = EXOFS_FT_SOCK,
+       [S_IFLNK >> S_SHIFT]    = EXOFS_FT_SYMLINK,
+};
+
+static inline
+void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
+{
+       mode_t mode = inode->i_mode;
+       de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static int
+exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+       loff_t pos = filp->f_pos;
+       struct inode *inode = filp->f_path.dentry->d_inode;
+       unsigned int offset = pos & ~PAGE_CACHE_MASK;
+       unsigned long n = pos >> PAGE_CACHE_SHIFT;
+       unsigned long npages = dir_pages(inode);
+       unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
+       unsigned char *types = NULL;
+       int need_revalidate = (filp->f_version != inode->i_version);
+
+       if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
+               return 0;
+
+       types = exofs_filetype_table;
+
+       for ( ; n < npages; n++, offset = 0) {
+               char *kaddr, *limit;
+               struct exofs_dir_entry *de;
+               struct page *page = exofs_get_page(inode, n);
+
+               if (IS_ERR(page)) {
+                       EXOFS_ERR("ERROR: "
+                                  "bad page in #%lu",
+                                  inode->i_ino);
+                       filp->f_pos += PAGE_CACHE_SIZE - offset;
+                       return PTR_ERR(page);
+               }
+               kaddr = page_address(page);
+               if (unlikely(need_revalidate)) {
+                       if (offset) {
+                               offset = exofs_validate_entry(kaddr, offset,
+                                                               chunk_mask);
+                               filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+                       }
+                       filp->f_version = inode->i_version;
+                       need_revalidate = 0;
+               }
+               de = (struct exofs_dir_entry *)(kaddr + offset);
+               limit = kaddr + exofs_last_byte(inode, n) -
+                                                       EXOFS_DIR_REC_LEN(1);
+               for (; (char *)de <= limit; de = exofs_next_entry(de)) {
+                       if (de->rec_len == 0) {
+                               EXOFS_ERR("ERROR: "
+                                       "zero-length directory entry");
+                               exofs_put_page(page);
+                               return -EIO;
+                       }
+                       if (de->inode_no) {
+                               int over;
+                               unsigned char d_type = DT_UNKNOWN;
+
+                               if (types && de->file_type < EXOFS_FT_MAX)
+                                       d_type = types[de->file_type];
+
+                               offset = (char *)de - kaddr;
+                               over = filldir(dirent, de->name, de->name_len,
+                                               (n<<PAGE_CACHE_SHIFT) | offset,
+                                               le64_to_cpu(de->inode_no),
+                                               d_type);
+                               if (over) {
+                                       exofs_put_page(page);
+                                       return 0;
+                               }
+                       }
+                       filp->f_pos += le16_to_cpu(de->rec_len);
+               }
+               exofs_put_page(page);
+       }
+
+       return 0;
+}
+
+struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
+                       struct dentry *dentry, struct page **res_page)
+{
+       const unsigned char *name = dentry->d_name.name;
+       int namelen = dentry->d_name.len;
+       unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+       unsigned long start, n;
+       unsigned long npages = dir_pages(dir);
+       struct page *page = NULL;
+       struct exofs_i_info *oi = exofs_i(dir);
+       struct exofs_dir_entry *de;
+
+       if (npages == 0)
+               goto out;
+
+       *res_page = NULL;
+
+       start = oi->i_dir_start_lookup;
+       if (start >= npages)
+               start = 0;
+       n = start;
+       do {
+               char *kaddr;
+               page = exofs_get_page(dir, n);
+               if (!IS_ERR(page)) {
+                       kaddr = page_address(page);
+                       de = (struct exofs_dir_entry *) kaddr;
+                       kaddr += exofs_last_byte(dir, n) - reclen;
+                       while ((char *) de <= kaddr) {
+                               if (de->rec_len == 0) {
+                                       EXOFS_ERR(
+                                               "ERROR: exofs_find_entry: "
+                                               "zero-length directory entry");
+                                       exofs_put_page(page);
+                                       goto out;
+                               }
+                               if (exofs_match(namelen, name, de))
+                                       goto found;
+                               de = exofs_next_entry(de);
+                       }
+                       exofs_put_page(page);
+               }
+               if (++n >= npages)
+                       n = 0;
+       } while (n != start);
+out:
+       return NULL;
+
+found:
+       *res_page = page;
+       oi->i_dir_start_lookup = n;
+       return de;
+}
+
+struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
+{
+       struct page *page = exofs_get_page(dir, 0);
+       struct exofs_dir_entry *de = NULL;
+
+       if (!IS_ERR(page)) {
+               de = exofs_next_entry(
+                               (struct exofs_dir_entry *)page_address(page));
+               *p = page;
+       }
+       return de;
+}
+
+ino_t exofs_parent_ino(struct dentry *child)
+{
+       struct page *page;
+       struct exofs_dir_entry *de;
+       ino_t ino;
+
+       de = exofs_dotdot(child->d_inode, &page);
+       if (!de)
+               return 0;
+
+       ino = le64_to_cpu(de->inode_no);
+       exofs_put_page(page);
+       return ino;
+}
+
+ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+       ino_t res = 0;
+       struct exofs_dir_entry *de;
+       struct page *page;
+
+       de = exofs_find_entry(dir, dentry, &page);
+       if (de) {
+               res = le64_to_cpu(de->inode_no);
+               exofs_put_page(page);
+       }
+       return res;
+}
+
+int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
+                       struct page *page, struct inode *inode)
+{
+       loff_t pos = page_offset(page) +
+                       (char *) de - (char *) page_address(page);
+       unsigned len = le16_to_cpu(de->rec_len);
+       int err;
+
+       lock_page(page);
+       err = exofs_write_begin(NULL, page->mapping, pos, len,
+                               AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+       if (err)
+               EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n",
+                         err);
+
+       de->inode_no = cpu_to_le64(inode->i_ino);
+       exofs_set_de_type(de, inode);
+       if (likely(!err))
+               err = exofs_commit_chunk(page, pos, len);
+       exofs_put_page(page);
+       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+       mark_inode_dirty(dir);
+       return err;
+}
+
+int exofs_add_link(struct dentry *dentry, struct inode *inode)
+{
+       struct inode *dir = dentry->d_parent->d_inode;
+       const unsigned char *name = dentry->d_name.name;
+       int namelen = dentry->d_name.len;
+       unsigned chunk_size = exofs_chunk_size(dir);
+       unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+       unsigned short rec_len, name_len;
+       struct page *page = NULL;
+       struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+       struct exofs_dir_entry *de;
+       unsigned long npages = dir_pages(dir);
+       unsigned long n;
+       char *kaddr;
+       loff_t pos;
+       int err;
+
+       for (n = 0; n <= npages; n++) {
+               char *dir_end;
+
+               page = exofs_get_page(dir, n);
+               err = PTR_ERR(page);
+               if (IS_ERR(page))
+                       goto out;
+               lock_page(page);
+               kaddr = page_address(page);
+               dir_end = kaddr + exofs_last_byte(dir, n);
+               de = (struct exofs_dir_entry *)kaddr;
+               kaddr += PAGE_CACHE_SIZE - reclen;
+               while ((char *)de <= kaddr) {
+                       if ((char *)de == dir_end) {
+                               name_len = 0;
+                               rec_len = chunk_size;
+                               de->rec_len = cpu_to_le16(chunk_size);
+                               de->inode_no = 0;
+                               goto got_it;
+                       }
+                       if (de->rec_len == 0) {
+                               EXOFS_ERR("ERROR: exofs_add_link: "
+                                       "zero-length directory entry");
+                               err = -EIO;
+                               goto out_unlock;
+                       }
+                       err = -EEXIST;
+                       if (exofs_match(namelen, name, de))
+                               goto out_unlock;
+                       name_len = EXOFS_DIR_REC_LEN(de->name_len);
+                       rec_len = le16_to_cpu(de->rec_len);
+                       if (!de->inode_no && rec_len >= reclen)
+                               goto got_it;
+                       if (rec_len >= name_len + reclen)
+                               goto got_it;
+                       de = (struct exofs_dir_entry *) ((char *) de + rec_len);
+               }
+               unlock_page(page);
+               exofs_put_page(page);
+       }
+
+       EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
+       return -EINVAL;
+
+got_it:
+       pos = page_offset(page) +
+               (char *)de - (char *)page_address(page);
+       err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
+                                                       &page, NULL);
+       if (err)
+               goto out_unlock;
+       if (de->inode_no) {
+               struct exofs_dir_entry *de1 =
+                       (struct exofs_dir_entry *)((char *)de + name_len);
+               de1->rec_len = cpu_to_le16(rec_len - name_len);
+               de->rec_len = cpu_to_le16(name_len);
+               de = de1;
+       }
+       de->name_len = namelen;
+       memcpy(de->name, name, namelen);
+       de->inode_no = cpu_to_le64(inode->i_ino);
+       exofs_set_de_type(de, inode);
+       err = exofs_commit_chunk(page, pos, rec_len);
+       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+       mark_inode_dirty(dir);
+       sbi->s_numfiles++;
+
+out_put:
+       exofs_put_page(page);
+out:
+       return err;
+out_unlock:
+       unlock_page(page);
+       goto out_put;
+}
+
+int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
+{
+       struct address_space *mapping = page->mapping;
+       struct inode *inode = mapping->host;
+       struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+       char *kaddr = page_address(page);
+       unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
+       unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+       loff_t pos;
+       struct exofs_dir_entry *pde = NULL;
+       struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
+       int err;
+
+       while (de < dir) {
+               if (de->rec_len == 0) {
+                       EXOFS_ERR("ERROR: exofs_delete_entry: "
+                               "zero-length directory entry");
+                       err = -EIO;
+                       goto out;
+               }
+               pde = de;
+               de = exofs_next_entry(de);
+       }
+       if (pde)
+               from = (char *)pde - (char *)page_address(page);
+       pos = page_offset(page) + from;
+       lock_page(page);
+       err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
+                                                       &page, NULL);
+       if (err)
+               EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILED => %d\n",
+                         err);
+       if (pde)
+               pde->rec_len = cpu_to_le16(to - from);
+       dir->inode_no = 0;
+       if (likely(!err))
+               err = exofs_commit_chunk(page, pos, to - from);
+       inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+       mark_inode_dirty(inode);
+       sbi->s_numfiles--;
+out:
+       exofs_put_page(page);
+       return err;
+}
+
+/* kept aligned on 4 bytes */
+#define THIS_DIR ".\0\0"
+#define PARENT_DIR "..\0"
+
+int exofs_make_empty(struct inode *inode, struct inode *parent)
+{
+       struct address_space *mapping = inode->i_mapping;
+       struct page *page = grab_cache_page(mapping, 0);
+       unsigned chunk_size = exofs_chunk_size(inode);
+       struct exofs_dir_entry *de;
+       int err;
+       void *kaddr;
+
+       if (!page)
+               return -ENOMEM;
+
+       err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
+                                                       &page, NULL);
+       if (err) {
+               unlock_page(page);
+               goto fail;
+       }
+
+       kaddr = kmap_atomic(page, KM_USER0);
+       de = (struct exofs_dir_entry *)kaddr;
+       de->name_len = 1;
+       de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
+       memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
+       de->inode_no = cpu_to_le64(inode->i_ino);
+       exofs_set_de_type(de, inode);
+
+       de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
+       de->name_len = 2;
+       de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
+       de->inode_no = cpu_to_le64(parent->i_ino);
+       memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
+       exofs_set_de_type(de, inode);
+       kunmap_atomic(page, KM_USER0);
+       err = exofs_commit_chunk(page, 0, chunk_size);
+fail:
+       page_cache_release(page);
+       return err;
+}
+
+int exofs_empty_dir(struct inode *inode)
+{
+       struct page *page = NULL;
+       unsigned long i, npages = dir_pages(inode);
+
+       for (i = 0; i < npages; i++) {
+               char *kaddr;
+               struct exofs_dir_entry *de;
+               page = exofs_get_page(inode, i);
+
+               if (IS_ERR(page))
+                       continue;
+
+               kaddr = page_address(page);
+               de = (struct exofs_dir_entry *)kaddr;
+               kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
+
+               while ((char *)de <= kaddr) {
+                       if (de->rec_len == 0) {
+                               EXOFS_ERR("ERROR: exofs_empty_dir: "
+                                         "zero-length directory entry "
+                                         "kaddr=%p, de=%p\n", kaddr, de);
+                               goto not_empty;
+                       }
+                       if (de->inode_no != 0) {
+                               /* check for . and .. */
+                               if (de->name[0] != '.')
+                                       goto not_empty;
+                               if (de->name_len > 2)
+                                       goto not_empty;
+                               if (de->name_len < 2) {
+                                       if (le64_to_cpu(de->inode_no) !=
+                                           inode->i_ino)
+                                               goto not_empty;
+                               } else if (de->name[1] != '.')
+                                       goto not_empty;
+                       }
+                       de = exofs_next_entry(de);
+               }
+               exofs_put_page(page);
+       }
+       return 1;
+
+not_empty:
+       exofs_put_page(page);
+       return 0;
+}
+
+const struct file_operations exofs_dir_operations = {
+       .llseek         = generic_file_llseek,
+       .read           = generic_read_dir,
+       .readdir        = exofs_readdir,
+};
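Putting the dir helpers together: with the 4096-byte EXOFS_BLKSIZE defined in common.h, exofs_make_empty() above lays out a fresh directory chunk as a "." entry followed by a ".." entry that absorbs the rest of the chunk. The resulting on-disk layout (offsets follow from EXOFS_DIR_REC_LEN(1) == 16, as worked through after common.h):

	offset 0:   inode_no = <self>,   rec_len = 16,   name_len = 1, name = "."
	offset 16:  inode_no = <parent>, rec_len = 4080, name_len = 2, name = ".."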
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644 (file)
index 0000000..0fd4c78
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include "common.h"
+
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
+
+#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define EXOFS_DBGMSG(fmt, a...) \
+       printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define EXOFS_DBGMSG(fmt, a...) \
+       do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk; this casts it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+/*
+ * our extension to the in-memory superblock
+ */
+struct exofs_sb_info {
+       struct osd_dev  *s_dev;                 /* returned by get_osd_dev    */
+       osd_id          s_pid;                  /* partition ID of file system*/
+       int             s_timeout;              /* timeout for OSD operations */
+       uint64_t        s_nextid;               /* highest object ID used     */
+       uint32_t        s_numfiles;             /* number of files on fs      */
+       spinlock_t      s_next_gen_lock;        /* spinlock for gen # update  */
+       u32             s_next_generation;      /* next gen # to use          */
+       atomic_t        s_curr_pending;         /* number of pending commands */
+       uint8_t         s_cred[OSD_CAP_LEN];    /* all-powerful credential    */
+};
+
+/*
+ * our extension to the in-memory inode
+ */
+struct exofs_i_info {
+       unsigned long  i_flags;            /* various atomic flags            */
+       uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
+       uint32_t       i_dir_start_lookup; /* which page to start lookup      */
+       wait_queue_head_t i_wq;            /* wait queue for inode            */
+       uint64_t       i_commit_size;      /* the object's written length     */
+       uint8_t        i_cred[OSD_CAP_LEN];/* all-powerful credential         */
+       struct inode   vfs_inode;          /* normal in-memory inode          */
+};
+
+/*
+ * our inode flags
+ */
+#define OBJ_2BCREATED  0       /* object will be created soon*/
+#define OBJ_CREATED    1       /* object has been created on the osd*/
+
+static inline int obj_2bcreated(struct exofs_i_info *oi)
+{
+       return test_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline void set_obj_2bcreated(struct exofs_i_info *oi)
+{
+       set_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline int obj_created(struct exofs_i_info *oi)
+{
+       return test_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+static inline void set_obj_created(struct exofs_i_info *oi)
+{
+       set_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi);
+static inline int wait_obj_created(struct exofs_i_info *oi)
+{
+       if (likely(obj_created(oi)))
+               return 0;
+
+       return __exofs_wait_obj_created(oi);
+}
+
+/*
+ * get to our inode from the vfs inode
+ */
+static inline struct exofs_i_info *exofs_i(struct inode *inode)
+{
+       return container_of(inode, struct exofs_i_info, vfs_inode);
+}
+
+/*
+ * Maximum count of links to a file
+ */
+#define EXOFS_LINK_MAX           32000
+
+/*************************
+ * function declarations *
+ *************************/
+/* inode.c               */
+void exofs_truncate(struct inode *inode);
+int exofs_setattr(struct dentry *, struct iattr *);
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+               loff_t pos, unsigned len, unsigned flags,
+               struct page **pagep, void **fsdata);
+extern struct inode *exofs_iget(struct super_block *, unsigned long);
+struct inode *exofs_new_inode(struct inode *, int);
+extern int exofs_write_inode(struct inode *, int);
+extern void exofs_delete_inode(struct inode *);
+
+/* dir.c:                */
+int exofs_add_link(struct dentry *, struct inode *);
+ino_t exofs_inode_by_name(struct inode *, struct dentry *);
+int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
+int exofs_make_empty(struct inode *, struct inode *);
+struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
+                                        struct page **);
+int exofs_empty_dir(struct inode *);
+struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
+ino_t exofs_parent_ino(struct dentry *child);
+int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
+                   struct inode *);
+
+/*********************
+ * operation vectors *
+ *********************/
+/* dir.c:            */
+extern const struct file_operations exofs_dir_operations;
+
+/* file.c            */
+extern const struct inode_operations exofs_file_inode_operations;
+extern const struct file_operations exofs_file_operations;
+
+/* inode.c           */
+extern const struct address_space_operations exofs_aops;
+
+/* namei.c           */
+extern const struct inode_operations exofs_dir_inode_operations;
+extern const struct inode_operations exofs_special_inode_operations;
+
+/* symlink.c         */
+extern const struct inode_operations exofs_symlink_inode_operations;
+extern const struct inode_operations exofs_fast_symlink_inode_operations;
+
+#endif
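The OBJ_2BCREATED/OBJ_CREATED helpers above are consumed by the asynchronous object-creation path in inode.c (outside this excerpt). A hypothetical sketch of the intended sequencing; the wake_up() on i_wq is an assumption about how __exofs_wait_obj_created() gets released:

	set_obj_2bcreated(oi);		/* new inode: OSD object not created yet */
	/* ... queue the asynchronous OSD CREATE ... */

	/* in the create-completion path: */
	set_obj_created(oi);
	wake_up(&oi->i_wq);		/* assumed to release __exofs_wait_obj_created() */

	/* any I/O path waits first: */
	wait_obj_created(oi);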
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644 (file)
index 0000000..6ed7fe4
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/buffer_head.h>
+
+#include "exofs.h"
+
+static int exofs_release_file(struct inode *inode, struct file *filp)
+{
+       return 0;
+}
+
+static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
+                           int datasync)
+{
+       int ret;
+       struct address_space *mapping = filp->f_mapping;
+
+       ret = filemap_write_and_wait(mapping);
+       if (ret)
+               return ret;
+
+       /* Note: file_fsync below also calls sync_blockdev, which is a no-op
+        *       for exofs, but other than that it does sync_inode and
+        *       sync_superblock, which is what we need here.
+        */
+       return file_fsync(filp, dentry, datasync);
+}
+
+static int exofs_flush(struct file *file, fl_owner_t id)
+{
+       exofs_file_fsync(file, file->f_path.dentry, 1);
+       /* TODO: Flush the OSD target */
+       return 0;
+}
+
+const struct file_operations exofs_file_operations = {
+       .llseek         = generic_file_llseek,
+       .read           = do_sync_read,
+       .write          = do_sync_write,
+       .aio_read       = generic_file_aio_read,
+       .aio_write      = generic_file_aio_write,
+       .mmap           = generic_file_mmap,
+       .open           = generic_file_open,
+       .release        = exofs_release_file,
+       .fsync          = exofs_file_fsync,
+       .flush          = exofs_flush,
+       .splice_read    = generic_file_splice_read,
+       .splice_write   = generic_file_splice_write,
+};
+
+const struct inode_operations exofs_file_inode_operations = {
+       .truncate       = exofs_truncate,
+       .setattr        = exofs_setattr,
+};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644 (file)
index 0000000..ba8d9fa
--- /dev/null
@@ -0,0 +1,1303 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+#include <scsi/scsi_device.h>
+
+#include "exofs.h"
+
+#ifdef CONFIG_EXOFS_DEBUG
+#  define EXOFS_DEBUG_OBJ_ISIZE 1
+#endif
+
+struct page_collect {
+       struct exofs_sb_info *sbi;
+       struct request_queue *req_q;
+       struct inode *inode;
+       unsigned expected_pages;
+
+       struct bio *bio;
+       unsigned nr_pages;
+       unsigned long length;
+       loff_t pg_first; /* keep this 64 bit, also on 32-bit arches */
+};
+
+static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
+               struct inode *inode)
+{
+       struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+       struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
+
+       pcol->sbi = sbi;
+       pcol->req_q = req_q;
+       pcol->inode = inode;
+       pcol->expected_pages = expected_pages;
+
+       pcol->bio = NULL;
+       pcol->nr_pages = 0;
+       pcol->length = 0;
+       pcol->pg_first = -1;
+
+       EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
+                    expected_pages);
+}
+
+static void _pcol_reset(struct page_collect *pcol)
+{
+       pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
+
+       pcol->bio = NULL;
+       pcol->nr_pages = 0;
+       pcol->length = 0;
+       pcol->pg_first = -1;
+       EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
+                    pcol->inode->i_ino, pcol->expected_pages);
+
+       /* This is usually the end of the loop, but for writes it may not end
+        * here; don't be left expecting zero pages.
+        */
+       if (!pcol->expected_pages)
+               pcol->expected_pages = 128;
+}
+
+static int pcol_try_alloc(struct page_collect *pcol)
+{
+       int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+
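+       /* If a bio with the full page count cannot be allocated, retry with
+        * progressively fewer pages until an allocation succeeds.
+        */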
+       for (; pages; pages >>= 1) {
+               pcol->bio = bio_alloc(GFP_KERNEL, pages);
+               if (likely(pcol->bio))
+                       return 0;
+       }
+
+       EXOFS_ERR("Failed to bio_alloc expected_pages=%u\n",
+                 pcol->expected_pages);
+       return -ENOMEM;
+}
+
+static void pcol_free(struct page_collect *pcol)
+{
+       bio_put(pcol->bio);
+       pcol->bio = NULL;
+}
+
+static int pcol_add_page(struct page_collect *pcol, struct page *page,
+                        unsigned len)
+{
+       int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
+       if (unlikely(len != added_len))
+               return -ENOMEM;
+
+       ++pcol->nr_pages;
+       pcol->length += len;
+       return 0;
+}
+
+static int update_read_page(struct page *page, int ret)
+{
+       if (ret == 0) {
+               /* Everything is OK */
+               SetPageUptodate(page);
+               if (PageError(page))
+                       ClearPageError(page);
+       } else if (ret == -EFAULT) {
+               /* In this case we were trying to read something that wasn't on
+                * disk yet - return a page full of zeroes.  This should be OK,
+                * because the object should be empty (if there was a write
+                * before this read, the read would be waiting with the page
+                * locked). */
+               clear_highpage(page);
+
+               SetPageUptodate(page);
+               if (PageError(page))
+                       ClearPageError(page);
+               ret = 0; /* recovered error */
+               EXOFS_DBGMSG("recovered read error\n");
+       } else /* Error */
+               SetPageError(page);
+
+       return ret;
+}
+
+static void update_write_page(struct page *page, int ret)
+{
+       if (ret) {
+               mapping_set_error(page->mapping, ret);
+               SetPageError(page);
+       }
+       end_page_writeback(page);
+}
+
+/* Called at the end of reads, to optionally unlock pages and update their
+ * status.
+ */
+static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
+                           bool do_unlock)
+{
+       struct bio_vec *bvec;
+       int i;
+       u64 resid;
+       u64 good_bytes;
+       u64 length = 0;
+       int ret = exofs_check_ok_resid(or, &resid, NULL);
+
+       osd_end_request(or);
+
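+       /* good_bytes: everything on success, nothing on a hard error, and only
+        * the transferred part (length minus the residual) on a short read.
+        */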
+       if (likely(!ret))
+               good_bytes = pcol->length;
+       else if (!resid)
+               good_bytes = 0;
+       else
+               good_bytes = pcol->length - resid;
+
+       EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
+                    " length=0x%lx nr_pages=%u\n",
+                    pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+                    pcol->nr_pages);
+
+       __bio_for_each_segment(bvec, pcol->bio, i, 0) {
+               struct page *page = bvec->bv_page;
+               struct inode *inode = page->mapping->host;
+               int page_stat;
+
+               if (inode != pcol->inode)
+                       continue; /* osd might add more pages at end */
+
+               if (likely(length < good_bytes))
+                       page_stat = 0;
+               else
+                       page_stat = ret;
+
+               EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+                         inode->i_ino, page->index,
+                         page_stat ? "bad_bytes" : "good_bytes");
+
+               ret = update_read_page(page, page_stat);
+               if (do_unlock)
+                       unlock_page(page);
+               length += bvec->bv_len;
+       }
+
+       pcol_free(pcol);
+       EXOFS_DBGMSG("readpages_done END\n");
+       return ret;
+}
+
+/* callback of async reads */
+static void readpages_done(struct osd_request *or, void *p)
+{
+       struct page_collect *pcol = p;
+
+       __readpages_done(or, pcol, true);
+       atomic_dec(&pcol->sbi->s_curr_pending);
+       kfree(p);
+}
+
+static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
+{
+       struct bio_vec *bvec;
+       int i;
+
+       __bio_for_each_segment(bvec, pcol->bio, i, 0) {
+               struct page *page = bvec->bv_page;
+
+               if (rw == READ)
+                       update_read_page(page, ret);
+               else
+                       update_write_page(page, ret);
+
+               unlock_page(page);
+       }
+       pcol_free(pcol);
+}
+
+static int read_exec(struct page_collect *pcol, bool is_sync)
+{
+       struct exofs_i_info *oi = exofs_i(pcol->inode);
+       struct osd_obj_id obj = {pcol->sbi->s_pid,
+                                       pcol->inode->i_ino + EXOFS_OBJ_OFF};
+       struct osd_request *or = NULL;
+       struct page_collect *pcol_copy = NULL;
+       loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+       int ret;
+
+       if (!pcol->bio)
+               return 0;
+
+       /* see comment in _readpage() about sync reads */
+       WARN_ON(is_sync && (pcol->nr_pages != 1));
+
+       or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       osd_req_read(or, &obj, pcol->bio, i_start);
+
+       if (is_sync) {
+               exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
+               return __readpages_done(or, pcol, false);
+       }
+
+       pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+       if (!pcol_copy) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       *pcol_copy = *pcol;
+       ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+       if (unlikely(ret))
+               goto err;
+
+       atomic_inc(&pcol->sbi->s_curr_pending);
+
+       EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+                 obj.id, _LLU(i_start), pcol->length);
+
+       /* pages ownership was passed to pcol_copy */
+       _pcol_reset(pcol);
+       return 0;
+
+err:
+       if (!is_sync)
+               _unlock_pcol_pages(pcol, ret, READ);
+       kfree(pcol_copy);
+       if (or)
+               osd_end_request(or);
+       return ret;
+}
+
+/* readpage_strip is called either directly from readpage() or by the VFS from
+ * within read_cache_pages(), to add one more page to be read. It will try to
+ * collect as many contiguous pages as possible. If a discontinuity is
+ * encountered, or it runs out of resources, it will submit the previous
+ * segment and start a new collection. Eventually the caller must submit the
+ * last segment, if present.
+ */
+static int readpage_strip(void *data, struct page *page)
+{
+       struct page_collect *pcol = data;
+       struct inode *inode = pcol->inode;
+       struct exofs_i_info *oi = exofs_i(inode);
+       loff_t i_size = i_size_read(inode);
+       pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+       size_t len;
+       int ret;
+
+       /* FIXME: Just for debugging, will be removed */
+       if (PageUptodate(page))
+               EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
+                         page->index);
+
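+       /* Pages entirely below i_size are read in full, the page holding EOF
+        * is read up to EOF, and pages beyond EOF get a zero length.
+        */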
+       if (page->index < end_index)
+               len = PAGE_CACHE_SIZE;
+       else if (page->index == end_index)
+               len = i_size & ~PAGE_CACHE_MASK;
+       else
+               len = 0;
+
+       if (!len || !obj_created(oi)) {
+               /* this will be out of bounds, or doesn't exist yet.
+                * Current page is cleared and the request is split
+                */
+               clear_highpage(page);
+
+               SetPageUptodate(page);
+               if (PageError(page))
+                       ClearPageError(page);
+
+               unlock_page(page);
+               EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
+                            " splitting\n", inode->i_ino, page->index);
+
+               return read_exec(pcol, false);
+       }
+
+try_again:
+
+       if (unlikely(pcol->pg_first == -1)) {
+               pcol->pg_first = page->index;
+       } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+                  page->index)) {
+               /* Discontinuity detected, split the request */
+               ret = read_exec(pcol, false);
+               if (unlikely(ret))
+                       goto fail;
+               goto try_again;
+       }
+
+       if (!pcol->bio) {
+               ret = pcol_try_alloc(pcol);
+               if (unlikely(ret))
+                       goto fail;
+       }
+
+       if (len != PAGE_CACHE_SIZE)
+               zero_user(page, len, PAGE_CACHE_SIZE - len);
+
+       EXOFS_DBGMSG("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+                    inode->i_ino, page->index, len);
+
+       ret = pcol_add_page(pcol, page, len);
+       if (ret) {
+               EXOFS_DBGMSG("Failed pcol_add_page page=%p "
+                         "this_len=0x%zx nr_pages=%u length=0x%lx\n",
+                         page, len, pcol->nr_pages, pcol->length);
+
+               /* split the request, and start again with current page */
+               ret = read_exec(pcol, false);
+               if (unlikely(ret))
+                       goto fail;
+
+               goto try_again;
+       }
+
+       return 0;
+
+fail:
+       /* SetPageError(page); ??? */
+       unlock_page(page);
+       return ret;
+}
+
+static int exofs_readpages(struct file *file, struct address_space *mapping,
+                          struct list_head *pages, unsigned nr_pages)
+{
+       struct page_collect pcol;
+       int ret;
+
+       _pcol_init(&pcol, nr_pages, mapping->host);
+
+       ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
+       if (ret) {
+               EXOFS_ERR("read_cache_pages => %d\n", ret);
+               return ret;
+       }
+
+       return read_exec(&pcol, false);
+}
+
+static int _readpage(struct page *page, bool is_sync)
+{
+       struct page_collect pcol;
+       int ret;
+
+       _pcol_init(&pcol, 1, page->mapping->host);
+
+       /* readpage_strip might call read_exec(pcol, false) at several places,
+        * but that is safe here: with a single page there is never a previous
+        * segment to submit early, so the final read_exec() below does the I/O.
+        */
+       ret = readpage_strip(&pcol, page);
+       if (ret) {
+               EXOFS_ERR("_readpage => %d\n", ret);
+               return ret;
+       }
+
+       return read_exec(&pcol, is_sync);
+}
+
+/*
+ * We don't need the file
+ */
+static int exofs_readpage(struct file *file, struct page *page)
+{
+       return _readpage(page, false);
+}
+
+/* Callback for osd_write. All writes are asynchronous */
+static void writepages_done(struct osd_request *or, void *p)
+{
+       struct page_collect *pcol = p;
+       struct bio_vec *bvec;
+       int i;
+       u64 resid;
+       u64  good_bytes;
+       u64  length = 0;
+
+       int ret = exofs_check_ok_resid(or, NULL, &resid);
+
+       osd_end_request(or);
+       atomic_dec(&pcol->sbi->s_curr_pending);
+
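+       /* Same good_bytes accounting as in __readpages_done(): all on success,
+        * none on a hard error, length minus the residual on a short write.
+        */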
+       if (likely(!ret))
+               good_bytes = pcol->length;
+       else if (!resid)
+               good_bytes = 0;
+       else
+               good_bytes = pcol->length - resid;
+
+       EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+                    " length=0x%lx nr_pages=%u\n",
+                    pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+                    pcol->nr_pages);
+
+       __bio_for_each_segment(bvec, pcol->bio, i, 0) {
+               struct page *page = bvec->bv_page;
+               struct inode *inode = page->mapping->host;
+               int page_stat;
+
+               if (inode != pcol->inode)
+                       continue; /* osd might add more pages to a bio */
+
+               if (likely(length < good_bytes))
+                       page_stat = 0;
+               else
+                       page_stat = ret;
+
+               update_write_page(page, page_stat);
+               unlock_page(page);
+               EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+                            inode->i_ino, page->index, page_stat);
+
+               length += bvec->bv_len;
+       }
+
+       pcol_free(pcol);
+       kfree(pcol);
+       EXOFS_DBGMSG("writepages_done END\n");
+}
+
+static int write_exec(struct page_collect *pcol)
+{
+       struct exofs_i_info *oi = exofs_i(pcol->inode);
+       struct osd_obj_id obj = {pcol->sbi->s_pid,
+                                       pcol->inode->i_ino + EXOFS_OBJ_OFF};
+       struct osd_request *or = NULL;
+       struct page_collect *pcol_copy = NULL;
+       loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+       int ret;
+
+       if (!pcol->bio)
+               return 0;
+
+       or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               EXOFS_ERR("write_exec: Failed to osd_start_request()\n");
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+       if (!pcol_copy) {
+               EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       *pcol_copy = *pcol;
+
+       osd_req_write(or, &obj, pcol_copy->bio, i_start);
+       ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+       if (unlikely(ret)) {
+               EXOFS_ERR("write_exec: exofs_async_op() Failed\n");
+               goto err;
+       }
+
+       atomic_inc(&pcol->sbi->s_curr_pending);
+       EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+                 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+                 pcol->length);
+       /* pages ownership was passed to pcol_copy */
+       _pcol_reset(pcol);
+       return 0;
+
+err:
+       _unlock_pcol_pages(pcol, ret, WRITE);
+       kfree(pcol_copy);
+       if (or)
+               osd_end_request(or);
+       return ret;
+}
+
+/* writepage_strip is called either directly from writepage() or by the VFS from
+ * within write_cache_pages(), to add one more page to be written to storage.
+ * It will try to collect as many contiguous pages as possible. If a
+ * discontinuity is encountered or it runs out of resources it will submit the
+ * previous segment and will start a new collection.
+ * Eventually the caller must submit the last segment, if present.
+ */
+static int writepage_strip(struct page *page,
+                          struct writeback_control *wbc_unused, void *data)
+{
+       struct page_collect *pcol = data;
+       struct inode *inode = pcol->inode;
+       struct exofs_i_info *oi = exofs_i(inode);
+       loff_t i_size = i_size_read(inode);
+       pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+       size_t len;
+       int ret;
+
+       BUG_ON(!PageLocked(page));
+
+       ret = wait_obj_created(oi);
+       if (unlikely(ret))
+               goto fail;
+
+       if (page->index < end_index)
+               /* in this case, the page is within the limits of the file */
+               len = PAGE_CACHE_SIZE;
+       else {
+               len = i_size & ~PAGE_CACHE_MASK;
+
+               if (page->index > end_index || !len) {
+                       /* in this case, the page is outside the limits
+                        * (truncate in progress)
+                        */
+                       ret = write_exec(pcol);
+                       if (unlikely(ret))
+                               goto fail;
+                       if (PageError(page))
+                               ClearPageError(page);
+                       unlock_page(page);
+                       return 0;
+               }
+       }
+
+try_again:
+
+       if (unlikely(pcol->pg_first == -1)) {
+               pcol->pg_first = page->index;
+       } else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+                  page->index)) {
+               /* Discontinuity detected, split the request */
+               ret = write_exec(pcol);
+               if (unlikely(ret))
+                       goto fail;
+               goto try_again;
+       }
+
+       if (!pcol->bio) {
+               ret = pcol_try_alloc(pcol);
+               if (unlikely(ret))
+                       goto fail;
+       }
+
+       EXOFS_DBGMSG("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+                    inode->i_ino, page->index, len);
+
+       ret = pcol_add_page(pcol, page, len);
+       if (unlikely(ret)) {
+               EXOFS_DBGMSG("Failed pcol_add_page "
+                            "nr_pages=%u total_length=0x%lx\n",
+                            pcol->nr_pages, pcol->length);
+
+               /* split the request, next loop will start again */
+               ret = write_exec(pcol);
+               if (unlikely(ret)) {
+                       EXOFS_DBGMSG("write_exec failed => %d\n", ret);
+                       goto fail;
+               }
+
+               goto try_again;
+       }
+
+       BUG_ON(PageWriteback(page));
+       set_page_writeback(page);
+
+       return 0;
+
+fail:
+       set_bit(AS_EIO, &page->mapping->flags);
+       unlock_page(page);
+       return ret;
+}
+
+static int exofs_writepages(struct address_space *mapping,
+                      struct writeback_control *wbc)
+{
+       struct page_collect pcol;
+       long start, end, expected_pages;
+       int ret;
+
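+       /* Estimate how many pages this writeback pass will cover so that the
+        * first bio allocated in pcol_try_alloc() is sized sensibly.
+        */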
+       start = wbc->range_start >> PAGE_CACHE_SHIFT;
+       end = (wbc->range_end == LLONG_MAX) ?
+                       start + mapping->nrpages :
+                       wbc->range_end >> PAGE_CACHE_SHIFT;
+
+       if (start || end)
+               expected_pages = min(end - start + 1, 32L);
+       else
+               expected_pages = mapping->nrpages;
+
+       EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
+                    " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+                    mapping->host->i_ino, wbc->range_start, wbc->range_end,
+                    mapping->nrpages, start, end);
+
+       _pcol_init(&pcol, expected_pages, mapping->host);
+
+       ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
+       if (ret) {
+               EXOFS_ERR("write_cache_pages => %d\n", ret);
+               return ret;
+       }
+
+       return write_exec(&pcol);
+}
+
+static int exofs_writepage(struct page *page, struct writeback_control *wbc)
+{
+       struct page_collect pcol;
+       int ret;
+
+       _pcol_init(&pcol, 1, page->mapping->host);
+
+       ret = writepage_strip(page, NULL, &pcol);
+       if (ret) {
+               EXOFS_ERR("exofs_writepage => %d\n", ret);
+               return ret;
+       }
+
+       return write_exec(&pcol);
+}
+
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+               loff_t pos, unsigned len, unsigned flags,
+               struct page **pagep, void **fsdata)
+{
+       int ret = 0;
+       struct page *page;
+
+       page = *pagep;
+       if (page == NULL) {
+               ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
+                                        fsdata);
+               if (ret) {
+                       EXOFS_DBGMSG("simple_write_begin failed\n");
+                       return ret;
+               }
+
+               page = *pagep;
+       }
+
+        /* read modify write */
+       if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+               ret = _readpage(page, true);
+               if (ret) {
+                       /* SetPageError was done by _readpage. Is it ok? */
+                       unlock_page(page);
+                       EXOFS_DBGMSG("_readpage failed\n");
+               }
+       }
+
+       return ret;
+}
+
+static int exofs_write_begin_export(struct file *file,
+               struct address_space *mapping,
+               loff_t pos, unsigned len, unsigned flags,
+               struct page **pagep, void **fsdata)
+{
+       *pagep = NULL;
+
+       return exofs_write_begin(file, mapping, pos, len, flags, pagep,
+                                       fsdata);
+}
+
+const struct address_space_operations exofs_aops = {
+       .readpage       = exofs_readpage,
+       .readpages      = exofs_readpages,
+       .writepage      = exofs_writepage,
+       .writepages     = exofs_writepages,
+       .write_begin    = exofs_write_begin_export,
+       .write_end      = simple_write_end,
+};
+
+/******************************************************************************
+ * INODE OPERATIONS
+ *****************************************************************************/
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int exofs_inode_is_fast_symlink(struct inode *inode)
+{
+       struct exofs_i_info *oi = exofs_i(inode);
+
+       return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
+}
+
+/*
+ * get_block_t - Fill in a buffer_head
+ * The OSD takes care of block allocation, so we just fake an allocation by
+ * mapping the requested block number straight into the buffer_head.
+ * TODO: What about the case of create==0 and @iblock does not exist in the
+ * object?
+ */
+static int exofs_get_block(struct inode *inode, sector_t iblock,
+                   struct buffer_head *bh_result, int create)
+{
+       map_bh(bh_result, inode->i_sb, iblock);
+       return 0;
+}
+
+const struct osd_attr g_attr_logical_length = ATTR_DEF(
+       OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+/*
+ * Truncate a file to the specified size - all we have to do is set the size
+ * attribute.  We make sure the object exists first.
+ */
+void exofs_truncate(struct inode *inode)
+{
+       struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+       struct exofs_i_info *oi = exofs_i(inode);
+       struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+       struct osd_request *or;
+       struct osd_attr attr;
+       loff_t isize = i_size_read(inode);
+       __be64 newsize;
+       int ret;
+
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+            || S_ISLNK(inode->i_mode)))
+               return;
+       if (exofs_inode_is_fast_symlink(inode))
+               return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+               return;
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+       nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+
+       or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
+               goto fail;
+       }
+
+       osd_req_set_attributes(or, &obj);
+
+       newsize = cpu_to_be64((u64)isize);
+       attr = g_attr_logical_length;
+       attr.val_ptr = &newsize;
+       osd_req_add_set_attr_list(or, &attr, 1);
+
+       /* if we are about to truncate an object, and it hasn't been
+        * created yet, wait
+        */
+       if (unlikely(wait_obj_created(oi)))
+               goto fail;
+
+       ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+       osd_end_request(or);
+       if (ret)
+               goto fail;
+
+out:
+       mark_inode_dirty(inode);
+       return;
+fail:
+       make_bad_inode(inode);
+       goto out;
+}
+
+/*
+ * Set inode attributes - just call generic functions.
+ */
+int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+       struct inode *inode = dentry->d_inode;
+       int error;
+
+       error = inode_change_ok(inode, iattr);
+       if (error)
+               return error;
+
+       error = inode_setattr(inode, iattr);
+       return error;
+}
+
+/*
+ * Read an inode from the OSD, and return it as is.  We also return the size
+ * attribute in the 'sanity' argument if we got compiled with debugging turned
+ * on.
+ */
+static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
+                   struct exofs_fcb *inode, uint64_t *sanity)
+{
+       struct exofs_sb_info *sbi = sb->s_fs_info;
+       struct osd_request *or;
+       struct osd_attr attr;
+       struct osd_obj_id obj = {sbi->s_pid,
+                                oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
+       int ret;
+
+       exofs_make_credential(oi->i_cred, &obj);
+
+       or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
+               return -ENOMEM;
+       }
+       osd_req_get_attributes(or, &obj);
+
+       /* we need the inode attribute */
+       osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+       /* we get the size attributes to do a sanity check */
+       osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+#endif
+
+       ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+       if (ret)
+               goto out;
+
+       attr = g_attr_inode_data;
+       ret = extract_attr_from_req(or, &attr);
+       if (ret) {
+               EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
+               goto out;
+       }
+
+       WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
+       memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+       attr = g_attr_logical_length;
+       ret = extract_attr_from_req(or, &attr);
+       if (ret) {
+               EXOFS_ERR("ERROR: extract attr from or failed\n");
+               goto out;
+       }
+       *sanity = get_unaligned_be64(attr.val_ptr);
+#endif
+
+out:
+       osd_end_request(or);
+       return ret;
+}
+
+/*
+ * Fill in an inode read from the OSD and set it up for use
+ */
+struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
+{
+       struct exofs_i_info *oi;
+       struct exofs_fcb fcb;
+       struct inode *inode;
+       uint64_t uninitialized_var(sanity);
+       int ret;
+
+       inode = iget_locked(sb, ino);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       if (!(inode->i_state & I_NEW))
+               return inode;
+       oi = exofs_i(inode);
+
+       /* read the inode from the osd */
+       ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+       if (ret)
+               goto bad_inode;
+
+       init_waitqueue_head(&oi->i_wq);
+       set_obj_created(oi);
+
+       /* copy stuff from on-disk struct to in-memory struct */
+       inode->i_mode = le16_to_cpu(fcb.i_mode);
+       inode->i_uid = le32_to_cpu(fcb.i_uid);
+       inode->i_gid = le32_to_cpu(fcb.i_gid);
+       inode->i_nlink = le16_to_cpu(fcb.i_links_count);
+       inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
+       inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
+       inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
+       inode->i_ctime.tv_nsec =
+               inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+       oi->i_commit_size = le64_to_cpu(fcb.i_size);
+       i_size_write(inode, oi->i_commit_size);
+       inode->i_blkbits = EXOFS_BLKSHIFT;
+       inode->i_generation = le32_to_cpu(fcb.i_generation);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+       if ((inode->i_size != sanity) &&
+               (!exofs_inode_is_fast_symlink(inode))) {
+               EXOFS_ERR("WARNING: Size of object from inode and "
+                         "attributes differ (%lld != %llu)\n",
+                         inode->i_size, _LLU(sanity));
+       }
+#endif
+
+       oi->i_dir_start_lookup = 0;
+
+       if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
+               ret = -ESTALE;
+               goto bad_inode;
+       }
+
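+       /* An old-style (16-bit) device number is kept in i_data[0], a
+        * new-style one in i_data[1]; other inodes carry their i_data as-is
+        * (for example fast-symlink targets).
+        */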
+       if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+               if (fcb.i_data[0])
+                       inode->i_rdev =
+                               old_decode_dev(le32_to_cpu(fcb.i_data[0]));
+               else
+                       inode->i_rdev =
+                               new_decode_dev(le32_to_cpu(fcb.i_data[1]));
+       } else {
+               memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
+       }
+
+       if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &exofs_file_inode_operations;
+               inode->i_fop = &exofs_file_operations;
+               inode->i_mapping->a_ops = &exofs_aops;
+       } else if (S_ISDIR(inode->i_mode)) {
+               inode->i_op = &exofs_dir_inode_operations;
+               inode->i_fop = &exofs_dir_operations;
+               inode->i_mapping->a_ops = &exofs_aops;
+       } else if (S_ISLNK(inode->i_mode)) {
+               if (exofs_inode_is_fast_symlink(inode))
+                       inode->i_op = &exofs_fast_symlink_inode_operations;
+               else {
+                       inode->i_op = &exofs_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &exofs_aops;
+               }
+       } else {
+               inode->i_op = &exofs_special_inode_operations;
+               if (fcb.i_data[0])
+                       init_special_inode(inode, inode->i_mode,
+                          old_decode_dev(le32_to_cpu(fcb.i_data[0])));
+               else
+                       init_special_inode(inode, inode->i_mode,
+                          new_decode_dev(le32_to_cpu(fcb.i_data[1])));
+       }
+
+       unlock_new_inode(inode);
+       return inode;
+
+bad_inode:
+       iget_failed(inode);
+       return ERR_PTR(ret);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi)
+{
+       if (!obj_created(oi)) {
+               BUG_ON(!obj_2bcreated(oi));
+               wait_event(oi->i_wq, obj_created(oi));
+       }
+       return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
+}
+/*
+ * Callback function from exofs_new_inode().  The important thing is that we
+ * set the obj_created flag so that other methods know that the object exists
+ * on the OSD.
+ */
+static void create_done(struct osd_request *or, void *p)
+{
+       struct inode *inode = p;
+       struct exofs_i_info *oi = exofs_i(inode);
+       struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+       int ret;
+
+       ret = exofs_check_ok(or);
+       osd_end_request(or);
+       atomic_dec(&sbi->s_curr_pending);
+
+       if (unlikely(ret)) {
+               EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx\n",
+                         _LLU(inode->i_ino + EXOFS_OBJ_OFF), _LLU(sbi->s_pid));
+               make_bad_inode(inode);
+       } else
+               set_obj_created(oi);
+
+       atomic_dec(&inode->i_count);
+       wake_up(&oi->i_wq);
+}
+
+/*
+ * Set up a new inode and create an object for it on the OSD
+ */
+struct inode *exofs_new_inode(struct inode *dir, int mode)
+{
+       struct super_block *sb;
+       struct inode *inode;
+       struct exofs_i_info *oi;
+       struct exofs_sb_info *sbi;
+       struct osd_request *or;
+       struct osd_obj_id obj;
+       int ret;
+
+       sb = dir->i_sb;
+       inode = new_inode(sb);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+
+       oi = exofs_i(inode);
+
+       init_waitqueue_head(&oi->i_wq);
+       set_obj_2bcreated(oi);
+
+       sbi = sb->s_fs_info;
+
+       sb->s_dirt = 1;
+       inode->i_uid = current->cred->fsuid;
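+       /* Inherit the group from a setgid parent directory, ext2-style; a new
+        * directory also inherits the setgid bit itself.
+        */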
+       if (dir->i_mode & S_ISGID) {
+               inode->i_gid = dir->i_gid;
+               if (S_ISDIR(mode))
+                       mode |= S_ISGID;
+       } else {
+               inode->i_gid = current->cred->fsgid;
+       }
+       inode->i_mode = mode;
+
+       inode->i_ino = sbi->s_nextid++;
+       inode->i_blkbits = EXOFS_BLKSHIFT;
+       inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+       oi->i_commit_size = inode->i_size = 0;
+       spin_lock(&sbi->s_next_gen_lock);
+       inode->i_generation = sbi->s_next_generation++;
+       spin_unlock(&sbi->s_next_gen_lock);
+       insert_inode_hash(inode);
+
+       mark_inode_dirty(inode);
+
+       obj.partition = sbi->s_pid;
+       obj.id = inode->i_ino + EXOFS_OBJ_OFF;
+       exofs_make_credential(oi->i_cred, &obj);
+
+       or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
+               return ERR_PTR(-ENOMEM);
+       }
+
+       osd_req_create_object(or, &obj);
+
+       /* increment the refcount so that the inode will still be around when we
+        * reach the callback
+        */
+       atomic_inc(&inode->i_count);
+
+       ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+       if (ret) {
+               atomic_dec(&inode->i_count);
+               osd_end_request(or);
+               return ERR_PTR(-EIO);
+       }
+       atomic_inc(&sbi->s_curr_pending);
+
+       return inode;
+}
+
+/*
+ * struct to pass two arguments to update_inode's callback
+ */
+struct updatei_args {
+       struct exofs_sb_info    *sbi;
+       struct exofs_fcb        fcb;
+};
+
+/*
+ * Callback function from exofs_update_inode().
+ */
+static void updatei_done(struct osd_request *or, void *p)
+{
+       struct updatei_args *args = p;
+
+       osd_end_request(or);
+
+       atomic_dec(&args->sbi->s_curr_pending);
+
+       kfree(args);
+}
+
+/*
+ * Write the inode to the OSD.  Just fill up the struct, and set the attribute
+ * synchronously or asynchronously depending on the do_sync flag.
+ */
+static int exofs_update_inode(struct inode *inode, int do_sync)
+{
+       struct exofs_i_info *oi = exofs_i(inode);
+       struct super_block *sb = inode->i_sb;
+       struct exofs_sb_info *sbi = sb->s_fs_info;
+       struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+       struct osd_request *or;
+       struct osd_attr attr;
+       struct exofs_fcb *fcb;
+       struct updatei_args *args;
+       int ret;
+
+       args = kzalloc(sizeof(*args), GFP_KERNEL);
+       if (!args)
+               return -ENOMEM;
+
+       fcb = &args->fcb;
+
+       fcb->i_mode = cpu_to_le16(inode->i_mode);
+       fcb->i_uid = cpu_to_le32(inode->i_uid);
+       fcb->i_gid = cpu_to_le32(inode->i_gid);
+       fcb->i_links_count = cpu_to_le16(inode->i_nlink);
+       fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+       fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+       fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+       oi->i_commit_size = i_size_read(inode);
+       fcb->i_size = cpu_to_le64(oi->i_commit_size);
+       fcb->i_generation = cpu_to_le32(inode->i_generation);
+
+       if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+               if (old_valid_dev(inode->i_rdev)) {
+                       fcb->i_data[0] =
+                               cpu_to_le32(old_encode_dev(inode->i_rdev));
+                       fcb->i_data[1] = 0;
+               } else {
+                       fcb->i_data[0] = 0;
+                       fcb->i_data[1] =
+                               cpu_to_le32(new_encode_dev(inode->i_rdev));
+                       fcb->i_data[2] = 0;
+               }
+       } else
+               memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
+
+       or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
+               ret = -ENOMEM;
+               goto free_args;
+       }
+
+       osd_req_set_attributes(or, &obj);
+
+       attr = g_attr_inode_data;
+       attr.val_ptr = fcb;
+       osd_req_add_set_attr_list(or, &attr, 1);
+
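+       /* If the object create is still in flight, wait for it to finish
+        * before trying to set attributes on the object.
+        */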
+       if (!obj_created(oi)) {
+               EXOFS_DBGMSG("!obj_created\n");
+               BUG_ON(!obj_2bcreated(oi));
+               wait_event(oi->i_wq, obj_created(oi));
+               EXOFS_DBGMSG("wait_event done\n");
+       }
+
+       if (do_sync) {
+               ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+               osd_end_request(or);
+               goto free_args;
+       } else {
+               args->sbi = sbi;
+
+               ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
+               if (ret) {
+                       osd_end_request(or);
+                       goto free_args;
+               }
+               atomic_inc(&sbi->s_curr_pending);
+               goto out; /* deallocation in updatei_done */
+       }
+
+free_args:
+       kfree(args);
+out:
+       EXOFS_DBGMSG("ret=>%d\n", ret);
+       return ret;
+}
+
+int exofs_write_inode(struct inode *inode, int wait)
+{
+       return exofs_update_inode(inode, wait);
+}
+
+/*
+ * Callback function from exofs_delete_inode() - don't have much cleaning up to
+ * do.
+ */
+static void delete_done(struct osd_request *or, void *p)
+{
+       struct exofs_sb_info *sbi;
+       osd_end_request(or);
+       sbi = p;
+       atomic_dec(&sbi->s_curr_pending);
+}
+
+/*
+ * Called when the refcount of an inode reaches zero.  We remove the object
+ * from the OSD here.  We make sure the object was created before we try and
+ * delete it.
+ */
+void exofs_delete_inode(struct inode *inode)
+{
+       struct exofs_i_info *oi = exofs_i(inode);
+       struct super_block *sb = inode->i_sb;
+       struct exofs_sb_info *sbi = sb->s_fs_info;
+       struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+       struct osd_request *or;
+       int ret;
+
+       truncate_inode_pages(&inode->i_data, 0);
+
+       if (is_bad_inode(inode))
+               goto no_delete;
+
+       mark_inode_dirty(inode);
+       exofs_update_inode(inode, inode_needs_sync(inode));
+
+       inode->i_size = 0;
+       if (inode->i_blocks)
+               exofs_truncate(inode);
+
+       clear_inode(inode);
+
+       or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+               return;
+       }
+
+       osd_req_remove_object(or, &obj);
+
+       /* if we are deleting an obj that hasn't been created yet, wait */
+       if (!obj_created(oi)) {
+               BUG_ON(!obj_2bcreated(oi));
+               wait_event(oi->i_wq, obj_created(oi));
+       }
+
+       ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+       if (ret) {
+               EXOFS_ERR(
+                      "ERROR: @exofs_delete_inode exofs_async_op failed\n");
+               osd_end_request(or);
+               return;
+       }
+       atomic_inc(&sbi->s_curr_pending);
+
+       return;
+
+no_delete:
+       clear_inode(inode);
+}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644 (file)
index 0000000..77fdd76
--- /dev/null
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+       int err = exofs_add_link(dentry, inode);
+       if (!err) {
+               d_instantiate(dentry, inode);
+               return 0;
+       }
+       inode_dec_link_count(inode);
+       iput(inode);
+       return err;
+}
+
+static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
+                                  struct nameidata *nd)
+{
+       struct inode *inode;
+       ino_t ino;
+
+       if (dentry->d_name.len > EXOFS_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       ino = exofs_inode_by_name(dir, dentry);
+       inode = NULL;
+       if (ino) {
+               inode = exofs_iget(dir->i_sb, ino);
+               if (IS_ERR(inode))
+                       return ERR_CAST(inode);
+       }
+       return d_splice_alias(inode, dentry);
+}
+
+static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
+                        struct nameidata *nd)
+{
+       struct inode *inode = exofs_new_inode(dir, mode);
+       int err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               inode->i_op = &exofs_file_inode_operations;
+               inode->i_fop = &exofs_file_operations;
+               inode->i_mapping->a_ops = &exofs_aops;
+               mark_inode_dirty(inode);
+               err = exofs_add_nondir(dentry, inode);
+       }
+       return err;
+}
+
+static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+                      dev_t rdev)
+{
+       struct inode *inode;
+       int err;
+
+       if (!new_valid_dev(rdev))
+               return -EINVAL;
+
+       inode = exofs_new_inode(dir, mode);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               init_special_inode(inode, inode->i_mode, rdev);
+               mark_inode_dirty(inode);
+               err = exofs_add_nondir(dentry, inode);
+       }
+       return err;
+}
+
+static int exofs_symlink(struct inode *dir, struct dentry *dentry,
+                         const char *symname)
+{
+       struct super_block *sb = dir->i_sb;
+       int err = -ENAMETOOLONG;
+       unsigned l = strlen(symname)+1;
+       struct inode *inode;
+       struct exofs_i_info *oi;
+
+       if (l > sb->s_blocksize)
+               goto out;
+
+       inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out;
+
+       oi = exofs_i(inode);
+       if (l > sizeof(oi->i_data)) {
+               /* slow symlink */
+               inode->i_op = &exofs_symlink_inode_operations;
+               inode->i_mapping->a_ops = &exofs_aops;
+               memset(oi->i_data, 0, sizeof(oi->i_data));
+
+               err = page_symlink(inode, symname, l);
+               if (err)
+                       goto out_fail;
+       } else {
+               /* fast symlink */
+               inode->i_op = &exofs_fast_symlink_inode_operations;
+               memcpy(oi->i_data, symname, l);
+               inode->i_size = l-1;
+       }
+       mark_inode_dirty(inode);
+
+       err = exofs_add_nondir(dentry, inode);
+out:
+       return err;
+
+out_fail:
+       inode_dec_link_count(inode);
+       iput(inode);
+       goto out;
+}
+
+static int exofs_link(struct dentry *old_dentry, struct inode *dir,
+               struct dentry *dentry)
+{
+       struct inode *inode = old_dentry->d_inode;
+
+       if (inode->i_nlink >= EXOFS_LINK_MAX)
+               return -EMLINK;
+
+       inode->i_ctime = CURRENT_TIME;
+       inode_inc_link_count(inode);
+       atomic_inc(&inode->i_count);
+
+       return exofs_add_nondir(dentry, inode);
+}
+
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+       struct inode *inode;
+       int err = -EMLINK;
+
+       if (dir->i_nlink >= EXOFS_LINK_MAX)
+               goto out;
+
+       inode_inc_link_count(dir);
+
+       inode = exofs_new_inode(dir, S_IFDIR | mode);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_dir;
+
+       inode->i_op = &exofs_dir_inode_operations;
+       inode->i_fop = &exofs_dir_operations;
+       inode->i_mapping->a_ops = &exofs_aops;
+
+       inode_inc_link_count(inode);
+
+       err = exofs_make_empty(inode, dir);
+       if (err)
+               goto out_fail;
+
+       err = exofs_add_link(dentry, inode);
+       if (err)
+               goto out_fail;
+
+       d_instantiate(dentry, inode);
+out:
+       return err;
+
+out_fail:
+       inode_dec_link_count(inode);
+       inode_dec_link_count(inode);
+       iput(inode);
+out_dir:
+       inode_dec_link_count(dir);
+       goto out;
+}
+
+static int exofs_unlink(struct inode *dir, struct dentry *dentry)
+{
+       struct inode *inode = dentry->d_inode;
+       struct exofs_dir_entry *de;
+       struct page *page;
+       int err = -ENOENT;
+
+       de = exofs_find_entry(dir, dentry, &page);
+       if (!de)
+               goto out;
+
+       err = exofs_delete_entry(de, page);
+       if (err)
+               goto out;
+
+       inode->i_ctime = dir->i_ctime;
+       inode_dec_link_count(inode);
+       err = 0;
+out:
+       return err;
+}
+
+static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+       struct inode *inode = dentry->d_inode;
+       int err = -ENOTEMPTY;
+
+       if (exofs_empty_dir(inode)) {
+               err = exofs_unlink(dir, dentry);
+               if (!err) {
+                       inode->i_size = 0;
+                       inode_dec_link_count(inode);
+                       inode_dec_link_count(dir);
+               }
+       }
+       return err;
+}
+
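+/* Classic ext2-style rename: if the target name already exists its entry is
+ * re-pointed at the source inode and the old target loses a link; otherwise a
+ * new link is added first. When a directory moves, its ".." entry is rewritten
+ * to point at the new parent directory.
+ */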
+static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
+               struct inode *new_dir, struct dentry *new_dentry)
+{
+       struct inode *old_inode = old_dentry->d_inode;
+       struct inode *new_inode = new_dentry->d_inode;
+       struct page *dir_page = NULL;
+       struct exofs_dir_entry *dir_de = NULL;
+       struct page *old_page;
+       struct exofs_dir_entry *old_de;
+       int err = -ENOENT;
+
+       old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
+       if (!old_de)
+               goto out;
+
+       if (S_ISDIR(old_inode->i_mode)) {
+               err = -EIO;
+               dir_de = exofs_dotdot(old_inode, &dir_page);
+               if (!dir_de)
+                       goto out_old;
+       }
+
+       if (new_inode) {
+               struct page *new_page;
+               struct exofs_dir_entry *new_de;
+
+               err = -ENOTEMPTY;
+               if (dir_de && !exofs_empty_dir(new_inode))
+                       goto out_dir;
+
+               err = -ENOENT;
+               new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
+               if (!new_de)
+                       goto out_dir;
+               inode_inc_link_count(old_inode);
+               err = exofs_set_link(new_dir, new_de, new_page, old_inode);
+               new_inode->i_ctime = CURRENT_TIME;
+               if (dir_de)
+                       drop_nlink(new_inode);
+               inode_dec_link_count(new_inode);
+               if (err)
+                       goto out_dir;
+       } else {
+               if (dir_de) {
+                       err = -EMLINK;
+                       if (new_dir->i_nlink >= EXOFS_LINK_MAX)
+                               goto out_dir;
+               }
+               inode_inc_link_count(old_inode);
+               err = exofs_add_link(new_dentry, old_inode);
+               if (err) {
+                       inode_dec_link_count(old_inode);
+                       goto out_dir;
+               }
+               if (dir_de)
+                       inode_inc_link_count(new_dir);
+       }
+
+       old_inode->i_ctime = CURRENT_TIME;
+
+       exofs_delete_entry(old_de, old_page);
+       inode_dec_link_count(old_inode);
+
+       if (dir_de) {
+               err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
+               inode_dec_link_count(old_dir);
+               if (err)
+                       goto out_dir;
+       }
+       return 0;
+
+
+out_dir:
+       if (dir_de) {
+               kunmap(dir_page);
+               page_cache_release(dir_page);
+       }
+out_old:
+       kunmap(old_page);
+       page_cache_release(old_page);
+out:
+       return err;
+}
+
+const struct inode_operations exofs_dir_inode_operations = {
+       .create         = exofs_create,
+       .lookup         = exofs_lookup,
+       .link           = exofs_link,
+       .unlink         = exofs_unlink,
+       .symlink        = exofs_symlink,
+       .mkdir          = exofs_mkdir,
+       .rmdir          = exofs_rmdir,
+       .mknod          = exofs_mknod,
+       .rename         = exofs_rename,
+       .setattr        = exofs_setattr,
+};
+
+const struct inode_operations exofs_special_inode_operations = {
+       .setattr        = exofs_setattr,
+};
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644 (file)
index 0000000..b249ae9
--- /dev/null
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <scsi/scsi_device.h>
+#include <scsi/osd_sense.h>
+
+#include "exofs.h"
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
+{
+       struct osd_sense_info osi;
+       int ret = osd_req_decode_sense(or, &osi);
+
+       if (ret) { /* translate to Linux codes */
+               if (osi.additional_code == scsi_invalid_field_in_cdb) {
+                       if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
+                               ret = -EFAULT;
+                       else if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
+                               ret = -ENOENT;
+                       else
+                               ret = -EINVAL;
+               } else if (osi.additional_code == osd_quota_error)
+                       ret = -ENOSPC;
+               else
+                       ret = -EIO;
+       }
+
+       /* FIXME: should be included in osd_sense_info */
+       if (in_resid)
+               *in_resid = or->in.req ? or->in.req->data_len : 0;
+
+       if (out_resid)
+               *out_resid = or->out.req ? or->out.req->data_len : 0;
+
+       return ret;
+}
+
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+       osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation.
+ */
+int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+       int ret;
+
+       or->timeout = timeout;
+       ret = osd_finalize_request(or, 0, credential, NULL);
+       if (ret) {
+               EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
+               return ret;
+       }
+
+       ret = osd_execute_request(or);
+
+       if (ret)
+               EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+       /* osd_req_decode_sense(or, ret); */
+       return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation.
+ */
+int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+                  void *caller_context, u8 *cred)
+{
+       int ret;
+
+       ret = osd_finalize_request(or, 0, cred, NULL);
+       if (ret) {
+               EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
+               return ret;
+       }
+
+       ret = osd_execute_request_async(or, async_done, caller_context);
+
+       if (ret)
+               EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
+       return ret;
+}
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+       struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+       void *iter = NULL;
+       int nelem;
+
+       do {
+               nelem = 1;
+               osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
+               if ((cur_attr.attr_page == attr->attr_page) &&
+                   (cur_attr.attr_id == attr->attr_id)) {
+                       attr->len = cur_attr.len;
+                       attr->val_ptr = cur_attr.val_ptr;
+                       return 0;
+               }
+       } while (iter);
+
+       return -EIO;
+}
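
A hedged sketch of how extract_attr_from_req() is meant to be used: queue a get-attributes request, run it synchronously, then pull a single 64-bit attribute out of the decoded reply. This mirrors what exofs_statfs() does in super.c further down; the helper name is illustrative, and the caller is assumed to have filled *attr with ATTR_DEF(page, id, sizeof(__be64)).

    static int example_get_attr64(struct exofs_sb_info *sbi,
                                  struct osd_obj_id *obj,
                                  struct osd_attr *attr, u64 *value)
    {
            struct osd_request *or;
            int ret;

            or = osd_start_request(sbi->s_dev, GFP_KERNEL);
            if (unlikely(!or))
                    return -ENOMEM;

            osd_req_get_attributes(or, obj);
            osd_req_add_get_attr_list(or, attr, 1);

            ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
            if (!ret)
                    ret = extract_attr_from_req(or, attr);
            if (!ret)
                    /* copy the value out: val_ptr only lives as long as the request */
                    *value = get_unaligned_be64(attr->val_ptr);

            osd_end_request(or);
            return ret;
    }
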
+
+int osd_req_read_kern(struct osd_request *or,
+       const struct osd_obj_id *obj, u64 offset, void *buff, u64 len)
+{
+       struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
+       struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
+
+       if (!bio)
+               return -ENOMEM;
+
+       osd_req_read(or, obj, bio, offset);
+       return 0;
+}
+
+int osd_req_write_kern(struct osd_request *or,
+       const struct osd_obj_id *obj, u64 offset, void *buff, u64 len)
+{
+       struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
+       struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
+
+       if (!bio)
+               return -ENOMEM;
+
+       osd_req_write(or, obj, bio, offset);
+       return 0;
+}
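
Putting the helpers in this file together, a synchronous read of an object into a kernel buffer looks roughly like the sketch below (error handling abbreviated; the sbi/object setup mirrors what exofs_fill_super() does in super.c later in this patch):

    static int example_read_object(struct exofs_sb_info *sbi,
                                   const struct osd_obj_id *obj,
                                   void *buf, u64 len)
    {
            struct osd_request *or;
            int ret;

            or = osd_start_request(sbi->s_dev, GFP_KERNEL);
            if (unlikely(!or))
                    return -ENOMEM;

            ret = osd_req_read_kern(or, obj, 0, buf, len);
            if (!ret)
                    ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);

            osd_end_request(or);
            return ret;
    }
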
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644 (file)
index 0000000..9f1985e
--- /dev/null
@@ -0,0 +1,584 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+
+#include "exofs.h"
+
+/******************************************************************************
+ * MOUNT OPTIONS
+ *****************************************************************************/
+
+/*
+ * struct to hold what we get from mount options
+ */
+struct exofs_mountopt {
+       const char *dev_name;
+       uint64_t pid;
+       int timeout;
+};
+
+/*
+ * exofs-specific mount-time options.
+ */
+enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
+
+/*
+ * Our mount-time options.  These should ideally be 64-bit unsigned, but the
+ * kernel's parsing functions do not currently support that.  32-bit should be
+ * sufficient for most applications now.
+ */
+static match_table_t tokens = {
+       {Opt_pid, "pid=%u"},
+       {Opt_to, "to=%u"},
+       {Opt_err, NULL}
+};
+
+/*
+ * The main option parsing method.  Also makes sure that all of the mandatory
+ * mount options were set.
+ */
+static int parse_options(char *options, struct exofs_mountopt *opts)
+{
+       char *p;
+       substring_t args[MAX_OPT_ARGS];
+       int option;
+       bool s_pid = false;
+
+       EXOFS_DBGMSG("parse_options %s\n", options);
+       /* defaults */
+       memset(opts, 0, sizeof(*opts));
+       opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
+
+       while ((p = strsep(&options, ",")) != NULL) {
+               int token;
+               char str[32];
+
+               if (!*p)
+                       continue;
+
+               token = match_token(p, tokens, args);
+               switch (token) {
+               case Opt_pid:
+                       if (0 == match_strlcpy(str, &args[0], sizeof(str)))
+                               return -EINVAL;
+                       opts->pid = simple_strtoull(str, NULL, 0);
+                       if (opts->pid < EXOFS_MIN_PID) {
+                               EXOFS_ERR("Partition ID must be >= %u",
+                                         EXOFS_MIN_PID);
+                               return -EINVAL;
+                       }
+                       s_pid = true;
+                       break;
+               case Opt_to:
+                       if (match_int(&args[0], &option))
+                               return -EINVAL;
+                       if (option <= 0) {
+                               EXOFS_ERR("Timeout must be > 0");
+                               return -EINVAL;
+                       }
+                       opts->timeout = option * HZ;
+                       break;
+               }
+       }
+
+       if (!s_pid) {
+               EXOFS_ERR("Need to specify the following options:\n");
+               EXOFS_ERR("    -o pid=pid_no_to_use\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
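
For illustration, the options string that mount(8) passes for "-o pid=65540,to=20" parses as in this sketch (the caller and the partition id are hypothetical; the id only has to be at least EXOFS_MIN_PID):

    static int example_parse(void)
    {
            struct exofs_mountopt opts;
            char options[] = "pid=65540,to=20";     /* writable: strsep() edits it */
            int ret = parse_options(options, &opts);

            /* on success, opts.pid == 65540 and opts.timeout == 20 * HZ */
            return ret;
    }
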
+
+/******************************************************************************
+ * INODE CACHE
+ *****************************************************************************/
+
+/*
+ * Our inode cache.  Isn't it pretty?
+ */
+static struct kmem_cache *exofs_inode_cachep;
+
+/*
+ * Allocate an inode in the cache
+ */
+static struct inode *exofs_alloc_inode(struct super_block *sb)
+{
+       struct exofs_i_info *oi;
+
+       oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
+       if (!oi)
+               return NULL;
+
+       oi->vfs_inode.i_version = 1;
+       return &oi->vfs_inode;
+}
+
+/*
+ * Remove an inode from the cache
+ */
+static void exofs_destroy_inode(struct inode *inode)
+{
+       kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
+
+/*
+ * Initialize the inode
+ */
+static void exofs_init_once(void *foo)
+{
+       struct exofs_i_info *oi = foo;
+
+       inode_init_once(&oi->vfs_inode);
+}
+
+/*
+ * Create and initialize the inode cache
+ */
+static int init_inodecache(void)
+{
+       exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
+                               sizeof(struct exofs_i_info), 0,
+                               SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                               exofs_init_once);
+       if (exofs_inode_cachep == NULL)
+               return -ENOMEM;
+       return 0;
+}
+
+/*
+ * Destroy the inode cache
+ */
+static void destroy_inodecache(void)
+{
+       kmem_cache_destroy(exofs_inode_cachep);
+}
+
+/******************************************************************************
+ * SUPERBLOCK FUNCTIONS
+ *****************************************************************************/
+static const struct super_operations exofs_sops;
+static const struct export_operations exofs_export_ops;
+
+/*
+ * Write the superblock to the OSD
+ */
+static void exofs_write_super(struct super_block *sb)
+{
+       struct exofs_sb_info *sbi;
+       struct exofs_fscb *fscb;
+       struct osd_request *or;
+       struct osd_obj_id obj;
+       int ret;
+
+       fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
+       if (!fscb) {
+               EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
+               return;
+       }
+
+       lock_kernel();
+       sbi = sb->s_fs_info;
+       fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
+       fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+       fscb->s_magic = cpu_to_le16(sb->s_magic);
+       fscb->s_newfs = 0;
+
+       or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
+               goto out;
+       }
+
+       obj.partition = sbi->s_pid;
+       obj.id = EXOFS_SUPER_ID;
+       ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
+       if (unlikely(ret)) {
+               EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
+               goto out;
+       }
+
+       ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+       if (unlikely(ret)) {
+               EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
+               goto out;
+       }
+       sb->s_dirt = 0;
+
+out:
+       if (or)
+               osd_end_request(or);
+       unlock_kernel();
+       kfree(fscb);
+}
+
+/*
+ * This function is called when the vfs is freeing the superblock.  We just
+ * need to free our own part.
+ */
+static void exofs_put_super(struct super_block *sb)
+{
+       int num_pend;
+       struct exofs_sb_info *sbi = sb->s_fs_info;
+
+       /* make sure there are no pending commands */
+       for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
+            num_pend = atomic_read(&sbi->s_curr_pending)) {
+               wait_queue_head_t wq;
+               init_waitqueue_head(&wq);
+               wait_event_timeout(wq,
+                                 (atomic_read(&sbi->s_curr_pending) == 0),
+                                 msecs_to_jiffies(100));
+       }
+
+       osduld_put_device(sbi->s_dev);
+       kfree(sb->s_fs_info);
+       sb->s_fs_info = NULL;
+}
+
+/*
+ * Read the superblock from the OSD and fill in the fields
+ */
+static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+{
+       struct inode *root;
+       struct exofs_mountopt *opts = data;
+       struct exofs_sb_info *sbi;      /*extended info                  */
+       struct exofs_fscb fscb;         /*on-disk superblock info        */
+       struct osd_request *or = NULL;
+       struct osd_obj_id obj;
+       int ret;
+
+       sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+       if (!sbi)
+               return -ENOMEM;
+       sb->s_fs_info = sbi;
+
+       /* use mount options to fill superblock */
+       sbi->s_dev = osduld_path_lookup(opts->dev_name);
+       if (IS_ERR(sbi->s_dev)) {
+               ret = PTR_ERR(sbi->s_dev);
+               sbi->s_dev = NULL;
+               goto free_sbi;
+       }
+
+       sbi->s_pid = opts->pid;
+       sbi->s_timeout = opts->timeout;
+
+       /* fill in some other data by hand */
+       memset(sb->s_id, 0, sizeof(sb->s_id));
+       strcpy(sb->s_id, "exofs");
+       sb->s_blocksize = EXOFS_BLKSIZE;
+       sb->s_blocksize_bits = EXOFS_BLKSHIFT;
+       sb->s_maxbytes = MAX_LFS_FILESIZE;
+       atomic_set(&sbi->s_curr_pending, 0);
+       sb->s_bdev = NULL;
+       sb->s_dev = 0;
+
+       /* read data from on-disk superblock object */
+       obj.partition = sbi->s_pid;
+       obj.id = EXOFS_SUPER_ID;
+       exofs_make_credential(sbi->s_cred, &obj);
+
+       or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               if (!silent)
+                       EXOFS_ERR(
+                              "exofs_fill_super: osd_start_request failed.\n");
+               ret = -ENOMEM;
+               goto free_sbi;
+       }
+       ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
+       if (unlikely(ret)) {
+               if (!silent)
+                       EXOFS_ERR(
+                              "exofs_fill_super: osd_req_read_kern failed.\n");
+               ret = -ENOMEM;
+               goto free_sbi;
+       }
+
+       ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+       if (unlikely(ret)) {
+               if (!silent)
+                       EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
+               ret = -EIO;
+               goto free_sbi;
+       }
+
+       sb->s_magic = le16_to_cpu(fscb.s_magic);
+       sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
+       sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
+
+       /* make sure what we read from the object store is correct */
+       if (sb->s_magic != EXOFS_SUPER_MAGIC) {
+               if (!silent)
+                       EXOFS_ERR("ERROR: Bad magic value\n");
+               ret = -EINVAL;
+               goto free_sbi;
+       }
+
+       /* start generation numbers from a random point */
+       get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+       spin_lock_init(&sbi->s_next_gen_lock);
+
+       /* set up operation vectors */
+       sb->s_op = &exofs_sops;
+       sb->s_export_op = &exofs_export_ops;
+       root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
+       if (IS_ERR(root)) {
+               EXOFS_ERR("ERROR: exofs_iget failed\n");
+               ret = PTR_ERR(root);
+               goto free_sbi;
+       }
+       sb->s_root = d_alloc_root(root);
+       if (!sb->s_root) {
+               iput(root);
+               EXOFS_ERR("ERROR: get root inode failed\n");
+               ret = -ENOMEM;
+               goto free_sbi;
+       }
+
+       if (!S_ISDIR(root->i_mode)) {
+               dput(sb->s_root);
+               sb->s_root = NULL;
+               EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
+                      root->i_mode);
+               ret = -EINVAL;
+               goto free_sbi;
+       }
+
+       ret = 0;
+out:
+       if (or)
+               osd_end_request(or);
+       return ret;
+
+free_sbi:
+       osduld_put_device(sbi->s_dev); /* NULL safe */
+       kfree(sbi);
+       goto out;
+}
+
+/*
+ * Set up the superblock (calls exofs_fill_super eventually)
+ */
+static int exofs_get_sb(struct file_system_type *type,
+                         int flags, const char *dev_name,
+                         void *data, struct vfsmount *mnt)
+{
+       struct exofs_mountopt opts;
+       int ret;
+
+       ret = parse_options(data, &opts);
+       if (ret)
+               return ret;
+
+       opts.dev_name = dev_name;
+       return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+}
+
+/*
+ * Return information about the file system state in the buffer.  This is used
+ * by the 'df' command, for example.
+ */
+static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+       struct super_block *sb = dentry->d_sb;
+       struct exofs_sb_info *sbi = sb->s_fs_info;
+       struct osd_obj_id obj = {sbi->s_pid, 0};
+       struct osd_attr attrs[] = {
+               ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
+                       OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
+               ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
+                       OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
+       };
+       uint64_t capacity = ULLONG_MAX;
+       uint64_t used = ULLONG_MAX;
+       struct osd_request *or;
+       uint8_t cred_a[OSD_CAP_LEN];
+       int ret;
+
+       /* get used/capacity attributes */
+       exofs_make_credential(cred_a, &obj);
+
+       or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+       if (unlikely(!or)) {
+               EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
+               return -ENOMEM;
+       }
+
+       osd_req_get_attributes(or, &obj);
+       osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
+       ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+       if (unlikely(ret))
+               goto out;
+
+       ret = extract_attr_from_req(or, &attrs[0]);
+       if (likely(!ret))
+               capacity = get_unaligned_be64(attrs[0].val_ptr);
+       else
+               EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
+
+       ret = extract_attr_from_req(or, &attrs[1]);
+       if (likely(!ret))
+               used = get_unaligned_be64(attrs[1].val_ptr);
+       else
+               EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
+
+       /* fill in the stats buffer */
+       buf->f_type = EXOFS_SUPER_MAGIC;
+       buf->f_bsize = EXOFS_BLKSIZE;
+       buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
+       buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+       buf->f_bavail = buf->f_bfree;
+       buf->f_files = sbi->s_numfiles;
+       buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
+       buf->f_namelen = EXOFS_NAME_LEN;
+
+out:
+       osd_end_request(or);
+       return ret;
+}
+
+static const struct super_operations exofs_sops = {
+       .alloc_inode    = exofs_alloc_inode,
+       .destroy_inode  = exofs_destroy_inode,
+       .write_inode    = exofs_write_inode,
+       .delete_inode   = exofs_delete_inode,
+       .put_super      = exofs_put_super,
+       .write_super    = exofs_write_super,
+       .statfs         = exofs_statfs,
+};
+
+/******************************************************************************
+ * EXPORT OPERATIONS
+ *****************************************************************************/
+
+struct dentry *exofs_get_parent(struct dentry *child)
+{
+       unsigned long ino = exofs_parent_ino(child);
+
+       if (!ino)
+               return NULL;
+
+       return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
+}
+
+static struct inode *exofs_nfs_get_inode(struct super_block *sb,
+               u64 ino, u32 generation)
+{
+       struct inode *inode;
+
+       inode = exofs_iget(sb, ino);
+       if (IS_ERR(inode))
+               return ERR_CAST(inode);
+       if (generation && inode->i_generation != generation) {
+               /* we didn't find the right inode.. */
+               iput(inode);
+               return ERR_PTR(-ESTALE);
+       }
+       return inode;
+}
+
+static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
+                               struct fid *fid, int fh_len, int fh_type)
+{
+       return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+                                   exofs_nfs_get_inode);
+}
+
+static struct dentry *exofs_fh_to_parent(struct super_block *sb,
+                               struct fid *fid, int fh_len, int fh_type)
+{
+       return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+                                   exofs_nfs_get_inode);
+}
+
+static const struct export_operations exofs_export_ops = {
+       .fh_to_dentry = exofs_fh_to_dentry,
+       .fh_to_parent = exofs_fh_to_parent,
+       .get_parent = exofs_get_parent,
+};
+
+/******************************************************************************
+ * INSMOD/RMMOD
+ *****************************************************************************/
+
+/*
+ * struct that describes this file system
+ */
+static struct file_system_type exofs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "exofs",
+       .get_sb         = exofs_get_sb,
+       .kill_sb        = generic_shutdown_super,
+};
+
+static int __init init_exofs(void)
+{
+       int err;
+
+       err = init_inodecache();
+       if (err)
+               goto out;
+
+       err = register_filesystem(&exofs_type);
+       if (err)
+               goto out_d;
+
+       return 0;
+out_d:
+       destroy_inodecache();
+out:
+       return err;
+}
+
+static void __exit exit_exofs(void)
+{
+       unregister_filesystem(&exofs_type);
+       destroy_inodecache();
+}
+
+MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
+MODULE_DESCRIPTION("exofs");
+MODULE_LICENSE("GPL");
+
+module_init(init_exofs)
+module_exit(exit_exofs)
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644 (file)
index 0000000..36e2d7b
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/namei.h>
+
+#include "exofs.h"
+
+static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+       struct exofs_i_info *oi = exofs_i(dentry->d_inode);
+
+       nd_set_link(nd, (char *)oi->i_data);
+       return NULL;
+}
+
+const struct inode_operations exofs_symlink_inode_operations = {
+       .readlink       = generic_readlink,
+       .follow_link    = page_follow_link_light,
+       .put_link       = page_put_link,
+};
+
+const struct inode_operations exofs_fast_symlink_inode_operations = {
+       .readlink       = generic_readlink,
+       .follow_link    = exofs_follow_link,
+};
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 521f8238b2fa1e967b47040355e5834fe3fc3e53..5b49704b231b27b53a8d52dc15a1c62118f6124f 100644 (file)
  */
 static int ext3_release_file (struct inode * inode, struct file * filp)
 {
+       if (EXT3_I(inode)->i_state & EXT3_STATE_FLUSH_ON_CLOSE) {
+               filemap_flush(inode->i_mapping);
+               EXT3_I(inode)->i_state &= ~EXT3_STATE_FLUSH_ON_CLOSE;
+       }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1))
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index d3ef6566b0190340b21f94402a811f5a45d4f38d..466a332e0bd124c871143cfd682392074252aa76 100644 (file)
@@ -2363,6 +2363,9 @@ void ext3_truncate(struct inode *inode)
        if (!ext3_can_truncate(inode))
                return;
 
+       if (inode->i_size == 0 && ext3_should_writeback_data(inode))
+               ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
+
        /*
         * We have to lock the EOF page here, because lock_page() nests
         * outside journal_start().
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ddaa0a42b24a1c7d3576ce1a5287d869f0c2f46..6ff7b9730234bd97f2a67a1d01b08692630549ba 100644 (file)
@@ -2274,7 +2274,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
        struct inode * old_inode, * new_inode;
        struct buffer_head * old_bh, * new_bh, * dir_bh;
        struct ext3_dir_entry_2 * old_de, * new_de;
-       int retval;
+       int retval, flush_file = 0;
 
        old_bh = new_bh = dir_bh = NULL;
 
@@ -2410,6 +2410,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
                ext3_mark_inode_dirty(handle, new_inode);
                if (!new_inode->i_nlink)
                        ext3_orphan_add(handle, new_inode);
+               if (ext3_should_writeback_data(new_inode))
+                       flush_file = 1;
        }
        retval = 0;
 
@@ -2418,6 +2420,8 @@ end_rename:
        brelse (old_bh);
        brelse (new_bh);
        ext3_journal_stop(handle);
+       if (retval == 0 && flush_file)
+               filemap_flush(old_inode->i_mapping);
        return retval;
 }
 
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
new file mode 100644 (file)
index 0000000..9bbb8ce
--- /dev/null
@@ -0,0 +1,56 @@
+
+config FSCACHE
+       tristate "General filesystem local caching manager"
+       depends on EXPERIMENTAL
+       select SLOW_WORK
+       help
+         This option enables a generic filesystem caching manager that can be
+         used by various network and other filesystems to cache data locally.
+         Different sorts of caches can be plugged in, depending on the
+         resources available.
+
+         See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_STATS
+       bool "Gather statistical information on local caching"
+       depends on FSCACHE && PROC_FS
+       help
+         This option causes statistical information to be gathered on local
+         caching and exported through the file:
+
+               /proc/fs/fscache/stats
+
+         The gathering of statistics adds a certain amount of overhead to
+         execution as there are quite a few stats gathered, and on a
+         multi-CPU system these may be on cachelines that keep bouncing
+         between CPUs.  On the other hand, the stats are very useful for
+         debugging purposes.  Saying 'Y' here is recommended.
+
+         See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_HISTOGRAM
+       bool "Gather latency information on local caching"
+       depends on FSCACHE && PROC_FS
+       help
+         This option causes latency information to be gathered on local
+         caching and exported through the file:
+
+               /proc/fs/fscache/histogram
+
+         The generation of this histogram adds a certain amount of overhead to
+         execution as there are a number of points at which data is gathered,
+         and on a multi-CPU system these may be on cachelines that keep
+         bouncing between CPUs.  On the other hand, the histogram may be
+         useful for debugging purposes.  Saying 'N' here is recommended.
+
+         See Documentation/filesystems/caching/fscache.txt for more information.
+
+config FSCACHE_DEBUG
+       bool "Debug FS-Cache"
+       depends on FSCACHE
+       help
+         This permits debugging to be dynamically enabled in the local caching
+         management module.  If this is set, the debugging output may be
+         enabled by setting bits in /sys/module/fscache/parameters/debug.
+
+         See Documentation/filesystems/caching/fscache.txt for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
new file mode 100644 (file)
index 0000000..91571b9
--- /dev/null
@@ -0,0 +1,19 @@
+#
+# Makefile for general filesystem caching code
+#
+
+fscache-y := \
+       cache.o \
+       cookie.o \
+       fsdef.o \
+       main.o \
+       netfs.o \
+       object.o \
+       operation.o \
+       page.o
+
+fscache-$(CONFIG_PROC_FS) += proc.o
+fscache-$(CONFIG_FSCACHE_STATS) += stats.o
+fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
+
+obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
new file mode 100644 (file)
index 0000000..e21985b
--- /dev/null
@@ -0,0 +1,415 @@
+/* FS-Cache cache handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+LIST_HEAD(fscache_cache_list);
+DECLARE_RWSEM(fscache_addremove_sem);
+DECLARE_WAIT_QUEUE_HEAD(fscache_cache_cleared_wq);
+EXPORT_SYMBOL(fscache_cache_cleared_wq);
+
+static LIST_HEAD(fscache_cache_tag_list);
+
+/*
+ * look up a cache tag
+ */
+struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *name)
+{
+       struct fscache_cache_tag *tag, *xtag;
+
+       /* firstly check for the existence of the tag under read lock */
+       down_read(&fscache_addremove_sem);
+
+       list_for_each_entry(tag, &fscache_cache_tag_list, link) {
+               if (strcmp(tag->name, name) == 0) {
+                       atomic_inc(&tag->usage);
+                       up_read(&fscache_addremove_sem);
+                       return tag;
+               }
+       }
+
+       up_read(&fscache_addremove_sem);
+
+       /* the tag does not exist - create a candidate */
+       xtag = kzalloc(sizeof(*xtag) + strlen(name) + 1, GFP_KERNEL);
+       if (!xtag)
+               /* return a dummy tag if out of memory */
+               return ERR_PTR(-ENOMEM);
+
+       atomic_set(&xtag->usage, 1);
+       strcpy(xtag->name, name);
+
+       /* write lock, search again and add if still not present */
+       down_write(&fscache_addremove_sem);
+
+       list_for_each_entry(tag, &fscache_cache_tag_list, link) {
+               if (strcmp(tag->name, name) == 0) {
+                       atomic_inc(&tag->usage);
+                       up_write(&fscache_addremove_sem);
+                       kfree(xtag);
+                       return tag;
+               }
+       }
+
+       list_add_tail(&xtag->link, &fscache_cache_tag_list);
+       up_write(&fscache_addremove_sem);
+       return xtag;
+}
+
+/*
+ * release a reference to a cache tag
+ */
+void __fscache_release_cache_tag(struct fscache_cache_tag *tag)
+{
+       if (tag != ERR_PTR(-ENOMEM)) {
+               down_write(&fscache_addremove_sem);
+
+               if (atomic_dec_and_test(&tag->usage))
+                       list_del_init(&tag->link);
+               else
+                       tag = NULL;
+
+               up_write(&fscache_addremove_sem);
+
+               kfree(tag);
+       }
+}
+
+/*
+ * select a cache in which to store an object
+ * - the cache addremove semaphore must be at least read-locked by the caller
+ * - the object will never be an index
+ */
+struct fscache_cache *fscache_select_cache_for_object(
+       struct fscache_cookie *cookie)
+{
+       struct fscache_cache_tag *tag;
+       struct fscache_object *object;
+       struct fscache_cache *cache;
+
+       _enter("");
+
+       if (list_empty(&fscache_cache_list)) {
+               _leave(" = NULL [no cache]");
+               return NULL;
+       }
+
+       /* we check the parent to determine the cache to use */
+       spin_lock(&cookie->lock);
+
+       /* the first in the parent's backing list should be the preferred
+        * cache */
+       if (!hlist_empty(&cookie->backing_objects)) {
+               object = hlist_entry(cookie->backing_objects.first,
+                                    struct fscache_object, cookie_link);
+
+               cache = object->cache;
+               if (object->state >= FSCACHE_OBJECT_DYING ||
+                   test_bit(FSCACHE_IOERROR, &cache->flags))
+                       cache = NULL;
+
+               spin_unlock(&cookie->lock);
+               _leave(" = %p [parent]", cache);
+               return cache;
+       }
+
+       /* the parent is unbacked */
+       if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+               /* cookie not an index and is unbacked */
+               spin_unlock(&cookie->lock);
+               _leave(" = NULL [cookie ub,ni]");
+               return NULL;
+       }
+
+       spin_unlock(&cookie->lock);
+
+       if (!cookie->def->select_cache)
+               goto no_preference;
+
+       /* ask the netfs for its preference */
+       tag = cookie->def->select_cache(cookie->parent->netfs_data,
+                                       cookie->netfs_data);
+       if (!tag)
+               goto no_preference;
+
+       if (tag == ERR_PTR(-ENOMEM)) {
+               _leave(" = NULL [nomem tag]");
+               return NULL;
+       }
+
+       if (!tag->cache) {
+               _leave(" = NULL [unbacked tag]");
+               return NULL;
+       }
+
+       if (test_bit(FSCACHE_IOERROR, &tag->cache->flags))
+               return NULL;
+
+       _leave(" = %p [specific]", tag->cache);
+       return tag->cache;
+
+no_preference:
+       /* netfs has no preference - just select first cache */
+       cache = list_entry(fscache_cache_list.next,
+                          struct fscache_cache, link);
+       _leave(" = %p [first]", cache);
+       return cache;
+}
+
+/**
+ * fscache_init_cache - Initialise a cache record
+ * @cache: The cache record to be initialised
+ * @ops: The cache operations to be installed in that record
+ * @idfmt: Format string to define identifier
+ * @...: sprintf-style arguments
+ *
+ * Initialise a record of a cache and fill in the name.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_init_cache(struct fscache_cache *cache,
+                       const struct fscache_cache_ops *ops,
+                       const char *idfmt,
+                       ...)
+{
+       va_list va;
+
+       memset(cache, 0, sizeof(*cache));
+
+       cache->ops = ops;
+
+       va_start(va, idfmt);
+       vsnprintf(cache->identifier, sizeof(cache->identifier), idfmt, va);
+       va_end(va);
+
+       INIT_WORK(&cache->op_gc, fscache_operation_gc);
+       INIT_LIST_HEAD(&cache->link);
+       INIT_LIST_HEAD(&cache->object_list);
+       INIT_LIST_HEAD(&cache->op_gc_list);
+       spin_lock_init(&cache->object_list_lock);
+       spin_lock_init(&cache->op_gc_list_lock);
+}
+EXPORT_SYMBOL(fscache_init_cache);
+
+/**
+ * fscache_add_cache - Declare a cache as being open for business
+ * @cache: The record describing the cache
+ * @ifsdef: The record of the cache object describing the top-level index
+ * @tagname: The tag describing this cache
+ *
+ * Add a cache to the system, making it available for netfs's to use.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+int fscache_add_cache(struct fscache_cache *cache,
+                     struct fscache_object *ifsdef,
+                     const char *tagname)
+{
+       struct fscache_cache_tag *tag;
+
+       BUG_ON(!cache->ops);
+       BUG_ON(!ifsdef);
+
+       cache->flags = 0;
+       ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+       ifsdef->state = FSCACHE_OBJECT_ACTIVE;
+
+       if (!tagname)
+               tagname = cache->identifier;
+
+       BUG_ON(!tagname[0]);
+
+       _enter("{%s.%s},,%s", cache->ops->name, cache->identifier, tagname);
+
+       /* we use the cache tag to uniquely identify caches */
+       tag = __fscache_lookup_cache_tag(tagname);
+       if (IS_ERR(tag))
+               goto nomem;
+
+       if (test_and_set_bit(FSCACHE_TAG_RESERVED, &tag->flags))
+               goto tag_in_use;
+
+       cache->kobj = kobject_create_and_add(tagname, fscache_root);
+       if (!cache->kobj)
+               goto error;
+
+       ifsdef->cookie = &fscache_fsdef_index;
+       ifsdef->cache = cache;
+       cache->fsdef = ifsdef;
+
+       down_write(&fscache_addremove_sem);
+
+       tag->cache = cache;
+       cache->tag = tag;
+
+       /* add the cache to the list */
+       list_add(&cache->link, &fscache_cache_list);
+
+       /* add the cache's netfs definition index object to the cache's
+        * list */
+       spin_lock(&cache->object_list_lock);
+       list_add_tail(&ifsdef->cache_link, &cache->object_list);
+       spin_unlock(&cache->object_list_lock);
+
+       /* add the cache's netfs definition index object to the top level index
+        * cookie as a known backing object */
+       spin_lock(&fscache_fsdef_index.lock);
+
+       hlist_add_head(&ifsdef->cookie_link,
+                      &fscache_fsdef_index.backing_objects);
+
+       atomic_inc(&fscache_fsdef_index.usage);
+
+       /* done */
+       spin_unlock(&fscache_fsdef_index.lock);
+       up_write(&fscache_addremove_sem);
+
+       printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
+              cache->tag->name, cache->ops->name);
+       kobject_uevent(cache->kobj, KOBJ_ADD);
+
+       _leave(" = 0 [%s]", cache->identifier);
+       return 0;
+
+tag_in_use:
+       printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
+       __fscache_release_cache_tag(tag);
+       _leave(" = -EEXIST");
+       return -EEXIST;
+
+error:
+       __fscache_release_cache_tag(tag);
+       _leave(" = -EINVAL");
+       return -EINVAL;
+
+nomem:
+       _leave(" = -ENOMEM");
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(fscache_add_cache);
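
From a backend's point of view, the two calls above pair up roughly as in the sketch below. Everything named my_* is hypothetical: a real backend (CacheFiles, for instance) supplies a fully populated fscache_cache_ops table and allocates its fsdef index object through its own machinery.

    static const struct fscache_cache_ops my_cache_ops;    /* hypothetical; must carry a name and real ops */
    static struct fscache_cache my_cache;

    static int my_backend_bring_online(struct fscache_object *my_fsdef)
    {
            int ret;

            /* fill in the record and give the cache a printable identifier */
            fscache_init_cache(&my_cache, &my_cache_ops, "mybackend-%u", 0);

            /* publish it; a NULL tag name falls back to the identifier */
            ret = fscache_add_cache(&my_cache, my_fsdef, NULL);
            if (ret < 0)
                    printk(KERN_ERR "my_backend: add_cache failed (%d)\n", ret);
            return ret;
    }
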
+
+/**
+ * fscache_io_error - Note a cache I/O error
+ * @cache: The record describing the cache
+ *
+ * Note that an I/O error occurred in a cache and that it should no longer be
+ * used for anything.  This also reports the error into the kernel log.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_io_error(struct fscache_cache *cache)
+{
+       set_bit(FSCACHE_IOERROR, &cache->flags);
+
+       printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
+              cache->ops->name);
+}
+EXPORT_SYMBOL(fscache_io_error);
+
+/*
+ * request withdrawal of all the objects in a cache
+ * - all the objects being withdrawn are moved onto the supplied list
+ */
+static void fscache_withdraw_all_objects(struct fscache_cache *cache,
+                                        struct list_head *dying_objects)
+{
+       struct fscache_object *object;
+
+       spin_lock(&cache->object_list_lock);
+
+       while (!list_empty(&cache->object_list)) {
+               object = list_entry(cache->object_list.next,
+                                   struct fscache_object, cache_link);
+               list_move_tail(&object->cache_link, dying_objects);
+
+               _debug("withdraw %p", object->cookie);
+
+               spin_lock(&object->lock);
+               spin_unlock(&cache->object_list_lock);
+               fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW);
+               spin_unlock(&object->lock);
+
+               cond_resched();
+               spin_lock(&cache->object_list_lock);
+       }
+
+       spin_unlock(&cache->object_list_lock);
+}
+
+/**
+ * fscache_withdraw_cache - Withdraw a cache from the active service
+ * @cache: The record describing the cache
+ *
+ * Withdraw a cache from service, unbinding all its cache objects from the
+ * netfs cookies they're currently representing.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_withdraw_cache(struct fscache_cache *cache)
+{
+       LIST_HEAD(dying_objects);
+
+       _enter("");
+
+       printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
+              cache->tag->name);
+
+       /* make the cache unavailable for cookie acquisition */
+       if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
+               BUG();
+
+       down_write(&fscache_addremove_sem);
+       list_del_init(&cache->link);
+       cache->tag->cache = NULL;
+       up_write(&fscache_addremove_sem);
+
+       /* make sure all pages pinned by operations on behalf of the netfs are
+        * written to disk */
+       cache->ops->sync_cache(cache);
+
+       /* dissociate all the netfs pages backed by this cache from the block
+        * mappings in the cache */
+       cache->ops->dissociate_pages(cache);
+
+       /* we now have to destroy all the active objects pertaining to this
+        * cache - which we do by passing them off to the thread pool to be
+        * disposed of */
+       _debug("destroy");
+
+       fscache_withdraw_all_objects(cache, &dying_objects);
+
+       /* wait for all extant objects to finish their outstanding operations
+        * and go away */
+       _debug("wait for finish");
+       wait_event(fscache_cache_cleared_wq,
+                  atomic_read(&cache->object_count) == 0);
+       _debug("wait for clearance");
+       wait_event(fscache_cache_cleared_wq,
+                  list_empty(&cache->object_list));
+       _debug("cleared");
+       ASSERT(list_empty(&dying_objects));
+
+       kobject_put(cache->kobj);
+
+       clear_bit(FSCACHE_TAG_RESERVED, &cache->tag->flags);
+       fscache_release_cache_tag(cache->tag);
+       cache->tag = NULL;
+
+       _leave("");
+}
+EXPORT_SYMBOL(fscache_withdraw_cache);
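
On the teardown side, a backend pairs the two exported helpers above roughly as follows (a sketch; my_cache is the hypothetical record from the earlier example):

    /* On a fatal media error: stop the cache being used for new I/O. */
    static void my_backend_fail(void)
    {
            fscache_io_error(&my_cache);
    }

    /* On unbind: detach every object and retire the cache record. */
    static void my_backend_unbind(void)
    {
            fscache_withdraw_cache(&my_cache);
    }
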
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
new file mode 100644 (file)
index 0000000..72fd18f
--- /dev/null
@@ -0,0 +1,500 @@
+/* netfs cookie management
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for more information on
+ * the netfs API.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct kmem_cache *fscache_cookie_jar;
+
+static atomic_t fscache_object_debug_id = ATOMIC_INIT(0);
+
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie);
+static int fscache_alloc_object(struct fscache_cache *cache,
+                               struct fscache_cookie *cookie);
+static int fscache_attach_object(struct fscache_cookie *cookie,
+                                struct fscache_object *object);
+
+/*
+ * initialise a cookie jar slab element prior to any use
+ */
+void fscache_cookie_init_once(void *_cookie)
+{
+       struct fscache_cookie *cookie = _cookie;
+
+       memset(cookie, 0, sizeof(*cookie));
+       spin_lock_init(&cookie->lock);
+       INIT_HLIST_HEAD(&cookie->backing_objects);
+}
+
+/*
+ * request a cookie to represent an object (index, datafile, xattr, etc)
+ * - parent specifies the parent object
+ *   - the top level index cookie for each netfs is stored in the fscache_netfs
+ *     struct upon registration
+ * - def points to the definition
+ * - the netfs_data will be passed to the functions pointed to in *def
+ * - all attached caches will be searched to see if they contain this object
+ * - index objects aren't stored on disk until there's a dependent file that
+ *   needs storing
+ * - other objects are stored in a selected cache immediately, and all the
+ *   indices forming the path to it are instantiated if necessary
+ * - we never let on to the netfs about errors
+ *   - we may set a negative cookie pointer, but that's okay
+ */
+struct fscache_cookie *__fscache_acquire_cookie(
+       struct fscache_cookie *parent,
+       const struct fscache_cookie_def *def,
+       void *netfs_data)
+{
+       struct fscache_cookie *cookie;
+
+       BUG_ON(!def);
+
+       _enter("{%s},{%s},%p",
+              parent ? (char *) parent->def->name : "<no-parent>",
+              def->name, netfs_data);
+
+       fscache_stat(&fscache_n_acquires);
+
+       /* if there's no parent cookie, then we don't create one here either */
+       if (!parent) {
+               fscache_stat(&fscache_n_acquires_null);
+               _leave(" [no parent]");
+               return NULL;
+       }
+
+       /* validate the definition */
+       BUG_ON(!def->get_key);
+       BUG_ON(!def->name[0]);
+
+       BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX &&
+              parent->def->type != FSCACHE_COOKIE_TYPE_INDEX);
+
+       /* allocate and initialise a cookie */
+       cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
+       if (!cookie) {
+               fscache_stat(&fscache_n_acquires_oom);
+               _leave(" [ENOMEM]");
+               return NULL;
+       }
+
+       atomic_set(&cookie->usage, 1);
+       atomic_set(&cookie->n_children, 0);
+
+       atomic_inc(&parent->usage);
+       atomic_inc(&parent->n_children);
+
+       cookie->def             = def;
+       cookie->parent          = parent;
+       cookie->netfs_data      = netfs_data;
+       cookie->flags           = 0;
+
+       INIT_RADIX_TREE(&cookie->stores, GFP_NOFS);
+
+       switch (cookie->def->type) {
+       case FSCACHE_COOKIE_TYPE_INDEX:
+               fscache_stat(&fscache_n_cookie_index);
+               break;
+       case FSCACHE_COOKIE_TYPE_DATAFILE:
+               fscache_stat(&fscache_n_cookie_data);
+               break;
+       default:
+               fscache_stat(&fscache_n_cookie_special);
+               break;
+       }
+
+       /* if the object is an index then we need do nothing more here - we
+        * create indices on disk when we need them as an index may exist in
+        * multiple caches */
+       if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+               if (fscache_acquire_non_index_cookie(cookie) < 0) {
+                       atomic_dec(&parent->n_children);
+                       __fscache_cookie_put(cookie);
+                       fscache_stat(&fscache_n_acquires_nobufs);
+                       _leave(" = NULL");
+                       return NULL;
+               }
+       }
+
+       fscache_stat(&fscache_n_acquires_ok);
+       _leave(" = %p", cookie);
+       return cookie;
+}
+EXPORT_SYMBOL(__fscache_acquire_cookie);
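
For a netfs, the normal entry point is the fscache_acquire_cookie() wrapper in the netfs API; in terms of the function above, indexing a data object under a netfs's primary index looks roughly like this sketch (my_netfs, my_file_def and the netfs_data pointer are hypothetical, and the primary index cookie is the one set up by fscache_register_netfs()):

    static struct fscache_cookie *example_acquire(struct fscache_netfs *my_netfs,
                                                  const struct fscache_cookie_def *my_file_def,
                                                  void *netfs_data)
    {
            /* returns NULL rather than an error if no cache is usable */
            return __fscache_acquire_cookie(my_netfs->primary_index,
                                            my_file_def, netfs_data);
    }

The matching release is the relinquish path implemented further down in this file.
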
+
+/*
+ * acquire a non-index cookie
+ * - this must make sure the index chain is instantiated and instantiate the
+ *   object representation too
+ */
+static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
+{
+       struct fscache_object *object;
+       struct fscache_cache *cache;
+       uint64_t i_size;
+       int ret;
+
+       _enter("");
+
+       cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
+
+       /* now we need to see whether the backing objects for this cookie
+        * exist yet; if not, there'll be nothing to search */
+       down_read(&fscache_addremove_sem);
+
+       if (list_empty(&fscache_cache_list)) {
+               up_read(&fscache_addremove_sem);
+               _leave(" = 0 [no caches]");
+               return 0;
+       }
+
+       /* select a cache in which to store the object */
+       cache = fscache_select_cache_for_object(cookie->parent);
+       if (!cache) {
+               up_read(&fscache_addremove_sem);
+               fscache_stat(&fscache_n_acquires_no_cache);
+               _leave(" = -ENOMEDIUM [no cache]");
+               return -ENOMEDIUM;
+       }
+
+       _debug("cache %s", cache->tag->name);
+
+       cookie->flags =
+               (1 << FSCACHE_COOKIE_LOOKING_UP) |
+               (1 << FSCACHE_COOKIE_CREATING) |
+               (1 << FSCACHE_COOKIE_NO_DATA_YET);
+
+       /* ask the cache to allocate objects for this cookie and its parent
+        * chain */
+       ret = fscache_alloc_object(cache, cookie);
+       if (ret < 0) {
+               up_read(&fscache_addremove_sem);
+               _leave(" = %d", ret);
+               return ret;
+       }
+
+       /* pass on how big the object we're caching is supposed to be */
+       cookie->def->get_attr(cookie->netfs_data, &i_size);
+
+       spin_lock(&cookie->lock);
+       if (hlist_empty(&cookie->backing_objects)) {
+               spin_unlock(&cookie->lock);
+               goto unavailable;
+       }
+
+       object = hlist_entry(cookie->backing_objects.first,
+                            struct fscache_object, cookie_link);
+
+       fscache_set_store_limit(object, i_size);
+
+       /* initiate the process of looking up all the objects in the chain
+        * (done by fscache_initialise_object()) */
+       fscache_enqueue_object(object);
+
+       spin_unlock(&cookie->lock);
+
+       /* we may be required to wait for lookup to complete at this point */
+       if (!fscache_defer_lookup) {
+               _debug("non-deferred lookup %p", &cookie->flags);
+               wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
+                           fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+               _debug("complete");
+               if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
+                       goto unavailable;
+       }
+
+       up_read(&fscache_addremove_sem);
+       _leave(" = 0 [deferred]");
+       return 0;
+
+unavailable:
+       up_read(&fscache_addremove_sem);
+       _leave(" = -ENOBUFS");
+       return -ENOBUFS;
+}
+
+/*
+ * recursively allocate cache object records for a cookie/cache combination
+ * - caller must be holding the addremove sem
+ */
+static int fscache_alloc_object(struct fscache_cache *cache,
+                               struct fscache_cookie *cookie)
+{
+       struct fscache_object *object;
+       struct hlist_node *_n;
+       int ret;
+
+       _enter("%p,%p{%s}", cache, cookie, cookie->def->name);
+
+       spin_lock(&cookie->lock);
+       hlist_for_each_entry(object, _n, &cookie->backing_objects,
+                            cookie_link) {
+               if (object->cache == cache)
+                       goto object_already_extant;
+       }
+       spin_unlock(&cookie->lock);
+
+       /* ask the cache to allocate an object (we may end up with duplicate
+        * objects at this stage, but we sort that out later) */
+       object = cache->ops->alloc_object(cache, cookie);
+       if (IS_ERR(object)) {
+               fscache_stat(&fscache_n_object_no_alloc);
+               ret = PTR_ERR(object);
+               goto error;
+       }
+
+       fscache_stat(&fscache_n_object_alloc);
+
+       object->debug_id = atomic_inc_return(&fscache_object_debug_id);
+
+       _debug("ALLOC OBJ%x: %s {%lx}",
+              object->debug_id, cookie->def->name, object->events);
+
+       ret = fscache_alloc_object(cache, cookie->parent);
+       if (ret < 0)
+               goto error_put;
+
+       /* only attach if we managed to allocate all we needed, otherwise
+        * discard the object we just allocated and instead use the one
+        * attached to the cookie */
+       if (fscache_attach_object(cookie, object) < 0)
+               cache->ops->put_object(object);
+
+       _leave(" = 0");
+       return 0;
+
+object_already_extant:
+       ret = -ENOBUFS;
+       if (object->state >= FSCACHE_OBJECT_DYING) {
+               spin_unlock(&cookie->lock);
+               goto error;
+       }
+       spin_unlock(&cookie->lock);
+       _leave(" = 0 [found]");
+       return 0;
+
+error_put:
+       cache->ops->put_object(object);
+error:
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * attach a cache object to a cookie
+ */
+static int fscache_attach_object(struct fscache_cookie *cookie,
+                                struct fscache_object *object)
+{
+       struct fscache_object *p;
+       struct fscache_cache *cache = object->cache;
+       struct hlist_node *_n;
+       int ret;
+
+       _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id);
+
+       spin_lock(&cookie->lock);
+
+       /* there may be multiple initial creations of this object, but we only
+        * want one */
+       ret = -EEXIST;
+       hlist_for_each_entry(p, _n, &cookie->backing_objects, cookie_link) {
+               if (p->cache == object->cache) {
+                       if (p->state >= FSCACHE_OBJECT_DYING)
+                               ret = -ENOBUFS;
+                       goto cant_attach_object;
+               }
+       }
+
+       /* pin the parent object */
+       spin_lock_nested(&cookie->parent->lock, 1);
+       hlist_for_each_entry(p, _n, &cookie->parent->backing_objects,
+                            cookie_link) {
+               if (p->cache == object->cache) {
+                       if (p->state >= FSCACHE_OBJECT_DYING) {
+                               ret = -ENOBUFS;
+                               spin_unlock(&cookie->parent->lock);
+                               goto cant_attach_object;
+                       }
+                       object->parent = p;
+                       spin_lock(&p->lock);
+                       p->n_children++;
+                       spin_unlock(&p->lock);
+                       break;
+               }
+       }
+       spin_unlock(&cookie->parent->lock);
+
+       /* attach to the cache's object list */
+       if (list_empty(&object->cache_link)) {
+               spin_lock(&cache->object_list_lock);
+               list_add(&object->cache_link, &cache->object_list);
+               spin_unlock(&cache->object_list_lock);
+       }
+
+       /* attach to the cookie */
+       object->cookie = cookie;
+       atomic_inc(&cookie->usage);
+       hlist_add_head(&object->cookie_link, &cookie->backing_objects);
+       ret = 0;
+
+cant_attach_object:
+       spin_unlock(&cookie->lock);
+       _leave(" = %d", ret);
+       return ret;
+}
+
+/*
+ * update the index entries backing a cookie
+ */
+void __fscache_update_cookie(struct fscache_cookie *cookie)
+{
+       struct fscache_object *object;
+       struct hlist_node *_p;
+
+       fscache_stat(&fscache_n_updates);
+
+       if (!cookie) {
+               fscache_stat(&fscache_n_updates_null);
+               _leave(" [no cookie]");
+               return;
+       }
+
+       _enter("{%s}", cookie->def->name);
+
+       BUG_ON(!cookie->def->get_aux);
+
+       spin_lock(&cookie->lock);
+
+       /* update the index entry on disk in each cache backing this cookie */
+       hlist_for_each_entry(object, _p,
+                            &cookie->backing_objects, cookie_link) {
+               fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+       }
+
+       spin_unlock(&cookie->lock);
+       _leave("");
+}
+EXPORT_SYMBOL(__fscache_update_cookie);
+
+/*
+ * release a cookie back to the cache
+ * - the object will be marked as recyclable on disk if retire is true
+ * - all dependents of this cookie must have already been unregistered
+ *   (indices/files/pages)
+ */
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+{
+       struct fscache_cache *cache;
+       struct fscache_object *object;
+       unsigned long event;
+
+       fscache_stat(&fscache_n_relinquishes);
+
+       if (!cookie) {
+               fscache_stat(&fscache_n_relinquishes_null);
+               _leave(" [no cookie]");
+               return;
+       }
+
+       _enter("%p{%s,%p},%d",
+              cookie, cookie->def->name, cookie->netfs_data, retire);
+
+       if (atomic_read(&cookie->n_children) != 0) {
+               printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
+                      cookie->def->name);
+               BUG();
+       }
+
+       /* wait for the cookie to finish being instantiated (or to fail) */
+       if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) {
+               fscache_stat(&fscache_n_relinquishes_waitcrt);
+               wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING,
+                           fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+       }
+
+       event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+
+       /* detach pointers back to the netfs */
+       spin_lock(&cookie->lock);
+
+       cookie->netfs_data      = NULL;
+       cookie->def             = NULL;
+
+       /* break links with all the active objects */
+       while (!hlist_empty(&cookie->backing_objects)) {
+               object = hlist_entry(cookie->backing_objects.first,
+                                    struct fscache_object,
+                                    cookie_link);
+
+               _debug("RELEASE OBJ%x", object->debug_id);
+
+               /* detach each cache object from the object cookie */
+               spin_lock(&object->lock);
+               hlist_del_init(&object->cookie_link);
+
+               cache = object->cache;
+               object->cookie = NULL;
+               fscache_raise_event(object, event);
+               spin_unlock(&object->lock);
+
+               if (atomic_dec_and_test(&cookie->usage))
+                       /* the cookie refcount shouldn't be reduced to 0 yet */
+                       BUG();
+       }
+
+       spin_unlock(&cookie->lock);
+
+       if (cookie->parent) {
+               ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
+               ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0);
+               atomic_dec(&cookie->parent->n_children);
+       }
+
+       /* finally dispose of the cookie */
+       ASSERTCMP(atomic_read(&cookie->usage), >, 0);
+       fscache_cookie_put(cookie);
+
+       _leave("");
+}
+EXPORT_SYMBOL(__fscache_relinquish_cookie);
+
+/*
+ * destroy a cookie
+ */
+void __fscache_cookie_put(struct fscache_cookie *cookie)
+{
+       struct fscache_cookie *parent;
+
+       _enter("%p", cookie);
+
+       for (;;) {
+               _debug("FREE COOKIE %p", cookie);
+               parent = cookie->parent;
+               BUG_ON(!hlist_empty(&cookie->backing_objects));
+               kmem_cache_free(fscache_cookie_jar, cookie);
+
+               if (!parent)
+                       break;
+
+               cookie = parent;
+               BUG_ON(atomic_read(&cookie->usage) <= 0);
+               if (!atomic_dec_and_test(&cookie->usage))
+                       break;
+       }
+
+       _leave("");
+}
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
new file mode 100644 (file)
index 0000000..f5b4bae
--- /dev/null
@@ -0,0 +1,144 @@
+/* Filesystem index definition
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include "internal.h"
+
+static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t bufmax);
+
+static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t bufmax);
+
+static
+enum fscache_checkaux fscache_fsdef_netfs_check_aux(void *cookie_netfs_data,
+                                                   const void *data,
+                                                   uint16_t datalen);
+
+/*
+ * The root index is owned by FS-Cache itself.
+ *
+ * When a netfs requests caching facilities, FS-Cache will, if one doesn't
+ * already exist, create an entry in the root index with the key being the name
+ * of the netfs ("AFS" for example), and the auxiliary data holding the index
+ * structure version supplied by the netfs:
+ *
+ *                                  FSDEF
+ *                                    |
+ *                              +-----------+
+ *                              |           |
+ *                             NFS         AFS
+ *                            [v=1]       [v=1]
+ *
+ * If an entry with the appropriate name does already exist, the version is
+ * compared.  If the version is different, the entire subtree from that entry
+ * will be discarded and a new entry created.
+ *
+ * The new entry will be an index, and a cookie referring to it will be passed
+ * to the netfs.  This is then the root handle by which the netfs accesses the
+ * cache.  It can create whatever objects it likes in that index, including
+ * further indices.
+ */
+static struct fscache_cookie_def fscache_fsdef_index_def = {
+       .name           = ".FS-Cache",
+       .type           = FSCACHE_COOKIE_TYPE_INDEX,
+};
+
+struct fscache_cookie fscache_fsdef_index = {
+       .usage          = ATOMIC_INIT(1),
+       .lock           = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
+       .backing_objects = HLIST_HEAD_INIT,
+       .def            = &fscache_fsdef_index_def,
+};
+EXPORT_SYMBOL(fscache_fsdef_index);
+
+/*
+ * Definition of an entry in the root index.  Each entry is an index, keyed to
+ * a specific netfs and only applicable to a particular version of the index
+ * structure used by that netfs.
+ */
+struct fscache_cookie_def fscache_fsdef_netfs_def = {
+       .name           = "FSDEF.netfs",
+       .type           = FSCACHE_COOKIE_TYPE_INDEX,
+       .get_key        = fscache_fsdef_netfs_get_key,
+       .get_aux        = fscache_fsdef_netfs_get_aux,
+       .check_aux      = fscache_fsdef_netfs_check_aux,
+};
+
+/*
+ * get the key data for an FSDEF index record - this is the name of the netfs
+ * for which this entry is created
+ */
+static uint16_t fscache_fsdef_netfs_get_key(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t bufmax)
+{
+       const struct fscache_netfs *netfs = cookie_netfs_data;
+       unsigned klen;
+
+       _enter("{%s.%u},", netfs->name, netfs->version);
+
+       klen = strlen(netfs->name);
+       if (klen > bufmax)
+               return 0;
+
+       memcpy(buffer, netfs->name, klen);
+       return klen;
+}
+
+/*
+ * get the auxiliary data for an FSDEF index record - this is the index
+ * structure version number of the netfs for which this entry is created
+ */
+static uint16_t fscache_fsdef_netfs_get_aux(const void *cookie_netfs_data,
+                                           void *buffer, uint16_t bufmax)
+{
+       const struct fscache_netfs *netfs = cookie_netfs_data;
+       unsigned dlen;
+
+       _enter("{%s.%u},", netfs->name, netfs->version);
+
+       dlen = sizeof(uint32_t);
+       if (dlen > bufmax)
+               return 0;
+
+       memcpy(buffer, &netfs->version, dlen);
+       return dlen;
+}
+
+/*
+ * check that the index structure version number stored in the auxiliary data
+ * matches the one the netfs gave us
+ */
+static enum fscache_checkaux fscache_fsdef_netfs_check_aux(
+       void *cookie_netfs_data,
+       const void *data,
+       uint16_t datalen)
+{
+       struct fscache_netfs *netfs = cookie_netfs_data;
+       uint32_t version;
+
+       _enter("{%s},,%hu", netfs->name, datalen);
+
+       if (datalen != sizeof(version)) {
+               _leave(" = OBSOLETE [dl=%d v=%zu]", datalen, sizeof(version));
+               return FSCACHE_CHECKAUX_OBSOLETE;
+       }
+
+       memcpy(&version, data, sizeof(version));
+       if (version != netfs->version) {
+               _leave(" = OBSOLETE [ver=%x net=%x]", version, netfs->version);
+               return FSCACHE_CHECKAUX_OBSOLETE;
+       }
+
+       _leave(" = OKAY");
+       return FSCACHE_CHECKAUX_OKAY;
+}
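
A note on the FSDEF index described above: a netfs identifies itself to FS-Cache with a
struct fscache_netfs whose name becomes the key of its entry in the root index and whose
version becomes the auxiliary data returned by fscache_fsdef_netfs_get_aux() and checked
by fscache_fsdef_netfs_check_aux().  A minimal sketch; the "examplefs" name and variable
are hypothetical and not part of this patch:

	static struct fscache_netfs examplefs_cache_netfs = {
		.name		= "examplefs",
		.version	= 1,	/* index structure version: if this ever
					 * changes, the old subtree is discarded
					 * and a fresh entry created */
	};
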
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
new file mode 100644 (file)
index 0000000..bad4967
--- /dev/null
@@ -0,0 +1,109 @@
+/* FS-Cache latency histogram
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL THREAD
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+atomic_t fscache_obj_instantiate_histogram[HZ];
+atomic_t fscache_objs_histogram[HZ];
+atomic_t fscache_ops_histogram[HZ];
+atomic_t fscache_retrieval_delay_histogram[HZ];
+atomic_t fscache_retrieval_histogram[HZ];
+
+/*
+ * display the time-taken histogram
+ */
+static int fscache_histogram_show(struct seq_file *m, void *v)
+{
+       unsigned long index;
+       unsigned n[5], t;
+
+       switch ((unsigned long) v) {
+       case 1:
+               seq_puts(m, "JIFS  SECS  OBJ INST  OP RUNS   OBJ RUNS "
+                        " RETRV DLY RETRIEVLS\n");
+               return 0;
+       case 2:
+               seq_puts(m, "===== ===== ========= ========= ========="
+                        " ========= =========\n");
+               return 0;
+       default:
+               index = (unsigned long) v - 3;
+               n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]);
+               n[1] = atomic_read(&fscache_ops_histogram[index]);
+               n[2] = atomic_read(&fscache_objs_histogram[index]);
+               n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]);
+               n[4] = atomic_read(&fscache_retrieval_histogram[index]);
+               if (!(n[0] | n[1] | n[2] | n[3] | n[4]))
+                       return 0;
+
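+               /* convert the bucket index (in jiffies) to milliseconds,
+                * printed as fractional seconds in the SECS column */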
+               t = (index * 1000) / HZ;
+
+               seq_printf(m, "%4lu  0.%03u %9u %9u %9u %9u %9u\n",
+                          index, t, n[0], n[1], n[2], n[3], n[4]);
+               return 0;
+       }
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos)
+{
+       if ((unsigned long long)*_pos >= HZ + 2)
+               return NULL;
+       if (*_pos == 0)
+               *_pos = 1;
+       return (void *)(unsigned long) *_pos;
+}
+
+/*
+ * move to the next line
+ */
+static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       (*pos)++;
+       return (unsigned long long)*pos > HZ + 2 ?
+               NULL : (void *)(unsigned long) *pos;
+}
+
+/*
+ * clean up after reading
+ */
+static void fscache_histogram_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations fscache_histogram_ops = {
+       .start          = fscache_histogram_start,
+       .stop           = fscache_histogram_stop,
+       .next           = fscache_histogram_next,
+       .show           = fscache_histogram_show,
+};
+
+/*
+ * open "/proc/fs/fscache/histogram" to provide latency data
+ */
+static int fscache_histogram_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &fscache_histogram_ops);
+}
+
+const struct file_operations fscache_histogram_fops = {
+       .owner          = THIS_MODULE,
+       .open           = fscache_histogram_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
new file mode 100644 (file)
index 0000000..e0cbd16
--- /dev/null
@@ -0,0 +1,380 @@
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Lock order, in the order in which multiple locks should be obtained:
+ * - fscache_addremove_sem
+ * - cookie->lock
+ * - cookie->parent->lock
+ * - cache->object_list_lock
+ * - object->lock
+ * - object->parent->lock
+ * - fscache_thread_lock
+ */
+
+#include <linux/fscache-cache.h>
+#include <linux/sched.h>
+
+#define FSCACHE_MIN_THREADS    4
+#define FSCACHE_MAX_THREADS    32
+
+/*
+ * cache.c
+ */
+extern struct list_head fscache_cache_list;
+extern struct rw_semaphore fscache_addremove_sem;
+
+extern struct fscache_cache *fscache_select_cache_for_object(
+       struct fscache_cookie *);
+
+/*
+ * cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+
+extern void fscache_cookie_init_once(void *);
+extern void __fscache_cookie_put(struct fscache_cookie *);
+
+/*
+ * fsdef.c
+ */
+extern struct fscache_cookie fscache_fsdef_index;
+extern struct fscache_cookie_def fscache_fsdef_netfs_def;
+
+/*
+ * histogram.c
+ */
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+extern atomic_t fscache_obj_instantiate_histogram[HZ];
+extern atomic_t fscache_objs_histogram[HZ];
+extern atomic_t fscache_ops_histogram[HZ];
+extern atomic_t fscache_retrieval_delay_histogram[HZ];
+extern atomic_t fscache_retrieval_histogram[HZ];
+
+static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif)
+{
+       unsigned long jif = jiffies - start_jif;
+       if (jif >= HZ)
+               jif = HZ - 1;
+       atomic_inc(&histogram[jif]);
+}
+
+extern const struct file_operations fscache_histogram_fops;
+
+#else
+#define fscache_hist(hist, start_jif) do {} while (0)
+#endif
+
+/*
+ * main.c
+ */
+extern unsigned fscache_defer_lookup;
+extern unsigned fscache_defer_create;
+extern unsigned fscache_debug;
+extern struct kobject *fscache_root;
+
+extern int fscache_wait_bit(void *);
+extern int fscache_wait_bit_interruptible(void *);
+
+/*
+ * object.c
+ */
+extern void fscache_withdrawing_object(struct fscache_cache *,
+                                      struct fscache_object *);
+extern void fscache_enqueue_object(struct fscache_object *);
+
+/*
+ * operation.c
+ */
+extern int fscache_submit_exclusive_op(struct fscache_object *,
+                                      struct fscache_operation *);
+extern int fscache_submit_op(struct fscache_object *,
+                            struct fscache_operation *);
+extern void fscache_abort_object(struct fscache_object *);
+extern void fscache_start_operations(struct fscache_object *);
+extern void fscache_operation_gc(struct work_struct *);
+
+/*
+ * proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init()    (0)
+#define fscache_proc_cleanup() do {} while (0)
+#endif
+
+/*
+ * stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS];
+extern atomic_t fscache_n_objs_processed[FSCACHE_MAX_THREADS];
+
+extern atomic_t fscache_n_op_pend;
+extern atomic_t fscache_n_op_run;
+extern atomic_t fscache_n_op_enqueue;
+extern atomic_t fscache_n_op_deferred_release;
+extern atomic_t fscache_n_op_release;
+extern atomic_t fscache_n_op_gc;
+
+extern atomic_t fscache_n_attr_changed;
+extern atomic_t fscache_n_attr_changed_ok;
+extern atomic_t fscache_n_attr_changed_nobufs;
+extern atomic_t fscache_n_attr_changed_nomem;
+extern atomic_t fscache_n_attr_changed_calls;
+
+extern atomic_t fscache_n_allocs;
+extern atomic_t fscache_n_allocs_ok;
+extern atomic_t fscache_n_allocs_wait;
+extern atomic_t fscache_n_allocs_nobufs;
+extern atomic_t fscache_n_alloc_ops;
+extern atomic_t fscache_n_alloc_op_waits;
+
+extern atomic_t fscache_n_retrievals;
+extern atomic_t fscache_n_retrievals_ok;
+extern atomic_t fscache_n_retrievals_wait;
+extern atomic_t fscache_n_retrievals_nodata;
+extern atomic_t fscache_n_retrievals_nobufs;
+extern atomic_t fscache_n_retrievals_intr;
+extern atomic_t fscache_n_retrievals_nomem;
+extern atomic_t fscache_n_retrieval_ops;
+extern atomic_t fscache_n_retrieval_op_waits;
+
+extern atomic_t fscache_n_stores;
+extern atomic_t fscache_n_stores_ok;
+extern atomic_t fscache_n_stores_again;
+extern atomic_t fscache_n_stores_nobufs;
+extern atomic_t fscache_n_stores_oom;
+extern atomic_t fscache_n_store_ops;
+extern atomic_t fscache_n_store_calls;
+
+extern atomic_t fscache_n_marks;
+extern atomic_t fscache_n_uncaches;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_null;
+extern atomic_t fscache_n_acquires_no_cache;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_nobufs;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_updates;
+extern atomic_t fscache_n_updates_null;
+extern atomic_t fscache_n_updates_run;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_null;
+extern atomic_t fscache_n_relinquishes_waitcrt;
+
+extern atomic_t fscache_n_cookie_index;
+extern atomic_t fscache_n_cookie_data;
+extern atomic_t fscache_n_cookie_special;
+
+extern atomic_t fscache_n_object_alloc;
+extern atomic_t fscache_n_object_no_alloc;
+extern atomic_t fscache_n_object_lookups;
+extern atomic_t fscache_n_object_lookups_negative;
+extern atomic_t fscache_n_object_lookups_positive;
+extern atomic_t fscache_n_object_created;
+extern atomic_t fscache_n_object_avail;
+extern atomic_t fscache_n_object_dead;
+
+extern atomic_t fscache_n_checkaux_none;
+extern atomic_t fscache_n_checkaux_okay;
+extern atomic_t fscache_n_checkaux_update;
+extern atomic_t fscache_n_checkaux_obsolete;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+       atomic_inc(stat);
+}
+
+extern const struct file_operations fscache_stats_fops;
+#else
+
+#define fscache_stat(stat) do {} while (0)
+#endif
+
+/*
+ * raise an event on an object
+ * - if the event is not masked for that object, then the object is
+ *   queued for attention by the thread pool.
+ */
+static inline void fscache_raise_event(struct fscache_object *object,
+                                      unsigned event)
+{
+       if (!test_and_set_bit(event, &object->events) &&
+           test_bit(event, &object->event_mask))
+               fscache_enqueue_object(object);
+}
+
+/*
+ * drop a reference to a cookie
+ */
+static inline void fscache_cookie_put(struct fscache_cookie *cookie)
+{
+       BUG_ON(atomic_read(&cookie->usage) <= 0);
+       if (atomic_dec_and_test(&cookie->usage))
+               __fscache_cookie_put(cookie);
+}
+
+/*
+ * get an extra reference to a netfs retrieval context
+ */
+static inline
+void *fscache_get_context(struct fscache_cookie *cookie, void *context)
+{
+       if (cookie->def->get_context)
+               cookie->def->get_context(cookie->netfs_data, context);
+       return context;
+}
+
+/*
+ * release a reference to a netfs retrieval context
+ */
+static inline
+void fscache_put_context(struct fscache_cookie *cookie, void *context)
+{
+       if (cookie->def->put_context)
+               cookie->def->put_context(cookie->netfs_data, context);
+}
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+#define dbgprintk(FMT, ...) \
+       printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
+
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline __attribute__((format(printf, 1, 2)))
+void _dbprintk(const char *fmt, ...)
+{
+}
+
+#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
+
+#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+
+#ifdef __KDEBUG
+#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
+#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
+
+#elif defined(CONFIG_FSCACHE_DEBUG)
+#define _enter(FMT, ...)                       \
+do {                                           \
+       if (__do_kdebug(ENTER))                 \
+               kenter(FMT, ##__VA_ARGS__);     \
+} while (0)
+
+#define _leave(FMT, ...)                       \
+do {                                           \
+       if (__do_kdebug(LEAVE))                 \
+               kleave(FMT, ##__VA_ARGS__);     \
+} while (0)
+
+#define _debug(FMT, ...)                       \
+do {                                           \
+       if (__do_kdebug(DEBUG))                 \
+               kdebug(FMT, ##__VA_ARGS__);     \
+} while (0)
+
+#else
+#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#endif
+
+/*
+ * determine whether a particular optional debugging point should be logged
+ * - we need to go through three steps to persuade cpp to correctly join the
+ *   shorthand in FSCACHE_DEBUG_LEVEL with its prefix
+ */
+#define ____do_kdebug(LEVEL, POINT) \
+       unlikely((fscache_debug & \
+                 (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
+#define ___do_kdebug(LEVEL, POINT) \
+       ____do_kdebug(LEVEL, POINT)
+#define __do_kdebug(POINT) \
+       ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
+
+#define FSCACHE_DEBUG_CACHE    0
+#define FSCACHE_DEBUG_COOKIE   1
+#define FSCACHE_DEBUG_PAGE     2
+#define FSCACHE_DEBUG_OPERATION        3
+
+#define FSCACHE_POINT_ENTER    1
+#define FSCACHE_POINT_LEAVE    2
+#define FSCACHE_POINT_DEBUG    4
+
+#ifndef FSCACHE_DEBUG_LEVEL
+#define FSCACHE_DEBUG_LEVEL CACHE
+#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X)                                                      \
+do {                                                                   \
+       if (unlikely(!(X))) {                                           \
+               printk(KERN_ERR "\n");                                  \
+               printk(KERN_ERR "FS-Cache: Assertion failed\n");        \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y)                                            \
+do {                                                                   \
+       if (unlikely(!((X) OP (Y)))) {                                  \
+               printk(KERN_ERR "\n");                                  \
+               printk(KERN_ERR "FS-Cache: Assertion failed\n");        \
+               printk(KERN_ERR "%lx " #OP " %lx is false\n",           \
+                      (unsigned long)(X), (unsigned long)(Y));         \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+#define ASSERTIF(C, X)                                                 \
+do {                                                                   \
+       if (unlikely((C) && !(X))) {                                    \
+               printk(KERN_ERR "\n");                                  \
+               printk(KERN_ERR "FS-Cache: Assertion failed\n");        \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y)                                       \
+do {                                                                   \
+       if (unlikely((C) && !((X) OP (Y)))) {                           \
+               printk(KERN_ERR "\n");                                  \
+               printk(KERN_ERR "FS-Cache: Assertion failed\n");        \
+               printk(KERN_ERR "%lx " #OP " %lx is false\n",           \
+                      (unsigned long)(X), (unsigned long)(Y));         \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+
+#else
+
+#define ASSERT(X)                      do {} while (0)
+#define ASSERTCMP(X, OP, Y)            do {} while (0)
+#define ASSERTIF(C, X)                 do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y)       do {} while (0)
+
+#endif /* assert or not */
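
To make the __do_kdebug() machinery above concrete: each debug level owns a group of
three bits in the fscache_debug mask (settable through the "debug" module parameter
defined in main.c below), with ENTER, LEAVE and DEBUG as the three bits of that group.
A worked example using only the constants defined above, applicable when
CONFIG_FSCACHE_DEBUG is enabled:

	/* A file compiled with "#define FSCACHE_DEBUG_LEVEL COOKIE" has
	 * FSCACHE_DEBUG_COOKIE == 1, so _enter() fires when
	 * fscache_debug & (FSCACHE_POINT_ENTER << (1 * 3)), i.e. bit 0x08;
	 * _leave() needs bit 0x10 and _debug() needs bit 0x20.  Files at the
	 * CACHE level (0) use bits 0x01, 0x02 and 0x04 instead.
	 */
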
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
new file mode 100644 (file)
index 0000000..4de41b5
--- /dev/null
@@ -0,0 +1,124 @@
+/* General filesystem local caching manager
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL CACHE
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+MODULE_DESCRIPTION("FS Cache Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+unsigned fscache_defer_lookup = 1;
+module_param_named(defer_lookup, fscache_defer_lookup, uint,
+                  S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_defer_lookup,
+                "Defer cookie lookup to background thread");
+
+unsigned fscache_defer_create = 1;
+module_param_named(defer_create, fscache_defer_create, uint,
+                  S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_defer_create,
+                "Defer cookie creation to background thread");
+
+unsigned fscache_debug;
+module_param_named(debug, fscache_debug, uint,
+                  S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(fscache_debug,
+                "FS-Cache debugging mask");
+
+struct kobject *fscache_root;
+
+/*
+ * initialise the fs caching module
+ */
+static int __init fscache_init(void)
+{
+       int ret;
+
+       ret = slow_work_register_user();
+       if (ret < 0)
+               goto error_slow_work;
+
+       ret = fscache_proc_init();
+       if (ret < 0)
+               goto error_proc;
+
+       fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
+                                              sizeof(struct fscache_cookie),
+                                              0,
+                                              0,
+                                              fscache_cookie_init_once);
+       if (!fscache_cookie_jar) {
+               printk(KERN_NOTICE
+                      "FS-Cache: Failed to allocate a cookie jar\n");
+               ret = -ENOMEM;
+               goto error_cookie_jar;
+       }
+
+       fscache_root = kobject_create_and_add("fscache", kernel_kobj);
+       if (!fscache_root) {
+               ret = -ENOMEM;
+               goto error_kobj;
+       }
+
+       printk(KERN_NOTICE "FS-Cache: Loaded\n");
+       return 0;
+
+error_kobj:
+       kmem_cache_destroy(fscache_cookie_jar);
+error_cookie_jar:
+       fscache_proc_cleanup();
+error_proc:
+       slow_work_unregister_user();
+error_slow_work:
+       return ret;
+}
+
+fs_initcall(fscache_init);
+
+/*
+ * clean up on module removal
+ */
+static void __exit fscache_exit(void)
+{
+       _enter("");
+
+       kobject_put(fscache_root);
+       kmem_cache_destroy(fscache_cookie_jar);
+       fscache_proc_cleanup();
+       slow_work_unregister_user();
+       printk(KERN_NOTICE "FS-Cache: Unloaded\n");
+}
+
+module_exit(fscache_exit);
+
+/*
+ * wait_on_bit() sleep function for uninterruptible waiting
+ */
+int fscache_wait_bit(void *flags)
+{
+       schedule();
+       return 0;
+}
+EXPORT_SYMBOL(fscache_wait_bit);
+
+/*
+ * wait_on_bit() sleep function for interruptible waiting
+ */
+int fscache_wait_bit_interruptible(void *flags)
+{
+       schedule();
+       return signal_pending(current);
+}
+EXPORT_SYMBOL(fscache_wait_bit_interruptible);
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
new file mode 100644 (file)
index 0000000..e028b8e
--- /dev/null
@@ -0,0 +1,103 @@
+/* FS-Cache netfs (client) registration
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static LIST_HEAD(fscache_netfs_list);
+
+/*
+ * register a network filesystem for caching
+ */
+int __fscache_register_netfs(struct fscache_netfs *netfs)
+{
+       struct fscache_netfs *ptr;
+       int ret;
+
+       _enter("{%s}", netfs->name);
+
+       INIT_LIST_HEAD(&netfs->link);
+
+       /* allocate a cookie for the primary index */
+       netfs->primary_index =
+               kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+
+       if (!netfs->primary_index) {
+               _leave(" = -ENOMEM");
+               return -ENOMEM;
+       }
+
+       /* initialise the primary index cookie */
+       atomic_set(&netfs->primary_index->usage, 1);
+       atomic_set(&netfs->primary_index->n_children, 0);
+
+       netfs->primary_index->def               = &fscache_fsdef_netfs_def;
+       netfs->primary_index->parent            = &fscache_fsdef_index;
+       netfs->primary_index->netfs_data        = netfs;
+
+       atomic_inc(&netfs->primary_index->parent->usage);
+       atomic_inc(&netfs->primary_index->parent->n_children);
+
+       spin_lock_init(&netfs->primary_index->lock);
+       INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+
+       /* check the netfs type is not already present */
+       down_write(&fscache_addremove_sem);
+
+       ret = -EEXIST;
+       list_for_each_entry(ptr, &fscache_netfs_list, link) {
+               if (strcmp(ptr->name, netfs->name) == 0)
+                       goto already_registered;
+       }
+
+       list_add(&netfs->link, &fscache_netfs_list);
+       ret = 0;
+
+       printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
+              netfs->name);
+
+already_registered:
+       up_write(&fscache_addremove_sem);
+
+       if (ret < 0) {
+               netfs->primary_index->parent = NULL;
+               __fscache_cookie_put(netfs->primary_index);
+               netfs->primary_index = NULL;
+       }
+
+       _leave(" = %d", ret);
+       return ret;
+}
+EXPORT_SYMBOL(__fscache_register_netfs);
+
+/*
+ * unregister a network filesystem from the cache
+ * - all cookies must have been released first
+ */
+void __fscache_unregister_netfs(struct fscache_netfs *netfs)
+{
+       _enter("{%s.%u}", netfs->name, netfs->version);
+
+       down_write(&fscache_addremove_sem);
+
+       list_del(&netfs->link);
+       fscache_relinquish_cookie(netfs->primary_index, 0);
+
+       up_write(&fscache_addremove_sem);
+
+       printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
+              netfs->name);
+
+       _leave("");
+}
+EXPORT_SYMBOL(__fscache_unregister_netfs);
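
A netfs would typically call this pair from its module init and exit paths.  A minimal
sketch, assuming the fscache_register_netfs()/fscache_unregister_netfs() wrappers in
linux/fscache.h front the __fscache_*() functions above, and reusing the hypothetical
examplefs_cache_netfs declaration sketched after fsdef.c:

	static int __init examplefs_cache_init(void)
	{
		/* creates the "examplefs" entry in the FSDEF index and sets
		 * examplefs_cache_netfs.primary_index on success */
		return fscache_register_netfs(&examplefs_cache_netfs);
	}

	static void __exit examplefs_cache_exit(void)
	{
		/* every cookie acquired under the primary index must have
		 * been relinquished before this point */
		fscache_unregister_netfs(&examplefs_cache_netfs);
	}
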
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
new file mode 100644 (file)
index 0000000..392a41b
--- /dev/null
@@ -0,0 +1,810 @@
+/* FS-Cache object state machine handler
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/object.txt for a description of the
+ * object state machine and the in-kernel representations.
+ */
+
+#define FSCACHE_DEBUG_LEVEL COOKIE
+#include <linux/module.h>
+#include "internal.h"
+
+const char *fscache_object_states[] = {
+       [FSCACHE_OBJECT_INIT]           = "OBJECT_INIT",
+       [FSCACHE_OBJECT_LOOKING_UP]     = "OBJECT_LOOKING_UP",
+       [FSCACHE_OBJECT_CREATING]       = "OBJECT_CREATING",
+       [FSCACHE_OBJECT_AVAILABLE]      = "OBJECT_AVAILABLE",
+       [FSCACHE_OBJECT_ACTIVE]         = "OBJECT_ACTIVE",
+       [FSCACHE_OBJECT_UPDATING]       = "OBJECT_UPDATING",
+       [FSCACHE_OBJECT_DYING]          = "OBJECT_DYING",
+       [FSCACHE_OBJECT_LC_DYING]       = "OBJECT_LC_DYING",
+       [FSCACHE_OBJECT_ABORT_INIT]     = "OBJECT_ABORT_INIT",
+       [FSCACHE_OBJECT_RELEASING]      = "OBJECT_RELEASING",
+       [FSCACHE_OBJECT_RECYCLING]      = "OBJECT_RECYCLING",
+       [FSCACHE_OBJECT_WITHDRAWING]    = "OBJECT_WITHDRAWING",
+       [FSCACHE_OBJECT_DEAD]           = "OBJECT_DEAD",
+};
+EXPORT_SYMBOL(fscache_object_states);
+
+static void fscache_object_slow_work_put_ref(struct slow_work *);
+static int  fscache_object_slow_work_get_ref(struct slow_work *);
+static void fscache_object_slow_work_execute(struct slow_work *);
+static void fscache_initialise_object(struct fscache_object *);
+static void fscache_lookup_object(struct fscache_object *);
+static void fscache_object_available(struct fscache_object *);
+static void fscache_release_object(struct fscache_object *);
+static void fscache_withdraw_object(struct fscache_object *);
+static void fscache_enqueue_dependents(struct fscache_object *);
+static void fscache_dequeue_object(struct fscache_object *);
+
+const struct slow_work_ops fscache_object_slow_work_ops = {
+       .get_ref        = fscache_object_slow_work_get_ref,
+       .put_ref        = fscache_object_slow_work_put_ref,
+       .execute        = fscache_object_slow_work_execute,
+};
+EXPORT_SYMBOL(fscache_object_slow_work_ops);
+
+/*
+ * we need to notify the parent when an operation that we had outstanding on
+ * it completes
+ */
+static inline void fscache_done_parent_op(struct fscache_object *object)
+{
+       struct fscache_object *parent = object->parent;
+
+       _enter("OBJ%x {OBJ%x,%x}",
+              object->debug_id, parent->debug_id, parent->n_ops);
+
+       spin_lock_nested(&parent->lock, 1);
+       parent->n_ops--;
+       parent->n_obj_ops--;
+       if (parent->n_ops == 0)
+               fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+       spin_unlock(&parent->lock);
+}
+
+/*
+ * process events that have been sent to an object's state machine
+ * - initiates parent lookup
+ * - does object lookup
+ * - does object creation
+ * - does object recycling and retirement
+ * - does object withdrawal
+ */
+static void fscache_object_state_machine(struct fscache_object *object)
+{
+       enum fscache_object_state new_state;
+
+       ASSERT(object != NULL);
+
+       _enter("{OBJ%x,%s,%lx}",
+              object->debug_id, fscache_object_states[object->state],
+              object->events);
+
+       switch (object->state) {
+               /* wait for the parent object to become ready */
+       case FSCACHE_OBJECT_INIT:
+               object->event_mask =
+                       ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+               fscache_initialise_object(object);
+               goto done;
+
+               /* look up the object metadata on disk */
+       case FSCACHE_OBJECT_LOOKING_UP:
+               fscache_lookup_object(object);
+               goto lookup_transit;
+
+               /* create the object metadata on disk */
+       case FSCACHE_OBJECT_CREATING:
+               fscache_lookup_object(object);
+               goto lookup_transit;
+
+               /* handle an object becoming available; start pending
+                * operations and queue dependent operations for processing */
+       case FSCACHE_OBJECT_AVAILABLE:
+               fscache_object_available(object);
+               goto active_transit;
+
+               /* normal running state */
+       case FSCACHE_OBJECT_ACTIVE:
+               goto active_transit;
+
+               /* update the object metadata on disk */
+       case FSCACHE_OBJECT_UPDATING:
+               clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
+               fscache_stat(&fscache_n_updates_run);
+               object->cache->ops->update_object(object);
+               goto active_transit;
+
+               /* handle an object dying during lookup or creation */
+       case FSCACHE_OBJECT_LC_DYING:
+               object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+               object->cache->ops->lookup_complete(object);
+
+               spin_lock(&object->lock);
+               object->state = FSCACHE_OBJECT_DYING;
+               if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+                                      &object->cookie->flags))
+                       wake_up_bit(&object->cookie->flags,
+                                   FSCACHE_COOKIE_CREATING);
+               spin_unlock(&object->lock);
+
+               fscache_done_parent_op(object);
+
+               /* wait for completion of all active operations on this object
+                * and the death of all child objects of this object */
+       case FSCACHE_OBJECT_DYING:
+       dying:
+               clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
+               spin_lock(&object->lock);
+               _debug("dying OBJ%x {%d,%d}",
+                      object->debug_id, object->n_ops, object->n_children);
+               if (object->n_ops == 0 && object->n_children == 0) {
+                       object->event_mask &=
+                               ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+                       object->event_mask |=
+                               (1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+                               (1 << FSCACHE_OBJECT_EV_RETIRE) |
+                               (1 << FSCACHE_OBJECT_EV_RELEASE) |
+                               (1 << FSCACHE_OBJECT_EV_ERROR);
+               } else {
+                       object->event_mask &=
+                               ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+                                 (1 << FSCACHE_OBJECT_EV_RETIRE) |
+                                 (1 << FSCACHE_OBJECT_EV_RELEASE) |
+                                 (1 << FSCACHE_OBJECT_EV_ERROR));
+                       object->event_mask |=
+                               1 << FSCACHE_OBJECT_EV_CLEARED;
+               }
+               spin_unlock(&object->lock);
+               fscache_enqueue_dependents(object);
+               goto terminal_transit;
+
+               /* handle an abort during initialisation */
+       case FSCACHE_OBJECT_ABORT_INIT:
+               _debug("handle abort init %lx", object->events);
+               object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
+
+               spin_lock(&object->lock);
+               fscache_dequeue_object(object);
+
+               object->state = FSCACHE_OBJECT_DYING;
+               if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
+                                      &object->cookie->flags))
+                       wake_up_bit(&object->cookie->flags,
+                                   FSCACHE_COOKIE_CREATING);
+               spin_unlock(&object->lock);
+               goto dying;
+
+               /* handle the netfs releasing an object and possibly marking it
+                * obsolete too */
+       case FSCACHE_OBJECT_RELEASING:
+       case FSCACHE_OBJECT_RECYCLING:
+               object->event_mask &=
+                       ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+                         (1 << FSCACHE_OBJECT_EV_RETIRE) |
+                         (1 << FSCACHE_OBJECT_EV_RELEASE) |
+                         (1 << FSCACHE_OBJECT_EV_ERROR));
+               fscache_release_object(object);
+               spin_lock(&object->lock);
+               object->state = FSCACHE_OBJECT_DEAD;
+               spin_unlock(&object->lock);
+               fscache_stat(&fscache_n_object_dead);
+               goto terminal_transit;
+
+               /* handle the parent cache of this object being withdrawn from
+                * active service */
+       case FSCACHE_OBJECT_WITHDRAWING:
+               object->event_mask &=
+                       ~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
+                         (1 << FSCACHE_OBJECT_EV_RETIRE) |
+                         (1 << FSCACHE_OBJECT_EV_RELEASE) |
+                         (1 << FSCACHE_OBJECT_EV_ERROR));
+               fscache_withdraw_object(object);
+               spin_lock(&object->lock);
+               object->state = FSCACHE_OBJECT_DEAD;
+               spin_unlock(&object->lock);
+               fscache_stat(&fscache_n_object_dead);
+               goto terminal_transit;
+
+               /* complain about the object being woken up once it is
+                * deceased */
+       case FSCACHE_OBJECT_DEAD:
+               printk(KERN_ERR "FS-Cache:"
+                      " Unexpected event in dead state %lx\n",
+                      object->events & object->event_mask);
+               BUG();
+
+       default:
+               printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
+                      object->state);
+               BUG();
+       }
+
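+       /* Note on the transition switches below: fls() returns the 1-based
+        * position of the highest set bit (0 if no bits are set), so
+        * "fls(object->events & object->event_mask) - 1" selects the
+        * highest-numbered pending, unmasked event, or -1 ("sleep until
+        * event") when there is nothing left to process.
+        */
+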
+       /* determine the transition from a lookup state */
+lookup_transit:
+       switch (fls(object->events & object->event_mask) - 1) {
+       case FSCACHE_OBJECT_EV_WITHDRAW:
+       case FSCACHE_OBJECT_EV_RETIRE:
+       case FSCACHE_OBJECT_EV_RELEASE:
+       case FSCACHE_OBJECT_EV_ERROR:
+               new_state = FSCACHE_OBJECT_LC_DYING;
+               goto change_state;
+       case FSCACHE_OBJECT_EV_REQUEUE:
+               goto done;
+       case -1:
+               goto done; /* sleep until event */
+       default:
+               goto unsupported_event;
+       }
+
+       /* determine the transition from an active state */
+active_transit:
+       switch (fls(object->events & object->event_mask) - 1) {
+       case FSCACHE_OBJECT_EV_WITHDRAW:
+       case FSCACHE_OBJECT_EV_RETIRE:
+       case FSCACHE_OBJECT_EV_RELEASE:
+       case FSCACHE_OBJECT_EV_ERROR:
+               new_state = FSCACHE_OBJECT_DYING;
+               goto change_state;
+       case FSCACHE_OBJECT_EV_UPDATE:
+               new_state = FSCACHE_OBJECT_UPDATING;
+               goto change_state;
+       case -1:
+               new_state = FSCACHE_OBJECT_ACTIVE;
+               goto change_state; /* sleep until event */
+       default:
+               goto unsupported_event;
+       }
+
+       /* determine the transition from a terminal state */
+terminal_transit:
+       switch (fls(object->events & object->event_mask) - 1) {
+       case FSCACHE_OBJECT_EV_WITHDRAW:
+               new_state = FSCACHE_OBJECT_WITHDRAWING;
+               goto change_state;
+       case FSCACHE_OBJECT_EV_RETIRE:
+               new_state = FSCACHE_OBJECT_RECYCLING;
+               goto change_state;
+       case FSCACHE_OBJECT_EV_RELEASE:
+               new_state = FSCACHE_OBJECT_RELEASING;
+               goto change_state;
+       case FSCACHE_OBJECT_EV_ERROR:
+               new_state = FSCACHE_OBJECT_WITHDRAWING;
+               goto change_state;
+       case FSCACHE_OBJECT_EV_CLEARED:
+               new_state = FSCACHE_OBJECT_DYING;
+               goto change_state;
+       case -1:
+               goto done; /* sleep until event */
+       default:
+               goto unsupported_event;
+       }
+
+change_state:
+       spin_lock(&object->lock);
+       object->state = new_state;
+       spin_unlock(&object->lock);
+
+done:
+       _leave(" [->%s]", fscache_object_states[object->state]);
+       return;
+
+unsupported_event:
+       printk(KERN_ERR "FS-Cache:"
+              " Unsupported event %lx [mask %lx] in state %s\n",
+              object->events, object->event_mask,
+              fscache_object_states[object->state]);
+       BUG();
+}
+
+/*
+ * execute an object
+ */
+static void fscache_object_slow_work_execute(struct slow_work *work)
+{
+       struct fscache_object *object =
+               container_of(work, struct fscache_object, work);
+       unsigned long start;
+
+       _enter("{OBJ%x}", object->debug_id);
+
+       clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+
+       start = jiffies;
+       fscache_object_state_machine(object);
+       fscache_hist(fscache_objs_histogram, start);
+       if (object->events & object->event_mask)
+               fscache_enqueue_object(object);
+}
+
+/*
+ * initialise an object
+ * - check the specified object's parent to see if we can make use of it
+ *   immediately to do a creation
+ * - we may need to start the process of creating a parent and we need to wait
+ *   for the parent's lookup and creation to complete if it's not there yet
+ * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
+ *   leaf-most cookies of the object and all its children
+ */
+static void fscache_initialise_object(struct fscache_object *object)
+{
+       struct fscache_object *parent;
+
+       _enter("");
+       ASSERT(object->cookie != NULL);
+       ASSERT(object->cookie->parent != NULL);
+       ASSERT(list_empty(&object->work.link));
+
+       if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
+                             (1 << FSCACHE_OBJECT_EV_RELEASE) |
+                             (1 << FSCACHE_OBJECT_EV_RETIRE) |
+                             (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
+               _debug("abort init %lx", object->events);
+               spin_lock(&object->lock);
+               object->state = FSCACHE_OBJECT_ABORT_INIT;
+               spin_unlock(&object->lock);
+               return;
+       }
+
+       spin_lock(&object->cookie->lock);
+       spin_lock_nested(&object->cookie->parent->lock, 1);
+
+       parent = object->parent;
+       if (!parent) {
+               _debug("no parent");
+               set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+       } else {
+               spin_lock(&object->lock);
+               spin_lock_nested(&parent->lock, 1);
+               _debug("parent %s", fscache_object_states[parent->state]);
+
+               if (parent->state >= FSCACHE_OBJECT_DYING) {
+                       _debug("bad parent");
+                       set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+               } else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
+                       _debug("wait");
+
+                       /* we may get woken up in this state by child objects
+                        * binding on to us, so we need to make sure we don't
+                        * add ourself to the list multiple times */
+                       if (list_empty(&object->dep_link)) {
+                               object->cache->ops->grab_object(object);
+                               list_add(&object->dep_link,
+                                        &parent->dependents);
+
+                               /* fscache_acquire_non_index_cookie() uses this
+                                * to wake the chain up */
+                               if (parent->state == FSCACHE_OBJECT_INIT)
+                                       fscache_enqueue_object(parent);
+                       }
+               } else {
+                       _debug("go");
+                       parent->n_ops++;
+                       parent->n_obj_ops++;
+                       object->lookup_jif = jiffies;
+                       object->state = FSCACHE_OBJECT_LOOKING_UP;
+                       set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+               }
+
+               spin_unlock(&parent->lock);
+               spin_unlock(&object->lock);
+       }
+
+       spin_unlock(&object->cookie->parent->lock);
+       spin_unlock(&object->cookie->lock);
+       _leave("");
+}
+
+/*
+ * look an object up in the cache from which it was allocated
+ * - we hold an "access lock" on the parent object, so the parent object cannot
+ *   be withdrawn by either party till we've finished
+ * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
+ *   leaf-most cookies of the object and all its children
+ */
+static void fscache_lookup_object(struct fscache_object *object)
+{
+       struct fscache_cookie *cookie = object->cookie;
+       struct fscache_object *parent;
+
+       _enter("");
+
+       parent = object->parent;
+       ASSERT(parent != NULL);
+       ASSERTCMP(parent->n_ops, >, 0);
+       ASSERTCMP(parent->n_obj_ops, >, 0);
+
+       /* make sure the parent is still available */
+       ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
+
+       if (parent->state >= FSCACHE_OBJECT_DYING ||
+           test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+               _debug("unavailable");
+               set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
+               _leave("");
+               return;
+       }
+
+       _debug("LOOKUP \"%s/%s\" in \"%s\"",
+              parent->cookie->def->name, cookie->def->name,
+              object->cache->tag->name);
+
+       fscache_stat(&fscache_n_object_lookups);
+       object->cache->ops->lookup_object(object);
+
+       if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
+               set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+
+       _leave("");
+}
+
+/**
+ * fscache_object_lookup_negative - Note negative cookie lookup
+ * @object: Object pointing to cookie to mark
+ *
+ * Note negative lookup, permitting those waiting to read data from an already
+ * existing backing object to continue as there's no data for them to read.
+ */
+void fscache_object_lookup_negative(struct fscache_object *object)
+{
+       struct fscache_cookie *cookie = object->cookie;
+
+       _enter("{OBJ%x,%s}",
+              object->debug_id, fscache_object_states[object->state]);
+
+       spin_lock(&object->lock);
+       if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+               fscache_stat(&fscache_n_object_lookups_negative);
+
+               /* transit here to allow write requests to begin stacking up
+                * and read requests to begin returning ENODATA */
+               object->state = FSCACHE_OBJECT_CREATING;
+               spin_unlock(&object->lock);
+
+               set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+               set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+               _debug("wake up lookup %p", &cookie->flags);
+               smp_mb__before_clear_bit();
+               clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+               smp_mb__after_clear_bit();
+               wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+               set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+       } else {
+               ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
+               spin_unlock(&object->lock);
+       }
+
+       _leave("");
+}
+EXPORT_SYMBOL(fscache_object_lookup_negative);
+
+/**
+ * fscache_obtained_object - Note successful object lookup or creation
+ * @object: Object pointing to cookie to mark
+ *
+ * Note successful lookup and/or creation, permitting those waiting to write
+ * data to a backing object to continue.
+ *
+ * Note that after calling this, an object's cookie may be relinquished by the
+ * netfs, and so must be accessed with object lock held.
+ */
+void fscache_obtained_object(struct fscache_object *object)
+{
+       struct fscache_cookie *cookie = object->cookie;
+
+       _enter("{OBJ%x,%s}",
+              object->debug_id, fscache_object_states[object->state]);
+
+       /* if we were still looking up, then we must have a positive lookup
+        * result, in which case there may be data available */
+       spin_lock(&object->lock);
+       if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+               fscache_stat(&fscache_n_object_lookups_positive);
+
+               clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+               object->state = FSCACHE_OBJECT_AVAILABLE;
+               spin_unlock(&object->lock);
+
+               smp_mb__before_clear_bit();
+               clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
+               smp_mb__after_clear_bit();
+               wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+               set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+       } else {
+               ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
+               fscache_stat(&fscache_n_object_created);
+
+               object->state = FSCACHE_OBJECT_AVAILABLE;
+               spin_unlock(&object->lock);
+               set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+               smp_wmb();
+       }
+
+       if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
+               wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
+
+       _leave("");
+}
+EXPORT_SYMBOL(fscache_obtained_object);
+
+/*
+ * handle an object that has just become available
+ */
+static void fscache_object_available(struct fscache_object *object)
+{
+       _enter("{OBJ%x}", object->debug_id);
+
+       spin_lock(&object->lock);
+
+       if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
+               wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+
+       fscache_done_parent_op(object);
+       if (object->n_in_progress == 0) {
+               if (object->n_ops > 0) {
+                       ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
+                       ASSERTIF(object->n_ops > object->n_obj_ops,
+                                !list_empty(&object->pending_ops));
+                       fscache_start_operations(object);
+               } else {
+                       ASSERT(list_empty(&object->pending_ops));
+               }
+       }
+       spin_unlock(&object->lock);
+
+       object->cache->ops->lookup_complete(object);
+       fscache_enqueue_dependents(object);
+
+       fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
+       fscache_stat(&fscache_n_object_avail);
+
+       _leave("");
+}
+
+/*
+ * drop an object's attachments
+ */
+static void fscache_drop_object(struct fscache_object *object)
+{
+       struct fscache_object *parent = object->parent;
+       struct fscache_cache *cache = object->cache;
+
+       _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+
+       spin_lock(&cache->object_list_lock);
+       list_del_init(&object->cache_link);
+       spin_unlock(&cache->object_list_lock);
+
+       cache->ops->drop_object(object);
+
+       if (parent) {
+               _debug("release parent OBJ%x {%d}",
+                      parent->debug_id, parent->n_children);
+
+               spin_lock(&parent->lock);
+               parent->n_children--;
+               if (parent->n_children == 0)
+                       fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+               spin_unlock(&parent->lock);
+               object->parent = NULL;
+       }
+
+       /* this just shifts the object release to the slow work processor */
+       object->cache->ops->put_object(object);
+
+       _leave("");
+}
+
+/*
+ * release or recycle an object that the netfs has discarded
+ */
+static void fscache_release_object(struct fscache_object *object)
+{
+       _enter("");
+
+       fscache_drop_object(object);
+}
+
+/*
+ * withdraw an object from active service
+ */
+static void fscache_withdraw_object(struct fscache_object *object)
+{
+       struct fscache_cookie *cookie;
+       bool detached;
+
+       _enter("");
+
+       spin_lock(&object->lock);
+       cookie = object->cookie;
+       if (cookie) {
+               /* need to get the cookie lock before the object lock, starting
+                * from the object pointer */
+               atomic_inc(&cookie->usage);
+               spin_unlock(&object->lock);
+
+               detached = false;
+               spin_lock(&cookie->lock);
+               spin_lock(&object->lock);
+
+               if (object->cookie == cookie) {
+                       hlist_del_init(&object->cookie_link);
+                       object->cookie = NULL;
+                       detached = true;
+               }
+               spin_unlock(&cookie->lock);
+               fscache_cookie_put(cookie);
+               if (detached)
+                       fscache_cookie_put(cookie);
+       }
+
+       spin_unlock(&object->lock);
+
+       fscache_drop_object(object);
+}
+
+/*
+ * withdraw an object from active service at the behest of the cache
+ * - need to break the links to a cached object cookie
+ * - called under two situations:
+ *   (1) recycler decides to reclaim an in-use object
+ *   (2) a cache is unmounted
+ * - have to take care as the cookie may be relinquished by the netfs at the
+ *   same time
+ * - the object is pinned by the caller holding a refcount on it
+ */
+void fscache_withdrawing_object(struct fscache_cache *cache,
+                               struct fscache_object *object)
+{
+       bool enqueue = false;
+
+       _enter(",OBJ%x", object->debug_id);
+
+       spin_lock(&object->lock);
+       if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
+               object->state = FSCACHE_OBJECT_WITHDRAWING;
+               enqueue = true;
+       }
+       spin_unlock(&object->lock);
+
+       if (enqueue)
+               fscache_enqueue_object(object);
+
+       _leave("");
+}
+
+/*
+ * allow the slow work item processor to get a ref on an object
+ */
+static int fscache_object_slow_work_get_ref(struct slow_work *work)
+{
+       struct fscache_object *object =
+               container_of(work, struct fscache_object, work);
+
+       return object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
+}
+
+/*
+ * allow the slow work item processor to discard a ref on a work item
+ */
+static void fscache_object_slow_work_put_ref(struct slow_work *work)
+{
+       struct fscache_object *object =
+               container_of(work, struct fscache_object, work);
+
+       object->cache->ops->put_object(object);
+}
+
+/*
+ * enqueue an object for metadata-type processing
+ */
+void fscache_enqueue_object(struct fscache_object *object)
+{
+       _enter("{OBJ%x}", object->debug_id);
+
+       slow_work_enqueue(&object->work);
+}
+
+/*
+ * enqueue the dependents of an object for metadata-type processing
+ * - the caller must hold the object's lock
+ * - this may cause an already locked object to wind up being processed again
+ */
+static void fscache_enqueue_dependents(struct fscache_object *object)
+{
+       struct fscache_object *dep;
+
+       _enter("{OBJ%x}", object->debug_id);
+
+       if (list_empty(&object->dependents))
+               return;
+
+       spin_lock(&object->lock);
+
+       while (!list_empty(&object->dependents)) {
+               dep = list_entry(object->dependents.next,
+                                struct fscache_object, dep_link);
+               list_del_init(&dep->dep_link);
+
+               /* sort onto appropriate lists */
+               fscache_enqueue_object(dep);
+               dep->cache->ops->put_object(dep);
+
+               if (!list_empty(&object->dependents))
+                       cond_resched_lock(&object->lock);
+       }
+
+       spin_unlock(&object->lock);
+}
+
+/*
+ * remove an object from whatever queue it's waiting on
+ * - the caller must hold object->lock
+ */
+void fscache_dequeue_object(struct fscache_object *object)
+{
+       _enter("{OBJ%x}", object->debug_id);
+
+       if (!list_empty(&object->dep_link)) {
+               spin_lock(&object->parent->lock);
+               list_del_init(&object->dep_link);
+               spin_unlock(&object->parent->lock);
+       }
+
+       _leave("");
+}
+
+/**
+ * fscache_check_aux - Ask the netfs whether an object on disk is still valid
+ * @object: The object to ask about
+ * @data: The auxiliary data for the object
+ * @datalen: The size of the auxiliary data
+ *
+ * This function consults the netfs about the coherency state of an object
+ */
+enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
+                                       const void *data, uint16_t datalen)
+{
+       enum fscache_checkaux result;
+
+       if (!object->cookie->def->check_aux) {
+               fscache_stat(&fscache_n_checkaux_none);
+               return FSCACHE_CHECKAUX_OKAY;
+       }
+
+       result = object->cookie->def->check_aux(object->cookie->netfs_data,
+                                               data, datalen);
+       switch (result) {
+               /* entry okay as is */
+       case FSCACHE_CHECKAUX_OKAY:
+               fscache_stat(&fscache_n_checkaux_okay);
+               break;
+
+               /* entry requires update */
+       case FSCACHE_CHECKAUX_NEEDS_UPDATE:
+               fscache_stat(&fscache_n_checkaux_update);
+               break;
+
+               /* entry requires deletion */
+       case FSCACHE_CHECKAUX_OBSOLETE:
+               fscache_stat(&fscache_n_checkaux_obsolete);
+               break;
+
+       default:
+               BUG();
+       }
+
+       return result;
+}
+EXPORT_SYMBOL(fscache_check_aux);
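fscache_check_aux() simply drives whatever check_aux callback the netfs registered in its struct fscache_cookie_def. As a hedged illustration of the callback side (not part of this patch; the myfs_* names and the choice of auxiliary data are hypothetical), a netfs that stores an mtime/size pair as auxiliary data might implement it roughly as:

	struct myfs_inode { u64 server_mtime; u64 server_size; };
	struct myfs_cache_aux { u64 mtime; u64 size; };

	static enum fscache_checkaux myfs_check_aux(void *cookie_netfs_data,
						    const void *data, uint16_t datalen)
	{
		const struct myfs_inode *mi = cookie_netfs_data; /* set when the cookie was acquired */
		const struct myfs_cache_aux *aux = data;

		if (datalen != sizeof(*aux))
			return FSCACHE_CHECKAUX_OBSOLETE;	/* unusable: discard the object */

		if (aux->mtime != mi->server_mtime || aux->size != mi->server_size)
			return FSCACHE_CHECKAUX_NEEDS_UPDATE;	/* stale, but the entry can be refreshed */

		return FSCACHE_CHECKAUX_OKAY;			/* cached data is coherent */
	}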
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
new file mode 100644
index 0000000..e7f8d53
--- /dev/null
@@ -0,0 +1,459 @@
+/* FS-Cache worker operation management routines
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * See Documentation/filesystems/caching/operations.txt
+ */
+
+#define FSCACHE_DEBUG_LEVEL OPERATION
+#include <linux/module.h>
+#include "internal.h"
+
+atomic_t fscache_op_debug_id;
+EXPORT_SYMBOL(fscache_op_debug_id);
+
+/**
+ * fscache_enqueue_operation - Enqueue an operation for processing
+ * @op: The operation to enqueue
+ *
+ * Enqueue an operation for processing by the FS-Cache thread pool.
+ *
+ * This will get its own ref on the object.
+ */
+void fscache_enqueue_operation(struct fscache_operation *op)
+{
+       _enter("{OBJ%x OP%x,%u}",
+              op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+       ASSERT(op->processor != NULL);
+       ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
+       ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+       if (list_empty(&op->pend_link)) {
+               switch (op->flags & FSCACHE_OP_TYPE) {
+               case FSCACHE_OP_FAST:
+                       _debug("queue fast");
+                       atomic_inc(&op->usage);
+                       if (!schedule_work(&op->fast_work))
+                               fscache_put_operation(op);
+                       break;
+               case FSCACHE_OP_SLOW:
+                       _debug("queue slow");
+                       slow_work_enqueue(&op->slow_work);
+                       break;
+               case FSCACHE_OP_MYTHREAD:
+                       _debug("queue for caller's attention");
+                       break;
+               default:
+                       printk(KERN_ERR "FS-Cache: Unexpected op type %lx\n",
+                              op->flags);
+                       BUG();
+                       break;
+               }
+               fscache_stat(&fscache_n_op_enqueue);
+       }
+}
+EXPORT_SYMBOL(fscache_enqueue_operation);
+
+/*
+ * start an op running
+ */
+static void fscache_run_op(struct fscache_object *object,
+                          struct fscache_operation *op)
+{
+       object->n_in_progress++;
+       if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+               wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+       if (op->processor)
+               fscache_enqueue_operation(op);
+       fscache_stat(&fscache_n_op_run);
+}
+
+/*
+ * submit an exclusive operation for an object
+ * - other ops are excluded from running simultaneously with this one
+ * - this gets any extra refs it needs on an op
+ */
+int fscache_submit_exclusive_op(struct fscache_object *object,
+                               struct fscache_operation *op)
+{
+       int ret;
+
+       _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+
+       spin_lock(&object->lock);
+       ASSERTCMP(object->n_ops, >=, object->n_in_progress);
+       ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+
+       ret = -ENOBUFS;
+       if (fscache_object_is_active(object)) {
+               op->object = object;
+               object->n_ops++;
+               object->n_exclusive++;  /* reads and writes must wait */
+
+               if (object->n_ops > 0) {
+                       atomic_inc(&op->usage);
+                       list_add_tail(&op->pend_link, &object->pending_ops);
+                       fscache_stat(&fscache_n_op_pend);
+               } else if (!list_empty(&object->pending_ops)) {
+                       atomic_inc(&op->usage);
+                       list_add_tail(&op->pend_link, &object->pending_ops);
+                       fscache_stat(&fscache_n_op_pend);
+                       fscache_start_operations(object);
+               } else {
+                       ASSERTCMP(object->n_in_progress, ==, 0);
+                       fscache_run_op(object, op);
+               }
+
+               /* need to issue a new write op after this */
+               clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+               ret = 0;
+       } else if (object->state == FSCACHE_OBJECT_CREATING) {
+               op->object = object;
+               object->n_ops++;
+               object->n_exclusive++;  /* reads and writes must wait */
+               atomic_inc(&op->usage);
+               list_add_tail(&op->pend_link, &object->pending_ops);
+               fscache_stat(&fscache_n_op_pend);
+               ret = 0;
+       } else {
+               /* not allowed to submit ops in any other state */
+               BUG();
+       }
+
+       spin_unlock(&object->lock);
+       return ret;
+}
+
+/*
+ * report an unexpected submission
+ */
+static void fscache_report_unexpected_submission(struct fscache_object *object,
+                                                struct fscache_operation *op,
+                                                unsigned long ostate)
+{
+       static bool once_only;
+       struct fscache_operation *p;
+       unsigned n;
+
+       if (once_only)
+               return;
+       once_only = true;
+
+       kdebug("unexpected submission OP%x [OBJ%x %s]",
+              op->debug_id, object->debug_id,
+              fscache_object_states[object->state]);
+       kdebug("objstate=%s [%s]",
+              fscache_object_states[object->state],
+              fscache_object_states[ostate]);
+       kdebug("objflags=%lx", object->flags);
+       kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
+       kdebug("ops=%u inp=%u exc=%u",
+              object->n_ops, object->n_in_progress, object->n_exclusive);
+
+       if (!list_empty(&object->pending_ops)) {
+               n = 0;
+               list_for_each_entry(p, &object->pending_ops, pend_link) {
+                       ASSERTCMP(p->object, ==, object);
+                       kdebug("%p %p", p->processor, p->release);
+                       n++;
+               }
+
+               kdebug("n=%u", n);
+       }
+
+       dump_stack();
+}
+
+/*
+ * submit an operation for an object
+ * - ops may be submitted only while the object is in the following states:
+ *   - during object creation (write ops may be submitted)
+ *   - whilst the object is active
+ *   - after an I/O error incurred in one of the two above states (op rejected)
+ * - this gets any extra refs it needs on an op
+ */
+int fscache_submit_op(struct fscache_object *object,
+                     struct fscache_operation *op)
+{
+       unsigned long ostate;
+       int ret;
+
+       _enter("{OBJ%x OP%x},{%u}",
+              object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+       ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+       spin_lock(&object->lock);
+       ASSERTCMP(object->n_ops, >=, object->n_in_progress);
+       ASSERTCMP(object->n_ops, >=, object->n_exclusive);
+
+       ostate = object->state;
+       smp_rmb();
+
+       if (fscache_object_is_active(object)) {
+               op->object = object;
+               object->n_ops++;
+
+               if (object->n_exclusive > 0) {
+                       atomic_inc(&op->usage);
+                       list_add_tail(&op->pend_link, &object->pending_ops);
+                       fscache_stat(&fscache_n_op_pend);
+               } else if (!list_empty(&object->pending_ops)) {
+                       atomic_inc(&op->usage);
+                       list_add_tail(&op->pend_link, &object->pending_ops);
+                       fscache_stat(&fscache_n_op_pend);
+                       fscache_start_operations(object);
+               } else {
+                       ASSERTCMP(object->n_exclusive, ==, 0);
+                       fscache_run_op(object, op);
+               }
+               ret = 0;
+       } else if (object->state == FSCACHE_OBJECT_CREATING) {
+               op->object = object;
+               object->n_ops++;
+               atomic_inc(&op->usage);
+               list_add_tail(&op->pend_link, &object->pending_ops);
+               fscache_stat(&fscache_n_op_pend);
+               ret = 0;
+       } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+               fscache_report_unexpected_submission(object, op, ostate);
+               ASSERT(!fscache_object_is_active(object));
+               ret = -ENOBUFS;
+       } else {
+               ret = -ENOBUFS;
+       }
+
+       spin_unlock(&object->lock);
+       return ret;
+}
+
+/*
+ * queue an object for withdrawal on error, aborting all following asynchronous
+ * operations
+ */
+void fscache_abort_object(struct fscache_object *object)
+{
+       _enter("{OBJ%x}", object->debug_id);
+
+       fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+}
+
+/*
+ * jump start the operation processing on an object
+ * - caller must hold object->lock
+ */
+void fscache_start_operations(struct fscache_object *object)
+{
+       struct fscache_operation *op;
+       bool stop = false;
+
+       while (!list_empty(&object->pending_ops) && !stop) {
+               op = list_entry(object->pending_ops.next,
+                               struct fscache_operation, pend_link);
+
+               if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+                       if (object->n_in_progress > 0)
+                               break;
+                       stop = true;
+               }
+               list_del_init(&op->pend_link);
+               object->n_in_progress++;
+
+               if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+                       wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+               if (op->processor)
+                       fscache_enqueue_operation(op);
+
+               /* the pending queue was holding a ref on the object */
+               fscache_put_operation(op);
+       }
+
+       ASSERTCMP(object->n_in_progress, <=, object->n_ops);
+
+       _debug("woke %d ops on OBJ%x",
+              object->n_in_progress, object->debug_id);
+}
+
+/*
+ * release an operation
+ * - queues pending ops if this is the last in-progress op
+ */
+void fscache_put_operation(struct fscache_operation *op)
+{
+       struct fscache_object *object;
+       struct fscache_cache *cache;
+
+       _enter("{OBJ%x OP%x,%d}",
+              op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+       ASSERTCMP(atomic_read(&op->usage), >, 0);
+
+       if (!atomic_dec_and_test(&op->usage))
+               return;
+
+       _debug("PUT OP");
+       if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
+               BUG();
+
+       fscache_stat(&fscache_n_op_release);
+
+       if (op->release) {
+               op->release(op);
+               op->release = NULL;
+       }
+
+       object = op->object;
+
+       /* now... we may get called with the object spinlock held, so we
+        * complete the cleanup here only if we can immediately acquire the
+        * lock, and defer it otherwise */
+       if (!spin_trylock(&object->lock)) {
+               _debug("defer put");
+               fscache_stat(&fscache_n_op_deferred_release);
+
+               cache = object->cache;
+               spin_lock(&cache->op_gc_list_lock);
+               list_add_tail(&op->pend_link, &cache->op_gc_list);
+               spin_unlock(&cache->op_gc_list_lock);
+               schedule_work(&cache->op_gc);
+               _leave(" [defer]");
+               return;
+       }
+
+       if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+               ASSERTCMP(object->n_exclusive, >, 0);
+               object->n_exclusive--;
+       }
+
+       ASSERTCMP(object->n_in_progress, >, 0);
+       object->n_in_progress--;
+       if (object->n_in_progress == 0)
+               fscache_start_operations(object);
+
+       ASSERTCMP(object->n_ops, >, 0);
+       object->n_ops--;
+       if (object->n_ops == 0)
+               fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
+
+       spin_unlock(&object->lock);
+
+       kfree(op);
+       _leave(" [done]");
+}
+EXPORT_SYMBOL(fscache_put_operation);
+
+/*
+ * garbage collect operations that have had their release deferred
+ */
+void fscache_operation_gc(struct work_struct *work)
+{
+       struct fscache_operation *op;
+       struct fscache_object *object;
+       struct fscache_cache *cache =
+               container_of(work, struct fscache_cache, op_gc);
+       int count = 0;
+
+       _enter("");
+
+       do {
+               spin_lock(&cache->op_gc_list_lock);
+               if (list_empty(&cache->op_gc_list)) {
+                       spin_unlock(&cache->op_gc_list_lock);
+                       break;
+               }
+
+               op = list_entry(cache->op_gc_list.next,
+                               struct fscache_operation, pend_link);
+               list_del(&op->pend_link);
+               spin_unlock(&cache->op_gc_list_lock);
+
+               object = op->object;
+
+               _debug("GC DEFERRED REL OBJ%x OP%x",
+                      object->debug_id, op->debug_id);
+               fscache_stat(&fscache_n_op_gc);
+
+               ASSERTCMP(atomic_read(&op->usage), ==, 0);
+
+               spin_lock(&object->lock);
+               if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
+                       ASSERTCMP(object->n_exclusive, >, 0);
+                       object->n_exclusive--;
+               }
+
+               ASSERTCMP(object->n_in_progress, >, 0);
+               object->n_in_progress--;
+               if (object->n_in_progress == 0)
+                       fscache_start_operations(object);
+
+               ASSERTCMP(object->n_ops, >, 0);
+               object->n_ops--;
+               if (object->n_ops == 0)
+                       fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
+
+               spin_unlock(&object->lock);
+
+       } while (count++ < 20);
+
+       if (!list_empty(&cache->op_gc_list))
+               schedule_work(&cache->op_gc);
+
+       _leave("");
+}
+
+/*
+ * allow the slow work item processor to get a ref on an operation
+ */
+static int fscache_op_get_ref(struct slow_work *work)
+{
+       struct fscache_operation *op =
+               container_of(work, struct fscache_operation, slow_work);
+
+       atomic_inc(&op->usage);
+       return 0;
+}
+
+/*
+ * allow the slow work item processor to discard a ref on an operation
+ */
+static void fscache_op_put_ref(struct slow_work *work)
+{
+       struct fscache_operation *op =
+               container_of(work, struct fscache_operation, slow_work);
+
+       fscache_put_operation(op);
+}
+
+/*
+ * execute an operation using the slow thread pool to provide processing context
+ * - the caller holds a ref to this object, so we don't need to hold one
+ */
+static void fscache_op_execute(struct slow_work *work)
+{
+       struct fscache_operation *op =
+               container_of(work, struct fscache_operation, slow_work);
+       unsigned long start;
+
+       _enter("{OBJ%x OP%x,%d}",
+              op->object->debug_id, op->debug_id, atomic_read(&op->usage));
+
+       ASSERT(op->processor != NULL);
+       start = jiffies;
+       op->processor(op);
+       fscache_hist(fscache_ops_histogram, start);
+
+       _leave("");
+}
+
+const struct slow_work_ops fscache_op_slow_work_ops = {
+       .get_ref        = fscache_op_get_ref,
+       .put_ref        = fscache_op_put_ref,
+       .execute        = fscache_op_execute,
+};
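The FSCACHE_OP_FAST, FSCACHE_OP_SLOW and FSCACHE_OP_MYTHREAD cases handled by fscache_enqueue_operation() correspond to how the submitter initialised the operation. A minimal sketch of the slow-work flavour, mirroring the attribute-change path in page.c below (op_processor and the enclosing function are assumed for illustration, not lifted from the patch):

	struct fscache_operation *op;

	/* set up an op whose processor runs in the slow work pool and which
	 * excludes all other ops on the object while it is in progress */
	op = kzalloc(sizeof(*op), GFP_KERNEL);
	if (!op)
		return -ENOMEM;

	fscache_operation_init(op, NULL);		/* no release callback needed here */
	fscache_operation_init_slow(op, op_processor);	/* assumed processor function */
	op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);

	if (fscache_submit_exclusive_op(object, op) < 0) {
		kfree(op);		/* never submitted, so no references to drop */
		return -ENOBUFS;
	}
	fscache_put_operation(op);	/* submission took its own references */
	return 0;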
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
new file mode 100644
index 0000000..2568e0e
--- /dev/null
@@ -0,0 +1,816 @@
+/* Cache page management and data I/O routines
+ *
+ * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL PAGE
+#include <linux/module.h>
+#include <linux/fscache-cache.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * check to see if a page is being written to the cache
+ */
+bool __fscache_check_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+       void *val;
+
+       rcu_read_lock();
+       val = radix_tree_lookup(&cookie->stores, page->index);
+       rcu_read_unlock();
+
+       return val != NULL;
+}
+EXPORT_SYMBOL(__fscache_check_page_write);
+
+/*
+ * wait for a page to finish being written to the cache
+ */
+void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+       wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+
+       wait_event(*wq, !__fscache_check_page_write(cookie, page));
+}
+EXPORT_SYMBOL(__fscache_wait_on_page_write);
+
+/*
+ * note that a page has finished being written to the cache
+ */
+static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page)
+{
+       struct page *xpage;
+
+       spin_lock(&cookie->lock);
+       xpage = radix_tree_delete(&cookie->stores, page->index);
+       spin_unlock(&cookie->lock);
+       ASSERT(xpage != NULL);
+
+       wake_up_bit(&cookie->flags, 0);
+}
+
+/*
+ * actually apply the changed attributes to a cache object
+ */
+static void fscache_attr_changed_op(struct fscache_operation *op)
+{
+       struct fscache_object *object = op->object;
+
+       _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
+
+       fscache_stat(&fscache_n_attr_changed_calls);
+
+       if (fscache_object_is_active(object) &&
+           object->cache->ops->attr_changed(object) < 0)
+               fscache_abort_object(object);
+
+       _leave("");
+}
+
+/*
+ * notification that the attributes on an object have changed
+ */
+int __fscache_attr_changed(struct fscache_cookie *cookie)
+{
+       struct fscache_operation *op;
+       struct fscache_object *object;
+
+       _enter("%p", cookie);
+
+       ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+
+       fscache_stat(&fscache_n_attr_changed);
+
+       op = kzalloc(sizeof(*op), GFP_KERNEL);
+       if (!op) {
+               fscache_stat(&fscache_n_attr_changed_nomem);
+               _leave(" = -ENOMEM");
+               return -ENOMEM;
+       }
+
+       fscache_operation_init(op, NULL);
+       fscache_operation_init_slow(op, fscache_attr_changed_op);
+       op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
+
+       spin_lock(&cookie->lock);
+
+       if (hlist_empty(&cookie->backing_objects))
+               goto nobufs;
+       object = hlist_entry(cookie->backing_objects.first,
+                            struct fscache_object, cookie_link);
+
+       if (fscache_submit_exclusive_op(object, op) < 0)
+               goto nobufs;
+       spin_unlock(&cookie->lock);
+       fscache_stat(&fscache_n_attr_changed_ok);
+       fscache_put_operation(op);
+       _leave(" = 0");
+       return 0;
+
+nobufs:
+       spin_unlock(&cookie->lock);
+       kfree(op);
+       fscache_stat(&fscache_n_attr_changed_nobufs);
+       _leave(" = %d", -ENOBUFS);
+       return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_attr_changed);
+
+/*
+ * handle secondary execution given to a retrieval op on behalf of the
+ * cache
+ */
+static void fscache_retrieval_work(struct work_struct *work)
+{
+       struct fscache_retrieval *op =
+               container_of(work, struct fscache_retrieval, op.fast_work);
+       unsigned long start;
+
+       _enter("{OP%x}", op->op.debug_id);
+
+       start = jiffies;
+       op->op.processor(&op->op);
+       fscache_hist(fscache_ops_histogram, start);
+       fscache_put_operation(&op->op);
+}
+
+/*
+ * release a retrieval op reference
+ */
+static void fscache_release_retrieval_op(struct fscache_operation *_op)
+{
+       struct fscache_retrieval *op =
+               container_of(_op, struct fscache_retrieval, op);
+
+       _enter("{OP%x}", op->op.debug_id);
+
+       fscache_hist(fscache_retrieval_histogram, op->start_time);
+       if (op->context)
+               fscache_put_context(op->op.object->cookie, op->context);
+
+       _leave("");
+}
+
+/*
+ * allocate a retrieval op
+ */
+static struct fscache_retrieval *fscache_alloc_retrieval(
+       struct address_space *mapping,
+       fscache_rw_complete_t end_io_func,
+       void *context)
+{
+       struct fscache_retrieval *op;
+
+       /* allocate a retrieval operation and attempt to submit it */
+       op = kzalloc(sizeof(*op), GFP_NOIO);
+       if (!op) {
+               fscache_stat(&fscache_n_retrievals_nomem);
+               return NULL;
+       }
+
+       fscache_operation_init(&op->op, fscache_release_retrieval_op);
+       op->op.flags    = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
+       op->mapping     = mapping;
+       op->end_io_func = end_io_func;
+       op->context     = context;
+       op->start_time  = jiffies;
+       INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
+       INIT_LIST_HEAD(&op->to_do);
+       return op;
+}
+
+/*
+ * wait for a deferred lookup to complete
+ */
+static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
+{
+       unsigned long jif;
+
+       _enter("");
+
+       if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) {
+               _leave(" = 0 [imm]");
+               return 0;
+       }
+
+       fscache_stat(&fscache_n_retrievals_wait);
+
+       jif = jiffies;
+       if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
+                       fscache_wait_bit_interruptible,
+                       TASK_INTERRUPTIBLE) != 0) {
+               fscache_stat(&fscache_n_retrievals_intr);
+               _leave(" = -ERESTARTSYS");
+               return -ERESTARTSYS;
+       }
+
+       ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags));
+
+       smp_rmb();
+       fscache_hist(fscache_retrieval_delay_histogram, jif);
+       _leave(" = 0 [dly]");
+       return 0;
+}
+
+/*
+ * read a page from the cache or allocate a block in which to store it
+ * - we return:
+ *   -ENOMEM   - out of memory, nothing done
+ *   -ERESTARTSYS - interrupted
+ *   -ENOBUFS  - no backing object available in which to cache the block
+ *   -ENODATA  - no data available in the backing object for this block
+ *   0         - dispatched a read - it'll call end_io_func() when finished
+ */
+int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
+                                struct page *page,
+                                fscache_rw_complete_t end_io_func,
+                                void *context,
+                                gfp_t gfp)
+{
+       struct fscache_retrieval *op;
+       struct fscache_object *object;
+       int ret;
+
+       _enter("%p,%p,,,", cookie, page);
+
+       fscache_stat(&fscache_n_retrievals);
+
+       if (hlist_empty(&cookie->backing_objects))
+               goto nobufs;
+
+       ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+       ASSERTCMP(page, !=, NULL);
+
+       if (fscache_wait_for_deferred_lookup(cookie) < 0)
+               return -ERESTARTSYS;
+
+       op = fscache_alloc_retrieval(page->mapping, end_io_func, context);
+       if (!op) {
+               _leave(" = -ENOMEM");
+               return -ENOMEM;
+       }
+
+       spin_lock(&cookie->lock);
+
+       if (hlist_empty(&cookie->backing_objects))
+               goto nobufs_unlock;
+       object = hlist_entry(cookie->backing_objects.first,
+                            struct fscache_object, cookie_link);
+
+       ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
+
+       if (fscache_submit_op(object, &op->op) < 0)
+               goto nobufs_unlock;
+       spin_unlock(&cookie->lock);
+
+       fscache_stat(&fscache_n_retrieval_ops);
+
+       /* pin the netfs read context in case we need to do the actual netfs
+        * read because we've encountered a cache read failure */
+       fscache_get_context(object->cookie, op->context);
+
+       /* we wait for the operation to become active, and then process it
+        * *here*, in this thread, and not in the thread pool */
+       if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+               _debug(">>> WT");
+               fscache_stat(&fscache_n_retrieval_op_waits);
+               wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+                           fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+               _debug("<<< GO");
+       }
+
+       /* ask the cache to honour the operation */
+       if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
+               ret = object->cache->ops->allocate_page(op, page, gfp);
+               if (ret == 0)
+                       ret = -ENODATA;
+       } else {
+               ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
+       }
+
+       if (ret == -ENOMEM)
+               fscache_stat(&fscache_n_retrievals_nomem);
+       else if (ret == -ERESTARTSYS)
+               fscache_stat(&fscache_n_retrievals_intr);
+       else if (ret == -ENODATA)
+               fscache_stat(&fscache_n_retrievals_nodata);
+       else if (ret < 0)
+               fscache_stat(&fscache_n_retrievals_nobufs);
+       else
+               fscache_stat(&fscache_n_retrievals_ok);
+
+       fscache_put_retrieval(op);
+       _leave(" = %d", ret);
+       return ret;
+
+nobufs_unlock:
+       spin_unlock(&cookie->lock);
+       kfree(op);
+nobufs:
+       fscache_stat(&fscache_n_retrievals_nobufs);
+       _leave(" = -ENOBUFS");
+       return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_read_or_alloc_page);
+
+/*
+ * read a list of pages from the cache or allocate blocks in which to store
+ * them
+ * - we return:
+ *   -ENOMEM   - out of memory, some pages may be being read
+ *   -ERESTARTSYS - interrupted, some pages may be being read
+ *   -ENOBUFS  - no backing object or space available in which to cache any
+ *                pages not being read
+ *   -ENODATA  - no data available in the backing object for some or all of
+ *                the pages
+ *   0         - dispatched a read on all pages
+ *
+ * end_io_func() will be called for each page read from the cache as it
+ * finishes being read
+ *
+ * any pages for which a read is dispatched will be removed from pages and
+ * nr_pages
+ */
+int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
+                                 struct address_space *mapping,
+                                 struct list_head *pages,
+                                 unsigned *nr_pages,
+                                 fscache_rw_complete_t end_io_func,
+                                 void *context,
+                                 gfp_t gfp)
+{
+       fscache_pages_retrieval_func_t func;
+       struct fscache_retrieval *op;
+       struct fscache_object *object;
+       int ret;
+
+       _enter("%p,,%d,,,", cookie, *nr_pages);
+
+       fscache_stat(&fscache_n_retrievals);
+
+       if (hlist_empty(&cookie->backing_objects))
+               goto nobufs;
+
+       ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+       ASSERTCMP(*nr_pages, >, 0);
+       ASSERT(!list_empty(pages));
+
+       if (fscache_wait_for_deferred_lookup(cookie) < 0)
+               return -ERESTARTSYS;
+
+       op = fscache_alloc_retrieval(mapping, end_io_func, context);
+       if (!op)
+               return -ENOMEM;
+
+       spin_lock(&cookie->lock);
+
+       if (hlist_empty(&cookie->backing_objects))
+               goto nobufs_unlock;
+       object = hlist_entry(cookie->backing_objects.first,
+                            struct fscache_object, cookie_link);
+
+       if (fscache_submit_op(object, &op->op) < 0)
+               goto nobufs_unlock;
+       spin_unlock(&cookie->lock);
+
+       fscache_stat(&fscache_n_retrieval_ops);
+
+       /* pin the netfs read context in case we need to do the actual netfs
+        * read because we've encountered a cache read failure */
+       fscache_get_context(object->cookie, op->context);
+
+       /* we wait for the operation to become active, and then process it
+        * *here*, in this thread, and not in the thread pool */
+       if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+               _debug(">>> WT");
+               fscache_stat(&fscache_n_retrieval_op_waits);
+               wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+                           fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+               _debug("<<< GO");
+       }
+
+       /* ask the cache to honour the operation */
+       if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags))
+               func = object->cache->ops->allocate_pages;
+       else
+               func = object->cache->ops->read_or_alloc_pages;
+       ret = func(op, pages, nr_pages, gfp);
+
+       if (ret == -ENOMEM)
+               fscache_stat(&fscache_n_retrievals_nomem);
+       else if (ret == -ERESTARTSYS)
+               fscache_stat(&fscache_n_retrievals_intr);
+       else if (ret == -ENODATA)
+               fscache_stat(&fscache_n_retrievals_nodata);
+       else if (ret < 0)
+               fscache_stat(&fscache_n_retrievals_nobufs);
+       else
+               fscache_stat(&fscache_n_retrievals_ok);
+
+       fscache_put_retrieval(op);
+       _leave(" = %d", ret);
+       return ret;
+
+nobufs_unlock:
+       spin_unlock(&cookie->lock);
+       kfree(op);
+nobufs:
+       fscache_stat(&fscache_n_retrievals_nobufs);
+       _leave(" = -ENOBUFS");
+       return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_read_or_alloc_pages);
+
+/*
+ * allocate a block in the cache on which to store a page
+ * - we return:
+ *   -ENOMEM   - out of memory, nothing done
+ *   -ERESTARTSYS - interrupted
+ *   -ENOBUFS  - no backing object available in which to cache the block
+ *   0         - block allocated
+ */
+int __fscache_alloc_page(struct fscache_cookie *cookie,
+                        struct page *page,
+                        gfp_t gfp)
+{
+       struct fscache_retrieval *op;
+       struct fscache_object *object;
+       int ret;
+
+       _enter("%p,%p,,,", cookie, page);
+
+       fscache_stat(&fscache_n_allocs);
+
+       if (hlist_empty(&cookie->backing_objects))
+               goto nobufs;
+
+       ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+       ASSERTCMP(page, !=, NULL);
+
+       if (fscache_wait_for_deferred_lookup(cookie) < 0)
+               return -ERESTARTSYS;
+
+       op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
+       if (!op)
+               return -ENOMEM;
+
+       spin_lock(&cookie->lock);
+
+       if (hlist_empty(&cookie->backing_objects))
+               goto nobufs_unlock;
+       object = hlist_entry(cookie->backing_objects.first,
+                            struct fscache_object, cookie_link);
+
+       if (fscache_submit_op(object, &op->op) < 0)
+               goto nobufs_unlock;
+       spin_unlock(&cookie->lock);
+
+       fscache_stat(&fscache_n_alloc_ops);
+
+       if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) {
+               _debug(">>> WT");
+               fscache_stat(&fscache_n_alloc_op_waits);
+               wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
+                           fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+               _debug("<<< GO");
+       }
+
+       /* ask the cache to honour the operation */
+       ret = object->cache->ops->allocate_page(op, page, gfp);
+
+       if (ret < 0)
+               fscache_stat(&fscache_n_allocs_nobufs);
+       else
+               fscache_stat(&fscache_n_allocs_ok);
+
+       fscache_put_retrieval(op);
+       _leave(" = %d", ret);
+       return ret;
+
+nobufs_unlock:
+       spin_unlock(&cookie->lock);
+       kfree(op);
+nobufs:
+       fscache_stat(&fscache_n_allocs_nobufs);
+       _leave(" = -ENOBUFS");
+       return -ENOBUFS;
+}
+EXPORT_SYMBOL(__fscache_alloc_page);
+
+/*
+ * release a write op reference
+ */
+static void fscache_release_write_op(struct fscache_operation *_op)
+{
+       _enter("{OP%x}", _op->debug_id);
+}
+
+/*
+ * perform the background storage of a page into the cache
+ */
+static void fscache_write_op(struct fscache_operation *_op)
+{
+       struct fscache_storage *op =
+               container_of(_op, struct fscache_storage, op);
+       struct fscache_object *object = op->op.object;
+       struct fscache_cookie *cookie = object->cookie;
+       struct page *page;
+       unsigned n;
+       void *results[1];
+       int ret;
+
+       _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
+
+       spin_lock(&cookie->lock);
+       spin_lock(&object->lock);
+
+       if (!fscache_object_is_active(object)) {
+               spin_unlock(&object->lock);
+               spin_unlock(&cookie->lock);
+               _leave("");
+               return;
+       }
+
+       fscache_stat(&fscache_n_store_calls);
+
+       /* find a page to store */
+       page = NULL;
+       n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 1,
+                                      FSCACHE_COOKIE_PENDING_TAG);
+       if (n != 1)
+               goto superseded;
+       page = results[0];
+       _debug("gang %d [%lx]", n, page->index);
+       if (page->index > op->store_limit)
+               goto superseded;
+
+       radix_tree_tag_clear(&cookie->stores, page->index,
+                            FSCACHE_COOKIE_PENDING_TAG);
+
+       spin_unlock(&object->lock);
+       spin_unlock(&cookie->lock);
+
+       if (page) {
+               ret = object->cache->ops->write_page(op, page);
+               fscache_end_page_write(cookie, page);
+               page_cache_release(page);
+               if (ret < 0)
+                       fscache_abort_object(object);
+               else
+                       fscache_enqueue_operation(&op->op);
+       }
+
+       _leave("");
+       return;
+
+superseded:
+       /* this writer is going away and there aren't any more things to
+        * write */
+       _debug("cease");
+       clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+       spin_unlock(&object->lock);
+       spin_unlock(&cookie->lock);
+       _leave("");
+}
+
+/*
+ * request a page be stored in the cache
+ * - returns:
+ *   -ENOMEM   - out of memory, nothing done
+ *   -ENOBUFS  - no backing object available in which to cache the page
+ *   0         - dispatched a write - it'll call end_io_func() when finished
+ *
+ * if the cookie still has a backing object at this point, that object can be
+ * in one of a few states with respect to storage processing:
+ *
+ *  (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
+ *      set)
+ *
+ *     (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred
+ *         fill op)
+ *
+ *     (b) writes deferred till post-creation (mark page for writing and
+ *         return immediately)
+ *
+ *  (2) negative lookup, object created, initial fill being made from netfs
+ *      (FSCACHE_COOKIE_INITIAL_FILL is set)
+ *
+ *     (a) fill point not yet reached this page (mark page for writing and
+ *          return)
+ *
+ *     (b) fill point passed this page (queue op to store this page)
+ *
+ *  (3) object extant (queue op to store this page)
+ *
+ * any other state is invalid
+ */
+int __fscache_write_page(struct fscache_cookie *cookie,
+                        struct page *page,
+                        gfp_t gfp)
+{
+       struct fscache_storage *op;
+       struct fscache_object *object;
+       int ret;
+
+       _enter("%p,%x,", cookie, (u32) page->flags);
+
+       ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+       ASSERT(PageFsCache(page));
+
+       fscache_stat(&fscache_n_stores);
+
+       op = kzalloc(sizeof(*op), GFP_NOIO);
+       if (!op)
+               goto nomem;
+
+       fscache_operation_init(&op->op, fscache_release_write_op);
+       fscache_operation_init_slow(&op->op, fscache_write_op);
+       op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
+
+       ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+       if (ret < 0)
+               goto nomem_free;
+
+       ret = -ENOBUFS;
+       spin_lock(&cookie->lock);
+
+       if (hlist_empty(&cookie->backing_objects))
+               goto nobufs;
+       object = hlist_entry(cookie->backing_objects.first,
+                            struct fscache_object, cookie_link);
+       if (test_bit(FSCACHE_IOERROR, &object->cache->flags))
+               goto nobufs;
+
+       /* add the page to the pending-storage radix tree on the backing
+        * object */
+       spin_lock(&object->lock);
+
+       _debug("store limit %llx", (unsigned long long) object->store_limit);
+
+       ret = radix_tree_insert(&cookie->stores, page->index, page);
+       if (ret < 0) {
+               if (ret == -EEXIST)
+                       goto already_queued;
+               _debug("insert failed %d", ret);
+               goto nobufs_unlock_obj;
+       }
+
+       radix_tree_tag_set(&cookie->stores, page->index,
+                          FSCACHE_COOKIE_PENDING_TAG);
+       page_cache_get(page);
+
+       /* we only want one writer at a time, but we do need to queue new
+        * writers after exclusive ops */
+       if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
+               goto already_pending;
+
+       spin_unlock(&object->lock);
+
+       op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
+       op->store_limit = object->store_limit;
+
+       if (fscache_submit_op(object, &op->op) < 0)
+               goto submit_failed;
+
+       spin_unlock(&cookie->lock);
+       radix_tree_preload_end();
+       fscache_stat(&fscache_n_store_ops);
+       fscache_stat(&fscache_n_stores_ok);
+
+       /* the slow work queue now carries its own ref on the object */
+       fscache_put_operation(&op->op);
+       _leave(" = 0");
+       return 0;
+
+already_queued:
+       fscache_stat(&fscache_n_stores_again);
+already_pending:
+       spin_unlock(&object->lock);
+       spin_unlock(&cookie->lock);
+       radix_tree_preload_end();
+       kfree(op);
+       fscache_stat(&fscache_n_stores_ok);
+       _leave(" = 0");
+       return 0;
+
+submit_failed:
+       radix_tree_delete(&cookie->stores, page->index);
+       page_cache_release(page);
+       ret = -ENOBUFS;
+       goto nobufs;
+
+nobufs_unlock_obj:
+       spin_unlock(&object->lock);
+nobufs:
+       spin_unlock(&cookie->lock);
+       radix_tree_preload_end();
+       kfree(op);
+       fscache_stat(&fscache_n_stores_nobufs);
+       _leave(" = -ENOBUFS");
+       return -ENOBUFS;
+
+nomem_free:
+       kfree(op);
+nomem:
+       fscache_stat(&fscache_n_stores_oom);
+       _leave(" = -ENOMEM");
+       return -ENOMEM;
+}
+EXPORT_SYMBOL(__fscache_write_page);
+
+/*
+ * remove a page from the cache
+ */
+void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
+{
+       struct fscache_object *object;
+
+       _enter(",%p", page);
+
+       ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
+       ASSERTCMP(page, !=, NULL);
+
+       fscache_stat(&fscache_n_uncaches);
+
+       /* cache withdrawal may beat us to it */
+       if (!PageFsCache(page))
+               goto done;
+
+       /* get the object */
+       spin_lock(&cookie->lock);
+
+       if (hlist_empty(&cookie->backing_objects)) {
+               ClearPageFsCache(page);
+               goto done_unlock;
+       }
+
+       object = hlist_entry(cookie->backing_objects.first,
+                            struct fscache_object, cookie_link);
+
+       /* there might now be stuff on disk we could read */
+       clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+       /* only invoke the cache backend if we managed to mark the page
+        * uncached here; this deals with synchronisation vs withdrawal */
+       if (TestClearPageFsCache(page) &&
+           object->cache->ops->uncache_page) {
+               /* the cache backend releases the cookie lock */
+               object->cache->ops->uncache_page(object, page);
+               goto done;
+       }
+
+done_unlock:
+       spin_unlock(&cookie->lock);
+done:
+       _leave("");
+}
+EXPORT_SYMBOL(__fscache_uncache_page);
+
+/**
+ * fscache_mark_pages_cached - Mark pages as being cached
+ * @op: The retrieval op pages are being marked for
+ * @pagevec: The pages to be marked
+ *
+ * Mark a bunch of netfs pages as being cached.  After this is called,
+ * the netfs must call fscache_uncache_page() to remove the mark.
+ */
+void fscache_mark_pages_cached(struct fscache_retrieval *op,
+                              struct pagevec *pagevec)
+{
+       struct fscache_cookie *cookie = op->op.object->cookie;
+       unsigned long loop;
+
+#ifdef CONFIG_FSCACHE_STATS
+       atomic_add(pagevec->nr, &fscache_n_marks);
+#endif
+
+       for (loop = 0; loop < pagevec->nr; loop++) {
+               struct page *page = pagevec->pages[loop];
+
+               _debug("- mark %p{%lx}", page, page->index);
+               if (TestSetPageFsCache(page)) {
+                       static bool once_only;
+                       if (!once_only) {
+                               once_only = true;
+                               printk(KERN_WARNING "FS-Cache:"
+                                      " Cookie type %s marked page %lx"
+                                      " multiple times\n",
+                                      cookie->def->name, page->index);
+                       }
+               }
+       }
+
+       if (cookie->def->mark_pages_cached)
+               cookie->def->mark_pages_cached(cookie->netfs_data,
+                                              op->mapping, pagevec);
+       pagevec_reinit(pagevec);
+}
+EXPORT_SYMBOL(fscache_mark_pages_cached);
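Netfs code is expected to reach the routines above through the fscache_*() wrappers in linux/fscache.h rather than calling the __fscache_*() functions directly. A hedged sketch of a readpage path using them (the myfs_* names are hypothetical, and it is assumed the wrapper takes the same arguments as __fscache_read_or_alloc_page() above):

	static int myfs_readpage(struct file *file, struct page *page)
	{
		struct myfs_inode *mi = MYFS_I(file->f_mapping->host);	/* hypothetical inode accessor */
		int ret;

		ret = fscache_read_or_alloc_page(mi->cache_cookie, page,
						 myfs_readpage_from_cache_done,
						 NULL, GFP_KERNEL);
		switch (ret) {
		case 0:			/* read dispatched; the end_io_func completes the page */
			return 0;
		case -ENODATA:		/* block allocated but empty: fetch from the server and store */
		case -ENOBUFS:		/* no cache space or object: fetch from the server only */
		default:
			return myfs_readpage_from_server(file, page);
		}
	}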
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
new file mode 100644
index 0000000..beeab44
--- /dev/null
@@ -0,0 +1,68 @@
+/* FS-Cache statistics viewing interface
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL OPERATION
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * initialise the /proc/fs/fscache/ directory
+ */
+int __init fscache_proc_init(void)
+{
+       _enter("");
+
+       if (!proc_mkdir("fs/fscache", NULL))
+               goto error_dir;
+
+#ifdef CONFIG_FSCACHE_STATS
+       if (!proc_create("fs/fscache/stats", S_IFREG | 0444, NULL,
+                        &fscache_stats_fops))
+               goto error_stats;
+#endif
+
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+       if (!proc_create("fs/fscache/histogram", S_IFREG | 0444, NULL,
+                        &fscache_histogram_fops))
+               goto error_histogram;
+#endif
+
+       _leave(" = 0");
+       return 0;
+
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+error_histogram:
+#endif
+#ifdef CONFIG_FSCACHE_STATS
+       remove_proc_entry("fs/fscache/stats", NULL);
+error_stats:
+#endif
+       remove_proc_entry("fs/fscache", NULL);
+error_dir:
+       _leave(" = -ENOMEM");
+       return -ENOMEM;
+}
+
+/*
+ * clean up the /proc/fs/fscache/ directory
+ */
+void fscache_proc_cleanup(void)
+{
+#ifdef CONFIG_FSCACHE_HISTOGRAM
+       remove_proc_entry("fs/fscache/histogram", NULL);
+#endif
+#ifdef CONFIG_FSCACHE_STATS
+       remove_proc_entry("fs/fscache/stats", NULL);
+#endif
+       remove_proc_entry("fs/fscache", NULL);
+}
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
new file mode 100644
index 0000000..65deb99
--- /dev/null
@@ -0,0 +1,212 @@
+/* FS-Cache statistics
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define FSCACHE_DEBUG_LEVEL THREAD
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+/*
+ * operation counters
+ */
+atomic_t fscache_n_op_pend;
+atomic_t fscache_n_op_run;
+atomic_t fscache_n_op_enqueue;
+atomic_t fscache_n_op_requeue;
+atomic_t fscache_n_op_deferred_release;
+atomic_t fscache_n_op_release;
+atomic_t fscache_n_op_gc;
+
+atomic_t fscache_n_attr_changed;
+atomic_t fscache_n_attr_changed_ok;
+atomic_t fscache_n_attr_changed_nobufs;
+atomic_t fscache_n_attr_changed_nomem;
+atomic_t fscache_n_attr_changed_calls;
+
+atomic_t fscache_n_allocs;
+atomic_t fscache_n_allocs_ok;
+atomic_t fscache_n_allocs_wait;
+atomic_t fscache_n_allocs_nobufs;
+atomic_t fscache_n_alloc_ops;
+atomic_t fscache_n_alloc_op_waits;
+
+atomic_t fscache_n_retrievals;
+atomic_t fscache_n_retrievals_ok;
+atomic_t fscache_n_retrievals_wait;
+atomic_t fscache_n_retrievals_nodata;
+atomic_t fscache_n_retrievals_nobufs;
+atomic_t fscache_n_retrievals_intr;
+atomic_t fscache_n_retrievals_nomem;
+atomic_t fscache_n_retrieval_ops;
+atomic_t fscache_n_retrieval_op_waits;
+
+atomic_t fscache_n_stores;
+atomic_t fscache_n_stores_ok;
+atomic_t fscache_n_stores_again;
+atomic_t fscache_n_stores_nobufs;
+atomic_t fscache_n_stores_oom;
+atomic_t fscache_n_store_ops;
+atomic_t fscache_n_store_calls;
+
+atomic_t fscache_n_marks;
+atomic_t fscache_n_uncaches;
+
+atomic_t fscache_n_acquires;
+atomic_t fscache_n_acquires_null;
+atomic_t fscache_n_acquires_no_cache;
+atomic_t fscache_n_acquires_ok;
+atomic_t fscache_n_acquires_nobufs;
+atomic_t fscache_n_acquires_oom;
+
+atomic_t fscache_n_updates;
+atomic_t fscache_n_updates_null;
+atomic_t fscache_n_updates_run;
+
+atomic_t fscache_n_relinquishes;
+atomic_t fscache_n_relinquishes_null;
+atomic_t fscache_n_relinquishes_waitcrt;
+
+atomic_t fscache_n_cookie_index;
+atomic_t fscache_n_cookie_data;
+atomic_t fscache_n_cookie_special;
+
+atomic_t fscache_n_object_alloc;
+atomic_t fscache_n_object_no_alloc;
+atomic_t fscache_n_object_lookups;
+atomic_t fscache_n_object_lookups_negative;
+atomic_t fscache_n_object_lookups_positive;
+atomic_t fscache_n_object_created;
+atomic_t fscache_n_object_avail;
+atomic_t fscache_n_object_dead;
+
+atomic_t fscache_n_checkaux_none;
+atomic_t fscache_n_checkaux_okay;
+atomic_t fscache_n_checkaux_update;
+atomic_t fscache_n_checkaux_obsolete;
+
+/*
+ * display the general statistics
+ */
+static int fscache_stats_show(struct seq_file *m, void *v)
+{
+       seq_puts(m, "FS-Cache statistics\n");
+
+       seq_printf(m, "Cookies: idx=%u dat=%u spc=%u\n",
+                  atomic_read(&fscache_n_cookie_index),
+                  atomic_read(&fscache_n_cookie_data),
+                  atomic_read(&fscache_n_cookie_special));
+
+       seq_printf(m, "Objects: alc=%u nal=%u avl=%u ded=%u\n",
+                  atomic_read(&fscache_n_object_alloc),
+                  atomic_read(&fscache_n_object_no_alloc),
+                  atomic_read(&fscache_n_object_avail),
+                  atomic_read(&fscache_n_object_dead));
+       seq_printf(m, "ChkAux : non=%u ok=%u upd=%u obs=%u\n",
+                  atomic_read(&fscache_n_checkaux_none),
+                  atomic_read(&fscache_n_checkaux_okay),
+                  atomic_read(&fscache_n_checkaux_update),
+                  atomic_read(&fscache_n_checkaux_obsolete));
+
+       seq_printf(m, "Pages  : mrk=%u unc=%u\n",
+                  atomic_read(&fscache_n_marks),
+                  atomic_read(&fscache_n_uncaches));
+
+       seq_printf(m, "Acquire: n=%u nul=%u noc=%u ok=%u nbf=%u"
+                  " oom=%u\n",
+                  atomic_read(&fscache_n_acquires),
+                  atomic_read(&fscache_n_acquires_null),
+                  atomic_read(&fscache_n_acquires_no_cache),
+                  atomic_read(&fscache_n_acquires_ok),
+                  atomic_read(&fscache_n_acquires_nobufs),
+                  atomic_read(&fscache_n_acquires_oom));
+
+       seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n",
+                  atomic_read(&fscache_n_object_lookups),
+                  atomic_read(&fscache_n_object_lookups_negative),
+                  atomic_read(&fscache_n_object_lookups_positive),
+                  atomic_read(&fscache_n_object_created));
+
+       seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
+                  atomic_read(&fscache_n_updates),
+                  atomic_read(&fscache_n_updates_null),
+                  atomic_read(&fscache_n_updates_run));
+
+       seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n",
+                  atomic_read(&fscache_n_relinquishes),
+                  atomic_read(&fscache_n_relinquishes_null),
+                  atomic_read(&fscache_n_relinquishes_waitcrt));
+
+       seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
+                  atomic_read(&fscache_n_attr_changed),
+                  atomic_read(&fscache_n_attr_changed_ok),
+                  atomic_read(&fscache_n_attr_changed_nobufs),
+                  atomic_read(&fscache_n_attr_changed_nomem),
+                  atomic_read(&fscache_n_attr_changed_calls));
+
+       seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n",
+                  atomic_read(&fscache_n_allocs),
+                  atomic_read(&fscache_n_allocs_ok),
+                  atomic_read(&fscache_n_allocs_wait),
+                  atomic_read(&fscache_n_allocs_nobufs));
+       seq_printf(m, "Allocs : ops=%u owt=%u\n",
+                  atomic_read(&fscache_n_alloc_ops),
+                  atomic_read(&fscache_n_alloc_op_waits));
+
+       seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
+                  " int=%u oom=%u\n",
+                  atomic_read(&fscache_n_retrievals),
+                  atomic_read(&fscache_n_retrievals_ok),
+                  atomic_read(&fscache_n_retrievals_wait),
+                  atomic_read(&fscache_n_retrievals_nodata),
+                  atomic_read(&fscache_n_retrievals_nobufs),
+                  atomic_read(&fscache_n_retrievals_intr),
+                  atomic_read(&fscache_n_retrievals_nomem));
+       seq_printf(m, "Retrvls: ops=%u owt=%u\n",
+                  atomic_read(&fscache_n_retrieval_ops),
+                  atomic_read(&fscache_n_retrieval_op_waits));
+
+       seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
+                  atomic_read(&fscache_n_stores),
+                  atomic_read(&fscache_n_stores_ok),
+                  atomic_read(&fscache_n_stores_again),
+                  atomic_read(&fscache_n_stores_nobufs),
+                  atomic_read(&fscache_n_stores_oom));
+       seq_printf(m, "Stores : ops=%u run=%u\n",
+                  atomic_read(&fscache_n_store_ops),
+                  atomic_read(&fscache_n_store_calls));
+
+       seq_printf(m, "Ops    : pend=%u run=%u enq=%u\n",
+                  atomic_read(&fscache_n_op_pend),
+                  atomic_read(&fscache_n_op_run),
+                  atomic_read(&fscache_n_op_enqueue));
+       seq_printf(m, "Ops    : dfr=%u rel=%u gc=%u\n",
+                  atomic_read(&fscache_n_op_deferred_release),
+                  atomic_read(&fscache_n_op_release),
+                  atomic_read(&fscache_n_op_gc));
+       return 0;
+}
+
+/*
+ * open "/proc/fs/fscache/stats" allowing provision of a statistical summary
+ */
+static int fscache_stats_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, fscache_stats_show, NULL);
+}
+
+const struct file_operations fscache_stats_fops = {
+       .owner          = THIS_MODULE,
+       .open           = fscache_stats_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
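The fscache_stat() calls sprinkled through the files above bump these counters. The helper itself lives in internal.h (not shown in this diff); presumably it is just an atomic increment that compiles away when statistics support is disabled, along these lines:

	#ifdef CONFIG_FSCACHE_STATS
	#define fscache_stat(stat) atomic_inc(stat)
	#else
	#define fscache_stat(stat) do {} while (0)
	#endif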
index 3fbffb1ea7147ff079e5c18788231717a345f332..f8077b9c898160513b1e958a532401653ac96123 100644
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/bio.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,14 +172,15 @@ static int journal_write_commit_record(journal_t *journal,
        return (ret == -EIO);
 }
 
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
+                                  int write_op)
 {
        int i;
 
        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use-up our safety reference in submit_bh() */
-               submit_bh(WRITE, wbuf[i]);
+               submit_bh(write_op, wbuf[i]);
        }
 }
 
@@ -186,7 +188,8 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
  *  Submit all the data buffers to disk
  */
 static int journal_submit_data_buffers(journal_t *journal,
-                               transaction_t *commit_transaction)
+                                      transaction_t *commit_transaction,
+                                      int write_op)
 {
        struct journal_head *jh;
        struct buffer_head *bh;
@@ -225,7 +228,7 @@ write_out_data:
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
-                               journal_do_submit_data(wbuf, bufs);
+                               journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
@@ -256,7 +259,7 @@ write_out_data:
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
-                               journal_do_submit_data(wbuf, bufs);
+                               journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                goto write_out_data;
                        }
@@ -286,7 +289,7 @@ write_out_data:
                }
        }
        spin_unlock(&journal->j_list_lock);
-       journal_do_submit_data(wbuf, bufs);
+       journal_do_submit_data(wbuf, bufs, write_op);
 
        return err;
 }
@@ -315,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
        int first_tag = 0;
        int tag_flag;
        int i;
+       int write_op = WRITE;
 
        /*
         * First job: lock down the current transaction and wait for
@@ -347,6 +351,8 @@ void journal_commit_transaction(journal_t *journal)
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;
 
+       if (commit_transaction->t_synchronous_commit)
+               write_op = WRITE_SYNC;
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);
@@ -431,7 +437,8 @@ void journal_commit_transaction(journal_t *journal)
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
-       err = journal_submit_data_buffers(journal, commit_transaction);
+       err = journal_submit_data_buffers(journal, commit_transaction,
+                                         write_op);
 
        /*
         * Wait for all previously submitted IO to complete.
@@ -660,7 +667,7 @@ start_journal_io:
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
-                               submit_bh(WRITE, bh);
+                               submit_bh(write_op, bh);
                        }
                        cond_resched();
 
index e6a117431277129a9a3e23ca93916826e43f842a..ed886e6db399e31a619539b61e6e5765874d2b06 100644 (file)
@@ -1440,6 +1440,8 @@ int journal_stop(handle_t *handle)
                }
        }
 
+       if (handle->h_sync)
+               transaction->t_synchronous_commit = 1;
        current->journal_info = NULL;
        spin_lock(&journal->j_state_lock);
        spin_lock(&transaction->t_handle_lock);
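With the two hunks above in place, a handle that is marked synchronous before journal_stop() causes the eventual commit to be submitted with WRITE_SYNC rather than plain WRITE. A caller opts in roughly as follows; this is an illustrative sketch of the pattern (for example an fsync()/O_SYNC path), not code taken from this patch:

	handle_t *handle;
	int err;

	handle = journal_start(journal, 1);	/* reserve one buffer credit */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... dirty some metadata under this handle ... */

	handle->h_sync = 1;		/* request a synchronous commit */
	err = journal_stop(handle);	/* sets t_synchronous_commit above */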
index 36fe20d6eba29a10e9a00b411f1afe4473ceff22..e67f3ec0773681c17dcf906e0cddfbd12d9ea1bc 100644 (file)
@@ -84,3 +84,11 @@ config ROOT_NFS
          <file:Documentation/filesystems/nfsroot.txt>.
 
          Most people say N here.
+
+config NFS_FSCACHE
+       bool "Provide NFS client caching support (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
+       help
+         Say Y here if you want NFS data to be cached locally on disc through
+         the general filesystem cache manager
index ac6170c594a37fadd988fd7027f84e66160f302d..845159814de2985869689aa0d033c2fd07b41a5d 100644 (file)
@@ -15,3 +15,4 @@ nfs-$(CONFIG_NFS_V4)  += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
                           callback.o callback_xdr.o callback_proc.o \
                           nfs4namespace.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
+nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
index aba38017bdefc9773cc498090cca4766dcf2d863..75c9cd2aa1194e0f0c4ac754ae2b85e722ce5809 100644 (file)
@@ -45,6 +45,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY                NFSDBG_CLIENT
 
@@ -154,6 +155,8 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
        if (!IS_ERR(cred))
                clp->cl_machine_cred = cred;
 
+       nfs_fscache_get_client_cookie(clp);
+
        return clp;
 
 error_3:
@@ -187,6 +190,8 @@ static void nfs_free_client(struct nfs_client *clp)
 
        nfs4_shutdown_client(clp);
 
+       nfs_fscache_release_client_cookie(clp);
+
        /* -EIO all pending I/O */
        if (!IS_ERR(clp->cl_rpcclient))
                rpc_shutdown_client(clp->cl_rpcclient);
@@ -760,6 +765,7 @@ static int nfs_init_server(struct nfs_server *server,
 
        /* Initialise the client representation from the mount data */
        server->flags = data->flags;
+       server->options = data->options;
 
        if (data->rsize)
                server->rsize = nfs_block_size(data->rsize, NULL);
@@ -1148,6 +1154,7 @@ static int nfs4_init_server(struct nfs_server *server,
        /* Initialise the client representation from the mount data */
        server->flags = data->flags;
        server->caps |= NFS_CAP_ATOMIC_OPEN;
+       server->options = data->options;
 
        /* Get a client record */
        error = nfs4_set_client(server,
@@ -1559,7 +1566,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
 
        /* display header on line 1 */
        if (v == &nfs_volume_list) {
-               seq_puts(m, "NV SERVER   PORT DEV     FSID\n");
+               seq_puts(m, "NV SERVER   PORT DEV     FSID              FSC\n");
                return 0;
        }
        /* display one transport per line on subsequent lines */
@@ -1573,12 +1580,13 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
                 (unsigned long long) server->fsid.major,
                 (unsigned long long) server->fsid.minor);
 
-       seq_printf(m, "v%u %s %s %-7s %-17s\n",
+       seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
                   clp->rpc_ops->version,
                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
                   rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
                   dev,
-                  fsid);
+                  fsid,
+                  nfs_server_fscache_state(server));
 
        return 0;
 }
index 0abf3f331f56c6a20249a00bd4719cb1cd3b9112..3523b895eb4b3b598a89399aceca8d77833cb595 100644 (file)
@@ -35,6 +35,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY                NFSDBG_FILE
 
@@ -409,6 +410,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
        return copied;
 }
 
+/*
+ * Partially or wholly invalidate a page
+ * - Release the private state associated with a page if undergoing complete
+ *   page invalidation
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ */
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
        dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
@@ -417,23 +425,43 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
                return;
        /* Cancel any unstarted writes on this page */
        nfs_wb_page_cancel(page->mapping->host, page);
+
+       nfs_fscache_invalidate_page(page, page->mapping->host);
 }
 
+/*
+ * Attempt to release the private state associated with a page
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Return true (may release page) or false (may not)
+ */
 static int nfs_release_page(struct page *page, gfp_t gfp)
 {
        dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
        /* If PagePrivate() is set, then the page is not freeable */
-       return 0;
+       if (PagePrivate(page))
+               return 0;
+       return nfs_fscache_release_page(page, gfp);
 }
 
+/*
+ * Attempt to clear the private state associated with a page when an error
+ * occurs that requires the cached contents of an inode to be written back or
+ * destroyed
+ * - Called if either PG_private or PG_fscache is set on the page
+ * - Caller holds page lock
+ * - Return 0 if successful, -error otherwise
+ */
 static int nfs_launder_page(struct page *page)
 {
        struct inode *inode = page->mapping->host;
+       struct nfs_inode *nfsi = NFS_I(inode);
 
        dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
                inode->i_ino, (long long)page_offset(page));
 
+       nfs_fscache_wait_on_page_write(nfsi, page);
        return nfs_wb_page(inode, page);
 }
 
@@ -451,6 +479,11 @@ const struct address_space_operations nfs_file_aops = {
        .launder_page = nfs_launder_page,
 };
 
+/*
+ * Notification that a PTE pointing to an NFS page is about to be made
+ * writable, implying that someone is about to modify the page through a
+ * shared-writable mapping
+ */
 static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct page *page = vmf->page;
@@ -465,6 +498,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                filp->f_mapping->host->i_ino,
                (long long)page_offset(page));
 
+       /* make sure the cache has finished storing the page */
+       nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
+
        lock_page(page);
        mapping = page->mapping;
        if (mapping != dentry->d_inode->i_mapping)
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
new file mode 100644 (file)
index 0000000..5b10064
--- /dev/null
@@ -0,0 +1,337 @@
+/* NFS FS-Cache index structure definition
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+
+#include "internal.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY                NFSDBG_FSCACHE
+
+/*
+ * Define the NFS filesystem for FS-Cache.  Upon registration FS-Cache sticks
+ * the cookie for the top-level index object for NFS into here.  The top-level
+ * index can then have other cache objects inserted into it.
+ */
+struct fscache_netfs nfs_fscache_netfs = {
+       .name           = "nfs",
+       .version        = 0,
+};
+
+/*
+ * Register NFS for caching
+ */
+int nfs_fscache_register(void)
+{
+       return fscache_register_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Unregister NFS for caching
+ */
+void nfs_fscache_unregister(void)
+{
+       fscache_unregister_netfs(&nfs_fscache_netfs);
+}
+
+/*
+ * Layout of the key for an NFS server cache object.
+ */
+struct nfs_server_key {
+       uint16_t        nfsversion;             /* NFS protocol version */
+       uint16_t        family;                 /* address family */
+       uint16_t        port;                   /* IP port */
+       union {
+               struct in_addr  ipv4_addr;      /* IPv4 address */
+               struct in6_addr ipv6_addr;      /* IPv6 address */
+       } addr[0];
+};
+
+/*
+ * Generate a key to describe a server in the main NFS index
+ * - We return the length of the key, or 0 if we can't generate one
+ */
+static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
+                                  void *buffer, uint16_t bufmax)
+{
+       const struct nfs_client *clp = cookie_netfs_data;
+       const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &clp->cl_addr;
+       const struct sockaddr_in *sin = (struct sockaddr_in *) &clp->cl_addr;
+       struct nfs_server_key *key = buffer;
+       uint16_t len = sizeof(struct nfs_server_key);
+
+       memset(key, 0, len);
+
+       key->nfsversion = clp->rpc_ops->version;
+       key->family = clp->cl_addr.ss_family;
+
+       switch (clp->cl_addr.ss_family) {
+       case AF_INET:
+               key->port = sin->sin_port;
+               key->addr[0].ipv4_addr = sin->sin_addr;
+               len += sizeof(key->addr[0].ipv4_addr);
+               break;
+
+       case AF_INET6:
+               key->port = sin6->sin6_port;
+               key->addr[0].ipv6_addr = sin6->sin6_addr;
+               len += sizeof(key->addr[0].ipv6_addr);
+               break;
+
+       default:
+               printk(KERN_WARNING "NFS: Unknown network family '%d'\n",
+                      clp->cl_addr.ss_family);
+               len = 0;
+               break;
+       }
+
+       return len;
+}
+
+/*
+ * Define the server object for FS-Cache.  This is used to describe a server
+ * object to fscache_acquire_cookie().  It is keyed by the NFS protocol and
+ * server address parameters.
+ */
+const struct fscache_cookie_def nfs_fscache_server_index_def = {
+       .name           = "NFS.server",
+       .type           = FSCACHE_COOKIE_TYPE_INDEX,
+       .get_key        = nfs_server_get_key,
+};
+
+/*
+ * Generate a key to describe a superblock key in the main NFS index
+ */
+static uint16_t nfs_super_get_key(const void *cookie_netfs_data,
+                                 void *buffer, uint16_t bufmax)
+{
+       const struct nfs_fscache_key *key;
+       const struct nfs_server *nfss = cookie_netfs_data;
+       uint16_t len;
+
+       key = nfss->fscache_key;
+       len = sizeof(key->key) + key->key.uniq_len;
+       if (len > bufmax) {
+               len = 0;
+       } else {
+               memcpy(buffer, &key->key, sizeof(key->key));
+               memcpy(buffer + sizeof(key->key),
+                      key->key.uniquifier, key->key.uniq_len);
+       }
+
+       return len;
+}
+
+/*
+ * Define the superblock object for FS-Cache.  This is used to describe a
+ * superblock object to fscache_acquire_cookie().  It is keyed by all the NFS
+ * parameters that might cause a separate superblock.
+ */
+const struct fscache_cookie_def nfs_fscache_super_index_def = {
+       .name           = "NFS.super",
+       .type           = FSCACHE_COOKIE_TYPE_INDEX,
+       .get_key        = nfs_super_get_key,
+};
+
+/*
+ * Definition of the auxiliary data attached to NFS inode storage objects
+ * within the cache.
+ *
+ * The contents of this struct are recorded in the on-disk local cache in the
+ * auxiliary data attached to the data storage object backing an inode.  This
+ * permits coherency to be managed when a new inode binds to an already extant
+ * cache object.
+ */
+struct nfs_fscache_inode_auxdata {
+       struct timespec mtime;
+       struct timespec ctime;
+       loff_t          size;
+       u64             change_attr;
+};
+
+/*
+ * Generate a key to describe an NFS inode in an NFS server's index
+ */
+static uint16_t nfs_fscache_inode_get_key(const void *cookie_netfs_data,
+                                         void *buffer, uint16_t bufmax)
+{
+       const struct nfs_inode *nfsi = cookie_netfs_data;
+       uint16_t nsize;
+
+       /* use the inode's NFS filehandle as the key */
+       nsize = nfsi->fh.size;
+       memcpy(buffer, nfsi->fh.data, nsize);
+       return nsize;
+}
+
+/*
+ * Get certain file attributes from the netfs data
+ * - This function can be absent for an index
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ */
+static void nfs_fscache_inode_get_attr(const void *cookie_netfs_data,
+                                      uint64_t *size)
+{
+       const struct nfs_inode *nfsi = cookie_netfs_data;
+
+       *size = nfsi->vfs_inode.i_size;
+}
+
+/*
+ * Get the auxiliary data from netfs data
+ * - This function can be absent if the index carries no state data
+ * - Should store the auxiliary data in the buffer
+ * - Should return the amount of data stored
+ * - Not permitted to return an error
+ * - The netfs data from the cookie being used as the source is presented
+ */
+static uint16_t nfs_fscache_inode_get_aux(const void *cookie_netfs_data,
+                                         void *buffer, uint16_t bufmax)
+{
+       struct nfs_fscache_inode_auxdata auxdata;
+       const struct nfs_inode *nfsi = cookie_netfs_data;
+
+       memset(&auxdata, 0, sizeof(auxdata));
+       auxdata.size = nfsi->vfs_inode.i_size;
+       auxdata.mtime = nfsi->vfs_inode.i_mtime;
+       auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+       if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+               auxdata.change_attr = nfsi->change_attr;
+
+       if (bufmax > sizeof(auxdata))
+               bufmax = sizeof(auxdata);
+
+       memcpy(buffer, &auxdata, bufmax);
+       return bufmax;
+}
+
+/*
+ * Consult the netfs about the state of an object
+ * - This function can be absent if the index carries no state data
+ * - The netfs data from the cookie being used as the target is
+ *   presented, as is the auxiliary data
+ */
+static
+enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
+                                                 const void *data,
+                                                 uint16_t datalen)
+{
+       struct nfs_fscache_inode_auxdata auxdata;
+       struct nfs_inode *nfsi = cookie_netfs_data;
+
+       if (datalen != sizeof(auxdata))
+               return FSCACHE_CHECKAUX_OBSOLETE;
+
+       memset(&auxdata, 0, sizeof(auxdata));
+       auxdata.size = nfsi->vfs_inode.i_size;
+       auxdata.mtime = nfsi->vfs_inode.i_mtime;
+       auxdata.ctime = nfsi->vfs_inode.i_ctime;
+
+       if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+               auxdata.change_attr = nfsi->change_attr;
+
+       if (memcmp(data, &auxdata, datalen) != 0)
+               return FSCACHE_CHECKAUX_OBSOLETE;
+
+       return FSCACHE_CHECKAUX_OKAY;
+}
+
+/*
+ * Indication from FS-Cache that the cookie is no longer cached
+ * - This function is called when the backing store currently caching a cookie
+ *   is removed
+ * - The netfs should use this to clean up any markers indicating cached pages
+ * - This is mandatory for any object that may have data
+ */
+static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
+{
+       struct nfs_inode *nfsi = cookie_netfs_data;
+       struct pagevec pvec;
+       pgoff_t first;
+       int loop, nr_pages;
+
+       pagevec_init(&pvec, 0);
+       first = 0;
+
+       dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
+
+       for (;;) {
+               /* grab a bunch of pages to unmark */
+               nr_pages = pagevec_lookup(&pvec,
+                                         nfsi->vfs_inode.i_mapping,
+                                         first,
+                                         PAGEVEC_SIZE - pagevec_count(&pvec));
+               if (!nr_pages)
+                       break;
+
+               for (loop = 0; loop < nr_pages; loop++)
+                       ClearPageFsCache(pvec.pages[loop]);
+
+               first = pvec.pages[nr_pages - 1]->index + 1;
+
+               pvec.nr = nr_pages;
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+}
+
+/*
+ * Get an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ *   context.
+ * - The read context is passed back to NFS in the event that a data read on the
+ *   cache fails with EIO - in which case the server must be contacted to
+ *   retrieve the data, which requires the read context for security.
+ */
+static void nfs_fh_get_context(void *cookie_netfs_data, void *context)
+{
+       get_nfs_open_context(context);
+}
+
+/*
+ * Release an extra reference on a read context.
+ * - This function can be absent if the completion function doesn't require a
+ *   context.
+ */
+static void nfs_fh_put_context(void *cookie_netfs_data, void *context)
+{
+       if (context)
+               put_nfs_open_context(context);
+}
+
+/*
+ * Define the inode object for FS-Cache.  This is used to describe an inode
+ * object to fscache_acquire_cookie().  It is keyed by the NFS file handle for
+ * an inode.
+ *
+ * Coherency is managed by comparing the copies of i_size, i_mtime and i_ctime
+ * held in the cache auxiliary data for the data storage object with those in
+ * the inode struct in memory.
+ */
+const struct fscache_cookie_def nfs_fscache_inode_object_def = {
+       .name           = "NFS.fh",
+       .type           = FSCACHE_COOKIE_TYPE_DATAFILE,
+       .get_key        = nfs_fscache_inode_get_key,
+       .get_attr       = nfs_fscache_inode_get_attr,
+       .get_aux        = nfs_fscache_inode_get_aux,
+       .check_aux      = nfs_fscache_inode_check_aux,
+       .now_uncached   = nfs_fscache_inode_now_uncached,
+       .get_context    = nfs_fh_get_context,
+       .put_context    = nfs_fh_put_context,
+};
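These cookie definitions are consumed by the interface code added in fs/nfs/fscache.c further down in this diff. Pulled together here purely for orientation (paraphrased from the calls below, not an extra piece of the patch), the cookie hierarchy that ends up being built is:

	/* netfs primary index -> per-server index -> per-superblock index
	 * -> per-inode data storage object */
	clp->fscache  = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
					       &nfs_fscache_server_index_def, clp);
	nfss->fscache = fscache_acquire_cookie(clp->fscache,
					       &nfs_fscache_super_index_def, nfss);
	nfsi->fscache = fscache_acquire_cookie(nfss->fscache,
					       &nfs_fscache_inode_object_def, nfsi);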
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
new file mode 100644 (file)
index 0000000..379be67
--- /dev/null
@@ -0,0 +1,523 @@
+/* NFS filesystem cache interface
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_fs_sb.h>
+#include <linux/in6.h>
+#include <linux/seq_file.h>
+
+#include "internal.h"
+#include "iostat.h"
+#include "fscache.h"
+
+#define NFSDBG_FACILITY                NFSDBG_FSCACHE
+
+static struct rb_root nfs_fscache_keys = RB_ROOT;
+static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
+
+/*
+ * Get the per-client index cookie for an NFS client if the appropriate mount
+ * flag was set
+ * - We always try to get an index cookie for the client, but get filehandle
+ *   cookies on a per-superblock basis, depending on the mount flags
+ */
+void nfs_fscache_get_client_cookie(struct nfs_client *clp)
+{
+       /* create a cache index for looking up filehandles */
+       clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
+                                             &nfs_fscache_server_index_def,
+                                             clp);
+       dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
+                clp, clp->fscache);
+}
+
+/*
+ * Dispose of a per-client cookie
+ */
+void nfs_fscache_release_client_cookie(struct nfs_client *clp)
+{
+       dfprintk(FSCACHE, "NFS: releasing client cookie (0x%p/0x%p)\n",
+                clp, clp->fscache);
+
+       fscache_relinquish_cookie(clp->fscache, 0);
+       clp->fscache = NULL;
+}
+
+/*
+ * Get the cache cookie for an NFS superblock.  We have to handle
+ * uniquification here because the cache doesn't do it for us.
+ */
+void nfs_fscache_get_super_cookie(struct super_block *sb,
+                                 struct nfs_parsed_mount_data *data)
+{
+       struct nfs_fscache_key *key, *xkey;
+       struct nfs_server *nfss = NFS_SB(sb);
+       struct rb_node **p, *parent;
+       const char *uniq = data->fscache_uniq ?: "";
+       int diff, ulen;
+
+       ulen = strlen(uniq);
+       key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
+       if (!key)
+               return;
+
+       key->nfs_client = nfss->nfs_client;
+       key->key.super.s_flags = sb->s_flags & NFS_MS_MASK;
+       key->key.nfs_server.flags = nfss->flags;
+       key->key.nfs_server.rsize = nfss->rsize;
+       key->key.nfs_server.wsize = nfss->wsize;
+       key->key.nfs_server.acregmin = nfss->acregmin;
+       key->key.nfs_server.acregmax = nfss->acregmax;
+       key->key.nfs_server.acdirmin = nfss->acdirmin;
+       key->key.nfs_server.acdirmax = nfss->acdirmax;
+       key->key.nfs_server.fsid = nfss->fsid;
+       key->key.rpc_auth.au_flavor = nfss->client->cl_auth->au_flavor;
+
+       key->key.uniq_len = ulen;
+       memcpy(key->key.uniquifier, uniq, ulen);
+
+       spin_lock(&nfs_fscache_keys_lock);
+       p = &nfs_fscache_keys.rb_node;
+       parent = NULL;
+       while (*p) {
+               parent = *p;
+               xkey = rb_entry(parent, struct nfs_fscache_key, node);
+
+               if (key->nfs_client < xkey->nfs_client)
+                       goto go_left;
+               if (key->nfs_client > xkey->nfs_client)
+                       goto go_right;
+
+               diff = memcmp(&key->key, &xkey->key, sizeof(key->key));
+               if (diff < 0)
+                       goto go_left;
+               if (diff > 0)
+                       goto go_right;
+
+               if (key->key.uniq_len == 0)
+                       goto non_unique;
+               diff = memcmp(key->key.uniquifier,
+                             xkey->key.uniquifier,
+                             key->key.uniq_len);
+               if (diff < 0)
+                       goto go_left;
+               if (diff > 0)
+                       goto go_right;
+               goto non_unique;
+
+       go_left:
+               p = &(*p)->rb_left;
+               continue;
+       go_right:
+               p = &(*p)->rb_right;
+       }
+
+       rb_link_node(&key->node, parent, p);
+       rb_insert_color(&key->node, &nfs_fscache_keys);
+       spin_unlock(&nfs_fscache_keys_lock);
+       nfss->fscache_key = key;
+
+       /* create a cache index for looking up filehandles */
+       nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
+                                              &nfs_fscache_super_index_def,
+                                              nfss);
+       dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
+                nfss, nfss->fscache);
+       return;
+
+non_unique:
+       spin_unlock(&nfs_fscache_keys_lock);
+       kfree(key);
+       nfss->fscache_key = NULL;
+       nfss->fscache = NULL;
+       printk(KERN_WARNING "NFS:"
+              " Cache request denied due to non-unique superblock keys\n");
+}
+
+/*
+ * release a per-superblock cookie
+ */
+void nfs_fscache_release_super_cookie(struct super_block *sb)
+{
+       struct nfs_server *nfss = NFS_SB(sb);
+
+       dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
+                nfss, nfss->fscache);
+
+       fscache_relinquish_cookie(nfss->fscache, 0);
+       nfss->fscache = NULL;
+
+       if (nfss->fscache_key) {
+               spin_lock(&nfs_fscache_keys_lock);
+               rb_erase(&nfss->fscache_key->node, &nfs_fscache_keys);
+               spin_unlock(&nfs_fscache_keys_lock);
+               kfree(nfss->fscache_key);
+               nfss->fscache_key = NULL;
+       }
+}
+
+/*
+ * Initialise the per-inode cache cookie pointer for an NFS inode.
+ */
+void nfs_fscache_init_inode_cookie(struct inode *inode)
+{
+       NFS_I(inode)->fscache = NULL;
+       if (S_ISREG(inode->i_mode))
+               set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+}
+
+/*
+ * Get the per-inode cache cookie for an NFS inode.
+ */
+static void nfs_fscache_enable_inode_cookie(struct inode *inode)
+{
+       struct super_block *sb = inode->i_sb;
+       struct nfs_inode *nfsi = NFS_I(inode);
+
+       if (nfsi->fscache || !NFS_FSCACHE(inode))
+               return;
+
+       if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
+               nfsi->fscache = fscache_acquire_cookie(
+                       NFS_SB(sb)->fscache,
+                       &nfs_fscache_inode_object_def,
+                       nfsi);
+
+               dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
+                        sb, nfsi, nfsi->fscache);
+       }
+}
+
+/*
+ * Release a per-inode cookie.
+ */
+void nfs_fscache_release_inode_cookie(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+
+       dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
+                nfsi, nfsi->fscache);
+
+       fscache_relinquish_cookie(nfsi->fscache, 0);
+       nfsi->fscache = NULL;
+}
+
+/*
+ * Retire a per-inode cookie, destroying the data attached to it.
+ */
+void nfs_fscache_zap_inode_cookie(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+
+       dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
+                nfsi, nfsi->fscache);
+
+       fscache_relinquish_cookie(nfsi->fscache, 1);
+       nfsi->fscache = NULL;
+}
+
+/*
+ * Turn off the cache with regard to a per-inode cookie if opened for writing,
+ * invalidating all the pages in the page cache relating to the associated
+ * inode to clear the per-page caching.
+ */
+static void nfs_fscache_disable_inode_cookie(struct inode *inode)
+{
+       clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+
+       if (NFS_I(inode)->fscache) {
+               dfprintk(FSCACHE,
+                        "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
+
+               /* Need to invalidate any mapped pages that were read in before
+                * turning off the cache.
+                */
+               if (inode->i_mapping && inode->i_mapping->nrpages)
+                       invalidate_inode_pages2(inode->i_mapping);
+
+               nfs_fscache_zap_inode_cookie(inode);
+       }
+}
+
+/*
+ * wait_on_bit() sleep function for uninterruptible waiting
+ */
+static int nfs_fscache_wait_bit(void *flags)
+{
+       schedule();
+       return 0;
+}
+
+/*
+ * Lock against someone else trying to also acquire or relinquish a cookie
+ */
+static inline void nfs_fscache_inode_lock(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+
+       while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
+               wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
+                           nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Unlock cookie management lock
+ */
+static inline void nfs_fscache_inode_unlock(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+
+       smp_mb__before_clear_bit();
+       clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
+       smp_mb__after_clear_bit();
+       wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
+}
+
+/*
+ * Decide if we should enable or disable local caching for this inode.
+ * - For now, with NFS, only regular files that are open read-only will be able
+ *   to use the cache.
+ * - May be invoked multiple times in parallel by concurrent nfs_open() calls.
+ */
+void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
+{
+       if (NFS_FSCACHE(inode)) {
+               nfs_fscache_inode_lock(inode);
+               if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+                       nfs_fscache_disable_inode_cookie(inode);
+               else
+                       nfs_fscache_enable_inode_cookie(inode);
+               nfs_fscache_inode_unlock(inode);
+       }
+}
+
+/*
+ * Replace a per-inode cookie due to revalidation detecting a file having
+ * changed on the server.
+ */
+void nfs_fscache_reset_inode_cookie(struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_server *nfss = NFS_SERVER(inode);
+       struct fscache_cookie *old = nfsi->fscache;
+
+       nfs_fscache_inode_lock(inode);
+       if (nfsi->fscache) {
+               /* retire the current fscache cache and get a new one */
+               fscache_relinquish_cookie(nfsi->fscache, 1);
+
+               nfsi->fscache = fscache_acquire_cookie(
+                       nfss->nfs_client->fscache,
+                       &nfs_fscache_inode_object_def,
+                       nfsi);
+
+               dfprintk(FSCACHE,
+                        "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
+                        nfss, nfsi, old, nfsi->fscache);
+       }
+       nfs_fscache_inode_unlock(inode);
+}
+
+/*
+ * Release the caching state associated with a page, if the page isn't busy
+ * interacting with the cache.
+ * - Returns true (can release page) or false (page busy).
+ */
+int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+       struct nfs_inode *nfsi = NFS_I(page->mapping->host);
+       struct fscache_cookie *cookie = nfsi->fscache;
+
+       BUG_ON(!cookie);
+
+       if (fscache_check_page_write(cookie, page)) {
+               if (!(gfp & __GFP_WAIT))
+                       return 0;
+               fscache_wait_on_page_write(cookie, page);
+       }
+
+       if (PageFsCache(page)) {
+               dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
+                        cookie, page, nfsi);
+
+               fscache_uncache_page(cookie, page);
+               nfs_add_fscache_stats(page->mapping->host,
+                                     NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+       }
+
+       return 1;
+}
+
+/*
+ * Release the caching state associated with a page if undergoing complete page
+ * invalidation.
+ */
+void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct fscache_cookie *cookie = nfsi->fscache;
+
+       BUG_ON(!cookie);
+
+       dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
+                cookie, page, nfsi);
+
+       fscache_wait_on_page_write(cookie, page);
+
+       BUG_ON(!PageLocked(page));
+       fscache_uncache_page(cookie, page);
+       nfs_add_fscache_stats(page->mapping->host,
+                             NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+}
+
+/*
+ * Handle completion of a page being read from the cache.
+ * - Called in process (keventd) context.
+ */
+static void nfs_readpage_from_fscache_complete(struct page *page,
+                                              void *context,
+                                              int error)
+{
+       dfprintk(FSCACHE,
+                "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n",
+                page, context, error);
+
+       /* if the read completed successfully, mark the page up to date and
+        * unlock it; otherwise fall back to reading from the server, and only
+        * unlock the page here if that fallback also fails */
+       if (!error) {
+               SetPageUptodate(page);
+               unlock_page(page);
+       } else {
+               error = nfs_readpage_async(context, page->mapping->host, page);
+               if (error)
+                       unlock_page(page);
+       }
+}
+
+/*
+ * Retrieve a page from fscache
+ */
+int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+                               struct inode *inode, struct page *page)
+{
+       int ret;
+
+       dfprintk(FSCACHE,
+                "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
+                NFS_I(inode)->fscache, page, page->index, page->flags, inode);
+
+       ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
+                                        page,
+                                        nfs_readpage_from_fscache_complete,
+                                        ctx,
+                                        GFP_KERNEL);
+
+       switch (ret) {
+       case 0: /* read BIO submitted (page in fscache) */
+               dfprintk(FSCACHE,
+                        "NFS:    readpage_from_fscache: BIO submitted\n");
+               nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1);
+               return ret;
+
+       case -ENOBUFS: /* inode not in cache */
+       case -ENODATA: /* page not in cache */
+               nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+               dfprintk(FSCACHE,
+                        "NFS:    readpage_from_fscache %d\n", ret);
+               return 1;
+
+       default:
+               dfprintk(FSCACHE, "NFS:    readpage_from_fscache %d\n", ret);
+               nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1);
+       }
+       return ret;
+}
+
+/*
+ * Retrieve a set of pages from fscache
+ */
+int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+                                struct inode *inode,
+                                struct address_space *mapping,
+                                struct list_head *pages,
+                                unsigned *nr_pages)
+{
+       int ret, npages = *nr_pages;
+
+       dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
+                NFS_I(inode)->fscache, npages, inode);
+
+       ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
+                                         mapping, pages, nr_pages,
+                                         nfs_readpage_from_fscache_complete,
+                                         ctx,
+                                         mapping_gfp_mask(mapping));
+       if (*nr_pages < npages)
+               nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK,
+                                     npages);
+       if (*nr_pages > 0)
+               nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL,
+                                     *nr_pages);
+
+       switch (ret) {
+       case 0: /* read submitted to the cache for all pages */
+               BUG_ON(!list_empty(pages));
+               BUG_ON(*nr_pages != 0);
+               dfprintk(FSCACHE,
+                        "NFS: nfs_getpages_from_fscache: submitted\n");
+
+               return ret;
+
+       case -ENOBUFS: /* some pages aren't cached and can't be */
+       case -ENODATA: /* some pages aren't cached */
+               dfprintk(FSCACHE,
+                        "NFS: nfs_getpages_from_fscache: no page: %d\n", ret);
+               return 1;
+
+       default:
+               dfprintk(FSCACHE,
+                        "NFS: nfs_getpages_from_fscache: ret  %d\n", ret);
+       }
+
+       return ret;
+}
+
+/*
+ * Store a newly fetched page in fscache
+ * - PG_fscache must be set on the page
+ */
+void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
+{
+       int ret;
+
+       dfprintk(FSCACHE,
+                "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
+                NFS_I(inode)->fscache, page, page->index, page->flags, sync);
+
+       ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
+       dfprintk(FSCACHE,
+                "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
+                page, page->index, page->flags, ret);
+
+       if (ret != 0) {
+               fscache_uncache_page(NFS_I(inode)->fscache, page);
+               nfs_add_fscache_stats(inode,
+                                     NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
+               nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
+       } else {
+               nfs_add_fscache_stats(inode,
+                                     NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1);
+       }
+}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
new file mode 100644 (file)
index 0000000..6e809bb
--- /dev/null
@@ -0,0 +1,220 @@
+/* NFS filesystem cache interface definitions
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _NFS_FSCACHE_H
+#define _NFS_FSCACHE_H
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/fscache.h>
+
+#ifdef CONFIG_NFS_FSCACHE
+
+/*
+ * set of NFS FS-Cache objects that form a superblock key
+ */
+struct nfs_fscache_key {
+       struct rb_node          node;
+       struct nfs_client       *nfs_client;    /* the server */
+
+       /* the elements of the unique key - as used by nfs_compare_super() and
+        * nfs_compare_mount_options() to distinguish superblocks */
+       struct {
+               struct {
+                       unsigned long   s_flags;        /* various flags
+                                                        * (& NFS_MS_MASK) */
+               } super;
+
+               struct {
+                       struct nfs_fsid fsid;
+                       int             flags;
+                       unsigned int    rsize;          /* read size */
+                       unsigned int    wsize;          /* write size */
+                       unsigned int    acregmin;       /* attr cache timeouts */
+                       unsigned int    acregmax;
+                       unsigned int    acdirmin;
+                       unsigned int    acdirmax;
+               } nfs_server;
+
+               struct {
+                       rpc_authflavor_t au_flavor;
+               } rpc_auth;
+
+               /* uniquifier - can be used if nfs_server.flags includes
+                * NFS_MOUNT_UNSHARED  */
+               u8 uniq_len;
+               char uniquifier[0];
+       } key;
+};
+
+/*
+ * fscache-index.c
+ */
+extern struct fscache_netfs nfs_fscache_netfs;
+extern const struct fscache_cookie_def nfs_fscache_server_index_def;
+extern const struct fscache_cookie_def nfs_fscache_super_index_def;
+extern const struct fscache_cookie_def nfs_fscache_inode_object_def;
+
+extern int nfs_fscache_register(void);
+extern void nfs_fscache_unregister(void);
+
+/*
+ * fscache.c
+ */
+extern void nfs_fscache_get_client_cookie(struct nfs_client *);
+extern void nfs_fscache_release_client_cookie(struct nfs_client *);
+
+extern void nfs_fscache_get_super_cookie(struct super_block *,
+                                        struct nfs_parsed_mount_data *);
+extern void nfs_fscache_release_super_cookie(struct super_block *);
+
+extern void nfs_fscache_init_inode_cookie(struct inode *);
+extern void nfs_fscache_release_inode_cookie(struct inode *);
+extern void nfs_fscache_zap_inode_cookie(struct inode *);
+extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
+extern void nfs_fscache_reset_inode_cookie(struct inode *);
+
+extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
+extern int nfs_fscache_release_page(struct page *, gfp_t);
+
+extern int __nfs_readpage_from_fscache(struct nfs_open_context *,
+                                      struct inode *, struct page *);
+extern int __nfs_readpages_from_fscache(struct nfs_open_context *,
+                                       struct inode *, struct address_space *,
+                                       struct list_head *, unsigned *);
+extern void __nfs_readpage_to_fscache(struct inode *, struct page *, int);
+
+/*
+ * wait for a page to complete writing to the cache
+ */
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+                                                 struct page *page)
+{
+       if (PageFsCache(page))
+               fscache_wait_on_page_write(nfsi->fscache, page);
+}
+
+/*
+ * release the caching state associated with a page if undergoing complete page
+ * invalidation
+ */
+static inline void nfs_fscache_invalidate_page(struct page *page,
+                                              struct inode *inode)
+{
+       if (PageFsCache(page))
+               __nfs_fscache_invalidate_page(page, inode);
+}
+
+/*
+ * Retrieve a page from an inode data storage object.
+ */
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+                                           struct inode *inode,
+                                           struct page *page)
+{
+       if (NFS_I(inode)->fscache)
+               return __nfs_readpage_from_fscache(ctx, inode, page);
+       return -ENOBUFS;
+}
+
+/*
+ * Retrieve a set of pages from an inode data storage object.
+ */
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+                                            struct inode *inode,
+                                            struct address_space *mapping,
+                                            struct list_head *pages,
+                                            unsigned *nr_pages)
+{
+       if (NFS_I(inode)->fscache)
+               return __nfs_readpages_from_fscache(ctx, inode, mapping, pages,
+                                                   nr_pages);
+       return -ENOBUFS;
+}
+
+/*
+ * Store a page newly fetched from the server in an inode data storage object
+ * in the cache.
+ */
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+                                          struct page *page,
+                                          int sync)
+{
+       if (PageFsCache(page))
+               __nfs_readpage_to_fscache(inode, page, sync);
+}
+
+/*
+ * indicate the client caching state as readable text
+ */
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+       if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
+               return "yes";
+       return "no ";
+}
+
+#else /* CONFIG_NFS_FSCACHE */
+static inline int nfs_fscache_register(void) { return 0; }
+static inline void nfs_fscache_unregister(void) {}
+
+static inline void nfs_fscache_get_client_cookie(struct nfs_client *clp) {}
+static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
+
+static inline void nfs_fscache_get_super_cookie(
+       struct super_block *sb,
+       struct nfs_parsed_mount_data *data)
+{
+}
+static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
+
+static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
+                                               struct file *filp) {}
+static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
+
+static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+       return 1; /* True: may release page */
+}
+static inline void nfs_fscache_invalidate_page(struct page *page,
+                                              struct inode *inode) {}
+static inline void nfs_fscache_wait_on_page_write(struct nfs_inode *nfsi,
+                                                 struct page *page) {}
+
+static inline int nfs_readpage_from_fscache(struct nfs_open_context *ctx,
+                                           struct inode *inode,
+                                           struct page *page)
+{
+       return -ENOBUFS;
+}
+static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
+                                            struct inode *inode,
+                                            struct address_space *mapping,
+                                            struct list_head *pages,
+                                            unsigned *nr_pages)
+{
+       return -ENOBUFS;
+}
+static inline void nfs_readpage_to_fscache(struct inode *inode,
+                                          struct page *page, int sync) {}
+
+static inline const char *nfs_server_fscache_state(struct nfs_server *server)
+{
+       return "no ";
+}
+
+#endif /* CONFIG_NFS_FSCACHE */
+#endif /* _NFS_FSCACHE_H */
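Because the header supplies static inline no-ops when CONFIG_NFS_FSCACHE is disabled, callers can use these helpers unconditionally, with no #ifdefs of their own. The read path added later in this diff follows the pattern sketched here (a return of 0 means the cache has taken the page and its completion handler will unlock it; anything else means fall back to an RPC read):

	/* sketch of the caller-side fallback, mirroring fs/nfs/read.c below */
	error = nfs_readpage_from_fscache(ctx, inode, page);
	if (error == 0)
		goto out;	/* cache I/O submitted */

	error = nfs_readpage_async(ctx, inode, page);	/* read from server */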
index a834d1d850b78caf0df89e3f293b9f4b9efb88e7..64f87194d3907620709c5413f23c95324b23afd9 100644 (file)
@@ -46,6 +46,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY                NFSDBG_VFS
 
@@ -121,6 +122,7 @@ void nfs_clear_inode(struct inode *inode)
        BUG_ON(!list_empty(&NFS_I(inode)->open_files));
        nfs_zap_acl_cache(inode);
        nfs_access_zap_cache(inode);
+       nfs_fscache_release_inode_cookie(inode);
 }
 
 /**
@@ -355,6 +357,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                nfsi->attrtimeo_timestamp = now;
                nfsi->access_cache = RB_ROOT;
 
+               nfs_fscache_init_inode_cookie(inode);
+
                unlock_new_inode(inode);
        } else
                nfs_refresh_inode(inode, fattr);
@@ -686,6 +690,7 @@ int nfs_open(struct inode *inode, struct file *filp)
        ctx->mode = filp->f_mode;
        nfs_file_set_open_context(filp, ctx);
        put_nfs_open_context(ctx);
+       nfs_fscache_set_inode_cookie(inode, filp);
        return 0;
 }
 
@@ -786,6 +791,7 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
                memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
        spin_unlock(&inode->i_lock);
        nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
+       nfs_fscache_reset_inode_cookie(inode);
        dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
                        inode->i_sb->s_id, (long long)NFS_FILEID(inode));
        return 0;
@@ -1030,6 +1036,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
        spin_lock(&inode->i_lock);
        status = nfs_refresh_inode_locked(inode, fattr);
        spin_unlock(&inode->i_lock);
+
        return status;
 }
 
@@ -1436,6 +1443,10 @@ static int __init init_nfs_fs(void)
 {
        int err;
 
+       err = nfs_fscache_register();
+       if (err < 0)
+               goto out7;
+
        err = nfsiod_start();
        if (err)
                goto out6;
@@ -1488,6 +1499,8 @@ out4:
 out5:
        nfsiod_stop();
 out6:
+       nfs_fscache_unregister();
+out7:
        return err;
 }
 
@@ -1498,6 +1511,7 @@ static void __exit exit_nfs_fs(void)
        nfs_destroy_readpagecache();
        nfs_destroy_inodecache();
        nfs_destroy_nfspagecache();
+       nfs_fscache_unregister();
 #ifdef CONFIG_PROC_FS
        rpc_proc_unregister("nfs");
 #endif
index 2041f68ff1cc1129d41d7fd55b4749714113d4c9..e4d6a8348adf5178a8d7d63b85a96fe22e0d129b 100644 (file)
@@ -5,6 +5,8 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 
+#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
+
 struct nfs_string;
 
 /* Maximum number of readahead requests
@@ -37,10 +39,12 @@ struct nfs_parsed_mount_data {
        int                     acregmin, acregmax,
                                acdirmin, acdirmax;
        int                     namlen;
+       unsigned int            options;
        unsigned int            bsize;
        unsigned int            auth_flavor_len;
        rpc_authflavor_t        auth_flavors[1];
        char                    *client_address;
+       char                    *fscache_uniq;
 
        struct {
                struct sockaddr_storage address;
index a36952810032f8c492acb4e1f4f96c7494250b3f..a2ab2529b5ca4ee1c1ac2b8e3889e4651bb8f1ac 100644 (file)
@@ -16,6 +16,9 @@
 
 struct nfs_iostats {
        unsigned long long      bytes[__NFSIOS_BYTESMAX];
+#ifdef CONFIG_NFS_FSCACHE
+       unsigned long long      fscache[__NFSIOS_FSCACHEMAX];
+#endif
        unsigned long           events[__NFSIOS_COUNTSMAX];
 } ____cacheline_aligned;
 
@@ -57,6 +60,21 @@ static inline void nfs_add_stats(const struct inode *inode,
        nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
 }
 
+#ifdef CONFIG_NFS_FSCACHE
+static inline void nfs_add_fscache_stats(struct inode *inode,
+                                        enum nfs_stat_fscachecounters stat,
+                                        unsigned long addend)
+{
+       struct nfs_iostats *iostats;
+       int cpu;
+
+       cpu = get_cpu();
+       iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
+       iostats->fscache[stat] += addend;
+       put_cpu_no_resched();
+}
+#endif
+
 static inline struct nfs_iostats *nfs_alloc_iostats(void)
 {
        return alloc_percpu(struct nfs_iostats);
index f856004bb7fa3c8ed4bfe99c3c61ccd6d5729861..4ace3c50a8ebae5dc010084b7dc479cc8664e834 100644 (file)
@@ -24,6 +24,7 @@
 
 #include "internal.h"
 #include "iostat.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY                NFSDBG_PAGECACHE
 
@@ -111,8 +112,8 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
        }
 }
 
-static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
-               struct page *page)
+int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
+                      struct page *page)
 {
        LIST_HEAD(one_request);
        struct nfs_page *new;
@@ -139,6 +140,11 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 
 static void nfs_readpage_release(struct nfs_page *req)
 {
+       struct inode *d_inode = req->wb_context->path.dentry->d_inode;
+
+       if (PageUptodate(req->wb_page))
+               nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+
        unlock_page(req->wb_page);
 
        dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
@@ -510,8 +516,15 @@ int nfs_readpage(struct file *file, struct page *page)
        } else
                ctx = get_nfs_open_context(nfs_file_open_context(file));
 
+       if (!IS_SYNC(inode)) {
+               error = nfs_readpage_from_fscache(ctx, inode, page);
+               if (error == 0)
+                       goto out;
+       }
+
        error = nfs_readpage_async(ctx, inode, page);
 
+out:
        put_nfs_open_context(ctx);
        return error;
 out_unlock:
@@ -584,6 +597,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
                        return -EBADF;
        } else
                desc.ctx = get_nfs_open_context(nfs_file_open_context(filp));
+
+       /* attempt to read as many of the pages as possible from the cache
+        * - this returns -ENOBUFS immediately if the cookie is negative
+        */
+       ret = nfs_readpages_from_fscache(desc.ctx, inode, mapping,
+                                        pages, &nr_pages);
+       if (ret == 0)
+               goto read_complete; /* all pages were read */
+
        if (rsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
        else
@@ -594,6 +616,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
        nfs_pageio_complete(&pgio);
        npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        nfs_add_stats(inode, NFSIOS_READPAGES, npages);
+read_complete:
        put_nfs_open_context(desc.ctx);
 out:
        return ret;
index 0942fcbbad3c85c119a774521e4790cee0d0c5e5..82eaadbff40872f6fbae90e52a8acf28bca52d7c 100644 (file)
@@ -60,6 +60,7 @@
 #include "delegation.h"
 #include "iostat.h"
 #include "internal.h"
+#include "fscache.h"
 
 #define NFSDBG_FACILITY                NFSDBG_VFS
 
@@ -76,6 +77,7 @@ enum {
        Opt_rdirplus, Opt_nordirplus,
        Opt_sharecache, Opt_nosharecache,
        Opt_resvport, Opt_noresvport,
+       Opt_fscache, Opt_nofscache,
 
        /* Mount options that take integer arguments */
        Opt_port,
@@ -93,6 +95,7 @@ enum {
        Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
        Opt_addr, Opt_mountaddr, Opt_clientaddr,
        Opt_lookupcache,
+       Opt_fscache_uniq,
 
        /* Special mount options */
        Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -132,6 +135,9 @@ static const match_table_t nfs_mount_option_tokens = {
        { Opt_nosharecache, "nosharecache" },
        { Opt_resvport, "resvport" },
        { Opt_noresvport, "noresvport" },
+       { Opt_fscache, "fsc" },
+       { Opt_fscache_uniq, "fsc=%s" },
+       { Opt_nofscache, "nofsc" },
 
        { Opt_port, "port=%u" },
        { Opt_rsize, "rsize=%u" },
@@ -563,6 +569,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
        if (clp->rpc_ops->version == 4)
                seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr);
 #endif
+       if (nfss->options & NFS_OPTION_FSCACHE)
+               seq_printf(m, ",fsc");
 }
 
 /*
@@ -641,6 +649,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
                        totals.events[i] += stats->events[i];
                for (i = 0; i < __NFSIOS_BYTESMAX; i++)
                        totals.bytes[i] += stats->bytes[i];
+#ifdef CONFIG_NFS_FSCACHE
+               for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+                       totals.fscache[i] += stats->fscache[i];
+#endif
 
                preempt_enable();
        }
@@ -651,6 +663,13 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
        seq_printf(m, "\n\tbytes:\t");
        for (i = 0; i < __NFSIOS_BYTESMAX; i++)
                seq_printf(m, "%Lu ", totals.bytes[i]);
+#ifdef CONFIG_NFS_FSCACHE
+       if (nfss->options & NFS_OPTION_FSCACHE) {
+               seq_printf(m, "\n\tfsc:\t");
+               for (i = 0; i < __NFSIOS_FSCACHEMAX; i++)
+                       seq_printf(m, "%Lu ", totals.fscache[i]);
+       }
+#endif
        seq_printf(m, "\n");
 
        rpc_print_iostats(m, nfss->client);
@@ -1044,6 +1063,24 @@ static int nfs_parse_mount_options(char *raw,
                case Opt_noresvport:
                        mnt->flags |= NFS_MOUNT_NORESVPORT;
                        break;
+               case Opt_fscache:
+                       mnt->options |= NFS_OPTION_FSCACHE;
+                       kfree(mnt->fscache_uniq);
+                       mnt->fscache_uniq = NULL;
+                       break;
+               case Opt_nofscache:
+                       mnt->options &= ~NFS_OPTION_FSCACHE;
+                       kfree(mnt->fscache_uniq);
+                       mnt->fscache_uniq = NULL;
+                       break;
+               case Opt_fscache_uniq:
+                       string = match_strdup(args);
+                       if (!string)
+                               goto out_nomem;
+                       kfree(mnt->fscache_uniq);
+                       mnt->fscache_uniq = string;
+                       mnt->options |= NFS_OPTION_FSCACHE;
+                       break;
 
                /*
                 * options that take numeric values
@@ -1870,8 +1907,6 @@ static void nfs_clone_super(struct super_block *sb,
        nfs_initialise_sb(sb);
 }
 
-#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
-
 static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
 {
        const struct nfs_server *a = s->s_fs_info;
@@ -2036,6 +2071,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
        if (!s->s_root) {
                /* initial superblock/root creation */
                nfs_fill_super(s, data);
+               nfs_fscache_get_super_cookie(s, data);
        }
 
        mntroot = nfs_get_root(s, mntfh);
@@ -2056,6 +2092,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 out:
        kfree(data->nfs_server.hostname);
        kfree(data->mount_server.hostname);
+       kfree(data->fscache_uniq);
        security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
        kfree(mntfh);
@@ -2083,6 +2120,7 @@ static void nfs_kill_super(struct super_block *s)
 
        bdi_unregister(&server->backing_dev_info);
        kill_anon_super(s);
+       nfs_fscache_release_super_cookie(s);
        nfs_free_server(server);
 }
 
@@ -2390,6 +2428,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
        if (!s->s_root) {
                /* initial superblock/root creation */
                nfs4_fill_super(s);
+               nfs_fscache_get_super_cookie(s, data);
        }
 
        mntroot = nfs4_get_root(s, mntfh);
@@ -2411,6 +2450,7 @@ out:
        kfree(data->client_address);
        kfree(data->nfs_server.export_path);
        kfree(data->nfs_server.hostname);
+       kfree(data->fscache_uniq);
        security_free_mnt_opts(&data->lsm_opts);
 out_free_fh:
        kfree(mntfh);
@@ -2437,6 +2477,7 @@ static void nfs4_kill_super(struct super_block *sb)
        kill_anon_super(sb);
 
        nfs4_renewd_prepare_shutdown(server);
+       nfs_fscache_release_super_cookie(sb);
        nfs_free_server(server);
 }
 
index 19e3a96aa02c00f67b281ea2c3ce0cbd4e0e4b29..678a067d9251f12424ee9486cc81960749bc33ec 100644 (file)
@@ -294,6 +294,55 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
        .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 };
 
+static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
+                                         u64 blkno)
+{
+       struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+       dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+       struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+       return le64_to_cpu(dx_root->dr_last_eb_blk);
+}
+
+static void ocfs2_dx_root_update_clusters(struct inode *inode,
+                                         struct ocfs2_extent_tree *et,
+                                         u32 clusters)
+{
+       struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+       le32_add_cpu(&dx_root->dr_clusters, clusters);
+}
+
+static int ocfs2_dx_root_sanity_check(struct inode *inode,
+                                     struct ocfs2_extent_tree *et)
+{
+       struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+       BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
+
+       return 0;
+}
+
+static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
+{
+       struct ocfs2_dx_root_block *dx_root = et->et_object;
+
+       et->et_root_el = &dx_root->dr_list;
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+       .eo_set_last_eb_blk     = ocfs2_dx_root_set_last_eb_blk,
+       .eo_get_last_eb_blk     = ocfs2_dx_root_get_last_eb_blk,
+       .eo_update_clusters     = ocfs2_dx_root_update_clusters,
+       .eo_sanity_check        = ocfs2_dx_root_sanity_check,
+       .eo_fill_root_el        = ocfs2_dx_root_fill_root_el,
+};
+
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
                                     struct inode *inode,
                                     struct buffer_head *bh,
@@ -339,6 +388,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                 &ocfs2_xattr_value_et_ops);
 }
 
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+                                   struct inode *inode,
+                                   struct buffer_head *bh)
+{
+       __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
+                                NULL, &ocfs2_dx_root_et_ops);
+}
+
 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                            u64 new_last_eb_blk)
 {
index cceff5c37f47c5872284d5ab43fafab9f9e27db1..353254ba29e15ca2c3625d6537128eed590d182e 100644 (file)
@@ -75,6 +75,9 @@ struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
                                        struct inode *inode,
                                        struct ocfs2_xattr_value_buf *vb);
+void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
+                                   struct inode *inode,
+                                   struct buffer_head *bh);
 
 /*
  * Read an extent block into *bh.  If *bh is NULL, a bh will be
index 8e1709a679b724aaaf323671308e1cd32399cf4d..b2c52b3a1484f1c57c4b098bf9706fe21c186ac0 100644 (file)
@@ -1956,15 +1956,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 }
 
 const struct address_space_operations ocfs2_aops = {
-       .readpage       = ocfs2_readpage,
-       .readpages      = ocfs2_readpages,
-       .writepage      = ocfs2_writepage,
-       .write_begin    = ocfs2_write_begin,
-       .write_end      = ocfs2_write_end,
-       .bmap           = ocfs2_bmap,
-       .sync_page      = block_sync_page,
-       .direct_IO      = ocfs2_direct_IO,
-       .invalidatepage = ocfs2_invalidatepage,
-       .releasepage    = ocfs2_releasepage,
-       .migratepage    = buffer_migrate_page,
+       .readpage               = ocfs2_readpage,
+       .readpages              = ocfs2_readpages,
+       .writepage              = ocfs2_writepage,
+       .write_begin            = ocfs2_write_begin,
+       .write_end              = ocfs2_write_end,
+       .bmap                   = ocfs2_bmap,
+       .sync_page              = block_sync_page,
+       .direct_IO              = ocfs2_direct_IO,
+       .invalidatepage         = ocfs2_invalidatepage,
+       .releasepage            = ocfs2_releasepage,
+       .migratepage            = buffer_migrate_page,
+       .is_partially_uptodate  = block_is_partially_uptodate,
 };
index 04697ba7f73e6be8ef79904b6f21e70b70ccb287..4f85eceab376015596adbafae5b9eb774211e2b1 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/random.h>
 #include <linux/crc32.h>
 #include <linux/time.h>
+#include <linux/debugfs.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
@@ -60,6 +61,11 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
 static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
 
+#define O2HB_DEBUG_DIR                 "o2hb"
+#define O2HB_DEBUG_LIVENODES           "livenodes"
+static struct dentry *o2hb_debug_dir;
+static struct dentry *o2hb_debug_livenodes;
+
 static LIST_HEAD(o2hb_all_regions);
 
 static struct o2hb_callback {
@@ -905,7 +911,77 @@ static int o2hb_thread(void *data)
        return 0;
 }
 
-void o2hb_init(void)
+#ifdef CONFIG_DEBUG_FS
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+       unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       char *buf = NULL;
+       int i = -1;
+       int out = 0;
+
+       buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!buf)
+               goto bail;
+
+       o2hb_fill_node_map(map, sizeof(map));
+
+       while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+               out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+       out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+
+       i_size_write(inode, out);
+
+       file->private_data = buf;
+
+       return 0;
+bail:
+       return -ENOMEM;
+}
+
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+       return 0;
+}
+
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+                                size_t nbytes, loff_t *ppos)
+{
+       return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+                                      i_size_read(file->f_mapping->host));
+}
+#else
+static int o2hb_debug_open(struct inode *inode, struct file *file)
+{
+       return 0;
+}
+static int o2hb_debug_release(struct inode *inode, struct file *file)
+{
+       return 0;
+}
+static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
+                              size_t nbytes, loff_t *ppos)
+{
+       return 0;
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+static struct file_operations o2hb_debug_fops = {
+       .open =         o2hb_debug_open,
+       .release =      o2hb_debug_release,
+       .read =         o2hb_debug_read,
+       .llseek =       generic_file_llseek,
+};
+
+void o2hb_exit(void)
+{
+       if (o2hb_debug_livenodes)
+               debugfs_remove(o2hb_debug_livenodes);
+       if (o2hb_debug_dir)
+               debugfs_remove(o2hb_debug_dir);
+}
+
+int o2hb_init(void)
 {
        int i;
 
@@ -918,6 +994,24 @@ void o2hb_init(void)
        INIT_LIST_HEAD(&o2hb_node_events);
 
        memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+
+       o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+       if (!o2hb_debug_dir) {
+               mlog_errno(-ENOMEM);
+               return -ENOMEM;
+       }
+
+       o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
+                                                  S_IFREG|S_IRUSR,
+                                                  o2hb_debug_dir, NULL,
+                                                  &o2hb_debug_fops);
+       if (!o2hb_debug_livenodes) {
+               mlog_errno(-ENOMEM);
+               debugfs_remove(o2hb_debug_dir);
+               return -ENOMEM;
+       }
+
+       return 0;
 }
 
 /* if we're already in a callback then we're already serialized by the sem */
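
o2hb_debug_open() above renders the live-node bitmap into a one-page text buffer ("0 3 7 " plus a newline), which o2hb_debug_read() then serves with simple_read_from_buffer(). The bit walk and formatting can be sketched in user space as below; the MAX_NODES value, the bitmap contents and the find_next_set_bit() helper are illustrative stand-ins, only the output format follows the hunk.

/* Sketch only: formats set bits of a node bitmap the way the o2hb
 * "livenodes" debugfs file above does. */
#include <stdio.h>
#include <string.h>

#define MAX_NODES 255	/* illustrative; the kernel uses O2NM_MAX_NODES */

/* Minimal stand-in for the kernel's find_next_bit(). */
static int find_next_set_bit(const unsigned char *map, int size, int start)
{
	for (int i = start; i < size; i++)
		if (map[i / 8] & (1u << (i % 8)))
			return i;
	return size;
}

int main(void)
{
	unsigned char map[(MAX_NODES + 7) / 8] = { 0 };
	char buf[4096];
	int i = -1, out = 0;

	/* Pretend nodes 0, 3 and 7 are heartbeating. */
	map[0] |= (1u << 0) | (1u << 3) | (1u << 7);

	while ((i = find_next_set_bit(map, MAX_NODES, i + 1)) < MAX_NODES)
		out += snprintf(buf + out, sizeof(buf) - out, "%d ", i);
	out += snprintf(buf + out, sizeof(buf) - out, "\n");

	fputs(buf, stdout);	/* prints "0 3 7 " followed by a newline */
	return 0;
}
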
index e511339886b31040e5597695726692b91be07e11..2f1649253b497bed42fc14ef87685e59377dc2be 100644 (file)
@@ -75,7 +75,8 @@ void o2hb_unregister_callback(const char *region_uuid,
                              struct o2hb_callback_func *hc);
 void o2hb_fill_node_map(unsigned long *map,
                        unsigned bytes);
-void o2hb_init(void);
+void o2hb_exit(void);
+int o2hb_init(void);
 int o2hb_check_node_heartbeating(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
index 70e8fa9e2539cdf210676778770e0de4405713e2..7ee6188bc79a76ebc939b738bcfc3914c99b3ef6 100644 (file)
@@ -881,6 +881,7 @@ static void __exit exit_o2nm(void)
        o2cb_sys_shutdown();
 
        o2net_exit();
+       o2hb_exit();
 }
 
 static int __init init_o2nm(void)
@@ -889,11 +890,13 @@ static int __init init_o2nm(void)
 
        cluster_print_version();
 
-       o2hb_init();
+       ret = o2hb_init();
+       if (ret)
+               goto out;
 
        ret = o2net_init();
        if (ret)
-               goto out;
+               goto out_o2hb;
 
        ret = o2net_register_hb_callbacks();
        if (ret)
@@ -916,6 +919,8 @@ out_callbacks:
        o2net_unregister_hb_callbacks();
 out_o2net:
        o2net_exit();
+out_o2hb:
+       o2hb_exit();
 out:
        return ret;
 }
index f2c4098cf337fd077b1d77835144cc454ce5e8dc..e71160cda1100a1d5b9c840fbbbe784058a682fe 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/quotaops.h>
+#include <linux/sort.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -58,6 +59,7 @@
 #include "namei.h"
 #include "suballoc.h"
 #include "super.h"
+#include "sysfile.h"
 #include "uptodate.h"
 
 #include "buffer_head_io.h"
@@ -71,11 +73,6 @@ static unsigned char ocfs2_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ocfs2_extend_dir(struct ocfs2_super *osb,
-                           struct inode *dir,
-                           struct buffer_head *parent_fe_bh,
-                           unsigned int blocks_wanted,
-                           struct buffer_head **new_de_bh);
 static int ocfs2_do_extend_dir(struct super_block *sb,
                               handle_t *handle,
                               struct inode *dir,
@@ -83,22 +80,36 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                               struct ocfs2_alloc_context *data_ac,
                               struct ocfs2_alloc_context *meta_ac,
                               struct buffer_head **new_bh);
+static int ocfs2_dir_indexed(struct inode *inode);
 
 /*
  * These are distinct checks because future versions of the file system will
  * want to have a trailing dirent structure independent of indexing.
  */
-static int ocfs2_dir_has_trailer(struct inode *dir)
+static int ocfs2_supports_dir_trailer(struct inode *dir)
 {
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;
 
-       return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+       return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
 }
 
-static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+/*
+ * "new' here refers to the point at which we're creating a new
+ * directory via "mkdir()", but also when we're expanding an inline
+ * directory. In either case, we don't yet have the indexing bit set
+ * on the directory, so the standard checks will fail in when metaecc
+ * is turned off. Only directory-initialization type functions should
+ * use this then. Everything else wants ocfs2_supports_dir_trailer()
+ */
+static int ocfs2_new_dir_wants_trailer(struct inode *dir)
 {
-       return ocfs2_meta_ecc(osb);
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+       return ocfs2_meta_ecc(osb) ||
+               ocfs2_supports_indexed_dirs(osb);
 }
 
 static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
@@ -130,7 +141,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 {
        unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
 
-       if (!ocfs2_dir_has_trailer(dir))
+       if (!ocfs2_supports_dir_trailer(dir))
                return 0;
 
        if (offset != toff)
@@ -140,7 +151,7 @@ static int ocfs2_skip_dir_trailer(struct inode *dir,
 }
 
 static void ocfs2_init_dir_trailer(struct inode *inode,
-                                  struct buffer_head *bh)
+                                  struct buffer_head *bh, u16 rec_len)
 {
        struct ocfs2_dir_block_trailer *trailer;
 
@@ -150,6 +161,153 @@ static void ocfs2_init_dir_trailer(struct inode *inode,
                        cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
        trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
        trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
+       trailer->db_free_rec_len = cpu_to_le16(rec_len);
+}
+/*
+ * Link an unindexed block with a dir trailer structure into the index free
+ * list. This function will modify dirdata_bh, but assumes you've already
+ * passed it to the journal.
+ */
+static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
+                                    struct buffer_head *dx_root_bh,
+                                    struct buffer_head *dirdata_bh)
+{
+       int ret;
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dir_block_trailer *trailer;
+
+       ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+       trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+       trailer->db_free_next = dx_root->dr_free_blk;
+       dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+
+       ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+       return ret;
+}
+
+static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
+{
+       return res->dl_prev_leaf_bh == NULL;
+}
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
+{
+       brelse(res->dl_dx_root_bh);
+       brelse(res->dl_leaf_bh);
+       brelse(res->dl_dx_leaf_bh);
+       brelse(res->dl_prev_leaf_bh);
+}
+
+static int ocfs2_dir_indexed(struct inode *inode)
+{
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
+               return 1;
+       return 0;
+}
+
+static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
+{
+       return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
+}
+
+/*
+ * Hashing code adapted from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+       __u32   sum = 0;
+       __u32   b0 = buf[0], b1 = buf[1];
+       __u32   a = in[0], b = in[1], c = in[2], d = in[3];
+       int     n = 16;
+
+       do {
+               sum += DELTA;
+               b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+               b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+       } while (--n);
+
+       buf[0] += b0;
+       buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+       __u32   pad, val;
+       int     i;
+
+       pad = (__u32)len | ((__u32)len << 8);
+       pad |= pad << 16;
+
+       val = pad;
+       if (len > num*4)
+               len = num * 4;
+       for (i = 0; i < len; i++) {
+               if ((i % 4) == 0)
+                       val = pad;
+               val = msg[i] + (val << 8);
+               if ((i % 4) == 3) {
+                       *buf++ = val;
+                       val = pad;
+                       num--;
+               }
+       }
+       if (--num >= 0)
+               *buf++ = val;
+       while (--num >= 0)
+               *buf++ = pad;
+}
+
+static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
+                                  struct ocfs2_dx_hinfo *hinfo)
+{
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+       const char      *p;
+       __u32           in[8], buf[4];
+
+       /*
+        * XXX: Is this really necessary, if the index is never looked
+        * at by readdir? Is a hash value of '0' a bad idea?
+        */
+       if ((len == 1 && !strncmp(".", name, 1)) ||
+           (len == 2 && !strncmp("..", name, 2))) {
+               buf[0] = buf[1] = 0;
+               goto out;
+       }
+
+#ifdef OCFS2_DEBUG_DX_DIRS
+       /*
+        * This makes it very easy to debug indexing problems. We
+        * should never allow this to be selected without hand editing
+        * this file though.
+        */
+       buf[0] = buf[1] = len;
+       goto out;
+#endif
+
+       memcpy(buf, osb->osb_dx_seed, sizeof(buf));
+
+       p = name;
+       while (len > 0) {
+               str2hashbuf(p, len, in, 4);
+               TEA_transform(buf, in);
+               len -= 16;
+               p += 16;
+       }
+
+out:
+       hinfo->major_hash = buf[0];
+       hinfo->minor_hash = buf[1];
 }
 
 /*
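
The hashing block above (TEA_transform(), str2hashbuf() and ocfs2_dx_dir_name_hash()) turns a file name into a (major, minor) hash pair that later picks a cluster and a block within it. The routines compile stand-alone; the sketch below copies them with fixed-width C99 types and an all-zero seed in place of osb->osb_dx_seed, so the printed values are only illustrative.

/* Sketch only: user-space copy of the name-hashing code above,
 * with a zeroed seed standing in for osb->osb_dx_seed. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DELTA 0x9E3779B9

static void TEA_transform(uint32_t buf[4], const uint32_t in[])
{
	uint32_t sum = 0;
	uint32_t b0 = buf[0], b1 = buf[1];
	uint32_t a = in[0], b = in[1], c = in[2], d = in[3];
	int n = 16;

	do {
		sum += DELTA;
		b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
		b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
	} while (--n);

	buf[0] += b0;
	buf[1] += b1;
}

static void str2hashbuf(const char *msg, int len, uint32_t *buf, int num)
{
	uint32_t pad, val;
	int i;

	pad = (uint32_t)len | ((uint32_t)len << 8);
	pad |= pad << 16;

	val = pad;
	if (len > num * 4)
		len = num * 4;
	for (i = 0; i < len; i++) {
		if ((i % 4) == 0)
			val = pad;
		val = msg[i] + (val << 8);
		if ((i % 4) == 3) {
			*buf++ = val;
			val = pad;
			num--;
		}
	}
	if (--num >= 0)
		*buf++ = val;
	while (--num >= 0)
		*buf++ = pad;
}

int main(void)
{
	const char *name = "lost+found";
	uint32_t in[8], buf[4] = { 0, 0, 0, 0 };	/* all-zero seed */
	int len = (int)strlen(name);
	const char *p = name;

	while (len > 0) {
		str2hashbuf(p, len, in, 4);
		TEA_transform(buf, in);
		len -= 16;
		p += 16;
	}

	printf("major=0x%08x minor=0x%08x\n",
	       (unsigned)buf[0], (unsigned)buf[1]);
	return 0;
}
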
@@ -311,6 +469,52 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
        return rc;
 }
 
+/*
+ * Validate a directory trailer.
+ *
+ * We check the trailer here rather than in ocfs2_validate_dir_block()
+ * because that function doesn't have the inode to test.
+ */
+static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
+{
+       int rc = 0;
+       struct ocfs2_dir_block_trailer *trailer;
+
+       trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
+       if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+               rc = -EINVAL;
+               ocfs2_error(dir->i_sb,
+                           "Invalid dirblock #%llu: "
+                           "signature = %.*s\n",
+                           (unsigned long long)bh->b_blocknr, 7,
+                           trailer->db_signature);
+               goto out;
+       }
+       if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
+               rc = -EINVAL;
+               ocfs2_error(dir->i_sb,
+                           "Directory block #%llu has an invalid "
+                           "db_blkno of %llu",
+                           (unsigned long long)bh->b_blocknr,
+                           (unsigned long long)le64_to_cpu(trailer->db_blkno));
+               goto out;
+       }
+       if (le64_to_cpu(trailer->db_parent_dinode) !=
+           OCFS2_I(dir)->ip_blkno) {
+               rc = -EINVAL;
+               ocfs2_error(dir->i_sb,
+                           "Directory block #%llu on dinode "
+                           "#%llu has an invalid parent_dinode "
+                           "of %llu",
+                           (unsigned long long)bh->b_blocknr,
+                           (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                           (unsigned long long)le64_to_cpu(trailer->db_blkno));
+               goto out;
+       }
+out:
+       return rc;
+}
+
 /*
  * This function forces all errors to -EIO for consistency with its
  * predecessor, ocfs2_bread().  We haven't audited what returning the
@@ -322,7 +526,6 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 {
        int rc = 0;
        struct buffer_head *tmp = *bh;
-       struct ocfs2_dir_block_trailer *trailer;
 
        rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
                                    ocfs2_validate_dir_block);
@@ -331,42 +534,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
                goto out;
        }
 
-       /*
-        * We check the trailer here rather than in
-        * ocfs2_validate_dir_block() because that function doesn't have
-        * the inode to test.
-        */
        if (!(flags & OCFS2_BH_READAHEAD) &&
-           ocfs2_dir_has_trailer(inode)) {
-               trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
-               if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
-                       rc = -EINVAL;
-                       ocfs2_error(inode->i_sb,
-                                   "Invalid dirblock #%llu: "
-                                   "signature = %.*s\n",
-                                   (unsigned long long)tmp->b_blocknr, 7,
-                                   trailer->db_signature);
-                       goto out;
-               }
-               if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
-                       rc = -EINVAL;
-                       ocfs2_error(inode->i_sb,
-                                   "Directory block #%llu has an invalid "
-                                   "db_blkno of %llu",
-                                   (unsigned long long)tmp->b_blocknr,
-                                   (unsigned long long)le64_to_cpu(trailer->db_blkno));
-                       goto out;
-               }
-               if (le64_to_cpu(trailer->db_parent_dinode) !=
-                   OCFS2_I(inode)->ip_blkno) {
-                       rc = -EINVAL;
-                       ocfs2_error(inode->i_sb,
-                                   "Directory block #%llu on dinode "
-                                   "#%llu has an invalid parent_dinode "
-                                   "of %llu",
-                                   (unsigned long long)tmp->b_blocknr,
-                                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                                   (unsigned long long)le64_to_cpu(trailer->db_blkno));
+           ocfs2_supports_dir_trailer(inode)) {
+               rc = ocfs2_check_dir_trailer(inode, tmp);
+               if (rc) {
+                       if (!*bh)
+                               brelse(tmp);
+                       mlog_errno(rc);
                        goto out;
                }
        }
@@ -379,6 +553,141 @@ out:
        return rc ? -EIO : 0;
 }
 
+/*
+ * Read the block at 'phys' which belongs to this directory
+ * inode. This function does no virtual->physical block translation -
+ * what's passed in is assumed to be a valid directory block.
+ */
+static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
+                                      struct buffer_head **bh)
+{
+       int ret;
+       struct buffer_head *tmp = *bh;
+
+       ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (ocfs2_supports_dir_trailer(dir)) {
+               ret = ocfs2_check_dir_trailer(dir, tmp);
+               if (ret) {
+                       if (!*bh)
+                               brelse(tmp);
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       if (!ret && !*bh)
+               *bh = tmp;
+out:
+       return ret;
+}
+
+static int ocfs2_validate_dx_root(struct super_block *sb,
+                                 struct buffer_head *bh)
+{
+       int ret;
+       struct ocfs2_dx_root_block *dx_root;
+
+       BUG_ON(!buffer_uptodate(bh));
+
+       dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
+
+       ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
+       if (ret) {
+               mlog(ML_ERROR,
+                    "Checksum failed for dir index root block %llu\n",
+                    (unsigned long long)bh->b_blocknr);
+               return ret;
+       }
+
+       if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
+               ocfs2_error(sb,
+                           "Dir Index Root # %llu has bad signature %.*s",
+                           (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+                           7, dx_root->dr_signature);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
+                             struct buffer_head **dx_root_bh)
+{
+       int ret;
+       u64 blkno = le64_to_cpu(di->i_dx_root);
+       struct buffer_head *tmp = *dx_root_bh;
+
+       ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root);
+
+       /* If ocfs2_read_block() got us a new bh, pass it up. */
+       if (!ret && !*dx_root_bh)
+               *dx_root_bh = tmp;
+
+       return ret;
+}
+
+static int ocfs2_validate_dx_leaf(struct super_block *sb,
+                                 struct buffer_head *bh)
+{
+       int ret;
+       struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
+
+       BUG_ON(!buffer_uptodate(bh));
+
+       ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
+       if (ret) {
+               mlog(ML_ERROR,
+                    "Checksum failed for dir index leaf block %llu\n",
+                    (unsigned long long)bh->b_blocknr);
+               return ret;
+       }
+
+       if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
+               ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
+                           7, dx_leaf->dl_signature);
+               return -EROFS;
+       }
+
+       return 0;
+}
+
+static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
+                             struct buffer_head **dx_leaf_bh)
+{
+       int ret;
+       struct buffer_head *tmp = *dx_leaf_bh;
+
+       ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf);
+
+       /* If ocfs2_read_block() got us a new bh, pass it up. */
+       if (!ret && !*dx_leaf_bh)
+               *dx_leaf_bh = tmp;
+
+       return ret;
+}
+
+/*
+ * Read a series of dx_leaf blocks. This expects all buffer_head
+ * pointers to be NULL on function entry.
+ */
+static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
+                               struct buffer_head **dx_leaf_bhs)
+{
+       int ret;
+
+       ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0,
+                               ocfs2_validate_dx_leaf);
+       if (ret)
+               mlog_errno(ret);
+
+       return ret;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
                                               struct inode *dir,
                                               struct ocfs2_dir_entry **res_dir)
@@ -480,87 +789,392 @@ cleanup_and_exit:
        return ret;
 }
 
-/*
- * Try to find an entry of the provided name within 'dir'.
- *
- * If nothing was found, NULL is returned. Otherwise, a buffer_head
- * and pointer to the dir entry are passed back.
- *
- * Caller can NOT assume anything about the contents of the
- * buffer_head - it is passed back only so that it can be passed into
- * any one of the manipulation functions (add entry, delete entry,
- * etc). As an example, bh in the extent directory case is a data
- * block, in the inline-data case it actually points to an inode.
- */
-struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
-                                    struct inode *dir,
-                                    struct ocfs2_dir_entry **res_dir)
+static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
+                                  struct ocfs2_extent_list *el,
+                                  u32 major_hash,
+                                  u32 *ret_cpos,
+                                  u64 *ret_phys_blkno,
+                                  unsigned int *ret_clen)
 {
-       *res_dir = NULL;
+       int ret = 0, i, found;
+       struct buffer_head *eb_bh = NULL;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_extent_rec *rec = NULL;
 
-       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-               return ocfs2_find_entry_id(name, namelen, dir, res_dir);
+       if (el->l_tree_depth) {
+               ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+               el = &eb->h_list;
+
+               if (el->l_tree_depth) {
+                       ocfs2_error(inode->i_sb,
+                                   "Inode %lu has non zero tree depth in "
+                                   "btree tree block %llu\n", inode->i_ino,
+                                   (unsigned long long)eb_bh->b_blocknr);
+                       ret = -EROFS;
+                       goto out;
+               }
+       }
+
+       found = 0;
+       for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+               rec = &el->l_recs[i];
+
+               if (le32_to_cpu(rec->e_cpos) <= major_hash) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (!found) {
+               ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+                           "record (%u, %u, 0) in btree", inode->i_ino,
+                           le32_to_cpu(rec->e_cpos),
+                           ocfs2_rec_clusters(el, rec));
+               ret = -EROFS;
+               goto out;
+       }
+
+       if (ret_phys_blkno)
+               *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
+       if (ret_cpos)
+               *ret_cpos = le32_to_cpu(rec->e_cpos);
+       if (ret_clen)
+               *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
 
-       return ocfs2_find_entry_el(name, namelen, dir, res_dir);
+out:
+       brelse(eb_bh);
+       return ret;
 }
 
 /*
- * Update inode number and type of a previously found directory entry.
+ * Returns the block index, counted from the start of the cluster to
+ * which this hash belongs.
  */
-int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-                      struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
-                      struct inode *new_entry_inode)
+static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+                                                  u32 minor_hash)
 {
-       int ret;
-       ocfs2_journal_access_func access = ocfs2_journal_access_db;
+       return minor_hash & osb->osb_dx_mask;
+}
 
-       /*
-        * The same code works fine for both inline-data and extent
-        * based directories, so no need to split this up.  The only
-        * difference is the journal_access function.
-        */
+static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
+                                         struct ocfs2_dx_hinfo *hinfo)
+{
+       return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
+}
 
-       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-               access = ocfs2_journal_access_di;
+static int ocfs2_dx_dir_lookup(struct inode *inode,
+                              struct ocfs2_extent_list *el,
+                              struct ocfs2_dx_hinfo *hinfo,
+                              u32 *ret_cpos,
+                              u64 *ret_phys_blkno)
+{
+       int ret = 0;
+       unsigned int cend, uninitialized_var(clen);
+       u32 uninitialized_var(cpos);
+       u64 uninitialized_var(blkno);
+       u32 name_hash = hinfo->major_hash;
 
-       ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+       ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
+                                     &clen);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
 
-       de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
-       ocfs2_set_de_type(de, new_entry_inode->i_mode);
+       cend = cpos + clen;
+       if (name_hash >= cend) {
+               /* We want the last cluster */
+               blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
+               cpos += clen - 1;
+       } else {
+               blkno += ocfs2_clusters_to_blocks(inode->i_sb,
+                                                 name_hash - cpos);
+               cpos = name_hash;
+       }
 
-       ocfs2_journal_dirty(handle, de_bh);
+       /*
+        * We now have the cluster which should hold our entry. To
+        * find the exact block from the start of the cluster to
+        * search, we take the lower bits of the hash.
+        */
+       blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
+
+       if (ret_phys_blkno)
+               *ret_phys_blkno = blkno;
+       if (ret_cpos)
+               *ret_cpos = cpos;
 
 out:
+
        return ret;
 }
 
-static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
-                               struct ocfs2_dir_entry *de_del,
-                               struct buffer_head *bh, char *first_de,
-                               unsigned int bytes)
+static int ocfs2_dx_dir_search(const char *name, int namelen,
+                              struct inode *dir,
+                              struct ocfs2_dx_root_block *dx_root,
+                              struct ocfs2_dir_lookup_result *res)
 {
-       struct ocfs2_dir_entry *de, *pde;
-       int i, status = -ENOENT;
-       ocfs2_journal_access_func access = ocfs2_journal_access_db;
+       int ret, i, found;
+       u64 uninitialized_var(phys);
+       struct buffer_head *dx_leaf_bh = NULL;
+       struct ocfs2_dx_leaf *dx_leaf;
+       struct ocfs2_dx_entry *dx_entry = NULL;
+       struct buffer_head *dir_ent_bh = NULL;
+       struct ocfs2_dir_entry *dir_ent = NULL;
+       struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
+       struct ocfs2_extent_list *dr_el;
+       struct ocfs2_dx_entry_list *entry_list;
+
+       ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
+
+       if (ocfs2_dx_root_inline(dx_root)) {
+               entry_list = &dx_root->dr_entries;
+               goto search;
+       }
 
-       mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+       dr_el = &dx_root->dr_list;
 
-       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-               access = ocfs2_journal_access_di;
+       ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
 
-       i = 0;
-       pde = NULL;
-       de = (struct ocfs2_dir_entry *) first_de;
-       while (i < bytes) {
-               if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
-                       status = -EIO;
-                       mlog_errno(status);
-                       goto bail;
-               }
+       mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
+            "returns: %llu\n",
+            (unsigned long long)OCFS2_I(dir)->ip_blkno,
+            namelen, name, hinfo->major_hash, hinfo->minor_hash,
+            (unsigned long long)phys);
+
+       ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
+
+       mlog(0, "leaf info: num_used: %d, count: %d\n",
+            le16_to_cpu(dx_leaf->dl_list.de_num_used),
+            le16_to_cpu(dx_leaf->dl_list.de_count));
+
+       entry_list = &dx_leaf->dl_list;
+
+search:
+       /*
+        * Empty leaf is legal, so no need to check for that.
+        */
+       found = 0;
+       for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+               dx_entry = &entry_list->de_entries[i];
+
+               if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
+                   || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
+                       continue;
+
+               /*
+                * Search unindexed leaf block now. We're not
+                * guaranteed to find anything.
+                */
+               ret = ocfs2_read_dir_block_direct(dir,
+                                         le64_to_cpu(dx_entry->dx_dirent_blk),
+                                         &dir_ent_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               /*
+                * XXX: We should check the unindexed block here,
+                * before using it.
+                */
+
+               found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
+                                             0, dir_ent_bh->b_data,
+                                             dir->i_sb->s_blocksize, &dir_ent);
+               if (found == 1)
+                       break;
+
+               if (found == -1) {
+                       /* This means we found a bad directory entry. */
+                       ret = -EIO;
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               brelse(dir_ent_bh);
+               dir_ent_bh = NULL;
+       }
+
+       if (found <= 0) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       res->dl_leaf_bh = dir_ent_bh;
+       res->dl_entry = dir_ent;
+       res->dl_dx_leaf_bh = dx_leaf_bh;
+       res->dl_dx_entry = dx_entry;
+
+       ret = 0;
+out:
+       if (ret) {
+               brelse(dx_leaf_bh);
+               brelse(dir_ent_bh);
+       }
+       return ret;
+}
+
+static int ocfs2_find_entry_dx(const char *name, int namelen,
+                              struct inode *dir,
+                              struct ocfs2_dir_lookup_result *lookup)
+{
+       int ret;
+       struct buffer_head *di_bh = NULL;
+       struct ocfs2_dinode *di;
+       struct buffer_head *dx_root_bh = NULL;
+       struct ocfs2_dx_root_block *dx_root;
+
+       ret = ocfs2_read_inode_block(dir, &di_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       di = (struct ocfs2_dinode *)di_bh->b_data;
+
+       ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+       dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+       ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
+       if (ret) {
+               if (ret != -ENOENT)
+                       mlog_errno(ret);
+               goto out;
+       }
+
+       lookup->dl_dx_root_bh = dx_root_bh;
+       dx_root_bh = NULL;
+out:
+       brelse(di_bh);
+       brelse(dx_root_bh);
+       return ret;
+}
+
+/*
+ * Try to find an entry of the provided name within 'dir'.
+ *
+ * If nothing was found, -ENOENT is returned. Otherwise, zero is
+ * returned and the struct 'res' will contain information useful to
+ * other directory manipulation functions.
+ *
+ * Caller can NOT assume anything about the contents of the
+ * buffer_heads - they are passed back only so that it can be passed
+ * into any one of the manipulation functions (add entry, delete
+ * entry, etc). As an example, bh in the extent directory case is a
+ * data block, in the inline-data case it actually points to an inode,
+ * in the indexed directory case, multiple buffers are involved.
+ */
+int ocfs2_find_entry(const char *name, int namelen,
+                    struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
+{
+       struct buffer_head *bh;
+       struct ocfs2_dir_entry *res_dir = NULL;
+
+       if (ocfs2_dir_indexed(dir))
+               return ocfs2_find_entry_dx(name, namelen, dir, lookup);
+
+       /*
+        * The unindexed dir code only uses part of the lookup
+        * structure, so there's no reason to push it down further
+        * than this.
+        */
+       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
+       else
+               bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
+
+       if (bh == NULL)
+               return -ENOENT;
+
+       lookup->dl_leaf_bh = bh;
+       lookup->dl_entry = res_dir;
+       return 0;
+}
+
+/*
+ * Update inode number and type of a previously found directory entry.
+ */
+int ocfs2_update_entry(struct inode *dir, handle_t *handle,
+                      struct ocfs2_dir_lookup_result *res,
+                      struct inode *new_entry_inode)
+{
+       int ret;
+       ocfs2_journal_access_func access = ocfs2_journal_access_db;
+       struct ocfs2_dir_entry *de = res->dl_entry;
+       struct buffer_head *de_bh = res->dl_leaf_bh;
+
+       /*
+        * The same code works fine for both inline-data and extent
+        * based directories, so no need to split this up.  The only
+        * difference is the journal_access function.
+        */
+
+       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               access = ocfs2_journal_access_di;
+
+       ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
+       ocfs2_set_de_type(de, new_entry_inode->i_mode);
+
+       ocfs2_journal_dirty(handle, de_bh);
+
+out:
+       return ret;
+}
+
+/*
+ * __ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
+                               struct ocfs2_dir_entry *de_del,
+                               struct buffer_head *bh, char *first_de,
+                               unsigned int bytes)
+{
+       struct ocfs2_dir_entry *de, *pde;
+       int i, status = -ENOENT;
+       ocfs2_journal_access_func access = ocfs2_journal_access_db;
+
+       mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+
+       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               access = ocfs2_journal_access_di;
+
+       i = 0;
+       pde = NULL;
+       de = (struct ocfs2_dir_entry *) first_de;
+       while (i < bytes) {
+               if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+                       status = -EIO;
+                       mlog_errno(status);
+                       goto bail;
+               }
                if (de == de_del)  {
                        status = access(handle, dir, bh,
                                        OCFS2_JOURNAL_ACCESS_WRITE);
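
ocfs2_dx_dir_lookup() in the hunk above converts the hash pair into a physical block in three steps: the extent record supplies (cpos, clen, blkno) for a run of clusters, the major hash selects a cluster within that run (clamped to the last cluster when it falls past the end), and the low bits of the minor hash select the block inside that cluster. A small sketch of that arithmetic, with an assumed geometry of 4 blocks per cluster standing in for ocfs2_clusters_to_blocks() and osb->osb_dx_mask:

/* Sketch only: the cluster/block selection arithmetic from
 * ocfs2_dx_dir_lookup() above, with illustrative geometry. */
#include <stdint.h>
#include <stdio.h>

#define BLOCKS_PER_CLUSTER 4u			/* assumed geometry */
#define DX_MASK (BLOCKS_PER_CLUSTER - 1)	/* mirrors osb->osb_dx_mask */

static uint64_t dx_dir_lookup_blkno(uint32_t cpos, uint32_t clen,
				    uint64_t blkno,
				    uint32_t major_hash, uint32_t minor_hash)
{
	uint32_t cend = cpos + clen;

	if (major_hash >= cend)
		/* hash falls past the record: use its last cluster */
		blkno += (uint64_t)(clen - 1) * BLOCKS_PER_CLUSTER;
	else
		blkno += (uint64_t)(major_hash - cpos) * BLOCKS_PER_CLUSTER;

	/* low bits of the minor hash pick the block inside the cluster */
	return blkno + (minor_hash & DX_MASK);
}

int main(void)
{
	/* Extent record covering clusters [100, 108) starting at block 4096. */
	printf("in range: %llu\n",
	       (unsigned long long)dx_dir_lookup_blkno(100, 8, 4096, 105, 0x2a));
	printf("past end: %llu\n",
	       (unsigned long long)dx_dir_lookup_blkno(100, 8, 4096, 500, 0x2a));
	return 0;
}
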
@@ -587,6 +1201,181 @@ bail:
        return status;
 }
 
+static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
+{
+       unsigned int hole;
+
+       if (le64_to_cpu(de->inode) == 0)
+               hole = le16_to_cpu(de->rec_len);
+       else
+               hole = le16_to_cpu(de->rec_len) -
+                       OCFS2_DIR_REC_LEN(de->name_len);
+
+       return hole;
+}
+
+static int ocfs2_find_max_rec_len(struct super_block *sb,
+                                 struct buffer_head *dirblock_bh)
+{
+       int size, this_hole, largest_hole = 0;
+       char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
+       struct ocfs2_dir_entry *de;
+
+       trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
+       size = ocfs2_dir_trailer_blk_off(sb);
+       limit = start + size;
+       de_buf = start;
+       de = (struct ocfs2_dir_entry *)de_buf;
+       do {
+               if (de_buf != trailer) {
+                       this_hole = ocfs2_figure_dirent_hole(de);
+                       if (this_hole > largest_hole)
+                               largest_hole = this_hole;
+               }
+
+               de_buf += le16_to_cpu(de->rec_len);
+               de = (struct ocfs2_dir_entry *)de_buf;
+       } while (de_buf < limit);
+
+       if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+               return largest_hole;
+       return 0;
+}
+
+static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
+                                      int index)
+{
+       int num_used = le16_to_cpu(entry_list->de_num_used);
+
+       if (num_used == 1 || index == (num_used - 1))
+               goto clear;
+
+       memmove(&entry_list->de_entries[index],
+               &entry_list->de_entries[index + 1],
+               (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
+clear:
+       num_used--;
+       memset(&entry_list->de_entries[num_used], 0,
+              sizeof(struct ocfs2_dx_entry));
+       entry_list->de_num_used = cpu_to_le16(num_used);
+}
+
+static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
+                                struct ocfs2_dir_lookup_result *lookup)
+{
+       int ret, index, max_rec_len, add_to_free_list = 0;
+       struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+       struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
+       struct ocfs2_dx_leaf *dx_leaf;
+       struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
+       struct ocfs2_dir_block_trailer *trailer;
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dx_entry_list *entry_list;
+
+       /*
+        * This function gets a bit messy because we might have to
+        * modify the root block, regardless of whether the indexed
+        * entries are stored inline.
+        */
+
+       /*
+        * *Only* set 'entry_list' here, based on where we're looking
+        * for the indexed entries. Later, we might still want to
+        * journal both blocks, based on free list state.
+        */
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+       if (ocfs2_dx_root_inline(dx_root)) {
+               entry_list = &dx_root->dr_entries;
+       } else {
+               dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
+               entry_list = &dx_leaf->dl_list;
+       }
+
+       /* Neither of these are a disk corruption - that should have
+        * been caught by lookup, before we got here. */
+       BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
+       BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
+
+       index = (char *)dx_entry - (char *)entry_list->de_entries;
+       index /= sizeof(*dx_entry);
+
+       if (index >= le16_to_cpu(entry_list->de_num_used)) {
+               mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
+                    (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
+                    entry_list, dx_entry);
+               return -EIO;
+       }
+
+       /*
+        * We know that removal of this dirent will leave enough room
+        * for a new one, so add this block to the free list if it
+        * isn't already there.
+        */
+       trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+       if (trailer->db_free_rec_len == 0)
+               add_to_free_list = 1;
+
+       /*
+        * Add the block holding our index into the journal before
+        * removing the unindexed entry. If we get an error return
+        * from __ocfs2_delete_entry(), then it hasn't removed the
+        * entry yet. Likewise, successful return means we *must*
+        * remove the indexed entry.
+        *
+        * We're also careful to journal the root tree block here as
+        * the entry count needs to be updated. Also, we might be
+        * adding to the start of the free list.
+        */
+       ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (!ocfs2_dx_root_inline(dx_root)) {
+               ret = ocfs2_journal_access_dl(handle, dir,
+                                             lookup->dl_dx_leaf_bh,
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       mlog(0, "Dir %llu: delete entry at index: %d\n",
+            (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+
+       ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
+                                  leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
+       trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+       if (add_to_free_list) {
+               trailer->db_free_next = dx_root->dr_free_blk;
+               dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
+               ocfs2_journal_dirty(handle, dx_root_bh);
+       }
+
+       /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
+       ocfs2_journal_dirty(handle, leaf_bh);
+
+       le32_add_cpu(&dx_root->dr_num_entries, -1);
+       ocfs2_journal_dirty(handle, dx_root_bh);
+
+       ocfs2_dx_list_remove_entry(entry_list, index);
+
+       if (!ocfs2_dx_root_inline(dx_root))
+               ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
+
+out:
+       return ret;
+}
+
 static inline int ocfs2_delete_entry_id(handle_t *handle,
                                        struct inode *dir,
                                        struct ocfs2_dir_entry *de_del,
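
ocfs2_dx_list_remove_entry() in the hunk above frees one slot of the packed dx entry array: unless the victim is already the last used slot, the tail is shifted down with memmove() and the newly unused last slot is zeroed. The same move on a plain array, with a toy entry type standing in for struct ocfs2_dx_entry:

/* Sketch only: the packed-array removal done by
 * ocfs2_dx_list_remove_entry() above, on a toy entry type. */
#include <stdio.h>
#include <string.h>

struct toy_entry {		/* stands in for struct ocfs2_dx_entry */
	unsigned int major_hash;
	unsigned long long dirent_blk;
};

static void list_remove_entry(struct toy_entry *entries, int *num_used,
			      int index)
{
	int used = *num_used;

	/* Shift the tail down unless we're removing the last used slot. */
	if (used > 1 && index != used - 1)
		memmove(&entries[index], &entries[index + 1],
			(size_t)(used - index - 1) * sizeof(*entries));

	used--;
	memset(&entries[used], 0, sizeof(*entries));	/* clear freed slot */
	*num_used = used;
}

int main(void)
{
	struct toy_entry e[4] = {
		{ 0x10, 100 }, { 0x20, 200 }, { 0x30, 300 }, { 0x40, 400 },
	};
	int num_used = 4;

	list_remove_entry(e, &num_used, 1);	/* drop the 0x20 entry */

	for (int i = 0; i < num_used; i++)
		printf("entry %d: hash=0x%x blk=%llu\n", i,
		       e[i].major_hash, e[i].dirent_blk);
	return 0;
}
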
@@ -624,18 +1413,22 @@ static inline int ocfs2_delete_entry_el(handle_t *handle,
 }
 
 /*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * Delete a directory entry. Hide the details of directory
+ * implementation from the caller.
  */
 int ocfs2_delete_entry(handle_t *handle,
                       struct inode *dir,
-                      struct ocfs2_dir_entry *de_del,
-                      struct buffer_head *bh)
+                      struct ocfs2_dir_lookup_result *res)
 {
+       if (ocfs2_dir_indexed(dir))
+               return ocfs2_delete_entry_dx(handle, dir, res);
+
        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-               return ocfs2_delete_entry_id(handle, dir, de_del, bh);
+               return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
+                                            res->dl_leaf_bh);
 
-       return ocfs2_delete_entry_el(handle, dir, de_del, bh);
+       return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
+                                    res->dl_leaf_bh);
 }
 
 /*
@@ -663,45 +1456,218 @@ static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
        return 0;
 }
 
-/* we don't always have a dentry for what we want to add, so people
- * like orphan dir can call this instead.
- *
- * If you pass me insert_bh, I'll skip the search of the other dir
- * blocks and put the record in there.
- */
-int __ocfs2_add_entry(handle_t *handle,
-                     struct inode *dir,
-                     const char *name, int namelen,
-                     struct inode *inode, u64 blkno,
-                     struct buffer_head *parent_fe_bh,
-                     struct buffer_head *insert_bh)
+static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
+                                         struct ocfs2_dx_entry *dx_new_entry)
 {
-       unsigned long offset;
-       unsigned short rec_len;
-       struct ocfs2_dir_entry *de, *de1;
-       struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
-       struct super_block *sb = dir->i_sb;
-       int retval, status;
-       unsigned int size = sb->s_blocksize;
-       char *data_start = insert_bh->b_data;
+       int i;
 
-       mlog_entry_void();
+       i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+       dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
 
-       if (!namelen)
-               return -EINVAL;
+       le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
+}
 
-       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-               data_start = di->id2.i_data.id_data;
-               size = i_size_read(dir);
+static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
+                                      struct ocfs2_dx_hinfo *hinfo,
+                                      u64 dirent_blk)
+{
+       int i;
+       struct ocfs2_dx_entry *dx_entry;
 
-               BUG_ON(insert_bh != parent_fe_bh);
-       }
+       i = le16_to_cpu(entry_list->de_num_used);
+       dx_entry = &entry_list->de_entries[i];
 
-       rec_len = OCFS2_DIR_REC_LEN(namelen);
-       offset = 0;
-       de = (struct ocfs2_dir_entry *) data_start;
-       while (1) {
-               BUG_ON((char *)de >= (size + data_start));
+       memset(dx_entry, 0, sizeof(*dx_entry));
+       dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
+       dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
+       dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
+
+       le16_add_cpu(&entry_list->de_num_used, 1);
+}
+
+static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
+                                     struct ocfs2_dx_hinfo *hinfo,
+                                     u64 dirent_blk,
+                                     struct buffer_head *dx_leaf_bh)
+{
+       int ret;
+       struct ocfs2_dx_leaf *dx_leaf;
+
+       ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+       ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
+       ocfs2_journal_dirty(handle, dx_leaf_bh);
+
+out:
+       return ret;
+}
+
+static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
+                                       struct ocfs2_dx_hinfo *hinfo,
+                                       u64 dirent_blk,
+                                       struct ocfs2_dx_root_block *dx_root)
+{
+       ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
+}
+
+static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
+                              struct ocfs2_dir_lookup_result *lookup)
+{
+       int ret = 0;
+       struct ocfs2_dx_root_block *dx_root;
+       struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+       ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
+       if (ocfs2_dx_root_inline(dx_root)) {
+               ocfs2_dx_inline_root_insert(dir, handle,
+                                           &lookup->dl_hinfo,
+                                           lookup->dl_leaf_bh->b_blocknr,
+                                           dx_root);
+       } else {
+               ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
+                                                lookup->dl_leaf_bh->b_blocknr,
+                                                lookup->dl_dx_leaf_bh);
+               if (ret)
+                       goto out;
+       }
+
+       le32_add_cpu(&dx_root->dr_num_entries, 1);
+       ocfs2_journal_dirty(handle, dx_root_bh);
+
+out:
+       return ret;
+}
+
+static void ocfs2_remove_block_from_free_list(struct inode *dir,
+                                      handle_t *handle,
+                                      struct ocfs2_dir_lookup_result *lookup)
+{
+       struct ocfs2_dir_block_trailer *trailer, *prev;
+       struct ocfs2_dx_root_block *dx_root;
+       struct buffer_head *bh;
+
+       trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+
+       if (ocfs2_free_list_at_root(lookup)) {
+               bh = lookup->dl_dx_root_bh;
+               dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
+               dx_root->dr_free_blk = trailer->db_free_next;
+       } else {
+               bh = lookup->dl_prev_leaf_bh;
+               prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
+               prev->db_free_next = trailer->db_free_next;
+       }
+
+       trailer->db_free_rec_len = cpu_to_le16(0);
+       trailer->db_free_next = cpu_to_le64(0);
+
+       ocfs2_journal_dirty(handle, bh);
+       ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+}
+
+/*
+ * This expects that a journal write has been reserved on
+ * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
+ */
+static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
+                                  struct ocfs2_dir_lookup_result *lookup)
+{
+       int max_rec_len;
+       struct ocfs2_dir_block_trailer *trailer;
+
+       /* Walk dl_leaf_bh to figure out what the new free rec_len is. */
+       max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
+       if (max_rec_len) {
+               /*
+                * There's still room in this block, so no need to remove it
+                * from the free list. In this case, we just want to update
+                * the rec len accounting.
+                */
+               trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
+               trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
+               ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
+       } else {
+               ocfs2_remove_block_from_free_list(dir, handle, lookup);
+       }
+}
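
ocfs2_recalc_free_list() above keeps the per-directory list of blocks with free dirent space honest after an insert: if the block still has room it only refreshes the recorded rec_len, otherwise the block is unlinked from the singly linked free list whose head lives in the dx root. A simplified userspace model of that bookkeeping follows -- the structures are invented, and it scans the list to find the predecessor where the patch instead pre-reserves the previous block directly:

	/* Minimal model of the per-directory free-space list; illustrative
	 * names and layout, not the ocfs2 on-disk format. */
	#include <stdio.h>
	#include <stddef.h>

	struct blk {
		unsigned free_rec_len;	/* largest hole left in this block */
		struct blk *free_next;	/* next block with free space */
	};

	/* After an insert into 'b', either refresh its free length or drop it
	 * from the list (head is the list anchor, e.g. the index root). */
	static void recalc_free_list(struct blk **head, struct blk *b,
				     unsigned new_max_hole)
	{
		struct blk **p;

		if (new_max_hole) {
			b->free_rec_len = new_max_hole;	/* still usable, keep it */
			return;
		}
		for (p = head; *p; p = &(*p)->free_next) {
			if (*p == b) {
				*p = b->free_next;	/* unlink the full block */
				b->free_next = NULL;
				b->free_rec_len = 0;
				return;
			}
		}
	}

	int main(void)
	{
		struct blk b2 = { 40, NULL }, b1 = { 12, &b2 };
		struct blk *head = &b1;

		recalc_free_list(&head, &b1, 0);	/* b1 is now full */
		printf("head free_rec_len = %u\n", head->free_rec_len);
		return 0;
	}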
+
+/* we don't always have a dentry for what we want to add, so people
+ * like orphan dir can call this instead.
+ *
+ * The lookup context must have been filled from
+ * ocfs2_prepare_dir_for_insert.
+ */
+int __ocfs2_add_entry(handle_t *handle,
+                     struct inode *dir,
+                     const char *name, int namelen,
+                     struct inode *inode, u64 blkno,
+                     struct buffer_head *parent_fe_bh,
+                     struct ocfs2_dir_lookup_result *lookup)
+{
+       unsigned long offset;
+       unsigned short rec_len;
+       struct ocfs2_dir_entry *de, *de1;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+       struct super_block *sb = dir->i_sb;
+       int retval, status;
+       unsigned int size = sb->s_blocksize;
+       struct buffer_head *insert_bh = lookup->dl_leaf_bh;
+       char *data_start = insert_bh->b_data;
+
+       mlog_entry_void();
+
+       if (!namelen)
+               return -EINVAL;
+
+       if (ocfs2_dir_indexed(dir)) {
+               struct buffer_head *bh;
+
+               /*
+                * An indexed dir may require that we update the free space
+                * list. Reserve a write to the previous node in the list so
+                * that we don't fail later.
+                *
+                * XXX: This can be either a dx_root_block, or an unindexed
+                * directory tree leaf block.
+                */
+               if (ocfs2_free_list_at_root(lookup)) {
+                       bh = lookup->dl_dx_root_bh;
+                       retval = ocfs2_journal_access_dr(handle, dir, bh,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
+               } else {
+                       bh = lookup->dl_prev_leaf_bh;
+                       retval = ocfs2_journal_access_db(handle, dir, bh,
+                                                OCFS2_JOURNAL_ACCESS_WRITE);
+               }
+               if (retval) {
+                       mlog_errno(retval);
+                       return retval;
+               }
+       } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               data_start = di->id2.i_data.id_data;
+               size = i_size_read(dir);
+
+               BUG_ON(insert_bh != parent_fe_bh);
+       }
+
+       rec_len = OCFS2_DIR_REC_LEN(namelen);
+       offset = 0;
+       de = (struct ocfs2_dir_entry *) data_start;
+       while (1) {
+               BUG_ON((char *)de >= (size + data_start));
 
                /* These checks should've already been passed by the
                 * prepare function, but I guess we can leave them
@@ -737,10 +1703,22 @@ int __ocfs2_add_entry(handle_t *handle,
                                status = ocfs2_journal_access_di(handle, dir,
                                                                 insert_bh,
                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
-                       else
+                       else {
                                status = ocfs2_journal_access_db(handle, dir,
                                                                 insert_bh,
-                                                                OCFS2_JOURNAL_ACCESS_WRITE);
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+
+                               if (ocfs2_dir_indexed(dir)) {
+                                       status = ocfs2_dx_dir_insert(dir,
+                                                               handle,
+                                                               lookup);
+                                       if (status) {
+                                               mlog_errno(status);
+                                               goto bail;
+                                       }
+                               }
+                       }
+
                        /* By now the buffer is marked for journaling */
                        offset += le16_to_cpu(de->rec_len);
                        if (le64_to_cpu(de->inode)) {
@@ -761,6 +1739,9 @@ int __ocfs2_add_entry(handle_t *handle,
                        de->name_len = namelen;
                        memcpy(de->name, name, namelen);
 
+                       if (ocfs2_dir_indexed(dir))
+                               ocfs2_recalc_free_list(dir, handle, lookup);
+
                        dir->i_version++;
                        status = ocfs2_journal_dirty(handle, insert_bh);
                        retval = 0;
@@ -870,6 +1851,10 @@ out:
        return 0;
 }
 
+/*
+ * NOTE: This function can be called against both unindexed and
+ * indexed directories.
+ */
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
                                    u64 *f_version,
                                    loff_t *f_pos, void *priv,
@@ -1071,31 +2056,22 @@ int ocfs2_find_files_on_disk(const char *name,
                             int namelen,
                             u64 *blkno,
                             struct inode *inode,
-                            struct buffer_head **dirent_bh,
-                            struct ocfs2_dir_entry **dirent)
+                            struct ocfs2_dir_lookup_result *lookup)
 {
        int status = -ENOENT;
 
-       mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
-                  namelen, name, blkno, inode, dirent_bh, dirent);
+       mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
+            (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-       *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
-       if (!*dirent_bh || !*dirent) {
-               status = -ENOENT;
+       status = ocfs2_find_entry(name, namelen, inode, lookup);
+       if (status)
                goto leave;
-       }
 
-       *blkno = le64_to_cpu((*dirent)->inode);
+       *blkno = le64_to_cpu(lookup->dl_entry->inode);
 
        status = 0;
 leave:
-       if (status < 0) {
-               *dirent = NULL;
-               brelse(*dirent_bh);
-               *dirent_bh = NULL;
-       }
 
-       mlog_exit(status);
        return status;
 }
 
@@ -1107,11 +2083,10 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
                               int namelen, u64 *blkno)
 {
        int ret;
-       struct buffer_head *bh = NULL;
-       struct ocfs2_dir_entry *dirent = NULL;
+       struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-       ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
-       brelse(bh);
+       ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
+       ocfs2_free_dir_lookup_result(&lookup);
 
        return ret;
 }
@@ -1128,20 +2103,18 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
                              int namelen)
 {
        int ret;
-       struct buffer_head *dirent_bh = NULL;
-       struct ocfs2_dir_entry *dirent = NULL;
+       struct ocfs2_dir_lookup_result lookup = { NULL, };
 
        mlog_entry("dir %llu, name '%.*s'\n",
                   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
        ret = -EEXIST;
-       dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
-       if (dirent_bh)
+       if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
                goto bail;
 
        ret = 0;
 bail:
-       brelse(dirent_bh);
+       ocfs2_free_dir_lookup_result(&lookup);
 
        mlog_exit(ret);
        return ret;
@@ -1151,6 +2124,7 @@ struct ocfs2_empty_dir_priv {
        unsigned seen_dot;
        unsigned seen_dot_dot;
        unsigned seen_other;
+       unsigned dx_dir;
 };
 static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
                                   loff_t pos, u64 ino, unsigned type)
@@ -1160,6 +2134,13 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
        /*
         * Check the positions of "." and ".." records to be sure
         * they're in the correct place.
+        *
+        * Indexed directories don't need to proceed past the first
+        * two entries, so we end the scan after seeing '..'. Despite
+        * that, we allow the scan to proceed in the event that we
+        * have a corrupted indexed directory (no dot or dot dot
+        * entries). This allows us to double check for existing
+        * entries which might not have been found in the index.
         */
        if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
                p->seen_dot = 1;
@@ -1169,16 +2150,57 @@ static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
        if (name_len == 2 && !strncmp("..", name, 2) &&
            pos == OCFS2_DIR_REC_LEN(1)) {
                p->seen_dot_dot = 1;
+
+               if (p->dx_dir && p->seen_dot)
+                       return 1;
+
                return 0;
        }
 
        p->seen_other = 1;
        return 1;
 }
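
The filldir callback above relies on the usual contract: returning nonzero stops the directory walk, which is how an indexed directory avoids reading every leaf once '.' and '..' have been accounted for. A generic sketch of that callback/iterator contract is below; the names are invented and the early stop is unconditional here, where the real callback only short-circuits for indexed directories:

	/* Sketch of the "return nonzero to stop" filldir-style contract. */
	#include <stdio.h>
	#include <string.h>

	struct scan_state {
		int seen_dot, seen_dot_dot, seen_other;
	};

	static int emptiness_filldir(void *priv, const char *name)
	{
		struct scan_state *s = priv;

		if (!strcmp(name, ".")) {
			s->seen_dot = 1;
			return 0;
		}
		if (!strcmp(name, "..")) {
			s->seen_dot_dot = 1;
			return 1;	/* nothing more needed: stop the walk */
		}
		s->seen_other = 1;
		return 1;		/* found a real entry: stop early */
	}

	static void dir_foreach(const char **names, int n, void *priv,
				int (*filldir)(void *, const char *))
	{
		for (int i = 0; i < n; i++)
			if (filldir(priv, names[i]))
				break;	/* callback asked us to stop */
	}

	int main(void)
	{
		const char *names[] = { ".", "..", "a", "b" };
		struct scan_state s = { 0, 0, 0 };

		dir_foreach(names, 4, &s, emptiness_filldir);
		printf("empty = %d\n", s.seen_dot && s.seen_dot_dot && !s.seen_other);
		return 0;
	}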
+
+static int ocfs2_empty_dir_dx(struct inode *inode,
+                             struct ocfs2_empty_dir_priv *priv)
+{
+       int ret;
+       struct buffer_head *di_bh = NULL;
+       struct buffer_head *dx_root_bh = NULL;
+       struct ocfs2_dinode *di;
+       struct ocfs2_dx_root_block *dx_root;
+
+       priv->dx_dir = 1;
+
+       ret = ocfs2_read_inode_block(inode, &di_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+       di = (struct ocfs2_dinode *)di_bh->b_data;
+
+       ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+       if (le32_to_cpu(dx_root->dr_num_entries) != 2)
+               priv->seen_other = 1;
+
+out:
+       brelse(di_bh);
+       brelse(dx_root_bh);
+       return ret;
+}
+
 /*
  * routine to check that the specified directory is empty (for rmdir)
  *
  * Returns 1 if dir is empty, zero otherwise.
+ *
+ * XXX: This is a performance problem for unindexed directories.
  */
 int ocfs2_empty_dir(struct inode *inode)
 {
@@ -1188,6 +2210,16 @@ int ocfs2_empty_dir(struct inode *inode)
 
        memset(&priv, 0, sizeof(priv));
 
+       if (ocfs2_dir_indexed(inode)) {
+               ret = ocfs2_empty_dir_dx(inode, &priv);
+               if (ret)
+                       mlog_errno(ret);
+               /*
+                * We still run ocfs2_dir_foreach to get the checks
+                * for "." and "..".
+                */
+       }
+
        ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
        if (ret)
                mlog_errno(ret);
@@ -1280,7 +2312,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
                                 struct inode *parent,
                                 struct inode *inode,
                                 struct buffer_head *fe_bh,
-                                struct ocfs2_alloc_context *data_ac)
+                                struct ocfs2_alloc_context *data_ac,
+                                struct buffer_head **ret_new_bh)
 {
        int status;
        unsigned int size = osb->sb->s_blocksize;
@@ -1289,7 +2322,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
        mlog_entry_void();
 
-       if (ocfs2_supports_dir_trailer(osb))
+       if (ocfs2_new_dir_wants_trailer(inode))
                size = ocfs2_dir_trailer_blk_off(parent->i_sb);
 
        status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
@@ -1310,8 +2343,19 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
        memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
        de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
-       if (ocfs2_supports_dir_trailer(osb))
-               ocfs2_init_dir_trailer(inode, new_bh);
+       if (ocfs2_new_dir_wants_trailer(inode)) {
+               int size = le16_to_cpu(de->rec_len);
+
+               /*
+                * Figure out the size of the hole left over after
+                * insertion of '.' and '..'. The trailer wants this
+                * information.
+                */
+               size -= OCFS2_DIR_REC_LEN(2);
+               size -= sizeof(struct ocfs2_dir_block_trailer);
+
+               ocfs2_init_dir_trailer(inode, new_bh, size);
+       }
 
        status = ocfs2_journal_dirty(handle, new_bh);
        if (status < 0) {
@@ -1329,6 +2373,10 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
        }
 
        status = 0;
+       if (ret_new_bh) {
+               *ret_new_bh = new_bh;
+               new_bh = NULL;
+       }
 bail:
        brelse(new_bh);
 
@@ -1336,646 +2384,1981 @@ bail:
        return status;
 }
 
-int ocfs2_fill_new_dir(struct ocfs2_super *osb,
-                      handle_t *handle,
-                      struct inode *parent,
-                      struct inode *inode,
-                      struct buffer_head *fe_bh,
-                      struct ocfs2_alloc_context *data_ac)
-{
-       BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
-
-       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-               return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
-
-       return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
-                                    data_ac);
-}
-
-/*
- * Expand rec_len of the rightmost dirent in a directory block so that it
- * contains the end of our valid space for dirents. We do this during
- * expansion from an inline directory to one with extents. The first dir block
- * in that case is taken from the inline data portion of the inode block.
- *
- * We add the dir trailer if this filesystem wants it.
- */
-static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-                                    struct super_block *sb)
+static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
+                                    handle_t *handle, struct inode *dir,
+                                    struct buffer_head *di_bh,
+                                    struct buffer_head *dirdata_bh,
+                                    struct ocfs2_alloc_context *meta_ac,
+                                    int dx_inline, u32 num_entries,
+                                    struct buffer_head **ret_dx_root_bh)
 {
-       struct ocfs2_dir_entry *de;
-       struct ocfs2_dir_entry *prev_de;
-       char *de_buf, *limit;
-       unsigned int new_size = sb->s_blocksize;
-       unsigned int bytes;
-
-       if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
-               new_size = ocfs2_dir_trailer_blk_off(sb);
+       int ret;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+       u16 dr_suballoc_bit;
+       u64 dr_blkno;
+       unsigned int num_bits;
+       struct buffer_head *dx_root_bh = NULL;
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dir_block_trailer *trailer =
+               ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
 
-       bytes = new_size - old_size;
+       ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, &dr_suballoc_bit,
+                                  &num_bits, &dr_blkno);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
 
-       limit = start + old_size;
-       de_buf = start;
-       de = (struct ocfs2_dir_entry *)de_buf;
-       do {
-               prev_de = de;
-               de_buf += le16_to_cpu(de->rec_len);
-               de = (struct ocfs2_dir_entry *)de_buf;
-       } while (de_buf < limit);
+       mlog(0, "Dir %llu, attach new index block: %llu\n",
+            (unsigned long long)OCFS2_I(dir)->ip_blkno,
+            (unsigned long long)dr_blkno);
 
-       le16_add_cpu(&prev_de->rec_len, bytes);
-}
+       dx_root_bh = sb_getblk(osb->sb, dr_blkno);
+       if (dx_root_bh == NULL) {
+               ret = -EIO;
+               goto out;
+       }
+       ocfs2_set_new_buffer_uptodate(dir, dx_root_bh);
 
-/*
- * We allocate enough clusters to fulfill "blocks_wanted", but set
- * i_size to exactly one block. Ocfs2_extend_dir() will handle the
- * rest automatically for us.
- *
- * *first_block_bh is a pointer to the 1st data block allocated to the
- *  directory.
- */
-static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
-                                  unsigned int blocks_wanted,
-                                  struct buffer_head **first_block_bh)
-{
-       u32 alloc, bit_off, len;
-       struct super_block *sb = dir->i_sb;
-       int ret, credits = ocfs2_inline_to_extents_credits(sb);
-       u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
-       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
-       struct ocfs2_inode_info *oi = OCFS2_I(dir);
-       struct ocfs2_alloc_context *data_ac;
-       struct buffer_head *dirdata_bh = NULL;
-       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-       handle_t *handle;
-       struct ocfs2_extent_tree et;
-       int did_quota = 0;
+       ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                     OCFS2_JOURNAL_ACCESS_CREATE);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
 
-       ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+       memset(dx_root, 0, osb->sb->s_blocksize);
+       strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
+       dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
+       dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
+       dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
+       dx_root->dr_blkno = cpu_to_le64(dr_blkno);
+       dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
+       dx_root->dr_num_entries = cpu_to_le32(num_entries);
+       if (le16_to_cpu(trailer->db_free_rec_len))
+               dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
+       else
+               dx_root->dr_free_blk = cpu_to_le64(0);
 
-       alloc = ocfs2_clusters_for_bytes(sb, bytes);
+       if (dx_inline) {
+               dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
+               dx_root->dr_entries.de_count =
+                       cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
+       } else {
+               dx_root->dr_list.l_count =
+                       cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+       }
 
-       /*
-        * We should never need more than 2 clusters for this -
-        * maximum dirent size is far less than one block. In fact,
-        * the only time we'd need more than one cluster is if
-        * blocksize == clustersize and the dirent won't fit in the
-        * extra space that the expansion to a single block gives. As
-        * of today, that only happens on 4k/4k file systems.
-        */
-       BUG_ON(alloc > 2);
+       ret = ocfs2_journal_dirty(handle, dx_root_bh);
+       if (ret)
+               mlog_errno(ret);
 
-       ret = ocfs2_reserve_clusters(osb, alloc, &data_ac);
+       ret = ocfs2_journal_access_di(handle, dir, di_bh,
+                                     OCFS2_JOURNAL_ACCESS_CREATE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }
 
-       down_write(&oi->ip_alloc_sem);
+       di->i_dx_root = cpu_to_le64(dr_blkno);
 
-       /*
-        * Prepare for worst case allocation scenario of two separate
-        * extents.
-        */
-       if (alloc == 2)
-               credits += OCFS2_SUBALLOC_ALLOC;
+       OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
+       di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
 
-       handle = ocfs2_start_trans(osb, credits);
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
+       ret = ocfs2_journal_dirty(handle, di_bh);
+       if (ret)
                mlog_errno(ret);
-               goto out_sem;
-       }
 
-       if (vfs_dq_alloc_space_nodirty(dir,
-                               ocfs2_clusters_to_bytes(osb->sb, alloc))) {
-               ret = -EDQUOT;
-               goto out_commit;
+       *ret_dx_root_bh = dx_root_bh;
+       dx_root_bh = NULL;
+
+out:
+       brelse(dx_root_bh);
+       return ret;
+}
+
+static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
+                                      handle_t *handle, struct inode *dir,
+                                      struct buffer_head **dx_leaves,
+                                      int num_dx_leaves, u64 start_blk)
+{
+       int ret, i;
+       struct ocfs2_dx_leaf *dx_leaf;
+       struct buffer_head *bh;
+
+       for (i = 0; i < num_dx_leaves; i++) {
+               bh = sb_getblk(osb->sb, start_blk + i);
+               if (bh == NULL) {
+                       ret = -EIO;
+                       goto out;
+               }
+               dx_leaves[i] = bh;
+
+               ocfs2_set_new_buffer_uptodate(dir, bh);
+
+               ret = ocfs2_journal_access_dl(handle, dir, bh,
+                                             OCFS2_JOURNAL_ACCESS_CREATE);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
+
+               memset(dx_leaf, 0, osb->sb->s_blocksize);
+               strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
+               dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
+               dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
+               dx_leaf->dl_list.de_count =
+                       cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
+
+               mlog(0,
+                    "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
+                    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                    (unsigned long long)bh->b_blocknr,
+                    le16_to_cpu(dx_leaf->dl_list.de_count));
+
+               ocfs2_journal_dirty(handle, bh);
        }
-       did_quota = 1;
+
+       ret = 0;
+out:
+       return ret;
+}
+
+/*
+ * Allocates and formats a new cluster for use in an indexed dir
+ * leaf. This version will not do the extent insert, so that it can be
+ * used by operations which need careful ordering.
+ */
+static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
+                                     u32 cpos, handle_t *handle,
+                                     struct ocfs2_alloc_context *data_ac,
+                                     struct buffer_head **dx_leaves,
+                                     int num_dx_leaves, u64 *ret_phys_blkno)
+{
+       int ret;
+       u32 phys, num;
+       u64 phys_blkno;
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
        /*
-        * Try to claim as many clusters as the bitmap can give though
-        * if we only get one now, that's enough to continue. The rest
-        * will be claimed after the conversion to extents.
+        * XXX: For create, this should claim a cluster for the index
+        * *before* the unindexed insert so that we have a better
+        * chance of contiguousness as the directory grows in number
+        * of entries.
         */
-       ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+       ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 1, &phys, &num);
        if (ret) {
                mlog_errno(ret);
-               goto out_commit;
+               goto out;
        }
 
        /*
-        * Operations are carefully ordered so that we set up the new
-        * data block first. The conversion from inline data to
-        * extents follows.
+        * Format the new cluster first. That way, we're inserting
+        * valid data.
         */
-       blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
-       dirdata_bh = sb_getblk(sb, blkno);
-       if (!dirdata_bh) {
-               ret = -EIO;
-               mlog_errno(ret);
-               goto out_commit;
-       }
-
-       ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
-
-       ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
-                                     OCFS2_JOURNAL_ACCESS_CREATE);
+       phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
+       ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
+                                         num_dx_leaves, phys_blkno);
        if (ret) {
                mlog_errno(ret);
-               goto out_commit;
+               goto out;
        }
 
-       memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
-       memset(dirdata_bh->b_data + i_size_read(dir), 0,
-              sb->s_blocksize - i_size_read(dir));
-       ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
-       if (ocfs2_supports_dir_trailer(osb))
-               ocfs2_init_dir_trailer(dir, dirdata_bh);
+       *ret_phys_blkno = phys_blkno;
+out:
+       return ret;
+}
 
-       ret = ocfs2_journal_dirty(handle, dirdata_bh);
+static int ocfs2_dx_dir_new_cluster(struct inode *dir,
+                                   struct ocfs2_extent_tree *et,
+                                   u32 cpos, handle_t *handle,
+                                   struct ocfs2_alloc_context *data_ac,
+                                   struct ocfs2_alloc_context *meta_ac,
+                                   struct buffer_head **dx_leaves,
+                                   int num_dx_leaves)
+{
+       int ret;
+       u64 phys_blkno;
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+       ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
+                                        num_dx_leaves, &phys_blkno);
        if (ret) {
                mlog_errno(ret);
-               goto out_commit;
+               goto out;
        }
 
-       /*
-        * Set extent, i_size, etc on the directory. After this, the
-        * inode should contain the same exact dirents as before and
-        * be fully accessible from system calls.
-        *
-        * We let the later dirent insert modify c/mtime - to the user
-        * the data hasn't changed.
-        */
-       ret = ocfs2_journal_access_di(handle, dir, di_bh,
-                                     OCFS2_JOURNAL_ACCESS_CREATE);
-       if (ret) {
+       ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0,
+                                 meta_ac);
+       if (ret)
                mlog_errno(ret);
-               goto out_commit;
-       }
+out:
+       return ret;
+}
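
The split between __ocfs2_dx_dir_new_cluster() and ocfs2_dx_dir_new_cluster() encodes an ordering rule: new index blocks are fully formatted before the extent insert makes them reachable, so nothing ever points at uninitialized leaves. A small standalone sketch of the same initialize-then-publish pattern, with made-up types and no journaling:

	/* Sketch of the ordering rule: fully initialize new storage before
	 * linking it into a visible structure. Userspace illustration only. */
	#include <stdlib.h>
	#include <string.h>
	#include <stdio.h>

	struct leaf {
		char signature[8];
		int nr_entries;
	};

	struct index {
		struct leaf *leaves[8];
		int nr_leaves;
	};

	/* Phase 1: allocate and format -- nothing points at the leaf yet. */
	static struct leaf *new_formatted_leaf(void)
	{
		struct leaf *l = calloc(1, sizeof(*l));

		if (!l)
			return NULL;
		memcpy(l->signature, "DXLEAF", 7);
		l->nr_entries = 0;
		return l;
	}

	/* Phase 2: publish -- only valid, formatted data becomes reachable. */
	static int index_attach_leaf(struct index *idx, struct leaf *l)
	{
		if (idx->nr_leaves >= 8)
			return -1;
		idx->leaves[idx->nr_leaves++] = l;
		return 0;
	}

	int main(void)
	{
		struct index idx = { { NULL }, 0 };
		struct leaf *l = new_formatted_leaf();

		if (!l || index_attach_leaf(&idx, l))
			return 1;
		printf("%s attached, %d leaves\n", idx.leaves[0]->signature,
		       idx.nr_leaves);
		free(l);
		return 0;
	}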
 
-       spin_lock(&oi->ip_lock);
-       oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
-       di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
-       spin_unlock(&oi->ip_lock);
+static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
+                                                       int *ret_num_leaves)
+{
+       int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
+       struct buffer_head **dx_leaves;
 
-       ocfs2_dinode_new_extent_list(dir, di);
+       dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
+                           GFP_NOFS);
+       if (dx_leaves && ret_num_leaves)
+               *ret_num_leaves = num_dx_leaves;
 
-       i_size_write(dir, sb->s_blocksize);
-       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+       return dx_leaves;
+}
 
-       di->i_size = cpu_to_le64(sb->s_blocksize);
-       di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
-       di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
+                                handle_t *handle,
+                                struct inode *parent,
+                                struct inode *inode,
+                                struct buffer_head *di_bh,
+                                struct ocfs2_alloc_context *data_ac,
+                                struct ocfs2_alloc_context *meta_ac)
+{
+       int ret;
+       struct buffer_head *leaf_bh = NULL;
+       struct buffer_head *dx_root_bh = NULL;
+       struct ocfs2_dx_hinfo hinfo;
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dx_entry_list *entry_list;
 
        /*
-        * This should never fail as our extent list is empty and all
-        * related blocks have been journaled already.
+        * Our strategy is to create the directory as though it were
+        * unindexed, then add the index block. This works with very
+        * little complication since the state of a new directory is a
+        * very well known quantity.
+        *
+        * Essentially, we have two dirents ("." and ".."), in the 1st
+        * block which need indexing. These are easily inserted into
+        * the index block.
         */
-       ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
-                                 0, NULL);
+
+       ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
+                                   data_ac, &leaf_bh);
        if (ret) {
                mlog_errno(ret);
-               goto out_commit;
+               goto out;
        }
 
-       /*
-        * Set i_blocks after the extent insert for the most up to
-        * date ip_clusters value.
-        */
-       dir->i_blocks = ocfs2_inode_sector_count(dir);
-
-       ret = ocfs2_journal_dirty(handle, di_bh);
+       ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
+                                       meta_ac, 1, 2, &dx_root_bh);
        if (ret) {
                mlog_errno(ret);
-               goto out_commit;
+               goto out;
        }
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+       entry_list = &dx_root->dr_entries;
 
-       /*
-        * We asked for two clusters, but only got one in the 1st
-        * pass. Claim the 2nd cluster as a separate extent.
-        */
-       if (alloc > len) {
-               ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
-                                          &len);
-               if (ret) {
-                       mlog_errno(ret);
-                       goto out_commit;
-               }
-               blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+       /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
+       ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
+       ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
 
-               ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
-                                         blkno, len, 0, NULL);
-               if (ret) {
-                       mlog_errno(ret);
-                       goto out_commit;
-               }
-       }
+       ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
+       ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
 
-       *first_block_bh = dirdata_bh;
-       dirdata_bh = NULL;
+out:
+       brelse(dx_root_bh);
+       brelse(leaf_bh);
+       return ret;
+}
 
-out_commit:
-       if (ret < 0 && did_quota)
-               vfs_dq_free_space_nodirty(dir,
-                       ocfs2_clusters_to_bytes(osb->sb, 2));
-       ocfs2_commit_trans(osb, handle);
+int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+                      handle_t *handle,
+                      struct inode *parent,
+                      struct inode *inode,
+                      struct buffer_head *fe_bh,
+                      struct ocfs2_alloc_context *data_ac,
+                      struct ocfs2_alloc_context *meta_ac)
 
-out_sem:
-       up_write(&oi->ip_alloc_sem);
+{
+       BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
 
-out:
-       if (data_ac)
-               ocfs2_free_alloc_context(data_ac);
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+               return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
 
-       brelse(dirdata_bh);
+       if (ocfs2_supports_indexed_dirs(osb))
+               return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
+                                            data_ac, meta_ac);
 
-       return ret;
+       return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
+                                    data_ac, NULL);
 }
 
-/* returns a bh of the 1st new block in the allocation. */
-static int ocfs2_do_extend_dir(struct super_block *sb,
-                              handle_t *handle,
-                              struct inode *dir,
-                              struct buffer_head *parent_fe_bh,
-                              struct ocfs2_alloc_context *data_ac,
-                              struct ocfs2_alloc_context *meta_ac,
-                              struct buffer_head **new_bh)
+static int ocfs2_dx_dir_index_block(struct inode *dir,
+                                   handle_t *handle,
+                                   struct buffer_head **dx_leaves,
+                                   int num_dx_leaves,
+                                   u32 *num_dx_entries,
+                                   struct buffer_head *dirent_bh)
 {
-       int status;
-       int extend, did_quota = 0;
-       u64 p_blkno, v_blkno;
+       int ret, namelen, i;
+       char *de_buf, *limit;
+       struct ocfs2_dir_entry *de;
+       struct buffer_head *dx_leaf_bh;
+       struct ocfs2_dx_hinfo hinfo;
+       u64 dirent_blk = dirent_bh->b_blocknr;
 
-       spin_lock(&OCFS2_I(dir)->ip_lock);
-       extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
-       spin_unlock(&OCFS2_I(dir)->ip_lock);
+       de_buf = dirent_bh->b_data;
+       limit = de_buf + dir->i_sb->s_blocksize;
 
-       if (extend) {
-               u32 offset = OCFS2_I(dir)->ip_clusters;
+       while (de_buf < limit) {
+               de = (struct ocfs2_dir_entry *)de_buf;
 
-               if (vfs_dq_alloc_space_nodirty(dir,
-                                       ocfs2_clusters_to_bytes(sb, 1))) {
-                       status = -EDQUOT;
-                       goto bail;
-               }
-               did_quota = 1;
+               namelen = de->name_len;
+               if (!namelen || !de->inode)
+                       goto inc;
 
-               status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
-                                             1, 0, parent_fe_bh, handle,
-                                             data_ac, meta_ac, NULL);
-               BUG_ON(status == -EAGAIN);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
+               ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
+
+               i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
+               dx_leaf_bh = dx_leaves[i];
+
+               ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
+                                                dirent_blk, dx_leaf_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
                }
-       }
 
-       v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
-       status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
-       }
+               *num_dx_entries = *num_dx_entries + 1;
 
-       *new_bh = sb_getblk(sb, p_blkno);
-       if (!*new_bh) {
-               status = -EIO;
-               mlog_errno(status);
-               goto bail;
+inc:
+               de_buf += le16_to_cpu(de->rec_len);
        }
-       status = 0;
-bail:
-       if (did_quota && status < 0)
-               vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
-       mlog_exit(status);
-       return status;
+
+out:
+       return ret;
 }
 
 /*
- * Assumes you already have a cluster lock on the directory.
- *
- * 'blocks_wanted' is only used if we have an inline directory which
- * is to be turned into an extent based one. The size of the dirent to
- * insert might be larger than the space gained by growing to just one
- * block, so we may have to grow the inode by two blocks in that case.
+ * XXX: This expects dx_root_bh to already be part of the transaction.
  */
-static int ocfs2_extend_dir(struct ocfs2_super *osb,
-                           struct inode *dir,
-                           struct buffer_head *parent_fe_bh,
-                           unsigned int blocks_wanted,
-                           struct buffer_head **new_de_bh)
+static void ocfs2_dx_dir_index_root_block(struct inode *dir,
+                                        struct buffer_head *dx_root_bh,
+                                        struct buffer_head *dirent_bh)
 {
-       int status = 0;
-       int credits, num_free_extents, drop_alloc_sem = 0;
-       loff_t dir_i_size;
-       struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-       struct ocfs2_extent_list *el = &fe->id2.i_list;
-       struct ocfs2_alloc_context *data_ac = NULL;
-       struct ocfs2_alloc_context *meta_ac = NULL;
-       handle_t *handle = NULL;
-       struct buffer_head *new_bh = NULL;
-       struct ocfs2_dir_entry * de;
-       struct super_block *sb = osb->sb;
-       struct ocfs2_extent_tree et;
+       char *de_buf, *limit;
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dir_entry *de;
+       struct ocfs2_dx_hinfo hinfo;
+       u64 dirent_blk = dirent_bh->b_blocknr;
 
-       mlog_entry_void();
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
 
-       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-               status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
-                                                blocks_wanted, &new_bh);
-               if (status) {
-                       mlog_errno(status);
-                       goto bail;
+       de_buf = dirent_bh->b_data;
+       limit = de_buf + dir->i_sb->s_blocksize;
+
+       while (de_buf < limit) {
+               de = (struct ocfs2_dir_entry *)de_buf;
+
+               if (!de->name_len || !de->inode)
+                       goto inc;
+
+               ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
+
+               mlog(0,
+                    "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
+                    (unsigned long long)dir->i_ino, hinfo.major_hash,
+                    hinfo.minor_hash,
+                    le16_to_cpu(dx_root->dr_entries.de_num_used),
+                    de->name_len, de->name);
+
+               ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
+                                          dirent_blk);
+
+               le32_add_cpu(&dx_root->dr_num_entries, 1);
+inc:
+               de_buf += le16_to_cpu(de->rec_len);
+       }
+}
+
+/*
+ * Count the number of inline directory entries in di_bh and compare
+ * them against the number of entries we can hold in an inline dx root
+ * block.
+ */
+static int ocfs2_new_dx_should_be_inline(struct inode *dir,
+                                        struct buffer_head *di_bh)
+{
+       int dirent_count = 0;
+       char *de_buf, *limit;
+       struct ocfs2_dir_entry *de;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+       de_buf = di->id2.i_data.id_data;
+       limit = de_buf + i_size_read(dir);
+
+       while (de_buf < limit) {
+               de = (struct ocfs2_dir_entry *)de_buf;
+
+               if (de->name_len && de->inode)
+                       dirent_count++;
+
+               de_buf += le16_to_cpu(de->rec_len);
+       }
+
+       /* We are careful to leave room for one extra record. */
+       return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
+}
+
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * This will also return the largest amount of contiguous space for a dirent
+ * in the block. That space is *not* necessarily in the last dirent, even after
+ * expansion. The directory indexing code wants this value for free space
+ * accounting. We do this here since we're already walking the entire dir
+ * block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
+static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
+                                            struct inode *dir)
+{
+       struct super_block *sb = dir->i_sb;
+       struct ocfs2_dir_entry *de;
+       struct ocfs2_dir_entry *prev_de;
+       char *de_buf, *limit;
+       unsigned int new_size = sb->s_blocksize;
+       unsigned int bytes, this_hole;
+       unsigned int largest_hole = 0;
+
+       if (ocfs2_new_dir_wants_trailer(dir))
+               new_size = ocfs2_dir_trailer_blk_off(sb);
+
+       bytes = new_size - old_size;
+
+       limit = start + old_size;
+       de_buf = start;
+       de = (struct ocfs2_dir_entry *)de_buf;
+       do {
+               this_hole = ocfs2_figure_dirent_hole(de);
+               if (this_hole > largest_hole)
+                       largest_hole = this_hole;
+
+               prev_de = de;
+               de_buf += le16_to_cpu(de->rec_len);
+               de = (struct ocfs2_dir_entry *)de_buf;
+       } while (de_buf < limit);
+
+       le16_add_cpu(&prev_de->rec_len, bytes);
+
+       /* We need to double check this after modification of the final
+        * dirent. */
+       this_hole = ocfs2_figure_dirent_hole(prev_de);
+       if (this_hole > largest_hole)
+               largest_hole = this_hole;
+
+       if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
+               return largest_hole;
+       return 0;
+}
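
ocfs2_expand_last_dirent() now does double duty: it hands the block's new tail space to the final dirent and, in the same pass, reports the largest hole so the index free-space accounting can be seeded without a second walk. A standalone sketch of that single pass over a simplified record layout follows; the struct and constant are illustrative, not the on-disk dirent format:

	/* Single-pass walk that extends the last record and reports the
	 * largest usable hole. Simplified layout for illustration. */
	#include <stdio.h>

	struct rec {
		unsigned short rec_len;		/* space this record owns */
		unsigned short name_len;	/* space it actually needs */
	};

	#define REC_MIN_LEN 16

	static unsigned expand_last_rec(struct rec *recs, int nr, unsigned grow_by)
	{
		unsigned largest = 0, hole;
		int i;

		for (i = 0; i < nr; i++) {
			hole = recs[i].rec_len - recs[i].name_len;
			if (hole > largest)
				largest = hole;
		}

		/* Hand the new space at the end of the block to the final
		 * record, then re-check it: it may now hold the largest hole. */
		recs[nr - 1].rec_len += grow_by;
		hole = recs[nr - 1].rec_len - recs[nr - 1].name_len;
		if (hole > largest)
			largest = hole;

		return largest >= REC_MIN_LEN ? largest : 0;
	}

	int main(void)
	{
		struct rec recs[] = { { 24, 20 }, { 32, 12 }, { 40, 28 } };

		printf("largest hole: %u\n", expand_last_rec(recs, 3, 100));
		return 0;
	}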
+
+/*
+ * We allocate enough clusters to fulfill "blocks_wanted", but set
+ * i_size to exactly one block. Ocfs2_extend_dir() will handle the
+ * rest automatically for us.
+ *
+ * *first_block_bh is a pointer to the 1st data block allocated to the
+ *  directory.
+ */
+static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
+                                  unsigned int blocks_wanted,
+                                  struct ocfs2_dir_lookup_result *lookup,
+                                  struct buffer_head **first_block_bh)
+{
+       u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
+       struct super_block *sb = dir->i_sb;
+       int ret, i, num_dx_leaves = 0, dx_inline = 0,
+               credits = ocfs2_inline_to_extents_credits(sb);
+       u64 dx_insert_blkno, blkno,
+               bytes = blocks_wanted << sb->s_blocksize_bits;
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+       struct ocfs2_inode_info *oi = OCFS2_I(dir);
+       struct ocfs2_alloc_context *data_ac;
+       struct ocfs2_alloc_context *meta_ac = NULL;
+       struct buffer_head *dirdata_bh = NULL;
+       struct buffer_head *dx_root_bh = NULL;
+       struct buffer_head **dx_leaves = NULL;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       handle_t *handle;
+       struct ocfs2_extent_tree et;
+       struct ocfs2_extent_tree dx_et;
+       int did_quota = 0, bytes_allocated = 0;
+
+       ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
+
+       alloc = ocfs2_clusters_for_bytes(sb, bytes);
+       dx_alloc = 0;
+
+       if (ocfs2_supports_indexed_dirs(osb)) {
+               credits += ocfs2_add_dir_index_credits(sb);
+
+               dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
+               if (!dx_inline) {
+                       /* Add one more cluster for an index leaf */
+                       dx_alloc++;
+                       dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
+                                                               &num_dx_leaves);
+                       if (!dx_leaves) {
+                               ret = -ENOMEM;
+                               mlog_errno(ret);
+                               goto out;
+                       }
+               }
+
+               /* This gets us the dx_root */
+               ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+       }
+
+       /*
+        * We should never need more than 2 clusters for the unindexed
+        * tree - maximum dirent size is far less than one block. In
+        * fact, the only time we'd need more than one cluster is if
+        * blocksize == clustersize and the dirent won't fit in the
+        * extra space that the expansion to a single block gives. As
+        * of today, that only happens on 4k/4k file systems.
+        */
+       BUG_ON(alloc > 2);
+
+       ret = ocfs2_reserve_clusters(osb, alloc, &data_ac);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       down_write(&oi->ip_alloc_sem);
+
+       /*
+        * Prepare for worst case allocation scenario of two separate
+        * extents in the unindexed tree.
+        */
+       if (alloc == 2)
+               credits += OCFS2_SUBALLOC_ALLOC;
+
+       handle = ocfs2_start_trans(osb, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               goto out_sem;
+       }
+
+       if (vfs_dq_alloc_space_nodirty(dir,
+                               ocfs2_clusters_to_bytes(osb->sb,
+                                                       alloc + dx_alloc))) {
+               ret = -EDQUOT;
+               goto out_commit;
+       }
+       did_quota = 1;
+
+       if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+               /*
+                * Allocate our index cluster first, to maximize the
+                * possibility that unindexed leaves grow
+                * contiguously.
+                */
+               ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
+                                                dx_leaves, num_dx_leaves,
+                                                &dx_insert_blkno);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_commit;
+               }
+               bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+       }
+
+       /*
+        * Try to claim as many clusters as the bitmap can give though
+        * if we only get one now, that's enough to continue. The rest
+        * will be claimed after the conversion to extents.
+        */
+       ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+       bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+
+       /*
+        * Operations are carefully ordered so that we set up the new
+        * data block first. The conversion from inline data to
+        * extents follows.
+        */
+       blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+       dirdata_bh = sb_getblk(sb, blkno);
+       if (!dirdata_bh) {
+               ret = -EIO;
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
+
+       ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+                                     OCFS2_JOURNAL_ACCESS_CREATE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
+       memset(dirdata_bh->b_data + i_size_read(dir), 0,
+              sb->s_blocksize - i_size_read(dir));
+       i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
+       if (ocfs2_new_dir_wants_trailer(dir)) {
+               /*
+                * Prepare the dir trailer up front. It will otherwise look
+                * like a valid dirent. Even if inserting the index fails
+                * (unlikely), all we'll have done is give the first dir
+                * block a small amount of fragmentation.
+                */
+               ocfs2_init_dir_trailer(dir, dirdata_bh, i);
+       }
+
+       ret = ocfs2_journal_dirty(handle, dirdata_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
+               /*
+                * Dx dirs with an external cluster need to do this up
+                * front. Inline dx roots get handled later, after
+                * we've allocated our root block. We get passed back
+                * a total number of items so that dr_num_entries can
+                * be correctly set once the dx_root has been
+                * allocated.
+                */
+               ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
+                                              num_dx_leaves, &num_dx_entries,
+                                              dirdata_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_commit;
+               }
+       }
+
+       /*
+        * Set extent, i_size, etc on the directory. After this, the
+        * inode should contain the same exact dirents as before and
+        * be fully accessible from system calls.
+        *
+        * We let the later dirent insert modify c/mtime - to the user
+        * the data hasn't changed.
+        */
+       ret = ocfs2_journal_access_di(handle, dir, di_bh,
+                                     OCFS2_JOURNAL_ACCESS_CREATE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       spin_lock(&oi->ip_lock);
+       oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
+       di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+       spin_unlock(&oi->ip_lock);
+
+       ocfs2_dinode_new_extent_list(dir, di);
+
+       i_size_write(dir, sb->s_blocksize);
+       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+
+       di->i_size = cpu_to_le64(sb->s_blocksize);
+       di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
+       di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+
+       /*
+        * This should never fail as our extent list is empty and all
+        * related blocks have been journaled already.
+        */
+       ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
+                                 0, NULL);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       /*
+        * Set i_blocks after the extent insert for the most up to
+        * date ip_clusters value.
+        */
+       dir->i_blocks = ocfs2_inode_sector_count(dir);
+
+       ret = ocfs2_journal_dirty(handle, di_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       if (ocfs2_supports_indexed_dirs(osb)) {
+               ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
+                                               dirdata_bh, meta_ac, dx_inline,
+                                               num_dx_entries, &dx_root_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_commit;
+               }
+
+               if (dx_inline) {
+                       ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
+                                                     dirdata_bh);
+               } else {
+                       ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh);
+                       ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0,
+                                                 dx_insert_blkno, 1, 0, NULL);
+                       if (ret)
+                               mlog_errno(ret);
+               }
+       }
+
+       /*
+        * We asked for two clusters, but only got one in the 1st
+        * pass. Claim the 2nd cluster as a separate extent.
+        */
+       if (alloc > len) {
+               ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+                                          &len);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_commit;
+               }
+               blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+
+               ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
+                                         blkno, len, 0, NULL);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_commit;
+               }
+               bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
+       }
+
+       *first_block_bh = dirdata_bh;
+       dirdata_bh = NULL;
+       if (ocfs2_supports_indexed_dirs(osb)) {
+               unsigned int off;
+
+               if (!dx_inline) {
+                       /*
+                        * We need to return the correct block within the
+                        * cluster which should hold our entry.
+                        */
+                       off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
+                                                   &lookup->dl_hinfo);
+                       get_bh(dx_leaves[off]);
+                       lookup->dl_dx_leaf_bh = dx_leaves[off];
+               }
+               lookup->dl_dx_root_bh = dx_root_bh;
+               dx_root_bh = NULL;
+       }
+
+out_commit:
+       if (ret < 0 && did_quota)
+               vfs_dq_free_space_nodirty(dir, bytes_allocated);
+
+       ocfs2_commit_trans(osb, handle);
+
+out_sem:
+       up_write(&oi->ip_alloc_sem);
+
+out:
+       if (data_ac)
+               ocfs2_free_alloc_context(data_ac);
+       if (meta_ac)
+               ocfs2_free_alloc_context(meta_ac);
+
+       if (dx_leaves) {
+               for (i = 0; i < num_dx_leaves; i++)
+                       brelse(dx_leaves[i]);
+               kfree(dx_leaves);
+       }
+
+       brelse(dirdata_bh);
+       brelse(dx_root_bh);
+
+       return ret;
+}
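
The unwind labels above (out_commit, out_sem, out) release resources in the reverse of the order they were taken: undo the quota reservation and commit the transaction, drop ip_alloc_sem, then free the allocation contexts and buffer heads. For readers new to the idiom, a minimal stand-alone sketch of the same layered-goto cleanup pattern follows; it uses malloc()/free() placeholders and invents no ocfs2 interfaces.

#include <stdlib.h>

/*
 * Layered goto-based unwinding, as in the function above: each label
 * undoes only what was set up before the jump to it, in reverse order
 * of acquisition. malloc()/free() stand in for alloc contexts and
 * buffer heads; none of this is an ocfs2 API.
 */
static int demo_do_work(void)
{
        int ret;
        char *ctx, *buf;

        ctx = malloc(32);               /* "reserve an alloc context" */
        if (!ctx) {
                ret = -1;
                goto out;
        }

        buf = malloc(64);               /* "read/allocate a buffer" */
        if (!buf) {
                ret = -1;
                goto out_free_ctx;
        }

        ret = 0;                        /* the actual work would go here */

        free(buf);
out_free_ctx:
        free(ctx);
out:
        return ret;
}

int main(void)
{
        return demo_do_work() ? EXIT_FAILURE : EXIT_SUCCESS;
}
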
+
+/* returns a bh of the 1st new block in the allocation. */
+static int ocfs2_do_extend_dir(struct super_block *sb,
+                              handle_t *handle,
+                              struct inode *dir,
+                              struct buffer_head *parent_fe_bh,
+                              struct ocfs2_alloc_context *data_ac,
+                              struct ocfs2_alloc_context *meta_ac,
+                              struct buffer_head **new_bh)
+{
+       int status;
+       int extend, did_quota = 0;
+       u64 p_blkno, v_blkno;
+
+       spin_lock(&OCFS2_I(dir)->ip_lock);
+       extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
+       spin_unlock(&OCFS2_I(dir)->ip_lock);
+
+       if (extend) {
+               u32 offset = OCFS2_I(dir)->ip_clusters;
+
+               if (vfs_dq_alloc_space_nodirty(dir,
+                                       ocfs2_clusters_to_bytes(sb, 1))) {
+                       status = -EDQUOT;
+                       goto bail;
+               }
+               did_quota = 1;
+
+               status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
+                                             1, 0, parent_fe_bh, handle,
+                                             data_ac, meta_ac, NULL);
+               BUG_ON(status == -EAGAIN);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+       v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
+       status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       *new_bh = sb_getblk(sb, p_blkno);
+       if (!*new_bh) {
+               status = -EIO;
+               mlog_errno(status);
+               goto bail;
+       }
+       status = 0;
+bail:
+       if (did_quota && status < 0)
+               vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * Assumes you already have a cluster lock on the directory.
+ *
+ * 'blocks_wanted' is only used if we have an inline directory which
+ * is to be turned into an extent based one. The size of the dirent to
+ * insert might be larger than the space gained by growing to just one
+ * block, so we may have to grow the inode by two blocks in that case.
+ *
+ * If the directory is already indexed, dx_root_bh must be provided.
+ */
+static int ocfs2_extend_dir(struct ocfs2_super *osb,
+                           struct inode *dir,
+                           struct buffer_head *parent_fe_bh,
+                           unsigned int blocks_wanted,
+                           struct ocfs2_dir_lookup_result *lookup,
+                           struct buffer_head **new_de_bh)
+{
+       int status = 0;
+       int credits, num_free_extents, drop_alloc_sem = 0;
+       loff_t dir_i_size;
+       struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+       struct ocfs2_extent_list *el = &fe->id2.i_list;
+       struct ocfs2_alloc_context *data_ac = NULL;
+       struct ocfs2_alloc_context *meta_ac = NULL;
+       handle_t *handle = NULL;
+       struct buffer_head *new_bh = NULL;
+       struct ocfs2_dir_entry * de;
+       struct super_block *sb = osb->sb;
+       struct ocfs2_extent_tree et;
+       struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
+
+       mlog_entry_void();
+
+       if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               /*
+                * This would be a code error as an inline directory should
+                * never have an index root.
+                */
+               BUG_ON(dx_root_bh);
+
+               status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
+                                                blocks_wanted, lookup,
+                                                &new_bh);
+               if (status) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               /* Expansion from inline to an indexed directory will
+                * have given us this. */
+               dx_root_bh = lookup->dl_dx_root_bh;
+
+               if (blocks_wanted == 1) {
+                       /*
+                        * If the new dirent will fit inside the space
+                        * created by pushing out to one block, then
+                        * we can complete the operation
+                        * here. Otherwise we have to expand i_size
+                        * and format the 2nd block below.
+                        */
+                       BUG_ON(new_bh == NULL);
+                       goto bail_bh;
+               }
+
+               /*
+                * Get rid of 'new_bh' - we want to format the 2nd
+                * data block and return that instead.
+                */
+               brelse(new_bh);
+               new_bh = NULL;
+
+               dir_i_size = i_size_read(dir);
+               credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+               goto do_extend;
+       }
+
+       dir_i_size = i_size_read(dir);
+       mlog(0, "extending dir %llu (i_size = %lld)\n",
+            (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
+
+       /* dir->i_size is always block aligned. */
+       spin_lock(&OCFS2_I(dir)->ip_lock);
+       if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
+               spin_unlock(&OCFS2_I(dir)->ip_lock);
+               ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
+               num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
+               if (num_free_extents < 0) {
+                       status = num_free_extents;
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               if (!num_free_extents) {
+                       status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
+                       if (status < 0) {
+                               if (status != -ENOSPC)
+                                       mlog_errno(status);
+                               goto bail;
+                       }
+               }
+
+               status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+               if (status < 0) {
+                       if (status != -ENOSPC)
+                               mlog_errno(status);
+                       goto bail;
+               }
+
+               credits = ocfs2_calc_extend_credits(sb, el, 1);
+       } else {
+               spin_unlock(&OCFS2_I(dir)->ip_lock);
+               credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+       }
+
+do_extend:
+       if (ocfs2_dir_indexed(dir))
+               credits++; /* For attaching the new dirent block to the
+                           * dx_root */
+
+       down_write(&OCFS2_I(dir)->ip_alloc_sem);
+       drop_alloc_sem = 1;
+
+       handle = ocfs2_start_trans(osb, credits);
+       if (IS_ERR(handle)) {
+               status = PTR_ERR(handle);
+               handle = NULL;
+               mlog_errno(status);
+               goto bail;
+       }
+
+       status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
+                                    data_ac, meta_ac, &new_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       ocfs2_set_new_buffer_uptodate(dir, new_bh);
+
+       status = ocfs2_journal_access_db(handle, dir, new_bh,
+                                        OCFS2_JOURNAL_ACCESS_CREATE);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+       memset(new_bh->b_data, 0, sb->s_blocksize);
+
+       de = (struct ocfs2_dir_entry *) new_bh->b_data;
+       de->inode = 0;
+       if (ocfs2_supports_dir_trailer(dir)) {
+               de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+
+               ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
+
+               if (ocfs2_dir_indexed(dir)) {
+                       status = ocfs2_dx_dir_link_trailer(dir, handle,
+                                                          dx_root_bh, new_bh);
+                       if (status) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+               }
+       } else {
+               de->rec_len = cpu_to_le16(sb->s_blocksize);
+       }
+       status = ocfs2_journal_dirty(handle, new_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       dir_i_size += dir->i_sb->s_blocksize;
+       i_size_write(dir, dir_i_size);
+       dir->i_blocks = ocfs2_inode_sector_count(dir);
+       status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+bail_bh:
+       *new_de_bh = new_bh;
+       get_bh(*new_de_bh);
+bail:
+       if (drop_alloc_sem)
+               up_write(&OCFS2_I(dir)->ip_alloc_sem);
+       if (handle)
+               ocfs2_commit_trans(osb, handle);
+
+       if (data_ac)
+               ocfs2_free_alloc_context(data_ac);
+       if (meta_ac)
+               ocfs2_free_alloc_context(meta_ac);
+
+       brelse(new_bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
+                                  const char *name, int namelen,
+                                  struct buffer_head **ret_de_bh,
+                                  unsigned int *blocks_wanted)
+{
+       int ret;
+       struct super_block *sb = dir->i_sb;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       struct ocfs2_dir_entry *de, *last_de = NULL;
+       char *de_buf, *limit;
+       unsigned long offset = 0;
+       unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+
+       /*
+        * This calculates how many free bytes we'd have in block zero, should
+        * this function force expansion to an extent tree.
+        */
+       if (ocfs2_new_dir_wants_trailer(dir))
+               free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+       else
+               free_space = dir->i_sb->s_blocksize - i_size_read(dir);
+
+       de_buf = di->id2.i_data.id_data;
+       limit = de_buf + i_size_read(dir);
+       rec_len = OCFS2_DIR_REC_LEN(namelen);
+
+       while (de_buf < limit) {
+               de = (struct ocfs2_dir_entry *)de_buf;
+
+               if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
+                       ret = -ENOENT;
+                       goto out;
+               }
+               if (ocfs2_match(namelen, name, de)) {
+                       ret = -EEXIST;
+                       goto out;
+               }
+               /*
+                * No need to check for a trailing dirent record here as
+                * they're not used for inline dirs.
+                */
+
+               if (ocfs2_dirent_would_fit(de, rec_len)) {
+                       /* Ok, we found a spot. Return this bh and let
+                        * the caller actually fill it in. */
+                       *ret_de_bh = di_bh;
+                       get_bh(*ret_de_bh);
+                       ret = 0;
+                       goto out;
+               }
+
+               last_de = de;
+               de_buf += le16_to_cpu(de->rec_len);
+               offset += le16_to_cpu(de->rec_len);
+       }
+
+       /*
+        * We're going to require expansion of the directory - figure
+        * out how many blocks we'll need so that a place for the
+        * dirent can be found.
+        */
+       *blocks_wanted = 1;
+       new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
+       if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
+               *blocks_wanted = 2;
+
+       ret = -ENOSPC;
+out:
+       return ret;
+}
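
The *blocks_wanted computation above reduces to a small piece of arithmetic: once the inline data is pushed out to a block, the last dirent's record grows by the block's free space, and a second block is needed only if that grown record still cannot hold both the trimmed last entry and the new one. A hedged sketch of that check, assuming the ext2-style record length (12-byte fixed header plus the name, rounded up to 4 bytes) that OCFS2_DIR_REC_LEN appears to encode; all values below are made up for illustration.

#include <stdio.h>

/* ext2-style dirent record length: 12-byte header + name, rounded to 4. */
static unsigned int demo_rec_len(unsigned int name_len)
{
        return (name_len + 12 + 3) & ~3U;
}

int main(void)
{
        unsigned int new_name_len = 20;         /* entry being inserted */
        unsigned int last_name_len = 3;         /* last entry in the inline area */
        unsigned int last_rec_len = 16;         /* its current record length */
        unsigned int free_space = 1024;         /* slack gained by growing to one block */
        unsigned int blocks_wanted = 1;

        /*
         * Same test as above: after expansion, the last record grows by
         * free_space; if that still cannot hold the trimmed last entry
         * plus the new one, a second block is required.
         */
        if (last_rec_len + free_space <
            demo_rec_len(new_name_len) + demo_rec_len(last_name_len))
                blocks_wanted = 2;

        printf("blocks wanted: %u\n", blocks_wanted);
        return 0;
}
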
+
+static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
+                                  int namelen, struct buffer_head **ret_de_bh)
+{
+       unsigned long offset;
+       struct buffer_head *bh = NULL;
+       unsigned short rec_len;
+       struct ocfs2_dir_entry *de;
+       struct super_block *sb = dir->i_sb;
+       int status;
+       int blocksize = dir->i_sb->s_blocksize;
+
+       status = ocfs2_read_dir_block(dir, 0, &bh, 0);
+       if (status) {
+               mlog_errno(status);
+               goto bail;
+       }
+
+       rec_len = OCFS2_DIR_REC_LEN(namelen);
+       offset = 0;
+       de = (struct ocfs2_dir_entry *) bh->b_data;
+       while (1) {
+               if ((char *)de >= sb->s_blocksize + bh->b_data) {
+                       brelse(bh);
+                       bh = NULL;
+
+                       if (i_size_read(dir) <= offset) {
+                               /*
+                                * Caller will have to expand this
+                                * directory.
+                                */
+                               status = -ENOSPC;
+                               goto bail;
+                       }
+                       status = ocfs2_read_dir_block(dir,
+                                            offset >> sb->s_blocksize_bits,
+                                            &bh, 0);
+                       if (status) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
+                       /* move to next block */
+                       de = (struct ocfs2_dir_entry *) bh->b_data;
+               }
+               if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+                       status = -ENOENT;
+                       goto bail;
+               }
+               if (ocfs2_match(namelen, name, de)) {
+                       status = -EEXIST;
+                       goto bail;
+               }
+
+               if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+                                          blocksize))
+                       goto next;
+
+               if (ocfs2_dirent_would_fit(de, rec_len)) {
+                       /* Ok, we found a spot. Return this bh and let
+                        * the caller actually fill it in. */
+                       *ret_de_bh = bh;
+                       get_bh(*ret_de_bh);
+                       status = 0;
+                       goto bail;
+               }
+next:
+               offset += le16_to_cpu(de->rec_len);
+               de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
+       }
+
+       status = 0;
+bail:
+       brelse(bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+static int dx_leaf_sort_cmp(const void *a, const void *b)
+{
+       const struct ocfs2_dx_entry *entry1 = a;
+       const struct ocfs2_dx_entry *entry2 = b;
+       u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
+       u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
+       u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
+       u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
+
+       if (major_hash1 > major_hash2)
+               return 1;
+       if (major_hash1 < major_hash2)
+               return -1;
+
+       /*
+        * It is not strictly necessary to sort by minor hash here.
+        */
+       if (minor_hash1 > minor_hash2)
+               return 1;
+       if (minor_hash1 < minor_hash2)
+               return -1;
+       return 0;
+}
+
+static void dx_leaf_sort_swap(void *a, void *b, int size)
+{
+       struct ocfs2_dx_entry *entry1 = a;
+       struct ocfs2_dx_entry *entry2 = b;
+       struct ocfs2_dx_entry tmp;
+
+       BUG_ON(size != sizeof(*entry1));
+
+       tmp = *entry1;
+       *entry1 = *entry2;
+       *entry2 = tmp;
+}
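
dx_leaf_sort_cmp() orders index entries by major hash and then by minor hash; the separate swap callback exists because the kernel's sort() works on arbitrary element sizes in place. A userspace sketch of just the ordering, using a host-endian stand-in struct and the standard qsort() rather than the kernel's sort()/swap pair (names below are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Host-endian stand-in for struct ocfs2_dx_entry (illustrative only). */
struct demo_dx_entry {
        uint32_t major_hash;
        uint32_t minor_hash;
};

/* Same ordering as dx_leaf_sort_cmp(): major hash first, minor second. */
static int demo_cmp(const void *a, const void *b)
{
        const struct demo_dx_entry *e1 = a, *e2 = b;

        if (e1->major_hash != e2->major_hash)
                return e1->major_hash < e2->major_hash ? -1 : 1;
        if (e1->minor_hash != e2->minor_hash)
                return e1->minor_hash < e2->minor_hash ? -1 : 1;
        return 0;
}

int main(void)
{
        struct demo_dx_entry e[] = { { 7, 2 }, { 3, 9 }, { 7, 1 }, { 3, 0 } };
        size_t i, n = sizeof(e) / sizeof(e[0]);

        qsort(e, n, sizeof(e[0]), demo_cmp);
        for (i = 0; i < n; i++)         /* prints 3/0 3/9 7/1 7/2 */
                printf("%u/%u\n", e[i].major_hash, e[i].minor_hash);
        return 0;
}
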
+
+static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
+{
+       struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+       int i, num = le16_to_cpu(dl_list->de_num_used);
+
+       for (i = 0; i < (num - 1); i++) {
+               if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
+                   le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
+                       return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Find the optimal value to split this leaf on. This expects the leaf
+ * entries to be in sorted order.
+ *
+ * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
+ * the hash we want to insert.
+ *
+ * This function is only concerned with the major hash - that which
+ * determines which cluster an item belongs to.
+ */
+static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
+                                       u32 leaf_cpos, u32 insert_hash,
+                                       u32 *split_hash)
+{
+       struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
+       int i, num_used = le16_to_cpu(dl_list->de_num_used);
+       int allsame;
+
+       /*
+        * There are a couple of rare, but nasty corner cases we have to
+        * check for here. All of them involve a leaf where all values
+        * have the same hash, which is what we look for first.
+        *
+        * Most of the time, all of the above is false, and we simply
+        * pick the median value for a split.
+        */
+       allsame = ocfs2_dx_leaf_same_major(dx_leaf);
+       if (allsame) {
+               u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
+
+               if (val == insert_hash) {
+                       /*
+                        * No matter where we would choose to split,
+                        * the new entry would want to occupy the same
+                        * block as these. Since there's no space left
+                        * in their existing block, we know there
+                        * won't be space after the split.
+                        */
+                       return -ENOSPC;
+               }
+
+               if (val == leaf_cpos) {
+                       /*
+                        * Because val is the same as leaf_cpos (which
+                        * is the smallest value this leaf can have),
+                        * yet is not equal to insert_hash, we
+                        * know that insert_hash *must* be larger than
+                        * val (and leaf_cpos). At least cpos+1 in value.
+                        *
+                        * We also know then, that there cannot be an
+                        * adjacent extent (otherwise we'd be looking
+                        * at it). Choosing this value gives us a
+                        * chance to get some contiguousness.
+                        */
+                       *split_hash = leaf_cpos + 1;
+                       return 0;
+               }
+
+               if (val > insert_hash) {
+                       /*
+                        * val cannot be the same as insert_hash, and
+                        * also must be larger than leaf_cpos. Also,
+                        * we know that there can't be a leaf between
+                        * cpos and val, otherwise the entries with
+                        * hash 'val' would be there.
+                        */
+                       *split_hash = val;
+                       return 0;
+               }
+
+               *split_hash = insert_hash;
+               return 0;
+       }
+
+       /*
+        * Since the records are sorted and the checks above
+        * guaranteed that not all records in this block are the same,
+        * we simply travel forward, from the median, and pick the 1st
+        * record whose value is larger than leaf_cpos.
+        */
+       for (i = (num_used / 2); i < num_used; i++)
+               if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
+                   leaf_cpos)
+                       break;
+
+       BUG_ON(i == num_used); /* Should be impossible */
+       *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
+       return 0;
+}
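
A simplified, host-endian model of the split decision above may help: it operates on a plain sorted array of major hashes, returns -1 where the real code returns -ENOSPC, and otherwise mirrors the all-same-hash corner cases and the scan-from-the-median fallback. demo_find_leaf_split() and its arguments are illustrative stand-ins, not ocfs2 interfaces.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Pick a split hash for a full, sorted leaf. 'major' holds the sorted
 * major hashes, 'leaf_cpos' is the smallest hash this leaf may hold,
 * 'insert_hash' is the entry that did not fit. Returns 0 and fills
 * *split_hash, or -1 when no split can make room (the -ENOSPC case).
 */
static int demo_find_leaf_split(const uint32_t *major, int num_used,
                                uint32_t leaf_cpos, uint32_t insert_hash,
                                uint32_t *split_hash)
{
        int i, allsame = 1;

        for (i = 0; i < num_used - 1; i++)
                if (major[i] != major[i + 1])
                        allsame = 0;

        if (allsame) {
                uint32_t val = major[0];

                if (val == insert_hash)
                        return -1;      /* new entry hashes into this same, full leaf */
                if (val == leaf_cpos) {
                        *split_hash = leaf_cpos + 1;
                        return 0;
                }
                if (val > insert_hash) {
                        *split_hash = val;
                        return 0;
                }
                *split_hash = insert_hash;
                return 0;
        }

        /* Mixed hashes: from the median, take the first value > leaf_cpos. */
        for (i = num_used / 2; i < num_used; i++)
                if (major[i] > leaf_cpos)
                        break;
        assert(i < num_used);           /* mirrors the BUG_ON() above */
        *split_hash = major[i];
        return 0;
}

int main(void)
{
        uint32_t leaf[] = { 100, 100, 250, 300 };       /* already sorted */
        uint32_t split;

        if (!demo_find_leaf_split(leaf, 4, 100, 275, &split))
                printf("split at %u\n", split);         /* prints: split at 250 */
        return 0;
}
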
+
+/*
+ * Transfer all entries in orig_dx_leaves whose major hash is equal to or
+ * larger than split_hash into new_dx_leaves. We use a temporary
+ * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
+ *
+ * Since the block offset inside a leaf (cluster) is a constant mask
+ * of minor_hash, we can optimize - an item at block offset X within
+ * the original cluster will be at offset X within the new cluster.
+ */
+static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
+                                      handle_t *handle,
+                                      struct ocfs2_dx_leaf *tmp_dx_leaf,
+                                      struct buffer_head **orig_dx_leaves,
+                                      struct buffer_head **new_dx_leaves,
+                                      int num_dx_leaves)
+{
+       int i, j, num_used;
+       u32 major_hash;
+       struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
+       struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
+       struct ocfs2_dx_entry *dx_entry;
+
+       tmp_list = &tmp_dx_leaf->dl_list;
+
+       for (i = 0; i < num_dx_leaves; i++) {
+               orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
+               orig_list = &orig_dx_leaf->dl_list;
+               new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
+               new_list = &new_dx_leaf->dl_list;
+
+               num_used = le16_to_cpu(orig_list->de_num_used);
+
+               memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
+               tmp_list->de_num_used = cpu_to_le16(0);
+               memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
+
+               for (j = 0; j < num_used; j++) {
+                       dx_entry = &orig_list->de_entries[j];
+                       major_hash = le32_to_cpu(dx_entry->dx_major_hash);
+                       if (major_hash >= split_hash)
+                               ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
+                                                             dx_entry);
+                       else
+                               ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
+                                                             dx_entry);
+               }
+               memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
+
+               ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
+               ocfs2_journal_dirty(handle, new_dx_leaves[i]);
+       }
+}
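
The "constant mask of minor_hash" observation is what makes the straight index-for-index copy above valid: which block of a cluster an entry lands in depends only on its minor hash and the filesystem geometry, not on which cluster it is. A sketch of that mapping, assuming (as ocfs2 layouts give) a power-of-two number of blocks per cluster; the helper below is illustrative, while the real lookups go through ocfs2_dx_dir_hash_idx()/__ocfs2_dx_dir_hash_idx() seen elsewhere in this patch.

#include <stdint.h>
#include <stdio.h>

/*
 * In-cluster block index for a dx leaf. The mask depends only on the
 * filesystem geometry, so an entry keeps the same in-cluster block
 * index when it is moved to a newly allocated cluster.
 */
static unsigned int demo_dx_block_index(uint32_t minor_hash,
                                        unsigned int blocks_per_cluster)
{
        return minor_hash & (blocks_per_cluster - 1);
}

int main(void)
{
        unsigned int bpc = 32;          /* e.g. 128K clusters of 4K blocks */
        uint32_t minor = 0x12345678;

        printf("block %u of %u\n", demo_dx_block_index(minor, bpc), bpc);
        return 0;
}
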
+
+static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
+                                         struct ocfs2_dx_root_block *dx_root)
+{
+       int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
+
+       credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+       credits += ocfs2_quota_trans_credits(osb->sb);
+       return credits;
+}
+
+/*
+ * Find the median value in dx_leaf_bh and allocate a new leaf to move
+ * half our entries into.
+ */
+static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
+                                 struct buffer_head *dx_root_bh,
+                                 struct buffer_head *dx_leaf_bh,
+                                 struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
+                                 u64 leaf_blkno)
+{
+       struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+       int credits, ret, i, num_used, did_quota = 0;
+       u32 cpos, split_hash, insert_hash = hinfo->major_hash;
+       u64 orig_leaves_start;
+       int num_dx_leaves;
+       struct buffer_head **orig_dx_leaves = NULL;
+       struct buffer_head **new_dx_leaves = NULL;
+       struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
+       struct ocfs2_extent_tree et;
+       handle_t *handle = NULL;
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
+
+       mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
+            (unsigned long long)OCFS2_I(dir)->ip_blkno,
+            (unsigned long long)leaf_blkno, insert_hash);
+
+       ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+       /*
+        * XXX: This is a rather large limit. We should use a more
+        * realistic value.
+        */
+       if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
+               return -ENOSPC;
+
+       num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
+       if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
+               mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
+                    "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                    (unsigned long long)leaf_blkno, num_used);
+               ret = -EIO;
+               goto out;
+       }
+
+       orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+       if (!orig_dx_leaves) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
+       if (!new_dx_leaves) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
+       if (ret) {
+               if (ret != -ENOSPC)
+                       mlog_errno(ret);
+               goto out;
+       }
+
+       credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
+       handle = ocfs2_start_trans(osb, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               handle = NULL;
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (vfs_dq_alloc_space_nodirty(dir,
+                                      ocfs2_clusters_to_bytes(dir->i_sb, 1))) {
+               ret = -EDQUOT;
+               goto out_commit;
+       }
+       did_quota = 1;
+
+       ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       /*
+        * This block is changing anyway, so we can sort it in place.
+        */
+       sort(dx_leaf->dl_list.de_entries, num_used,
+            sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
+            dx_leaf_sort_swap);
+
+       ret = ocfs2_journal_dirty(handle, dx_leaf_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
+                                          &split_hash);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
+            leaf_cpos, split_hash, insert_hash);
+
+       /*
+        * We have to carefully order operations here. There are items
+        * which want to be in the new cluster before insert, but in
+        * order to put those items in the new cluster, we alter the
+        * old cluster. A failure to insert gets nasty.
+        *
+        * So, start by reserving writes to the old
+        * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
+        * the new cluster for us, before inserting it. The insert
+        * won't happen if there's an error before that. Once the
+        * insert is done, we can transfer from one leaf into the
+        * other without fear of hitting any error.
+        */
+
+       /*
+        * The leaf transfer wants some scratch space so that we don't
+        * wind up doing a bunch of expensive memmove().
+        */
+       tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
+       if (!tmp_dx_leaf) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
+       ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
+                                  orig_dx_leaves);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       for (i = 0; i < num_dx_leaves; i++) {
+               ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i],
+                                             OCFS2_JOURNAL_ACCESS_WRITE);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_commit;
                }
+       }
 
-               if (blocks_wanted == 1) {
+       cpos = split_hash;
+       ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
+                                      data_ac, meta_ac, new_dx_leaves,
+                                      num_dx_leaves);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
+                                  orig_dx_leaves, new_dx_leaves, num_dx_leaves);
+
+out_commit:
+       if (ret < 0 && did_quota)
+               vfs_dq_free_space_nodirty(dir,
+                               ocfs2_clusters_to_bytes(dir->i_sb, 1));
+
+       ocfs2_commit_trans(osb, handle);
+
+out:
+       if (orig_dx_leaves || new_dx_leaves) {
+               for (i = 0; i < num_dx_leaves; i++) {
+                       if (orig_dx_leaves)
+                               brelse(orig_dx_leaves[i]);
+                       if (new_dx_leaves)
+                               brelse(new_dx_leaves[i]);
+               }
+               kfree(orig_dx_leaves);
+               kfree(new_dx_leaves);
+       }
+
+       if (meta_ac)
+               ocfs2_free_alloc_context(meta_ac);
+       if (data_ac)
+               ocfs2_free_alloc_context(data_ac);
+
+       kfree(tmp_dx_leaf);
+       return ret;
+}
+
+static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
+                                  struct buffer_head *di_bh,
+                                  struct buffer_head *dx_root_bh,
+                                  const char *name, int namelen,
+                                  struct ocfs2_dir_lookup_result *lookup)
+{
+       int ret, rebalanced = 0;
+       struct ocfs2_dx_root_block *dx_root;
+       struct buffer_head *dx_leaf_bh = NULL;
+       struct ocfs2_dx_leaf *dx_leaf;
+       u64 blkno;
+       u32 leaf_cpos;
+
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+restart_search:
+       ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
+                                 &leaf_cpos, &blkno);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
+
+       if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
+           le16_to_cpu(dx_leaf->dl_list.de_count)) {
+               if (rebalanced) {
                        /*
-                        * If the new dirent will fit inside the space
-                        * created by pushing out to one block, then
-                        * we can complete the operation
-                        * here. Otherwise we have to expand i_size
-                        * and format the 2nd block below.
+                        * Rebalancing should have provided us with
+                        * space in an appropriate leaf.
+                        *
+                        * XXX: Is this an abnormal condition then?
+                        * Should we print a message here?
                         */
-                       BUG_ON(new_bh == NULL);
-                       goto bail_bh;
+                       ret = -ENOSPC;
+                       goto out;
+               }
+
+               ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
+                                            &lookup->dl_hinfo, leaf_cpos,
+                                            blkno);
+               if (ret) {
+                       if (ret != -ENOSPC)
+                               mlog_errno(ret);
+                       goto out;
                }
 
                /*
-                * Get rid of 'new_bh' - we want to format the 2nd
-                * data block and return that instead.
+                * Restart the lookup. The rebalance might have
+                * changed which block our item fits into. Mark our
+                * progress, so we only execute this once.
                 */
-               brelse(new_bh);
-               new_bh = NULL;
-
-               dir_i_size = i_size_read(dir);
-               credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
-               goto do_extend;
+               brelse(dx_leaf_bh);
+               dx_leaf_bh = NULL;
+               rebalanced = 1;
+               goto restart_search;
        }
 
-       dir_i_size = i_size_read(dir);
-       mlog(0, "extending dir %llu (i_size = %lld)\n",
-            (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
+       lookup->dl_dx_leaf_bh = dx_leaf_bh;
+       dx_leaf_bh = NULL;
 
-       /* dir->i_size is always block aligned. */
-       spin_lock(&OCFS2_I(dir)->ip_lock);
-       if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
-               spin_unlock(&OCFS2_I(dir)->ip_lock);
-               ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
-               num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
-               if (num_free_extents < 0) {
-                       status = num_free_extents;
-                       mlog_errno(status);
-                       goto bail;
-               }
+out:
+       brelse(dx_leaf_bh);
+       return ret;
+}
 
-               if (!num_free_extents) {
-                       status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
-                       if (status < 0) {
-                               if (status != -ENOSPC)
-                                       mlog_errno(status);
-                               goto bail;
-                       }
+static int ocfs2_search_dx_free_list(struct inode *dir,
+                                    struct buffer_head *dx_root_bh,
+                                    int namelen,
+                                    struct ocfs2_dir_lookup_result *lookup)
+{
+       int ret = -ENOSPC;
+       struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
+       struct ocfs2_dir_block_trailer *db;
+       u64 next_block;
+       int rec_len = OCFS2_DIR_REC_LEN(namelen);
+       struct ocfs2_dx_root_block *dx_root;
+
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+       next_block = le64_to_cpu(dx_root->dr_free_blk);
+
+       while (next_block) {
+               brelse(prev_leaf_bh);
+               prev_leaf_bh = leaf_bh;
+               leaf_bh = NULL;
+
+               ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
                }
 
-               status = ocfs2_reserve_clusters(osb, 1, &data_ac);
-               if (status < 0) {
-                       if (status != -ENOSPC)
-                               mlog_errno(status);
-                       goto bail;
+               db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
+               if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
+                       lookup->dl_leaf_bh = leaf_bh;
+                       lookup->dl_prev_leaf_bh = prev_leaf_bh;
+                       leaf_bh = NULL;
+                       prev_leaf_bh = NULL;
+                       break;
                }
 
-               credits = ocfs2_calc_extend_credits(sb, el, 1);
-       } else {
-               spin_unlock(&OCFS2_I(dir)->ip_lock);
-               credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+               next_block = le64_to_cpu(db->db_free_next);
        }
 
-do_extend:
-       down_write(&OCFS2_I(dir)->ip_alloc_sem);
-       drop_alloc_sem = 1;
+       if (!next_block)
+               ret = -ENOSPC;
 
-       handle = ocfs2_start_trans(osb, credits);
-       if (IS_ERR(handle)) {
-               status = PTR_ERR(handle);
-               handle = NULL;
-               mlog_errno(status);
-               goto bail;
+out:
+
+       brelse(leaf_bh);
+       brelse(prev_leaf_bh);
+       return ret;
+}
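
The walk above keeps the previous block alongside the current one because the free list is singly linked through db_free_next: handing dl_prev_leaf_bh back with the match lets a later step unlink the block without re-walking the list. A generic sketch of the same remember-the-predecessor pattern (plain structs, not ocfs2 types):

#include <stddef.h>

struct demo_node {
        struct demo_node *next;
        unsigned int free_bytes;
};

/*
 * Find the first node with at least 'need' free bytes and also report
 * its predecessor (NULL when the match is the list head), so a later
 * step can unlink the node without re-walking the list.
 */
static struct demo_node *demo_find_free(struct demo_node *head,
                                        unsigned int need,
                                        struct demo_node **prev_out)
{
        struct demo_node *prev = NULL, *cur;

        for (cur = head; cur; prev = cur, cur = cur->next) {
                if (cur->free_bytes >= need) {
                        *prev_out = prev;
                        return cur;
                }
        }
        *prev_out = NULL;
        return NULL;            /* the -ENOSPC case */
}

int main(void)
{
        struct demo_node b = { NULL, 80 };
        struct demo_node a = { &b, 12 };
        struct demo_node *prev, *hit;

        hit = demo_find_free(&a, 64, &prev);
        return (hit == &b && prev == &a) ? 0 : 1;
}
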
+
+static int ocfs2_expand_inline_dx_root(struct inode *dir,
+                                      struct buffer_head *dx_root_bh)
+{
+       int ret, num_dx_leaves, i, j, did_quota = 0;
+       struct buffer_head **dx_leaves = NULL;
+       struct ocfs2_extent_tree et;
+       u64 insert_blkno;
+       struct ocfs2_alloc_context *data_ac = NULL;
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+       handle_t *handle = NULL;
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dx_entry_list *entry_list;
+       struct ocfs2_dx_entry *dx_entry;
+       struct ocfs2_dx_leaf *target_leaf;
+
+       ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
        }
 
-       status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
-                                    data_ac, meta_ac, &new_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
+       dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
+       if (!dx_leaves) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
        }
 
-       ocfs2_set_new_buffer_uptodate(dir, new_bh);
+       handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               goto out;
+       }
 
-       status = ocfs2_journal_access_db(handle, dir, new_bh,
-                                        OCFS2_JOURNAL_ACCESS_CREATE);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
+       if (vfs_dq_alloc_space_nodirty(dir,
+                                      ocfs2_clusters_to_bytes(osb->sb, 1))) {
+               ret = -EDQUOT;
+               goto out_commit;
        }
-       memset(new_bh->b_data, 0, sb->s_blocksize);
+       did_quota = 1;
 
-       de = (struct ocfs2_dir_entry *) new_bh->b_data;
-       de->inode = 0;
-       if (ocfs2_dir_has_trailer(dir)) {
-               de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
-               ocfs2_init_dir_trailer(dir, new_bh);
-       } else {
-               de->rec_len = cpu_to_le16(sb->s_blocksize);
+       /*
+        * We do this up front, before the allocation, so that a
+        * failure to add the dx_root_bh to the journal won't result
+        * in us losing clusters.
+        */
+       ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
        }
-       status = ocfs2_journal_dirty(handle, new_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
+
+       ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
+                                        num_dx_leaves, &insert_blkno);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
        }
 
-       dir_i_size += dir->i_sb->s_blocksize;
-       i_size_write(dir, dir_i_size);
-       dir->i_blocks = ocfs2_inode_sector_count(dir);
-       status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
+       /*
+        * Transfer the entries from our dx_root into the appropriate
+        * block
+        */
+       dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+       entry_list = &dx_root->dr_entries;
+
+       for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
+               dx_entry = &entry_list->de_entries[i];
+
+               j = __ocfs2_dx_dir_hash_idx(osb,
+                                           le32_to_cpu(dx_entry->dx_minor_hash));
+               target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
+
+               ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
+
+               /* Each leaf has been passed to the journal already
+                * via __ocfs2_dx_dir_new_cluster() */
        }
 
-bail_bh:
-       *new_de_bh = new_bh;
-       get_bh(*new_de_bh);
-bail:
-       if (drop_alloc_sem)
-               up_write(&OCFS2_I(dir)->ip_alloc_sem);
-       if (handle)
-               ocfs2_commit_trans(osb, handle);
+       dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
+       memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
+              offsetof(struct ocfs2_dx_root_block, dr_list));
+       dx_root->dr_list.l_count =
+               cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
+
+       /* This should never fail considering we start with an empty
+        * dx_root. */
+       ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+       ret = ocfs2_insert_extent(osb, handle, dir, &et, 0,
+                                 insert_blkno, 1, 0, NULL);
+       if (ret)
+               mlog_errno(ret);
+       did_quota = 0;
+
+       ocfs2_journal_dirty(handle, dx_root_bh);
+
+out_commit:
+       if (ret < 0 && did_quota)
+               vfs_dq_free_space_nodirty(dir,
+                                         ocfs2_clusters_to_bytes(dir->i_sb, 1));
+
+       ocfs2_commit_trans(osb, handle);
+
+out:
+       if (data_ac)
+               ocfs2_free_alloc_context(data_ac);
+
+       if (dx_leaves) {
+               for (i = 0; i < num_dx_leaves; i++)
+                       brelse(dx_leaves[i]);
+               kfree(dx_leaves);
+       }
+       return ret;
+}
 
-       if (data_ac)
-               ocfs2_free_alloc_context(data_ac);
-       if (meta_ac)
-               ocfs2_free_alloc_context(meta_ac);
+static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
+{
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dx_entry_list *entry_list;
 
-       brelse(new_bh);
+       dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+       entry_list = &dx_root->dr_entries;
 
-       mlog_exit(status);
-       return status;
+       if (le16_to_cpu(entry_list->de_num_used) >=
+           le16_to_cpu(entry_list->de_count))
+               return -ENOSPC;
+
+       return 0;
 }
 
-static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
-                                  const char *name, int namelen,
-                                  struct buffer_head **ret_de_bh,
-                                  unsigned int *blocks_wanted)
+static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
+                                          struct buffer_head *di_bh,
+                                          const char *name,
+                                          int namelen,
+                                          struct ocfs2_dir_lookup_result *lookup)
 {
-       int ret;
-       struct super_block *sb = dir->i_sb;
+       int ret, free_dx_root = 1;
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+       struct buffer_head *dx_root_bh = NULL;
+       struct buffer_head *leaf_bh = NULL;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-       struct ocfs2_dir_entry *de, *last_de = NULL;
-       char *de_buf, *limit;
-       unsigned long offset = 0;
-       unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+       struct ocfs2_dx_root_block *dx_root;
 
-       /*
-        * This calculates how many free bytes we'd have in block zero, should
-        * this function force expansion to an extent tree.
-        */
-       if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
-               free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
-       else
-               free_space = dir->i_sb->s_blocksize - i_size_read(dir);
+       ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
 
-       de_buf = di->id2.i_data.id_data;
-       limit = de_buf + i_size_read(dir);
-       rec_len = OCFS2_DIR_REC_LEN(namelen);
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+       if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
+               ret = -ENOSPC;
+               mlog_errno(ret);
+               goto out;
+       }
 
-       while (de_buf < limit) {
-               de = (struct ocfs2_dir_entry *)de_buf;
+       if (ocfs2_dx_root_inline(dx_root)) {
+               ret = ocfs2_inline_dx_has_space(dx_root_bh);
+
+               if (ret == 0)
+                       goto search_el;
 
-               if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
-                       ret = -ENOENT;
-                       goto out;
-               }
-               if (ocfs2_match(namelen, name, de)) {
-                       ret = -EEXIST;
-                       goto out;
-               }
                /*
-                * No need to check for a trailing dirent record here as
-                * they're not used for inline dirs.
+                * We ran out of room in the root block. Expand it to
+                * an extent, then allow ocfs2_find_dir_space_dx to do
+                * the rest.
                 */
-
-               if (ocfs2_dirent_would_fit(de, rec_len)) {
-                       /* Ok, we found a spot. Return this bh and let
-                        * the caller actually fill it in. */
-                       *ret_de_bh = di_bh;
-                       get_bh(*ret_de_bh);
-                       ret = 0;
+               ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
+               if (ret) {
+                       mlog_errno(ret);
                        goto out;
                }
-
-               last_de = de;
-               de_buf += le16_to_cpu(de->rec_len);
-               offset += le16_to_cpu(de->rec_len);
        }
 
        /*
-        * We're going to require expansion of the directory - figure
-        * out how many blocks we'll need so that a place for the
-        * dirent can be found.
+        * Insert preparation for an indexed directory is split into two
+        * steps. The call to find_dir_space_dx reserves room in the index for
+        * an additional item. If we run out of space there, it's a real error
+        * and we can't continue.
         */
-       *blocks_wanted = 1;
-       new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
-       if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
-               *blocks_wanted = 2;
-
-       ret = -ENOSPC;
-out:
-       return ret;
-}
-
-static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
-                                  int namelen, struct buffer_head **ret_de_bh)
-{
-       unsigned long offset;
-       struct buffer_head *bh = NULL;
-       unsigned short rec_len;
-       struct ocfs2_dir_entry *de;
-       struct super_block *sb = dir->i_sb;
-       int status;
-       int blocksize = dir->i_sb->s_blocksize;
-
-       status = ocfs2_read_dir_block(dir, 0, &bh, 0);
-       if (status) {
-               mlog_errno(status);
-               goto bail;
+       ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
+                                     namelen, lookup);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
        }
 
-       rec_len = OCFS2_DIR_REC_LEN(namelen);
-       offset = 0;
-       de = (struct ocfs2_dir_entry *) bh->b_data;
-       while (1) {
-               if ((char *)de >= sb->s_blocksize + bh->b_data) {
-                       brelse(bh);
-                       bh = NULL;
+search_el:
+       /*
+        * Next, we need to find space in the unindexed tree. This call
+        * searches using the free space linked list. If the unindexed tree
+        * lacks sufficient space, we'll expand it below. The expansion code
+        * is smart enough to add any new blocks to the free space list.
+        */
+       ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
+       if (ret && ret != -ENOSPC) {
+               mlog_errno(ret);
+               goto out;
+       }
 
-                       if (i_size_read(dir) <= offset) {
-                               /*
-                                * Caller will have to expand this
-                                * directory.
-                                */
-                               status = -ENOSPC;
-                               goto bail;
-                       }
-                       status = ocfs2_read_dir_block(dir,
-                                            offset >> sb->s_blocksize_bits,
-                                            &bh, 0);
-                       if (status) {
-                               mlog_errno(status);
-                               goto bail;
-                       }
-                       /* move to next block */
-                       de = (struct ocfs2_dir_entry *) bh->b_data;
-               }
-               if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
-                       status = -ENOENT;
-                       goto bail;
-               }
-               if (ocfs2_match(namelen, name, de)) {
-                       status = -EEXIST;
-                       goto bail;
-               }
+       /* Do this up here - ocfs2_extend_dir might need the dx_root */
+       lookup->dl_dx_root_bh = dx_root_bh;
+       free_dx_root = 0;
 
-               if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
-                                          blocksize))
-                       goto next;
+       if (ret == -ENOSPC) {
+               ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
 
-               if (ocfs2_dirent_would_fit(de, rec_len)) {
-                       /* Ok, we found a spot. Return this bh and let
-                        * the caller actually fill it in. */
-                       *ret_de_bh = bh;
-                       get_bh(*ret_de_bh);
-                       status = 0;
-                       goto bail;
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
                }
-next:
-               offset += le16_to_cpu(de->rec_len);
-               de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
-       }
 
-       status = 0;
-bail:
-       brelse(bh);
+               /*
+                * We make the assumption here that new leaf blocks are added
+                * to the front of our free list.
+                */
+               lookup->dl_prev_leaf_bh = NULL;
+               lookup->dl_leaf_bh = leaf_bh;
+       }
 
-       mlog_exit(status);
-       return status;
+out:
+       if (free_dx_root)
+               brelse(dx_root_bh);
+       return ret;
 }
 
+/*
+ * Get a directory ready for insert. Any directory allocation required
+ * happens here. Success returns zero, and enough context in the dir
+ * lookup result that ocfs2_add_entry() will be able to complete the task
+ * with minimal performance impact.
+ */
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
                                 struct inode *dir,
                                 struct buffer_head *parent_fe_bh,
                                 const char *name,
                                 int namelen,
-                                struct buffer_head **ret_de_bh)
+                                struct ocfs2_dir_lookup_result *lookup)
 {
        int ret;
        unsigned int blocks_wanted = 1;
@@ -1984,14 +4367,34 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
        mlog(0, "getting ready to insert namelen %d into dir %llu\n",
             namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
-       *ret_de_bh = NULL;
-
        if (!namelen) {
                ret = -EINVAL;
                mlog_errno(ret);
                goto out;
        }
 
+       /*
+        * Do this up front to reduce confusion.
+        *
+        * The directory might start inline, then be turned into an
+        * indexed one, in which case we'd need to hash deep inside
+        * ocfs2_find_dir_space_id(). Since
+        * ocfs2_prepare_dx_dir_for_insert() also needs this hash
+        * done, there seems no point in spreading out the calls. We
+        * can optimize away the case where the file system doesn't
+        * support indexing.
+        */
+       if (ocfs2_supports_indexed_dirs(osb))
+               ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
+
+       if (ocfs2_dir_indexed(dir)) {
+               ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
+                                                     name, namelen, lookup);
+               if (ret)
+                       mlog_errno(ret);
+               goto out;
+       }
+
        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
                ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
                                              namelen, &bh, &blocks_wanted);
@@ -2010,7 +4413,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
                BUG_ON(bh);
 
                ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
-                                      &bh);
+                                      lookup, &bh);
                if (ret) {
                        if (ret != -ENOSPC)
                                mlog_errno(ret);
@@ -2020,9 +4423,154 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
                BUG_ON(!bh);
        }
 
-       *ret_de_bh = bh;
+       lookup->dl_leaf_bh = bh;
        bh = NULL;
 out:
        brelse(bh);
        return ret;
 }
+
+static int ocfs2_dx_dir_remove_index(struct inode *dir,
+                                    struct buffer_head *di_bh,
+                                    struct buffer_head *dx_root_bh)
+{
+       int ret;
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       struct ocfs2_dx_root_block *dx_root;
+       struct inode *dx_alloc_inode = NULL;
+       struct buffer_head *dx_alloc_bh = NULL;
+       handle_t *handle;
+       u64 blk;
+       u16 bit;
+       u64 bg_blkno;
+
+       dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
+
+       dx_alloc_inode = ocfs2_get_system_file_inode(osb,
+                                       EXTENT_ALLOC_SYSTEM_INODE,
+                                       le16_to_cpu(dx_root->dr_suballoc_slot));
+       if (!dx_alloc_inode) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
+       }
+       mutex_lock(&dx_alloc_inode->i_mutex);
+
+       ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_mutex;
+       }
+
+       handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               mlog_errno(ret);
+               goto out_unlock;
+       }
+
+       ret = ocfs2_journal_access_di(handle, dir, di_bh,
+                                     OCFS2_JOURNAL_ACCESS_WRITE);
+       if (ret) {
+               mlog_errno(ret);
+               goto out_commit;
+       }
+
+       OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
+       di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
+       di->i_dx_root = cpu_to_le64(0ULL);
+
+       ocfs2_journal_dirty(handle, di_bh);
+
+       blk = le64_to_cpu(dx_root->dr_blkno);
+       bit = le16_to_cpu(dx_root->dr_suballoc_bit);
+       bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+       ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
+                                      bit, bg_blkno, 1);
+       if (ret)
+               mlog_errno(ret);
+
+out_commit:
+       ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+       ocfs2_inode_unlock(dx_alloc_inode, 1);
+
+out_mutex:
+       mutex_unlock(&dx_alloc_inode->i_mutex);
+       brelse(dx_alloc_bh);
+out:
+       iput(dx_alloc_inode);
+       return ret;
+}
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
+{
+       int ret;
+       unsigned int uninitialized_var(clen);
+       u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
+       u64 uninitialized_var(blkno);
+       struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+       struct buffer_head *dx_root_bh = NULL;
+       struct ocfs2_dx_root_block *dx_root;
+       struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+       struct ocfs2_cached_dealloc_ctxt dealloc;
+       struct ocfs2_extent_tree et;
+
+       ocfs2_init_dealloc_ctxt(&dealloc);
+
+       if (!ocfs2_dir_indexed(dir))
+               return 0;
+
+       ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+       dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
+
+       if (ocfs2_dx_root_inline(dx_root))
+               goto remove_index;
+
+       ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh);
+
+       /* XXX: What if dr_clusters is too large? */
+       while (le32_to_cpu(dx_root->dr_clusters)) {
+               ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
+                                             major_hash, &cpos, &blkno, &clen);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
+
+               ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen,
+                                              &dealloc);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               if (cpos == 0)
+                       break;
+
+               major_hash = cpos - 1;
+       }
+
+remove_index:
+       ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       ocfs2_remove_from_cache(dir, dx_root_bh);
+out:
+       ocfs2_schedule_truncate_log_flush(osb, 1);
+       ocfs2_run_deallocs(osb, &dealloc);
+
+       brelse(dx_root_bh);
+       return ret;
+}
index c511e2e18e9f064ff67d7b3299bad90f4fd99a94..e683f3deb64503a3167198a88d37070cf1ee0eba 100644 (file)
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
 
-struct buffer_head *ocfs2_find_entry(const char *name,
-                                    int namelen,
-                                    struct inode *dir,
-                                    struct ocfs2_dir_entry **res_dir);
+struct ocfs2_dx_hinfo {
+       u32     major_hash;
+       u32     minor_hash;
+};
+
+struct ocfs2_dir_lookup_result {
+       struct buffer_head              *dl_leaf_bh;    /* Unindexed leaf
+                                                        * block */
+       struct ocfs2_dir_entry          *dl_entry;      /* Target dirent in
+                                                        * unindexed leaf */
+
+       struct buffer_head              *dl_dx_root_bh; /* Root of indexed
+                                                        * tree */
+
+       struct buffer_head              *dl_dx_leaf_bh; /* Indexed leaf block */
+       struct ocfs2_dx_entry           *dl_dx_entry;   /* Target dx_entry in
+                                                        * indexed leaf */
+       struct ocfs2_dx_hinfo           dl_hinfo;       /* Name hash results */
+
+       struct buffer_head              *dl_prev_leaf_bh;/* Previous entry in
+                                                         * dir free space
+                                                         * list. NULL if
+                                                         * previous entry is
+                                                         * dx root block. */
+};
+
+void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res);
+
+int ocfs2_find_entry(const char *name, int namelen,
+                    struct inode *dir,
+                    struct ocfs2_dir_lookup_result *lookup);
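/*
 * Editorial sketch (not part of this patch): the intended lifecycle of a
 * lookup result, as implied by the declarations above.  The caller owns the
 * buffer_head references held in the result until it calls
 * ocfs2_free_dir_lookup_result().  The helper name below is hypothetical.
 */
static int example_dir_lookup(struct inode *dir, const char *name, int namelen)
{
	struct ocfs2_dir_lookup_result lookup = { NULL, };
	int ret;

	ret = ocfs2_find_entry(name, namelen, dir, &lookup);
	if (ret == 0) {
		/* lookup.dl_entry and lookup.dl_leaf_bh now describe the
		 * dirent; an indexed directory also fills the dl_dx_* fields. */
	}

	ocfs2_free_dir_lookup_result(&lookup);
	return ret;
}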
 int ocfs2_delete_entry(handle_t *handle,
                       struct inode *dir,
-                      struct ocfs2_dir_entry *de_del,
-                      struct buffer_head *bh);
+                      struct ocfs2_dir_lookup_result *res);
 int __ocfs2_add_entry(handle_t *handle,
                      struct inode *dir,
                      const char *name, int namelen,
                      struct inode *inode, u64 blkno,
                      struct buffer_head *parent_fe_bh,
-                     struct buffer_head *insert_bh);
+                     struct ocfs2_dir_lookup_result *lookup);
 static inline int ocfs2_add_entry(handle_t *handle,
                                  struct dentry *dentry,
                                  struct inode *inode, u64 blkno,
                                  struct buffer_head *parent_fe_bh,
-                                 struct buffer_head *insert_bh)
+                                 struct ocfs2_dir_lookup_result *lookup)
 {
        return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
                                 dentry->d_name.name, dentry->d_name.len,
-                                inode, blkno, parent_fe_bh, insert_bh);
+                                inode, blkno, parent_fe_bh, lookup);
 }
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
-                      struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+                      struct ocfs2_dir_lookup_result *res,
                       struct inode *new_entry_inode);
 
 int ocfs2_check_dir_for_entry(struct inode *dir,
                              const char *name,
                              int namelen);
 int ocfs2_empty_dir(struct inode *inode);
+
 int ocfs2_find_files_on_disk(const char *name,
                             int namelen,
                             u64 *blkno,
                             struct inode *inode,
-                            struct buffer_head **dirent_bh,
-                            struct ocfs2_dir_entry **dirent);
+                            struct ocfs2_dir_lookup_result *res);
 int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
                               int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
@@ -74,14 +100,17 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
                                 struct buffer_head *parent_fe_bh,
                                 const char *name,
                                 int namelen,
-                                struct buffer_head **ret_de_bh);
+                                struct ocfs2_dir_lookup_result *lookup);
 struct ocfs2_alloc_context;
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                       handle_t *handle,
                       struct inode *parent,
                       struct inode *inode,
                       struct buffer_head *fe_bh,
-                      struct ocfs2_alloc_context *data_ac);
+                      struct ocfs2_alloc_context *data_ac,
+                      struct ocfs2_alloc_context *meta_ac);
+
+int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh);
 
 struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
                                                            void *data);
index bb53714813abb716dca2bb2fe793ee64dd36a8e9..0102be35980c03ae0d3bcfe561282e5f9781626f 100644 (file)
 enum dlm_mle_type {
        DLM_MLE_BLOCK,
        DLM_MLE_MASTER,
-       DLM_MLE_MIGRATION
-};
-
-struct dlm_lock_name {
-       u8 len;
-       u8 name[DLM_LOCKID_NAME_MAX];
+       DLM_MLE_MIGRATION,
+       DLM_MLE_NUM_TYPES
 };
 
 struct dlm_master_list_entry {
-       struct list_head list;
+       struct hlist_node master_hash_node;
        struct list_head hb_events;
        struct dlm_ctxt *dlm;
        spinlock_t spinlock;
@@ -78,10 +74,10 @@ struct dlm_master_list_entry {
        enum dlm_mle_type type;
        struct o2hb_callback_func mle_hb_up;
        struct o2hb_callback_func mle_hb_down;
-       union {
-               struct dlm_lock_resource *res;
-               struct dlm_lock_name name;
-       } u;
+       struct dlm_lock_resource *mleres;
+       unsigned char mname[DLM_LOCKID_NAME_MAX];
+       unsigned int mnamelen;
+       unsigned int mnamehash;
 };
 
 enum dlm_ast_type {
@@ -151,13 +147,14 @@ struct dlm_ctxt
        unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
        struct dlm_recovery_ctxt reco;
        spinlock_t master_lock;
-       struct list_head master_list;
+       struct hlist_head **master_hash;
        struct list_head mle_hb_events;
 
        /* these give a really vague idea of the system load */
-       atomic_t local_resources;
-       atomic_t remote_resources;
-       atomic_t unknown_resources;
+       atomic_t mle_tot_count[DLM_MLE_NUM_TYPES];
+       atomic_t mle_cur_count[DLM_MLE_NUM_TYPES];
+       atomic_t res_tot_count;
+       atomic_t res_cur_count;
 
        struct dlm_debug_ctxt *dlm_debug_ctxt;
        struct dentry *dlm_debugfs_subroot;
@@ -195,6 +192,13 @@ static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned
        return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
 }
 
+static inline struct hlist_head *dlm_master_hash(struct dlm_ctxt *dlm,
+                                                unsigned i)
+{
+       return dlm->master_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] +
+                       (i % DLM_BUCKETS_PER_PAGE);
+}
+
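/*
 * Editorial sketch (not part of this patch): a lock name is mapped to a
 * bucket by hashing it with dlm_lockid_hash() and passing the hash to
 * dlm_master_hash(), mirroring what dlm_find_mle() does later in this diff:
 *
 *	unsigned int hash = dlm_lockid_hash(name, namelen);
 *	struct hlist_head *bucket = dlm_master_hash(dlm, hash);
 */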
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
@@ -848,9 +852,7 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
                                              unsigned int len);
 
 int dlm_is_host_down(int errno);
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-                             struct dlm_lock_resource *res,
-                             u8 owner);
+
 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
                                                 const char *lockid,
                                                 int namelen,
@@ -1008,6 +1010,9 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
                                          DLM_LOCK_RES_MIGRATING));
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
+
 /* create/destroy slab caches */
 int dlm_init_master_caches(void);
 void dlm_destroy_master_caches(void);
@@ -1110,6 +1115,23 @@ static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
        return bit;
 }
 
+static inline void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
+                                        struct dlm_lock_resource *res,
+                                        u8 owner)
+{
+       assert_spin_locked(&res->spinlock);
+
+       res->owner = owner;
+}
 
+static inline void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
+                                           struct dlm_lock_resource *res,
+                                           u8 owner)
+{
+       assert_spin_locked(&res->spinlock);
+
+       if (owner != res->owner)
+               dlm_set_lockres_owner(dlm, res, owner);
+}
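/*
 * Editorial note (not part of this patch): both helpers assert
 * res->spinlock, so callers are expected to look roughly like the snippet
 * below (compare dlm_init_lockres() later in this diff):
 *
 *	spin_lock(&res->spinlock);
 *	dlm_change_lockres_owner(dlm, res, dlm->node_num);
 *	spin_unlock(&res->spinlock);
 */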
 
 #endif /* DLMCOMMON_H */
index b32f60a5acfb9e4353ce3ad61d009bb6858d158f..df52f706f66971026b2f09ae0f978a8ba17858f7 100644 (file)
@@ -287,18 +287,8 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
 static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 {
        int out = 0;
-       unsigned int namelen;
-       const char *name;
        char *mle_type;
 
-       if (mle->type != DLM_MLE_MASTER) {
-               namelen = mle->u.name.len;
-               name = mle->u.name.name;
-       } else {
-               namelen = mle->u.res->lockname.len;
-               name = mle->u.res->lockname.name;
-       }
-
        if (mle->type == DLM_MLE_BLOCK)
                mle_type = "BLK";
        else if (mle->type == DLM_MLE_MASTER)
@@ -306,7 +296,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
        else
                mle_type = "MIG";
 
-       out += stringify_lockname(name, namelen, buf + out, len - out);
+       out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out);
        out += snprintf(buf + out, len - out,
                        "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
                        mle_type, mle->master, mle->new_master,
@@ -501,23 +491,33 @@ static struct file_operations debug_purgelist_fops = {
 static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
 {
        struct dlm_master_list_entry *mle;
-       int out = 0;
-       unsigned long total = 0;
+       struct hlist_head *bucket;
+       struct hlist_node *list;
+       int i, out = 0;
+       unsigned long total = 0, longest = 0, bktcnt = 0;
 
        out += snprintf(db->buf + out, db->len - out,
                        "Dumping MLEs for Domain: %s\n", dlm->name);
 
        spin_lock(&dlm->master_lock);
-       list_for_each_entry(mle, &dlm->master_list, list) {
-               ++total;
-               if (db->len - out < 200)
-                       continue;
-               out += dump_mle(mle, db->buf + out, db->len - out);
+       for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+               bucket = dlm_master_hash(dlm, i);
+               hlist_for_each(list, bucket) {
+                       mle = hlist_entry(list, struct dlm_master_list_entry,
+                                         master_hash_node);
+                       ++total;
+                       ++bktcnt;
+                       if (db->len - out < 200)
+                               continue;
+                       out += dump_mle(mle, db->buf + out, db->len - out);
+               }
+               longest = max(longest, bktcnt);
+               bktcnt = 0;
        }
        spin_unlock(&dlm->master_lock);
 
        out += snprintf(db->buf + out, db->len - out,
-                       "Total on list: %ld\n", total);
+                       "Total: %ld, Longest: %ld\n", total, longest);
        return out;
 }
 
@@ -756,12 +756,8 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
        int out = 0;
        struct dlm_reco_node_data *node;
        char *state;
-       int lres, rres, ures, tres;
-
-       lres = atomic_read(&dlm->local_resources);
-       rres = atomic_read(&dlm->remote_resources);
-       ures = atomic_read(&dlm->unknown_resources);
-       tres = lres + rres + ures;
+       int cur_mles = 0, tot_mles = 0;
+       int i;
 
        spin_lock(&dlm->spinlock);
 
@@ -804,21 +800,48 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
                                 db->buf + out, db->len - out);
        out += snprintf(db->buf + out, db->len - out, "\n");
 
-       /* Mastered Resources Total: xxx  Locally: xxx  Remotely: ... */
+       /* Lock Resources: xxx (xxx) */
+       out += snprintf(db->buf + out, db->len - out,
+                       "Lock Resources: %d (%d)\n",
+                       atomic_read(&dlm->res_cur_count),
+                       atomic_read(&dlm->res_tot_count));
+
+       for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+               tot_mles += atomic_read(&dlm->mle_tot_count[i]);
+
+       for (i = 0; i < DLM_MLE_NUM_TYPES; ++i)
+               cur_mles += atomic_read(&dlm->mle_cur_count[i]);
+
+       /* MLEs: xxx (xxx) */
+       out += snprintf(db->buf + out, db->len - out,
+                       "MLEs: %d (%d)\n", cur_mles, tot_mles);
+
+       /*  Blocking: xxx (xxx) */
+       out += snprintf(db->buf + out, db->len - out,
+                       "  Blocking: %d (%d)\n",
+                       atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]),
+                       atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK]));
+
+       /*  Mastery: xxx (xxx) */
+       out += snprintf(db->buf + out, db->len - out,
+                       "  Mastery: %d (%d)\n",
+                       atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]),
+                       atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER]));
+
+       /*  Migration: xxx (xxx) */
        out += snprintf(db->buf + out, db->len - out,
-                       "Mastered Resources Total: %d  Locally: %d  "
-                       "Remotely: %d  Unknown: %d\n",
-                       tres, lres, rres, ures);
+                       "  Migration: %d (%d)\n",
+                       atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]),
+                       atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION]));
 
        /* Lists: Dirty=Empty  Purge=InUse  PendingASTs=Empty  ... */
        out += snprintf(db->buf + out, db->len - out,
                        "Lists: Dirty=%s  Purge=%s  PendingASTs=%s  "
-                       "PendingBASTs=%s  Master=%s\n",
+                       "PendingBASTs=%s\n",
                        (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
                        (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
                        (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
-                       (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
-                       (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
+                       (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"));
 
        /* Purge Count: xxx  Refs: xxx */
        out += snprintf(db->buf + out, db->len - out,
index d8d578f4561389f2b5083fdd03d4a66ebfeee1a9..4d9e6b288dd871e4c4ffee4157522913988a159b 100644 (file)
@@ -304,6 +304,9 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
        if (dlm->lockres_hash)
                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
+       if (dlm->master_hash)
+               dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
+
        if (dlm->name)
                kfree(dlm->name);
 
@@ -1534,12 +1537,27 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        for (i = 0; i < DLM_HASH_BUCKETS; i++)
                INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
 
+       dlm->master_hash = (struct hlist_head **)
+                               dlm_alloc_pagevec(DLM_HASH_PAGES);
+       if (!dlm->master_hash) {
+               mlog_errno(-ENOMEM);
+               dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
+               kfree(dlm->name);
+               kfree(dlm);
+               dlm = NULL;
+               goto leave;
+       }
+
+       for (i = 0; i < DLM_HASH_BUCKETS; i++)
+               INIT_HLIST_HEAD(dlm_master_hash(dlm, i));
+
        strcpy(dlm->name, domain);
        dlm->key = key;
        dlm->node_num = o2nm_this_node();
 
        ret = dlm_create_debugfs_subroot(dlm);
        if (ret < 0) {
+               dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
                dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
                kfree(dlm->name);
                kfree(dlm);
@@ -1579,7 +1597,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
        init_waitqueue_head(&dlm->reco.event);
        init_waitqueue_head(&dlm->ast_wq);
        init_waitqueue_head(&dlm->migration_wq);
-       INIT_LIST_HEAD(&dlm->master_list);
        INIT_LIST_HEAD(&dlm->mle_hb_events);
 
        dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
@@ -1587,9 +1604,13 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
        dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
        dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-       atomic_set(&dlm->local_resources, 0);
-       atomic_set(&dlm->remote_resources, 0);
-       atomic_set(&dlm->unknown_resources, 0);
+
+       atomic_set(&dlm->res_tot_count, 0);
+       atomic_set(&dlm->res_cur_count, 0);
+       for (i = 0; i < DLM_MLE_NUM_TYPES; ++i) {
+               atomic_set(&dlm->mle_tot_count[i], 0);
+               atomic_set(&dlm->mle_cur_count[i], 0);
+       }
 
        spin_lock_init(&dlm->work_lock);
        INIT_LIST_HEAD(&dlm->work_list);
index 0a2813947853dfd68372506b9fb43de894c3d577..f8b653fcd4ddb872ec046217a6f1a731db36dc25 100644 (file)
@@ -73,22 +73,13 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
                                const char *name,
                                unsigned int namelen)
 {
-       struct dlm_lock_resource *res;
-
        if (dlm != mle->dlm)
                return 0;
 
-       if (mle->type == DLM_MLE_BLOCK ||
-           mle->type == DLM_MLE_MIGRATION) {
-               if (namelen != mle->u.name.len ||
-                   memcmp(name, mle->u.name.name, namelen)!=0)
-                       return 0;
-       } else {
-               res = mle->u.res;
-               if (namelen != res->lockname.len ||
-                   memcmp(res->lockname.name, name, namelen) != 0)
-                       return 0;
-       }
+       if (namelen != mle->mnamelen ||
+           memcmp(name, mle->mname, namelen) != 0)
+               return 0;
+
        return 1;
 }
 
@@ -283,7 +274,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 
        mle->dlm = dlm;
        mle->type = type;
-       INIT_LIST_HEAD(&mle->list);
+       INIT_HLIST_NODE(&mle->master_hash_node);
        INIT_LIST_HEAD(&mle->hb_events);
        memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
        spin_lock_init(&mle->spinlock);
@@ -295,19 +286,27 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
        mle->new_master = O2NM_MAX_NODES;
        mle->inuse = 0;
 
+       BUG_ON(mle->type != DLM_MLE_BLOCK &&
+              mle->type != DLM_MLE_MASTER &&
+              mle->type != DLM_MLE_MIGRATION);
+
        if (mle->type == DLM_MLE_MASTER) {
                BUG_ON(!res);
-               mle->u.res = res;
-       } else if (mle->type == DLM_MLE_BLOCK) {
-               BUG_ON(!name);
-               memcpy(mle->u.name.name, name, namelen);
-               mle->u.name.len = namelen;
-       } else /* DLM_MLE_MIGRATION */ {
+               mle->mleres = res;
+               memcpy(mle->mname, res->lockname.name, res->lockname.len);
+               mle->mnamelen = res->lockname.len;
+               mle->mnamehash = res->lockname.hash;
+       } else {
                BUG_ON(!name);
-               memcpy(mle->u.name.name, name, namelen);
-               mle->u.name.len = namelen;
+               mle->mleres = NULL;
+               memcpy(mle->mname, name, namelen);
+               mle->mnamelen = namelen;
+               mle->mnamehash = dlm_lockid_hash(name, namelen);
        }
 
+       atomic_inc(&dlm->mle_tot_count[mle->type]);
+       atomic_inc(&dlm->mle_cur_count[mle->type]);
+
        /* copy off the node_map and register hb callbacks on our copy */
        memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
        memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
@@ -318,6 +317,24 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
        __dlm_mle_attach_hb_events(dlm, mle);
 }
 
+void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+       assert_spin_locked(&dlm->spinlock);
+       assert_spin_locked(&dlm->master_lock);
+
+       if (!hlist_unhashed(&mle->master_hash_node))
+               hlist_del_init(&mle->master_hash_node);
+}
+
+void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
+{
+       struct hlist_head *bucket;
+
+       assert_spin_locked(&dlm->master_lock);
+
+       bucket = dlm_master_hash(dlm, mle->mnamehash);
+       hlist_add_head(&mle->master_hash_node, bucket);
+}
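/*
 * Editorial note (not part of this patch): per the assertions above, an
 * insert needs dlm->master_lock, while unlinking also expects dlm->spinlock.
 * A typical insert site (compare the dlm_get_lock_resource() hunk below)
 * therefore looks like:
 *
 *	spin_lock(&dlm->spinlock);
 *	spin_lock(&dlm->master_lock);
 *	__dlm_insert_mle(dlm, mle);
 *	spin_unlock(&dlm->master_lock);
 *	spin_unlock(&dlm->spinlock);
 */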
 
 /* returns 1 if found, 0 if not */
 static int dlm_find_mle(struct dlm_ctxt *dlm,
@@ -325,10 +342,17 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
                        char *name, unsigned int namelen)
 {
        struct dlm_master_list_entry *tmpmle;
+       struct hlist_head *bucket;
+       struct hlist_node *list;
+       unsigned int hash;
 
        assert_spin_locked(&dlm->master_lock);
 
-       list_for_each_entry(tmpmle, &dlm->master_list, list) {
+       hash = dlm_lockid_hash(name, namelen);
+       bucket = dlm_master_hash(dlm, hash);
+       hlist_for_each(list, bucket) {
+               tmpmle = hlist_entry(list, struct dlm_master_list_entry,
+                                    master_hash_node);
                if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
                        continue;
                dlm_get_mle(tmpmle);
@@ -408,24 +432,20 @@ static void dlm_mle_release(struct kref *kref)
        mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
        dlm = mle->dlm;
 
-       if (mle->type != DLM_MLE_MASTER) {
-               mlog(0, "calling mle_release for %.*s, type %d\n",
-                    mle->u.name.len, mle->u.name.name, mle->type);
-       } else {
-               mlog(0, "calling mle_release for %.*s, type %d\n",
-                    mle->u.res->lockname.len,
-                    mle->u.res->lockname.name, mle->type);
-       }
        assert_spin_locked(&dlm->spinlock);
        assert_spin_locked(&dlm->master_lock);
 
+       mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
+            mle->type);
+
        /* remove from list if not already */
-       if (!list_empty(&mle->list))
-               list_del_init(&mle->list);
+       __dlm_unlink_mle(dlm, mle);
 
        /* detach the mle from the domain node up/down events */
        __dlm_mle_detach_hb_events(dlm, mle);
 
+       atomic_dec(&dlm->mle_cur_count[mle->type]);
+
        /* NOTE: kfree under spinlock here.
         * if this is bad, we can move this to a freelist. */
        kmem_cache_free(dlm_mle_cache, mle);
@@ -465,43 +485,6 @@ void dlm_destroy_master_caches(void)
                kmem_cache_destroy(dlm_lockres_cache);
 }
 
-static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
-                                 struct dlm_lock_resource *res,
-                                 u8 owner)
-{
-       assert_spin_locked(&res->spinlock);
-
-       mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
-
-       if (owner == dlm->node_num)
-               atomic_inc(&dlm->local_resources);
-       else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-               atomic_inc(&dlm->unknown_resources);
-       else
-               atomic_inc(&dlm->remote_resources);
-
-       res->owner = owner;
-}
-
-void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
-                             struct dlm_lock_resource *res, u8 owner)
-{
-       assert_spin_locked(&res->spinlock);
-
-       if (owner == res->owner)
-               return;
-
-       if (res->owner == dlm->node_num)
-               atomic_dec(&dlm->local_resources);
-       else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
-               atomic_dec(&dlm->unknown_resources);
-       else
-               atomic_dec(&dlm->remote_resources);
-
-       dlm_set_lockres_owner(dlm, res, owner);
-}
-
-
 static void dlm_lockres_release(struct kref *kref)
 {
        struct dlm_lock_resource *res;
@@ -527,6 +510,8 @@ static void dlm_lockres_release(struct kref *kref)
        }
        spin_unlock(&dlm->track_lock);
 
+       atomic_dec(&dlm->res_cur_count);
+
        dlm_put(dlm);
 
        if (!hlist_unhashed(&res->hash_node) ||
@@ -607,6 +592,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 
        kref_init(&res->refs);
 
+       atomic_inc(&dlm->res_tot_count);
+       atomic_inc(&dlm->res_cur_count);
+
        /* just for consistency */
        spin_lock(&res->spinlock);
        dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -843,7 +831,7 @@ lookup:
                alloc_mle = NULL;
                dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
                set_bit(dlm->node_num, mle->maybe_map);
-               list_add(&mle->list, &dlm->master_list);
+               __dlm_insert_mle(dlm, mle);
 
                /* still holding the dlm spinlock, check the recovery map
                 * to see if there are any nodes that still need to be 
@@ -1270,7 +1258,7 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
                                                     res->lockname.len,
                                                     res->lockname.name);
                                                mle->type = DLM_MLE_MASTER;
-                                               mle->u.res = res;
+                                               mle->mleres = res;
                                        }
                                }
                        }
@@ -1315,14 +1303,8 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
 
        BUG_ON(mle->type == DLM_MLE_MIGRATION);
 
-       if (mle->type != DLM_MLE_MASTER) {
-               request.namelen = mle->u.name.len;
-               memcpy(request.name, mle->u.name.name, request.namelen);
-       } else {
-               request.namelen = mle->u.res->lockname.len;
-               memcpy(request.name, mle->u.res->lockname.name,
-                       request.namelen);
-       }
+       request.namelen = (u8)mle->mnamelen;
+       memcpy(request.name, mle->mname, request.namelen);
 
 again:
        ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
@@ -1575,7 +1557,7 @@ way_up_top:
                // "add the block.\n");
                dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
                set_bit(request->node_idx, mle->maybe_map);
-               list_add(&mle->list, &dlm->master_list);
+               __dlm_insert_mle(dlm, mle);
                response = DLM_MASTER_RESP_NO;
        } else {
                // mlog(0, "mle was found\n");
@@ -1967,7 +1949,7 @@ ok:
                             assert->node_idx, rr, extra_ref, mle->inuse);
                        dlm_print_one_mle(mle);
                }
-               list_del_init(&mle->list);
+               __dlm_unlink_mle(dlm, mle);
                __dlm_mle_detach_hb_events(dlm, mle);
                __dlm_put_mle(mle);
                if (extra_ref) {
@@ -3159,10 +3141,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
                        tmp->master = master;
                        atomic_set(&tmp->woken, 1);
                        wake_up(&tmp->wq);
-                       /* remove it from the list so that only one
-                        * mle will be found */
-                       list_del_init(&tmp->list);
-                       /* this was obviously WRONG.  mle is uninited here.  should be tmp. */
+                       /* remove it so that only one mle will be found */
+                       __dlm_unlink_mle(dlm, tmp);
                        __dlm_mle_detach_hb_events(dlm, tmp);
                        ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
                        mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
@@ -3181,137 +3161,164 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
        mle->master = master;
        /* do this for consistency with other mle types */
        set_bit(new_master, mle->maybe_map);
-       list_add(&mle->list, &dlm->master_list);
+       __dlm_insert_mle(dlm, mle);
 
        return ret;
 }
 
-
-void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+/*
+ * Sets the owner of the lockres associated with the mle to UNKNOWN
+ */
+static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
+                                       struct dlm_master_list_entry *mle)
 {
-       struct dlm_master_list_entry *mle, *next;
        struct dlm_lock_resource *res;
-       unsigned int hash;
 
-       mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
-top:
-       assert_spin_locked(&dlm->spinlock);
+       /* Find the lockres associated to the mle and set its owner to UNK */
+       res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
+                                  mle->mnamehash);
+       if (res) {
+               spin_unlock(&dlm->master_lock);
 
-       /* clean the master list */
-       spin_lock(&dlm->master_lock);
-       list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
-               BUG_ON(mle->type != DLM_MLE_BLOCK &&
-                      mle->type != DLM_MLE_MASTER &&
-                      mle->type != DLM_MLE_MIGRATION);
-
-               /* MASTER mles are initiated locally.  the waiting
-                * process will notice the node map change
-                * shortly.  let that happen as normal. */
-               if (mle->type == DLM_MLE_MASTER)
-                       continue;
+               /* move lockres onto recovery list */
+               spin_lock(&res->spinlock);
+               dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
+               dlm_move_lockres_to_recovery_list(dlm, res);
+               spin_unlock(&res->spinlock);
+               dlm_lockres_put(res);
 
+               /* about to get rid of mle, detach from heartbeat */
+               __dlm_mle_detach_hb_events(dlm, mle);
 
-               /* BLOCK mles are initiated by other nodes.
-                * need to clean up if the dead node would have
-                * been the master. */
-               if (mle->type == DLM_MLE_BLOCK) {
-                       int bit;
+               /* dump the mle */
+               spin_lock(&dlm->master_lock);
+               __dlm_put_mle(mle);
+               spin_unlock(&dlm->master_lock);
+       }
 
-                       spin_lock(&mle->spinlock);
-                       bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
-                       if (bit != dead_node) {
-                               mlog(0, "mle found, but dead node %u would "
-                                    "not have been master\n", dead_node);
-                               spin_unlock(&mle->spinlock);
-                       } else {
-                               /* must drop the refcount by one since the
-                                * assert_master will never arrive.  this
-                                * may result in the mle being unlinked and
-                                * freed, but there may still be a process
-                                * waiting in the dlmlock path which is fine. */
-                               mlog(0, "node %u was expected master\n",
-                                    dead_node);
-                               atomic_set(&mle->woken, 1);
-                               spin_unlock(&mle->spinlock);
-                               wake_up(&mle->wq);
-                               /* do not need events any longer, so detach 
-                                * from heartbeat */
-                               __dlm_mle_detach_hb_events(dlm, mle);
-                               __dlm_put_mle(mle);
-                       }
-                       continue;
-               }
+       return res;
+}
 
-               /* everything else is a MIGRATION mle */
-
-               /* the rule for MIGRATION mles is that the master
-                * becomes UNKNOWN if *either* the original or
-                * the new master dies.  all UNKNOWN lockreses
-                * are sent to whichever node becomes the recovery
-                * master.  the new master is responsible for
-                * determining if there is still a master for
-                * this lockres, or if he needs to take over
-                * mastery.  either way, this node should expect
-                * another message to resolve this. */
-               if (mle->master != dead_node &&
-                   mle->new_master != dead_node)
-                       continue;
+static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
+                                   struct dlm_master_list_entry *mle)
+{
+       __dlm_mle_detach_hb_events(dlm, mle);
 
-               /* if we have reached this point, this mle needs to
-                * be removed from the list and freed. */
+       spin_lock(&mle->spinlock);
+       __dlm_unlink_mle(dlm, mle);
+       atomic_set(&mle->woken, 1);
+       spin_unlock(&mle->spinlock);
 
-               /* remove from the list early.  NOTE: unlinking
-                * list_head while in list_for_each_safe */
-               __dlm_mle_detach_hb_events(dlm, mle);
-               spin_lock(&mle->spinlock);
-               list_del_init(&mle->list);
+       wake_up(&mle->wq);
+}
+
+static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
+                               struct dlm_master_list_entry *mle, u8 dead_node)
+{
+       int bit;
+
+       BUG_ON(mle->type != DLM_MLE_BLOCK);
+
+       spin_lock(&mle->spinlock);
+       bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
+       if (bit != dead_node) {
+               mlog(0, "mle found, but dead node %u would not have been "
+                    "master\n", dead_node);
+               spin_unlock(&mle->spinlock);
+       } else {
+               /* Must drop the refcount by one since the assert_master will
+                * never arrive. This may result in the mle being unlinked and
+                * freed, but there may still be a process waiting in the
+                * dlmlock path which is fine. */
+               mlog(0, "node %u was expected master\n", dead_node);
                atomic_set(&mle->woken, 1);
                spin_unlock(&mle->spinlock);
                wake_up(&mle->wq);
 
-               mlog(0, "%s: node %u died during migration from "
-                    "%u to %u!\n", dlm->name, dead_node,
-                    mle->master, mle->new_master);
-               /* if there is a lockres associated with this
-                * mle, find it and set its owner to UNKNOWN */
-               hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
-               res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-                                          mle->u.name.len, hash);
-               if (res) {
-                       /* unfortunately if we hit this rare case, our
-                        * lock ordering is messed.  we need to drop
-                        * the master lock so that we can take the
-                        * lockres lock, meaning that we will have to
-                        * restart from the head of list. */
-                       spin_unlock(&dlm->master_lock);
+               /* Do not need events any longer, so detach from heartbeat */
+               __dlm_mle_detach_hb_events(dlm, mle);
+               __dlm_put_mle(mle);
+       }
+}
 
-                       /* move lockres onto recovery list */
-                       spin_lock(&res->spinlock);
-                       dlm_set_lockres_owner(dlm, res,
-                                       DLM_LOCK_RES_OWNER_UNKNOWN);
-                       dlm_move_lockres_to_recovery_list(dlm, res);
-                       spin_unlock(&res->spinlock);
-                       dlm_lockres_put(res);
+void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
+{
+       struct dlm_master_list_entry *mle;
+       struct dlm_lock_resource *res;
+       struct hlist_head *bucket;
+       struct hlist_node *list;
+       unsigned int i;
 
-                       /* about to get rid of mle, detach from heartbeat */
-                       __dlm_mle_detach_hb_events(dlm, mle);
+       mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+top:
+       assert_spin_locked(&dlm->spinlock);
 
-                       /* dump the mle */
-                       spin_lock(&dlm->master_lock);
-                       __dlm_put_mle(mle);
-                       spin_unlock(&dlm->master_lock);
+       /* clean the master list */
+       spin_lock(&dlm->master_lock);
+       for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+               bucket = dlm_master_hash(dlm, i);
+               hlist_for_each(list, bucket) {
+                       mle = hlist_entry(list, struct dlm_master_list_entry,
+                                         master_hash_node);
+
+                       BUG_ON(mle->type != DLM_MLE_BLOCK &&
+                              mle->type != DLM_MLE_MASTER &&
+                              mle->type != DLM_MLE_MIGRATION);
+
+                       /* MASTER mles are initiated locally. The waiting
+                        * process will notice the node map change shortly.
+                        * Let that happen as normal. */
+                       if (mle->type == DLM_MLE_MASTER)
+                               continue;
+
+                       /* BLOCK mles are initiated by other nodes. Need to
+                        * clean up if the dead node would have been the
+                        * master. */
+                       if (mle->type == DLM_MLE_BLOCK) {
+                               dlm_clean_block_mle(dlm, mle, dead_node);
+                               continue;
+                       }
 
-                       /* restart */
-                       goto top;
-               }
+                       /* Everything else is a MIGRATION mle */
+
+                       /* The rule for MIGRATION mles is that the master
+                        * becomes UNKNOWN if *either* the original or the new
+                        * master dies. All UNKNOWN lockres' are sent to
+                        * whichever node becomes the recovery master. The new
+                        * master is responsible for determining if there is
+                        * still a master for this lockres, or if he needs to
+                        * take over mastery. Either way, this node should
+                        * expect another message to resolve this. */
+
+                       if (mle->master != dead_node &&
+                           mle->new_master != dead_node)
+                               continue;
+
+                       /* If we have reached this point, this mle needs to be
+                        * removed from the list and freed. */
+                       dlm_clean_migration_mle(dlm, mle);
+
+                       mlog(0, "%s: node %u died during migration from "
+                            "%u to %u!\n", dlm->name, dead_node, mle->master,
+                            mle->new_master);
+
+                       /* If we find a lockres associated with the mle, we've
+                        * hit this rare case that messes up our lock ordering.
+                        * If so, we need to drop the master lock so that we can
+                        * take the lockres lock, meaning that we will have to
+                        * restart from the head of list. */
+                       res = dlm_reset_mleres_owner(dlm, mle);
+                       if (res)
+                               /* restart */
+                               goto top;
 
-               /* this may be the last reference */
-               __dlm_put_mle(mle);
+                       /* This may be the last reference */
+                       __dlm_put_mle(mle);
+               }
        }
        spin_unlock(&dlm->master_lock);
 }
 
-
 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
                         u8 old_master)
 {
index 4060bb328bc8a08c22bbd77c59835d757ebdcda5..d490b66ad9d75e1ca24667cd6a7c8626d30200e0 100644 (file)
@@ -162,12 +162,28 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
        spin_lock(&res->spinlock);
        if (!__dlm_lockres_unused(res)) {
-               spin_unlock(&res->spinlock);
                mlog(0, "%s:%.*s: tried to purge but not unused\n",
                     dlm->name, res->lockname.len, res->lockname.name);
-               return -ENOTEMPTY;
+               __dlm_print_one_lock_resource(res);
+               spin_unlock(&res->spinlock);
+               BUG();
        }
+
+       if (res->state & DLM_LOCK_RES_MIGRATING) {
+               mlog(0, "%s:%.*s: Delay dropref as this lockres is "
+                    "being remastered\n", dlm->name, res->lockname.len,
+                    res->lockname.name);
+               /* Re-add the lockres to the end of the purge list */
+               if (!list_empty(&res->purge)) {
+                       list_del_init(&res->purge);
+                       list_add_tail(&res->purge, &dlm->purge_list);
+               }
+               spin_unlock(&res->spinlock);
+               return 0;
+       }
+
        master = (res->owner == dlm->node_num);
+
        if (!master)
                res->state |= DLM_LOCK_RES_DROPPING_REF;
        spin_unlock(&res->spinlock);
index 7219a86d34ccc3102b360e10c298ed50fda903a7..e15fc7d50827019fe974b9bb61b8f5dd7312665a 100644 (file)
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
        .flags          = 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+       .flags          = 0,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
        .get_osb        = ocfs2_get_dentry_osb,
        .post_unlock    = ocfs2_dentry_post_unlock,
@@ -622,6 +626,17 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
                                   &ocfs2_rename_lops, osb);
 }
 
+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+                                        struct ocfs2_super *osb)
+{
+       /* nfs_sync lockres doesn't come from a slab so we call init
+        * once on it manually.  */
+       ocfs2_lock_res_init_once(res);
+       ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+       ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+                                  &ocfs2_nfs_sync_lops, osb);
+}
+
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
                              struct ocfs2_file_private *fp)
 {
@@ -2417,6 +2432,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
                ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
 }
 
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+       int status;
+       struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+       if (ocfs2_is_hard_readonly(osb))
+               return -EROFS;
+
+       if (ocfs2_mount_local(osb))
+               return 0;
+
+       status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
+                                   0, 0);
+       if (status < 0)
+               mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+
+       return status;
+}
+
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+       struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+       if (!ocfs2_mount_local(osb))
+               ocfs2_cluster_unlock(osb, lockres,
+                                    ex ? LKM_EXMODE : LKM_PRMODE);
+}
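/*
 * Editorial sketch (not part of this patch): the new lock is taken EX by
 * the NFS export lookup path and PR by inode deletion, which is what lets
 * all nodes keep processing deletes concurrently (see the export.c and
 * inode.c hunks later in this diff):
 *
 *	status = ocfs2_nfs_sync_lock(osb, 1);	(ex = 1)
 *	if (status < 0)
 *		return status;
 *	...
 *	ocfs2_nfs_sync_unlock(osb, 1);
 */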
+
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
        int ret;
@@ -2798,6 +2841,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 local:
        ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
        ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+       ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
 
        osb->cconn = conn;
 
@@ -2833,6 +2877,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 
        ocfs2_lock_res_free(&osb->osb_super_lockres);
        ocfs2_lock_res_free(&osb->osb_rename_lockres);
+       ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
 
        ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
        osb->cconn = NULL;
@@ -3015,6 +3060,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
 {
        ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
        ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+       ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
 }
 
 int ocfs2_drop_inode_locks(struct inode *inode)
index 3f8d9986b8e0e82d9f8b938ef9d648669b14fff0..e1fd5721cd7f15da6e844579e1964d691af597fe 100644 (file)
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_super *osb,
                        int ex);
 int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
index 2f27b332d8b316949b131eb8172c1e1bdf4d1362..de3da8eb558ced894690263794c4c1f28652f351 100644 (file)
@@ -31,6 +31,7 @@
 
 #include "ocfs2.h"
 
+#include "alloc.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "dcache.h"
@@ -38,6 +39,7 @@
 #include "inode.h"
 
 #include "buffer_head_io.h"
+#include "suballoc.h"
 
 struct ocfs2_inode_handle
 {
@@ -49,29 +51,97 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
                struct ocfs2_inode_handle *handle)
 {
        struct inode *inode;
+       struct ocfs2_super *osb = OCFS2_SB(sb);
+       u64 blkno = handle->ih_blkno;
+       int status, set;
        struct dentry *result;
 
        mlog_entry("(0x%p, 0x%p)\n", sb, handle);
 
-       if (handle->ih_blkno == 0) {
-               mlog_errno(-ESTALE);
-               return ERR_PTR(-ESTALE);
+       if (blkno == 0) {
+               mlog(0, "nfs wants inode with blkno: 0\n");
+               result = ERR_PTR(-ESTALE);
+               goto bail;
+       }
+
+       inode = ocfs2_ilookup(sb, blkno);
+       /*
+        * If the inode exists in memory, we only need to check its
+        * generation number
+        */
+       if (inode)
+               goto check_gen;
+
+       /*
+        * This will synchronize us against ocfs2_delete_inode() on
+        * all nodes
+        */
+       status = ocfs2_nfs_sync_lock(osb, 1);
+       if (status < 0) {
+               mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+               goto check_err;
+       }
+
+       status = ocfs2_test_inode_bit(osb, blkno, &set);
+       if (status < 0) {
+               if (status == -EINVAL) {
+                       /*
+                        * The blkno NFS gave us doesn't even show up
+                        * as an inode, so we return -ESTALE to be
+                        * nice.
+                        */
+                       mlog(0, "test inode bit failed %d\n", status);
+                       status = -ESTALE;
+               } else {
+                       mlog(ML_ERROR, "test inode bit failed %d\n", status);
+               }
+               goto unlock_nfs_sync;
+       }
+
+       /* If the inode allocator bit is clear, this inode must be stale */
+       if (!set) {
+               mlog(0, "inode %llu suballoc bit is clear\n", blkno);
+               status = -ESTALE;
+               goto unlock_nfs_sync;
        }
 
-       inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
+       inode = ocfs2_iget(osb, blkno, 0, 0);
 
-       if (IS_ERR(inode))
-               return (void *)inode;
+unlock_nfs_sync:
+       ocfs2_nfs_sync_unlock(osb, 1);
 
+check_err:
+       if (status < 0) {
+               if (status == -ESTALE) {
+                       mlog(0, "stale inode ino: %llu generation: %u\n",
+                            blkno, handle->ih_generation);
+               }
+               result = ERR_PTR(status);
+               goto bail;
+       }
+
+       if (IS_ERR(inode)) {
+               mlog_errno(PTR_ERR(inode));
+               result = (void *)inode;
+               goto bail;
+       }
+
+check_gen:
        if (handle->ih_generation != inode->i_generation) {
                iput(inode);
-               return ERR_PTR(-ESTALE);
+               mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
+                    handle->ih_generation);
+               result = ERR_PTR(-ESTALE);
+               goto bail;
        }
 
        result = d_obtain_alias(inode);
        if (!IS_ERR(result))
                result->d_op = &ocfs2_dentry_ops;
+       else
+               mlog_errno(PTR_ERR(result));
 
+bail:
        mlog_exit_ptr(result);
        return result;
 }
index 229e707bc050629c0ad372b14c75194bf4b97f3d..10e1fa87396aedf9ea0a1cfbd501c25361a872e7 100644 (file)
@@ -38,6 +38,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dir.h"
 #include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -112,6 +113,17 @@ void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi)
                oi->ip_attr |= OCFS2_DIRSYNC_FL;
 }
 
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
+{
+       struct ocfs2_find_inode_args args;
+
+       args.fi_blkno = blkno;
+       args.fi_flags = 0;
+       args.fi_ino = ino_from_blkno(sb, blkno);
+       args.fi_sysfile_type = 0;
+
+       return ilookup5(sb, blkno, ocfs2_find_actor, &args);
+}
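/*
 * Editorial note (not part of this patch): ilookup5() only searches the
 * in-memory inode cache via ocfs2_find_actor() and never reads the inode
 * from disk, which is why the export.c hunk above can skip the cluster
 * locking and jump straight to its generation check when ocfs2_ilookup()
 * returns an inode.
 */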
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
                         int sysfile_type)
 {
@@ -275,7 +287,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                     (unsigned long long)OCFS2_I(inode)->ip_blkno,
                     (unsigned long long)le64_to_cpu(fe->i_blkno));
 
-       inode->i_nlink = le16_to_cpu(fe->i_links_count);
+       inode->i_nlink = ocfs2_read_links_count(fe);
 
        if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
                OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
@@ -351,6 +363,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
        ocfs2_set_inode_flags(inode);
 
+       OCFS2_I(inode)->ip_last_used_slot = 0;
+       OCFS2_I(inode)->ip_last_used_group = 0;
        mlog_exit_void();
 }
 
@@ -606,7 +620,7 @@ static int ocfs2_remove_inode(struct inode *inode,
        }
 
        handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
-                                       ocfs2_quota_trans_credits(inode->i_sb));
+                                  ocfs2_quota_trans_credits(inode->i_sb));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
@@ -740,6 +754,15 @@ static int ocfs2_wipe_inode(struct inode *inode,
                goto bail_unlock_dir;
        }
 
+       /* Remove any dir index tree */
+       if (S_ISDIR(inode->i_mode)) {
+               status = ocfs2_dx_dir_truncate(inode, di_bh);
+               if (status) {
+                       mlog_errno(status);
+                       goto bail_unlock_dir;
+               }
+       }
+
        /* Free extended attribute resources associated with this inode. */
        status = ocfs2_xattr_remove(inode, di_bh);
        if (status < 0) {
@@ -949,6 +972,17 @@ void ocfs2_delete_inode(struct inode *inode)
                goto bail;
        }
 
+       /*
+        * Synchronize us against ocfs2_get_dentry. We take this in
+        * shared mode so that all nodes can still concurrently
+        * process deletes.
+        */
+       status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
+       if (status < 0) {
+               mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+               ocfs2_cleanup_delete_inode(inode, 0);
+               goto bail_unblock;
+       }
        /* Lock down the inode. This gives us an up to date view of
         * its metadata (for verification), and allows us to
         * serialize delete_inode on multiple nodes.
@@ -962,7 +996,7 @@ void ocfs2_delete_inode(struct inode *inode)
                if (status != -ENOENT)
                        mlog_errno(status);
                ocfs2_cleanup_delete_inode(inode, 0);
-               goto bail_unblock;
+               goto bail_unlock_nfs_sync;
        }
 
        /* Query the cluster. This will be the final decision made
@@ -1005,6 +1039,10 @@ void ocfs2_delete_inode(struct inode *inode)
 bail_unlock_inode:
        ocfs2_inode_unlock(inode, 1);
        brelse(di_bh);
+
+bail_unlock_nfs_sync:
+       ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
+
 bail_unblock:
        status = sigprocmask(SIG_SETMASK, &oldset, NULL);
        if (status < 0)
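The new NFS sync locking in ocfs2_delete_inode() above takes the lock in shared (PR) mode, while the export path releases it exclusively (the '1' argument to ocfs2_nfs_sync_unlock() earlier in this patch), so handle decoding cannot race a delete's final decision while deletes on different nodes still run in parallel. A rough single-node model of that shared/exclusive split, using a pthread rwlock as a stand-in for the cluster lock:

    #include <pthread.h>
    #include <stdio.h>

    /* Rough single-node stand-in for the cluster-wide NFS sync lock:
     * PR (protected read, shared) maps to rdlock, EX maps to wrlock. */
    static pthread_rwlock_t nfs_sync = PTHREAD_RWLOCK_INITIALIZER;

    static void *deleter(void *arg)
    {
        /* delete_inode takes the lock shared, so any number of "nodes"
         * may run deletes at the same time. */
        pthread_rwlock_rdlock(&nfs_sync);
        printf("delete on node %ld runs under shared lock\n", (long)arg);
        pthread_rwlock_unlock(&nfs_sync);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[2];
        for (long i = 0; i < 2; i++)
            pthread_create(&t[i], NULL, deleter, (void *)i);

        /* the handle-decode side uses the exclusive mode, so it waits for
         * in-flight deletes before deciding whether the inode is stale. */
        pthread_rwlock_wrlock(&nfs_sync);
        printf("NFS handle decode holds the lock exclusively\n");
        pthread_rwlock_unlock(&nfs_sync);

        for (long i = 0; i < 2; i++)
            pthread_join(t[i], NULL);
        return 0;
    }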
@@ -1205,7 +1243,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
        spin_unlock(&OCFS2_I(inode)->ip_lock);
 
        fe->i_size = cpu_to_le64(i_size_read(inode));
-       fe->i_links_count = cpu_to_le16(inode->i_nlink);
+       ocfs2_set_links_count(fe, inode->i_nlink);
        fe->i_uid = cpu_to_le32(inode->i_uid);
        fe->i_gid = cpu_to_le32(inode->i_gid);
        fe->i_mode = cpu_to_le16(inode->i_mode);
@@ -1242,7 +1280,7 @@ void ocfs2_refresh_inode(struct inode *inode,
        OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
        ocfs2_set_inode_flags(inode);
        i_size_write(inode, le64_to_cpu(fe->i_size));
-       inode->i_nlink = le16_to_cpu(fe->i_links_count);
+       inode->i_nlink = ocfs2_read_links_count(fe);
        inode->i_uid = le32_to_cpu(fe->i_uid);
        inode->i_gid = le32_to_cpu(fe->i_gid);
        inode->i_mode = le16_to_cpu(fe->i_mode);
index eb3c302b38d34c15d96057ff4484e22ba9574999..ea71525aad41fdc4070cd52890d6385359dec7f5 100644 (file)
@@ -72,6 +72,10 @@ struct ocfs2_inode_info
 
        struct inode                    vfs_inode;
        struct jbd2_inode               ip_jinode;
+
+       /* Only valid if the inode is a directory. */
+       u32                             ip_last_used_slot;
+       u64                             ip_last_used_group;
 };
 
 /*
@@ -124,6 +128,7 @@ void ocfs2_drop_inode(struct inode *inode);
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_SYSFILE          0x1
 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY  0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
                         int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
index 57d7d25a2b9a3b9e0a68c7bf8f1348832e69cb25..a20a0f1e37fd18e898ade8803d19ff7489df7e85 100644 (file)
@@ -65,6 +65,11 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
                                 int slot);
 static int ocfs2_commit_thread(void *arg);
+static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
+                                           int slot_num,
+                                           struct ocfs2_dinode *la_dinode,
+                                           struct ocfs2_dinode *tl_dinode,
+                                           struct ocfs2_quota_recovery *qrec);
 
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -76,18 +81,97 @@ static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
        return __ocfs2_wait_on_mount(osb, 1);
 }
 
-
-
 /*
- * The recovery_list is a simple linked list of node numbers to recover.
- * It is protected by the recovery_lock.
+ * The replay_map tracks online/offline slots so that offline slots can be
+ * recovered during recovery and at mount time.
  */
 
-struct ocfs2_recovery_map {
-       unsigned int rm_used;
-       unsigned int *rm_entries;
+enum ocfs2_replay_state {
+       REPLAY_UNNEEDED = 0,    /* Replay is not needed, so ignore this map */
+       REPLAY_NEEDED,          /* Replay slots marked in rm_replay_slots */
+       REPLAY_DONE             /* Replay was already queued */
 };
 
+struct ocfs2_replay_map {
+       unsigned int rm_slots;
+       enum ocfs2_replay_state rm_state;
+       unsigned char rm_replay_slots[0];
+};
+
+void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+{
+       if (!osb->replay_map)
+               return;
+
+       /* If we've already queued the replay, we don't have any more to do */
+       if (osb->replay_map->rm_state == REPLAY_DONE)
+               return;
+
+       osb->replay_map->rm_state = state;
+}
+
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
+{
+       struct ocfs2_replay_map *replay_map;
+       int i, node_num;
+
+       /* If the replay map is already set up, don't build it again */
+       if (osb->replay_map)
+               return 0;
+
+       replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
+                            (osb->max_slots * sizeof(char)), GFP_KERNEL);
+
+       if (!replay_map) {
+               mlog_errno(-ENOMEM);
+               return -ENOMEM;
+       }
+
+       spin_lock(&osb->osb_lock);
+
+       replay_map->rm_slots = osb->max_slots;
+       replay_map->rm_state = REPLAY_UNNEEDED;
+
+       /* set rm_replay_slots for offline slot(s) */
+       for (i = 0; i < replay_map->rm_slots; i++) {
+               if (ocfs2_slot_to_node_num_locked(osb, i, &node_num) == -ENOENT)
+                       replay_map->rm_replay_slots[i] = 1;
+       }
+
+       osb->replay_map = replay_map;
+       spin_unlock(&osb->osb_lock);
+       return 0;
+}
+
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+{
+       struct ocfs2_replay_map *replay_map = osb->replay_map;
+       int i;
+
+       if (!replay_map)
+               return;
+
+       if (replay_map->rm_state != REPLAY_NEEDED)
+               return;
+
+       for (i = 0; i < replay_map->rm_slots; i++)
+               if (replay_map->rm_replay_slots[i])
+                       ocfs2_queue_recovery_completion(osb->journal, i, NULL,
+                                                       NULL, NULL);
+       replay_map->rm_state = REPLAY_DONE;
+}
+
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+{
+       struct ocfs2_replay_map *replay_map = osb->replay_map;
+
+       if (!osb->replay_map)
+               return;
+
+       kfree(replay_map);
+       osb->replay_map = NULL;
+}
+
 int ocfs2_recovery_init(struct ocfs2_super *osb)
 {
        struct ocfs2_recovery_map *rm;
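The replay map introduced above is a single allocation: a small header followed by one byte per slot (a flexible array member), plus a three-state machine so the completion work is queued at most once. A self-contained userspace sketch of the same allocation and state handling (toy names, same shape as the kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    enum replay_state { REPLAY_UNNEEDED = 0, REPLAY_NEEDED, REPLAY_DONE };

    /* Same shape as ocfs2_replay_map: a header plus one byte per slot,
     * allocated as one block via a flexible array member. */
    struct replay_map {
        unsigned int slots;
        enum replay_state state;
        unsigned char replay_slots[];
    };

    static struct replay_map *compute_replay_slots(unsigned int max_slots,
                                                   const int *slot_online)
    {
        struct replay_map *map = calloc(1, sizeof(*map) + max_slots);
        if (!map)
            return NULL;
        map->slots = max_slots;
        map->state = REPLAY_UNNEEDED;
        for (unsigned int i = 0; i < max_slots; i++)
            if (!slot_online[i])          /* offline slot: needs replay */
                map->replay_slots[i] = 1;
        return map;
    }

    static void queue_replay_slots(struct replay_map *map)
    {
        if (!map || map->state != REPLAY_NEEDED)
            return;                       /* nothing marked, or already queued */
        for (unsigned int i = 0; i < map->slots; i++)
            if (map->replay_slots[i])
                printf("queue recovery completion for slot %u\n", i);
        map->state = REPLAY_DONE;         /* queue the work exactly once */
    }

    int main(void)
    {
        int online[4] = { 1, 0, 1, 0 };   /* slots 1 and 3 are offline */
        struct replay_map *map = compute_replay_slots(4, online);

        if (!map)
            return 1;
        map->state = REPLAY_NEEDED;       /* journal replay found work to do */
        queue_replay_slots(map);
        queue_replay_slots(map);          /* second call is a no-op */
        free(map);
        return 0;
    }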
@@ -496,6 +580,22 @@ static struct ocfs2_triggers dq_triggers = {
        },
 };
 
+static struct ocfs2_triggers dr_triggers = {
+       .ot_triggers = {
+               .t_commit = ocfs2_commit_trigger,
+               .t_abort = ocfs2_abort_trigger,
+       },
+       .ot_offset      = offsetof(struct ocfs2_dx_root_block, dr_check),
+};
+
+static struct ocfs2_triggers dl_triggers = {
+       .ot_triggers = {
+               .t_commit = ocfs2_commit_trigger,
+               .t_abort = ocfs2_abort_trigger,
+       },
+       .ot_offset      = offsetof(struct ocfs2_dx_leaf, dl_check),
+};
+
 static int __ocfs2_journal_access(handle_t *handle,
                                  struct inode *inode,
                                  struct buffer_head *bh,
@@ -600,6 +700,20 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
                                      type);
 }
 
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+                           struct buffer_head *bh, int type)
+{
+       return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
+                                     type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+                           struct buffer_head *bh, int type)
+{
+       return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
+                                     type);
+}
+
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh, int type)
 {
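The dr/dl trigger tables and journal-access wrappers added above follow the existing metaecc pattern: each ocfs2_triggers entry records, via offsetof(), where the block-check structure sits inside the new dx_root and dx_leaf blocks so the commit trigger can recompute the checksum there before the block hits disk. A minimal offsetof() illustration with a made-up block layout (not the on-disk format):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-in for an on-disk block: a signature, some payload, and a
     * check structure whose position the trigger needs to know. */
    struct toy_check { uint32_t crc32; uint32_t ecc; };
    struct toy_block {
        char       signature[8];
        uint64_t   payload[4];
        struct toy_check check;     /* the trigger patches this field */
    };

    /* Like the ot_offset member above: remember where the check lives. */
    static const size_t toy_check_offset = offsetof(struct toy_block, check);

    static void commit_trigger(void *block)
    {
        /* Recompute the (fake) checksum at the recorded offset just before
         * the block is written, as the real triggers do for real blocks. */
        struct toy_check *chk = (struct toy_check *)((char *)block +
                                                     toy_check_offset);
        chk->crc32 = 0xdeadbeef;
    }

    int main(void)
    {
        struct toy_block blk = { .signature = "TOYBLK" };

        commit_trigger(&blk);
        printf("check at offset %zu, crc 0x%x\n",
               toy_check_offset, blk.check.crc32);
        return 0;
    }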
@@ -1176,24 +1290,24 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 }
 
 /* Called by the mount code to queue up the last part of
- * recovery for it's own slot. */
+ * recovery for its own and offline slot(s). */
 void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 {
        struct ocfs2_journal *journal = osb->journal;
 
-       if (osb->dirty) {
-               /* No need to queue up our truncate_log as regular
-                * cleanup will catch that. */
-               ocfs2_queue_recovery_completion(journal,
-                                               osb->slot_num,
-                                               osb->local_alloc_copy,
-                                               NULL,
-                                               NULL);
-               ocfs2_schedule_truncate_log_flush(osb, 0);
+       /* No need to queue up our truncate_log as regular cleanup will catch
+        * that */
+       ocfs2_queue_recovery_completion(journal, osb->slot_num,
+                                       osb->local_alloc_copy, NULL, NULL);
+       ocfs2_schedule_truncate_log_flush(osb, 0);
 
-               osb->local_alloc_copy = NULL;
-               osb->dirty = 0;
-       }
+       osb->local_alloc_copy = NULL;
+       osb->dirty = 0;
+
+       /* queue orphan recovery for all offline slots */
+       ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+       ocfs2_queue_replay_slots(osb);
+       ocfs2_free_replay_slots(osb);
 }
 
 void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
@@ -1236,6 +1350,14 @@ restart:
                goto bail;
        }
 
+       status = ocfs2_compute_replay_slots(osb);
+       if (status < 0)
+               mlog_errno(status);
+
+       /* queue recovery for our own slot */
+       ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
+                                       NULL, NULL);
+
        spin_lock(&osb->osb_lock);
        while (rm->rm_used) {
                /* It's always safe to remove entry zero, as we won't
@@ -1301,11 +1423,8 @@ skip_recovery:
 
        ocfs2_super_unlock(osb, 1);
 
-       /* We always run recovery on our own orphan dir - the dead
-        * node(s) may have disallowd a previos inode delete. Re-processing
-        * is therefore required. */
-       ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-                                       NULL, NULL);
+       /* queue recovery for offline slots */
+       ocfs2_queue_replay_slots(osb);
 
 bail:
        mutex_lock(&osb->recovery_lock);
@@ -1314,6 +1433,7 @@ bail:
                goto restart;
        }
 
+       ocfs2_free_replay_slots(osb);
        osb->recovery_thread_task = NULL;
        mb(); /* sync with ocfs2_recovery_thread_running */
        wake_up(&osb->recovery_event);
@@ -1465,6 +1585,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
                goto done;
        }
 
+       /* we need to run complete recovery for offline orphan slots */
+       ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
+
        mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
             node_num, slot_num,
             MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
index 172850a9a12a7d78cc115e80500979673f4ae7f5..619dd7f6c053c1562863707bebc7c3c6e1753b52 100644 (file)
@@ -38,6 +38,17 @@ enum ocfs2_journal_state {
 struct ocfs2_super;
 struct ocfs2_dinode;
 
+/*
+ * The recovery_list is a simple linked list of node numbers to recover.
+ * It is protected by the recovery_lock.
+ */
+
+struct ocfs2_recovery_map {
+       unsigned int rm_used;
+       unsigned int *rm_entries;
+};
+
+
 struct ocfs2_journal {
        enum ocfs2_journal_state   j_state;    /* Journals current state   */
 
@@ -139,6 +150,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
 int ocfs2_recovery_init(struct ocfs2_super *osb);
 void ocfs2_recovery_exit(struct ocfs2_super *osb);
 
+int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
 /*
  *  Journal Control:
  *  Initialize, Load, Shutdown, Wipe a journal.
@@ -266,6 +278,12 @@ int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
 /* dirblock */
 int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
                            struct buffer_head *bh, int type);
+/* ocfs2_dx_root_block */
+int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+                           struct buffer_head *bh, int type);
+/* ocfs2_dx_leaf */
+int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+                           struct buffer_head *bh, int type);
 /* Anything that has no ecc */
 int ocfs2_journal_access(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh, int type);
@@ -368,14 +386,29 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
 }
 
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+ * bitmap block for the new bit) + dx_root update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+
+static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
+{
+       /* 1 block for index, 2 allocs (data, metadata), 1 cluster's
+        * worth of blocks for the initial extent. */
+       return 1 + 2 * OCFS2_SUBALLOC_ALLOC +
+               ocfs2_clusters_to_blocks(sb, 1);
+}
 
-/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks + quota update */
-static inline int ocfs2_mknod_credits(struct super_block *sb)
+/* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode
+ * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr
+ * blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
+                                     int xattr_credits)
 {
-       return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+       int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS;
+
+       if (is_dir)
+               dir_credits += ocfs2_add_dir_index_credits(sb);
+
+       return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits +
               ocfs2_quota_trans_credits(sb);
 }
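ocfs2_mknod_credits() now scales with what the create actually needs: directories pay for the index tree via ocfs2_add_dir_index_credits(), and xattr credits are passed in rather than assumed. A rough worked example of the arithmetic, with placeholder values standing in for the allocator, cluster and quota terms (the real numbers depend on the superblock and are not shown in this diff):

    #include <stdio.h>

    /* Placeholder values purely for the arithmetic below; the real numbers
     * come from OCFS2_SUBALLOC_ALLOC, ocfs2_clusters_to_blocks() and
     * ocfs2_quota_trans_credits() and vary with the filesystem. */
    #define TOY_SUBALLOC_ALLOC      3
    #define TOY_CLUSTER_BLOCKS      8
    #define TOY_QUOTA_CREDITS       2
    #define TOY_DIR_LINK_CREDITS    (1 + 2 + 1)   /* matches the new macro */

    static int toy_add_dir_index_credits(void)
    {
        /* 1 index block + 2 suballocator allocations + one cluster of blocks */
        return 1 + 2 * TOY_SUBALLOC_ALLOC + TOY_CLUSTER_BLOCKS;
    }

    static int toy_mknod_credits(int is_dir, int xattr_credits)
    {
        int dir_credits = TOY_DIR_LINK_CREDITS;

        if (is_dir)
            dir_credits += toy_add_dir_index_credits();
        return 4 + TOY_SUBALLOC_ALLOC + dir_credits + xattr_credits +
               TOY_QUOTA_CREDITS;
    }

    int main(void)
    {
        printf("plain file, no xattrs: %d credits\n", toy_mknod_credits(0, 0));
        printf("directory,  no xattrs: %d credits\n", toy_mknod_credits(1, 0));
        return 0;
    }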
 
@@ -388,31 +421,31 @@ static inline int ocfs2_mknod_credits(struct super_block *sb)
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
 /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir */
+ * update on dir + index leaf + dx root update for free list */
 static inline int ocfs2_link_credits(struct super_block *sb)
 {
-       return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+       return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
               ocfs2_quota_trans_credits(sb);
 }
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
- * dir inode link */
+ * dir inode link + dir inode index leaf + dir index root */
 static inline int ocfs2_unlink_credits(struct super_block *sb)
 {
        /* The quota update from ocfs2_link_credits is unused here... */
-       return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+       return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb);
 }
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
- * inode alloc group descriptor */
-#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+ * inode alloc group descriptor + orphan dir index leaf */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3)
 
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
- * directory + target unlink */
+ * directory + target unlink + 3 x dir index leaves */
 static inline int ocfs2_rename_credits(struct super_block *sb)
 {
-       return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+       return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb);
 }
 
 /* global bitmap dinode, group desc., relinked group,
@@ -422,6 +455,20 @@ static inline int ocfs2_rename_credits(struct super_block *sb)
                                          + OCFS2_INODE_UPDATE_CREDITS \
                                          + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
 
+/* inode update, removal of dx root block from allocator */
+#define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS +     \
+                                     OCFS2_SUBALLOC_FREE)
+
+static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
+{
+       int credits = 1 + OCFS2_SUBALLOC_ALLOC;
+
+       credits += ocfs2_clusters_to_blocks(sb, 1);
+       credits += ocfs2_quota_trans_credits(sb);
+
+       return credits;
+}
+
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
@@ -457,7 +504,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-       int blocks = ocfs2_mknod_credits(sb);
+       int blocks = ocfs2_mknod_credits(sb, 0, 0);
 
        /* links can be longer than one block so we may update many
         * within our single allocated extent. */
index ec70cdbe77fcb6a360efa338ce7f832f5937965b..bac7e6abaf4763f75f873592beec3c0d4cb63472 100644 (file)
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
-#include <linux/debugfs.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -75,84 +74,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
                                          struct inode *local_alloc_inode);
 
-#ifdef CONFIG_OCFS2_FS_STATS
-
-static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
-{
-       file->private_data = inode->i_private;
-       return 0;
-}
-
-#define LA_DEBUG_BUF_SZ        PAGE_CACHE_SIZE
-#define LA_DEBUG_VER   1
-static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
-                                  size_t count, loff_t *ppos)
-{
-       static DEFINE_MUTEX(la_debug_mutex);
-       struct ocfs2_super *osb = file->private_data;
-       int written, ret;
-       char *buf = osb->local_alloc_debug_buf;
-
-       mutex_lock(&la_debug_mutex);
-       memset(buf, 0, LA_DEBUG_BUF_SZ);
-
-       written = snprintf(buf, LA_DEBUG_BUF_SZ,
-                          "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
-                          LA_DEBUG_VER,
-                          (unsigned long long)osb->la_last_gd,
-                          osb->local_alloc_default_bits,
-                          osb->local_alloc_bits, osb->local_alloc_state);
-
-       ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
-
-       mutex_unlock(&la_debug_mutex);
-       return ret;
-}
-
-static const struct file_operations ocfs2_la_debug_fops = {
-       .open =         ocfs2_la_debug_open,
-       .read =         ocfs2_la_debug_read,
-};
-
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-       osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
-       if (!osb->local_alloc_debug_buf)
-               return;
-
-       osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
-                                                    S_IFREG|S_IRUSR,
-                                                    osb->osb_debug_root,
-                                                    osb,
-                                                    &ocfs2_la_debug_fops);
-       if (!osb->local_alloc_debug) {
-               kfree(osb->local_alloc_debug_buf);
-               osb->local_alloc_debug_buf = NULL;
-       }
-}
-
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-       if (osb->local_alloc_debug)
-               debugfs_remove(osb->local_alloc_debug);
-
-       if (osb->local_alloc_debug_buf)
-               kfree(osb->local_alloc_debug_buf);
-
-       osb->local_alloc_debug_buf = NULL;
-       osb->local_alloc_debug = NULL;
-}
-#else  /* CONFIG_OCFS2_FS_STATS */
-static void ocfs2_init_la_debug(struct ocfs2_super *osb)
-{
-       return;
-}
-static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
-{
-       return;
-}
-#endif
-
 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
        return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
@@ -226,8 +147,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
        mlog_entry_void();
 
-       ocfs2_init_la_debug(osb);
-
        if (osb->local_alloc_bits == 0)
                goto bail;
 
@@ -299,9 +218,6 @@ bail:
        if (inode)
                iput(inode);
 
-       if (status < 0)
-               ocfs2_shutdown_la_debug(osb);
-
        mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
        mlog_exit(status);
@@ -331,8 +247,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
        cancel_delayed_work(&osb->la_enable_wq);
        flush_workqueue(ocfs2_wq);
 
-       ocfs2_shutdown_la_debug(osb);
-
        if (osb->local_alloc_state == OCFS2_LA_UNUSED)
                goto out;
 
index 4b11762f249e91aba2d4ece846bbc86ac8058005..2220f93f668bc9d0b0cc5446efdb612ccfd1d4bb 100644 (file)
@@ -80,14 +80,14 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
                                    struct inode **ret_orphan_dir,
                                    struct inode *inode,
                                    char *name,
-                                   struct buffer_head **de_bh);
+                                   struct ocfs2_dir_lookup_result *lookup);
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
                            handle_t *handle,
                            struct inode *inode,
                            struct ocfs2_dinode *fe,
                            char *name,
-                           struct buffer_head *de_bh,
+                           struct ocfs2_dir_lookup_result *lookup,
                            struct inode *orphan_dir_inode);
 
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
@@ -228,17 +228,18 @@ static int ocfs2_mknod(struct inode *dir,
        struct ocfs2_super *osb;
        struct ocfs2_dinode *dirfe;
        struct buffer_head *new_fe_bh = NULL;
-       struct buffer_head *de_bh = NULL;
        struct inode *inode = NULL;
        struct ocfs2_alloc_context *inode_ac = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
-       struct ocfs2_alloc_context *xattr_ac = NULL;
+       struct ocfs2_alloc_context *meta_ac = NULL;
        int want_clusters = 0;
+       int want_meta = 0;
        int xattr_credits = 0;
        struct ocfs2_security_xattr_info si = {
                .enable = 1,
        };
        int did_quota_inode = 0;
+       struct ocfs2_dir_lookup_result lookup = { NULL, };
 
        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
                   (unsigned long)dev, dentry->d_name.len,
@@ -254,13 +255,13 @@ static int ocfs2_mknod(struct inode *dir,
                return status;
        }
 
-       if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+       if (S_ISDIR(mode) && (dir->i_nlink >= ocfs2_link_max(osb))) {
                status = -EMLINK;
                goto leave;
        }
 
        dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-       if (!dirfe->i_links_count) {
+       if (!ocfs2_read_links_count(dirfe)) {
                /* can't make a file in a deleted directory. */
                status = -ENOENT;
                goto leave;
@@ -274,7 +275,7 @@ static int ocfs2_mknod(struct inode *dir,
        /* get a spot inside the dir. */
        status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
                                              dentry->d_name.name,
-                                             dentry->d_name.len, &de_bh);
+                                             dentry->d_name.len, &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -308,17 +309,29 @@ static int ocfs2_mknod(struct inode *dir,
 
        /* calculate meta data/clusters for setting security and acl xattr */
        status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
-                                       &si, &want_clusters,
-                                       &xattr_credits, &xattr_ac);
+                                      &si, &want_clusters,
+                                      &xattr_credits, &want_meta);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
        }
 
        /* Reserve a cluster if creating an extent based directory. */
-       if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+       if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
                want_clusters += 1;
 
+               /* Dir indexing requires extra space as well */
+               if (ocfs2_supports_indexed_dirs(osb))
+                       want_meta++;
+       }
+
+       status = ocfs2_reserve_new_metadata_blocks(osb, want_meta, &meta_ac);
+       if (status < 0) {
+               if (status != -ENOSPC)
+                       mlog_errno(status);
+               goto leave;
+       }
+
        status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
        if (status < 0) {
                if (status != -ENOSPC)
@@ -326,8 +339,9 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
 
-       handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
-                                  xattr_credits);
+       handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+                                                           S_ISDIR(mode),
+                                                           xattr_credits));
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                handle = NULL;
@@ -355,7 +369,7 @@ static int ocfs2_mknod(struct inode *dir,
 
        if (S_ISDIR(mode)) {
                status = ocfs2_fill_new_dir(osb, handle, dir, inode,
-                                           new_fe_bh, data_ac);
+                                           new_fe_bh, data_ac, meta_ac);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -367,7 +381,7 @@ static int ocfs2_mknod(struct inode *dir,
                        mlog_errno(status);
                        goto leave;
                }
-               le16_add_cpu(&dirfe->i_links_count, 1);
+               ocfs2_add_links_count(dirfe, 1);
                status = ocfs2_journal_dirty(handle, parent_fe_bh);
                if (status < 0) {
                        mlog_errno(status);
@@ -377,7 +391,7 @@ static int ocfs2_mknod(struct inode *dir,
        }
 
        status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
-                               xattr_ac, data_ac);
+                               meta_ac, data_ac);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -385,7 +399,7 @@ static int ocfs2_mknod(struct inode *dir,
 
        if (si.enable) {
                status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
-                                                xattr_ac, data_ac);
+                                                meta_ac, data_ac);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -394,7 +408,7 @@ static int ocfs2_mknod(struct inode *dir,
 
        status = ocfs2_add_entry(handle, dentry, inode,
                                 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
-                                de_bh);
+                                &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -423,11 +437,12 @@ leave:
                mlog(0, "Disk is full\n");
 
        brelse(new_fe_bh);
-       brelse(de_bh);
        brelse(parent_fe_bh);
        kfree(si.name);
        kfree(si.value);
 
+       ocfs2_free_dir_lookup_result(&lookup);
+
        if ((status < 0) && inode) {
                clear_nlink(inode);
                iput(inode);
@@ -439,8 +454,8 @@ leave:
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
 
-       if (xattr_ac)
-               ocfs2_free_alloc_context(xattr_ac);
+       if (meta_ac)
+               ocfs2_free_alloc_context(meta_ac);
 
        mlog_exit(status);
 
@@ -462,6 +477,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        struct ocfs2_extent_list *fel;
        u64 fe_blkno = 0;
        u16 suballoc_bit;
+       u16 feat;
 
        mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
                   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
@@ -469,8 +485,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
        *new_fe_bh = NULL;
 
-       status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
-                                      &fe_blkno);
+       status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
+                                      inode_ac, &suballoc_bit, &fe_blkno);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -513,7 +529,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_mode = cpu_to_le16(inode->i_mode);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
                fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-       fe->i_links_count = cpu_to_le16(inode->i_nlink);
+
+       ocfs2_set_links_count(fe, inode->i_nlink);
 
        fe->i_last_eb_blk = 0;
        strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
@@ -525,11 +542,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
        fe->i_dtime = 0;
 
        /*
-        * If supported, directories start with inline data.
+        * If supported, directories start with inline data. If inline
+        * isn't supported, but indexing is, we start them as indexed.
         */
+       feat = le16_to_cpu(fe->i_dyn_features);
        if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
-               u16 feat = le16_to_cpu(fe->i_dyn_features);
-
                fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
 
                fe->id2.i_data.id_count = cpu_to_le16(
@@ -608,9 +625,9 @@ static int ocfs2_link(struct dentry *old_dentry,
        int err;
        struct buffer_head *fe_bh = NULL;
        struct buffer_head *parent_fe_bh = NULL;
-       struct buffer_head *de_bh = NULL;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+       struct ocfs2_dir_lookup_result lookup = { NULL, };
 
        mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
                   old_dentry->d_name.len, old_dentry->d_name.name,
@@ -638,7 +655,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 
        err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
                                           dentry->d_name.name,
-                                          dentry->d_name.len, &de_bh);
+                                          dentry->d_name.len, &lookup);
        if (err < 0) {
                mlog_errno(err);
                goto out;
@@ -652,7 +669,7 @@ static int ocfs2_link(struct dentry *old_dentry,
        }
 
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
-       if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
+       if (ocfs2_read_links_count(fe) >= ocfs2_link_max(osb)) {
                err = -EMLINK;
                goto out_unlock_inode;
        }
@@ -674,13 +691,13 @@ static int ocfs2_link(struct dentry *old_dentry,
 
        inc_nlink(inode);
        inode->i_ctime = CURRENT_TIME;
-       fe->i_links_count = cpu_to_le16(inode->i_nlink);
+       ocfs2_set_links_count(fe, inode->i_nlink);
        fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
        fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
        err = ocfs2_journal_dirty(handle, fe_bh);
        if (err < 0) {
-               le16_add_cpu(&fe->i_links_count, -1);
+               ocfs2_add_links_count(fe, -1);
                drop_nlink(inode);
                mlog_errno(err);
                goto out_commit;
@@ -688,9 +705,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 
        err = ocfs2_add_entry(handle, dentry, inode,
                              OCFS2_I(inode)->ip_blkno,
-                             parent_fe_bh, de_bh);
+                             parent_fe_bh, &lookup);
        if (err) {
-               le16_add_cpu(&fe->i_links_count, -1);
+               ocfs2_add_links_count(fe, -1);
                drop_nlink(inode);
                mlog_errno(err);
                goto out_commit;
@@ -714,10 +731,11 @@ out_unlock_inode:
 out:
        ocfs2_inode_unlock(dir, 1);
 
-       brelse(de_bh);
        brelse(fe_bh);
        brelse(parent_fe_bh);
 
+       ocfs2_free_dir_lookup_result(&lookup);
+
        mlog_exit(err);
 
        return err;
@@ -766,10 +784,9 @@ static int ocfs2_unlink(struct inode *dir,
        struct buffer_head *fe_bh = NULL;
        struct buffer_head *parent_node_bh = NULL;
        handle_t *handle = NULL;
-       struct ocfs2_dir_entry *dirent = NULL;
-       struct buffer_head *dirent_bh = NULL;
        char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
-       struct buffer_head *orphan_entry_bh = NULL;
+       struct ocfs2_dir_lookup_result lookup = { NULL, };
+       struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 
        mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
                   dentry->d_name.len, dentry->d_name.name);
@@ -791,8 +808,8 @@ static int ocfs2_unlink(struct inode *dir,
        }
 
        status = ocfs2_find_files_on_disk(dentry->d_name.name,
-                                         dentry->d_name.len, &blkno,
-                                         dir, &dirent_bh, &dirent);
+                                         dentry->d_name.len, &blkno, dir,
+                                         &lookup);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
@@ -817,10 +834,7 @@ static int ocfs2_unlink(struct inode *dir,
        child_locked = 1;
 
        if (S_ISDIR(inode->i_mode)) {
-               if (!ocfs2_empty_dir(inode)) {
-                       status = -ENOTEMPTY;
-                       goto leave;
-               } else if (inode->i_nlink != 2) {
+               if (inode->i_nlink != 2 || !ocfs2_empty_dir(inode)) {
                        status = -ENOTEMPTY;
                        goto leave;
                }
@@ -836,8 +850,7 @@ static int ocfs2_unlink(struct inode *dir,
 
        if (inode_is_unlinkable(inode)) {
                status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode,
-                                                 orphan_name,
-                                                 &orphan_entry_bh);
+                                                 orphan_name, &orphan_insert);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -863,7 +876,7 @@ static int ocfs2_unlink(struct inode *dir,
 
        if (inode_is_unlinkable(inode)) {
                status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
-                                         orphan_entry_bh, orphan_dir);
+                                         &orphan_insert, orphan_dir);
                if (status < 0) {
                        mlog_errno(status);
                        goto leave;
@@ -871,7 +884,7 @@ static int ocfs2_unlink(struct inode *dir,
        }
 
        /* delete the name from the parent dir */
-       status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
+       status = ocfs2_delete_entry(handle, dir, &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -880,7 +893,7 @@ static int ocfs2_unlink(struct inode *dir,
        if (S_ISDIR(inode->i_mode))
                drop_nlink(inode);
        drop_nlink(inode);
-       fe->i_links_count = cpu_to_le16(inode->i_nlink);
+       ocfs2_set_links_count(fe, inode->i_nlink);
 
        status = ocfs2_journal_dirty(handle, fe_bh);
        if (status < 0) {
@@ -916,9 +929,10 @@ leave:
        }
 
        brelse(fe_bh);
-       brelse(dirent_bh);
        brelse(parent_node_bh);
-       brelse(orphan_entry_bh);
+
+       ocfs2_free_dir_lookup_result(&orphan_insert);
+       ocfs2_free_dir_lookup_result(&lookup);
 
        mlog_exit(status);
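Throughout namei.c this patch replaces raw dirent buffer heads with ocfs2_dir_lookup_result objects, always with the same lifecycle: zero-initialize on the stack, let the lookup/prepare helpers fill it in, and call ocfs2_free_dir_lookup_result() unconditionally on every exit path (the error paths above rely on the free helper being safe on a result that was never filled). A userspace sketch of that pattern with a toy result type:

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-in for ocfs2_dir_lookup_result: owns some buffers that the
     * lookup helpers may or may not have attached. */
    struct toy_lookup_result { void *leaf_bh; void *dx_bh; };

    static int toy_prepare_insert(struct toy_lookup_result *res)
    {
        res->leaf_bh = malloc(64);        /* pretend buffer head */
        return res->leaf_bh ? 0 : -1;
    }

    /* Safe to call whether or not the lookup ever succeeded. */
    static void toy_free_lookup_result(struct toy_lookup_result *res)
    {
        free(res->leaf_bh);
        free(res->dx_bh);
        res->leaf_bh = res->dx_bh = NULL;
    }

    int main(void)
    {
        struct toy_lookup_result lookup = { NULL, };   /* same idiom as above */
        int status = toy_prepare_insert(&lookup);

        if (status == 0)
            printf("insert prepared, leaf attached\n");
        /* ...error paths jump here without tracking which buffers exist... */
        toy_free_lookup_result(&lookup);
        return status;
    }

Centralizing the cleanup in one helper is what lets the callers drop all the individual brelse() calls seen in the removed lines.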
 
@@ -1004,8 +1018,8 @@ static int ocfs2_rename(struct inode *old_dir,
                        struct inode *new_dir,
                        struct dentry *new_dentry)
 {
-       int status = 0, rename_lock = 0, parents_locked = 0;
-       int old_child_locked = 0, new_child_locked = 0;
+       int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
+       int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
        struct inode *old_inode = old_dentry->d_inode;
        struct inode *new_inode = new_dentry->d_inode;
        struct inode *orphan_dir = NULL;
@@ -1020,13 +1034,13 @@ static int ocfs2_rename(struct inode *old_dir,
        handle_t *handle = NULL;
        struct buffer_head *old_dir_bh = NULL;
        struct buffer_head *new_dir_bh = NULL;
-       struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
-               *new_de = NULL;
-       struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
-       struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
-                                                   // this is the 1st dirent bh
        nlink_t old_dir_nlink = old_dir->i_nlink;
        struct ocfs2_dinode *old_di;
+       struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
+       struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
+       struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
+       struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+       struct ocfs2_dir_lookup_result target_insert = { NULL, };
 
        /* At some point it might be nice to break this function up a
         * bit. */
@@ -1108,9 +1122,10 @@ static int ocfs2_rename(struct inode *old_dir,
        if (S_ISDIR(old_inode->i_mode)) {
                u64 old_inode_parent;
 
+               update_dot_dot = 1;
                status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
-                                                 old_inode, &old_inode_de_bh,
-                                                 &old_inode_dot_dot_de);
+                                                 old_inode,
+                                                 &old_inode_dot_dot_res);
                if (status) {
                        status = -EIO;
                        goto bail;
@@ -1122,7 +1137,7 @@ static int ocfs2_rename(struct inode *old_dir,
                }
 
                if (!new_inode && new_dir != old_dir &&
-                   new_dir->i_nlink >= OCFS2_LINK_MAX) {
+                   new_dir->i_nlink >= ocfs2_link_max(osb)) {
                        status = -EMLINK;
                        goto bail;
                }
@@ -1151,8 +1166,8 @@ static int ocfs2_rename(struct inode *old_dir,
         * to delete it */
        status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
                                          new_dentry->d_name.len,
-                                         &newfe_blkno, new_dir, &new_de_bh,
-                                         &new_de);
+                                         &newfe_blkno, new_dir,
+                                         &target_lookup_res);
        /* The only error we allow here is -ENOENT because the new
         * file not existing is perfectly valid. */
        if ((status < 0) && (status != -ENOENT)) {
@@ -1161,8 +1176,10 @@ static int ocfs2_rename(struct inode *old_dir,
                mlog_errno(status);
                goto bail;
        }
+       if (status == 0)
+               target_exists = 1;
 
-       if (!new_de && new_inode) {
+       if (!target_exists && new_inode) {
                /*
                 * Target was unlinked by another node while we were
                 * waiting to get to ocfs2_rename(). There isn't
@@ -1175,7 +1192,7 @@ static int ocfs2_rename(struct inode *old_dir,
 
        /* In case we need to overwrite an existing file, we blow it
         * away first */
-       if (new_de) {
+       if (target_exists) {
                /* VFS didn't think there existed an inode here, but
                 * someone else in the cluster must have raced our
                 * rename to create one. Today we error cleanly, in
@@ -1216,8 +1233,8 @@ static int ocfs2_rename(struct inode *old_dir,
 
                newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 
-               mlog(0, "aha rename over existing... new_de=%p new_blkno=%llu "
-                    "newfebh=%p bhblocknr=%llu\n", new_de,
+               mlog(0, "aha rename over existing... new_blkno=%llu "
+                    "newfebh=%p bhblocknr=%llu\n",
                     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
                     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
 
@@ -1225,7 +1242,7 @@ static int ocfs2_rename(struct inode *old_dir,
                        status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
                                                          new_inode,
                                                          orphan_name,
-                                                         &orphan_entry_bh);
+                                                         &orphan_insert);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -1243,7 +1260,7 @@ static int ocfs2_rename(struct inode *old_dir,
                status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
                                                      new_dentry->d_name.name,
                                                      new_dentry->d_name.len,
-                                                     &insert_entry_bh);
+                                                     &target_insert);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1258,10 +1275,10 @@ static int ocfs2_rename(struct inode *old_dir,
                goto bail;
        }
 
-       if (new_de) {
+       if (target_exists) {
                if (S_ISDIR(new_inode->i_mode)) {
-                       if (!ocfs2_empty_dir(new_inode) ||
-                           new_inode->i_nlink != 2) {
+                       if (new_inode->i_nlink != 2 ||
+                           !ocfs2_empty_dir(new_inode)) {
                                status = -ENOTEMPTY;
                                goto bail;
                        }
@@ -1274,10 +1291,10 @@ static int ocfs2_rename(struct inode *old_dir,
                }
 
                if (S_ISDIR(new_inode->i_mode) ||
-                   (newfe->i_links_count == cpu_to_le16(1))){
+                   (ocfs2_read_links_count(newfe) == 1)) {
                        status = ocfs2_orphan_add(osb, handle, new_inode,
                                                  newfe, orphan_name,
-                                                 orphan_entry_bh, orphan_dir);
+                                                 &orphan_insert, orphan_dir);
                        if (status < 0) {
                                mlog_errno(status);
                                goto bail;
@@ -1285,8 +1302,8 @@ static int ocfs2_rename(struct inode *old_dir,
                }
 
                /* change the dirent to point to the correct inode */
-               status = ocfs2_update_entry(new_dir, handle, new_de_bh,
-                                           new_de, old_inode);
+               status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
+                                           old_inode);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
@@ -1294,9 +1311,9 @@ static int ocfs2_rename(struct inode *old_dir,
                new_dir->i_version++;
 
                if (S_ISDIR(new_inode->i_mode))
-                       newfe->i_links_count = 0;
+                       ocfs2_set_links_count(newfe, 0);
                else
-                       le16_add_cpu(&newfe->i_links_count, -1);
+                       ocfs2_add_links_count(newfe, -1);
 
                status = ocfs2_journal_dirty(handle, newfe_bh);
                if (status < 0) {
@@ -1307,7 +1324,7 @@ static int ocfs2_rename(struct inode *old_dir,
                /* if the name was not found in new_dir, add it now */
                status = ocfs2_add_entry(handle, new_dentry, old_inode,
                                         OCFS2_I(old_inode)->ip_blkno,
-                                        new_dir_bh, insert_entry_bh);
+                                        new_dir_bh, &target_insert);
        }
 
        old_inode->i_ctime = CURRENT_TIME;
@@ -1334,15 +1351,13 @@ static int ocfs2_rename(struct inode *old_dir,
         * because the insert might have changed the type of directory
         * we're dealing with.
         */
-       old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
-                                    old_dentry->d_name.len,
-                                    old_dir, &old_de);
-       if (!old_de_bh) {
-               status = -EIO;
+       status = ocfs2_find_entry(old_dentry->d_name.name,
+                                 old_dentry->d_name.len, old_dir,
+                                 &old_entry_lookup);
+       if (status)
                goto bail;
-       }
 
-       status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
+       status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1353,9 +1368,10 @@ static int ocfs2_rename(struct inode *old_dir,
                new_inode->i_ctime = CURRENT_TIME;
        }
        old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
-       if (old_inode_de_bh) {
-               status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
-                                           old_inode_dot_dot_de, new_dir);
+
+       if (update_dot_dot) {
+               status = ocfs2_update_entry(old_inode, handle,
+                                           &old_inode_dot_dot_res, new_dir);
                old_dir->i_nlink--;
                if (new_inode) {
                        new_inode->i_nlink--;
@@ -1391,14 +1407,13 @@ static int ocfs2_rename(struct inode *old_dir,
                } else {
                        struct ocfs2_dinode *fe;
                        status = ocfs2_journal_access_di(handle, old_dir,
-                                                        old_dir_bh,
-                                                        OCFS2_JOURNAL_ACCESS_WRITE);
+                                                     old_dir_bh,
+                                                     OCFS2_JOURNAL_ACCESS_WRITE);
                        fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
-                       fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
+                       ocfs2_set_links_count(fe, old_dir->i_nlink);
                        status = ocfs2_journal_dirty(handle, old_dir_bh);
                }
        }
-
        ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
        status = 0;
 bail:
@@ -1429,13 +1444,17 @@ bail:
 
        if (new_inode)
                iput(new_inode);
+
+       ocfs2_free_dir_lookup_result(&target_lookup_res);
+       ocfs2_free_dir_lookup_result(&old_entry_lookup);
+       ocfs2_free_dir_lookup_result(&old_inode_dot_dot_res);
+       ocfs2_free_dir_lookup_result(&orphan_insert);
+       ocfs2_free_dir_lookup_result(&target_insert);
+
        brelse(newfe_bh);
        brelse(old_inode_bh);
        brelse(old_dir_bh);
        brelse(new_dir_bh);
-       brelse(new_de_bh);
-       brelse(old_de_bh);
-       brelse(old_inode_de_bh);
        brelse(orphan_entry_bh);
        brelse(insert_entry_bh);
 
@@ -1558,7 +1577,6 @@ static int ocfs2_symlink(struct inode *dir,
        struct inode *inode = NULL;
        struct super_block *sb;
        struct buffer_head *new_fe_bh = NULL;
-       struct buffer_head *de_bh = NULL;
        struct buffer_head *parent_fe_bh = NULL;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_dinode *dirfe;
@@ -1572,6 +1590,7 @@ static int ocfs2_symlink(struct inode *dir,
                .enable = 1,
        };
        int did_quota = 0, did_quota_inode = 0;
+       struct ocfs2_dir_lookup_result lookup = { NULL, };
 
        mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
                   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1592,7 +1611,7 @@ static int ocfs2_symlink(struct inode *dir,
        }
 
        dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-       if (!dirfe->i_links_count) {
+       if (!ocfs2_read_links_count(dirfe)) {
                /* can't make a file in a deleted directory. */
                status = -ENOENT;
                goto bail;
@@ -1605,7 +1624,7 @@ static int ocfs2_symlink(struct inode *dir,
 
        status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
                                              dentry->d_name.name,
-                                             dentry->d_name.len, &de_bh);
+                                             dentry->d_name.len, &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1744,7 +1763,7 @@ static int ocfs2_symlink(struct inode *dir,
 
        status = ocfs2_add_entry(handle, dentry, inode,
                                 le64_to_cpu(fe->i_blkno), parent_fe_bh,
-                                de_bh);
+                                &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1772,9 +1791,9 @@ bail:
 
        brelse(new_fe_bh);
        brelse(parent_fe_bh);
-       brelse(de_bh);
        kfree(si.name);
        kfree(si.value);
+       ocfs2_free_dir_lookup_result(&lookup);
        if (inode_ac)
                ocfs2_free_alloc_context(inode_ac);
        if (data_ac)
@@ -1826,7 +1845,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
                                    struct inode **ret_orphan_dir,
                                    struct inode *inode,
                                    char *name,
-                                   struct buffer_head **de_bh)
+                                   struct ocfs2_dir_lookup_result *lookup)
 {
        struct inode *orphan_dir_inode;
        struct buffer_head *orphan_dir_bh = NULL;
@@ -1857,7 +1876,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 
        status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
                                              orphan_dir_bh, name,
-                                             OCFS2_ORPHAN_NAMELEN, de_bh);
+                                             OCFS2_ORPHAN_NAMELEN, lookup);
        if (status < 0) {
                ocfs2_inode_unlock(orphan_dir_inode, 1);
 
@@ -1884,7 +1903,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
                            struct inode *inode,
                            struct ocfs2_dinode *fe,
                            char *name,
-                           struct buffer_head *de_bh,
+                           struct ocfs2_dir_lookup_result *lookup,
                            struct inode *orphan_dir_inode)
 {
        struct buffer_head *orphan_dir_bh = NULL;
@@ -1910,8 +1929,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
         * underneath us... */
        orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
        if (S_ISDIR(inode->i_mode))
-               le16_add_cpu(&orphan_fe->i_links_count, 1);
-       orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+               ocfs2_add_links_count(orphan_fe, 1);
+       orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
 
        status = ocfs2_journal_dirty(handle, orphan_dir_bh);
        if (status < 0) {
@@ -1922,7 +1941,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
        status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
                                   OCFS2_ORPHAN_NAMELEN, inode,
                                   OCFS2_I(inode)->ip_blkno,
-                                  orphan_dir_bh, de_bh);
+                                  orphan_dir_bh, lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1955,8 +1974,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
        char name[OCFS2_ORPHAN_NAMELEN + 1];
        struct ocfs2_dinode *orphan_fe;
        int status = 0;
-       struct buffer_head *target_de_bh = NULL;
-       struct ocfs2_dir_entry *target_de = NULL;
+       struct ocfs2_dir_lookup_result lookup = { NULL, };
 
        mlog_entry_void();
 
@@ -1971,17 +1989,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
             OCFS2_ORPHAN_NAMELEN);
 
        /* find its spot in the orphan directory */
-       target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
-                                       orphan_dir_inode, &target_de);
-       if (!target_de_bh) {
-               status = -ENOENT;
+       status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
+                                 &lookup);
+       if (status) {
                mlog_errno(status);
                goto leave;
        }
 
        /* remove it from the orphan directory */
-       status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
-                                   target_de_bh);
+       status = ocfs2_delete_entry(handle, orphan_dir_inode, &lookup);
        if (status < 0) {
                mlog_errno(status);
                goto leave;
@@ -1997,8 +2013,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
        /* do the i_nlink dance! :) */
        orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
        if (S_ISDIR(inode->i_mode))
-               le16_add_cpu(&orphan_fe->i_links_count, -1);
-       orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
+               ocfs2_add_links_count(orphan_fe, -1);
+       orphan_dir_inode->i_nlink = ocfs2_read_links_count(orphan_fe);
 
        status = ocfs2_journal_dirty(handle, orphan_dir_bh);
        if (status < 0) {
@@ -2007,7 +2023,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
        }
 
 leave:
-       brelse(target_de_bh);
+       ocfs2_free_dir_lookup_result(&lookup);
 
        mlog_exit(status);
        return status;
index 946d3c34b90ba29e0e1cc0d44df7ca9bfe1878f5..1386281950dbead47e4b5528cde7caae706cb3f2 100644 (file)
@@ -209,6 +209,7 @@ enum ocfs2_mount_options
 struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
+struct ocfs2_replay_map;
 struct ocfs2_quota_recovery;
 struct ocfs2_dentry_lock;
 struct ocfs2_super
@@ -264,6 +265,7 @@ struct ocfs2_super
        atomic_t vol_state;
        struct mutex recovery_lock;
        struct ocfs2_recovery_map *recovery_map;
+       struct ocfs2_replay_map *replay_map;
        struct task_struct *recovery_thread_task;
        int disable_recovery;
        wait_queue_head_t checkpoint_event;
@@ -287,11 +289,6 @@ struct ocfs2_super
 
        u64 la_last_gd;
 
-#ifdef CONFIG_OCFS2_FS_STATS
-       struct dentry *local_alloc_debug;
-       char *local_alloc_debug_buf;
-#endif
-
        /* Next three fields are for local node slot recovery during
         * mount. */
        int dirty;
@@ -305,9 +302,11 @@ struct ocfs2_super
        struct ocfs2_cluster_connection *cconn;
        struct ocfs2_lock_res osb_super_lockres;
        struct ocfs2_lock_res osb_rename_lockres;
+       struct ocfs2_lock_res osb_nfs_sync_lockres;
        struct ocfs2_dlm_debug *osb_dlm_debug;
 
        struct dentry *osb_debug_root;
+       struct dentry *osb_ctxt;
 
        wait_queue_head_t recovery_event;
 
@@ -344,6 +343,12 @@ struct ocfs2_super
 
        /* used to protect metaecc calculation check of xattr. */
        spinlock_t osb_xattr_lock;
+
+       unsigned int                    osb_dx_mask;
+       u32                             osb_dx_seed[4];
+
+       /* the group we used to allocate inodes. */
+       u64                             osb_inode_alloc_group;
 };
 
 #define OCFS2_SB(sb)       ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -402,6 +407,51 @@ static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
        return 0;
 }
 
+static inline int ocfs2_supports_indexed_dirs(struct ocfs2_super *osb)
+{
+       if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
+               return 1;
+       return 0;
+}
+
+static inline unsigned int ocfs2_link_max(struct ocfs2_super *osb)
+{
+       if (ocfs2_supports_indexed_dirs(osb))
+               return OCFS2_DX_LINK_MAX;
+       return OCFS2_LINK_MAX;
+}
+
+static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
+{
+       u32 nlink = le16_to_cpu(di->i_links_count);
+       u32 hi = le16_to_cpu(di->i_links_count_hi);
+
+       if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
+               nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+
+       return nlink;
+}
+
+static inline void ocfs2_set_links_count(struct ocfs2_dinode *di, u32 nlink)
+{
+       u16 lo, hi;
+
+       lo = nlink;
+       hi = nlink >> OCFS2_LINKS_HI_SHIFT;
+
+       di->i_links_count = cpu_to_le16(lo);
+       di->i_links_count_hi = cpu_to_le16(hi);
+}
+
+static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
+{
+       u32 links = ocfs2_read_links_count(di);
+
+       links += n;
+
+       ocfs2_set_links_count(di, links);
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
@@ -482,6 +532,12 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DIR_TRAILER(ptr)                                        \
        (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
 
+#define OCFS2_IS_VALID_DX_ROOT(ptr)                                    \
+       (!strcmp((ptr)->dr_signature, OCFS2_DX_ROOT_SIGNATURE))
+
+#define OCFS2_IS_VALID_DX_LEAF(ptr)                                    \
+       (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
                                           u64 blkno)
 {
@@ -532,6 +588,16 @@ static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
        return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
 }
 
+static inline u64 ocfs2_block_to_cluster_start(struct super_block *sb,
+                                              u64 blocks)
+{
+       int bits = OCFS2_SB(sb)->s_clustersize_bits - sb->s_blocksize_bits;
+       unsigned int clusters;
+
+       clusters = ocfs2_blocks_to_clusters(sb, blocks);
+       return (u64)clusters << bits;
+}
+
 static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
                                                u64 bytes)
 {
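
The ocfs2_read_links_count()/ocfs2_set_links_count() helpers above pack a 32-bit link count into two little-endian 16-bit on-disk fields, with the high word only honoured when the inode carries OCFS2_INDEXED_DIR_FL. A standalone sketch of the same packing arithmetic (plain userspace C with made-up values, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define OCFS2_LINKS_HI_SHIFT 16

int main(void)
{
        uint32_t nlink = 70000;                      /* above the old 32000 OCFS2_LINK_MAX */
        uint16_t lo = (uint16_t)nlink;               /* would land in i_links_count        */
        uint16_t hi = nlink >> OCFS2_LINKS_HI_SHIFT; /* would land in i_links_count_hi     */

        /* Reassemble the full count the way ocfs2_read_links_count() does
         * for indexed directories. */
        uint32_t back = (uint32_t)lo | ((uint32_t)hi << OCFS2_LINKS_HI_SHIFT);

        printf("nlink=%u -> lo=%u hi=%u -> reassembled=%u\n",
               (unsigned)nlink, (unsigned)lo, (unsigned)hi, (unsigned)back);
        return 0;
}

For 70000 this yields lo=4464, hi=1, which reassembles to 70000, illustrating why the old i_reserved0 slot was repurposed rather than widening i_links_count itself.
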
index 2332ef740f4f9bc852ad064832d9fa5a161bb2fe..7ab6e9e5e77c0beda01c3d4bcfb6d95551127b61 100644 (file)
@@ -66,6 +66,8 @@
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE    "XATTR01"
 #define OCFS2_DIR_TRAILER_SIGNATURE    "DIRTRL1"
+#define OCFS2_DX_ROOT_SIGNATURE                "DXDIR01"
+#define OCFS2_DX_LEAF_SIGNATURE                "DXLEAF1"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)                      \
@@ -95,7 +97,8 @@
                                         | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
                                         | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
                                         | OCFS2_FEATURE_INCOMPAT_XATTR \
-                                        | OCFS2_FEATURE_INCOMPAT_META_ECC)
+                                        | OCFS2_FEATURE_INCOMPAT_META_ECC \
+                                        | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP   (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
                                         | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
                                         | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 /* Support for extended attributes */
 #define OCFS2_FEATURE_INCOMPAT_XATTR           0x0200
 
+/* Support for indexed directories */
+#define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS    0x0400
+
 /* Metadata checksum and error correction */
 #define OCFS2_FEATURE_INCOMPAT_META_ECC                0x0800
 
@@ -411,8 +417,12 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 #define OCFS2_DIR_REC_LEN(name_len)    (((name_len) + OCFS2_DIR_MEMBER_LEN + \
                                           OCFS2_DIR_ROUND) & \
                                         ~OCFS2_DIR_ROUND)
+#define OCFS2_DIR_MIN_REC_LEN  OCFS2_DIR_REC_LEN(1)
 
 #define OCFS2_LINK_MAX         32000
+#define        OCFS2_DX_LINK_MAX       ((1U << 31) - 1U)
+#define        OCFS2_LINKS_HI_SHIFT    16
+#define        OCFS2_DX_ENTRIES_MAX    (0xffffffffU)
 
 #define S_SHIFT                        12
 static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -628,8 +638,9 @@ struct ocfs2_super_block {
 /*B8*/ __le16 s_xattr_inline_size;     /* extended attribute inline size
                                           for this fs*/
        __le16 s_reserved0;
-       __le32 s_reserved1;
-/*C0*/  __le64 s_reserved2[16];                /* Fill out superblock */
+       __le32 s_dx_seed[3];            /* seed[0-2] for dx dir hash.
+                                        * s_uuid_hash serves as seed[3]. */
+/*C0*/  __le64 s_reserved2[15];                /* Fill out superblock */
 /*140*/
 
        /*
@@ -679,7 +690,7 @@ struct ocfs2_dinode {
                                           belongs to */
        __le16 i_suballoc_bit;          /* Bit offset in suballocator
                                           block group */
-/*10*/ __le16 i_reserved0;
+/*10*/ __le16 i_links_count_hi;        /* High 16 bits of links count */
        __le16 i_xattr_inline_size;
        __le32 i_clusters;              /* Cluster count */
        __le32 i_uid;                   /* Owner UID */
@@ -705,7 +716,8 @@ struct ocfs2_dinode {
        __le16 i_dyn_features;
        __le64 i_xattr_loc;
 /*80*/ struct ocfs2_block_check i_check;       /* Error checking */
-/*88*/ __le64 i_reserved2[6];
+/*88*/ __le64 i_dx_root;               /* Pointer to dir index root block */
+       __le64 i_reserved2[5];
 /*B8*/ union {
                __le64 i_pad1;          /* Generic way to refer to this
                                           64bit union */
@@ -781,6 +793,90 @@ struct ocfs2_dir_block_trailer {
 /*40*/
 };
 
+/*
+ * A directory entry in the indexed tree. We don't store the full name here,
+ * but instead provide a pointer to the full dirent in the unindexed tree.
+ *
+ * We also store name_len here so as to reduce the number of leaf blocks we
+ * need to search in case of collisions.
+ */
+struct ocfs2_dx_entry {
+       __le32          dx_major_hash;  /* Used to find logical
+                                        * cluster in index */
+       __le32          dx_minor_hash;  /* Lower bits used to find
+                                        * block in cluster */
+       __le64          dx_dirent_blk;  /* Physical block in unindexed
+                                        * tree holding this dirent. */
+};
+
+struct ocfs2_dx_entry_list {
+       __le32          de_reserved;
+       __le16          de_count;       /* Maximum number of entries
+                                        * possible in de_entries */
+       __le16          de_num_used;    /* Current number of
+                                        * de_entries entries */
+       struct  ocfs2_dx_entry          de_entries[0];  /* Indexed dir entries
+                                                        * in a packed array of
+                                                        * length de_num_used */
+};
+
+#define OCFS2_DX_FLAG_INLINE   0x01
+
+/*
+ * A directory indexing block. Each indexed directory has one of these,
+ * pointed to by ocfs2_dinode.
+ *
+ * This block stores an indexed btree root, and a set of free space
+ * start-of-list pointers.
+ */
+struct ocfs2_dx_root_block {
+       __u8            dr_signature[8];        /* Signature for verification */
+       struct ocfs2_block_check dr_check;      /* Error checking */
+       __le16          dr_suballoc_slot;       /* Slot suballocator this
+                                                * block belongs to. */
+       __le16          dr_suballoc_bit;        /* Bit offset in suballocator
+                                                * block group */
+       __le32          dr_fs_generation;       /* Must match super block */
+       __le64          dr_blkno;               /* Offset on disk, in blocks */
+       __le64          dr_last_eb_blk;         /* Pointer to last
+                                                * extent block */
+       __le32          dr_clusters;            /* Clusters allocated
+                                                * to the indexed tree. */
+       __u8            dr_flags;               /* OCFS2_DX_FLAG_* flags */
+       __u8            dr_reserved0;
+       __le16          dr_reserved1;
+       __le64          dr_dir_blkno;           /* Pointer to parent inode */
+       __le32          dr_num_entries;         /* Total number of
+                                                * names stored in
+                                                * this directory.*/
+       __le32          dr_reserved2;
+       __le64          dr_free_blk;            /* Pointer to head of free
+                                                * unindexed block list. */
+       __le64          dr_reserved3[15];
+       union {
+               struct ocfs2_extent_list dr_list; /* Keep this aligned to 128
+                                                  * bits for maximum space
+                                                  * efficiency. */
+               struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of
+                                                       * entries. We grow out
+                                                       * to extents if this
+                                                       * gets too big. */
+       };
+};
+
+/*
+ * The header of a leaf block in the indexed tree.
+ */
+struct ocfs2_dx_leaf {
+       __u8            dl_signature[8];/* Signature for verification */
+       struct ocfs2_block_check dl_check;      /* Error checking */
+       __le64          dl_blkno;       /* Offset on disk, in blocks */
+       __le32          dl_fs_generation;/* Must match super block */
+       __le32          dl_reserved0;
+       __le64          dl_reserved1;
+       struct ocfs2_dx_entry_list      dl_list;
+};
+
 /*
  * On disk allocator group structure for OCFS2
  */
@@ -1112,6 +1208,16 @@ static inline int ocfs2_extent_recs_per_inode_with_xattr(
        return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb)
+{
+       int size;
+
+       size = sb->s_blocksize -
+               offsetof(struct ocfs2_dx_root_block, dr_list.l_recs);
+
+       return size / sizeof(struct ocfs2_extent_rec);
+}
+
 static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
 {
        int size;
@@ -1132,6 +1238,26 @@ static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
        return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb)
+{
+       int size;
+
+       size = sb->s_blocksize -
+               offsetof(struct ocfs2_dx_leaf, dl_list.de_entries);
+
+       return size / sizeof(struct ocfs2_dx_entry);
+}
+
+static inline int ocfs2_dx_entries_per_root(struct super_block *sb)
+{
+       int size;
+
+       size = sb->s_blocksize -
+               offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries);
+
+       return size / sizeof(struct ocfs2_dx_entry);
+}
+
 static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
 {
        u16 size;
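
Per the struct comments above, dx_major_hash locates a logical cluster in the index tree and the low bits of dx_minor_hash pick a leaf block inside that cluster; the mask for those low bits is the osb_dx_mask that ocfs2_initialize_super() computes further down in this series. A standalone illustration of that arithmetic (userspace C, hypothetical geometry, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int cbits = 20;                        /* hypothetical: 1MB clusters */
        unsigned int bbits = 12;                        /* hypothetical: 4KB blocks   */
        uint32_t dx_mask = (1u << (cbits - bbits)) - 1; /* blocks per cluster - 1     */
        uint32_t minor_hash = 0x12345678;               /* hypothetical dx_minor_hash */

        /* The major hash walks the index's extent list to a logical cluster
         * (not modelled here); the low bits of the minor hash then select a
         * leaf block inside that cluster. */
        printf("osb_dx_mask = 0x%x, leaf block within cluster = %u\n",
               (unsigned)dx_mask, (unsigned)(minor_hash & dx_mask));
        return 0;
}
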
index eb6f50c9cecae194cab61bb88ee5a185a90a1e93..a53ce87481bf20b1c724d2c699fab3c52674b991 100644 (file)
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_OPEN,
        OCFS2_LOCK_TYPE_FLOCK,
        OCFS2_LOCK_TYPE_QINFO,
+       OCFS2_LOCK_TYPE_NFS_SYNC,
        OCFS2_NUM_LOCK_TYPES
 };
 
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_QINFO:
                        c = 'Q';
                        break;
+               case OCFS2_LOCK_TYPE_NFS_SYNC:
+                       c = 'Y';
+                       break;
                default:
                        c = '\0';
        }
index a69628603e18273cdbc3d2b5d7e5a440623c17c8..b4ca5911caafc3824f6cc8c1a61e69f5a316c4eb 100644 (file)
@@ -48,7 +48,8 @@
 #include "buffer_head_io.h"
 
 #define NOT_ALLOC_NEW_GROUP            0
-#define ALLOC_NEW_GROUP                        1
+#define ALLOC_NEW_GROUP                        0x1
+#define ALLOC_GROUPS_FROM_GLOBAL       0x2
 
 #define OCFS2_MAX_INODES_TO_STEAL      1024
 
@@ -64,7 +65,9 @@ static int ocfs2_block_group_fill(handle_t *handle,
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                                   struct inode *alloc_inode,
                                   struct buffer_head *bh,
-                                  u64 max_block);
+                                  u64 max_block,
+                                  u64 *last_alloc_group,
+                                  int flags);
 
 static int ocfs2_cluster_group_search(struct inode *inode,
                                      struct buffer_head *group_bh,
@@ -116,6 +119,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
                                                u16 *bg_bit_off);
 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
                                             u32 bits_wanted, u64 max_block,
+                                            int flags,
                                             struct ocfs2_alloc_context **ac);
 
 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
@@ -403,7 +407,9 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                                   struct inode *alloc_inode,
                                   struct buffer_head *bh,
-                                  u64 max_block)
+                                  u64 max_block,
+                                  u64 *last_alloc_group,
+                                  int flags)
 {
        int status, credits;
        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -423,7 +429,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
        cl = &fe->id2.i_chain;
        status = ocfs2_reserve_clusters_with_limit(osb,
                                                   le16_to_cpu(cl->cl_cpg),
-                                                  max_block, &ac);
+                                                  max_block, flags, &ac);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -440,6 +446,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                goto bail;
        }
 
+       if (last_alloc_group && *last_alloc_group != 0) {
+               mlog(0, "use old allocation group %llu for block group alloc\n",
+                    (unsigned long long)*last_alloc_group);
+               ac->ac_last_group = *last_alloc_group;
+       }
        status = ocfs2_claim_clusters(osb,
                                      handle,
                                      ac,
@@ -514,6 +525,11 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
        alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
 
        status = 0;
+
+       /* save the new last alloc group so that the caller can cache it. */
+       if (last_alloc_group)
+               *last_alloc_group = ac->ac_last_group;
+
 bail:
        if (handle)
                ocfs2_commit_trans(osb, handle);
@@ -531,7 +547,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                                       struct ocfs2_alloc_context *ac,
                                       int type,
                                       u32 slot,
-                                      int alloc_new_group)
+                                      u64 *last_alloc_group,
+                                      int flags)
 {
        int status;
        u32 bits_wanted = ac->ac_bits_wanted;
@@ -587,7 +604,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                        goto bail;
                }
 
-               if (alloc_new_group != ALLOC_NEW_GROUP) {
+               if (!(flags & ALLOC_NEW_GROUP)) {
                        mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
                             "and we don't alloc a new group for it.\n",
                             slot, bits_wanted, free_bits);
@@ -596,7 +613,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
                }
 
                status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
-                                                ac->ac_max_block);
+                                                ac->ac_max_block,
+                                                last_alloc_group, flags);
                if (status < 0) {
                        if (status != -ENOSPC)
                                mlog_errno(status);
@@ -640,7 +658,7 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 
        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
                                             EXTENT_ALLOC_SYSTEM_INODE,
-                                            slot, ALLOC_NEW_GROUP);
+                                            slot, NULL, ALLOC_NEW_GROUP);
        if (status < 0) {
                if (status != -ENOSPC)
                        mlog_errno(status);
@@ -686,7 +704,8 @@ static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
 
                status = ocfs2_reserve_suballoc_bits(osb, ac,
                                                     INODE_ALLOC_SYSTEM_INODE,
-                                                    slot, NOT_ALLOC_NEW_GROUP);
+                                                    slot, NULL,
+                                                    NOT_ALLOC_NEW_GROUP);
                if (status >= 0) {
                        ocfs2_set_inode_steal_slot(osb, slot);
                        break;
@@ -703,6 +722,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 {
        int status;
        s16 slot = ocfs2_get_inode_steal_slot(osb);
+       u64 alloc_group;
 
        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
        if (!(*ac)) {
@@ -738,12 +758,22 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
                goto inode_steal;
 
        atomic_set(&osb->s_num_inodes_stolen, 0);
+       alloc_group = osb->osb_inode_alloc_group;
        status = ocfs2_reserve_suballoc_bits(osb, *ac,
                                             INODE_ALLOC_SYSTEM_INODE,
-                                            osb->slot_num, ALLOC_NEW_GROUP);
+                                            osb->slot_num,
+                                            &alloc_group,
+                                            ALLOC_NEW_GROUP |
+                                            ALLOC_GROUPS_FROM_GLOBAL);
        if (status >= 0) {
                status = 0;
 
+               spin_lock(&osb->osb_lock);
+               osb->osb_inode_alloc_group = alloc_group;
+               spin_unlock(&osb->osb_lock);
+               mlog(0, "after reservation, new allocation group is "
+                    "%llu\n", (unsigned long long)alloc_group);
+
                /*
                 * Some inodes must be freed by us, so try to allocate
                 * from our own next time.
@@ -790,7 +820,7 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 
        status = ocfs2_reserve_suballoc_bits(osb, ac,
                                             GLOBAL_BITMAP_SYSTEM_INODE,
-                                            OCFS2_INVALID_SLOT,
+                                            OCFS2_INVALID_SLOT, NULL,
                                             ALLOC_NEW_GROUP);
        if (status < 0 && status != -ENOSPC) {
                mlog_errno(status);
@@ -806,6 +836,7 @@ bail:
  * things a bit. */
 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
                                             u32 bits_wanted, u64 max_block,
+                                            int flags,
                                             struct ocfs2_alloc_context **ac)
 {
        int status;
@@ -823,7 +854,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
        (*ac)->ac_max_block = max_block;
 
        status = -ENOSPC;
-       if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
+       if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
+           ocfs2_alloc_should_use_local(osb, bits_wanted)) {
                status = ocfs2_reserve_local_alloc_bits(osb,
                                                        bits_wanted,
                                                        *ac);
@@ -861,7 +893,8 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
                           u32 bits_wanted,
                           struct ocfs2_alloc_context **ac)
 {
-       return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
+       return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
+                                                ALLOC_NEW_GROUP, ac);
 }
 
 /*
@@ -1618,8 +1651,41 @@ bail:
        return status;
 }
 
+static void ocfs2_init_inode_ac_group(struct inode *dir,
+                                     struct buffer_head *parent_fe_bh,
+                                     struct ocfs2_alloc_context *ac)
+{
+       struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+       /*
+        * Try to allocate inodes from some specific group.
+        *
+        * If the parent dir has recorded the last group used in allocation,
+        * cool, use it. Otherwise if we try to allocate new inode from the
+        * same slot the parent dir belongs to, use the same chunk.
+        *
+        * We are very careful here to avoid the mistake of setting
+        * ac_last_group to a group descriptor from a different (unlocked) slot.
+        */
+       if (OCFS2_I(dir)->ip_last_used_group &&
+           OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
+               ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
+       else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
+               ac->ac_last_group = ocfs2_which_suballoc_group(
+                                       le64_to_cpu(fe->i_blkno),
+                                       le16_to_cpu(fe->i_suballoc_bit));
+}
+
+static inline void ocfs2_save_inode_ac_group(struct inode *dir,
+                                            struct ocfs2_alloc_context *ac)
+{
+       OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
+       OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
+}
+
 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
                          handle_t *handle,
+                         struct inode *dir,
+                         struct buffer_head *parent_fe_bh,
                          struct ocfs2_alloc_context *ac,
                          u16 *suballoc_bit,
                          u64 *fe_blkno)
@@ -1635,6 +1701,8 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
        BUG_ON(ac->ac_bits_wanted != 1);
        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
 
+       ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
+
        status = ocfs2_claim_suballoc_bits(osb,
                                           ac,
                                           handle,
@@ -1653,6 +1721,7 @@ int ocfs2_claim_new_inode(struct ocfs2_super *osb,
 
        *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
        ac->ac_bits_given++;
+       ocfs2_save_inode_ac_group(dir, ac);
        status = 0;
 bail:
        mlog_exit(status);
@@ -2116,3 +2185,162 @@ out:
 
        return ret;
 }
+
+/*
+ * Read the inode specified by blkno to get suballoc_slot and
+ * suballoc_bit.
+ */
+static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
+                                      u16 *suballoc_slot, u16 *suballoc_bit)
+{
+       int status;
+       struct buffer_head *inode_bh = NULL;
+       struct ocfs2_dinode *inode_fe;
+
+       mlog_entry("blkno: %llu\n", blkno);
+
+       /* dirty read disk */
+       status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
+       if (status < 0) {
+               mlog(ML_ERROR, "read block %llu failed %d\n", blkno, status);
+               goto bail;
+       }
+
+       inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
+       if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
+               mlog(ML_ERROR, "invalid inode %llu requested\n", blkno);
+               status = -EINVAL;
+               goto bail;
+       }
+
+       if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
+           (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
+               mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
+                    blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
+               status = -EINVAL;
+               goto bail;
+       }
+
+       if (suballoc_slot)
+               *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
+       if (suballoc_bit)
+               *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
+
+bail:
+       brelse(inode_bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * test whether bit is SET in allocator bitmap or not.  on success, 0
+ * is returned and *res is 1 for SET; 0 otherwise.  when it fails, errno
+ * is returned and *res is meaningless.  Call this after you have
+ * cluster locked against suballoc, or you may get a result based on
+ * non-up2date contents
+ */
+static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
+                                  struct inode *suballoc,
+                                  struct buffer_head *alloc_bh, u64 blkno,
+                                  u16 bit, int *res)
+{
+       struct ocfs2_dinode *alloc_fe;
+       struct ocfs2_group_desc *group;
+       struct buffer_head *group_bh = NULL;
+       u64 bg_blkno;
+       int status;
+
+       mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
+
+       alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
+       if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
+               mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
+                    (unsigned int)bit,
+                    ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+               status = -EINVAL;
+               goto bail;
+       }
+
+       bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+       status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
+                                            &group_bh);
+       if (status < 0) {
+               mlog(ML_ERROR, "read group %llu failed %d\n", bg_blkno, status);
+               goto bail;
+       }
+
+       group = (struct ocfs2_group_desc *) group_bh->b_data;
+       *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
+
+bail:
+       brelse(group_bh);
+
+       mlog_exit(status);
+       return status;
+}
+
+/*
+ * Test if the bit representing this inode (blkno) is set in the
+ * suballocator.
+ *
+ * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
+ *
+ * In the event of failure, a negative value is returned and *res is
+ * meaningless.
+ *
+ * Callers must make sure to hold nfs_sync_lock to prevent
+ * ocfs2_delete_inode() on another node from accessing the same
+ * suballocator concurrently.
+ */
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
+{
+       int status;
+       u16 suballoc_bit = 0, suballoc_slot = 0;
+       struct inode *inode_alloc_inode;
+       struct buffer_head *alloc_bh = NULL;
+
+       mlog_entry("blkno: %llu", blkno);
+
+       status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
+                                            &suballoc_bit);
+       if (status < 0) {
+               mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
+               goto bail;
+       }
+
+       inode_alloc_inode =
+               ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
+                                           suballoc_slot);
+       if (!inode_alloc_inode) {
+               /* the error code could be inaccurate, but we are not able to
+                * get the correct one. */
+               status = -EINVAL;
+               mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
+                    (u32)suballoc_slot);
+               goto bail;
+       }
+
+       mutex_lock(&inode_alloc_inode->i_mutex);
+       status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
+       if (status < 0) {
+               mutex_unlock(&inode_alloc_inode->i_mutex);
+               mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
+                    (u32)suballoc_slot, status);
+               goto bail;
+       }
+
+       status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
+                                        blkno, suballoc_bit, res);
+       if (status < 0)
+               mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
+
+       ocfs2_inode_unlock(inode_alloc_inode, 0);
+       mutex_unlock(&inode_alloc_inode->i_mutex);
+
+       iput(inode_alloc_inode);
+       brelse(alloc_bh);
+bail:
+       mlog_exit(status);
+       return status;
+}
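
The comment in ocfs2_init_inode_ac_group() above describes the new allocator hint added in this file: callers carry a cached "last allocation group", the allocator starts its search there, and on success writes back whichever group it actually used. A toy userspace model of that hint mechanism (made-up group layout, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define NGROUPS 8

static int group_has_space[NGROUPS] = { 0, 0, 0, 1, 1, 0, 1, 0 };

/* Start searching at the cached group (0 means "no hint yet", as in the
 * *last_alloc_group != 0 check above) and write back the group used. */
static int toy_alloc(uint64_t *last_group)
{
        unsigned int i;
        uint64_t start = *last_group;

        for (i = 0; i < NGROUPS; i++) {
                uint64_t g = (start + i) % NGROUPS;
                if (group_has_space[g]) {
                        *last_group = g;        /* cache for the next caller */
                        return (int)g;
                }
        }
        return -1;                              /* would be -ENOSPC in the kernel */
}

int main(void)
{
        uint64_t cached = 0;
        int first, second;

        first = toy_alloc(&cached);
        printf("first alloc used group %d, cached hint is now %llu\n",
               first, (unsigned long long)cached);

        second = toy_alloc(&cached);
        printf("second alloc used group %d (search started from the hint)\n", second);
        return 0;
}
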
index e3c13c77f9e8272baa4c0aa7651caee03ea6260f..8c9a78a43164c1f900e4961d64a3dd68de174372 100644 (file)
@@ -88,6 +88,8 @@ int ocfs2_claim_metadata(struct ocfs2_super *osb,
                         u64 *blkno_start);
 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
                          handle_t *handle,
+                         struct inode *dir,
+                         struct buffer_head *parent_fe_bh,
                          struct ocfs2_alloc_context *ac,
                          u16 *suballoc_bit,
                          u64 *fe_blkno);
@@ -186,4 +188,6 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
                          u32 clusters_to_add, u32 extents_to_split,
                          struct ocfs2_alloc_context **data_ac,
                          struct ocfs2_alloc_context **meta_ac);
+
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
 #endif /* _CHAINALLOC_H_ */
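
As the comment above ocfs2_test_inode_bit() in suballoc.c explains, a caller that holds the NFS sync lock can use the new helper to check whether a block number taken from a possibly stale file handle still refers to an allocated inode. A minimal sketch of such a caller (hypothetical function, not part of this patch; taking and dropping the cluster lock is assumed to happen around it):

/*
 * Hypothetical helper for a file-handle decoder: reject handles whose
 * inode bit has been cleared in the suballocator on some other node.
 * Assumes the nfs_sync cluster lock is already held, as required by
 * ocfs2_test_inode_bit().
 */
static int example_check_handle(struct ocfs2_super *osb, u64 blkno)
{
        int set = 0;
        int status;

        status = ocfs2_test_inode_bit(osb, blkno, &set);
        if (status < 0)
                return status;          /* I/O or validation error */
        if (!set)
                return -ESTALE;         /* inode was freed; handle is stale */
        return 0;                       /* safe to go on and load the inode */
}
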
index 7ac83a81ee55d86466329d7f9b568432666b5144..79ff8d9d37e0844252130272309cd5853b04445e 100644 (file)
@@ -201,6 +201,170 @@ static const match_table_t tokens = {
        {Opt_err, NULL}
 };
 
+#ifdef CONFIG_DEBUG_FS
+static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
+{
+       int out = 0;
+       int i;
+       struct ocfs2_cluster_connection *cconn = osb->cconn;
+       struct ocfs2_recovery_map *rm = osb->recovery_map;
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => Id: %-s  Uuid: %-s  Gen: 0x%X  Label: %-s\n",
+                       "Device", osb->dev_str, osb->uuid_str,
+                       osb->fs_generation, osb->vol_label);
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => State: %d  Flags: 0x%lX\n", "Volume",
+                       atomic_read(&osb->vol_state), osb->osb_flags);
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => Block: %lu  Cluster: %d\n", "Sizes",
+                       osb->sb->s_blocksize, osb->s_clustersize);
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => Compat: 0x%X  Incompat: 0x%X  "
+                       "ROcompat: 0x%X\n",
+                       "Features", osb->s_feature_compat,
+                       osb->s_feature_incompat, osb->s_feature_ro_compat);
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => Opts: 0x%lX  AtimeQuanta: %u\n", "Mount",
+                       osb->s_mount_opt, osb->s_atime_quantum);
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => Stack: %s  Name: %*s  Version: %d.%d\n",
+                       "Cluster",
+                       (*osb->osb_cluster_stack == '\0' ?
+                        "o2cb" : osb->osb_cluster_stack),
+                       cconn->cc_namelen, cconn->cc_name,
+                       cconn->cc_version.pv_major, cconn->cc_version.pv_minor);
+
+       spin_lock(&osb->dc_task_lock);
+       out += snprintf(buf + out, len - out,
+                       "%10s => Pid: %d  Count: %lu  WakeSeq: %lu  "
+                       "WorkSeq: %lu\n", "DownCnvt",
+                       task_pid_nr(osb->dc_task), osb->blocked_lock_count,
+                       osb->dc_wake_sequence, osb->dc_work_sequence);
+       spin_unlock(&osb->dc_task_lock);
+
+       spin_lock(&osb->osb_lock);
+       out += snprintf(buf + out, len - out, "%10s => Pid: %d  Nodes:",
+                       "Recovery",
+                       (osb->recovery_thread_task ?
+                        task_pid_nr(osb->recovery_thread_task) : -1));
+       if (rm->rm_used == 0)
+               out += snprintf(buf + out, len - out, " None\n");
+       else {
+               for (i = 0; i < rm->rm_used; i++)
+                       out += snprintf(buf + out, len - out, " %d",
+                                       rm->rm_entries[i]);
+               out += snprintf(buf + out, len - out, "\n");
+       }
+       spin_unlock(&osb->osb_lock);
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
+                       task_pid_nr(osb->commit_task), osb->osb_commit_interval,
+                       atomic_read(&osb->needs_checkpoint));
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => State: %d  NumTxns: %d  TxnId: %lu\n",
+                       "Journal", osb->journal->j_state,
+                       atomic_read(&osb->journal->j_num_trans),
+                       osb->journal->j_trans_id);
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => GlobalAllocs: %d  LocalAllocs: %d  "
+                       "SubAllocs: %d  LAWinMoves: %d  SAExtends: %d\n",
+                       "Stats",
+                       atomic_read(&osb->alloc_stats.bitmap_data),
+                       atomic_read(&osb->alloc_stats.local_data),
+                       atomic_read(&osb->alloc_stats.bg_allocs),
+                       atomic_read(&osb->alloc_stats.moves),
+                       atomic_read(&osb->alloc_stats.bg_extends));
+
+       out += snprintf(buf + out, len - out,
+                       "%10s => State: %u  Descriptor: %llu  Size: %u bits  "
+                       "Default: %u bits\n",
+                       "LocalAlloc", osb->local_alloc_state,
+                       (unsigned long long)osb->la_last_gd,
+                       osb->local_alloc_bits, osb->local_alloc_default_bits);
+
+       spin_lock(&osb->osb_lock);
+       out += snprintf(buf + out, len - out,
+                       "%10s => Slot: %d  NumStolen: %d\n", "Steal",
+                       osb->s_inode_steal_slot,
+                       atomic_read(&osb->s_num_inodes_stolen));
+       spin_unlock(&osb->osb_lock);
+
+       out += snprintf(buf + out, len - out, "%10s => %3s  %10s\n",
+                       "Slots", "Num", "RecoGen");
+
+       for (i = 0; i < osb->max_slots; ++i) {
+               out += snprintf(buf + out, len - out,
+                               "%10s  %c %3d  %10d\n",
+                               " ",
+                               (i == osb->slot_num ? '*' : ' '),
+                               i, osb->slot_recovery_generations[i]);
+       }
+
+       return out;
+}
+
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+       struct ocfs2_super *osb = inode->i_private;
+       char *buf = NULL;
+
+       buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!buf)
+               goto bail;
+
+       i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE));
+
+       file->private_data = buf;
+
+       return 0;
+bail:
+       return -ENOMEM;
+}
+
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+       return 0;
+}
+
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+                               size_t nbytes, loff_t *ppos)
+{
+       return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+                                      i_size_read(file->f_mapping->host));
+}
+#else
+static int ocfs2_osb_debug_open(struct inode *inode, struct file *file)
+{
+       return 0;
+}
+static int ocfs2_debug_release(struct inode *inode, struct file *file)
+{
+       return 0;
+}
+static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
+                               size_t nbytes, loff_t *ppos)
+{
+       return 0;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static struct file_operations ocfs2_osb_debug_fops = {
+       .open =         ocfs2_osb_debug_open,
+       .release =      ocfs2_debug_release,
+       .read =         ocfs2_debug_read,
+       .llseek =       generic_file_llseek,
+};
+
 /*
  * write_super and sync_fs ripped right out of ext3.
  */
@@ -926,6 +1090,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
                goto read_super_error;
        }
 
+       osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR,
+                                           osb->osb_debug_root,
+                                           osb,
+                                           &ocfs2_osb_debug_fops);
+       if (!osb->osb_ctxt) {
+               status = -EINVAL;
+               mlog_errno(status);
+               goto read_super_error;
+       }
+
        status = ocfs2_mount_volume(sb);
        if (osb->root_inode)
                inode = igrab(osb->root_inode);
@@ -1620,6 +1794,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        osb = OCFS2_SB(sb);
        BUG_ON(!osb);
 
+       debugfs_remove(osb->osb_ctxt);
+
        ocfs2_disable_quotas(osb);
 
        ocfs2_shutdown_local_alloc(osb);
@@ -1742,6 +1918,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
        bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
        sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
 
+       osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
+
+       for (i = 0; i < 3; i++)
+               osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]);
+       osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
+
        osb->sb = sb;
        /* Save off for ocfs2_rw_direct */
        osb->s_sectsize_bits = blksize_bits(sector_size);
@@ -2130,6 +2312,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
         * lock, and it's marked as dirty, set the bit in the recover
         * map and launch a recovery thread for it. */
        status = ocfs2_mark_dead_nodes(osb);
+       if (status < 0) {
+               mlog_errno(status);
+               goto finally;
+       }
+
+       status = ocfs2_compute_replay_slots(osb);
        if (status < 0)
                mlog_errno(status);
 
index 2563df89fc2a0e60f28a2e8958ddfda525099fb4..15631019dc634561838919310b2566e263f1d8dc 100644 (file)
@@ -512,7 +512,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
                          struct ocfs2_security_xattr_info *si,
                          int *want_clusters,
                          int *xattr_credits,
-                         struct ocfs2_alloc_context **xattr_ac)
+                         int *want_meta)
 {
        int ret = 0;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -554,11 +554,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
        if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
            (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
            (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
-               ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
-               if (ret) {
-                       mlog_errno(ret);
-                       return ret;
-               }
+               *want_meta = *want_meta + 1;
                *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
        }
 
index 5a1ebc789f7eff51d8e364217171c3b0188983a8..1ca7e9a1b7bccbf4b34c270c7cc2009b53e3d370 100644 (file)
@@ -68,7 +68,7 @@ int ocfs2_calc_security_init(struct inode *,
                             int *, int *, struct ocfs2_alloc_context **);
 int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
                          int, struct ocfs2_security_xattr_info *,
-                         int *, int *, struct ocfs2_alloc_context **);
+                         int *, int *, int *);
 
 /*
  * xattrs can live inside an inode, as part of an external xattr block,
index 4ed0ba44a966ce053cdf11475333f4bdb0bb73b4..dd727d43e5b788cead6f13e85944f594791c5e14 100644 (file)
@@ -59,7 +59,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
                 */
                wait_on_page_writeback(page);
 
-               if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+               if (page_has_private(page) &&
+                   !try_to_release_page(page, GFP_KERNEL))
                        goto out_unlock;
 
                /*
index 2ba481518ba784ec82cbe7c83fbb3801f1f17186..77cb4ec919b9a7bf941a3effe524c180343d8dee 100644 (file)
@@ -287,6 +287,7 @@ int fsync_super(struct super_block *sb)
        __fsync_super(sb);
        return sync_blockdev(sb->s_bdev);
 }
+EXPORT_SYMBOL_GPL(fsync_super);
 
 /**
  *     generic_shutdown_super  -       common helper for ->kill_sb()
index 2bb788a2acb16bf24ab25a708390e58842f06116..e48e9a3af76312d683723658ed42c51db7d8a740 100644 (file)
@@ -87,12 +87,12 @@ static int read_block_bitmap(struct super_block *sb,
 {
        struct buffer_head *bh = NULL;
        int retval = 0;
-       kernel_lb_addr loc;
+       struct kernel_lb_addr loc;
 
        loc.logicalBlockNum = bitmap->s_extPosition;
        loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
 
-       bh = udf_tread(sb, udf_get_lb_pblock(sb, loc, block));
+       bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
        if (!bh)
                retval = -EIO;
 
@@ -140,27 +140,29 @@ static inline int load_block_bitmap(struct super_block *sb,
        return slot;
 }
 
-static bool udf_add_free_space(struct udf_sb_info *sbi,
-                               u16 partition, u32 cnt)
+static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
 {
+       struct udf_sb_info *sbi = UDF_SB(sb);
        struct logicalVolIntegrityDesc *lvid;
 
-       if (sbi->s_lvid_bh == NULL)
-               return false;
+       if (!sbi->s_lvid_bh)
+               return;
 
        lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
        le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
-       return true;
+       udf_updated_lvid(sb);
 }
 
 static void udf_bitmap_free_blocks(struct super_block *sb,
                                   struct inode *inode,
                                   struct udf_bitmap *bitmap,
-                                  kernel_lb_addr bloc, uint32_t offset,
+                                  struct kernel_lb_addr *bloc,
+                                  uint32_t offset,
                                   uint32_t count)
 {
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct buffer_head *bh = NULL;
+       struct udf_part_map *partmap;
        unsigned long block;
        unsigned long block_group;
        unsigned long bit;
@@ -169,17 +171,17 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
        unsigned long overflow;
 
        mutex_lock(&sbi->s_alloc_mutex);
-       if (bloc.logicalBlockNum < 0 ||
-           (bloc.logicalBlockNum + count) >
-               sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) {
+       partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+       if (bloc->logicalBlockNum < 0 ||
+           (bloc->logicalBlockNum + count) >
+               partmap->s_partition_len) {
                udf_debug("%d < %d || %d + %d > %d\n",
-                         bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
-                         sbi->s_partmaps[bloc.partitionReferenceNum].
-                                                       s_partition_len);
+                         bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
+                         count, partmap->s_partition_len);
                goto error_return;
        }
 
-       block = bloc.logicalBlockNum + offset +
+       block = bloc->logicalBlockNum + offset +
                (sizeof(struct spaceBitmapDesc) << 3);
 
        do {
@@ -207,7 +209,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
                        } else {
                                if (inode)
                                        vfs_dq_free_block(inode, 1);
-                               udf_add_free_space(sbi, sbi->s_partition, 1);
+                               udf_add_free_space(sb, sbi->s_partition, 1);
                        }
                }
                mark_buffer_dirty(bh);
@@ -218,9 +220,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
        } while (overflow);
 
 error_return:
-       sb->s_dirt = 1;
-       if (sbi->s_lvid_bh)
-               mark_buffer_dirty(sbi->s_lvid_bh);
        mutex_unlock(&sbi->s_alloc_mutex);
 }
 
@@ -277,9 +276,7 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
        } while (block_count > 0);
 
 out:
-       if (udf_add_free_space(sbi, partition, -alloc_count))
-               mark_buffer_dirty(sbi->s_lvid_bh);
-       sb->s_dirt = 1;
+       udf_add_free_space(sb, partition, -alloc_count);
        mutex_unlock(&sbi->s_alloc_mutex);
        return alloc_count;
 }
@@ -409,9 +406,7 @@ got_block:
 
        mark_buffer_dirty(bh);
 
-       if (udf_add_free_space(sbi, partition, -1))
-               mark_buffer_dirty(sbi->s_lvid_bh);
-       sb->s_dirt = 1;
+       udf_add_free_space(sb, partition, -1);
        mutex_unlock(&sbi->s_alloc_mutex);
        *err = 0;
        return newblock;
@@ -425,26 +420,28 @@ error_return:
 static void udf_table_free_blocks(struct super_block *sb,
                                  struct inode *inode,
                                  struct inode *table,
-                                 kernel_lb_addr bloc, uint32_t offset,
+                                 struct kernel_lb_addr *bloc,
+                                 uint32_t offset,
                                  uint32_t count)
 {
        struct udf_sb_info *sbi = UDF_SB(sb);
+       struct udf_part_map *partmap;
        uint32_t start, end;
        uint32_t elen;
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        struct extent_position oepos, epos;
        int8_t etype;
        int i;
        struct udf_inode_info *iinfo;
 
        mutex_lock(&sbi->s_alloc_mutex);
-       if (bloc.logicalBlockNum < 0 ||
-           (bloc.logicalBlockNum + count) >
-               sbi->s_partmaps[bloc.partitionReferenceNum].s_partition_len) {
+       partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
+       if (bloc->logicalBlockNum < 0 ||
+           (bloc->logicalBlockNum + count) >
+               partmap->s_partition_len) {
                udf_debug("%d < %d || %d + %d > %d\n",
-                         bloc.logicalBlockNum, 0, bloc.logicalBlockNum, count,
+                         bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
-                         sbi->s_partmaps[bloc.partitionReferenceNum].
-                                                       s_partition_len);
+                         partmap->s_partition_len);
                goto error_return;
        }
 
@@ -453,11 +450,10 @@ static void udf_table_free_blocks(struct super_block *sb,
           could occur, but.. oh well */
        if (inode)
                vfs_dq_free_block(inode, count);
-       if (udf_add_free_space(sbi, sbi->s_partition, count))
-               mark_buffer_dirty(sbi->s_lvid_bh);
+       udf_add_free_space(sb, sbi->s_partition, count);
 
-       start = bloc.logicalBlockNum + offset;
-       end = bloc.logicalBlockNum + offset + count - 1;
+       start = bloc->logicalBlockNum + offset;
+       end = bloc->logicalBlockNum + offset + count - 1;
 
        epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
        elen = 0;
@@ -483,7 +479,7 @@ static void udf_table_free_blocks(struct super_block *sb,
                                start += count;
                                count = 0;
                        }
-                       udf_write_aext(table, &oepos, eloc, elen, 1);
+                       udf_write_aext(table, &oepos, &eloc, elen, 1);
                } else if (eloc.logicalBlockNum == (end + 1)) {
                        if ((0x3FFFFFFF - elen) <
                                        (count << sb->s_blocksize_bits)) {
@@ -502,7 +498,7 @@ static void udf_table_free_blocks(struct super_block *sb,
                                end -= count;
                                count = 0;
                        }
-                       udf_write_aext(table, &oepos, eloc, elen, 1);
+                       udf_write_aext(table, &oepos, &eloc, elen, 1);
                }
 
                if (epos.bh != oepos.bh) {
@@ -532,8 +528,8 @@ static void udf_table_free_blocks(struct super_block *sb,
                 */
 
                int adsize;
-               short_ad *sad = NULL;
-               long_ad *lad = NULL;
+               struct short_ad *sad = NULL;
+               struct long_ad *lad = NULL;
                struct allocExtDesc *aed;
 
                eloc.logicalBlockNum = start;
@@ -541,9 +537,9 @@ static void udf_table_free_blocks(struct super_block *sb,
                        (count << sb->s_blocksize_bits);
 
                if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-                       adsize = sizeof(short_ad);
+                       adsize = sizeof(struct short_ad);
                else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-                       adsize = sizeof(long_ad);
+                       adsize = sizeof(struct long_ad);
                else {
                        brelse(oepos.bh);
                        brelse(epos.bh);
@@ -563,7 +559,7 @@ static void udf_table_free_blocks(struct super_block *sb,
                        elen -= sb->s_blocksize;
 
                        epos.bh = udf_tread(sb,
-                                       udf_get_lb_pblock(sb, epos.block, 0));
+                                       udf_get_lb_pblock(sb, &epos.block, 0));
                        if (!epos.bh) {
                                brelse(oepos.bh);
                                goto error_return;
@@ -601,15 +597,15 @@ static void udf_table_free_blocks(struct super_block *sb,
                        if (sbi->s_udfrev >= 0x0200)
                                udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
                                            3, 1, epos.block.logicalBlockNum,
-                                           sizeof(tag));
+                                           sizeof(struct tag));
                        else
                                udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
                                            2, 1, epos.block.logicalBlockNum,
-                                           sizeof(tag));
+                                           sizeof(struct tag));
 
                        switch (iinfo->i_alloc_type) {
                        case ICBTAG_FLAG_AD_SHORT:
-                               sad = (short_ad *)sptr;
+                               sad = (struct short_ad *)sptr;
                                sad->extLength = cpu_to_le32(
                                        EXT_NEXT_EXTENT_ALLOCDECS |
                                        sb->s_blocksize);
@@ -617,7 +613,7 @@ static void udf_table_free_blocks(struct super_block *sb,
                                        cpu_to_le32(epos.block.logicalBlockNum);
                                break;
                        case ICBTAG_FLAG_AD_LONG:
-                               lad = (long_ad *)sptr;
+                               lad = (struct long_ad *)sptr;
                                lad->extLength = cpu_to_le32(
                                        EXT_NEXT_EXTENT_ALLOCDECS |
                                        sb->s_blocksize);
@@ -635,7 +631,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 
                /* It's possible that stealing the block emptied the extent */
                if (elen) {
-                       udf_write_aext(table, &epos, eloc, elen, 1);
+                       udf_write_aext(table, &epos, &eloc, elen, 1);
 
                        if (!epos.bh) {
                                iinfo->i_lenAlloc += adsize;
@@ -653,7 +649,6 @@ static void udf_table_free_blocks(struct super_block *sb,
        brelse(oepos.bh);
 
 error_return:
-       sb->s_dirt = 1;
        mutex_unlock(&sbi->s_alloc_mutex);
        return;
 }
@@ -666,7 +661,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
        struct udf_sb_info *sbi = UDF_SB(sb);
        int alloc_count = 0;
        uint32_t elen, adsize;
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        struct extent_position epos;
        int8_t etype = -1;
        struct udf_inode_info *iinfo;
@@ -677,9 +672,9 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 
        iinfo = UDF_I(table);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-               adsize = sizeof(short_ad);
+               adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-               adsize = sizeof(long_ad);
+               adsize = sizeof(struct long_ad);
        else
                return 0;
 
@@ -707,7 +702,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
                        alloc_count = block_count;
                        eloc.logicalBlockNum += alloc_count;
                        elen -= (alloc_count << sb->s_blocksize_bits);
-                       udf_write_aext(table, &epos, eloc,
+                       udf_write_aext(table, &epos, &eloc,
                                        (etype << 30) | elen, 1);
                } else
                        udf_delete_aext(table, epos, eloc,
@@ -718,10 +713,8 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 
        brelse(epos.bh);
 
-       if (alloc_count && udf_add_free_space(sbi, partition, -alloc_count)) {
-               mark_buffer_dirty(sbi->s_lvid_bh);
-               sb->s_dirt = 1;
-       }
+       if (alloc_count)
+               udf_add_free_space(sb, partition, -alloc_count);
        mutex_unlock(&sbi->s_alloc_mutex);
        return alloc_count;
 }
@@ -735,7 +728,7 @@ static int udf_table_new_block(struct super_block *sb,
        uint32_t spread = 0xFFFFFFFF, nspread = 0xFFFFFFFF;
        uint32_t newblock = 0, adsize;
        uint32_t elen, goal_elen = 0;
-       kernel_lb_addr eloc, uninitialized_var(goal_eloc);
+       struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
        struct extent_position epos, goal_epos;
        int8_t etype;
        struct udf_inode_info *iinfo = UDF_I(table);
@@ -743,9 +736,9 @@ static int udf_table_new_block(struct super_block *sb,
        *err = -ENOSPC;
 
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-               adsize = sizeof(short_ad);
+               adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-               adsize = sizeof(long_ad);
+               adsize = sizeof(struct long_ad);
        else
                return newblock;
 
@@ -814,46 +807,37 @@ static int udf_table_new_block(struct super_block *sb,
        }
 
        if (goal_elen)
-               udf_write_aext(table, &goal_epos, goal_eloc, goal_elen, 1);
+               udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
        else
                udf_delete_aext(table, goal_epos, goal_eloc, goal_elen);
        brelse(goal_epos.bh);
 
-       if (udf_add_free_space(sbi, partition, -1))
-               mark_buffer_dirty(sbi->s_lvid_bh);
+       udf_add_free_space(sb, partition, -1);
 
-       sb->s_dirt = 1;
        mutex_unlock(&sbi->s_alloc_mutex);
        *err = 0;
        return newblock;
 }
 
-inline void udf_free_blocks(struct super_block *sb,
-                           struct inode *inode,
-                           kernel_lb_addr bloc, uint32_t offset,
-                           uint32_t count)
+void udf_free_blocks(struct super_block *sb, struct inode *inode,
+                    struct kernel_lb_addr *bloc, uint32_t offset,
+                    uint32_t count)
 {
-       uint16_t partition = bloc.partitionReferenceNum;
+       uint16_t partition = bloc->partitionReferenceNum;
        struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
 
        if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
-               return udf_bitmap_free_blocks(sb, inode,
-                                             map->s_uspace.s_bitmap,
-                                             bloc, offset, count);
+               udf_bitmap_free_blocks(sb, inode, map->s_uspace.s_bitmap,
+                                      bloc, offset, count);
        } else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
-               return udf_table_free_blocks(sb, inode,
-                                            map->s_uspace.s_table,
-                                            bloc, offset, count);
+               udf_table_free_blocks(sb, inode, map->s_uspace.s_table,
+                                     bloc, offset, count);
        } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
-               return udf_bitmap_free_blocks(sb, inode,
-                                             map->s_fspace.s_bitmap,
-                                             bloc, offset, count);
+               udf_bitmap_free_blocks(sb, inode, map->s_fspace.s_bitmap,
+                                      bloc, offset, count);
        } else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
-               return udf_table_free_blocks(sb, inode,
-                                            map->s_fspace.s_table,
-                                            bloc, offset, count);
-       } else {
-               return;
+               udf_table_free_blocks(sb, inode, map->s_fspace.s_table,
+                                     bloc, offset, count);
        }
 }
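
The block-allocation hunks above consistently switch struct kernel_lb_addr
arguments from pass-by-value to pass-by-pointer: udf_write_aext() now takes
&eloc, udf_free_blocks() takes &bloc, and so on. A minimal standalone sketch
of that calling convention, using the in-core struct declared later in this
diff; free_blocks_stub() is an illustrative stand-in, not the kernel's
udf_free_blocks():

#include <stdint.h>
#include <stdio.h>

/* In-core logical block address, as declared in the ecma_167.h hunks below. */
struct kernel_lb_addr {
        uint32_t logicalBlockNum;
        uint16_t partitionReferenceNum;
};

/* Illustrative stand-in; the real udf_free_blocks() is shown above. */
static void free_blocks_stub(struct kernel_lb_addr *bloc, uint32_t offset,
                             uint32_t count)
{
        printf("free %u block(s) at %u+%u in partition %u\n",
               (unsigned)count, (unsigned)bloc->logicalBlockNum,
               (unsigned)offset, (unsigned)bloc->partitionReferenceNum);
}

int main(void)
{
        struct kernel_lb_addr bloc = { .logicalBlockNum = 42,
                                       .partitionReferenceNum = 0 };

        /* Old code passed the struct by value; callers now take its address,
         * e.g. udf_free_blocks(sb, inode, &bloc, 0, 1). */
        free_blocks_stub(&bloc, 0, 1);
        return 0;
}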
 
index 62dc270c69d1addaffb0919e1d5308421a2d1200..2efd4d5291b69cd8fb6c3be7ee020cd54d15f962 100644 (file)
@@ -51,7 +51,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
        uint8_t lfi;
        loff_t size = udf_ext0_offset(dir) + dir->i_size;
        struct buffer_head *tmp, *bha[16];
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen;
        sector_t offset;
        int i, num, ret = 0;
@@ -80,13 +80,13 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
                        ret = -ENOENT;
                        goto out;
                }
-               block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+               block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
                if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
                        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-                               epos.offset -= sizeof(short_ad);
+                               epos.offset -= sizeof(struct short_ad);
                        else if (iinfo->i_alloc_type ==
                                        ICBTAG_FLAG_AD_LONG)
-                               epos.offset -= sizeof(long_ad);
+                               epos.offset -= sizeof(struct long_ad);
                } else {
                        offset = 0;
                }
@@ -101,7 +101,7 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
                        if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
                                i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
                        for (num = 0; i > 0; i--) {
-                               block = udf_get_lb_pblock(dir->i_sb, eloc, offset + i);
+                               block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
                                tmp = udf_tgetblk(dir->i_sb, block);
                                if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
                                        bha[num++] = tmp;
@@ -161,9 +161,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
                        memcpy(fname, "..", flen);
                        dt_type = DT_DIR;
                } else {
-                       kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
+                       struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
 
-                       iblock = udf_get_lb_pblock(dir->i_sb, tloc, 0);
+                       iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
                        flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
                        dt_type = DT_UNKNOWN;
                }
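
The readdir code above, like the allocation code earlier, steps its extent
cursor by sizeof(struct short_ad) or sizeof(struct long_ad) according to the
inode's allocation type. A small sketch of that size selection; the structs
are simplified stand-ins (plain integer and byte fields instead of __le32 and
struct lb_addr) for the packed on-disk layouts in ecma_167.h further down:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the packed allocation descriptors in ecma_167.h. */
struct short_ad {
        uint32_t extLength;
        uint32_t extPosition;
} __attribute__ ((packed));

struct long_ad {
        uint32_t extLength;
        uint8_t  extLocation[6];
        uint8_t  impUse[6];
} __attribute__ ((packed));

/* Stand-ins for the ICBTAG_FLAG_AD_* allocation types used above. */
enum alloc_type { ALLOC_SHORT, ALLOC_LONG, ALLOC_IN_ICB };

/* Size of one allocation descriptor for the given allocation type, or 0 when
 * the inode embeds its data in the ICB and has none -- the same selection the
 * adsize / epos.offset arithmetic above performs. */
static size_t adsize(enum alloc_type type)
{
        switch (type) {
        case ALLOC_SHORT:
                return sizeof(struct short_ad);
        case ALLOC_LONG:
                return sizeof(struct long_ad);
        default:
                return 0;
        }
}

int main(void)
{
        printf("short_ad: %zu bytes, long_ad: %zu bytes\n",
               adsize(ALLOC_SHORT), adsize(ALLOC_LONG));
        return 0;
}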
index 2820f8fcf4cc2e932b72f70cf25a7c69586546d9..1d2c570704c8f217ecf756a9b08c63457e139484 100644 (file)
@@ -20,7 +20,7 @@
 
 #if 0
 static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
-                               uint8_t ad_size, kernel_lb_addr fe_loc,
+                               uint8_t ad_size, struct kernel_lb_addr fe_loc,
                                int *pos, int *offset, struct buffer_head **bh,
                                int *error)
 {
@@ -75,7 +75,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
                                         struct udf_fileident_bh *fibh,
                                         struct fileIdentDesc *cfi,
                                         struct extent_position *epos,
-                                        kernel_lb_addr *eloc, uint32_t *elen,
+                                        struct kernel_lb_addr *eloc, uint32_t *elen,
                                         sector_t *offset)
 {
        struct fileIdentDesc *fi;
@@ -111,7 +111,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
                    (EXT_RECORDED_ALLOCATED >> 30))
                        return NULL;
 
-               block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
+               block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
 
                (*offset)++;
 
@@ -131,7 +131,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
                        if (i + *offset > (*elen >> blocksize_bits))
                                i = (*elen >> blocksize_bits)-*offset;
                        for (num = 0; i > 0; i--) {
-                               block = udf_get_lb_pblock(dir->i_sb, *eloc,
+                               block = udf_get_lb_pblock(dir->i_sb, eloc,
                                                          *offset + i);
                                tmp = udf_tgetblk(dir->i_sb, block);
                                if (tmp && !buffer_uptodate(tmp) &&
@@ -169,7 +169,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
                    (EXT_RECORDED_ALLOCATED >> 30))
                        return NULL;
 
-               block = udf_get_lb_pblock(dir->i_sb, *eloc, *offset);
+               block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
 
                (*offset)++;
 
@@ -249,9 +249,9 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
 }
 
 #if 0
-static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
+static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
 {
-       extent_ad *ext;
+       struct extent_ad *ext;
        struct fileEntry *fe;
        uint8_t *ptr;
 
@@ -274,54 +274,54 @@ static extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
        if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
                ptr += *offset;
 
-       ext = (extent_ad *)ptr;
+       ext = (struct extent_ad *)ptr;
 
-       *offset = *offset + sizeof(extent_ad);
+       *offset = *offset + sizeof(struct extent_ad);
        return ext;
 }
 #endif
 
-short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
+struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
                              int inc)
 {
-       short_ad *sa;
+       struct short_ad *sa;
 
        if ((!ptr) || (!offset)) {
                printk(KERN_ERR "udf: udf_get_fileshortad() invalidparms\n");
                return NULL;
        }
 
-       if ((*offset + sizeof(short_ad)) > maxoffset)
+       if ((*offset + sizeof(struct short_ad)) > maxoffset)
                return NULL;
        else {
-               sa = (short_ad *)ptr;
+               sa = (struct short_ad *)ptr;
                if (sa->extLength == 0)
                        return NULL;
        }
 
        if (inc)
-               *offset += sizeof(short_ad);
+               *offset += sizeof(struct short_ad);
        return sa;
 }
 
-long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
+struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
 {
-       long_ad *la;
+       struct long_ad *la;
 
        if ((!ptr) || (!offset)) {
                printk(KERN_ERR "udf: udf_get_filelongad() invalidparms\n");
                return NULL;
        }
 
-       if ((*offset + sizeof(long_ad)) > maxoffset)
+       if ((*offset + sizeof(struct long_ad)) > maxoffset)
                return NULL;
        else {
-               la = (long_ad *)ptr;
+               la = (struct long_ad *)ptr;
                if (la->extLength == 0)
                        return NULL;
        }
 
        if (inc)
-               *offset += sizeof(long_ad);
+               *offset += sizeof(struct long_ad);
        return la;
 }
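
udf_get_fileshortad() and udf_get_filelongad() above follow one guard pattern:
validate the pointers, refuse to return a descriptor that would run past
maxoffset, and only then hand the record back and advance the caller's offset.
A condensed sketch of that bounds-check-then-advance pattern for a generic
fixed-size record (the names are illustrative, not UDF API):

#include <stdint.h>
#include <stdio.h>

/* Return a pointer to the next recsize-byte record in buf, advancing *offset
 * past it, or NULL (leaving *offset untouched) if the record would overrun
 * buflen -- the same guard udf_get_fileshortad() applies above. */
static const void *get_record(const uint8_t *buf, uint32_t buflen,
                              uint32_t *offset, uint32_t recsize)
{
        const void *rec;

        if (!buf || !offset || *offset + recsize > buflen)
                return NULL;

        rec = buf + *offset;
        *offset += recsize;
        return rec;
}

int main(void)
{
        uint8_t buf[16] = { 0 };
        uint32_t off = 0;

        /* Walks exactly two 8-byte records, then stops at the boundary. */
        while (get_record(buf, sizeof(buf), &off, 8))
                printf("record at offset %u\n", (unsigned)(off - 8));
        return 0;
}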
index a0974df82b318b27d2be5e772eb5cf276f6658e1..4792b771aa8073231ee46116af2bfffe009b92a3 100644 (file)
 #define _ECMA_167_H 1
 
 /* Character set specification (ECMA 167r3 1/7.2.1) */
-typedef struct {
+struct charspec {
        uint8_t         charSetType;
        uint8_t         charSetInfo[63];
-} __attribute__ ((packed)) charspec;
+} __attribute__ ((packed));
 
 /* Character Set Type (ECMA 167r3 1/7.2.1.1) */
 #define CHARSPEC_TYPE_CS0              0x00    /* (1/7.2.2) */
@@ -57,7 +57,7 @@ typedef struct {
 typedef uint8_t                dstring;
 
 /* Timestamp (ECMA 167r3 1/7.3) */
-typedef struct {
+struct timestamp {
        __le16          typeAndTimezone;
        __le16          year;
        uint8_t         month;
@@ -68,7 +68,7 @@ typedef struct {
        uint8_t         centiseconds;
        uint8_t         hundredsOfMicroseconds;
        uint8_t         microseconds;
-} __attribute__ ((packed)) timestamp;
+} __attribute__ ((packed));
 
 /* Type and Time Zone (ECMA 167r3 1/7.3.1) */
 #define TIMESTAMP_TYPE_MASK            0xF000
@@ -78,11 +78,11 @@ typedef struct {
 #define TIMESTAMP_TIMEZONE_MASK                0x0FFF
 
 /* Entity identifier (ECMA 167r3 1/7.4) */
-typedef struct {
+struct regid {
        uint8_t         flags;
        uint8_t         ident[23];
        uint8_t         identSuffix[8];
-} __attribute__ ((packed)) regid;
+} __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 1/7.4.1) */
 #define ENTITYID_FLAGS_DIRTY           0x00
@@ -126,38 +126,38 @@ struct terminatingExtendedAreaDesc {
 
 /* Boot Descriptor (ECMA 167r3 2/9.4) */
 struct bootDesc {
-       uint8_t         structType;
-       uint8_t         stdIdent[VSD_STD_ID_LEN];
-       uint8_t         structVersion;
-       uint8_t         reserved1;
-       regid           archType;
-       regid           bootIdent;
-       __le32          bootExtLocation;
-       __le32          bootExtLength;
-       __le64          loadAddress;
-       __le64          startAddress;
-       timestamp       descCreationDateAndTime;
-       __le16          flags;
-       uint8_t         reserved2[32];
-       uint8_t         bootUse[1906];
+       uint8_t                 structType;
+       uint8_t                 stdIdent[VSD_STD_ID_LEN];
+       uint8_t                 structVersion;
+       uint8_t                 reserved1;
+       struct regid            archType;
+       struct regid            bootIdent;
+       __le32                  bootExtLocation;
+       __le32                  bootExtLength;
+       __le64                  loadAddress;
+       __le64                  startAddress;
+       struct timestamp        descCreationDateAndTime;
+       __le16                  flags;
+       uint8_t                 reserved2[32];
+       uint8_t                 bootUse[1906];
 } __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 2/9.4.12) */
 #define BOOT_FLAGS_ERASE               0x01
 
 /* Extent Descriptor (ECMA 167r3 3/7.1) */
-typedef struct {
+struct extent_ad {
        __le32          extLength;
        __le32          extLocation;
-} __attribute__ ((packed)) extent_ad;
+} __attribute__ ((packed));
 
-typedef struct {
+struct kernel_extent_ad {
        uint32_t        extLength;
        uint32_t        extLocation;
-} kernel_extent_ad;
+};
 
 /* Descriptor Tag (ECMA 167r3 3/7.2) */
-typedef struct {
+struct tag {
        __le16          tagIdent;
        __le16          descVersion;
        uint8_t         tagChecksum;
@@ -166,7 +166,7 @@ typedef struct {
        __le16          descCRC;
        __le16          descCRCLength;
        __le32          tagLocation;
-} __attribute__ ((packed)) tag;
+} __attribute__ ((packed));
 
 /* Tag Identifier (ECMA 167r3 3/7.2.1) */
 #define TAG_IDENT_PVD                  0x0001
@@ -190,28 +190,28 @@ struct NSRDesc {
 
 /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
 struct primaryVolDesc {
-       tag             descTag;
-       __le32          volDescSeqNum;
-       __le32          primaryVolDescNum;
-       dstring         volIdent[32];
-       __le16          volSeqNum;
-       __le16          maxVolSeqNum;
-       __le16          interchangeLvl;
-       __le16          maxInterchangeLvl;
-       __le32          charSetList;
-       __le32          maxCharSetList;
-       dstring         volSetIdent[128];
-       charspec        descCharSet;
-       charspec        explanatoryCharSet;
-       extent_ad       volAbstract;
-       extent_ad       volCopyright;
-       regid           appIdent;
-       timestamp       recordingDateAndTime;
-       regid           impIdent;
-       uint8_t         impUse[64];
-       __le32          predecessorVolDescSeqLocation;
-       __le16          flags;
-       uint8_t         reserved[22];
+       struct tag              descTag;
+       __le32                  volDescSeqNum;
+       __le32                  primaryVolDescNum;
+       dstring                 volIdent[32];
+       __le16                  volSeqNum;
+       __le16                  maxVolSeqNum;
+       __le16                  interchangeLvl;
+       __le16                  maxInterchangeLvl;
+       __le32                  charSetList;
+       __le32                  maxCharSetList;
+       dstring                 volSetIdent[128];
+       struct charspec         descCharSet;
+       struct charspec         explanatoryCharSet;
+       struct extent_ad        volAbstract;
+       struct extent_ad        volCopyright;
+       struct regid            appIdent;
+       struct timestamp        recordingDateAndTime;
+       struct regid            impIdent;
+       uint8_t                 impUse[64];
+       __le32                  predecessorVolDescSeqLocation;
+       __le16                  flags;
+       uint8_t                 reserved[22];
 } __attribute__ ((packed));
 
 /* Flags (ECMA 167r3 3/10.1.21) */
@@ -219,40 +219,40 @@ struct primaryVolDesc {
 
 /* Anchor Volume Descriptor Pointer (ECMA 167r3 3/10.2) */
 struct anchorVolDescPtr {
-       tag             descTag;
-       extent_ad       mainVolDescSeqExt;
-       extent_ad       reserveVolDescSeqExt;
-       uint8_t         reserved[480];
+       struct tag              descTag;
+       struct extent_ad        mainVolDescSeqExt;
+       struct extent_ad        reserveVolDescSeqExt;
+       uint8_t                 reserved[480];
 } __attribute__ ((packed));
 
 /* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
 struct volDescPtr {
-       tag             descTag;
-       __le32          volDescSeqNum;
-       extent_ad       nextVolDescSeqExt;
-       uint8_t         reserved[484];
+       struct tag              descTag;
+       __le32                  volDescSeqNum;
+       struct extent_ad        nextVolDescSeqExt;
+       uint8_t                 reserved[484];
 } __attribute__ ((packed));
 
 /* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
 struct impUseVolDesc {
-       tag             descTag;
+       struct tag      descTag;
        __le32          volDescSeqNum;
-       regid           impIdent;
+       struct regid    impIdent;
        uint8_t         impUse[460];
 } __attribute__ ((packed));
 
 /* Partition Descriptor (ECMA 167r3 3/10.5) */
 struct partitionDesc {
-       tag descTag;
+       struct tag descTag;
        __le32 volDescSeqNum;
        __le16 partitionFlags;
        __le16 partitionNumber;
-       regid partitionContents;
+       struct regid partitionContents;
        uint8_t partitionContentsUse[128];
        __le32 accessType;
        __le32 partitionStartingLocation;
        __le32 partitionLength;
-       regid impIdent;
+       struct regid impIdent;
        uint8_t impUse[128];
        uint8_t reserved[156];
 } __attribute__ ((packed));
@@ -278,19 +278,19 @@ struct partitionDesc {
 
 /* Logical Volume Descriptor (ECMA 167r3 3/10.6) */
 struct logicalVolDesc {
-       tag             descTag;
-       __le32          volDescSeqNum;
-       charspec        descCharSet;
-       dstring         logicalVolIdent[128];
-       __le32          logicalBlockSize;
-       regid           domainIdent;
-       uint8_t         logicalVolContentsUse[16];
-       __le32          mapTableLength;
-       __le32          numPartitionMaps;
-       regid           impIdent;
-       uint8_t         impUse[128];
-       extent_ad       integritySeqExt;
-       uint8_t         partitionMaps[0];
+       struct tag              descTag;
+       __le32                  volDescSeqNum;
+       struct charspec         descCharSet;
+       dstring                 logicalVolIdent[128];
+       __le32                  logicalBlockSize;
+       struct regid            domainIdent;
+       uint8_t                 logicalVolContentsUse[16];
+       __le32                  mapTableLength;
+       __le32                  numPartitionMaps;
+       struct regid            impIdent;
+       uint8_t                 impUse[128];
+       struct extent_ad        integritySeqExt;
+       uint8_t                 partitionMaps[0];
 } __attribute__ ((packed));
 
 /* Generic Partition Map (ECMA 167r3 3/10.7.1) */
@@ -322,30 +322,30 @@ struct genericPartitionMap2 {
 
 /* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
 struct unallocSpaceDesc {
-       tag             descTag;
-       __le32          volDescSeqNum;
-       __le32          numAllocDescs;
-       extent_ad       allocDescs[0];
+       struct tag              descTag;
+       __le32                  volDescSeqNum;
+       __le32                  numAllocDescs;
+       struct extent_ad        allocDescs[0];
 } __attribute__ ((packed));
 
 /* Terminating Descriptor (ECMA 167r3 3/10.9) */
 struct terminatingDesc {
-       tag             descTag;
+       struct tag      descTag;
        uint8_t         reserved[496];
 } __attribute__ ((packed));
 
 /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
 struct logicalVolIntegrityDesc {
-       tag             descTag;
-       timestamp       recordingDateAndTime;
-       __le32          integrityType;
-       extent_ad       nextIntegrityExt;
-       uint8_t         logicalVolContentsUse[32];
-       __le32          numOfPartitions;
-       __le32          lengthOfImpUse;
-       __le32          freeSpaceTable[0];
-       __le32          sizeTable[0];
-       uint8_t         impUse[0];
+       struct tag              descTag;
+       struct timestamp        recordingDateAndTime;
+       __le32                  integrityType;
+       struct extent_ad        nextIntegrityExt;
+       uint8_t                 logicalVolContentsUse[32];
+       __le32                  numOfPartitions;
+       __le32                  lengthOfImpUse;
+       __le32                  freeSpaceTable[0];
+       __le32                  sizeTable[0];
+       uint8_t                 impUse[0];
 } __attribute__ ((packed));
 
 /* Integrity Type (ECMA 167r3 3/10.10.3) */
@@ -353,50 +353,50 @@ struct logicalVolIntegrityDesc {
 #define LVID_INTEGRITY_TYPE_CLOSE      0x00000001
 
 /* Recorded Address (ECMA 167r3 4/7.1) */
-typedef struct {
+struct lb_addr {
        __le32          logicalBlockNum;
        __le16          partitionReferenceNum;
-} __attribute__ ((packed)) lb_addr;
+} __attribute__ ((packed));
 
 /* ... and its in-core analog */
-typedef struct {
+struct kernel_lb_addr {
        uint32_t                logicalBlockNum;
        uint16_t                partitionReferenceNum;
-} kernel_lb_addr;
+};
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
-typedef struct {
+struct short_ad {
         __le32         extLength;
         __le32         extPosition;
-} __attribute__ ((packed)) short_ad;
+} __attribute__ ((packed));
 
 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
-typedef struct {
+struct long_ad {
        __le32          extLength;
-       lb_addr         extLocation;
+       struct lb_addr  extLocation;
        uint8_t         impUse[6];
-} __attribute__ ((packed)) long_ad;
+} __attribute__ ((packed));
 
-typedef struct {
-       uint32_t        extLength;
-       kernel_lb_addr  extLocation;
-       uint8_t         impUse[6];
-} kernel_long_ad;
+struct kernel_long_ad {
+       uint32_t                extLength;
+       struct kernel_lb_addr   extLocation;
+       uint8_t                 impUse[6];
+};
 
 /* Extended Allocation Descriptor (ECMA 167r3 4/14.14.3) */
-typedef struct {
+struct ext_ad {
        __le32          extLength;
        __le32          recordedLength;
        __le32          informationLength;
-       lb_addr         extLocation;
-} __attribute__ ((packed)) ext_ad;
+       struct lb_addr  extLocation;
+} __attribute__ ((packed));
 
-typedef struct {
-       uint32_t        extLength;
-       uint32_t        recordedLength;
-       uint32_t        informationLength;
-       kernel_lb_addr  extLocation;
-} kernel_ext_ad;
+struct kernel_ext_ad {
+       uint32_t                extLength;
+       uint32_t                recordedLength;
+       uint32_t                informationLength;
+       struct kernel_lb_addr   extLocation;
+};
 
 /* Descriptor Tag (ECMA 167r3 4/7.2 - See 3/7.2) */
 
@@ -415,44 +415,44 @@ typedef struct {
 
 /* File Set Descriptor (ECMA 167r3 4/14.1) */
 struct fileSetDesc {
-       tag             descTag;
-       timestamp       recordingDateAndTime;
-       __le16          interchangeLvl;
-       __le16          maxInterchangeLvl;
-       __le32          charSetList;
-       __le32          maxCharSetList;
-       __le32          fileSetNum;
-       __le32          fileSetDescNum;
-       charspec        logicalVolIdentCharSet;
-       dstring         logicalVolIdent[128];
-       charspec        fileSetCharSet;
-       dstring         fileSetIdent[32];
-       dstring         copyrightFileIdent[32];
-       dstring         abstractFileIdent[32];
-       long_ad         rootDirectoryICB;
-       regid           domainIdent;
-       long_ad         nextExt;
-       long_ad         streamDirectoryICB;
-       uint8_t         reserved[32];
+       struct tag              descTag;
+       struct timestamp        recordingDateAndTime;
+       __le16                  interchangeLvl;
+       __le16                  maxInterchangeLvl;
+       __le32                  charSetList;
+       __le32                  maxCharSetList;
+       __le32                  fileSetNum;
+       __le32                  fileSetDescNum;
+       struct charspec         logicalVolIdentCharSet;
+       dstring                 logicalVolIdent[128];
+       struct charspec         fileSetCharSet;
+       dstring                 fileSetIdent[32];
+       dstring                 copyrightFileIdent[32];
+       dstring                 abstractFileIdent[32];
+       struct long_ad          rootDirectoryICB;
+       struct regid            domainIdent;
+       struct long_ad          nextExt;
+       struct long_ad          streamDirectoryICB;
+       uint8_t                 reserved[32];
 } __attribute__ ((packed));
 
 /* Partition Header Descriptor (ECMA 167r3 4/14.3) */
 struct partitionHeaderDesc {
-       short_ad        unallocSpaceTable;
-       short_ad        unallocSpaceBitmap;
-       short_ad        partitionIntegrityTable;
-       short_ad        freedSpaceTable;
-       short_ad        freedSpaceBitmap;
+       struct short_ad unallocSpaceTable;
+       struct short_ad unallocSpaceBitmap;
+       struct short_ad partitionIntegrityTable;
+       struct short_ad freedSpaceTable;
+       struct short_ad freedSpaceBitmap;
        uint8_t         reserved[88];
 } __attribute__ ((packed));
 
 /* File Identifier Descriptor (ECMA 167r3 4/14.4) */
 struct fileIdentDesc {
-       tag             descTag;
+       struct tag      descTag;
        __le16          fileVersionNum;
        uint8_t         fileCharacteristics;
        uint8_t         lengthFileIdent;
-       long_ad         icb;
+       struct long_ad  icb;
        __le16          lengthOfImpUse;
        uint8_t         impUse[0];
        uint8_t         fileIdent[0];
@@ -468,22 +468,22 @@ struct fileIdentDesc {
 
 /* Allocation Ext Descriptor (ECMA 167r3 4/14.5) */
 struct allocExtDesc {
-       tag             descTag;
+       struct tag      descTag;
        __le32          previousAllocExtLocation;
        __le32          lengthAllocDescs;
 } __attribute__ ((packed));
 
 /* ICB Tag (ECMA 167r3 4/14.6) */
-typedef struct {
+struct icbtag {
        __le32          priorRecordedNumDirectEntries;
        __le16          strategyType;
        __le16          strategyParameter;
        __le16          numEntries;
        uint8_t         reserved;
        uint8_t         fileType;
-       lb_addr         parentICBLocation;
+       struct lb_addr  parentICBLocation;
        __le16          flags;
-} __attribute__ ((packed)) icbtag;
+} __attribute__ ((packed));
 
 /* Strategy Type (ECMA 167r3 4/14.6.2) */
 #define ICBTAG_STRATEGY_TYPE_UNDEF     0x0000
@@ -528,41 +528,41 @@ typedef struct {
 
 /* Indirect Entry (ECMA 167r3 4/14.7) */
 struct indirectEntry {
-       tag             descTag;
-       icbtag          icbTag;
-       long_ad         indirectICB;
+       struct tag      descTag;
+       struct icbtag   icbTag;
+       struct long_ad  indirectICB;
 } __attribute__ ((packed));
 
 /* Terminal Entry (ECMA 167r3 4/14.8) */
 struct terminalEntry {
-       tag             descTag;
-       icbtag          icbTag;
+       struct tag      descTag;
+       struct icbtag   icbTag;
 } __attribute__ ((packed));
 
 /* File Entry (ECMA 167r3 4/14.9) */
 struct fileEntry {
-       tag             descTag;
-       icbtag          icbTag;
-       __le32          uid;
-       __le32          gid;
-       __le32          permissions;
-       __le16          fileLinkCount;
-       uint8_t         recordFormat;
-       uint8_t         recordDisplayAttr;
-       __le32          recordLength;
-       __le64          informationLength;
-       __le64          logicalBlocksRecorded;
-       timestamp       accessTime;
-       timestamp       modificationTime;
-       timestamp       attrTime;
-       __le32          checkpoint;
-       long_ad         extendedAttrICB;
-       regid           impIdent;
-       __le64          uniqueID;
-       __le32          lengthExtendedAttr;
-       __le32          lengthAllocDescs;
-       uint8_t         extendedAttr[0];
-       uint8_t         allocDescs[0];
+       struct tag              descTag;
+       struct icbtag           icbTag;
+       __le32                  uid;
+       __le32                  gid;
+       __le32                  permissions;
+       __le16                  fileLinkCount;
+       uint8_t                 recordFormat;
+       uint8_t                 recordDisplayAttr;
+       __le32                  recordLength;
+       __le64                  informationLength;
+       __le64                  logicalBlocksRecorded;
+       struct timestamp        accessTime;
+       struct timestamp        modificationTime;
+       struct timestamp        attrTime;
+       __le32                  checkpoint;
+       struct long_ad          extendedAttrICB;
+       struct regid            impIdent;
+       __le64                  uniqueID;
+       __le32                  lengthExtendedAttr;
+       __le32                  lengthAllocDescs;
+       uint8_t                 extendedAttr[0];
+       uint8_t                 allocDescs[0];
 } __attribute__ ((packed));
 
 /* Permissions (ECMA 167r3 4/14.9.5) */
@@ -604,7 +604,7 @@ struct fileEntry {
 
 /* Extended Attribute Header Descriptor (ECMA 167r3 4/14.10.1) */
 struct extendedAttrHeaderDesc {
-       tag             descTag;
+       struct tag      descTag;
        __le32          impAttrLocation;
        __le32          appAttrLocation;
 } __attribute__ ((packed));
@@ -687,7 +687,7 @@ struct impUseExtAttr {
        uint8_t         reserved[3];
        __le32          attrLength;
        __le32          impUseLength;
-       regid           impIdent;
+       struct regid    impIdent;
        uint8_t         impUse[0];
 } __attribute__ ((packed));
 
@@ -698,7 +698,7 @@ struct appUseExtAttr {
        uint8_t         reserved[3];
        __le32          attrLength;
        __le32          appUseLength;
-       regid           appIdent;
+       struct regid    appIdent;
        uint8_t         appUse[0];
 } __attribute__ ((packed));
 
@@ -712,15 +712,15 @@ struct appUseExtAttr {
 
 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */
 struct unallocSpaceEntry {
-       tag             descTag;
-       icbtag          icbTag;
+       struct tag      descTag;
+       struct icbtag   icbTag;
        __le32          lengthAllocDescs;
        uint8_t         allocDescs[0];
 } __attribute__ ((packed));
 
 /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
 struct spaceBitmapDesc {
-       tag             descTag;
+       struct tag      descTag;
        __le32          numOfBits;
        __le32          numOfBytes;
        uint8_t         bitmap[0];
@@ -728,13 +728,13 @@ struct spaceBitmapDesc {
 
 /* Partition Integrity Entry (ECMA 167r3 4/14.13) */
 struct partitionIntegrityEntry {
-       tag             descTag;
-       icbtag          icbTag;
-       timestamp       recordingDateAndTime;
-       uint8_t         integrityType;
-       uint8_t         reserved[175];
-       regid           impIdent;
-       uint8_t         impUse[256];
+       struct tag              descTag;
+       struct icbtag           icbTag;
+       struct timestamp        recordingDateAndTime;
+       uint8_t                 integrityType;
+       uint8_t                 reserved[175];
+       struct regid            impIdent;
+       uint8_t                 impUse[256];
 } __attribute__ ((packed));
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
@@ -765,32 +765,32 @@ struct pathComponent {
 
 /* File Entry (ECMA 167r3 4/14.17) */
 struct extendedFileEntry {
-       tag             descTag;
-       icbtag          icbTag;
-       __le32          uid;
-       __le32          gid;
-       __le32          permissions;
-       __le16          fileLinkCount;
-       uint8_t         recordFormat;
-       uint8_t         recordDisplayAttr;
-       __le32          recordLength;
-       __le64          informationLength;
-       __le64          objectSize;
-       __le64          logicalBlocksRecorded;
-       timestamp       accessTime;
-       timestamp       modificationTime;
-       timestamp       createTime;
-       timestamp       attrTime;
-       __le32          checkpoint;
-       __le32          reserved;
-       long_ad         extendedAttrICB;
-       long_ad         streamDirectoryICB;
-       regid           impIdent;
-       __le64          uniqueID;
-       __le32          lengthExtendedAttr;
-       __le32          lengthAllocDescs;
-       uint8_t         extendedAttr[0];
-       uint8_t         allocDescs[0];
+       struct tag              descTag;
+       struct icbtag           icbTag;
+       __le32                  uid;
+       __le32                  gid;
+       __le32                  permissions;
+       __le16                  fileLinkCount;
+       uint8_t                 recordFormat;
+       uint8_t                 recordDisplayAttr;
+       __le32                  recordLength;
+       __le64                  informationLength;
+       __le64                  objectSize;
+       __le64                  logicalBlocksRecorded;
+       struct timestamp        accessTime;
+       struct timestamp        modificationTime;
+       struct timestamp        createTime;
+       struct timestamp        attrTime;
+       __le32                  checkpoint;
+       __le32                  reserved;
+       struct long_ad          extendedAttrICB;
+       struct long_ad          streamDirectoryICB;
+       struct regid            impIdent;
+       __le64                  uniqueID;
+       __le32                  lengthExtendedAttr;
+       __le32                  lengthAllocDescs;
+       uint8_t                 extendedAttr[0];
+       uint8_t                 allocDescs[0];
 } __attribute__ ((packed));
 
 #endif /* _ECMA_167_H */
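
Everything in this header is the same mechanical conversion: the on-disk
descriptor types lose their anonymous-struct typedefs and gain plain struct
tags, presumably to match kernel style, which discourages typedefs for
structures. That is exactly why every sizeof(tag), (short_ad *) and
(long_ad *) in the .c hunks becomes sizeof(struct tag), (struct short_ad *)
and so on. In miniature, with an illustrative type name:

#include <stdint.h>

/* Before: an anonymous struct hidden behind a typedef, so call sites wrote
 * sizeof(short_ad), (short_ad *)ptr, ... */
typedef struct {
        uint32_t extLength;
        uint32_t extPosition;
} __attribute__ ((packed)) short_ad_oldstyle;           /* illustrative name */

/* After: the identical layout with a struct tag and no typedef, so call
 * sites write sizeof(struct short_ad), (struct short_ad *)ptr, ... */
struct short_ad_newstyle {                              /* illustrative name */
        uint32_t extLength;
        uint32_t extPosition;
} __attribute__ ((packed));

The byte layout does not change; only the spelling of the type at each use
site does, which is why the .c churn in this series is large but mechanical.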
index 47dbe5613f90ff456b2a52f4a8a4c8aee8f6fb12..c10fa39f97e2e7dacd56ff39c13365140dc8f577 100644 (file)
@@ -49,12 +49,11 @@ void udf_free_inode(struct inode *inode)
                        le32_add_cpu(&lvidiu->numDirs, -1);
                else
                        le32_add_cpu(&lvidiu->numFiles, -1);
-
-               mark_buffer_dirty(sbi->s_lvid_bh);
+               udf_updated_lvid(sb);
        }
        mutex_unlock(&sbi->s_alloc_mutex);
 
-       udf_free_blocks(sb, NULL, UDF_I(inode)->i_location, 0, 1);
+       udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
 struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
@@ -122,7 +121,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
                if (!(++uniqueID & 0x00000000FFFFFFFFUL))
                        uniqueID += 16;
                lvhd->uniqueID = cpu_to_le64(uniqueID);
-               mark_buffer_dirty(sbi->s_lvid_bh);
+               udf_updated_lvid(sb);
        }
        mutex_unlock(&sbi->s_alloc_mutex);
        inode->i_mode = mode;
@@ -138,7 +137,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
        iinfo->i_location.logicalBlockNum = block;
        iinfo->i_location.partitionReferenceNum =
                                dinfo->i_location.partitionReferenceNum;
-       inode->i_ino = udf_get_lb_pblock(sb, iinfo->i_location, 0);
+       inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);
        inode->i_blocks = 0;
        iinfo->i_lenEAttr = 0;
        iinfo->i_lenAlloc = 0;
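
The inode-allocation changes above (like the earlier balloc.c ones) replace
the scattered mark_buffer_dirty(sbi->s_lvid_bh) and sb->s_dirt = 1 pairs with
a single udf_updated_lvid(sb) call. That helper's body is not part of this
excerpt, so the sketch below only illustrates the shape of the refactor --
funnelling "the logical volume integrity descriptor is stale on disk" through
one function -- with hypothetical names and a guessed-at implementation:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-filesystem state; the real code keeps the LVID in a
 * buffer_head hanging off struct udf_sb_info. */
struct fs_info {
        bool lvid_dirty;
};

/* Single choke point for "the LVID needs writing back", analogous in spirit
 * to udf_updated_lvid(); the real helper is not shown here and may differ. */
static void fs_updated_lvid(struct fs_info *fs)
{
        fs->lvid_dirty = true;
}

/* Stands in for a later sync/unmount path that performs the actual write. */
static void fs_write_back(struct fs_info *fs)
{
        if (fs->lvid_dirty) {
                printf("writing LVID back\n");
                fs->lvid_dirty = false;
        }
}

int main(void)
{
        struct fs_info fs = { .lvid_dirty = false };

        fs_updated_lvid(&fs);   /* e.g. after bumping numFiles / numDirs */
        fs_write_back(&fs);
        return 0;
}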
index 30ebde490f7f5ebc3a4b79331191706c180b5a8f..e7533f7856368d17a15728c5113e816546cff4e7 100644 (file)
@@ -55,15 +55,15 @@ static int udf_alloc_i_data(struct inode *inode, size_t size);
 static struct buffer_head *inode_getblk(struct inode *, sector_t, int *,
                                        sector_t *, int *);
 static int8_t udf_insert_aext(struct inode *, struct extent_position,
-                             kernel_lb_addr, uint32_t);
+                             struct kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
-                             kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+                             struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_prealloc_extents(struct inode *, int, int,
-                                kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+                                struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_merge_extents(struct inode *,
-                             kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+                             struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
 static void udf_update_extents(struct inode *,
-                              kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
+                              struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
                               struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
@@ -200,7 +200,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
 {
        int newblock;
        struct buffer_head *dbh = NULL;
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen;
        uint8_t alloctype;
        struct extent_position epos;
@@ -281,7 +281,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
        epos.bh = NULL;
        epos.block = iinfo->i_location;
        epos.offset = udf_file_entry_alloc_offset(inode);
-       udf_add_aext(inode, &epos, eloc, elen, 0);
+       udf_add_aext(inode, &epos, &eloc, elen, 0);
        /* UniqueID stuff */
 
        brelse(epos.bh);
@@ -359,12 +359,12 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
 
 /* Extend the file by 'blocks' blocks, return the number of extents added */
 int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
-                   kernel_long_ad *last_ext, sector_t blocks)
+                   struct kernel_long_ad *last_ext, sector_t blocks)
 {
        sector_t add;
        int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
        struct super_block *sb = inode->i_sb;
-       kernel_lb_addr prealloc_loc = {};
+       struct kernel_lb_addr prealloc_loc = {};
        int prealloc_len = 0;
        struct udf_inode_info *iinfo;
 
@@ -411,11 +411,11 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
        }
 
        if (fake) {
-               udf_add_aext(inode, last_pos, last_ext->extLocation,
+               udf_add_aext(inode, last_pos, &last_ext->extLocation,
                             last_ext->extLength, 1);
                count++;
        } else
-               udf_write_aext(inode, last_pos, last_ext->extLocation,
+               udf_write_aext(inode, last_pos, &last_ext->extLocation,
                                last_ext->extLength, 1);
 
        /* Managed to do everything necessary? */
@@ -432,7 +432,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
        /* Create enough extents to cover the whole hole */
        while (blocks > add) {
                blocks -= add;
-               if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+               if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
                                 last_ext->extLength, 1) == -1)
                        return -1;
                count++;
@@ -440,7 +440,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
        if (blocks) {
                last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
                        (blocks << sb->s_blocksize_bits);
-               if (udf_add_aext(inode, last_pos, last_ext->extLocation,
+               if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
                                 last_ext->extLength, 1) == -1)
                        return -1;
                count++;
@@ -449,7 +449,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 out:
        /* Do we have some preallocated blocks saved? */
        if (prealloc_len) {
-               if (udf_add_aext(inode, last_pos, prealloc_loc,
+               if (udf_add_aext(inode, last_pos, &prealloc_loc,
                                 prealloc_len, 1) == -1)
                        return -1;
                last_ext->extLocation = prealloc_loc;
@@ -459,9 +459,9 @@ out:
 
        /* last_pos should point to the last written extent... */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-               last_pos->offset -= sizeof(short_ad);
+               last_pos->offset -= sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-               last_pos->offset -= sizeof(long_ad);
+               last_pos->offset -= sizeof(struct long_ad);
        else
                return -1;
 
@@ -473,11 +473,11 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 {
        static sector_t last_block;
        struct buffer_head *result = NULL;
-       kernel_long_ad laarr[EXTENT_MERGE_SIZE];
+       struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
        struct extent_position prev_epos, cur_epos, next_epos;
        int count = 0, startnum = 0, endnum = 0;
        uint32_t elen = 0, tmpelen;
-       kernel_lb_addr eloc, tmpeloc;
+       struct kernel_lb_addr eloc, tmpeloc;
        int c = 1;
        loff_t lbcount = 0, b_off = 0;
        uint32_t newblocknum, newblock;
@@ -550,12 +550,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
                        elen = EXT_RECORDED_ALLOCATED |
                                ((elen + inode->i_sb->s_blocksize - 1) &
                                 ~(inode->i_sb->s_blocksize - 1));
-                       etype = udf_write_aext(inode, &cur_epos, eloc, elen, 1);
+                       etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
                }
                brelse(prev_epos.bh);
                brelse(cur_epos.bh);
                brelse(next_epos.bh);
-               newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+               newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
                *phys = newblock;
                return NULL;
        }
@@ -572,7 +572,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
                } else {
                        /* Create a fake extent when there's not one */
                        memset(&laarr[0].extLocation, 0x00,
-                               sizeof(kernel_lb_addr));
+                               sizeof(struct kernel_lb_addr));
                        laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
                        /* Will udf_extend_file() create real extent from
                           a fake one? */
@@ -602,7 +602,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
                        laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
                                inode->i_sb->s_blocksize;
                        memset(&laarr[c].extLocation, 0x00,
-                               sizeof(kernel_lb_addr));
+                               sizeof(struct kernel_lb_addr));
                        count++;
                        endnum++;
                }
@@ -699,7 +699,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 
 static void udf_split_extents(struct inode *inode, int *c, int offset,
                              int newblocknum,
-                             kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+                             struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
                              int *endnum)
 {
        unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -726,7 +726,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
                if (offset) {
                        if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
                                udf_free_blocks(inode->i_sb, inode,
-                                               laarr[curr].extLocation,
+                                               &laarr[curr].extLocation,
                                                0, offset);
                                laarr[curr].extLength =
                                        EXT_NOT_RECORDED_NOT_ALLOCATED |
@@ -763,7 +763,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 }
 
 static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
-                                kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+                                struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
                                 int *endnum)
 {
        int start, length = 0, currlength = 0, i;
@@ -817,7 +817,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
                                         inode->i_sb->s_blocksize_bits);
                        else {
                                memmove(&laarr[c + 2], &laarr[c + 1],
-                                       sizeof(long_ad) * (*endnum - (c + 1)));
+                                       sizeof(struct long_ad) * (*endnum - (c + 1)));
                                (*endnum)++;
                                laarr[c + 1].extLocation.logicalBlockNum = next;
                                laarr[c + 1].extLocation.partitionReferenceNum =
@@ -846,7 +846,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
                                        if (*endnum > (i + 1))
                                                memmove(&laarr[i],
                                                        &laarr[i + 1],
-                                                       sizeof(long_ad) *
+                                                       sizeof(struct long_ad) *
                                                        (*endnum - (i + 1)));
                                        i--;
                                        (*endnum)--;
@@ -859,7 +859,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 }
 
 static void udf_merge_extents(struct inode *inode,
-                             kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+                             struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
                              int *endnum)
 {
        int i;
@@ -867,8 +867,8 @@ static void udf_merge_extents(struct inode *inode,
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 
        for (i = 0; i < (*endnum - 1); i++) {
-               kernel_long_ad *li /*l[i]*/ = &laarr[i];
-               kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
+               struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
+               struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];
 
                if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
                        (((li->extLength >> 30) ==
@@ -902,7 +902,7 @@ static void udf_merge_extents(struct inode *inode,
                                         blocksize - 1) & ~(blocksize - 1));
                                if (*endnum > (i + 2))
                                        memmove(&laarr[i + 1], &laarr[i + 2],
-                                               sizeof(long_ad) *
+                                               sizeof(struct long_ad) *
                                                (*endnum - (i + 2)));
                                i--;
                                (*endnum)--;
@@ -911,7 +911,7 @@ static void udf_merge_extents(struct inode *inode,
                                (EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
                           ((lip1->extLength >> 30) ==
                                (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
-                       udf_free_blocks(inode->i_sb, inode, li->extLocation, 0,
+                       udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
                                        ((li->extLength &
                                          UDF_EXTENT_LENGTH_MASK) +
                                         blocksize - 1) >> blocksize_bits);
@@ -937,7 +937,7 @@ static void udf_merge_extents(struct inode *inode,
                                          blocksize - 1) & ~(blocksize - 1));
                                if (*endnum > (i + 2))
                                        memmove(&laarr[i + 1], &laarr[i + 2],
-                                               sizeof(long_ad) *
+                                               sizeof(struct long_ad) *
                                                (*endnum - (i + 2)));
                                i--;
                                (*endnum)--;
@@ -945,7 +945,7 @@ static void udf_merge_extents(struct inode *inode,
                } else if ((li->extLength >> 30) ==
                                        (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
                        udf_free_blocks(inode->i_sb, inode,
-                                       li->extLocation, 0,
+                                       &li->extLocation, 0,
                                        ((li->extLength &
                                                UDF_EXTENT_LENGTH_MASK) +
                                         blocksize - 1) >> blocksize_bits);
@@ -959,12 +959,12 @@ static void udf_merge_extents(struct inode *inode,
 }
 
 static void udf_update_extents(struct inode *inode,
-                              kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+                              struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
                               int startnum, int endnum,
                               struct extent_position *epos)
 {
        int start = 0, i;
-       kernel_lb_addr tmploc;
+       struct kernel_lb_addr tmploc;
        uint32_t tmplen;
 
        if (startnum > endnum) {
@@ -983,7 +983,7 @@ static void udf_update_extents(struct inode *inode,
 
        for (i = start; i < endnum; i++) {
                udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
-               udf_write_aext(inode, epos, laarr[i].extLocation,
+               udf_write_aext(inode, epos, &laarr[i].extLocation,
                               laarr[i].extLength, 1);
        }
 }
@@ -1076,7 +1076,7 @@ static void __udf_read_inode(struct inode *inode)
         *      i_nlink = 1
         *      i_op = NULL;
         */
-       bh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 0, &ident);
+       bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
        if (!bh) {
                printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n",
                       inode->i_ino);
@@ -1098,24 +1098,24 @@ static void __udf_read_inode(struct inode *inode)
        if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
                struct buffer_head *ibh;
 
-               ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
+               ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
                                        &ident);
                if (ident == TAG_IDENT_IE && ibh) {
                        struct buffer_head *nbh = NULL;
-                       kernel_lb_addr loc;
+                       struct kernel_lb_addr loc;
                        struct indirectEntry *ie;
 
                        ie = (struct indirectEntry *)ibh->b_data;
                        loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
                        if (ie->indirectICB.extLength &&
-                               (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
+                               (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
                                                        &ident))) {
                                if (ident == TAG_IDENT_FE ||
                                        ident == TAG_IDENT_EFE) {
                                        memcpy(&iinfo->i_location,
                                                &loc,
-                                               sizeof(kernel_lb_addr));
+                                               sizeof(struct kernel_lb_addr));
                                        brelse(bh);
                                        brelse(ibh);
                                        brelse(nbh);
@@ -1222,8 +1222,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
        inode->i_size = le64_to_cpu(fe->informationLength);
        iinfo->i_lenExtents = inode->i_size;
 
-       inode->i_mode = udf_convert_permissions(fe);
-       inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask;
+       if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
+                       sbi->s_fmode != UDF_INVALID_MODE)
+               inode->i_mode = sbi->s_fmode;
+       else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
+                       sbi->s_dmode != UDF_INVALID_MODE)
+               inode->i_mode = sbi->s_dmode;
+       else
+               inode->i_mode = udf_convert_permissions(fe);
+       inode->i_mode &= ~sbi->s_umask;
 
        if (iinfo->i_efe == 0) {
                inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
@@ -1396,7 +1403,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 
        bh = udf_tread(inode->i_sb,
                        udf_get_lb_pblock(inode->i_sb,
-                                         iinfo->i_location, 0));
+                                         &iinfo->i_location, 0));
        if (!bh) {
                udf_debug("bread failure\n");
                return -EIO;
@@ -1416,13 +1423,13 @@ static int udf_update_inode(struct inode *inode, int do_sync)
                       iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
                                        sizeof(struct unallocSpaceEntry));
                crclen = sizeof(struct unallocSpaceEntry) +
-                               iinfo->i_lenAlloc - sizeof(tag);
+                               iinfo->i_lenAlloc - sizeof(struct tag);
                use->descTag.tagLocation = cpu_to_le32(
                                                iinfo->i_location.
                                                        logicalBlockNum);
                use->descTag.descCRCLength = cpu_to_le16(crclen);
                use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
-                                                          sizeof(tag),
+                                                          sizeof(struct tag),
                                                           crclen));
                use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
 
@@ -1459,23 +1466,23 @@ static int udf_update_inode(struct inode *inode, int do_sync)
        fe->informationLength = cpu_to_le64(inode->i_size);
 
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
-               regid *eid;
+               struct regid *eid;
                struct deviceSpec *dsea =
                        (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
                if (!dsea) {
                        dsea = (struct deviceSpec *)
                                udf_add_extendedattr(inode,
                                                     sizeof(struct deviceSpec) +
-                                                    sizeof(regid), 12, 0x3);
+                                                    sizeof(struct regid), 12, 0x3);
                        dsea->attrType = cpu_to_le32(12);
                        dsea->attrSubtype = 1;
                        dsea->attrLength = cpu_to_le32(
                                                sizeof(struct deviceSpec) +
-                                               sizeof(regid));
-                       dsea->impUseLength = cpu_to_le32(sizeof(regid));
+                                               sizeof(struct regid));
+                       dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
                }
-               eid = (regid *)dsea->impUse;
-               memset(eid, 0, sizeof(regid));
+               eid = (struct regid *)dsea->impUse;
+               memset(eid, 0, sizeof(struct regid));
                strcpy(eid->ident, UDF_ID_DEVELOPER);
                eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
                eid->identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1494,7 +1501,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
                udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
                udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
                udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
-               memset(&(fe->impIdent), 0, sizeof(regid));
+               memset(&(fe->impIdent), 0, sizeof(struct regid));
                strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
                fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
                fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1533,7 +1540,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
                udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
                udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
 
-               memset(&(efe->impIdent), 0, sizeof(regid));
+               memset(&(efe->impIdent), 0, sizeof(struct regid));
                strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
                efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
                efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1584,9 +1591,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
        fe->descTag.tagLocation = cpu_to_le32(
                                        iinfo->i_location.logicalBlockNum);
        crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
-                                                               sizeof(tag);
+                                                               sizeof(struct tag);
        fe->descTag.descCRCLength = cpu_to_le16(crclen);
-       fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
+       fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
                                                  crclen));
        fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
 
@@ -1606,7 +1613,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
        return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
+struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
 {
        unsigned long block = udf_get_lb_pblock(sb, ino, 0);
        struct inode *inode = iget_locked(sb, block);
@@ -1615,7 +1622,7 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
                return NULL;
 
        if (inode->i_state & I_NEW) {
-               memcpy(&UDF_I(inode)->i_location, &ino, sizeof(kernel_lb_addr));
+               memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
                __udf_read_inode(inode);
                unlock_new_inode(inode);
        }
@@ -1623,10 +1630,10 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
        if (is_bad_inode(inode))
                goto out_iput;
 
-       if (ino.logicalBlockNum >= UDF_SB(sb)->
-                       s_partmaps[ino.partitionReferenceNum].s_partition_len) {
+       if (ino->logicalBlockNum >= UDF_SB(sb)->
+                       s_partmaps[ino->partitionReferenceNum].s_partition_len) {
                udf_debug("block=%d, partition=%d out of range\n",
-                         ino.logicalBlockNum, ino.partitionReferenceNum);
+                         ino->logicalBlockNum, ino->partitionReferenceNum);
                make_bad_inode(inode);
                goto out_iput;
        }
@@ -1639,11 +1646,11 @@ struct inode *udf_iget(struct super_block *sb, kernel_lb_addr ino)
 }
 
 int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
-                   kernel_lb_addr eloc, uint32_t elen, int inc)
+                   struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
        int adsize;
-       short_ad *sad = NULL;
-       long_ad *lad = NULL;
+       struct short_ad *sad = NULL;
+       struct long_ad *lad = NULL;
        struct allocExtDesc *aed;
        int8_t etype;
        uint8_t *ptr;
@@ -1657,9 +1664,9 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
                ptr = epos->bh->b_data + epos->offset;
 
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-               adsize = sizeof(short_ad);
+               adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-               adsize = sizeof(long_ad);
+               adsize = sizeof(struct long_ad);
        else
                return -1;
 
@@ -1667,7 +1674,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
                char *sptr, *dptr;
                struct buffer_head *nbh;
                int err, loffset;
-               kernel_lb_addr obloc = epos->block;
+               struct kernel_lb_addr obloc = epos->block;
 
                epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
                                                obloc.partitionReferenceNum,
@@ -1675,7 +1682,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
                if (!epos->block.logicalBlockNum)
                        return -1;
                nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
-                                                                epos->block,
+                                                                &epos->block,
                                                                 0));
                if (!nbh)
                        return -1;
@@ -1712,20 +1719,20 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
                }
                if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
                        udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-                                   epos->block.logicalBlockNum, sizeof(tag));
+                                   epos->block.logicalBlockNum, sizeof(struct tag));
                else
                        udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-                                   epos->block.logicalBlockNum, sizeof(tag));
+                                   epos->block.logicalBlockNum, sizeof(struct tag));
                switch (iinfo->i_alloc_type) {
                case ICBTAG_FLAG_AD_SHORT:
-                       sad = (short_ad *)sptr;
+                       sad = (struct short_ad *)sptr;
                        sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
                                                     inode->i_sb->s_blocksize);
                        sad->extPosition =
                                cpu_to_le32(epos->block.logicalBlockNum);
                        break;
                case ICBTAG_FLAG_AD_LONG:
-                       lad = (long_ad *)sptr;
+                       lad = (struct long_ad *)sptr;
                        lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
                                                     inode->i_sb->s_blocksize);
                        lad->extLocation = cpu_to_lelb(epos->block);
@@ -1769,12 +1776,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
-                     kernel_lb_addr eloc, uint32_t elen, int inc)
+                     struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
        int adsize;
        uint8_t *ptr;
-       short_ad *sad;
-       long_ad *lad;
+       struct short_ad *sad;
+       struct long_ad *lad;
        struct udf_inode_info *iinfo = UDF_I(inode);
 
        if (!epos->bh)
@@ -1786,17 +1793,17 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 
        switch (iinfo->i_alloc_type) {
        case ICBTAG_FLAG_AD_SHORT:
-               sad = (short_ad *)ptr;
+               sad = (struct short_ad *)ptr;
                sad->extLength = cpu_to_le32(elen);
-               sad->extPosition = cpu_to_le32(eloc.logicalBlockNum);
-               adsize = sizeof(short_ad);
+               sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
+               adsize = sizeof(struct short_ad);
                break;
        case ICBTAG_FLAG_AD_LONG:
-               lad = (long_ad *)ptr;
+               lad = (struct long_ad *)ptr;
                lad->extLength = cpu_to_le32(elen);
-               lad->extLocation = cpu_to_lelb(eloc);
+               lad->extLocation = cpu_to_lelb(*eloc);
                memset(lad->impUse, 0x00, sizeof(lad->impUse));
-               adsize = sizeof(long_ad);
+               adsize = sizeof(struct long_ad);
                break;
        default:
                return -1;
@@ -1823,7 +1830,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
-                    kernel_lb_addr *eloc, uint32_t *elen, int inc)
+                    struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
        int8_t etype;
 
@@ -1833,7 +1840,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
                epos->block = *eloc;
                epos->offset = sizeof(struct allocExtDesc);
                brelse(epos->bh);
-               block = udf_get_lb_pblock(inode->i_sb, epos->block, 0);
+               block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
                epos->bh = udf_tread(inode->i_sb, block);
                if (!epos->bh) {
                        udf_debug("reading block %d failed!\n", block);
@@ -1845,13 +1852,13 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
 }
 
 int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
-                       kernel_lb_addr *eloc, uint32_t *elen, int inc)
+                       struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
 {
        int alen;
        int8_t etype;
        uint8_t *ptr;
-       short_ad *sad;
-       long_ad *lad;
+       struct short_ad *sad;
+       struct long_ad *lad;
        struct udf_inode_info *iinfo = UDF_I(inode);
 
        if (!epos->bh) {
@@ -1900,9 +1907,9 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
 }
 
 static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
-                             kernel_lb_addr neloc, uint32_t nelen)
+                             struct kernel_lb_addr neloc, uint32_t nelen)
 {
-       kernel_lb_addr oeloc;
+       struct kernel_lb_addr oeloc;
        uint32_t oelen;
        int8_t etype;
 
@@ -1910,18 +1917,18 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
                get_bh(epos.bh);
 
        while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) {
-               udf_write_aext(inode, &epos, neloc, nelen, 1);
+               udf_write_aext(inode, &epos, &neloc, nelen, 1);
                neloc = oeloc;
                nelen = (etype << 30) | oelen;
        }
-       udf_add_aext(inode, &epos, neloc, nelen, 1);
+       udf_add_aext(inode, &epos, &neloc, nelen, 1);
        brelse(epos.bh);
 
        return (nelen >> 30);
 }
 
 int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
-                      kernel_lb_addr eloc, uint32_t elen)
+                      struct kernel_lb_addr eloc, uint32_t elen)
 {
        struct extent_position oepos;
        int adsize;
@@ -1936,9 +1943,9 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 
        iinfo = UDF_I(inode);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-               adsize = sizeof(short_ad);
+               adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-               adsize = sizeof(long_ad);
+               adsize = sizeof(struct long_ad);
        else
                adsize = 0;
 
@@ -1947,7 +1954,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
                return -1;
 
        while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
-               udf_write_aext(inode, &oepos, eloc, (etype << 30) | elen, 1);
+               udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
                if (oepos.bh != epos.bh) {
                        oepos.block = epos.block;
                        brelse(oepos.bh);
@@ -1956,13 +1963,13 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
                        oepos.offset = epos.offset - adsize;
                }
        }
-       memset(&eloc, 0x00, sizeof(kernel_lb_addr));
+       memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
        elen = 0;
 
        if (epos.bh != oepos.bh) {
-               udf_free_blocks(inode->i_sb, inode, epos.block, 0, 1);
-               udf_write_aext(inode, &oepos, eloc, elen, 1);
-               udf_write_aext(inode, &oepos, eloc, elen, 1);
+               udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
+               udf_write_aext(inode, &oepos, &eloc, elen, 1);
+               udf_write_aext(inode, &oepos, &eloc, elen, 1);
                if (!oepos.bh) {
                        iinfo->i_lenAlloc -= (adsize * 2);
                        mark_inode_dirty(inode);
@@ -1979,7 +1986,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
                        mark_buffer_dirty_inode(oepos.bh, inode);
                }
        } else {
-               udf_write_aext(inode, &oepos, eloc, elen, 1);
+               udf_write_aext(inode, &oepos, &eloc, elen, 1);
                if (!oepos.bh) {
                        iinfo->i_lenAlloc -= adsize;
                        mark_inode_dirty(inode);
@@ -2004,7 +2011,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
 }
 
 int8_t inode_bmap(struct inode *inode, sector_t block,
-                 struct extent_position *pos, kernel_lb_addr *eloc,
+                 struct extent_position *pos, struct kernel_lb_addr *eloc,
                  uint32_t *elen, sector_t *offset)
 {
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -2036,7 +2043,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 
 long udf_block_map(struct inode *inode, sector_t block)
 {
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen;
        sector_t offset;
        struct extent_position epos = {};
@@ -2046,7 +2053,7 @@ long udf_block_map(struct inode *inode, sector_t block)
 
        if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) ==
                                                (EXT_RECORDED_ALLOCATED >> 30))
-               ret = udf_get_lb_pblock(inode->i_sb, eloc, offset);
+               ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
        else
                ret = 0;
 
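The udf_fill_inode() hunk above (@@ -1222,8 +1222,15 @@) makes the new mode= and dmode= mount options take precedence over the permissions decoded from the on-disk file entry, with the umask option still applied last. A minimal userspace sketch of that selection order follows; pick_mode() and its parameters are hypothetical stand-ins for the kernel's udf_sb_info fields, and the INVALID_MODE value is only illustrative.

#include <stdio.h>
#include <sys/types.h>

/* Stand-in for the patch's UDF_INVALID_MODE sentinel ("option not given");
 * the value used here is illustrative, not taken from the kernel headers. */
#define INVALID_MODE ((mode_t)-1)

/* Mirrors the if/else chain added to udf_fill_inode(): files honour the
 * mode= option, directories honour dmode=, anything else falls back to the
 * permissions read from the file entry; umask= is applied last. */
static mode_t pick_mode(int is_dir, mode_t fmode, mode_t dmode,
                        mode_t umask_opt, mode_t fe_perms)
{
        mode_t mode;

        if (!is_dir && fmode != INVALID_MODE)
                mode = fmode;
        else if (is_dir && dmode != INVALID_MODE)
                mode = dmode;
        else
                mode = fe_perms;

        return mode & ~umask_opt;
}

int main(void)
{
        /* e.g. a mount with mode=0644,dmode=0755,umask=022 */
        printf("file: %o\n", (unsigned int)pick_mode(0, 0644, 0755, 022, 0777));
        printf("dir:  %o\n", (unsigned int)pick_mode(1, 0644, 0755, 022, 0777));
        return 0;
}

Built with any C compiler, the sketch prints 644 for a file and 755 for a directory once the 022 umask is applied.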
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 84bf0fd4a4f1e24a3ff70f0e8de32fb514feb6e1..9215700c00a4448eedd1e5306cb2c3410f3f4b45 100644 (file)
@@ -134,10 +134,10 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
                        }
                }
                /* rewrite CRC + checksum of eahd */
-               crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
+               crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
                eahd->descTag.descCRCLength = cpu_to_le16(crclen);
                eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
-                                               sizeof(tag), crclen));
+                                               sizeof(struct tag), crclen));
                eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
                iinfo->i_lenEAttr += size;
                return (struct genericFormat *)&ea[offset];
@@ -202,7 +202,7 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
 struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
                                    uint32_t location, uint16_t *ident)
 {
-       tag *tag_p;
+       struct tag *tag_p;
        struct buffer_head *bh = NULL;
 
        /* Read the block */
@@ -216,7 +216,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
                return NULL;
        }
 
-       tag_p = (tag *)(bh->b_data);
+       tag_p = (struct tag *)(bh->b_data);
 
        *ident = le16_to_cpu(tag_p->tagIdent);
 
@@ -241,9 +241,9 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
        }
 
        /* Verify the descriptor CRC */
-       if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
+       if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize ||
            le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
-                                       bh->b_data + sizeof(tag),
+                                       bh->b_data + sizeof(struct tag),
                                        le16_to_cpu(tag_p->descCRCLength)))
                return bh;
 
@@ -255,27 +255,28 @@ error_out:
        return NULL;
 }
 
-struct buffer_head *udf_read_ptagged(struct super_block *sb, kernel_lb_addr loc,
+struct buffer_head *udf_read_ptagged(struct super_block *sb,
+                                    struct kernel_lb_addr *loc,
                                     uint32_t offset, uint16_t *ident)
 {
        return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset),
-                              loc.logicalBlockNum + offset, ident);
+                              loc->logicalBlockNum + offset, ident);
 }
 
 void udf_update_tag(char *data, int length)
 {
-       tag *tptr = (tag *)data;
-       length -= sizeof(tag);
+       struct tag *tptr = (struct tag *)data;
+       length -= sizeof(struct tag);
 
        tptr->descCRCLength = cpu_to_le16(length);
-       tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
+       tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length));
        tptr->tagChecksum = udf_tag_checksum(tptr);
 }
 
 void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
                 uint32_t loc, int length)
 {
-       tag *tptr = (tag *)data;
+       struct tag *tptr = (struct tag *)data;
        tptr->tagIdent = cpu_to_le16(ident);
        tptr->descVersion = cpu_to_le16(version);
        tptr->tagSerialNum = cpu_to_le16(snum);
@@ -283,12 +284,12 @@ void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
        udf_update_tag(data, length);
 }
 
-u8 udf_tag_checksum(const tag *t)
+u8 udf_tag_checksum(const struct tag *t)
 {
        u8 *data = (u8 *)t;
        u8 checksum = 0;
        int i;
-       for (i = 0; i < sizeof(tag); ++i)
+       for (i = 0; i < sizeof(struct tag); ++i)
                if (i != 4) /* position of checksum */
                        checksum += data[i];
        return checksum;
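The udf_tag_checksum() hunk above only changes the type spelling from the old tag typedef to struct tag; the logic is still a byte-wise sum over the descriptor tag that skips the byte at offset 4, where the checksum itself is stored. A small stand-alone illustration follows; the 16-byte tag size matches ECMA-167's descriptor tag (and sizeof(struct tag) in this code) but is stated here as an assumption rather than read from the hunk.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TAG_SIZE 16          /* assumed size of the on-disk descriptor tag */
#define CSUM_OFF 4           /* the byte the loop above skips (i != 4) */

/* Sum every tag byte except the checksum byte itself. */
static uint8_t tag_checksum(const uint8_t *tag)
{
        uint8_t sum = 0;
        int i;

        for (i = 0; i < TAG_SIZE; i++)
                if (i != CSUM_OFF)
                        sum += tag[i];
        return sum;
}

int main(void)
{
        uint8_t tag[TAG_SIZE];

        memset(tag, 0, sizeof(tag));
        tag[0] = 0x02;                    /* e.g. TAG_IDENT_AVDP is 2 */
        tag[CSUM_OFF] = tag_checksum(tag);
        printf("checksum byte = 0x%02x\n", tag[CSUM_OFF]);
        return 0;
}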
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f84bfaa8d941bf2ba6b60793ea34d081c9c2fb2d..6a29fa34c478490cc45732af39072e78fee180e0 100644 (file)
@@ -47,7 +47,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
                 struct fileIdentDesc *sfi, struct udf_fileident_bh *fibh,
                 uint8_t *impuse, uint8_t *fileident)
 {
-       uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(tag);
+       uint16_t crclen = fibh->eoffset - fibh->soffset - sizeof(struct tag);
        uint16_t crc;
        int offset;
        uint16_t liu = le16_to_cpu(cfi->lengthOfImpUse);
@@ -99,18 +99,18 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
                memset(fibh->ebh->b_data, 0x00, padlen + offset);
        }
 
-       crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
-                     sizeof(struct fileIdentDesc) - sizeof(tag));
+       crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(struct tag),
+                     sizeof(struct fileIdentDesc) - sizeof(struct tag));
 
        if (fibh->sbh == fibh->ebh) {
                crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
-                             crclen + sizeof(tag) -
+                             crclen + sizeof(struct tag) -
                              sizeof(struct fileIdentDesc));
        } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
                crc = crc_itu_t(crc, fibh->ebh->b_data +
                                        sizeof(struct fileIdentDesc) +
                                        fibh->soffset,
-                             crclen + sizeof(tag) -
+                             crclen + sizeof(struct tag) -
                                        sizeof(struct fileIdentDesc));
        } else {
                crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
@@ -154,7 +154,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
        uint8_t lfi;
        uint16_t liu;
        loff_t size;
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen;
        sector_t offset;
        struct extent_position epos = {};
@@ -171,12 +171,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
                if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
                    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
                        goto out_err;
-               block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+               block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
                if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
                        if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-                               epos.offset -= sizeof(short_ad);
+                               epos.offset -= sizeof(struct short_ad);
                        else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-                               epos.offset -= sizeof(long_ad);
+                               epos.offset -= sizeof(struct long_ad);
                } else
                        offset = 0;
 
@@ -268,7 +268,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #ifdef UDF_RECOVERY
        /* temporary shorthand for specifying files by inode number */
        if (!strncmp(dentry->d_name.name, ".B=", 3)) {
-               kernel_lb_addr lb = {
+               struct kernel_lb_addr lb = {
                        .logicalBlockNum = 0,
                        .partitionReferenceNum =
                                simple_strtoul(dentry->d_name.name + 3,
@@ -283,11 +283,14 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 #endif /* UDF_RECOVERY */
 
        if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+               struct kernel_lb_addr loc;
+
                if (fibh.sbh != fibh.ebh)
                        brelse(fibh.ebh);
                brelse(fibh.sbh);
 
-               inode = udf_iget(dir->i_sb, lelb_to_cpu(cfi.icb.extLocation));
+               loc = lelb_to_cpu(cfi.icb.extLocation);
+               inode = udf_iget(dir->i_sb, &loc);
                if (!inode) {
                        unlock_kernel();
                        return ERR_PTR(-EACCES);
@@ -313,7 +316,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
        uint8_t lfi;
        uint16_t liu;
        int block;
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen = 0;
        sector_t offset;
        struct extent_position epos = {};
@@ -351,16 +354,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
                if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
                    &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
                        block = udf_get_lb_pblock(dir->i_sb,
-                                       dinfo->i_location, 0);
+                                       &dinfo->i_location, 0);
                        fibh->soffset = fibh->eoffset = sb->s_blocksize;
                        goto add;
                }
-               block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+               block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
                if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
                        if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-                               epos.offset -= sizeof(short_ad);
+                               epos.offset -= sizeof(struct short_ad);
                        else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-                               epos.offset -= sizeof(long_ad);
+                               epos.offset -= sizeof(struct long_ad);
                } else
                        offset = 0;
 
@@ -409,10 +412,10 @@ add:
        if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && elen) {
                elen = (elen + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1);
                if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-                       epos.offset -= sizeof(short_ad);
+                       epos.offset -= sizeof(struct short_ad);
                else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-                       epos.offset -= sizeof(long_ad);
-               udf_write_aext(dir, &epos, eloc, elen, 1);
+                       epos.offset -= sizeof(struct long_ad);
+               udf_write_aext(dir, &epos, &eloc, elen, 1);
        }
        f_pos += nfidlen;
 
@@ -494,10 +497,10 @@ add:
        memset(cfi, 0, sizeof(struct fileIdentDesc));
        if (UDF_SB(sb)->s_udfrev >= 0x0200)
                udf_new_tag((char *)cfi, TAG_IDENT_FID, 3, 1, block,
-                           sizeof(tag));
+                           sizeof(struct tag));
        else
                udf_new_tag((char *)cfi, TAG_IDENT_FID, 2, 1, block,
-                           sizeof(tag));
+                           sizeof(struct tag));
        cfi->fileVersionNum = cpu_to_le16(1);
        cfi->lengthFileIdent = namelen;
        cfi->lengthOfImpUse = cpu_to_le16(0);
@@ -530,7 +533,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
        cfi->fileCharacteristics |= FID_FILE_CHAR_DELETED;
 
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
-               memset(&(cfi->icb), 0x00, sizeof(long_ad));
+               memset(&(cfi->icb), 0x00, sizeof(struct long_ad));
 
        return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
 }
@@ -710,7 +713,7 @@ static int empty_dir(struct inode *dir)
        loff_t f_pos;
        loff_t size = udf_ext0_offset(dir) + dir->i_size;
        int block;
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen;
        sector_t offset;
        struct extent_position epos = {};
@@ -724,12 +727,12 @@ static int empty_dir(struct inode *dir)
        else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits,
                              &epos, &eloc, &elen, &offset) ==
                                        (EXT_RECORDED_ALLOCATED >> 30)) {
-               block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
+               block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
                if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
                        if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-                               epos.offset -= sizeof(short_ad);
+                               epos.offset -= sizeof(struct short_ad);
                        else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-                               epos.offset -= sizeof(long_ad);
+                               epos.offset -= sizeof(struct long_ad);
                } else
                        offset = 0;
 
@@ -778,7 +781,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
        struct inode *inode = dentry->d_inode;
        struct udf_fileident_bh fibh;
        struct fileIdentDesc *fi, cfi;
-       kernel_lb_addr tloc;
+       struct kernel_lb_addr tloc;
 
        retval = -ENOENT;
        lock_kernel();
@@ -788,7 +791,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 
        retval = -EIO;
        tloc = lelb_to_cpu(cfi.icb.extLocation);
-       if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+       if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
                goto end_rmdir;
        retval = -ENOTEMPTY;
        if (!empty_dir(inode))
@@ -824,7 +827,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
        struct udf_fileident_bh fibh;
        struct fileIdentDesc *fi;
        struct fileIdentDesc cfi;
-       kernel_lb_addr tloc;
+       struct kernel_lb_addr tloc;
 
        retval = -ENOENT;
        lock_kernel();
@@ -834,7 +837,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 
        retval = -EIO;
        tloc = lelb_to_cpu(cfi.icb.extLocation);
-       if (udf_get_lb_pblock(dir->i_sb, tloc, 0) != inode->i_ino)
+       if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != inode->i_ino)
                goto end_unlink;
 
        if (!inode->i_nlink) {
@@ -897,7 +900,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
        inode->i_op = &page_symlink_inode_operations;
 
        if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
-               kernel_lb_addr eloc;
+               struct kernel_lb_addr eloc;
                uint32_t bsize;
 
                block = udf_new_block(inode->i_sb, inode,
@@ -913,7 +916,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
                                iinfo->i_location.partitionReferenceNum;
                bsize = inode->i_sb->s_blocksize;
                iinfo->i_lenExtents = bsize;
-               udf_add_aext(inode, &epos, eloc, bsize, 0);
+               udf_add_aext(inode, &epos, &eloc, bsize, 0);
                brelse(epos.bh);
 
                block = udf_get_pblock(inode->i_sb, block,
@@ -1108,7 +1111,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct fileIdentDesc ocfi, ncfi;
        struct buffer_head *dir_bh = NULL;
        int retval = -ENOENT;
-       kernel_lb_addr tloc;
+       struct kernel_lb_addr tloc;
        struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
        lock_kernel();
@@ -1119,7 +1122,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
                brelse(ofibh.sbh);
        }
        tloc = lelb_to_cpu(ocfi.icb.extLocation);
-       if (!ofi || udf_get_lb_pblock(old_dir->i_sb, tloc, 0)
+       if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
            != old_inode->i_ino)
                goto end_rename;
 
@@ -1158,7 +1161,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
                if (!dir_fi)
                        goto end_rename;
                tloc = lelb_to_cpu(dir_fi->icb.extLocation);
-               if (udf_get_lb_pblock(old_inode->i_sb, tloc, 0) !=
+               if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
                                old_dir->i_ino)
                        goto end_rename;
 
@@ -1187,7 +1190,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
         */
        ncfi.fileVersionNum = ocfi.fileVersionNum;
        ncfi.fileCharacteristics = ocfi.fileCharacteristics;
-       memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(long_ad));
+       memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad));
        udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
 
        /* The old fid may have moved - find it again */
@@ -1242,6 +1245,7 @@ end_rename:
 
 static struct dentry *udf_get_parent(struct dentry *child)
 {
+       struct kernel_lb_addr tloc;
        struct inode *inode = NULL;
        struct qstr dotdot = {.name = "..", .len = 2};
        struct fileIdentDesc cfi;
@@ -1255,8 +1259,8 @@ static struct dentry *udf_get_parent(struct dentry *child)
                brelse(fibh.ebh);
        brelse(fibh.sbh);
 
-       inode = udf_iget(child->d_inode->i_sb,
-                        lelb_to_cpu(cfi.icb.extLocation));
+       tloc = lelb_to_cpu(cfi.icb.extLocation);
+       inode = udf_iget(child->d_inode->i_sb, &tloc);
        if (!inode)
                goto out_unlock;
        unlock_kernel();
@@ -1272,14 +1276,14 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
                                        u16 partref, __u32 generation)
 {
        struct inode *inode;
-       kernel_lb_addr loc;
+       struct kernel_lb_addr loc;
 
        if (block == 0)
                return ERR_PTR(-ESTALE);
 
        loc.logicalBlockNum = block;
        loc.partitionReferenceNum = partref;
-       inode = udf_iget(sb, loc);
+       inode = udf_iget(sb, &loc);
 
        if (inode == NULL)
                return ERR_PTR(-ENOMEM);
@@ -1318,7 +1322,7 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
 {
        int len = *lenp;
        struct inode *inode =  de->d_inode;
-       kernel_lb_addr location = UDF_I(inode)->i_location;
+       struct kernel_lb_addr location = UDF_I(inode)->i_location;
        struct fid *fid = (struct fid *)fh;
        int type = FILEID_UDF_WITHOUT_PARENT;
 
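A recurring pattern in the lookup hunks above (udf_lookup() and udf_get_parent()) is the new local struct kernel_lb_addr that receives lelb_to_cpu(cfi.icb.extLocation) before udf_iget() is called: now that udf_iget() takes a pointer, the address of a function's return value cannot be passed directly in C. The sketch below restates that constraint with simplified stand-in types (lb_addr, to_cpu, iget), not the kernel's own definitions.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for struct kernel_lb_addr and its on-disk form. */
struct lb_addr { uint32_t block; uint16_t part; };

/* Byte-swapping elided; returns the value by value, like lelb_to_cpu(). */
static struct lb_addr to_cpu(struct lb_addr le) { return le; }

/* After the patch, the lookup helpers take the address by pointer. */
static void iget(const struct lb_addr *loc)
{
        printf("iget block=%u part=%u\n",
               (unsigned)loc->block, (unsigned)loc->part);
}

int main(void)
{
        struct lb_addr on_disk = { 123, 0 };
        struct lb_addr loc;

        /* iget(&to_cpu(on_disk));  would not compile: a function's return
         * value is not an lvalue, so its address cannot be taken. */
        loc = to_cpu(on_disk);      /* hence the named temporaries above */
        iget(&loc);
        return 0;
}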
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index 65ff47902bd25784940b9ac3bc7744fc9da0e04f..fbff74654df2242f0df086237512226923c76429 100644 (file)
@@ -85,7 +85,7 @@ struct appIdentSuffix {
 /* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
 /* Implementation Use (UDF 2.50 2.2.6.4) */
 struct logicalVolIntegrityDescImpUse {
-       regid           impIdent;
+       struct regid    impIdent;
        __le32          numFiles;
        __le32          numDirs;
        __le16          minUDFReadRev;
@@ -97,12 +97,12 @@ struct logicalVolIntegrityDescImpUse {
 /* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
 /* Implementation Use (UDF 2.50 2.2.7.2) */
 struct impUseVolDescImpUse {
-       charspec        LVICharset;
+       struct charspec LVICharset;
        dstring         logicalVolIdent[128];
        dstring         LVInfo1[36];
        dstring         LVInfo2[36];
        dstring         LVInfo3[36];
-       regid           impIdent;
+       struct regid    impIdent;
        uint8_t         impUse[128];
 } __attribute__ ((packed));
 
@@ -110,7 +110,7 @@ struct udfPartitionMap2 {
        uint8_t         partitionMapType;
        uint8_t         partitionMapLength;
        uint8_t         reserved1[2];
-       regid           partIdent;
+       struct regid    partIdent;
        __le16          volSeqNum;
        __le16          partitionNum;
 } __attribute__ ((packed));
@@ -120,7 +120,7 @@ struct virtualPartitionMap {
        uint8_t         partitionMapType;
        uint8_t         partitionMapLength;
        uint8_t         reserved1[2];
-       regid           partIdent;
+       struct regid    partIdent;
        __le16          volSeqNum;
        __le16          partitionNum;
        uint8_t         reserved2[24];
@@ -131,7 +131,7 @@ struct sparablePartitionMap {
        uint8_t partitionMapType;
        uint8_t partitionMapLength;
        uint8_t reserved1[2];
-       regid partIdent;
+       struct regid partIdent;
        __le16 volSeqNum;
        __le16 partitionNum;
        __le16 packetLength;
@@ -146,7 +146,7 @@ struct metadataPartitionMap {
        uint8_t         partitionMapType;
        uint8_t         partitionMapLength;
        uint8_t         reserved1[2];
-       regid           partIdent;
+       struct regid    partIdent;
        __le16          volSeqNum;
        __le16          partitionNum;
        __le32          metadataFileLoc;
@@ -161,7 +161,7 @@ struct metadataPartitionMap {
 /* Virtual Allocation Table (UDF 1.5 2.2.10) */
 struct virtualAllocationTable15 {
        __le32          VirtualSector[0];
-       regid           vatIdent;
+       struct regid    vatIdent;
        __le32          previousVATICBLoc;
 } __attribute__ ((packed));
 
@@ -192,8 +192,8 @@ struct sparingEntry {
 } __attribute__ ((packed));
 
 struct sparingTable {
-       tag             descTag;
-       regid           sparingIdent;
+       struct tag      descTag;
+       struct regid    sparingIdent;
        __le16          reallocationTableLen;
        __le16          reserved;
        __le32          sequenceNum;
@@ -206,7 +206,7 @@ struct sparingTable {
 #define ICBTAG_FILE_TYPE_MIRROR                0xFB
 #define ICBTAG_FILE_TYPE_BITMAP                0xFC
 
-/* struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
+/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */
 struct allocDescImpUse {
        __le16          flags;
        uint8_t         impUse[4];
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 96dfd207c3d6339f5a07377a217e5f15c01140e2..4b540ee632d5ba919ee93a5ef2689794c4935bb5 100644 (file)
@@ -273,7 +273,7 @@ static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
 {
        struct super_block *sb = inode->i_sb;
        struct udf_part_map *map;
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen;
        sector_t ext_offset;
        struct extent_position epos = {};
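In the superblock hunks that follow, the new mode= and dmode= options are declared as octal in the match table ("mode=%o", "dmode=%o") and the parsed value is masked with 0777. The hypothetical parse_octal_opt() helper below mirrors that handling in plain C, without the kernel's match_token()/match_octal() machinery.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse a "mode=NNN" / "dmode=NNN" style token the way the new option
 * cases do: octal value, masked to the permission bits. Returns -1 when
 * the token does not match the given option name. */
static long parse_octal_opt(const char *token, const char *name)
{
        size_t len = strlen(name);

        if (strncmp(token, name, len) != 0 || token[len] != '=')
                return -1;
        return strtol(token + len + 1, NULL, 8) & 0777;
}

int main(void)
{
        printf("mode  -> %lo\n", (unsigned long)parse_octal_opt("mode=644", "mode"));
        printf("dmode -> %lo\n", (unsigned long)parse_octal_opt("dmode=0755", "dmode"));
        return 0;
}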
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e25e7010627b887bee05a004d55642888a00a291..72348cc855a45dd25126aaba8557772e0d4b491d 100644 (file)
@@ -81,16 +81,13 @@ static char error_buf[1024];
 /* These are the "meat" - everything else is stuffing */
 static int udf_fill_super(struct super_block *, void *, int);
 static void udf_put_super(struct super_block *);
-static void udf_write_super(struct super_block *);
+static int udf_sync_fs(struct super_block *, int);
 static int udf_remount_fs(struct super_block *, int *, char *);
-static int udf_check_valid(struct super_block *, int, int);
-static int udf_vrs(struct super_block *sb, int silent);
-static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad);
-static void udf_find_anchor(struct super_block *);
-static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
-                           kernel_lb_addr *);
+static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
+static int udf_find_fileset(struct super_block *, struct kernel_lb_addr *,
+                           struct kernel_lb_addr *);
 static void udf_load_fileset(struct super_block *, struct buffer_head *,
-                            kernel_lb_addr *);
+                            struct kernel_lb_addr *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
@@ -181,7 +178,7 @@ static const struct super_operations udf_sb_ops = {
        .delete_inode   = udf_delete_inode,
        .clear_inode    = udf_clear_inode,
        .put_super      = udf_put_super,
-       .write_super    = udf_write_super,
+       .sync_fs        = udf_sync_fs,
        .statfs         = udf_statfs,
        .remount_fs     = udf_remount_fs,
        .show_options   = udf_show_options,
@@ -201,6 +198,8 @@ struct udf_options {
        mode_t umask;
        gid_t gid;
        uid_t uid;
+       mode_t fmode;
+       mode_t dmode;
        struct nls_table *nls_map;
 };
 
@@ -258,7 +257,7 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
 
        if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
                seq_puts(seq, ",nostrict");
-       if (sb->s_blocksize != UDF_DEFAULT_BLOCKSIZE)
+       if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
                seq_printf(seq, ",bs=%lu", sb->s_blocksize);
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
                seq_puts(seq, ",unhide");
@@ -282,18 +281,16 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
                seq_printf(seq, ",gid=%u", sbi->s_gid);
        if (sbi->s_umask != 0)
                seq_printf(seq, ",umask=%o", sbi->s_umask);
+       if (sbi->s_fmode != UDF_INVALID_MODE)
+               seq_printf(seq, ",mode=%o", sbi->s_fmode);
+       if (sbi->s_dmode != UDF_INVALID_MODE)
+               seq_printf(seq, ",dmode=%o", sbi->s_dmode);
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
                seq_printf(seq, ",session=%u", sbi->s_session);
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
                seq_printf(seq, ",lastblock=%u", sbi->s_last_block);
-       /*
-        * s_anchor[2] could be zeroed out in case there is no anchor
-        * in the specified block, but then the "anchor=N" option
-        * originally given by the user wasn't effective, so it's OK
-        * if we don't show it.
-        */
-       if (sbi->s_anchor[2] != 0)
-               seq_printf(seq, ",anchor=%u", sbi->s_anchor[2]);
+       if (sbi->s_anchor != 0)
+               seq_printf(seq, ",anchor=%u", sbi->s_anchor);
        /*
         * volume, partition, fileset and rootdir seem to be ignored
         * currently
@@ -317,6 +314,8 @@ static int udf_show_options(struct seq_file *seq, struct vfsmount *mnt)
  *
  *     gid=            Set the default group.
  *     umask=          Set the default umask.
+ *     mode=           Set the default file permissions.
+ *     dmode=          Set the default directory permissions.
  *     uid=            Set the default user.
  *     bs=             Set the block size.
  *     unhide          Show otherwise hidden files.
@@ -366,7 +365,8 @@ enum {
        Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
        Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
        Opt_rootdir, Opt_utf8, Opt_iocharset,
-       Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
+       Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
+       Opt_fmode, Opt_dmode
 };
 
 static const match_table_t tokens = {
@@ -395,6 +395,8 @@ static const match_table_t tokens = {
        {Opt_rootdir,   "rootdir=%u"},
        {Opt_utf8,      "utf8"},
        {Opt_iocharset, "iocharset=%s"},
+       {Opt_fmode,     "mode=%o"},
+       {Opt_dmode,     "dmode=%o"},
        {Opt_err,       NULL}
 };
 
@@ -405,7 +407,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
        int option;
 
        uopt->novrs = 0;
-       uopt->blocksize = UDF_DEFAULT_BLOCKSIZE;
        uopt->partition = 0xFFFF;
        uopt->session = 0xFFFFFFFF;
        uopt->lastblock = 0;
@@ -428,10 +429,12 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
                switch (token) {
                case Opt_novrs:
                        uopt->novrs = 1;
+                       break;
                case Opt_bs:
                        if (match_int(&args[0], &option))
                                return 0;
                        uopt->blocksize = option;
+                       uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
                        break;
                case Opt_unhide:
                        uopt->flags |= (1 << UDF_FLAG_UNHIDE);
@@ -531,6 +534,16 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
                case Opt_gforget:
                        uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
                        break;
+               case Opt_fmode:
+                       if (match_octal(args, &option))
+                               return 0;
+                       uopt->fmode = option & 0777;
+                       break;
+               case Opt_dmode:
+                       if (match_octal(args, &option))
+                               return 0;
+                       uopt->dmode = option & 0777;
+                       break;
                default:
                        printk(KERN_ERR "udf: bad mount option \"%s\" "
                               "or missing value\n", p);
@@ -540,17 +553,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
        return 1;
 }
 
-static void udf_write_super(struct super_block *sb)
-{
-       lock_kernel();
-
-       if (!(sb->s_flags & MS_RDONLY))
-               udf_open_lvid(sb);
-       sb->s_dirt = 0;
-
-       unlock_kernel();
-}
-
 static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 {
        struct udf_options uopt;
@@ -560,6 +562,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
        uopt.uid   = sbi->s_uid;
        uopt.gid   = sbi->s_gid;
        uopt.umask = sbi->s_umask;
+       uopt.fmode = sbi->s_fmode;
+       uopt.dmode = sbi->s_dmode;
 
        if (!udf_parse_options(options, &uopt, true))
                return -EINVAL;
@@ -568,6 +572,8 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
        sbi->s_uid   = uopt.uid;
        sbi->s_gid   = uopt.gid;
        sbi->s_umask = uopt.umask;
+       sbi->s_fmode = uopt.fmode;
+       sbi->s_dmode = uopt.dmode;
 
        if (sbi->s_lvid_bh) {
                int write_rev = le16_to_cpu(udf_sb_lvidiu(sbi)->minUDFWriteRev);
@@ -585,22 +591,19 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
        return 0;
 }
 
-static int udf_vrs(struct super_block *sb, int silent)
+/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
+/* We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
+static loff_t udf_check_vsd(struct super_block *sb)
 {
        struct volStructDesc *vsd = NULL;
        loff_t sector = 32768;
        int sectorsize;
        struct buffer_head *bh = NULL;
-       int iso9660 = 0;
        int nsr02 = 0;
        int nsr03 = 0;
        struct udf_sb_info *sbi;
 
-       /* Block size must be a multiple of 512 */
-       if (sb->s_blocksize & 511)
-               return 0;
        sbi = UDF_SB(sb);
-
        if (sb->s_blocksize < sizeof(struct volStructDesc))
                sectorsize = sizeof(struct volStructDesc);
        else
@@ -627,7 +630,6 @@ static int udf_vrs(struct super_block *sb, int silent)
                        break;
                } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
                                    VSD_STD_ID_LEN)) {
-                       iso9660 = sector;
                        switch (vsd->structType) {
                        case 0:
                                udf_debug("ISO9660 Boot Record found\n");
@@ -679,139 +681,9 @@ static int udf_vrs(struct super_block *sb, int silent)
                return 0;
 }
 
-/*
- * Check whether there is an anchor block in the given block
- */
-static int udf_check_anchor_block(struct super_block *sb, sector_t block)
-{
-       struct buffer_head *bh;
-       uint16_t ident;
-
-       if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
-           udf_fixed_to_variable(block) >=
-           sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
-               return 0;
-
-       bh = udf_read_tagged(sb, block, block, &ident);
-       if (!bh)
-               return 0;
-       brelse(bh);
-
-       return ident == TAG_IDENT_AVDP;
-}
-
-/* Search for an anchor volume descriptor pointer */
-static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
-{
-       sector_t last[6];
-       int i;
-       struct udf_sb_info *sbi = UDF_SB(sb);
-
-       last[0] = lastblock;
-       last[1] = last[0] - 1;
-       last[2] = last[0] + 1;
-       last[3] = last[0] - 2;
-       last[4] = last[0] - 150;
-       last[5] = last[0] - 152;
-
-       /*  according to spec, anchor is in either:
-        *     block 256
-        *     lastblock-256
-        *     lastblock
-        *  however, if the disc isn't closed, it could be 512 */
-
-       for (i = 0; i < ARRAY_SIZE(last); i++) {
-               if (last[i] < 0)
-                       continue;
-               if (last[i] >= sb->s_bdev->bd_inode->i_size >>
-                               sb->s_blocksize_bits)
-                       continue;
-
-               if (udf_check_anchor_block(sb, last[i])) {
-                       sbi->s_anchor[0] = last[i];
-                       sbi->s_anchor[1] = last[i] - 256;
-                       return last[i];
-               }
-
-               if (last[i] < 256)
-                       continue;
-
-               if (udf_check_anchor_block(sb, last[i] - 256)) {
-                       sbi->s_anchor[1] = last[i] - 256;
-                       return last[i];
-               }
-       }
-
-       if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
-               sbi->s_anchor[0] = sbi->s_session + 256;
-               return last[0];
-       }
-       if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
-               sbi->s_anchor[0] = sbi->s_session + 512;
-               return last[0];
-       }
-       return 0;
-}
-
-/*
- * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
- * be the last block on the media.
- *
- * Return 1 if not found, 0 if ok
- *
- */
-static void udf_find_anchor(struct super_block *sb)
-{
-       sector_t lastblock;
-       struct buffer_head *bh = NULL;
-       uint16_t ident;
-       int i;
-       struct udf_sb_info *sbi = UDF_SB(sb);
-
-       lastblock = udf_scan_anchors(sb, sbi->s_last_block);
-       if (lastblock)
-               goto check_anchor;
-
-       /* No anchor found? Try VARCONV conversion of block numbers */
-       UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
-       /* Firstly, we try to not convert number of the last block */
-       lastblock = udf_scan_anchors(sb,
-                               udf_variable_to_fixed(sbi->s_last_block));
-       if (lastblock)
-               goto check_anchor;
-
-       /* Secondly, we try with converted number of the last block */
-       lastblock = udf_scan_anchors(sb, sbi->s_last_block);
-       if (!lastblock) {
-               /* VARCONV didn't help. Clear it. */
-               UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
-       }
-
-check_anchor:
-       /*
-        * Check located anchors and the anchor block supplied via
-        * mount options
-        */
-       for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
-               if (!sbi->s_anchor[i])
-                       continue;
-               bh = udf_read_tagged(sb, sbi->s_anchor[i],
-                                       sbi->s_anchor[i], &ident);
-               if (!bh)
-                       sbi->s_anchor[i] = 0;
-               else {
-                       brelse(bh);
-                       if (ident != TAG_IDENT_AVDP)
-                               sbi->s_anchor[i] = 0;
-               }
-       }
-
-       sbi->s_last_block = lastblock;
-}
-
 static int udf_find_fileset(struct super_block *sb,
-                           kernel_lb_addr *fileset,
-                           kernel_lb_addr *root)
+                           struct kernel_lb_addr *fileset,
+                           struct kernel_lb_addr *root)
 {
        struct buffer_head *bh = NULL;
        long lastblock;
@@ -820,7 +692,7 @@ static int udf_find_fileset(struct super_block *sb,
 
        if (fileset->logicalBlockNum != 0xFFFFFFFF ||
            fileset->partitionReferenceNum != 0xFFFF) {
-               bh = udf_read_ptagged(sb, *fileset, 0, &ident);
+               bh = udf_read_ptagged(sb, fileset, 0, &ident);
 
                if (!bh) {
                        return 1;
@@ -834,7 +706,7 @@ static int udf_find_fileset(struct super_block *sb,
        sbi = UDF_SB(sb);
        if (!bh) {
                /* Search backwards through the partitions */
-               kernel_lb_addr newfileset;
+               struct kernel_lb_addr newfileset;
 
 /* --> cvg: FIXME - is it reasonable? */
                return 1;
@@ -850,7 +722,7 @@ static int udf_find_fileset(struct super_block *sb,
                        newfileset.logicalBlockNum = 0;
 
                        do {
-                               bh = udf_read_ptagged(sb, newfileset, 0,
+                               bh = udf_read_ptagged(sb, &newfileset, 0,
                                                      &ident);
                                if (!bh) {
                                        newfileset.logicalBlockNum++;
@@ -902,14 +774,23 @@ static int udf_find_fileset(struct super_block *sb,
 static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 {
        struct primaryVolDesc *pvoldesc;
-       struct ustr instr;
-       struct ustr outstr;
+       struct ustr *instr, *outstr;
        struct buffer_head *bh;
        uint16_t ident;
+       int ret = 1;
+
+       instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+       if (!instr)
+               return 1;
+
+       outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+       if (!outstr)
+               goto out1;
 
        bh = udf_read_tagged(sb, block, block, &ident);
        if (!bh)
-               return 1;
+               goto out2;
+
        BUG_ON(ident != TAG_IDENT_PVD);
 
        pvoldesc = (struct primaryVolDesc *)bh->b_data;
@@ -917,7 +798,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
        if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
                              pvoldesc->recordingDateAndTime)) {
 #ifdef UDFFS_DEBUG
-               timestamp *ts = &pvoldesc->recordingDateAndTime;
+               struct timestamp *ts = &pvoldesc->recordingDateAndTime;
                udf_debug("recording time %04u/%02u/%02u"
                          " %02u:%02u (%x)\n",
                          le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
@@ -925,20 +806,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 #endif
        }
 
-       if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32))
-               if (udf_CS0toUTF8(&outstr, &instr)) {
-                       strncpy(UDF_SB(sb)->s_volume_ident, outstr.u_name,
-                               outstr.u_len > 31 ? 31 : outstr.u_len);
+       if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
+               if (udf_CS0toUTF8(outstr, instr)) {
+                       strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
+                               outstr->u_len > 31 ? 31 : outstr->u_len);
                        udf_debug("volIdent[] = '%s'\n",
                                        UDF_SB(sb)->s_volume_ident);
                }
 
-       if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128))
-               if (udf_CS0toUTF8(&outstr, &instr))
-                       udf_debug("volSetIdent[] = '%s'\n", outstr.u_name);
+       if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
+               if (udf_CS0toUTF8(outstr, instr))
+                       udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
 
        brelse(bh);
-       return 0;
+       ret = 0;
+out2:
+       kfree(outstr);
+out1:
+       kfree(instr);
+       return ret;
 }
 
 static int udf_load_metadata_files(struct super_block *sb, int partition)
@@ -946,7 +832,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map;
        struct udf_meta_data *mdata;
-       kernel_lb_addr addr;
+       struct kernel_lb_addr addr;
        int fe_error = 0;
 
        map = &sbi->s_partmaps[partition];
@@ -959,7 +845,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
        udf_debug("Metadata file location: block = %d part = %d\n",
                          addr.logicalBlockNum, addr.partitionReferenceNum);
 
-       mdata->s_metadata_fe = udf_iget(sb, addr);
+       mdata->s_metadata_fe = udf_iget(sb, &addr);
 
        if (mdata->s_metadata_fe == NULL) {
                udf_warning(sb, __func__, "metadata inode efe not found, "
@@ -981,7 +867,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
        udf_debug("Mirror metadata file location: block = %d part = %d\n",
                          addr.logicalBlockNum, addr.partitionReferenceNum);
 
-       mdata->s_mirror_fe = udf_iget(sb, addr);
+       mdata->s_mirror_fe = udf_iget(sb, &addr);
 
        if (mdata->s_mirror_fe == NULL) {
                if (fe_error) {
@@ -1013,7 +899,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
                udf_debug("Bitmap file location: block = %d part = %d\n",
                        addr.logicalBlockNum, addr.partitionReferenceNum);
 
-               mdata->s_bitmap_fe = udf_iget(sb, addr);
+               mdata->s_bitmap_fe = udf_iget(sb, &addr);
 
                if (mdata->s_bitmap_fe == NULL) {
                        if (sb->s_flags & MS_RDONLY)
@@ -1037,7 +923,7 @@ error_exit:
 }
 
 static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
-                            kernel_lb_addr *root)
+                            struct kernel_lb_addr *root)
 {
        struct fileSetDesc *fset;
 
@@ -1119,13 +1005,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 
        phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
        if (phd->unallocSpaceTable.extLength) {
-               kernel_lb_addr loc = {
+               struct kernel_lb_addr loc = {
                        .logicalBlockNum = le32_to_cpu(
                                phd->unallocSpaceTable.extPosition),
                        .partitionReferenceNum = p_index,
                };
 
-               map->s_uspace.s_table = udf_iget(sb, loc);
+               map->s_uspace.s_table = udf_iget(sb, &loc);
                if (!map->s_uspace.s_table) {
                        udf_debug("cannot load unallocSpaceTable (part %d)\n",
                                        p_index);
@@ -1154,13 +1040,13 @@ static int udf_fill_partdesc_info(struct super_block *sb,
                udf_debug("partitionIntegrityTable (part %d)\n", p_index);
 
        if (phd->freedSpaceTable.extLength) {
-               kernel_lb_addr loc = {
+               struct kernel_lb_addr loc = {
                        .logicalBlockNum = le32_to_cpu(
                                phd->freedSpaceTable.extPosition),
                        .partitionReferenceNum = p_index,
                };
 
-               map->s_fspace.s_table = udf_iget(sb, loc);
+               map->s_fspace.s_table = udf_iget(sb, &loc);
                if (!map->s_fspace.s_table) {
                        udf_debug("cannot load freedSpaceTable (part %d)\n",
                                p_index);
@@ -1192,7 +1078,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 {
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map = &sbi->s_partmaps[p_index];
-       kernel_lb_addr ino;
+       struct kernel_lb_addr ino;
        struct buffer_head *bh = NULL;
        struct udf_inode_info *vati;
        uint32_t pos;
@@ -1201,7 +1087,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
        /* VAT file entry is in the last recorded block */
        ino.partitionReferenceNum = type1_index;
        ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
-       sbi->s_vat_inode = udf_iget(sb, ino);
+       sbi->s_vat_inode = udf_iget(sb, &ino);
        if (!sbi->s_vat_inode)
                return 1;
 
@@ -1322,7 +1208,7 @@ out_bh:
 }
 
 static int udf_load_logicalvol(struct super_block *sb, sector_t block,
-                              kernel_lb_addr *fileset)
+                              struct kernel_lb_addr *fileset)
 {
        struct logicalVolDesc *lvd;
        int i, j, offset;
@@ -1471,7 +1357,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
        }
 
        if (fileset) {
-               long_ad *la = (long_ad *)&(lvd->logicalVolContentsUse[0]);
+               struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);
 
                *fileset = lelb_to_cpu(la->extLocation);
                udf_debug("FileSet found in LogicalVolDesc at block=%d, "
@@ -1490,7 +1376,7 @@ out_bh:
  * udf_load_logicalvolint
  *
  */
-static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
+static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
 {
        struct buffer_head *bh = NULL;
        uint16_t ident;
@@ -1533,7 +1419,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
  *     Written, tested, and released.
  */
 static noinline int udf_process_sequence(struct super_block *sb, long block,
-                               long lastblock, kernel_lb_addr *fileset)
+                               long lastblock, struct kernel_lb_addr *fileset)
 {
        struct buffer_head *bh = NULL;
        struct udf_vds_record vds[VDS_POS_LENGTH];
@@ -1655,85 +1541,199 @@ static noinline int udf_process_sequence(struct super_block *sb, long block,
        return 0;
 }
 
+static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
+                            struct kernel_lb_addr *fileset)
+{
+       struct anchorVolDescPtr *anchor;
+       long main_s, main_e, reserve_s, reserve_e;
+       struct udf_sb_info *sbi;
+
+       sbi = UDF_SB(sb);
+       anchor = (struct anchorVolDescPtr *)bh->b_data;
+
+       /* Locate the main sequence */
+       main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
+       main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
+       main_e = main_e >> sb->s_blocksize_bits;
+       main_e += main_s;
+
+       /* Locate the reserve sequence */
+       reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
+       reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength);
+       reserve_e = reserve_e >> sb->s_blocksize_bits;
+       reserve_e += reserve_s;
+
+       /* Process the main & reserve sequences */
+       /* responsible for finding the PartitionDesc(s) */
+       if (!udf_process_sequence(sb, main_s, main_e, fileset))
+               return 1;
+       return !udf_process_sequence(sb, reserve_s, reserve_e, fileset);
+}
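In udf_load_sequence() above, extLength is a byte count, so the end of each sequence is obtained by shifting the length down by s_blocksize_bits and adding the start block. A tiny worked example of that arithmetic (the block size and extent values below are invented):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned int blocksize_bits = 11;       /* 2048-byte blocks (example value) */
        long main_s = 257;                      /* extent start block (example) */
        uint32_t ext_len = 16 * 2048;           /* extLength in bytes (example) */
        long main_e = (long)(ext_len >> blocksize_bits) + main_s;

        /* 32768 bytes of 2048-byte blocks gives 16 blocks, so 257..273 here */
        printf("main VDS: blocks %ld .. %ld\n", main_s, main_e);
        return 0;
    }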
+
 /*
- * udf_check_valid()
+ * Check whether there is an anchor block in the given block and
+ * load Volume Descriptor Sequence if so.
  */
-static int udf_check_valid(struct super_block *sb, int novrs, int silent)
+static int udf_check_anchor_block(struct super_block *sb, sector_t block,
+                                 struct kernel_lb_addr *fileset)
 {
-       long block;
-       struct udf_sb_info *sbi = UDF_SB(sb);
+       struct buffer_head *bh;
+       uint16_t ident;
+       int ret;
 
-       if (novrs) {
-               udf_debug("Validity check skipped because of novrs option\n");
+       if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
+           udf_fixed_to_variable(block) >=
+           sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+               return 0;
+
+       bh = udf_read_tagged(sb, block, block, &ident);
+       if (!bh)
+               return 0;
+       if (ident != TAG_IDENT_AVDP) {
+               brelse(bh);
                return 0;
        }
-       /* Check that it is NSR02 compliant */
-       /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
-       block = udf_vrs(sb, silent);
-       if (block == -1)
-               udf_debug("Failed to read byte 32768. Assuming open "
-                         "disc. Skipping validity check\n");
-       if (block && !sbi->s_last_block)
-               sbi->s_last_block = udf_get_last_block(sb);
-       return !block;
+       ret = udf_load_sequence(sb, bh, fileset);
+       brelse(bh);
+       return ret;
 }
 
-static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset)
+/* Search for an anchor volume descriptor pointer */
+static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock,
+                                struct kernel_lb_addr *fileset)
 {
-       struct anchorVolDescPtr *anchor;
-       uint16_t ident;
-       struct buffer_head *bh;
-       long main_s, main_e, reserve_s, reserve_e;
+       sector_t last[6];
        int i;
-       struct udf_sb_info *sbi;
-
-       if (!sb)
-               return 1;
-       sbi = UDF_SB(sb);
+       struct udf_sb_info *sbi = UDF_SB(sb);
+       int last_count = 0;
 
-       for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
-               if (!sbi->s_anchor[i])
+       /* First try user provided anchor */
+       if (sbi->s_anchor) {
+               if (udf_check_anchor_block(sb, sbi->s_anchor, fileset))
+                       return lastblock;
+       }
+       /*
+        * according to the spec, the anchor is in one of:
+        *     block 256
+        *     lastblock-256
+        *     lastblock
+        *  however, if the disc isn't closed, it could be 512.
+        */
+       if (udf_check_anchor_block(sb, sbi->s_session + 256, fileset))
+               return lastblock;
+       /*
+        * The trouble is which block is the last one. Drives often misreport
+        * this so we try various possibilities.
+        */
+       last[last_count++] = lastblock;
+       if (lastblock >= 1)
+               last[last_count++] = lastblock - 1;
+       last[last_count++] = lastblock + 1;
+       if (lastblock >= 2)
+               last[last_count++] = lastblock - 2;
+       if (lastblock >= 150)
+               last[last_count++] = lastblock - 150;
+       if (lastblock >= 152)
+               last[last_count++] = lastblock - 152;
+
+       for (i = 0; i < last_count; i++) {
+               if (last[i] >= sb->s_bdev->bd_inode->i_size >>
+                               sb->s_blocksize_bits)
                        continue;
-
-               bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i],
-                                    &ident);
-               if (!bh)
+               if (udf_check_anchor_block(sb, last[i], fileset))
+                       return last[i];
+               if (last[i] < 256)
                        continue;
+               if (udf_check_anchor_block(sb, last[i] - 256, fileset))
+                       return last[i];
+       }
 
-               anchor = (struct anchorVolDescPtr *)bh->b_data;
+       /* Finally try block 512 in case media is open */
+       if (udf_check_anchor_block(sb, sbi->s_session + 512, fileset))
+               return last[0];
+       return 0;
+}
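udf_scan_anchors() therefore probes, in order: any user-supplied anchor, the spec-mandated block 256, a handful of guesses around the reported last block (each also checked 256 blocks earlier), and finally block 512 for open media. A throwaway sketch that just prints that probe order for a hypothetical session start of 0 and reported last block of 1000 (the user anchor is omitted):

    #include <stdio.h>

    int main(void)
    {
        long session = 0, lastblock = 1000;     /* hypothetical values */
        long last[6];
        int i, n = 0;

        printf("probe %ld\n", session + 256);   /* spec: anchor at block 256 */

        last[n++] = lastblock;                  /* candidates for the real last block */
        last[n++] = lastblock - 1;
        last[n++] = lastblock + 1;
        last[n++] = lastblock - 2;
        last[n++] = lastblock - 150;
        last[n++] = lastblock - 152;

        for (i = 0; i < n; i++) {
            printf("probe %ld\n", last[i]);     /* anchor in the last block itself */
            if (last[i] >= 256)
                printf("probe %ld\n", last[i] - 256);   /* or 256 blocks before it */
        }
        printf("probe %ld\n", session + 512);   /* open media: try block 512 last */
        return 0;
    }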
 
-               /* Locate the main sequence */
-               main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
-               main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength);
-               main_e = main_e >> sb->s_blocksize_bits;
-               main_e += main_s;
+/*
+ * Find an anchor volume descriptor and load Volume Descriptor Sequence from
+ * area specified by it. The function expects sbi->s_last_block to be the last
+ * block on the media.
+ *
+ * Return 1 if ok, 0 if not found.
+ *
+ */
+static int udf_find_anchor(struct super_block *sb,
+                          struct kernel_lb_addr *fileset)
+{
+       sector_t lastblock;
+       struct udf_sb_info *sbi = UDF_SB(sb);
 
-               /* Locate the reserve sequence */
-               reserve_s = le32_to_cpu(
-                               anchor->reserveVolDescSeqExt.extLocation);
-               reserve_e = le32_to_cpu(
-                               anchor->reserveVolDescSeqExt.extLength);
-               reserve_e = reserve_e >> sb->s_blocksize_bits;
-               reserve_e += reserve_s;
+       lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
+       if (lastblock)
+               goto out;
 
-               brelse(bh);
+       /* No anchor found? Try VARCONV conversion of block numbers */
+       UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+       /* First, try without converting the number of the last block */
+       lastblock = udf_scan_anchors(sb,
+                               udf_variable_to_fixed(sbi->s_last_block),
+                               fileset);
+       if (lastblock)
+               goto out;
 
-               /* Process the main & reserve sequences */
-               /* responsible for finding the PartitionDesc(s) */
-               if (!(udf_process_sequence(sb, main_s, main_e,
-                                          fileset) &&
-                     udf_process_sequence(sb, reserve_s, reserve_e,
-                                          fileset)))
-                       break;
+       /* Then try with the converted number of the last block */
+       lastblock = udf_scan_anchors(sb, sbi->s_last_block, fileset);
+       if (!lastblock) {
+               /* VARCONV didn't help. Clear it. */
+               UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
+               return 0;
        }
+out:
+       sbi->s_last_block = lastblock;
+       return 1;
+}
 
-       if (i == ARRAY_SIZE(sbi->s_anchor)) {
-               udf_debug("No Anchor block found\n");
-               return 1;
+/*
+ * Check Volume Structure Descriptor, find Anchor block and load Volume
+ * Descriptor Sequence
+ */
+static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
+                       int silent, struct kernel_lb_addr *fileset)
+{
+       struct udf_sb_info *sbi = UDF_SB(sb);
+       loff_t nsr_off;
+
+       if (!sb_set_blocksize(sb, uopt->blocksize)) {
+               if (!silent)
+                       printk(KERN_WARNING "UDF-fs: Bad block size\n");
+               return 0;
+       }
+       sbi->s_last_block = uopt->lastblock;
+       if (!uopt->novrs) {
+               /* Check that it is NSR02 compliant */
+               nsr_off = udf_check_vsd(sb);
+               if (!nsr_off) {
+                       if (!silent)
+                               printk(KERN_WARNING "UDF-fs: No VRS found\n");
+                       return 0;
+               }
+               if (nsr_off == -1)
+                       udf_debug("Failed to read byte 32768. Assuming open "
+                                 "disc. Skipping validity check\n");
+               if (!sbi->s_last_block)
+                       sbi->s_last_block = udf_get_last_block(sb);
+       } else {
+               udf_debug("Validity check skipped because of novrs option\n");
        }
-       udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
 
-       return 0;
+       /* Look for anchor block and load Volume Descriptor Sequence */
+       sbi->s_anchor = uopt->anchor;
+       if (!udf_find_anchor(sb, fileset)) {
+               if (!silent)
+                       printk(KERN_WARNING "UDF-fs: No anchor found\n");
+               return 0;
+       }
+       return 1;
 }
 
 static void udf_open_lvid(struct super_block *sb)
@@ -1742,9 +1742,9 @@ static void udf_open_lvid(struct super_block *sb)
        struct buffer_head *bh = sbi->s_lvid_bh;
        struct logicalVolIntegrityDesc *lvid;
        struct logicalVolIntegrityDescImpUse *lvidiu;
+
        if (!bh)
                return;
-
        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
        lvidiu = udf_sb_lvidiu(sbi);
 
@@ -1752,14 +1752,15 @@ static void udf_open_lvid(struct super_block *sb)
        lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
        udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
                                CURRENT_TIME);
-       lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
+       lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
 
        lvid->descTag.descCRC = cpu_to_le16(
-               crc_itu_t(0, (char *)lvid + sizeof(tag),
+               crc_itu_t(0, (char *)lvid + sizeof(struct tag),
                        le16_to_cpu(lvid->descTag.descCRCLength)));
 
        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        mark_buffer_dirty(bh);
+       sbi->s_lvid_dirty = 0;
 }
 
 static void udf_close_lvid(struct super_block *sb)
@@ -1773,10 +1774,6 @@ static void udf_close_lvid(struct super_block *sb)
                return;
 
        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
-
-       if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
-               return;
-
        lvidiu = udf_sb_lvidiu(sbi);
        lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
        lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
@@ -1790,11 +1787,12 @@ static void udf_close_lvid(struct super_block *sb)
        lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
 
        lvid->descTag.descCRC = cpu_to_le16(
-                       crc_itu_t(0, (char *)lvid + sizeof(tag),
+                       crc_itu_t(0, (char *)lvid + sizeof(struct tag),
                                le16_to_cpu(lvid->descTag.descCRCLength)));
 
        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        mark_buffer_dirty(bh);
+       sbi->s_lvid_dirty = 0;
 }
 
 static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1846,15 +1844,18 @@ static void udf_free_partition(struct udf_part_map *map)
 static int udf_fill_super(struct super_block *sb, void *options, int silent)
 {
        int i;
+       int ret;
        struct inode *inode = NULL;
        struct udf_options uopt;
-       kernel_lb_addr rootdir, fileset;
+       struct kernel_lb_addr rootdir, fileset;
        struct udf_sb_info *sbi;
 
        uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
        uopt.uid = -1;
        uopt.gid = -1;
        uopt.umask = 0;
+       uopt.fmode = UDF_INVALID_MODE;
+       uopt.dmode = UDF_INVALID_MODE;
 
        sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL);
        if (!sbi)
@@ -1892,15 +1893,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        sbi->s_uid = uopt.uid;
        sbi->s_gid = uopt.gid;
        sbi->s_umask = uopt.umask;
+       sbi->s_fmode = uopt.fmode;
+       sbi->s_dmode = uopt.dmode;
        sbi->s_nls_map = uopt.nls_map;
 
-       /* Set the block size for all transfers */
-       if (!sb_min_blocksize(sb, uopt.blocksize)) {
-               udf_debug("Bad block size (%d)\n", uopt.blocksize);
-               printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
-               goto error_out;
-       }
-
        if (uopt.session == 0xFFFFFFFF)
                sbi->s_session = udf_get_last_session(sb);
        else
@@ -1908,18 +1904,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 
        udf_debug("Multi-session=%d\n", sbi->s_session);
 
-       sbi->s_last_block = uopt.lastblock;
-       sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
-       sbi->s_anchor[2] = uopt.anchor;
-
-       if (udf_check_valid(sb, uopt.novrs, silent)) {
-               /* read volume recognition sequences */
-               printk(KERN_WARNING "UDF-fs: No VRS found\n");
-               goto error_out;
-       }
-
-       udf_find_anchor(sb);
-
        /* Fill in the rest of the superblock */
        sb->s_op = &udf_sb_ops;
        sb->s_export_op = &udf_export_ops;
@@ -1928,7 +1912,21 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        sb->s_magic = UDF_SUPER_MAGIC;
        sb->s_time_gran = 1000;
 
-       if (udf_load_sequence(sb, &fileset)) {
+       if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+               ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+       } else {
+               uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+               ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+               if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
+                       if (!silent)
+                               printk(KERN_NOTICE
+                                      "UDF-fs: Rescanning with blocksize "
+                                      "%d\n", UDF_DEFAULT_BLOCKSIZE);
+                       uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
+                       ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+               }
+       }
+       if (!ret) {
                printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
                goto error_out;
        }
@@ -1978,7 +1976,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        }
 
        if (!silent) {
-               timestamp ts;
+               struct timestamp ts;
                udf_time_to_disk_stamp(&ts, sbi->s_record_time);
                udf_info("UDF: Mounting volume '%s', "
                         "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
@@ -1991,7 +1989,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        /* Assign the root inode */
        /* assign inodes by physical block number */
        /* perhaps it's not extensible enough, but for now ... */
-       inode = udf_iget(sb, rootdir);
+       inode = udf_iget(sb, &rootdir);
        if (!inode) {
                printk(KERN_ERR "UDF-fs: Error in udf_iget, block=%d, "
                                "partition=%d\n",
@@ -2081,11 +2079,31 @@ static void udf_put_super(struct super_block *sb)
        sb->s_fs_info = NULL;
 }
 
+static int udf_sync_fs(struct super_block *sb, int wait)
+{
+       struct udf_sb_info *sbi = UDF_SB(sb);
+
+       mutex_lock(&sbi->s_alloc_mutex);
+       if (sbi->s_lvid_dirty) {
+               /*
+                * The block device will be synced later, so we don't have to
+                * submit the buffer for IO.
+                */
+               mark_buffer_dirty(sbi->s_lvid_bh);
+               sb->s_dirt = 0;
+               sbi->s_lvid_dirty = 0;
+       }
+       mutex_unlock(&sbi->s_alloc_mutex);
+
+       return 0;
+}
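udf_sync_fs() only acts when s_lvid_dirty is set; the flag itself is raised by the new udf_updated_lvid() helper added to udfdecl.h further down. A toy single-threaded sketch of that deferred write-back handshake (all names invented, no locking shown):

    #include <stdio.h>

    /* toy stand-ins: a cached block and a dirty flag, as udf keeps for the LVID */
    static char lvid_cache[16] = "LVID v1";
    static int lvid_dirty;

    static void updated_lvid(void)              /* cf. udf_updated_lvid() */
    {
        lvid_dirty = 1;                         /* record that the cache needs writing */
    }

    static void sync_fs(void)                   /* cf. udf_sync_fs() */
    {
        if (lvid_dirty) {
            /* the real code only marks the buffer dirty; the block device
             * sync later writes it out together with everything else */
            printf("writing back: %s\n", lvid_cache);
            lvid_dirty = 0;
        }
    }

    int main(void)
    {
        updated_lvid();
        sync_fs();                              /* flushes once */
        sync_fs();                              /* nothing to do the second time */
        return 0;
    }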
+
 static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct logicalVolIntegrityDescImpUse *lvidiu;
+       u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
        if (sbi->s_lvid_bh != NULL)
                lvidiu = udf_sb_lvidiu(sbi);
@@ -2101,8 +2119,9 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
                                          le32_to_cpu(lvidiu->numDirs)) : 0)
                        + buf->f_bfree;
        buf->f_ffree = buf->f_bfree;
-       /* __kernel_fsid_t f_fsid */
        buf->f_namelen = UDF_NAME_LEN - 2;
+       buf->f_fsid.val[0] = (u32)id;
+       buf->f_fsid.val[1] = (u32)(id >> 32);
 
        return 0;
 }
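The new f_fsid assignment simply splits the 64-bit value from huge_encode_dev() into two 32-bit halves. The arithmetic, with an invented device id:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t id = 0x0000000800000013ULL;    /* hypothetical huge_encode_dev() value */
        uint32_t val0 = (uint32_t)id;           /* low 32 bits  -> f_fsid.val[0] */
        uint32_t val1 = (uint32_t)(id >> 32);   /* high 32 bits -> f_fsid.val[1] */

        printf("val[0]=0x%x val[1]=0x%x\n", (unsigned)val0, (unsigned)val1);
        return 0;
    }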
@@ -2114,7 +2133,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
        unsigned int accum = 0;
        int index;
        int block = 0, newblock;
-       kernel_lb_addr loc;
+       struct kernel_lb_addr loc;
        uint32_t bytes;
        uint8_t *ptr;
        uint16_t ident;
@@ -2124,7 +2143,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
 
        loc.logicalBlockNum = bitmap->s_extPosition;
        loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
-       bh = udf_read_ptagged(sb, loc, 0, &ident);
+       bh = udf_read_ptagged(sb, &loc, 0, &ident);
 
        if (!bh) {
                printk(KERN_ERR "udf: udf_count_free failed\n");
@@ -2147,7 +2166,7 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
                bytes -= cur_bytes;
                if (bytes) {
                        brelse(bh);
-                       newblock = udf_get_lb_pblock(sb, loc, ++block);
+                       newblock = udf_get_lb_pblock(sb, &loc, ++block);
                        bh = udf_tread(sb, newblock);
                        if (!bh) {
                                udf_debug("read failed\n");
@@ -2170,7 +2189,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
 {
        unsigned int accum = 0;
        uint32_t elen;
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        int8_t etype;
        struct extent_position epos;
 
index 65e19b4f9424c20f8f616b7ee5de3db55f33ed0e..225527cdc885591614e92ff6ff325c7fdd459229 100644 (file)
 #include "udf_sb.h"
 
 static void extent_trunc(struct inode *inode, struct extent_position *epos,
-                        kernel_lb_addr eloc, int8_t etype, uint32_t elen,
+                        struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen,
                         uint32_t nelen)
 {
-       kernel_lb_addr neloc = {};
+       struct kernel_lb_addr neloc = {};
        int last_block = (elen + inode->i_sb->s_blocksize - 1) >>
                inode->i_sb->s_blocksize_bits;
        int first_block = (nelen + inode->i_sb->s_blocksize - 1) >>
@@ -43,12 +43,12 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
                                        last_block);
                        etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30);
                } else
-                       neloc = eloc;
+                       neloc = *eloc;
                nelen = (etype << 30) | nelen;
        }
 
        if (elen != nelen) {
-               udf_write_aext(inode, epos, neloc, nelen, 0);
+               udf_write_aext(inode, epos, &neloc, nelen, 0);
                if (last_block - first_block > 0) {
                        if (etype == (EXT_RECORDED_ALLOCATED >> 30))
                                mark_inode_dirty(inode);
@@ -68,7 +68,7 @@ static void extent_trunc(struct inode *inode, struct extent_position *epos,
 void udf_truncate_tail_extent(struct inode *inode)
 {
        struct extent_position epos = {};
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen, nelen;
        uint64_t lbcount = 0;
        int8_t etype = -1, netype;
@@ -83,9 +83,9 @@ void udf_truncate_tail_extent(struct inode *inode)
                return;
 
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-               adsize = sizeof(short_ad);
+               adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-               adsize = sizeof(long_ad);
+               adsize = sizeof(struct long_ad);
        else
                BUG();
 
@@ -106,7 +106,7 @@ void udf_truncate_tail_extent(struct inode *inode)
                                       (unsigned)elen);
                        nelen = elen - (lbcount - inode->i_size);
                        epos.offset -= adsize;
-                       extent_trunc(inode, &epos, eloc, etype, elen, nelen);
+                       extent_trunc(inode, &epos, &eloc, etype, elen, nelen);
                        epos.offset += adsize;
                        if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1)
                                printk(KERN_ERR "udf_truncate_tail_extent(): "
@@ -124,7 +124,7 @@ void udf_truncate_tail_extent(struct inode *inode)
 void udf_discard_prealloc(struct inode *inode)
 {
        struct extent_position epos = { NULL, 0, {0, 0} };
-       kernel_lb_addr eloc;
+       struct kernel_lb_addr eloc;
        uint32_t elen;
        uint64_t lbcount = 0;
        int8_t etype = -1, netype;
@@ -136,9 +136,9 @@ void udf_discard_prealloc(struct inode *inode)
                return;
 
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-               adsize = sizeof(short_ad);
+               adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-               adsize = sizeof(long_ad);
+               adsize = sizeof(struct long_ad);
        else
                adsize = 0;
 
@@ -152,7 +152,7 @@ void udf_discard_prealloc(struct inode *inode)
        if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
                epos.offset -= adsize;
                lbcount -= elen;
-               extent_trunc(inode, &epos, eloc, etype, elen, 0);
+               extent_trunc(inode, &epos, &eloc, etype, elen, 0);
                if (!epos.bh) {
                        iinfo->i_lenAlloc =
                                epos.offset -
@@ -200,7 +200,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
 void udf_truncate_extents(struct inode *inode)
 {
        struct extent_position epos;
-       kernel_lb_addr eloc, neloc = {};
+       struct kernel_lb_addr eloc, neloc = {};
        uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
        int8_t etype;
        struct super_block *sb = inode->i_sb;
@@ -210,9 +210,9 @@ void udf_truncate_extents(struct inode *inode)
        struct udf_inode_info *iinfo = UDF_I(inode);
 
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
-               adsize = sizeof(short_ad);
+               adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
-               adsize = sizeof(long_ad);
+               adsize = sizeof(struct long_ad);
        else
                BUG();
 
@@ -221,7 +221,7 @@ void udf_truncate_extents(struct inode *inode)
                (inode->i_size & (sb->s_blocksize - 1));
        if (etype != -1) {
                epos.offset -= adsize;
-               extent_trunc(inode, &epos, eloc, etype, elen, byte_offset);
+               extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
                epos.offset += adsize;
                if (byte_offset)
                        lenalloc = epos.offset;
@@ -236,12 +236,12 @@ void udf_truncate_extents(struct inode *inode)
                while ((etype = udf_current_aext(inode, &epos, &eloc,
                                                 &elen, 0)) != -1) {
                        if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
-                               udf_write_aext(inode, &epos, neloc, nelen, 0);
+                               udf_write_aext(inode, &epos, &neloc, nelen, 0);
                                if (indirect_ext_len) {
                                        /* We managed to free all extents in the
                                         * indirect extent - free it too */
                                        BUG_ON(!epos.bh);
-                                       udf_free_blocks(sb, inode, epos.block,
+                                       udf_free_blocks(sb, inode, &epos.block,
                                                        0, indirect_ext_len);
                                } else if (!epos.bh) {
                                        iinfo->i_lenAlloc = lenalloc;
@@ -253,7 +253,7 @@ void udf_truncate_extents(struct inode *inode)
                                epos.offset = sizeof(struct allocExtDesc);
                                epos.block = eloc;
                                epos.bh = udf_tread(sb,
-                                               udf_get_lb_pblock(sb, eloc, 0));
+                                               udf_get_lb_pblock(sb, &eloc, 0));
                                if (elen)
                                        indirect_ext_len =
                                                (elen + sb->s_blocksize - 1) >>
@@ -261,7 +261,7 @@ void udf_truncate_extents(struct inode *inode)
                                else
                                        indirect_ext_len = 1;
                        } else {
-                               extent_trunc(inode, &epos, eloc, etype,
+                               extent_trunc(inode, &epos, &eloc, etype,
                                             elen, 0);
                                epos.offset += adsize;
                        }
@@ -269,7 +269,7 @@ void udf_truncate_extents(struct inode *inode)
 
                if (indirect_ext_len) {
                        BUG_ON(!epos.bh);
-                       udf_free_blocks(sb, inode, epos.block, 0,
+                       udf_free_blocks(sb, inode, &epos.block, 0,
                                        indirect_ext_len);
                } else if (!epos.bh) {
                        iinfo->i_lenAlloc = lenalloc;
@@ -278,7 +278,7 @@ void udf_truncate_extents(struct inode *inode)
                        udf_update_alloc_ext_desc(inode, &epos, lenalloc);
        } else if (inode->i_size) {
                if (byte_offset) {
-                       kernel_long_ad extent;
+                       struct kernel_long_ad extent;
 
                        /*
                         *  OK, there is no extent covering inode->i_size and
index 4f86b1d98a5d44ffbf1ae64bee4618c2dfcf3b4a..e58d1de41073ccd8768d3cda0ba8c058b17dfa12 100644 (file)
@@ -4,7 +4,7 @@
 struct udf_inode_info {
        struct timespec         i_crtime;
        /* Physical address of inode */
-       kernel_lb_addr          i_location;
+       struct kernel_lb_addr           i_location;
        __u64                   i_unique;
        __u32                   i_lenEAttr;
        __u32                   i_lenAlloc;
@@ -17,8 +17,8 @@ struct udf_inode_info {
        unsigned                i_strat4096 : 1;
        unsigned                reserved : 26;
        union {
-               short_ad        *i_sad;
-               long_ad         *i_lad;
+               struct short_ad *i_sad;
+               struct long_ad          *i_lad;
                __u8            *i_data;
        } i_ext;
        struct inode vfs_inode;
index 1c1c514a9725116f19cf559e3af969251d99c74e..d113b72c2768318cc9401175479515d8557bf387 100644 (file)
@@ -30,6 +30,7 @@
 #define UDF_FLAG_GID_SET       16
 #define UDF_FLAG_SESSION_SET   17
 #define UDF_FLAG_LASTBLOCK_SET 18
+#define UDF_FLAG_BLOCKSIZE_SET 19
 
 #define UDF_PART_FLAG_UNALLOC_BITMAP   0x0001
 #define UDF_PART_FLAG_UNALLOC_TABLE    0x0002
@@ -48,6 +49,8 @@
 #define UDF_SPARABLE_MAP15             0x1522U
 #define UDF_METADATA_MAP25             0x2511U
 
+#define UDF_INVALID_MODE               ((mode_t)-1)
+
 #pragma pack(1) /* XXX(hch): Why?  This file just defines in-core structures */
 
 struct udf_meta_data {
@@ -114,7 +117,7 @@ struct udf_sb_info {
 
        /* Sector headers */
        __s32                   s_session;
-       __u32                   s_anchor[3];
+       __u32                   s_anchor;
        __u32                   s_last_block;
 
        struct buffer_head      *s_lvid_bh;
@@ -123,6 +126,8 @@ struct udf_sb_info {
        mode_t                  s_umask;
        gid_t                   s_gid;
        uid_t                   s_uid;
+       mode_t                  s_fmode;
+       mode_t                  s_dmode;
 
        /* Root Info */
        struct timespec         s_record_time;
@@ -143,6 +148,8 @@ struct udf_sb_info {
        struct inode            *s_vat_inode;
 
        struct mutex            s_alloc_mutex;
+       /* Protected by s_alloc_mutex */
+       unsigned int            s_lvid_dirty;
 };
 
 static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
index 8ec865de5f133052771c1a428bb94e3834b9f374..cac51b77a5d165beaffb57032e3498363321f64e 100644 (file)
@@ -62,10 +62,8 @@ static inline size_t udf_ext0_offset(struct inode *inode)
                return 0;
 }
 
-#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
-
 /* computes tag checksum */
-u8 udf_tag_checksum(const tag *t);
+u8 udf_tag_checksum(const struct tag *t);
 
 struct dentry;
 struct inode;
@@ -95,7 +93,7 @@ struct udf_vds_record {
 };
 
 struct generic_desc {
-       tag             descTag;
+       struct tag      descTag;
        __le32          volDescSeqNum;
 };
 
@@ -108,11 +106,22 @@ struct ustr {
 struct extent_position {
        struct buffer_head *bh;
        uint32_t offset;
-       kernel_lb_addr block;
+       struct kernel_lb_addr block;
 };
 
 /* super.c */
 extern void udf_warning(struct super_block *, const char *, const char *, ...);
+static inline void udf_updated_lvid(struct super_block *sb)
+{
+       struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;
+
+       BUG_ON(!bh);
+       WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
+                    bh->b_data)->integrityType !=
+                    cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
+       sb->s_dirt = 1;
+       UDF_SB(sb)->s_lvid_dirty = 1;
+}
 
 /* namei.c */
 extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
@@ -124,7 +133,7 @@ extern int udf_ioctl(struct inode *, struct file *, unsigned int,
                     unsigned long);
 
 /* inode.c */
-extern struct inode *udf_iget(struct super_block *, kernel_lb_addr);
+extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
 extern void udf_expand_file_adinicb(struct inode *, int, int *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
@@ -136,19 +145,19 @@ extern void udf_clear_inode(struct inode *);
 extern int udf_write_inode(struct inode *, int);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
-                          kernel_long_ad *, sector_t);
+                          struct kernel_long_ad *, sector_t);
 extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
-                        kernel_lb_addr *, uint32_t *, sector_t *);
+                        struct kernel_lb_addr *, uint32_t *, sector_t *);
 extern int8_t udf_add_aext(struct inode *, struct extent_position *,
-                          kernel_lb_addr, uint32_t, int);
+                          struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_write_aext(struct inode *, struct extent_position *,
-                            kernel_lb_addr, uint32_t, int);
+                            struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_delete_aext(struct inode *, struct extent_position,
-                             kernel_lb_addr, uint32_t);
+                             struct kernel_lb_addr, uint32_t);
 extern int8_t udf_next_aext(struct inode *, struct extent_position *,
-                           kernel_lb_addr *, uint32_t *, int);
+                           struct kernel_lb_addr *, uint32_t *, int);
 extern int8_t udf_current_aext(struct inode *, struct extent_position *,
-                              kernel_lb_addr *, uint32_t *, int);
+                              struct kernel_lb_addr *, uint32_t *, int);
 
 /* misc.c */
 extern struct buffer_head *udf_tgetblk(struct super_block *, int);
@@ -160,7 +169,7 @@ extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
 extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
                                           uint32_t, uint16_t *);
 extern struct buffer_head *udf_read_ptagged(struct super_block *,
-                                           kernel_lb_addr, uint32_t,
+                                           struct kernel_lb_addr *, uint32_t,
                                            uint16_t *);
 extern void udf_update_tag(char *, int);
 extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);
@@ -182,6 +191,14 @@ extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
                                          uint32_t);
 extern int udf_relocate_blocks(struct super_block *, long, long *);
 
+static inline uint32_t
+udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
+                 uint32_t offset)
+{
+       return udf_get_pblock(sb, loc->logicalBlockNum,
+                       loc->partitionReferenceNum, offset);
+}
+
 /* unicode.c */
 extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
 extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
@@ -200,7 +217,7 @@ extern void udf_truncate_extents(struct inode *);
 
 /* balloc.c */
 extern void udf_free_blocks(struct super_block *, struct inode *,
-                           kernel_lb_addr, uint32_t, uint32_t);
+                           struct kernel_lb_addr *, uint32_t, uint32_t);
 extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
                               uint32_t, uint32_t);
 extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
@@ -214,16 +231,16 @@ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
                                                struct udf_fileident_bh *,
                                                struct fileIdentDesc *,
                                                struct extent_position *,
-                                               kernel_lb_addr *, uint32_t *,
+                                               struct kernel_lb_addr *, uint32_t *,
                                                sector_t *);
 extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
                                               int *offset);
-extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
-extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
+extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
+extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
 
 /* udftime.c */
 extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
-                                               timestamp src);
-extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src);
+                                               struct timestamp src);
+extern struct timestamp *udf_time_to_disk_stamp(struct timestamp *dest, struct timespec src);
 
 #endif                         /* __UDF_DECL_H */
index 489f52fb428cd309fb73002be58ce8b38bb09f0d..6a9f3a9cc4281adf8f7ff6498ee6a38c3b04e565 100644 (file)
@@ -4,9 +4,9 @@
 #include <asm/byteorder.h>
 #include <linux/string.h>
 
-static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
+static inline struct kernel_lb_addr lelb_to_cpu(struct lb_addr in)
 {
-       kernel_lb_addr out;
+       struct kernel_lb_addr out;
 
        out.logicalBlockNum = le32_to_cpu(in.logicalBlockNum);
        out.partitionReferenceNum = le16_to_cpu(in.partitionReferenceNum);
@@ -14,9 +14,9 @@ static inline kernel_lb_addr lelb_to_cpu(lb_addr in)
        return out;
 }
 
-static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
+static inline struct lb_addr cpu_to_lelb(struct kernel_lb_addr in)
 {
-       lb_addr out;
+       struct lb_addr out;
 
        out.logicalBlockNum = cpu_to_le32(in.logicalBlockNum);
        out.partitionReferenceNum = cpu_to_le16(in.partitionReferenceNum);
@@ -24,9 +24,9 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
        return out;
 }
 
-static inline short_ad lesa_to_cpu(short_ad in)
+static inline struct short_ad lesa_to_cpu(struct short_ad in)
 {
-       short_ad out;
+       struct short_ad out;
 
        out.extLength = le32_to_cpu(in.extLength);
        out.extPosition = le32_to_cpu(in.extPosition);
@@ -34,9 +34,9 @@ static inline short_ad lesa_to_cpu(short_ad in)
        return out;
 }
 
-static inline short_ad cpu_to_lesa(short_ad in)
+static inline struct short_ad cpu_to_lesa(struct short_ad in)
 {
-       short_ad out;
+       struct short_ad out;
 
        out.extLength = cpu_to_le32(in.extLength);
        out.extPosition = cpu_to_le32(in.extPosition);
@@ -44,9 +44,9 @@ static inline short_ad cpu_to_lesa(short_ad in)
        return out;
 }
 
-static inline kernel_long_ad lela_to_cpu(long_ad in)
+static inline struct kernel_long_ad lela_to_cpu(struct long_ad in)
 {
-       kernel_long_ad out;
+       struct kernel_long_ad out;
 
        out.extLength = le32_to_cpu(in.extLength);
        out.extLocation = lelb_to_cpu(in.extLocation);
@@ -54,9 +54,9 @@ static inline kernel_long_ad lela_to_cpu(long_ad in)
        return out;
 }
 
-static inline long_ad cpu_to_lela(kernel_long_ad in)
+static inline struct long_ad cpu_to_lela(struct kernel_long_ad in)
 {
-       long_ad out;
+       struct long_ad out;
 
        out.extLength = cpu_to_le32(in.extLength);
        out.extLocation = cpu_to_lelb(in.extLocation);
@@ -64,9 +64,9 @@ static inline long_ad cpu_to_lela(kernel_long_ad in)
        return out;
 }
 
-static inline kernel_extent_ad leea_to_cpu(extent_ad in)
+static inline struct kernel_extent_ad leea_to_cpu(struct extent_ad in)
 {
-       kernel_extent_ad out;
+       struct kernel_extent_ad out;
 
        out.extLength = le32_to_cpu(in.extLength);
        out.extLocation = le32_to_cpu(in.extLocation);
index 5f811655c9b51abbdbd33f39113a9e963df66aa5..b8c828c4d20034fcf046d21b2946e94044b5a74e 100644 (file)
@@ -85,7 +85,8 @@ extern struct timezone sys_tz;
 #define SECS_PER_HOUR  (60 * 60)
 #define SECS_PER_DAY   (SECS_PER_HOUR * 24)
 
-struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
+struct timespec *
+udf_disk_stamp_to_time(struct timespec *dest, struct timestamp src)
 {
        int yday;
        u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
@@ -116,7 +117,8 @@ struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
        return dest;
 }
 
-timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts)
+struct timestamp *
+udf_time_to_disk_stamp(struct timestamp *dest, struct timespec ts)
 {
        long int days, rem, y;
        const unsigned short int *ip;
index 9fdf8c93c58e7893616bea99730cc594a3731361..cefa8c8913e68a77100d2bdb5cf377b323483ec0 100644 (file)
@@ -254,7 +254,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 {
        const uint8_t *ocu;
        uint8_t cmp_id, ocu_len;
-       int i;
+       int i, len;
 
 
        ocu_len = ocu_i->u_len;
@@ -279,8 +279,13 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
                if (cmp_id == 16)
                        c = (c << 8) | ocu[i++];
 
-               utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
-                                             UDF_NAME_LEN - utf_o->u_len);
+               len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
+                                   UDF_NAME_LEN - utf_o->u_len);
+               /* Valid character? */
+               if (len >= 0)
+                       utf_o->u_len += len;
+               else
+                       utf_o->u_name[utf_o->u_len++] = '?';
        }
        utf_o->u_cmpID = 8;
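This hunk and the next one replace characters the NLS table cannot convert with '?', instead of letting a negative return value corrupt the length bookkeeping. A self-contained illustration of that fallback, with a toy routine standing in for the nls uni2char callback:

    #include <stdio.h>

    /* toy stand-in for nls->uni2char(): bytes written on success, <0 on failure */
    static int toy_uni2char(unsigned int uni, char *out, int outlen)
    {
        if (uni > 0x7f || outlen < 1)
            return -22;                         /* unmappable character */
        *out = (char)uni;
        return 1;
    }

    int main(void)
    {
        unsigned int in[] = { 'a', 0x263A, 'b' };   /* middle char is unmappable */
        char name[8];
        int i, pos = 0;

        for (i = 0; i < 3; i++) {
            int len = toy_uni2char(in[i], &name[pos], (int)sizeof(name) - pos);
            if (len >= 0)
                pos += len;                     /* valid character: count it */
            else
                name[pos++] = '?';              /* fallback added by the patch */
        }
        name[pos] = '\0';
        printf("%s\n", name);                   /* prints "a?b" */
        return 0;
    }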
 
@@ -290,7 +295,8 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
 static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
                        int length)
 {
-       unsigned len, i, max_val;
+       int len;
+       unsigned i, max_val;
        uint16_t uni_char;
        int u_len;
 
@@ -302,8 +308,13 @@ try_again:
        u_len = 0U;
        for (i = 0U; i < uni->u_len; i++) {
                len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
-               if (len <= 0)
+               if (!len)
                        continue;
+               /* Invalid character, deal with it */
+               if (len < 0) {
+                       len = 1;
+                       uni_char = '?';
+               }
 
                if (uni_char > max_val) {
                        max_val = 0xffffU;
@@ -324,34 +335,43 @@ try_again:
 int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
                     int flen)
 {
-       struct ustr filename, unifilename;
-       int len;
+       struct ustr *filename, *unifilename;
+       int len = 0;
 
-       if (udf_build_ustr_exact(&unifilename, sname, flen))
+       filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+       if (!filename)
                return 0;
 
+       unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
+       if (!unifilename)
+               goto out1;
+
+       if (udf_build_ustr_exact(unifilename, sname, flen))
+               goto out2;
+
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
-               if (!udf_CS0toUTF8(&filename, &unifilename)) {
+               if (!udf_CS0toUTF8(filename, unifilename)) {
                        udf_debug("Failed in udf_get_filename: sname = %s\n",
                                  sname);
-                       return 0;
+                       goto out2;
                }
        } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
-               if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename,
-                                 &unifilename)) {
+               if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
+                                 unifilename)) {
                        udf_debug("Failed in udf_get_filename: sname = %s\n",
                                  sname);
-                       return 0;
+                       goto out2;
                }
        } else
-               return 0;
-
-       len = udf_translate_to_linux(dname, filename.u_name, filename.u_len,
-                                    unifilename.u_name, unifilename.u_len);
-       if (len)
-               return len;
-
-       return 0;
+               goto out2;
+
+       len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+                                    unifilename->u_name, unifilename->u_len);
+out2:
+       kfree(unifilename);
+out1:
+       kfree(filename);
+       return len;
 }
 
 int udf_put_filename(struct super_block *sb, const uint8_t *sname,
index c3dc491fff89c468806fca873a34eedc5843cd91..60f107e47fe94253f1b1b55a443959bd15a036cc 100644 (file)
@@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA)               += $(addprefix quota/, \
                                   xfs_qm_syscalls.o \
                                   xfs_qm_bhv.o \
                                   xfs_qm.o)
+xfs-$(CONFIG_XFS_QUOTA)                += linux-2.6/xfs_quotaops.o
 
 ifeq ($(CONFIG_XFS_QUOTA),y)
 xfs-$(CONFIG_PROC_FS)          += quota/xfs_qm_stats.o
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
deleted file mode 100644 (file)
index 2a88d56..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_MUTEX_H__
-#define __XFS_SUPPORT_MUTEX_H__
-
-#include <linux/mutex.h>
-
-typedef struct mutex mutex_t;
-
-#endif /* __XFS_SUPPORT_MUTEX_H__ */
index de3a198f771e20627769b837d6f2460ec18077fc..c13f67300fe76aa0a40ee64b0690dd2c8161a8ea 100644 (file)
@@ -1623,4 +1623,5 @@ const struct address_space_operations xfs_address_space_operations = {
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,
+       .is_partially_uptodate  = block_is_partially_uptodate,
 };
index 4bd112313f3351eef0304cabf898eb96fb91f9b0..d0b499418a7d43e16313c053571267433d6c03a6 100644 (file)
@@ -34,6 +34,7 @@
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_ioctl.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
 #include "xfs_rtalloc.h"
@@ -78,92 +79,74 @@ xfs_find_handle(
        int                     hsize;
        xfs_handle_t            handle;
        struct inode            *inode;
+       struct file             *file = NULL;
+       struct path             path;
+       int                     error;
+       struct xfs_inode        *ip;
 
-       memset((char *)&handle, 0, sizeof(handle));
-
-       switch (cmd) {
-       case XFS_IOC_PATH_TO_FSHANDLE:
-       case XFS_IOC_PATH_TO_HANDLE: {
-               struct path path;
-               int error = user_lpath((const char __user *)hreq->path, &path);
+       if (cmd == XFS_IOC_FD_TO_HANDLE) {
+               file = fget(hreq->fd);
+               if (!file)
+                       return -EBADF;
+               inode = file->f_path.dentry->d_inode;
+       } else {
+               error = user_lpath((const char __user *)hreq->path, &path);
                if (error)
                        return error;
-
-               ASSERT(path.dentry);
-               ASSERT(path.dentry->d_inode);
-               inode = igrab(path.dentry->d_inode);
-               path_put(&path);
-               break;
+               inode = path.dentry->d_inode;
        }
+       ip = XFS_I(inode);
 
-       case XFS_IOC_FD_TO_HANDLE: {
-               struct file     *file;
-
-               file = fget(hreq->fd);
-               if (!file)
-                   return -EBADF;
+       /*
+        * We can only generate handles for inodes residing on an XFS filesystem,
+        * and only for regular files, directories or symbolic links.
+        */
+       error = -EINVAL;
+       if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+               goto out_put;
 
-               ASSERT(file->f_path.dentry);
-               ASSERT(file->f_path.dentry->d_inode);
-               inode = igrab(file->f_path.dentry->d_inode);
-               fput(file);
-               break;
-       }
+       error = -EBADF;
+       if (!S_ISREG(inode->i_mode) &&
+           !S_ISDIR(inode->i_mode) &&
+           !S_ISLNK(inode->i_mode))
+               goto out_put;
 
-       default:
-               ASSERT(0);
-               return -XFS_ERROR(EINVAL);
-       }
 
-       if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
-               /* we're not in XFS anymore, Toto */
-               iput(inode);
-               return -XFS_ERROR(EINVAL);
-       }
+       memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
 
-       switch (inode->i_mode & S_IFMT) {
-       case S_IFREG:
-       case S_IFDIR:
-       case S_IFLNK:
-               break;
-       default:
-               iput(inode);
-               return -XFS_ERROR(EBADF);
-       }
-
-       /* now we can grab the fsid */
-       memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid,
-                       sizeof(xfs_fsid_t));
-       hsize = sizeof(xfs_fsid_t);
-
-       if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
-               xfs_inode_t     *ip = XFS_I(inode);
+       if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
+               /*
+                * This handle only contains an fsid, zero the rest.
+                */
+               memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
+               hsize = sizeof(xfs_fsid_t);
+       } else {
                int             lock_mode;
 
-               /* need to get access to the xfs_inode to read the generation */
                lock_mode = xfs_ilock_map_shared(ip);
-
-               /* fill in fid section of handle from inode */
                handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
                                        sizeof(handle.ha_fid.fid_len);
                handle.ha_fid.fid_pad = 0;
                handle.ha_fid.fid_gen = ip->i_d.di_gen;
                handle.ha_fid.fid_ino = ip->i_ino;
-
                xfs_iunlock_map_shared(ip, lock_mode);
 
                hsize = XFS_HSIZE(handle);
        }
 
-       /* now copy our handle into the user buffer & write out the size */
+       error = -EFAULT;
        if (copy_to_user(hreq->ohandle, &handle, hsize) ||
-           copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
-               iput(inode);
-               return -XFS_ERROR(EFAULT);
-       }
+           copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+               goto out_put;
 
-       iput(inode);
-       return 0;
+       error = 0;
+
+ out_put:
+       if (cmd == XFS_IOC_FD_TO_HANDLE)
+               fput(file);
+       else
+               path_put(&path);
+       return error;
 }
 
 /*
index 2940612e3aeb4bf2f47c17bed0ed7982e121f327..6075382336d70b0feca5d7e84b1a53b9926987fd 100644 (file)
@@ -211,8 +211,13 @@ xfs_vn_mknod(
         * Irix uses Missed'em'V split, but doesn't want to see
         * the upper 5 bits of (14bit) major.
         */
-       if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
-               return -EINVAL;
+       if (S_ISCHR(mode) || S_ISBLK(mode)) {
+               if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+                       return -EINVAL;
+               rdev = sysv_encode_dev(rdev);
+       } else {
+               rdev = 0;
+       }
 
        if (test_default_acl && test_default_acl(dir)) {
                if (!_ACL_ALLOC(default_acl)) {
@@ -224,28 +229,11 @@ xfs_vn_mknod(
                }
        }
 
-       xfs_dentry_to_name(&name, dentry);
-
        if (IS_POSIXACL(dir) && !default_acl)
                mode &= ~current_umask();
 
-       switch (mode & S_IFMT) {
-       case S_IFCHR:
-       case S_IFBLK:
-       case S_IFIFO:
-       case S_IFSOCK:
-               rdev = sysv_encode_dev(rdev);
-       case S_IFREG:
-               error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
-               break;
-       case S_IFDIR:
-               error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
-               break;
-       default:
-               error = EINVAL;
-               break;
-       }
-
+       xfs_dentry_to_name(&name, dentry);
+       error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
        if (unlikely(error))
                goto out_free_acl;
 
@@ -553,9 +541,6 @@ xfs_vn_getattr(
        stat->uid = ip->i_d.di_uid;
        stat->gid = ip->i_d.di_gid;
        stat->ino = ip->i_ino;
-#if XFS_BIG_INUMS
-       stat->ino += mp->m_inoadd;
-#endif
        stat->atime = inode->i_atime;
        stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
        stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
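The mknod hunk above now encodes rdev only for character and block special files and forces it to zero for FIFOs, sockets and regular files, since xfs_create() now takes a single rdev argument for all of them. From user space everything arrives through the same mknod(2) call; a small illustration using only standard interfaces:

/*
 * Sketch: one mknod(2) entry point covers device nodes (which carry a
 * dev_t) and FIFOs (which do not).
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>	/* makedev() */
#include <stdio.h>

int main(void)
{
	/* character device: the major/minor pair is what ends up in rdev */
	if (mknod("null-clone", S_IFCHR | 0600, makedev(1, 3)) < 0)
		perror("mknod chr");

	/* FIFO: rdev is meaningless here and the filesystem stores zero */
	if (mknod("demo-fifo", S_IFIFO | 0644, 0) < 0)
		perror("mknod fifo");

	return 0;
}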
index 507492d6dccd0d9de4e8591ba2ca9dfdb831b44d..f65a53f8752f239410a66f6bc564c8c1a3e4dc81 100644 (file)
@@ -38,7 +38,6 @@
 #include <kmem.h>
 #include <mrlock.h>
 #include <sv.h>
-#include <mutex.h>
 #include <time.h>
 
 #include <support/ktrace.h>
@@ -51,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/file.h>
 #include <linux/swap.h>
 #include <linux/errno.h>
 #define SYNCHRONIZE()  barrier()
 #define __return_address __builtin_return_address(0)
 
-/*
- * IRIX (BSD) quotactl makes use of separate commands for user/group,
- * whereas on Linux the syscall encodes this information into the cmd
- * field (see the QCMD macro in quota.h).  These macros help keep the
- * code portable - they are not visible from the syscall interface.
- */
-#define Q_XSETGQLIM    XQM_CMD(8)      /* set groups disk limits */
-#define Q_XGETGQUOTA   XQM_CMD(9)      /* get groups disk limits */
-#define Q_XSETPQLIM    XQM_CMD(10)     /* set projects disk limits */
-#define Q_XGETPQUOTA   XQM_CMD(11)     /* get projects disk limits */
-
 #define dfltprid       0
 #define MAXPATHLEN     1024
 
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
new file mode 100644 (file)
index 0000000..94d9a63
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_dmapi.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "quota/xfs_qm.h"
+#include <linux/quota.h>
+
+
+STATIC int
+xfs_quota_type(int type)
+{
+       switch (type) {
+       case USRQUOTA:
+               return XFS_DQ_USER;
+       case GRPQUOTA:
+               return XFS_DQ_GROUP;
+       default:
+               return XFS_DQ_PROJ;
+       }
+}
+
+STATIC int
+xfs_fs_quota_sync(
+       struct super_block      *sb,
+       int                     type)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       return -xfs_sync_inodes(mp, SYNC_DELWRI);
+}
+
+STATIC int
+xfs_fs_get_xstate(
+       struct super_block      *sb,
+       struct fs_quota_stat    *fqs)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+STATIC int
+xfs_fs_set_xstate(
+       struct super_block      *sb,
+       unsigned int            uflags,
+       int                     op)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+       unsigned int            flags = 0;
+
+       if (sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (uflags & XFS_QUOTA_UDQ_ACCT)
+               flags |= XFS_UQUOTA_ACCT;
+       if (uflags & XFS_QUOTA_PDQ_ACCT)
+               flags |= XFS_PQUOTA_ACCT;
+       if (uflags & XFS_QUOTA_GDQ_ACCT)
+               flags |= XFS_GQUOTA_ACCT;
+       if (uflags & XFS_QUOTA_UDQ_ENFD)
+               flags |= XFS_UQUOTA_ENFD;
+       if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
+               flags |= XFS_OQUOTA_ENFD;
+
+       switch (op) {
+       case Q_XQUOTAON:
+               return -xfs_qm_scall_quotaon(mp, flags);
+       case Q_XQUOTAOFF:
+               if (!XFS_IS_QUOTA_ON(mp))
+                       return -EINVAL;
+               return -xfs_qm_scall_quotaoff(mp, flags);
+       case Q_XQUOTARM:
+               if (XFS_IS_QUOTA_ON(mp))
+                       return -EINVAL;
+               return -xfs_qm_scall_trunc_qfiles(mp, flags);
+       }
+
+       return -EINVAL;
+}
+
+STATIC int
+xfs_fs_get_xquota(
+       struct super_block      *sb,
+       int                     type,
+       qid_t                   id,
+       struct fs_disk_quota    *fdq)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       if (!XFS_IS_QUOTA_ON(mp))
+               return -ESRCH;
+
+       return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
+}
+
+STATIC int
+xfs_fs_set_xquota(
+       struct super_block      *sb,
+       int                     type,
+       qid_t                   id,
+       struct fs_disk_quota    *fdq)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       if (!XFS_IS_QUOTA_ON(mp))
+               return -ESRCH;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
+}
+
+struct quotactl_ops xfs_quotactl_operations = {
+       .quota_sync             = xfs_fs_quota_sync,
+       .get_xstate             = xfs_fs_get_xstate,
+       .set_xstate             = xfs_fs_set_xstate,
+       .get_xquota             = xfs_fs_get_xquota,
+       .set_xquota             = xfs_fs_set_xquota,
+};
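These methods are what the VFS calls when user space issues the XFS quotactl commands against a mounted filesystem. A hedged sketch of such a caller, assuming the XFS quota ABI definitions from <linux/dqblk_xfs.h> (Q_XGETQSTAT and struct fs_quota_stat) and the glibc quotactl(2) wrapper:

/*
 * Sketch: query XFS quota state; the request lands in xfs_fs_get_xstate()
 * above via sb->s_qcop.
 */
#include <sys/quota.h>
#include <linux/dqblk_xfs.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	struct fs_quota_stat qstat;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block-device>\n", argv[0]);
		return 1;
	}

	if (quotactl(QCMD(Q_XGETQSTAT, USRQUOTA), argv[1], 0,
		     (void *)&qstat) < 0) {
		perror("Q_XGETQSTAT");
		return 1;
	}
	printf("quota flags 0x%x, incore dquots %u\n",
	       (unsigned int)qstat.qs_flags, (unsigned int)qstat.qs_incoredqs);
	return 0;
}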
index 32ae5028e96b5cb8c8bafaf5ae93089ee798855a..bb685269f832ede5fa0f97eed990c3a988b26426 100644 (file)
@@ -68,7 +68,6 @@
 #include <linux/freezer.h>
 #include <linux/parser.h>
 
-static struct quotactl_ops xfs_quotactl_operations;
 static struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
@@ -79,7 +78,6 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_RTDEV   "rtdev"         /* realtime I/O device */
 #define MNTOPT_BIOSIZE "biosize"       /* log2 of preferred buffered io size */
 #define MNTOPT_WSYNC   "wsync"         /* safe-mode nfs compatible mount */
-#define MNTOPT_INO64   "ino64"         /* force inodes into 64-bit range */
 #define MNTOPT_NOALIGN "noalign"       /* turn off stripe alignment */
 #define MNTOPT_SWALLOC "swalloc"       /* turn on stripe width allocation */
 #define MNTOPT_SUNIT   "sunit"         /* data volume stripe unit */
@@ -180,7 +178,7 @@ xfs_parseargs(
        int                     dswidth = 0;
        int                     iosize = 0;
        int                     dmapi_implies_ikeep = 1;
-       uchar_t                 iosizelog = 0;
+       __uint8_t               iosizelog = 0;
 
        /*
         * Copy binary VFS mount flags we are interested in.
@@ -291,16 +289,6 @@ xfs_parseargs(
                        mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
                } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
                        mp->m_flags |= XFS_MOUNT_NORECOVERY;
-               } else if (!strcmp(this_char, MNTOPT_INO64)) {
-#if XFS_BIG_INUMS
-                       mp->m_flags |= XFS_MOUNT_INO64;
-                       mp->m_inoadd = XFS_INO64_OFFSET;
-#else
-                       cmn_err(CE_WARN,
-                               "XFS: %s option not allowed on this system",
-                               this_char);
-                       return EINVAL;
-#endif
                } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
                        mp->m_flags |= XFS_MOUNT_NOALIGN;
                } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
@@ -529,7 +517,6 @@ xfs_showargs(
                /* the few simple ones we can get from the mount struct */
                { XFS_MOUNT_IKEEP,              "," MNTOPT_IKEEP },
                { XFS_MOUNT_WSYNC,              "," MNTOPT_WSYNC },
-               { XFS_MOUNT_INO64,              "," MNTOPT_INO64 },
                { XFS_MOUNT_NOALIGN,            "," MNTOPT_NOALIGN },
                { XFS_MOUNT_SWALLOC,            "," MNTOPT_SWALLOC },
                { XFS_MOUNT_NOUUID,             "," MNTOPT_NOUUID },
@@ -634,7 +621,7 @@ xfs_max_file_offset(
        return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
 
-int
+STATIC int
 xfs_blkdev_get(
        xfs_mount_t             *mp,
        const char              *name,
@@ -651,7 +638,7 @@ xfs_blkdev_get(
        return -error;
 }
 
-void
+STATIC void
 xfs_blkdev_put(
        struct block_device     *bdev)
 {
@@ -872,7 +859,7 @@ xfsaild_wakeup(
        wake_up_process(ailp->xa_task);
 }
 
-int
+STATIC int
 xfsaild(
        void    *data)
 {
@@ -990,26 +977,57 @@ xfs_fs_write_inode(
        int                     sync)
 {
        struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
        int                     error = 0;
-       int                     flags = 0;
 
        xfs_itrace_entry(ip);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
        if (sync) {
                error = xfs_wait_on_pages(ip, 0, -1);
                if (error)
-                       goto out_error;
-               flags |= FLUSH_SYNC;
+                       goto out;
        }
-       error = xfs_inode_flush(ip, flags);
 
-out_error:
+       /*
+        * Bypass inodes that have already been cleaned by
+        * the inode flush clustering code inside xfs_iflush.
+        */
+       if (xfs_inode_clean(ip))
+               goto out;
+
+       /*
+        * We make this non-blocking if the inode is contended and return
+        * EAGAIN to indicate to the caller that the flush did not succeed.
+        * This prevents the flush path from blocking on inodes that are
+        * inside another operation right now; they get caught later by
+        * xfs_sync.
+        */
+       if (sync) {
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+               xfs_iflock(ip);
+
+               error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+       } else {
+               error = EAGAIN;
+               if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+                       goto out;
+               if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+                       goto out_unlock;
+
+               error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+       }
+
+ out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ out:
        /*
         * if we failed to write out the inode then mark
         * it dirty again so we'll try again later.
         */
        if (error)
                xfs_mark_inode_dirty_sync(ip);
-
        return -error;
 }
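With this change ->write_inode only blocks on the inode locks for synchronous writeback; the asynchronous path uses trylocks and hands back EAGAIN, so the inode stays marked dirty and is retried later. The shape of that pattern, as a plain user-space analogue (pthreads, not the kernel locking API):

/*
 * User-space analogue of the trylock-or-defer pattern above: block for
 * sync requests, never stall the async writeback path.
 */
#include <errno.h>
#include <pthread.h>

struct object {
	pthread_mutex_t lock;
	int dirty;
};

static int writeback(struct object *o, int sync)
{
	if (sync) {
		pthread_mutex_lock(&o->lock);	/* sync: wait for the lock */
	} else if (pthread_mutex_trylock(&o->lock) != 0) {
		return EAGAIN;			/* async: contended, retry later */
	}

	o->dirty = 0;				/* stand-in for the real flush */
	pthread_mutex_unlock(&o->lock);
	return 0;
}

int main(void)
{
	struct object o = { PTHREAD_MUTEX_INITIALIZER, 1 };

	return writeback(&o, 0);		/* async attempt: 0 or EAGAIN */
}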
 
@@ -1169,18 +1187,12 @@ xfs_fs_statfs(
        statp->f_bfree = statp->f_bavail =
                                sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
        fakeinos = statp->f_bfree << sbp->sb_inopblog;
-#if XFS_BIG_INUMS
-       fakeinos += mp->m_inoadd;
-#endif
        statp->f_files =
            MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
        if (mp->m_maxicount)
-#if XFS_BIG_INUMS
-               if (!mp->m_inoadd)
-#endif
-                       statp->f_files = min_t(typeof(statp->f_files),
-                                               statp->f_files,
-                                               mp->m_maxicount);
+               statp->f_files = min_t(typeof(statp->f_files),
+                                       statp->f_files,
+                                       mp->m_maxicount);
        statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
        spin_unlock(&mp->m_sb_lock);
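With the ino64 offset gone, f_files here is just sb_icount plus an estimate of inodes that could still be created from free space, capped by XFS_MAXINUMBER and, when set, m_maxicount. For reference, the numbers surface in user space through plain statfs(2):

/* Sketch: read back the inode counts that xfs_fs_statfs() fills in. */
#include <sys/vfs.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	struct statfs sf;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mount-point>\n", argv[0]);
		return 1;
	}
	if (statfs(argv[1], &sf) < 0) {
		perror("statfs");
		return 1;
	}
	printf("inodes: %llu total, %llu free\n",
	       (unsigned long long)sf.f_files,
	       (unsigned long long)sf.f_ffree);
	return 0;
}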
 
@@ -1302,57 +1314,6 @@ xfs_fs_show_options(
        return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
 }
 
-STATIC int
-xfs_fs_quotasync(
-       struct super_block      *sb,
-       int                     type)
-{
-       return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL);
-}
-
-STATIC int
-xfs_fs_getxstate(
-       struct super_block      *sb,
-       struct fs_quota_stat    *fqs)
-{
-       return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
-}
-
-STATIC int
-xfs_fs_setxstate(
-       struct super_block      *sb,
-       unsigned int            flags,
-       int                     op)
-{
-       return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags);
-}
-
-STATIC int
-xfs_fs_getxquota(
-       struct super_block      *sb,
-       int                     type,
-       qid_t                   id,
-       struct fs_disk_quota    *fdq)
-{
-       return -XFS_QM_QUOTACTL(XFS_M(sb),
-                                (type == USRQUOTA) ? Q_XGETQUOTA :
-                                 ((type == GRPQUOTA) ? Q_XGETGQUOTA :
-                                  Q_XGETPQUOTA), id, (caddr_t)fdq);
-}
-
-STATIC int
-xfs_fs_setxquota(
-       struct super_block      *sb,
-       int                     type,
-       qid_t                   id,
-       struct fs_disk_quota    *fdq)
-{
-       return -XFS_QM_QUOTACTL(XFS_M(sb),
-                                (type == USRQUOTA) ? Q_XSETQLIM :
-                                 ((type == GRPQUOTA) ? Q_XSETGQLIM :
-                                  Q_XSETPQLIM), id, (caddr_t)fdq);
-}
-
 /*
  * This function fills in xfs_mount_t fields based on mount args.
  * Note: the superblock _has_ now been read in.
@@ -1435,7 +1396,9 @@ xfs_fs_fill_super(
        sb_min_blocksize(sb, BBSIZE);
        sb->s_xattr = xfs_xattr_handlers;
        sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
        sb->s_qcop = &xfs_quotactl_operations;
+#endif
        sb->s_op = &xfs_super_operations;
 
        error = xfs_dmops_get(mp);
@@ -1578,14 +1541,6 @@ static struct super_operations xfs_super_operations = {
        .show_options           = xfs_fs_show_options,
 };
 
-static struct quotactl_ops xfs_quotactl_operations = {
-       .quota_sync             = xfs_fs_quotasync,
-       .get_xstate             = xfs_fs_getxstate,
-       .set_xstate             = xfs_fs_setxstate,
-       .get_xquota             = xfs_fs_getxquota,
-       .set_xquota             = xfs_fs_setxquota,
-};
-
 static struct file_system_type xfs_fs_type = {
        .owner                  = THIS_MODULE,
        .name                   = "xfs",
index d5d776d4cd6780fe92c389a763cc5295c076ee43..5a2ea3a217810be4efc6ebde932ece2d58d90360 100644 (file)
@@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
 extern struct xattr_handler *xfs_xattr_handlers[];
+extern struct quotactl_ops xfs_quotactl_operations;
 
 #define XFS_M(sb)              ((struct xfs_mount *)((sb)->s_fs_info))
 
index 5f6de1efe1f696c51ea5e433120762e0c7e20bf4..04f058c848ae18c320ac7c47c4b1c0e87ca0865b 100644 (file)
@@ -19,6 +19,7 @@
 #define XFS_SYNC_H 1
 
 struct xfs_mount;
+struct xfs_perag;
 
 typedef struct bhv_vfs_sync_work {
        struct list_head        w_list;
index f65983a230d3610b94c192d880dac5a302200c9e..ad7fbead4c97664331fb406d8d2f5a29640b9208 100644 (file)
@@ -40,11 +40,6 @@ struct attrlist_cursor_kern;
 #define IO_ISDIRECT    0x00004         /* bypass page cache */
 #define IO_INVIS       0x00020         /* don't update inode timestamps */
 
-/*
- * Flags for xfs_inode_flush
- */
-#define FLUSH_SYNC             1       /* wait for flush to complete   */
-
 /*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
@@ -54,33 +49,6 @@ struct attrlist_cursor_kern;
                                           Prevent VM access to the pages until
                                           the operation completes. */
 
-/*
- * Dealing with bad inodes
- */
-static inline int VN_BAD(struct inode *vp)
-{
-       return is_bad_inode(vp);
-}
-
-/*
- * Extracting atime values in various formats
- */
-static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
-{
-       bs_atime->tv_sec = vp->i_atime.tv_sec;
-       bs_atime->tv_nsec = vp->i_atime.tv_nsec;
-}
-
-static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
-{
-       *ts = vp->i_atime;
-}
-
-static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
-{
-       *tt = vp->i_atime.tv_sec;
-}
-
 /*
  * Some useful predicates.
  */
index 6543c0b297534a33f1af97d95cda43662a9d700b..e4babcc63423fef26351c5e9255757ec5d4dbba7 100644 (file)
@@ -804,7 +804,7 @@ xfs_qm_dqlookup(
        uint                    flist_locked;
        xfs_dquot_t             *d;
 
-       ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+       ASSERT(mutex_is_locked(&qh->qh_lock));
 
        flist_locked = B_FALSE;
 
@@ -877,7 +877,7 @@ xfs_qm_dqlookup(
                        /*
                         * move the dquot to the front of the hashchain
                         */
-                       ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+                       ASSERT(mutex_is_locked(&qh->qh_lock));
                        if (dqp->HL_PREVP != &qh->qh_next) {
                                xfs_dqtrace_entry(dqp,
                                                  "DQLOOKUP: HASH MOVETOFRONT");
@@ -892,13 +892,13 @@ xfs_qm_dqlookup(
                        }
                        xfs_dqtrace_entry(dqp, "LOOKUP END");
                        *O_dqpp = dqp;
-                       ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+                       ASSERT(mutex_is_locked(&qh->qh_lock));
                        return (0);
                }
        }
 
        *O_dqpp = NULL;
-       ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+       ASSERT(mutex_is_locked(&qh->qh_lock));
        return (1);
 }
 
@@ -956,7 +956,7 @@ xfs_qm_dqget(
                        ASSERT(ip->i_gdquot == NULL);
        }
 #endif
-       XFS_DQ_HASH_LOCK(h);
+       mutex_lock(&h->qh_lock);
 
        /*
         * Look in the cache (hashtable).
@@ -971,7 +971,7 @@ xfs_qm_dqget(
                 */
                ASSERT(*O_dqpp);
                ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-               XFS_DQ_HASH_UNLOCK(h);
+               mutex_unlock(&h->qh_lock);
                xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
                return (0);     /* success */
        }
@@ -991,7 +991,7 @@ xfs_qm_dqget(
         * we don't keep the lock across a disk read
         */
        version = h->qh_version;
-       XFS_DQ_HASH_UNLOCK(h);
+       mutex_unlock(&h->qh_lock);
 
        /*
         * Allocate the dquot on the kernel heap, and read the ondisk
@@ -1056,7 +1056,7 @@ xfs_qm_dqget(
        /*
         * Hashlock comes after ilock in lock order
         */
-       XFS_DQ_HASH_LOCK(h);
+       mutex_lock(&h->qh_lock);
        if (version != h->qh_version) {
                xfs_dquot_t *tmpdqp;
                /*
@@ -1072,7 +1072,7 @@ xfs_qm_dqget(
                         * and start over.
                         */
                        xfs_qm_dqput(tmpdqp);
-                       XFS_DQ_HASH_UNLOCK(h);
+                       mutex_unlock(&h->qh_lock);
                        xfs_qm_dqdestroy(dqp);
                        XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
                        goto again;
@@ -1083,7 +1083,7 @@ xfs_qm_dqget(
         * Put the dquot at the beginning of the hash-chain and mp's list
         * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
         */
-       ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
+       ASSERT(mutex_is_locked(&h->qh_lock));
        dqp->q_hash = h;
        XQM_HASHLIST_INSERT(h, dqp);
 
@@ -1102,7 +1102,7 @@ xfs_qm_dqget(
        XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
 
        xfs_qm_mplist_unlock(mp);
-       XFS_DQ_HASH_UNLOCK(h);
+       mutex_unlock(&h->qh_lock);
  dqret:
        ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
        xfs_dqtrace_entry(dqp, "DQGET DONE");
@@ -1440,7 +1440,7 @@ xfs_qm_dqpurge(
        xfs_mount_t     *mp = dqp->q_mount;
 
        ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
-       ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
+       ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
 
        xfs_dqlock(dqp);
        /*
@@ -1453,7 +1453,7 @@ xfs_qm_dqpurge(
         */
        if (dqp->q_nrefs != 0) {
                xfs_dqunlock(dqp);
-               XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+               mutex_unlock(&dqp->q_hash->qh_lock);
                return (1);
        }
 
@@ -1517,7 +1517,7 @@ xfs_qm_dqpurge(
        memset(&dqp->q_core, 0, sizeof(dqp->q_core));
        xfs_dqfunlock(dqp);
        xfs_dqunlock(dqp);
-       XFS_DQ_HASH_UNLOCK(thishash);
+       mutex_unlock(&thishash->qh_lock);
        return (0);
 }
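The lookup path above takes qh_lock as an ordinary mutex now, but keeps the same concurrency trick: record qh_version, drop the lock around the on-disk read, then relock and recheck the version so a racing insertion is detected and handled. A generic user-space sketch of that unlock/revalidate pattern (pthreads analogue, not the XFS structures):

/*
 * Analogue of the version-revalidation pattern in xfs_qm_dqget(): drop the
 * lock for slow work, then relock and retry if the list changed meanwhile.
 */
#include <pthread.h>
#include <stdio.h>

struct cache {
	pthread_mutex_t lock;
	unsigned int version;		/* bumped on every insertion */
};

static void slow_allocate(void)
{
	/* stand-in for reading the dquot from disk without the lock held */
}

static void insert_entry(struct cache *c)
{
	unsigned int version;

again:
	pthread_mutex_lock(&c->lock);
	version = c->version;		/* remember the generation */
	pthread_mutex_unlock(&c->lock);

	slow_allocate();		/* must not hold the lock here */

	pthread_mutex_lock(&c->lock);
	if (version != c->version) {
		/* somebody raced with us: drop our work and look up again */
		pthread_mutex_unlock(&c->lock);
		goto again;
	}
	c->version++;			/* publish the insertion */
	pthread_mutex_unlock(&c->lock);
}

int main(void)
{
	struct cache c = { PTHREAD_MUTEX_INITIALIZER, 0 };

	insert_entry(&c);
	printf("version %u\n", c.version);
	return 0;
}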
 
index d443e93b43313c4c4f72fef3e14f8e1c2aa6e8a9..de0f402ddb4c2df0f97d040ccfc1df80a9bee190 100644 (file)
@@ -34,7 +34,7 @@
  */
 typedef struct xfs_dqhash {
        struct xfs_dquot *qh_next;
-       mutex_t           qh_lock;
+       struct mutex      qh_lock;
        uint              qh_version;   /* ever increasing version */
        uint              qh_nelems;    /* number of dquots on the list */
 } xfs_dqhash_t;
@@ -81,7 +81,7 @@ typedef struct xfs_dquot {
        xfs_qcnt_t       q_res_bcount;  /* total regular nblks used+reserved */
        xfs_qcnt_t       q_res_icount;  /* total inos allocd+reserved */
        xfs_qcnt_t       q_res_rtbcount;/* total realtime blks used+reserved */
-       mutex_t          q_qlock;       /* quota lock */
+       struct mutex     q_qlock;       /* quota lock */
        struct completion q_flush;      /* flush completion queue */
        atomic_t          q_pincount;   /* dquot pin count */
        wait_queue_head_t q_pinwait;    /* dquot pinning wait queue */
@@ -109,19 +109,6 @@ enum {
 
 #define XFS_DQHOLD(dqp)                ((dqp)->q_nrefs++)
 
-#ifdef DEBUG
-static inline int
-XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
-{
-       if (mutex_trylock(&dqp->q_qlock)) {
-               mutex_unlock(&dqp->q_qlock);
-               return 0;
-       }
-       return 1;
-}
-#endif
-
-
 /*
  * Manage the q_flush completion queue embedded in the dquot.  This completion
  * queue synchronizes processes attempting to flush the in-core dquot back to
@@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
        complete(&dqp->q_flush);
 }
 
+#define XFS_DQ_IS_LOCKED(dqp)  (mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)   ((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_USER)
index 7a2beb64314fee1f50d35b65b1b501669dd35c9b..5b6695049e0037849751e1fff5a908b012962c29 100644 (file)
@@ -55,7 +55,7 @@
  * quota functionality, including maintaining the freelist and hash
  * tables of dquots.
  */
-mutex_t                xfs_Gqm_lock;
+struct mutex   xfs_Gqm_lock;
 struct xfs_qm  *xfs_Gqm;
 uint           ndquot;
 
@@ -69,8 +69,6 @@ STATIC void   xfs_qm_list_destroy(xfs_dqlist_t *);
 
 STATIC void    xfs_qm_freelist_init(xfs_frlist_t *);
 STATIC void    xfs_qm_freelist_destroy(xfs_frlist_t *);
-STATIC int     xfs_qm_mplist_nowait(xfs_mount_t *);
-STATIC int     xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
 
 STATIC int     xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int     xfs_qm_init_quotainfo(xfs_mount_t *);
@@ -82,7 +80,7 @@ static struct shrinker xfs_qm_shaker = {
 };
 
 #ifdef DEBUG
-extern mutex_t qcheck_lock;
+extern struct mutex    qcheck_lock;
 #endif
 
 #ifdef QUOTADEBUG
@@ -219,7 +217,7 @@ xfs_qm_hold_quotafs_ref(
         * the structure could disappear between the entry to this routine and
         * a HOLD operation if not locked.
         */
-       XFS_QM_LOCK(xfs_Gqm);
+       mutex_lock(&xfs_Gqm_lock);
 
        if (xfs_Gqm == NULL)
                xfs_Gqm = xfs_Gqm_init();
@@ -228,8 +226,8 @@ xfs_qm_hold_quotafs_ref(
         * debugging and statistical purposes, but ...
         * Just take a reference and get out.
         */
-       XFS_QM_HOLD(xfs_Gqm);
-       XFS_QM_UNLOCK(xfs_Gqm);
+       xfs_Gqm->qm_nrefs++;
+       mutex_unlock(&xfs_Gqm_lock);
 
        return 0;
 }
@@ -277,13 +275,12 @@ xfs_qm_rele_quotafs_ref(
         * Destroy the entire XQM. If somebody mounts with quotaon, this'll
         * be restarted.
         */
-       XFS_QM_LOCK(xfs_Gqm);
-       XFS_QM_RELE(xfs_Gqm);
-       if (xfs_Gqm->qm_nrefs == 0) {
+       mutex_lock(&xfs_Gqm_lock);
+       if (--xfs_Gqm->qm_nrefs == 0) {
                xfs_qm_destroy(xfs_Gqm);
                xfs_Gqm = NULL;
        }
-       XFS_QM_UNLOCK(xfs_Gqm);
+       mutex_unlock(&xfs_Gqm_lock);
 }
 
 /*
@@ -577,10 +574,10 @@ xfs_qm_dqpurge_int(
                        continue;
                }
 
-               if (! xfs_qm_dqhashlock_nowait(dqp)) {
+               if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
                        nrecl = XFS_QI_MPLRECLAIMS(mp);
                        xfs_qm_mplist_unlock(mp);
-                       XFS_DQ_HASH_LOCK(dqp->q_hash);
+                       mutex_lock(&dqp->q_hash->qh_lock);
                        xfs_qm_mplist_lock(mp);
 
                        /*
@@ -590,7 +587,7 @@ xfs_qm_dqpurge_int(
                         * this point, but somebody might be taking things off.
                         */
                        if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
-                               XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+                               mutex_unlock(&dqp->q_hash->qh_lock);
                                goto again;
                        }
                }
@@ -632,7 +629,6 @@ xfs_qm_dqattach_one(
        xfs_dqid_t      id,
        uint            type,
        uint            doalloc,
-       uint            dolock,
        xfs_dquot_t     *udqhint, /* hint */
        xfs_dquot_t     **IO_idqpp)
 {
@@ -641,16 +637,16 @@ xfs_qm_dqattach_one(
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        error = 0;
+
        /*
         * See if we already have it in the inode itself. IO_idqpp is
         * &i_udquot or &i_gdquot. This made the code look weird, but
         * made the logic a lot simpler.
         */
-       if ((dqp = *IO_idqpp)) {
-               if (dolock)
-                       xfs_dqlock(dqp);
+       dqp = *IO_idqpp;
+       if (dqp) {
                xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
-               goto done;
+               return 0;
        }
 
        /*
@@ -659,38 +655,38 @@ xfs_qm_dqattach_one(
         * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
         * the user dquot.
         */
-       ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
-       if (udqhint && !dolock)
+       if (udqhint) {
+               ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
                xfs_dqlock(udqhint);
 
-       /*
-        * No need to take dqlock to look at the id.
-        * The ID can't change until it gets reclaimed, and it won't
-        * be reclaimed as long as we have a ref from inode and we hold
-        * the ilock.
-        */
-       if (udqhint &&
-           (dqp = udqhint->q_gdquot) &&
-           (be32_to_cpu(dqp->q_core.d_id) == id)) {
-               ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-               xfs_dqlock(dqp);
-               XFS_DQHOLD(dqp);
-               ASSERT(*IO_idqpp == NULL);
-               *IO_idqpp = dqp;
-               if (!dolock) {
+               /*
+                * No need to take dqlock to look at the id.
+                *
+                * The ID can't change until the dquot gets reclaimed, and it
+                * won't be reclaimed as long as we have a reference from the
+                * inode and we hold the ilock.
+                */
+               dqp = udqhint->q_gdquot;
+               if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
+                       xfs_dqlock(dqp);
+                       XFS_DQHOLD(dqp);
+                       ASSERT(*IO_idqpp == NULL);
+                       *IO_idqpp = dqp;
+
                        xfs_dqunlock(dqp);
                        xfs_dqunlock(udqhint);
+                       return 0;
                }
-               goto done;
-       }
-       /*
-        * We can't hold a dquot lock when we call the dqget code.
-        * We'll deadlock in no time, because of (not conforming to)
-        * lock ordering - the inodelock comes before any dquot lock,
-        * and we may drop and reacquire the ilock in xfs_qm_dqget().
-        */
-       if (udqhint)
+
+               /*
+                * We can't hold a dquot lock when we call the dqget code.
+                * We can't hold a dquot lock when we call the dqget code;
+                * we would deadlock in no time because it violates the lock
+                * ordering - the inode lock comes before any dquot lock, and
+                * we may drop and reacquire the ilock in xfs_qm_dqget().
                xfs_dqunlock(udqhint);
+       }
+
        /*
         * Find the dquot from somewhere. This bumps the
         * reference count of dquot and returns it locked.
@@ -698,48 +694,19 @@ xfs_qm_dqattach_one(
         * disk and we didn't ask it to allocate;
         * ESRCH if quotas got turned off suddenly.
         */
-       if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
-                                doalloc|XFS_QMOPT_DOWARN, &dqp))) {
-               if (udqhint && dolock)
-                       xfs_dqlock(udqhint);
-               goto done;
-       }
+       error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+       if (error)
+               return error;
 
        xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
+
        /*
         * dqget may have dropped and re-acquired the ilock, but it guarantees
         * that the dquot returned is the one that should go in the inode.
         */
        *IO_idqpp = dqp;
-       ASSERT(dqp);
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       if (! dolock) {
-               xfs_dqunlock(dqp);
-               goto done;
-       }
-       if (! udqhint)
-               goto done;
-
-       ASSERT(udqhint);
-       ASSERT(dolock);
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       if (! xfs_qm_dqlock_nowait(udqhint)) {
-               xfs_dqunlock(dqp);
-               xfs_dqlock(udqhint);
-               xfs_dqlock(dqp);
-       }
-      done:
-#ifdef QUOTADEBUG
-       if (udqhint) {
-               if (dolock)
-                       ASSERT(XFS_DQ_IS_LOCKED(udqhint));
-       }
-       if (! error) {
-               if (dolock)
-                       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       }
-#endif
-       return error;
+       xfs_dqunlock(dqp);
+       return 0;
 }
 
 
@@ -754,24 +721,15 @@ xfs_qm_dqattach_one(
 STATIC void
 xfs_qm_dqattach_grouphint(
        xfs_dquot_t     *udq,
-       xfs_dquot_t     *gdq,
-       uint            locked)
+       xfs_dquot_t     *gdq)
 {
        xfs_dquot_t     *tmp;
 
-#ifdef QUOTADEBUG
-       if (locked) {
-               ASSERT(XFS_DQ_IS_LOCKED(udq));
-               ASSERT(XFS_DQ_IS_LOCKED(gdq));
-       }
-#endif
-       if (! locked)
-               xfs_dqlock(udq);
+       xfs_dqlock(udq);
 
        if ((tmp = udq->q_gdquot)) {
                if (tmp == gdq) {
-                       if (! locked)
-                               xfs_dqunlock(udq);
+                       xfs_dqunlock(udq);
                        return;
                }
 
@@ -781,8 +739,6 @@ xfs_qm_dqattach_grouphint(
                 * because the freelist lock comes before dqlocks.
                 */
                xfs_dqunlock(udq);
-               if (locked)
-                       xfs_dqunlock(gdq);
                /*
                 * we took a hard reference once upon a time in dqget,
                 * so give it back when the udquot no longer points at it
@@ -795,9 +751,7 @@ xfs_qm_dqattach_grouphint(
 
        } else {
                ASSERT(XFS_DQ_IS_LOCKED(udq));
-               if (! locked) {
-                       xfs_dqlock(gdq);
-               }
+               xfs_dqlock(gdq);
        }
 
        ASSERT(XFS_DQ_IS_LOCKED(udq));
@@ -810,10 +764,9 @@ xfs_qm_dqattach_grouphint(
                XFS_DQHOLD(gdq);
                udq->q_gdquot = gdq;
        }
-       if (! locked) {
-               xfs_dqunlock(gdq);
-               xfs_dqunlock(udq);
-       }
+
+       xfs_dqunlock(gdq);
+       xfs_dqunlock(udq);
 }
 
 
@@ -821,8 +774,6 @@ xfs_qm_dqattach_grouphint(
  * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
  * into account.
  * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
- * much made this code a complete mess, but it has been pretty useful.
  * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
  * Inode may get unlocked and relocked in here, and the caller must deal with
  * the consequences.
@@ -851,7 +802,6 @@ xfs_qm_dqattach(
        if (XFS_IS_UQUOTA_ON(mp)) {
                error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
                                                flags & XFS_QMOPT_DQALLOC,
-                                               flags & XFS_QMOPT_DQLOCK,
                                                NULL, &ip->i_udquot);
                if (error)
                        goto done;
@@ -863,11 +813,9 @@ xfs_qm_dqattach(
                error = XFS_IS_GQUOTA_ON(mp) ?
                        xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
                                                flags & XFS_QMOPT_DQALLOC,
-                                               flags & XFS_QMOPT_DQLOCK,
                                                ip->i_udquot, &ip->i_gdquot) :
                        xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
                                                flags & XFS_QMOPT_DQALLOC,
-                                               flags & XFS_QMOPT_DQLOCK,
                                                ip->i_udquot, &ip->i_gdquot);
                /*
                 * Don't worry about the udquot that we may have
@@ -898,22 +846,13 @@ xfs_qm_dqattach(
                /*
                 * Attach i_gdquot to the gdquot hint inside the i_udquot.
                 */
-               xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
-                                        flags & XFS_QMOPT_DQLOCK);
+               xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
        }
 
       done:
 
 #ifdef QUOTADEBUG
        if (! error) {
-               if (ip->i_udquot) {
-                       if (flags & XFS_QMOPT_DQLOCK)
-                               ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
-               }
-               if (ip->i_gdquot) {
-                       if (flags & XFS_QMOPT_DQLOCK)
-                               ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
-               }
                if (XFS_IS_UQUOTA_ON(mp))
                        ASSERT(ip->i_udquot);
                if (XFS_IS_OQUOTA_ON(mp))
@@ -2086,7 +2025,7 @@ xfs_qm_shake_freelist(
                 * a dqlookup process that holds the hashlock that is
                 * waiting for the freelist lock.
                 */
-               if (! xfs_qm_dqhashlock_nowait(dqp)) {
+               if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
                        xfs_dqfunlock(dqp);
                        xfs_dqunlock(dqp);
                        dqp = dqp->dq_flnext;
@@ -2103,7 +2042,7 @@ xfs_qm_shake_freelist(
                        /* XXX put a sentinel so that we can come back here */
                        xfs_dqfunlock(dqp);
                        xfs_dqunlock(dqp);
-                       XFS_DQ_HASH_UNLOCK(hash);
+                       mutex_unlock(&hash->qh_lock);
                        xfs_qm_freelist_unlock(xfs_Gqm);
                        if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
                                return nreclaimed;
@@ -2120,7 +2059,7 @@ xfs_qm_shake_freelist(
                XQM_HASHLIST_REMOVE(hash, dqp);
                xfs_dqfunlock(dqp);
                xfs_qm_mplist_unlock(dqp->q_mount);
-               XFS_DQ_HASH_UNLOCK(hash);
+               mutex_unlock(&hash->qh_lock);
 
  off_freelist:
                XQM_FREELIST_REMOVE(dqp);
@@ -2262,7 +2201,7 @@ xfs_qm_dqreclaim_one(void)
                        continue;
                }
 
-               if (! xfs_qm_dqhashlock_nowait(dqp))
+               if (!mutex_trylock(&dqp->q_hash->qh_lock))
                        goto mplistunlock;
 
                ASSERT(dqp->q_nrefs == 0);
@@ -2271,7 +2210,7 @@ xfs_qm_dqreclaim_one(void)
                XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
                XQM_FREELIST_REMOVE(dqp);
                dqpout = dqp;
-               XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+               mutex_unlock(&dqp->q_hash->qh_lock);
  mplistunlock:
                xfs_qm_mplist_unlock(dqp->q_mount);
                xfs_dqfunlock(dqp);
@@ -2774,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
 {
        xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
 }
-
-STATIC int
-xfs_qm_dqhashlock_nowait(
-       xfs_dquot_t *dqp)
-{
-       int locked;
-
-       locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
-       return locked;
-}
-
-int
-xfs_qm_freelist_lock_nowait(
-       xfs_qm_t *xqm)
-{
-       int locked;
-
-       locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
-       return locked;
-}
-
-STATIC int
-xfs_qm_mplist_nowait(
-       xfs_mount_t     *mp)
-{
-       int locked;
-
-       ASSERT(mp->m_quotainfo);
-       locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
-       return locked;
-}
index ddf09166387c0e89c168329b2c15ea34ca52cefd..a371954cae1b6a86eaf296181b0b7c513a4a3130 100644 (file)
@@ -27,7 +27,7 @@ struct xfs_qm;
 struct xfs_inode;
 
 extern uint            ndquot;
-extern mutex_t         xfs_Gqm_lock;
+extern struct mutex    xfs_Gqm_lock;
 extern struct xfs_qm   *xfs_Gqm;
 extern kmem_zone_t     *qm_dqzone;
 extern kmem_zone_t     *qm_dqtrxzone;
@@ -79,7 +79,7 @@ typedef xfs_dqhash_t  xfs_dqlist_t;
 typedef struct xfs_frlist {
        struct xfs_dquot *qh_next;
        struct xfs_dquot *qh_prev;
-       mutex_t          qh_lock;
+       struct mutex     qh_lock;
        uint             qh_version;
        uint             qh_nelems;
 } xfs_frlist_t;
@@ -115,7 +115,7 @@ typedef struct xfs_quotainfo {
        xfs_qwarncnt_t   qi_bwarnlimit;  /* limit for blks warnings */
        xfs_qwarncnt_t   qi_iwarnlimit;  /* limit for inodes warnings */
        xfs_qwarncnt_t   qi_rtbwarnlimit;/* limit for rt blks warnings */
-       mutex_t          qi_quotaofflock;/* to serialize quotaoff */
+       struct mutex     qi_quotaofflock;/* to serialize quotaoff */
        xfs_filblks_t    qi_dqchunklen;  /* # BBs in a chunk of dqs */
        uint             qi_dqperchunk;  /* # ondisk dqs in above chunk */
        xfs_qcnt_t       qi_bhardlimit;  /* default data blk hard limit */
@@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_IWARNLIMIT      5
 #define XFS_QM_RTBWARNLIMIT    5
 
-#define XFS_QM_LOCK(xqm)       (mutex_lock(&xqm##_lock))
-#define XFS_QM_UNLOCK(xqm)     (mutex_unlock(&xqm##_lock))
-#define XFS_QM_HOLD(xqm)       ((xqm)->qm_nrefs++)
-#define XFS_QM_RELE(xqm)       ((xqm)->qm_nrefs--)
-
 extern void            xfs_qm_destroy_quotainfo(xfs_mount_t *);
 extern void            xfs_qm_mount_quotas(xfs_mount_t *);
 extern int             xfs_qm_quotacheck(xfs_mount_t *);
@@ -178,6 +173,16 @@ extern void                xfs_qm_dqdetach(xfs_inode_t *);
 extern int             xfs_qm_dqpurge_all(xfs_mount_t *, uint);
 extern void            xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
 
+/* quota ops */
+extern int             xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+extern int             xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+                                       fs_disk_quota_t *);
+extern int             xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+                                       fs_disk_quota_t *);
+extern int             xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+extern int             xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+extern int             xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+
 /* vop stuff */
 extern int             xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
                                        uid_t, gid_t, prid_t, uint,
@@ -194,11 +199,6 @@ extern int         xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
 /* list stuff */
 extern void            xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
 extern void            xfs_qm_freelist_unlink(xfs_dquot_t *);
-extern int             xfs_qm_freelist_lock_nowait(xfs_qm_t *);
-
-/* system call interface */
-extern int             xfs_qm_quotactl(struct xfs_mount *, int, int,
-                               xfs_caddr_t);
 
 #ifdef DEBUG
 extern int             xfs_qm_internalqcheck(xfs_mount_t *);
index bc6c5cca3e1251d141d7e80084cb6757e9d41214..63037c689a4b2df95888afe7231ff7915ecfa73a 100644 (file)
@@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = {
        .xfs_dqvopchownresv     = xfs_qm_vop_chown_reserve,
        .xfs_dqstatvfs          = xfs_qm_statvfs,
        .xfs_dqsync             = xfs_qm_sync,
-       .xfs_quotactl           = xfs_qm_quotactl,
        .xfs_dqtrxops           = &xfs_trans_dquot_ops,
 };
 EXPORT_SYMBOL(xfs_qmcore_xfs);
index 68139b38aedef0f9e11806b3d69bab1537871a2e..c7b66f6506ced750585d266ae76e4c61debd2496 100644 (file)
 # define qdprintk(s, args...)  do { } while (0)
 #endif
 
-STATIC int     xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-STATIC int     xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
-                                       fs_disk_quota_t *);
-STATIC int     xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-STATIC int     xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
-                                       fs_disk_quota_t *);
-STATIC int     xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-STATIC int     xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
 STATIC int     xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int     xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
                                        uint);
-STATIC uint    xfs_qm_import_flags(uint);
 STATIC uint    xfs_qm_export_flags(uint);
-STATIC uint    xfs_qm_import_qtype_flags(uint);
 STATIC uint    xfs_qm_export_qtype_flags(uint);
 STATIC void    xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
                                        fs_disk_quota_t *);
 
 
-/*
- * The main distribution switch of all XFS quotactl system calls.
- */
-int
-xfs_qm_quotactl(
-       xfs_mount_t     *mp,
-       int             cmd,
-       int             id,
-       xfs_caddr_t     addr)
-{
-       int             error;
-
-       ASSERT(addr != NULL || cmd == Q_XQUOTASYNC);
-
-       /*
-        * The following commands are valid even when quotaoff.
-        */
-       switch (cmd) {
-       case Q_XQUOTARM:
-               /*
-                * Truncate quota files. quota must be off.
-                */
-               if (XFS_IS_QUOTA_ON(mp))
-                       return XFS_ERROR(EINVAL);
-               if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return XFS_ERROR(EROFS);
-               return (xfs_qm_scall_trunc_qfiles(mp,
-                              xfs_qm_import_qtype_flags(*(uint *)addr)));
-
-       case Q_XGETQSTAT:
-               /*
-                * Get quota status information.
-                */
-               return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
-
-       case Q_XQUOTAON:
-               /*
-                * QUOTAON - enabling quota enforcement.
-                * Quota accounting must be turned on at mount time.
-                */
-               if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return XFS_ERROR(EROFS);
-               return (xfs_qm_scall_quotaon(mp,
-                                         xfs_qm_import_flags(*(uint *)addr)));
-
-       case Q_XQUOTAOFF:
-               if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return XFS_ERROR(EROFS);
-               break;
-
-       case Q_XQUOTASYNC:
-               return xfs_sync_inodes(mp, SYNC_DELWRI);
-
-       default:
-               break;
-       }
-
-       if (! XFS_IS_QUOTA_ON(mp))
-               return XFS_ERROR(ESRCH);
-
-       switch (cmd) {
-       case Q_XQUOTAOFF:
-               if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return XFS_ERROR(EROFS);
-               error = xfs_qm_scall_quotaoff(mp,
-                                           xfs_qm_import_flags(*(uint *)addr),
-                                           B_FALSE);
-               break;
-
-       case Q_XGETQUOTA:
-               error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-                                       (fs_disk_quota_t *)addr);
-               break;
-       case Q_XGETGQUOTA:
-               error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-                                       (fs_disk_quota_t *)addr);
-               break;
-       case Q_XGETPQUOTA:
-               error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-                                       (fs_disk_quota_t *)addr);
-               break;
-
-       case Q_XSETQLIM:
-               if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return XFS_ERROR(EROFS);
-               error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
-                                            (fs_disk_quota_t *)addr);
-               break;
-       case Q_XSETGQLIM:
-               if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return XFS_ERROR(EROFS);
-               error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
-                                            (fs_disk_quota_t *)addr);
-               break;
-       case Q_XSETPQLIM:
-               if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return XFS_ERROR(EROFS);
-               error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ,
-                                            (fs_disk_quota_t *)addr);
-               break;
-
-       default:
-               error = XFS_ERROR(EINVAL);
-               break;
-       }
-
-       return (error);
-}
-
 /*
  * Turn off quota accounting and/or enforcement for all udquots and/or
  * gdquots. Called only at unmount time.
@@ -193,11 +74,10 @@ xfs_qm_quotactl(
  * incore, and modifies the ondisk dquot directly. Therefore, for example,
  * it is an error to call this twice, without purging the cache.
  */
-STATIC int
+int
 xfs_qm_scall_quotaoff(
        xfs_mount_t             *mp,
-       uint                    flags,
-       boolean_t               force)
+       uint                    flags)
 {
        uint                    dqtype;
        int                     error;
@@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff(
        xfs_qoff_logitem_t      *qoffstart;
        int                     nculprits;
 
-       if (!force && !capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
        /*
         * No file system can have quotas enabled on disk but not in core.
         * Note that quota utilities (like quotaoff) _expect_
@@ -375,7 +253,7 @@ out_error:
        return (error);
 }
 
-STATIC int
+int
 xfs_qm_scall_trunc_qfiles(
        xfs_mount_t     *mp,
        uint            flags)
@@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles(
        int             error = 0, error2 = 0;
        xfs_inode_t     *qip;
 
-       if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
        if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
                qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
                return XFS_ERROR(EINVAL);
@@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles(
  * effect immediately.
  * (Switching on quota accounting must be done at mount time.)
  */
-STATIC int
+int
 xfs_qm_scall_quotaon(
        xfs_mount_t     *mp,
        uint            flags)
@@ -426,9 +302,6 @@ xfs_qm_scall_quotaon(
        uint            accflags;
        __int64_t       sbflags;
 
-       if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
-
        flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
        /*
         * Switching on quota accounting must be done at mount time.
@@ -517,7 +390,7 @@ xfs_qm_scall_quotaon(
 /*
  * Return quota status information, such as uquota-off, enforcements, etc.
  */
-STATIC int
+int
 xfs_qm_scall_getqstat(
        xfs_mount_t     *mp,
        fs_quota_stat_t *out)
@@ -582,7 +455,7 @@ xfs_qm_scall_getqstat(
 /*
  * Adjust quota limits, and start/stop timers accordingly.
  */
-STATIC int
+int
 xfs_qm_scall_setqlim(
        xfs_mount_t             *mp,
        xfs_dqid_t              id,
@@ -595,9 +468,6 @@ xfs_qm_scall_setqlim(
        int                     error;
        xfs_qcnt_t              hard, soft;
 
-       if (!capable(CAP_SYS_ADMIN))
-               return XFS_ERROR(EPERM);
-
        if ((newlim->d_fieldmask &
            (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0)
                return (0);
@@ -742,7 +612,7 @@ xfs_qm_scall_setqlim(
        return error;
 }
 
-STATIC int
+int
 xfs_qm_scall_getquota(
        xfs_mount_t     *mp,
        xfs_dqid_t      id,
@@ -934,30 +804,6 @@ xfs_qm_export_dquot(
 #endif
 }
 
-STATIC uint
-xfs_qm_import_qtype_flags(
-       uint            uflags)
-{
-       uint            oflags = 0;
-
-       /*
-        * Can't be more than one, or none.
-        */
-       if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
-                       (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
-           ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ==
-                       (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) ||
-           ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ==
-                       (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) ||
-           ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0))
-               return (0);
-
-       oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0;
-       oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0;
-       oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0;
-       return oflags;
-}
-
 STATIC uint
 xfs_qm_export_qtype_flags(
        uint flags)
@@ -978,26 +824,6 @@ xfs_qm_export_qtype_flags(
                        XFS_PROJ_QUOTA : XFS_GROUP_QUOTA;
 }
 
-STATIC uint
-xfs_qm_import_flags(
-       uint uflags)
-{
-       uint flags = 0;
-
-       if (uflags & XFS_QUOTA_UDQ_ACCT)
-               flags |= XFS_UQUOTA_ACCT;
-       if (uflags & XFS_QUOTA_PDQ_ACCT)
-               flags |= XFS_PQUOTA_ACCT;
-       if (uflags & XFS_QUOTA_GDQ_ACCT)
-               flags |= XFS_GQUOTA_ACCT;
-       if (uflags & XFS_QUOTA_UDQ_ENFD)
-               flags |= XFS_UQUOTA_ENFD;
-       if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD))
-               flags |= XFS_OQUOTA_ENFD;
-       return (flags);
-}
-
-
 STATIC uint
 xfs_qm_export_flags(
        uint flags)
@@ -1134,7 +960,7 @@ xfs_dqhash_t *qmtest_udqtab;
 xfs_dqhash_t *qmtest_gdqtab;
 int          qmtest_hashmask;
 int          qmtest_nfails;
-mutex_t              qcheck_lock;
+struct mutex  qcheck_lock;
 
 #define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
                                 (__psunsigned_t)(id)) & \
index c4fcea600bc2f15ed7087653bbe6728bc37a71a4..8286b2842b6bc01ef06a566b9c975b2107144ad4 100644 (file)
 #define XFS_QI_QOFFLOCK(mp)    ((mp)->m_quotainfo->qi_quotaofflock)
 
 #define XFS_QI_MPL_LIST(mp)    ((mp)->m_quotainfo->qi_dqlist)
-#define XFS_QI_MPLLOCK(mp)     ((mp)->m_quotainfo->qi_dqlist.qh_lock)
 #define XFS_QI_MPLNEXT(mp)     ((mp)->m_quotainfo->qi_dqlist.qh_next)
 #define XFS_QI_MPLNDQUOTS(mp)  ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
 
-#define XQMLCK(h)                      (mutex_lock(&((h)->qh_lock)))
-#define XQMUNLCK(h)                    (mutex_unlock(&((h)->qh_lock)))
-#ifdef DEBUG
-struct xfs_dqhash;
-static inline int XQMISLCKD(struct xfs_dqhash *h)
-{
-       if (mutex_trylock(&h->qh_lock)) {
-               mutex_unlock(&h->qh_lock);
-               return 0;
-       }
-       return 1;
-}
-#endif
-
-#define XFS_DQ_HASH_LOCK(h)            XQMLCK(h)
-#define XFS_DQ_HASH_UNLOCK(h)          XQMUNLCK(h)
-#define XFS_DQ_IS_HASH_LOCKED(h)       XQMISLCKD(h)
-
-#define xfs_qm_mplist_lock(mp)         XQMLCK(&(XFS_QI_MPL_LIST(mp)))
-#define xfs_qm_mplist_unlock(mp)       XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
-#define XFS_QM_IS_MPLIST_LOCKED(mp)    XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
-
-#define xfs_qm_freelist_lock(qm)       XQMLCK(&((qm)->qm_dqfreelist))
-#define xfs_qm_freelist_unlock(qm)     XQMUNLCK(&((qm)->qm_dqfreelist))
+#define xfs_qm_mplist_lock(mp) \
+       mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define xfs_qm_mplist_nowait(mp) \
+       mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define xfs_qm_mplist_unlock(mp) \
+       mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock))
+#define XFS_QM_IS_MPLIST_LOCKED(mp) \
+       mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock))
+
+#define xfs_qm_freelist_lock(qm) \
+       mutex_lock(&((qm)->qm_dqfreelist.qh_lock))
+#define xfs_qm_freelist_lock_nowait(qm) \
+       mutex_trylock(&((qm)->qm_dqfreelist.qh_lock))
+#define xfs_qm_freelist_unlock(qm) \
+       mutex_unlock(&((qm)->qm_dqfreelist.qh_lock))
 
 /*
  * Hash into a bucket in the dquot hash table, based on <mp, id>.
index 99611381e74043f9974651d2179c08cbc83f9675..447173bcf96de899b7369e66e232363fb5cd5eae 100644 (file)
@@ -624,10 +624,9 @@ xfs_trans_dqresv(
        xfs_qcnt_t      *resbcountp;
        xfs_quotainfo_t *q = mp->m_quotainfo;
 
-       if (! (flags & XFS_QMOPT_DQLOCK)) {
-               xfs_dqlock(dqp);
-       }
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+       xfs_dqlock(dqp);
+
        if (flags & XFS_TRANS_DQ_RES_BLKS) {
                hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
                if (!hardlimit)
@@ -740,10 +739,8 @@ xfs_trans_dqresv(
        ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
 
 error_return:
-       if (! (flags & XFS_QMOPT_DQLOCK)) {
-               xfs_dqunlock(dqp);
-       }
-       return (error);
+       xfs_dqunlock(dqp);
+       return error;
 }
 
 
@@ -753,8 +750,7 @@ error_return:
  * grp/prj quotas is important, because this follows a both-or-nothing
  * approach.
  *
- * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked.
- *        XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
  *        XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT.  Used by pquota.
  *        XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
  *        XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
index ae5482965424d9ae4dddbed76bbff558bb86bb5d..3f3610a7ee059210eea3beadbffe13f87609dce9 100644 (file)
@@ -24,6 +24,7 @@
 #include "xfs_ag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
+#include "xfs_error.h"
 
 static char            message[1024];  /* keep it off the stack */
 static DEFINE_SPINLOCK(xfs_err_lock);
index 5830c040ea7ebba66274eeebbdab553c206ab981..b83f76b6d4106d87928bc0c69c4d8be0d169d168 100644 (file)
  */
 #include <xfs.h>
 
-static DEFINE_MUTEX(uuid_monitor);
-static int     uuid_table_size;
-static uuid_t  *uuid_table;
-
 /* IRIX interpretation of an uuid_t */
 typedef struct {
        __be32  uu_timelow;
@@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
        fsid[1] = be32_to_cpu(uup->uu_timelow);
 }
 
-void
-uuid_create_nil(uuid_t *uuid)
-{
-       memset(uuid, 0, sizeof(*uuid));
-}
-
 int
 uuid_is_nil(uuid_t *uuid)
 {
@@ -71,64 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
 {
        return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
 }
-
-/*
- * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
- * 64-bit words.  NOTE: This function can not be changed EVER.  Although
- * brain-dead, some applications depend on this 64-bit value remaining
- * persistent.  Specifically, DMI vendors store the value as a persistent
- * filehandle.
- */
-__uint64_t
-uuid_hash64(uuid_t *uuid)
-{
-       __uint64_t      *sp = (__uint64_t *)uuid;
-
-       return sp[0] + sp[1];
-}
-
-int
-uuid_table_insert(uuid_t *uuid)
-{
-       int     i, hole;
-
-       mutex_lock(&uuid_monitor);
-       for (i = 0, hole = -1; i < uuid_table_size; i++) {
-               if (uuid_is_nil(&uuid_table[i])) {
-                       hole = i;
-                       continue;
-               }
-               if (uuid_equal(uuid, &uuid_table[i])) {
-                       mutex_unlock(&uuid_monitor);
-                       return 0;
-               }
-       }
-       if (hole < 0) {
-               uuid_table = kmem_realloc(uuid_table,
-                       (uuid_table_size + 1) * sizeof(*uuid_table),
-                       uuid_table_size  * sizeof(*uuid_table),
-                       KM_SLEEP);
-               hole = uuid_table_size++;
-       }
-       uuid_table[hole] = *uuid;
-       mutex_unlock(&uuid_monitor);
-       return 1;
-}
-
-void
-uuid_table_remove(uuid_t *uuid)
-{
-       int     i;
-
-       mutex_lock(&uuid_monitor);
-       for (i = 0; i < uuid_table_size; i++) {
-               if (uuid_is_nil(&uuid_table[i]))
-                       continue;
-               if (!uuid_equal(uuid, &uuid_table[i]))
-                       continue;
-               uuid_create_nil(&uuid_table[i]);
-               break;
-       }
-       ASSERT(i < uuid_table_size);
-       mutex_unlock(&uuid_monitor);
-}
index cff5b607d445e80888a73de26c2a23952bf30a08..4732d71262cc3f6f3aca0f143a96f81094bf5a22 100644 (file)
@@ -22,12 +22,8 @@ typedef struct {
        unsigned char   __u_bits[16];
 } uuid_t;
 
-extern void uuid_create_nil(uuid_t *uuid);
 extern int uuid_is_nil(uuid_t *uuid);
 extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
 extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
-extern __uint64_t uuid_hash64(uuid_t *uuid);
-extern int uuid_table_insert(uuid_t *uuid);
-extern void uuid_table_remove(uuid_t *uuid);
 
 #endif /* __XFS_SUPPORT_UUID_H__ */
index 143d63ecb20aa86725a1c6ae12b177d44d362d40..c8641f713caae0c97d9ad08481de1627626bb007 100644 (file)
@@ -223,8 +223,8 @@ typedef struct xfs_perag
                be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
 #define        XFS_MIN_FREELIST_PAG(pag,mp)    \
        (XFS_MIN_FREELIST_RAW(          \
-               (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
-               (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
+               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+               (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
 
 #define XFS_AGB_TO_FSB(mp,agno,agbno)  \
        (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
index 028e44e58ea986946f2e4615f2b2c0088a6845c8..2cf944eb796daf4ca455865d3dd22c83ee6d13ee 100644 (file)
@@ -1871,6 +1871,25 @@ xfs_alloc_compute_maxlevels(
        mp->m_ag_maxlevels = level;
 }
 
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag)
+{
+       xfs_extlen_t            need, delta = 0;
+
+       need = XFS_MIN_FREELIST_PAG(pag, mp);
+       if (need > pag->pagf_flcount)
+               delta = need - pag->pagf_flcount;
+
+       if (pag->pagf_longest > delta)
+               return pag->pagf_longest - delta;
+       return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
+}
+
 /*
  * Decide whether to use this allocation group for this allocation.
  * If so, fix up the btree freelist's size.
@@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist(
        }
 
        if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
-               need = XFS_MIN_FREELIST_PAG(pag, mp);
-               delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
                /*
                 * If it looks like there isn't a long enough extent, or enough
                 * total blocks, reject it.
                 */
-               longest = (pag->pagf_longest > delta) ?
-                       (pag->pagf_longest - delta) :
-                       (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
+               need = XFS_MIN_FREELIST_PAG(pag, mp);
+               longest = xfs_alloc_longest_free_extent(mp, pag);
                if ((args->minlen + args->alignment + args->minalignslop - 1) >
                                longest ||
                    ((int)(pag->pagf_freeblks + pag->pagf_flcount -
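
The new xfs_alloc_longest_free_extent() helper added above centralises a calculation that used to be open-coded in three places (xfs_alloc_fix_freelist here, xfs_bmap_btalloc and _xfs_filestream_pick_ag in later hunks): the longest free extent in an AG, reduced by however many blocks may still have to be pulled into the AG freelist; if that shortfall swallows the longest run, it falls back to 1 or 0 depending on whether the AG has any free space left at all. A standalone sketch of the same arithmetic, with the per-AG fields passed as plain parameters purely for illustration:

#include <stdio.h>

typedef unsigned int xfs_extlen_t;

/* Mirror of the helper's logic with the struct xfs_perag fields passed in directly. */
static xfs_extlen_t longest_free_extent(xfs_extlen_t need,     /* min freelist size */
					xfs_extlen_t flcount,   /* blocks already on the freelist */
					xfs_extlen_t longest)   /* longest free extent in the AG */
{
	xfs_extlen_t delta = 0;

	if (need > flcount)
		delta = need - flcount;

	if (longest > delta)
		return longest - delta;
	return flcount > 0 || longest > 0;
}

int main(void)
{
	/* Freelist wants 8 blocks, 3 are on it, longest run is 100 blocks:
	 * 5 blocks may be pulled out of the free space, so report 95. */
	printf("%u\n", longest_free_extent(8, 3, 100));	/* prints 95 */
	printf("%u\n", longest_free_extent(8, 3, 4));	/* shortfall >= longest: prints 1 */
	return 0;
}
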
index 588172796f7b92c61f696591bb5241f1d8f42ccc..e704caee10dfaa2bca339969ad840cfc233e9003 100644 (file)
@@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg {
 #define XFS_ALLOC_USERDATA             1       /* allocation is for user data*/
 #define XFS_ALLOC_INITIAL_USER_DATA    2       /* special case start of file */
 
+/*
+ * Find the length of the longest extent in an AG.
+ */
+xfs_extlen_t
+xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+               struct xfs_perag *pag);
 
 #ifdef __KERNEL__
 
index 6c323f8a4cd19cb8b169bcc17446dd62ffb4ea75..afdc8911637d20c3f7f01ff4eb50ca832e27ab28 100644 (file)
@@ -155,7 +155,8 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
                 * minimum offset only needs to be the space required for 
                 * the btree root.
                 */ 
-               if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > mp->m_attroffset)
+               if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+                   xfs_default_attroffset(dp))
                        dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
                break;
                
@@ -297,6 +298,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
        xfs_sbversion_add_attr2(mp, args->trans);
 }
 
+/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+STATIC void
+xfs_attr_fork_reset(
+       struct xfs_inode        *ip,
+       struct xfs_trans        *tp)
+{
+       xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+       ip->i_d.di_forkoff = 0;
+       ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+       ASSERT(ip->i_d.di_anextents == 0);
+       ASSERT(ip->i_afp == NULL);
+
+       ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
 /*
  * Remove an attribute from the shortform attribute list structure.
  */
@@ -344,22 +365,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
         */
        totsize -= size;
        if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
-                               !(args->op_flags & XFS_DA_OP_ADDNAME) &&
-                               (mp->m_flags & XFS_MOUNT_ATTR2) &&
-                               (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) {
-               /*
-                * Last attribute now removed, revert to original
-                * inode format making all literal area available
-                * to the data fork once more.
-                */
-               xfs_idestroy_fork(dp, XFS_ATTR_FORK);
-               dp->i_d.di_forkoff = 0;
-               dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-               ASSERT(dp->i_d.di_anextents == 0);
-               ASSERT(dp->i_afp == NULL);
-               dp->i_df.if_ext_max =
-                       XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-               xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+           (mp->m_flags & XFS_MOUNT_ATTR2) &&
+           (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+           !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+               xfs_attr_fork_reset(dp, args->trans);
        } else {
                xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
                dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -786,20 +795,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
        if (forkoff == -1) {
                ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
                ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
-
-               /*
-                * Last attribute was removed, revert to original
-                * inode format making all literal area available
-                * to the data fork once more.
-                */
-               xfs_idestroy_fork(dp, XFS_ATTR_FORK);
-               dp->i_d.di_forkoff = 0;
-               dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-               ASSERT(dp->i_d.di_anextents == 0);
-               ASSERT(dp->i_afp == NULL);
-               dp->i_df.if_ext_max =
-                       XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-               xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+               xfs_attr_fork_reset(dp, args->trans);
                goto out;
        }
 
index c852cd65aaea59bdcf251c84dc5412ebef1c3c74..3a6ed426327ac15575fe0cf5e39b0a120dc3a373 100644 (file)
@@ -2479,7 +2479,7 @@ xfs_bmap_adjacent(
        fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
        /*
         * If allocating at eof, and there's a previous real block,
-        * try to use it's last block as our starting point.
+        * try to use its last block as our starting point.
         */
        if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
            !isnullstartblock(ap->prevp->br_startblock) &&
@@ -2712,9 +2712,6 @@ xfs_bmap_btalloc(
        xfs_agnumber_t  startag;
        xfs_alloc_arg_t args;
        xfs_extlen_t    blen;
-       xfs_extlen_t    delta;
-       xfs_extlen_t    longest;
-       xfs_extlen_t    need;
        xfs_extlen_t    nextminlen = 0;
        xfs_perag_t     *pag;
        int             nullfb;         /* true if ap->firstblock isn't set */
@@ -2796,13 +2793,8 @@ xfs_bmap_btalloc(
                         * See xfs_alloc_fix_freelist...
                         */
                        if (pag->pagf_init) {
-                               need = XFS_MIN_FREELIST_PAG(pag, mp);
-                               delta = need > pag->pagf_flcount ?
-                                       need - pag->pagf_flcount : 0;
-                               longest = (pag->pagf_longest > delta) ?
-                                       (pag->pagf_longest - delta) :
-                                       (pag->pagf_flcount > 0 ||
-                                        pag->pagf_longest > 0);
+                               xfs_extlen_t    longest;
+                               longest = xfs_alloc_longest_free_extent(mp, pag);
                                if (blen < longest)
                                        blen = longest;
                        } else
@@ -3576,6 +3568,27 @@ xfs_bmap_extents_to_btree(
        return 0;
 }
 
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+       struct xfs_inode        *ip)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       uint                    offset;
+
+       if (mp->m_sb.sb_inodesize == 256) {
+               offset = XFS_LITINO(mp) -
+                               XFS_BMDR_SPACE_CALC(MINABTPTRS);
+       } else {
+               offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+       }
+
+       ASSERT(offset < XFS_LITINO(mp));
+       return offset;
+}
+
 /*
  * Helper routine to reset inode di_forkoff field when switching
  * attribute fork from local to extent format - we reset it where
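
The xfs_default_attroffset() helper added in the hunk above replaces the cached per-mount m_attroffset: on the smallest (256-byte) inodes the data fork keeps everything except the minimal space a bmap btree root needs at the tail, while on larger inodes the data fork is given room for six such roots and the attribute fork takes the remainder; in either case the offset must stay inside XFS_LITINO(mp), the literal area left after the on-disk inode header (see the xfs_dinode.h hunk later in this diff). The sketch below only mirrors the shape of that calculation; the sizes are made-up placeholders, not the real XFS constants:

#include <assert.h>
#include <stdio.h>

/* Illustrative only: these sizes are placeholders, not the real XFS values. */
#define LITINO(inodesize)	((inodesize) - 96)	/* literal area after the dinode header */
#define BMDR_SPACE_CALC(nptrs)	(4 + (nptrs) * 16)	/* rough btree-root footprint */
#define MINABTPTRS		2

static unsigned int default_attroffset(unsigned int inodesize)
{
	unsigned int offset;

	if (inodesize == 256)
		offset = LITINO(inodesize) - BMDR_SPACE_CALC(MINABTPTRS);
	else
		offset = BMDR_SPACE_CALC(6 * MINABTPTRS);

	assert(offset < LITINO(inodesize));
	return offset;
}

int main(void)
{
	printf("256-byte inodes: attr fork starts at byte %u of the literal area\n",
	       default_attroffset(256));
	printf("512-byte inodes: attr fork starts at byte %u of the literal area\n",
	       default_attroffset(512));
	return 0;
}
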
@@ -3588,15 +3601,18 @@ xfs_bmap_forkoff_reset(
        int             whichfork)
 {
        if (whichfork == XFS_ATTR_FORK &&
-           (ip->i_d.di_format != XFS_DINODE_FMT_DEV) &&
-           (ip->i_d.di_format != XFS_DINODE_FMT_UUID) &&
-           (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
-           ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) {
-               ip->i_d.di_forkoff = mp->m_attroffset >> 3;
-               ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) /
-                                       (uint)sizeof(xfs_bmbt_rec_t);
-               ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) /
-                                       (uint)sizeof(xfs_bmbt_rec_t);
+           ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+           ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+           ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+               uint    dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+               if (dfl_forkoff > ip->i_d.di_forkoff) {
+                       ip->i_d.di_forkoff = dfl_forkoff;
+                       ip->i_df.if_ext_max =
+                               XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
+                       ip->i_afp->if_ext_max =
+                               XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
+               }
        }
 }
 
@@ -4065,7 +4081,7 @@ xfs_bmap_add_attrfork(
        case XFS_DINODE_FMT_BTREE:
                ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
                if (!ip->i_d.di_forkoff)
-                       ip->i_d.di_forkoff = mp->m_attroffset >> 3;
+                       ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
                else if (mp->m_flags & XFS_MOUNT_ATTR2)
                        version = 2;
                break;
@@ -4212,12 +4228,12 @@ xfs_bmap_compute_maxlevels(
         * (a signed 16-bit number, xfs_aextnum_t).
         *
         * Note that we can no longer assume that if we are in ATTR1 that
-        * the fork offset of all the inodes will be (m_attroffset >> 3)
-        * because we could have mounted with ATTR2 and then mounted back
-        * with ATTR1, keeping the di_forkoff's fixed but probably at
-        * various positions. Therefore, for both ATTR1 and ATTR2
-        * we have to assume the worst case scenario of a minimum size
-        * available.
+        * the fork offset of all the inodes will be
+        * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+        * with ATTR2 and then mounted back with ATTR1, keeping the
+        * di_forkoff's fixed but probably at various positions. Therefore,
+        * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+        * of a minimum size available.
         */
        if (whichfork == XFS_DATA_FORK) {
                maxleafents = MAXEXTNUM;
@@ -4804,7 +4820,7 @@ xfs_bmapi(
        xfs_extlen_t    minlen;         /* min allocation size */
        xfs_mount_t     *mp;            /* xfs mount structure */
        int             n;              /* current extent index */
-       int             nallocs;        /* number of extents alloc\'d */
+       int             nallocs;        /* number of extents alloc'd */
        xfs_extnum_t    nextents;       /* number of extents in file */
        xfs_fileoff_t   obno;           /* old block number (offset) */
        xfs_bmbt_irec_t prev;           /* previous file extent record */
@@ -6204,7 +6220,7 @@ xfs_bmap_get_bp(
        return(bp);
 }
 
-void
+STATIC void
 xfs_check_block(
        struct xfs_btree_block  *block,
        xfs_mount_t             *mp,
@@ -6494,7 +6510,7 @@ xfs_bmap_count_tree(
        block = XFS_BUF_TO_BLOCK(bp);
 
        if (--level) {
-               /* Not at node above leafs, count this level of nodes */
+               /* Not at node above leaves, count this level of nodes */
                nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                while (nextbno != NULLFSBLOCK) {
                        if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
index be2979d88d326625a5df5fbe387dfa19c4a3ac8b..1b8ff9256bd0cba9515bc212c859a80c2335c157 100644 (file)
@@ -125,7 +125,7 @@ typedef struct xfs_bmalloca {
        struct xfs_bmbt_irec    *gotp;  /* extent after, or delayed */
        xfs_extlen_t            alen;   /* i/o length asked/allocated */
        xfs_extlen_t            total;  /* total blocks needed for xaction */
-       xfs_extlen_t            minlen; /* mininum allocation size (blocks) */
+       xfs_extlen_t            minlen; /* minimum allocation size (blocks) */
        xfs_extlen_t            minleft; /* amount must be left after alloc */
        char                    eof;    /* set if allocating past last extent */
        char                    wasdel; /* replacing a delayed allocation */
@@ -338,6 +338,10 @@ xfs_check_nostate_extents(
        xfs_extnum_t            idx,
        xfs_extnum_t            num);
 
+uint
+xfs_default_attroffset(
+       struct xfs_inode        *ip);
+
 #ifdef __KERNEL__
 
 /*
index e73c332eb23f92cce1c61fad98f6cc3fbc1870d1..e9df995748291047922ef29257e9423357d386db 100644 (file)
@@ -1883,7 +1883,7 @@ xfs_btree_lshift(
 
        /*
         * We add one entry to the left side and remove one for the right side.
-        * Accout for it here, the changes will be updated on disk and logged
+        * Account for it here, the changes will be updated on disk and logged
         * later.
         */
        lrecs++;
@@ -3535,7 +3535,7 @@ xfs_btree_delrec(
        XFS_BTREE_STATS_INC(cur, join);
 
        /*
-        * Fix up the the number of records and right block pointer in the
+        * Fix up the number of records and right block pointer in the
         * surviving block, and log it.
         */
        xfs_btree_set_numrecs(left, lrecs + rrecs);
index 789fffdf8b2f557bc1aa3ed3c51acc5081ee941b..4f852b735b961a37bb43f431937b52cbe1166212 100644 (file)
@@ -41,7 +41,7 @@ extern kmem_zone_t    *xfs_btree_cur_zone;
 /*
  * Generic btree header.
  *
- * This is a comination of the actual format used on disk for short and long
+ * This is a combination of the actual format used on disk for short and long
  * format btrees.  The first three fields are shared by both format, but
  * format btrees.  The first three fields are shared by both formats, but
  * the pointers are different and should be used with care.
  *
index c45f74ff1a5b980bfb51f6d30ae306f2db880be4..9ff6e57a50758f10ddc076dc7875dad2172963a3 100644 (file)
@@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
  * This is implemented with some source-level loop unrolling.
  */
 xfs_dahash_t
-xfs_da_hashname(const uchar_t *name, int namelen)
+xfs_da_hashname(const __uint8_t *name, int namelen)
 {
        xfs_dahash_t hash;
 
index 70b710c1792d33bfce9c94641b2e62016d48493e..8c536167bf754b030dcd333ed2c06c5a72c887a6 100644 (file)
@@ -91,9 +91,9 @@ enum xfs_dacmp {
  * Structure to ease passing around component names.
  */
 typedef struct xfs_da_args {
-       const uchar_t   *name;          /* string (maybe not NULL terminated) */
+       const __uint8_t *name;          /* string (maybe not NULL terminated) */
        int             namelen;        /* length of string (maybe no NULL) */
-       uchar_t         *value;         /* set of bytes (maybe contain NULLs) */
+       __uint8_t       *value;         /* set of bytes (maybe contain NULLs) */
        int             valuelen;       /* length of value */
        int             flags;          /* argument flags (eg: ATTR_NOCREATE) */
        xfs_dahash_t    hashval;        /* hash value of name */
@@ -185,7 +185,7 @@ typedef struct xfs_da_state {
        unsigned char           inleaf;         /* insert into 1->lf, 0->splf */
        unsigned char           extravalid;     /* T/F: extrablk is in use */
        unsigned char           extraafter;     /* T/F: extrablk is after new */
-       xfs_da_state_blk_t      extrablk;       /* for double-splits on leafs */
+       xfs_da_state_blk_t      extrablk;       /* for double-splits on leaves */
                                                /* for dirv2 extrablk is data */
 } xfs_da_state_t;
 
@@ -251,7 +251,7 @@ xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 int    xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
                                          xfs_dabuf_t *dead_buf);
 
-uint xfs_da_hashname(const uchar_t *name_string, int name_length);
+uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
 enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
                                const char *name, int len);
 
@@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
 
 extern struct kmem_zone *xfs_da_state_zone;
 extern struct kmem_zone *xfs_dabuf_zone;
+extern const struct xfs_nameops xfs_default_nameops;
 
 #endif /* __XFS_DA_BTREE_H__ */
index f8278cfcc1d328a1a9542663d448607de1c5f561..e6d839bddbf008b3bc522720e5f5e0711a954e10 100644 (file)
@@ -79,6 +79,12 @@ xfs_swapext(
                goto out_put_target_file;
        }
 
+       if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
+           IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
+               error = XFS_ERROR(EINVAL);
+               goto out_put_target_file;
+       }
+
        ip = XFS_I(file->f_path.dentry->d_inode);
        tip = XFS_I(target_file->f_path.dentry->d_inode);
 
@@ -118,19 +124,17 @@ xfs_swap_extents(
        xfs_bstat_t     *sbp = &sxp->sx_stat;
        xfs_ifork_t     *tempifp, *ifp, *tifp;
        int             ilf_fields, tilf_fields;
-       static uint     lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
        int             error = 0;
        int             aforkblks = 0;
        int             taforkblks = 0;
        __uint64_t      tmp;
-       char            locked = 0;
 
        mp = ip->i_mount;
 
        tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
        if (!tempifp) {
                error = XFS_ERROR(ENOMEM);
-               goto error0;
+               goto out;
        }
 
        sbp = &sxp->sx_stat;
@@ -143,25 +147,24 @@ xfs_swap_extents(
         */
        xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
        xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-       locked = 1;
 
        /* Verify that both files have the same format */
        if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
                error = XFS_ERROR(EINVAL);
-               goto error0;
+               goto out_unlock;
        }
 
        /* Verify both files are either real-time or non-realtime */
        if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
                error = XFS_ERROR(EINVAL);
-               goto error0;
+               goto out_unlock;
        }
 
        /* Should never get a local format */
        if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
            tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
                error = XFS_ERROR(EINVAL);
-               goto error0;
+               goto out_unlock;
        }
 
        if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -169,13 +172,13 @@ xfs_swap_extents(
                error = xfs_flushinval_pages(tip, 0, -1,
                                FI_REMAPF_LOCKED);
                if (error)
-                       goto error0;
+                       goto out_unlock;
        }
 
        /* Verify O_DIRECT for ftmp */
        if (VN_CACHED(VFS_I(tip)) != 0) {
                error = XFS_ERROR(EINVAL);
-               goto error0;
+               goto out_unlock;
        }
 
        /* Verify all data are being swapped */
@@ -183,7 +186,7 @@ xfs_swap_extents(
            sxp->sx_length != ip->i_d.di_size ||
            sxp->sx_length != tip->i_d.di_size) {
                error = XFS_ERROR(EFAULT);
-               goto error0;
+               goto out_unlock;
        }
 
        /*
@@ -193,7 +196,7 @@ xfs_swap_extents(
         */
        if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
                error = XFS_ERROR(EINVAL);
-               goto error0;
+               goto out_unlock;
        }
 
        /*
@@ -208,7 +211,7 @@ xfs_swap_extents(
            (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
            (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
                error = XFS_ERROR(EBUSY);
-               goto error0;
+               goto out_unlock;
        }
 
        /* We need to fail if the file is memory mapped.  Once we have tossed
@@ -219,7 +222,7 @@ xfs_swap_extents(
         */
        if (VN_MAPPED(VFS_I(ip))) {
                error = XFS_ERROR(EBUSY);
-               goto error0;
+               goto out_unlock;
        }
 
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -242,8 +245,7 @@ xfs_swap_extents(
                xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
                xfs_iunlock(tip, XFS_IOLOCK_EXCL);
                xfs_trans_cancel(tp, 0);
-               locked = 0;
-               goto error0;
+               goto out;
        }
        xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 
@@ -253,19 +255,15 @@ xfs_swap_extents(
        if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
             (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
                error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       goto error0;
-               }
+               if (error)
+                       goto out_trans_cancel;
        }
        if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
             (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
                error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
                        &taforkblks);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       goto error0;
-               }
+               if (error)
+                       goto out_trans_cancel;
        }
 
        /*
@@ -332,10 +330,10 @@ xfs_swap_extents(
 
 
        IHOLD(ip);
-       xfs_trans_ijoin(tp, ip, lock_flags);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
        IHOLD(tip);
-       xfs_trans_ijoin(tp, tip, lock_flags);
+       xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
        xfs_trans_log_inode(tp, ip,  ilf_fields);
        xfs_trans_log_inode(tp, tip, tilf_fields);
@@ -344,19 +342,19 @@ xfs_swap_extents(
         * If this is a synchronous mount, make sure that the
         * transaction goes to disk before returning to the user.
         */
-       if (mp->m_flags & XFS_MOUNT_WSYNC) {
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
                xfs_trans_set_sync(tp);
-       }
 
        error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
-       locked = 0;
 
- error0:
-       if (locked) {
-               xfs_iunlock(ip,  lock_flags);
-               xfs_iunlock(tip, lock_flags);
-       }
-       if (tempifp != NULL)
-               kmem_free(tempifp);
+out_unlock:
+       xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out:
+       kmem_free(tempifp);
        return error;
+
+out_trans_cancel:
+       xfs_trans_cancel(tp, 0);
+       goto out_unlock;
 }
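
The bulk of this file's diff is an error-path cleanup in xfs_swap_extents(): the 'locked' flag and the static lock_flags variable are gone, and failures now jump to stacked exit labels (out_trans_cancel, out_unlock, out) that undo exactly what had been set up by that point, with the success path falling through the same labels. A minimal, self-contained sketch of that unwind idiom (hypothetical resources, not the XFS code itself):

#include <stdlib.h>

/* Hypothetical helper, just to give the sketch something that can fail. */
static int do_work(void *buf)
{
	return buf ? 0 : -1;
}

static int swap_like_operation(void)
{
	int error;
	void *tmp;

	tmp = malloc(64);
	if (!tmp) {
		error = -1;
		goto out;		/* nothing else to undo yet */
	}

	error = do_work(tmp);
	if (error)
		goto out_free;		/* undo only the allocation */

	/* ... main path; success also falls through the labels below ... */

out_free:
	free(tmp);
out:
	return error;
}

int main(void)
{
	return swap_like_operation() ? 1 : 0;
}
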
index 162e8726df5e65b9666cee344aadda997909d878..e5b153b2e6a34b527ec65ec57e0b60d9ce9cb073 100644 (file)
@@ -103,7 +103,9 @@ typedef enum xfs_dinode_fmt {
 /*
  * Inode size for given fs.
  */
-#define        XFS_LITINO(mp)  ((mp)->m_litino)
+#define XFS_LITINO(mp) \
+       ((int)(((mp)->m_sb.sb_inodesize) - sizeof(struct xfs_dinode)))
+
 #define        XFS_BROOT_SIZE_ADJ      \
        (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
 
index 1afb12278b8d04b85eb42d0f67d0bd21936885be..c657bec6d9513849f2ad86dca172e88417742f14 100644 (file)
@@ -46,8 +46,6 @@
 
 struct xfs_name xfs_name_dotdot = {"..", 2};
 
-extern const struct xfs_nameops xfs_default_nameops;
-
 /*
  * ASCII case-insensitive (ie. A-Z) support for directories that was
  * used in IRIX.
index e1f0a06aaf042c945e02f07b872190414122a2b7..ab52e9e1c1eedd9c115161a4f786bf37d5bcff66 100644 (file)
@@ -448,7 +448,6 @@ xfs_dir2_block_getdents(
        xfs_mount_t             *mp;            /* filesystem mount point */
        char                    *ptr;           /* current data entry */
        int                     wantoff;        /* starting block offset */
-       xfs_ino_t               ino;
        xfs_off_t               cook;
 
        mp = dp->i_mount;
@@ -509,16 +508,12 @@ xfs_dir2_block_getdents(
 
                cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
                                            (char *)dep - (char *)block);
-               ino = be64_to_cpu(dep->inumber);
-#if XFS_BIG_INUMS
-               ino += mp->m_inoadd;
-#endif
 
                /*
                 * If it didn't fit, set the final offset to here & return.
                 */
                if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
-                           ino, DT_UNKNOWN)) {
+                           be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
                        *offset = cook & 0x7fffffff;
                        xfs_da_brelse(NULL, bp);
                        return 0;
index b816e025273916ff02867b033acb0eeaa54a6751..efbc290c7fec1bee9a3914a0fb56d52acf32c41a 100644 (file)
@@ -38,7 +38,7 @@ struct xfs_trans;
 
 /*
  * Directory address space divided into sections,
- * spaces separated by 32gb.
+ * spaces separated by 32GB.
  */
 #define        XFS_DIR2_SPACE_SIZE     (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
 #define        XFS_DIR2_DATA_SPACE     0
index ef805a374eec174c21c77317da7c743ad47b21bf..fa913e4594421adefc392e80c254d492b8a9ec91 100644 (file)
@@ -549,7 +549,7 @@ xfs_dir2_leaf_addname(
  * Check the internal consistency of a leaf1 block.
  * Pop an assert if something is wrong.
  */
-void
+STATIC void
 xfs_dir2_leaf_check(
        xfs_inode_t             *dp,            /* incore directory inode */
        xfs_dabuf_t             *bp)            /* leaf's buffer */
@@ -780,7 +780,6 @@ xfs_dir2_leaf_getdents(
        int                     ra_index;       /* *map index for read-ahead */
        int                     ra_offset;      /* map entry offset for ra */
        int                     ra_want;        /* readahead count wanted */
-       xfs_ino_t               ino;
 
        /*
         * If the offset is at or past the largest allowed value,
@@ -1076,24 +1075,12 @@ xfs_dir2_leaf_getdents(
                        continue;
                }
 
-               /*
-                * Copy the entry into the putargs, and try formatting it.
-                */
                dep = (xfs_dir2_data_entry_t *)ptr;
-
                length = xfs_dir2_data_entsize(dep->namelen);
 
-               ino = be64_to_cpu(dep->inumber);
-#if XFS_BIG_INUMS
-               ino += mp->m_inoadd;
-#endif
-
-               /*
-                * Won't fit.  Return to caller.
-                */
                if (filldir(dirent, dep->name, dep->namelen,
                            xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
-                           ino, DT_UNKNOWN))
+                           be64_to_cpu(dep->inumber), DT_UNKNOWN))
                        break;
 
                /*
index fa6c3a5ddbc65fba6fa05dca30cdc0fd3664b567..5a81ccd1045b11ac762fc0978d6b4043c6d451b6 100644 (file)
@@ -1104,7 +1104,7 @@ xfs_dir2_leafn_remove(
        }
        xfs_dir2_leafn_check(dp, bp);
        /*
-        * Return indication of whether this leaf block is emtpy enough
+        * Return indication of whether this leaf block is empty enough
         * to justify trying to join it with a neighbor.
         */
        *rval =
index a8a8a6efad5b8142dad0689988a35597cbf32554..e89734e8464610a67797cc473951d91d1ce2fb90 100644 (file)
@@ -748,11 +748,7 @@ xfs_dir2_sf_getdents(
         * Put . entry unless we're starting past it.
         */
        if (*offset <= dot_offset) {
-               ino = dp->i_ino;
-#if XFS_BIG_INUMS
-               ino += mp->m_inoadd;
-#endif
-               if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
+               if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) {
                        *offset = dot_offset & 0x7fffffff;
                        return 0;
                }
@@ -763,9 +759,6 @@ xfs_dir2_sf_getdents(
         */
        if (*offset <= dotdot_offset) {
                ino = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
-#if XFS_BIG_INUMS
-               ino += mp->m_inoadd;
-#endif
                if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
                        *offset = dotdot_offset & 0x7fffffff;
                        return 0;
@@ -786,10 +779,6 @@ xfs_dir2_sf_getdents(
                }
 
                ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
-#if XFS_BIG_INUMS
-               ino += mp->m_inoadd;
-#endif
-
                if (filldir(dirent, sfep->name, sfep->namelen,
                            off & 0x7fffffff, ino, DT_UNKNOWN)) {
                        *offset = off & 0x7fffffff;
index 2f049f63e85f73ea3ae1763c5eb66e6798368635..0d22c56fdf64247f6f208ef0ed2f2a72a7b64e22 100644 (file)
@@ -33,12 +33,10 @@ typedef struct xfs_extent {
  * conversion routine.
  */
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_extent_32 {
        __uint64_t      ext_start;
        __uint32_t      ext_len;
 } __attribute__((packed)) xfs_extent_32_t;
-#endif
 
 typedef struct xfs_extent_64 {
        __uint64_t      ext_start;
@@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format {
        xfs_extent_t            efi_extents[1]; /* array of extents to free */
 } xfs_efi_log_format_t;
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_efi_log_format_32 {
        __uint16_t              efi_type;       /* efi log item type */
        __uint16_t              efi_size;       /* size of this item */
@@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 {
        __uint64_t              efi_id;         /* efi identifier */
        xfs_extent_32_t         efi_extents[1]; /* array of extents to free */
 } __attribute__((packed)) xfs_efi_log_format_32_t;
-#endif
 
 typedef struct xfs_efi_log_format_64 {
        __uint16_t              efi_type;       /* efi log item type */
@@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format {
        xfs_extent_t            efd_extents[1]; /* array of extents freed */
 } xfs_efd_log_format_t;
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_efd_log_format_32 {
        __uint16_t              efd_type;       /* efd log item type */
        __uint16_t              efd_size;       /* size of this item */
@@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 {
        __uint64_t              efd_efi_id;     /* id of corresponding efi */
        xfs_extent_32_t         efd_extents[1]; /* array of extents freed */
 } __attribute__((packed)) xfs_efd_log_format_32_t;
-#endif
 
 typedef struct xfs_efd_log_format_64 {
        __uint16_t              efd_type;       /* efd log item type */
index f3bb75da384e0919d30b6ba4722875ffe5736da3..6c87c8f304efb8f7d887c7c41a667e4c85105882 100644 (file)
@@ -140,7 +140,7 @@ _xfs_filestream_pick_ag(
        xfs_extlen_t    minlen)
 {
        int             err, trylock, nscan;
-       xfs_extlen_t    delta, longest, need, free, minfree, maxfree = 0;
+       xfs_extlen_t    longest, free, minfree, maxfree = 0;
        xfs_agnumber_t  ag, max_ag = NULLAGNUMBER;
        struct xfs_perag *pag;
 
@@ -186,12 +186,7 @@ _xfs_filestream_pick_ag(
                        goto next_ag;
                }
 
-               need = XFS_MIN_FREELIST_PAG(pag, mp);
-               delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
-               longest = (pag->pagf_longest > delta) ?
-                         (pag->pagf_longest - delta) :
-                         (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
-
+               longest = xfs_alloc_longest_free_extent(mp, pag);
                if (((minlen && longest >= minlen) ||
                     (!minlen && pag->pagf_freeblks >= minfree)) &&
                    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
index 680d0e0ec93298576f46686f219098c242e152d8..8379e3bca26cb08d865a8c4cda3c7f6f1c48c276 100644 (file)
@@ -576,7 +576,7 @@ out:
        if (fdblks_delta) {
                /*
                 * If we are putting blocks back here, m_resblks_avail is
-                * already at it's max so this will put it in the free pool.
+                * already at its max so this will put it in the free pool.
                 *
                 * If we need space, we'll either succeed in getting it
                 * from the free block count or we'll get an enospc. If
index ab016e5ae7be1567e6782bb17725e8b52c501ed8..3120a3a5e20f90a07186e9258d3767f315678be5 100644 (file)
@@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc(
                args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
 
                /* Allow space for the inode btree to split. */
-               args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+               args.minleft = args.mp->m_in_maxlevels - 1;
                if ((error = xfs_alloc_vextent(&args)))
                        return error;
        } else
@@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc(
                /*
                 * Allow space for the inode btree to split.
                 */
-               args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+               args.minleft = args.mp->m_in_maxlevels - 1;
                if ((error = xfs_alloc_vextent(&args)))
                        return error;
        }
@@ -349,7 +349,7 @@ xfs_ialloc_ag_alloc(
                 * Initialize all inodes in this buffer and then log them.
                 *
                 * XXX: It would be much better if we had just one transaction to
-                *      log a whole cluster of inodes instead of all the indivdual
+                *      log a whole cluster of inodes instead of all the individual
                 *      transactions causing a lot of log traffic.
                 */
                xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
@@ -943,7 +943,7 @@ nextag:
        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
                                   XFS_INODES_PER_CHUNK) == 0);
        ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
-       XFS_INOBT_CLR_FREE(&rec, offset);
+       rec.ir_free &= ~XFS_INOBT_MASK(offset);
        rec.ir_freecount--;
        if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
                        rec.ir_free)))
@@ -1105,11 +1105,11 @@ xfs_difree(
         */
        off = agino - rec.ir_startino;
        ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
-       ASSERT(!XFS_INOBT_IS_FREE(&rec, off));
+       ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
        /*
         * Mark the inode free & increment the count.
         */
-       XFS_INOBT_SET_FREE(&rec, off);
+       rec.ir_free |= XFS_INOBT_MASK(off);
        rec.ir_freecount++;
 
        /*
index 99f2408e8d8e634ac446914a505982bba3d48deb..c282a9af5393da36022e471332dc45d37705bcbb 100644 (file)
@@ -164,7 +164,7 @@ xfs_inobt_init_rec_from_cur(
 }
 
 /*
- * intial value of ptr for lookup
+ * initial value of ptr for lookup
  */
 STATIC void
 xfs_inobt_init_ptr_from_cur(
index 5580e255ff06637e6b979f6671c21f3dd90d8029..f782ad0c4769483ae23ab25dade35ee737ea4c63 100644 (file)
@@ -32,14 +32,14 @@ struct xfs_mount;
 #define        XFS_IBT_MAGIC   0x49414254      /* 'IABT' */
 
 typedef        __uint64_t      xfs_inofree_t;
-#define        XFS_INODES_PER_CHUNK    (NBBY * sizeof(xfs_inofree_t))
+#define        XFS_INODES_PER_CHUNK            (NBBY * sizeof(xfs_inofree_t))
 #define        XFS_INODES_PER_CHUNK_LOG        (XFS_NBBYLOG + 3)
-#define        XFS_INOBT_ALL_FREE      ((xfs_inofree_t)-1)
+#define        XFS_INOBT_ALL_FREE              ((xfs_inofree_t)-1)
+#define        XFS_INOBT_MASK(i)               ((xfs_inofree_t)1 << (i))
 
 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 {
-       return (((n) >= XFS_INODES_PER_CHUNK ? \
-               (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i);
+       return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
 }
 
 /*
@@ -68,20 +68,6 @@ typedef struct xfs_inobt_key {
 /* btree pointer type */
 typedef __be32 xfs_inobt_ptr_t;
 
-/*
- * Bit manipulations for ir_free.
- */
-#define        XFS_INOBT_MASK(i)               ((xfs_inofree_t)1 << (i))
-#define        XFS_INOBT_IS_FREE(rp,i)         \
-               (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
-#define        XFS_INOBT_SET_FREE(rp,i)        ((rp)->ir_free |= XFS_INOBT_MASK(i))
-#define        XFS_INOBT_CLR_FREE(rp,i)        ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
-
-/*
- * Maximum number of inode btree levels.
- */
-#define        XFS_IN_MAXLEVELS(mp)            ((mp)->m_in_maxlevels)
-
 /*
  * block numbers in the AG.
  */
index 1f175fa34b225df1da087a6fceadb201205fcbe8..f879c1bc4b96deba7ec1c42fb8bc9fcfd0109a1d 100644 (file)
@@ -122,7 +122,7 @@ typedef struct xfs_ictimestamp {
 
 /*
  * NOTE:  This structure must be kept identical to struct xfs_dinode
- *       in xfs_dinode.h except for the endianess annotations.
+ *       in xfs_dinode.h except for the endianness annotations.
  */
 typedef struct xfs_icdinode {
        __uint16_t      di_magic;       /* inode magic # = XFS_DINODE_MAGIC */
index 9957d0602d549d4688e0087d8fd4223cadaae402..a52ac125f0556a61c81fd2f375056b735740113f 100644 (file)
@@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format {
        __int32_t               ilf_boffset;    /* off of inode in buffer */
 } xfs_inode_log_format_t;
 
-#ifndef HAVE_FORMAT32
 typedef struct xfs_inode_log_format_32 {
        __uint16_t              ilf_type;       /* inode log item type */
        __uint16_t              ilf_size;       /* size of this item */
@@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 {
        __int32_t               ilf_len;        /* len of inode buffer */
        __int32_t               ilf_boffset;    /* off of inode in buffer */
 } __attribute__((packed)) xfs_inode_log_format_32_t;
-#endif
 
 typedef struct xfs_inode_log_format_64 {
        __uint16_t              ilf_type;       /* inode log item type */
index ee1a0c134cc274479a8da663bca33f1e9745d38f..a1cc1322fc0f2123a1526dc00391689348488643 100644 (file)
@@ -63,7 +63,7 @@ typedef enum {
  */
 
 typedef struct xfs_iomap {
-       xfs_daddr_t             iomap_bn;       /* first 512b blk of mapping */
+       xfs_daddr_t             iomap_bn;       /* first 512B blk of mapping */
        xfs_buftarg_t           *iomap_target;
        xfs_off_t               iomap_offset;   /* offset of mapping, bytes */
        xfs_off_t               iomap_bsize;    /* size of mapping, bytes */
index cf98a805ec90308651303af993572dd75c0827a0..aeb2d2221c7ddb15b8e4efdae36ee94f40b7fbe3 100644 (file)
@@ -83,7 +83,12 @@ xfs_bulkstat_one_iget(
        buf->bs_uid = dic->di_uid;
        buf->bs_gid = dic->di_gid;
        buf->bs_size = dic->di_size;
-       vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
+       /*
+        * We are reading the atime from the Linux inode because the
+        * dinode might not be uptodate.
+        */
+       buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
+       buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
        buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
        buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
        buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -579,7 +584,7 @@ xfs_bulkstat(
                                 * first inode of the cluster.
                                 *
                                 * Careful with clustidx.   There can be
-                                * multple clusters per chunk, a single
+                                * multiple clusters per chunk, a single
                                 * cluster per chunk or a cluster that has
                                 * inodes represented from several different
                                 * chunks (if blocksize is large).
index f4726f702a9ea51e84ba08622c7f322dffab3a7c..f76c6d7cea21f9539b9ad21562d943c39c937fc7 100644 (file)
@@ -574,7 +574,7 @@ xfs_log_mount(
        error = xfs_trans_ail_init(mp);
        if (error) {
                cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
-               goto error;
+               goto out_free_log;
        }
        mp->m_log->l_ailp = mp->m_ail;
 
@@ -594,20 +594,22 @@ xfs_log_mount(
                        mp->m_flags |= XFS_MOUNT_RDONLY;
                if (error) {
                        cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
-                       goto error;
+                       goto out_destroy_ail;
                }
        }
 
        /* Normal transactions can now occur */
        mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
 
-       /* End mounting message in xfs_log_mount_finish */
        return 0;
-error:
-       xfs_log_unmount_dealloc(mp);
+
+out_destroy_ail:
+       xfs_trans_ail_destroy(mp);
+out_free_log:
+       xlog_dealloc_log(mp->m_log);
 out:
        return error;
-}      /* xfs_log_mount */
+}
 
 /*
  * Finish the recovery of the file system.  This is separate from
@@ -632,19 +634,6 @@ xfs_log_mount_finish(xfs_mount_t *mp)
        return error;
 }
 
-/*
- * Unmount processing for the log.
- */
-int
-xfs_log_unmount(xfs_mount_t *mp)
-{
-       int             error;
-
-       error = xfs_log_unmount_write(mp);
-       xfs_log_unmount_dealloc(mp);
-       return error;
-}
-
 /*
  * Final log writes as part of unmount.
  *
@@ -795,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
  * and deallocate the log as the aild references the log.
  */
 void
-xfs_log_unmount_dealloc(xfs_mount_t *mp)
+xfs_log_unmount(xfs_mount_t *mp)
 {
        xfs_trans_ail_destroy(mp);
        xlog_dealloc_log(mp->m_log);
@@ -1109,7 +1098,7 @@ xlog_bdstrat_cb(struct xfs_buf *bp)
 /*
  * Return size of each in-core log record buffer.
  *
- * All machines get 8 x 32KB buffers by default, unless tuned otherwise.
+ * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
  *
  * If the filesystem blocksize is too large, we may need to choose a
  * larger size since the directory code currently logs entire blocks.
@@ -1139,8 +1128,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t    *mp,
                }
 
                if (xfs_sb_version_haslogv2(&mp->m_sb)) {
-                       /* # headers = size / 32K
-                        * one header holds cycles from 32K of data
+                       /* # headers = size / 32k
+                        * one header holds cycles from 32k of data
                         */
 
                        xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
@@ -1156,7 +1145,7 @@ xlog_get_iclog_buffer_size(xfs_mount_t    *mp,
                goto done;
        }
 
-       /* All machines use 32KB buffers by default. */
+       /* All machines use 32kB buffers by default. */
        log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
        log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
 
@@ -1164,32 +1153,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t   *mp,
        log->l_iclog_hsize = BBSIZE;
        log->l_iclog_heads = 1;
 
-       /*
-        * For 16KB, we use 3 32KB buffers.  For 32KB block sizes, we use
-        * 4 32KB buffers.  For 64KB block sizes, we use 8 32KB buffers.
-        */
-       if (mp->m_sb.sb_blocksize >= 16*1024) {
-               log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
-               log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
-               if (mp->m_logbufs <= 0) {
-                       switch (mp->m_sb.sb_blocksize) {
-                           case 16*1024:                       /* 16 KB */
-                               log->l_iclog_bufs = 3;
-                               break;
-                           case 32*1024:                       /* 32 KB */
-                               log->l_iclog_bufs = 4;
-                               break;
-                           case 64*1024:                       /* 64 KB */
-                               log->l_iclog_bufs = 8;
-                               break;
-                           default:
-                               xlog_panic("XFS: Invalid blocksize");
-                               break;
-                       }
-               }
-       }
-
-done:  /* are we being asked to make the sizes selected above visible? */
+done:
+       /* are we being asked to make the sizes selected above visible? */
        if (mp->m_logbufs == 0)
                mp->m_logbufs = log->l_iclog_bufs;
        if (mp->m_logbsize == 0)
@@ -3214,7 +3179,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
  */
 
 /*
- * Free a used ticket when it's refcount falls to zero.
+ * Free a used ticket when its refcount falls to zero.
  */
 void
 xfs_log_ticket_put(
index 8a3e84e900a34e5153a453eae896bb795576a57b..d0c9baa50b1adec129a74586558595672577c536 100644 (file)
@@ -170,9 +170,8 @@ int   xfs_log_write(struct xfs_mount *mp,
                        int              nentries,
                        xfs_log_ticket_t ticket,
                        xfs_lsn_t        *start_lsn);
-int      xfs_log_unmount(struct xfs_mount *mp);
 int      xfs_log_unmount_write(struct xfs_mount *mp);
-void      xfs_log_unmount_dealloc(struct xfs_mount *mp);
+void      xfs_log_unmount(struct xfs_mount *mp);
 int      xfs_log_force_umount(struct xfs_mount *mp, int logerror);
 int      xfs_log_need_covered(struct xfs_mount *mp);
 
index 654167be0efb6c29aa4288f6ee44bd4627753cef..bcad5f4c1fd1b3f0c31fc7c0bcb848c12aaa21ee 100644 (file)
@@ -359,7 +359,7 @@ typedef struct xlog_in_core {
        int                     ic_size;
        int                     ic_offset;
        int                     ic_bwritecnt;
-       ushort_t                ic_state;
+       unsigned short          ic_state;
        char                    *ic_datap;      /* pointer to iclog data */
 #ifdef XFS_LOG_TRACE
        struct ktrace           *ic_trace;
@@ -455,7 +455,6 @@ extern void  xlog_recover_process_iunlinks(xlog_t *log);
 
 extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void     xlog_put_bp(struct xfs_buf *);
-extern int      xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
 
 extern kmem_zone_t     *xfs_log_ticket_zone;
 
index 61af610d79b395248aeb10b0dac0225d15c388af..7ba450116d4ffc97f247c99a8a89f190bf5a89f2 100644 (file)
@@ -94,12 +94,30 @@ xlog_put_bp(
        xfs_buf_free(bp);
 }
 
+STATIC xfs_caddr_t
+xlog_align(
+       xlog_t          *log,
+       xfs_daddr_t     blk_no,
+       int             nbblks,
+       xfs_buf_t       *bp)
+{
+       xfs_caddr_t     ptr;
+
+       if (!log->l_sectbb_log)
+               return XFS_BUF_PTR(bp);
+
+       ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
+       ASSERT(XFS_BUF_SIZE(bp) >=
+               BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
+       return ptr;
+}
+
 
 /*
  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
  */
-int
-xlog_bread(
+STATIC int
+xlog_bread_noalign(
        xlog_t          *log,
        xfs_daddr_t     blk_no,
        int             nbblks,
@@ -137,6 +155,24 @@ xlog_bread(
        return error;
 }
 
+STATIC int
+xlog_bread(
+       xlog_t          *log,
+       xfs_daddr_t     blk_no,
+       int             nbblks,
+       xfs_buf_t       *bp,
+       xfs_caddr_t     *offset)
+{
+       int             error;
+
+       error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+       if (error)
+               return error;
+
+       *offset = xlog_align(log, blk_no, nbblks, bp);
+       return 0;
+}
+
 /*
  * Write out the buffer at the given block for the given number of blocks.
  * The buffer is kept locked across the write and is returned locked.
@@ -180,24 +216,6 @@ xlog_bwrite(
        return error;
 }
 
-STATIC xfs_caddr_t
-xlog_align(
-       xlog_t          *log,
-       xfs_daddr_t     blk_no,
-       int             nbblks,
-       xfs_buf_t       *bp)
-{
-       xfs_caddr_t     ptr;
-
-       if (!log->l_sectbb_log)
-               return XFS_BUF_PTR(bp);
-
-       ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
-       ASSERT(XFS_BUF_SIZE(bp) >=
-               BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
-       return ptr;
-}
-
 #ifdef DEBUG
 /*
  * dump debug superblock and log record information
@@ -211,11 +229,11 @@ xlog_header_check_dump(
 
        cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __func__);
        for (b = 0; b < 16; b++)
-               cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
+               cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]);
        cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
        cmn_err(CE_DEBUG, "    log : uuid = ");
        for (b = 0; b < 16; b++)
-               cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
+               cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]);
        cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
 }
 #else
@@ -321,9 +339,9 @@ xlog_find_cycle_start(
 
        mid_blk = BLK_AVG(first_blk, *last_blk);
        while (mid_blk != first_blk && mid_blk != *last_blk) {
-               if ((error = xlog_bread(log, mid_blk, 1, bp)))
+               error = xlog_bread(log, mid_blk, 1, bp, &offset);
+               if (error)
                        return error;
-               offset = xlog_align(log, mid_blk, 1, bp);
                mid_cycle = xlog_get_cycle(offset);
                if (mid_cycle == cycle) {
                        *last_blk = mid_blk;
@@ -379,10 +397,10 @@ xlog_find_verify_cycle(
 
                bcount = min(bufblks, (start_blk + nbblks - i));
 
-               if ((error = xlog_bread(log, i, bcount, bp)))
+               error = xlog_bread(log, i, bcount, bp, &buf);
+               if (error)
                        goto out;
 
-               buf = xlog_align(log, i, bcount, bp);
                for (j = 0; j < bcount; j++) {
                        cycle = xlog_get_cycle(buf);
                        if (cycle == stop_on_cycle_no) {
@@ -436,9 +454,9 @@ xlog_find_verify_log_record(
                        return ENOMEM;
                smallmem = 1;
        } else {
-               if ((error = xlog_bread(log, start_blk, num_blks, bp)))
+               error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+               if (error)
                        goto out;
-               offset = xlog_align(log, start_blk, num_blks, bp);
                offset += ((num_blks - 1) << BBSHIFT);
        }
 
@@ -453,9 +471,9 @@ xlog_find_verify_log_record(
                }
 
                if (smallmem) {
-                       if ((error = xlog_bread(log, i, 1, bp)))
+                       error = xlog_bread(log, i, 1, bp, &offset);
+                       if (error)
                                goto out;
-                       offset = xlog_align(log, i, 1, bp);
                }
 
                head = (xlog_rec_header_t *)offset;
@@ -559,15 +577,18 @@ xlog_find_head(
        bp = xlog_get_bp(log, 1);
        if (!bp)
                return ENOMEM;
-       if ((error = xlog_bread(log, 0, 1, bp)))
+
+       error = xlog_bread(log, 0, 1, bp, &offset);
+       if (error)
                goto bp_err;
-       offset = xlog_align(log, 0, 1, bp);
+
        first_half_cycle = xlog_get_cycle(offset);
 
        last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
-       if ((error = xlog_bread(log, last_blk, 1, bp)))
+       error = xlog_bread(log, last_blk, 1, bp, &offset);
+       if (error)
                goto bp_err;
-       offset = xlog_align(log, last_blk, 1, bp);
+
        last_half_cycle = xlog_get_cycle(offset);
        ASSERT(last_half_cycle != 0);
 
@@ -817,9 +838,10 @@ xlog_find_tail(
        if (!bp)
                return ENOMEM;
        if (*head_blk == 0) {                           /* special case */
-               if ((error = xlog_bread(log, 0, 1, bp)))
+               error = xlog_bread(log, 0, 1, bp, &offset);
+               if (error)
                        goto bread_err;
-               offset = xlog_align(log, 0, 1, bp);
+
                if (xlog_get_cycle(offset) == 0) {
                        *tail_blk = 0;
                        /* leave all other log inited values alone */
@@ -832,9 +854,10 @@ xlog_find_tail(
         */
        ASSERT(*head_blk < INT_MAX);
        for (i = (int)(*head_blk) - 1; i >= 0; i--) {
-               if ((error = xlog_bread(log, i, 1, bp)))
+               error = xlog_bread(log, i, 1, bp, &offset);
+               if (error)
                        goto bread_err;
-               offset = xlog_align(log, i, 1, bp);
+
                if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
                        found = 1;
                        break;
@@ -848,9 +871,10 @@ xlog_find_tail(
         */
        if (!found) {
                for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
-                       if ((error = xlog_bread(log, i, 1, bp)))
+                       error = xlog_bread(log, i, 1, bp, &offset);
+                       if (error)
                                goto bread_err;
-                       offset = xlog_align(log, i, 1, bp);
+
                        if (XLOG_HEADER_MAGIC_NUM ==
                            be32_to_cpu(*(__be32 *)offset)) {
                                found = 2;
@@ -922,10 +946,10 @@ xlog_find_tail(
        if (*head_blk == after_umount_blk &&
            be32_to_cpu(rhead->h_num_logops) == 1) {
                umount_data_blk = (i + hblks) % log->l_logBBsize;
-               if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
+               error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+               if (error)
                        goto bread_err;
-               }
-               offset = xlog_align(log, umount_data_blk, 1, bp);
+
                op_head = (xlog_op_header_t *)offset;
                if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
                        /*
@@ -1017,9 +1041,10 @@ xlog_find_zeroed(
        bp = xlog_get_bp(log, 1);
        if (!bp)
                return ENOMEM;
-       if ((error = xlog_bread(log, 0, 1, bp)))
+       error = xlog_bread(log, 0, 1, bp, &offset);
+       if (error)
                goto bp_err;
-       offset = xlog_align(log, 0, 1, bp);
+
        first_cycle = xlog_get_cycle(offset);
        if (first_cycle == 0) {         /* completely zeroed log */
                *blk_no = 0;
@@ -1028,9 +1053,10 @@ xlog_find_zeroed(
        }
 
        /* check partially zeroed log */
-       if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
+       error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+       if (error)
                goto bp_err;
-       offset = xlog_align(log, log_bbnum-1, 1, bp);
+
        last_cycle = xlog_get_cycle(offset);
        if (last_cycle != 0) {          /* log completely written to */
                xlog_put_bp(bp);
@@ -1152,10 +1178,10 @@ xlog_write_log_records(
         */
        balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
        if (balign != start_block) {
-               if ((error = xlog_bread(log, start_block, 1, bp))) {
-                       xlog_put_bp(bp);
-                       return error;
-               }
+               error = xlog_bread_noalign(log, start_block, 1, bp);
+               if (error)
+                       goto out_put_bp;
+
                j = start_block - balign;
        }
 
@@ -1175,10 +1201,14 @@ xlog_write_log_records(
                        balign = BBTOB(ealign - start_block);
                        error = XFS_BUF_SET_PTR(bp, offset + balign,
                                                BBTOB(sectbb));
-                       if (!error)
-                               error = xlog_bread(log, ealign, sectbb, bp);
-                       if (!error)
-                               error = XFS_BUF_SET_PTR(bp, offset, bufblks);
+                       if (error)
+                               break;
+
+                       error = xlog_bread_noalign(log, ealign, sectbb, bp);
+                       if (error)
+                               break;
+
+                       error = XFS_BUF_SET_PTR(bp, offset, bufblks);
                        if (error)
                                break;
                }
@@ -1195,6 +1225,8 @@ xlog_write_log_records(
                start_block += endcount;
                j = 0;
        }
+
+ out_put_bp:
        xlog_put_bp(bp);
        return error;
 }
@@ -2511,16 +2543,10 @@ xlog_recover_do_inode_trans(
        }
 
 write_inode_buffer:
-       if (ITEM_TYPE(item) == XFS_LI_INODE) {
-               ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
-               bp->b_mount = mp;
-               XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
-               xfs_bdwrite(mp, bp);
-       } else {
-               XFS_BUF_STALE(bp);
-               error = xfs_bwrite(mp, bp);
-       }
-
+       ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+       bp->b_mount = mp;
+       XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+       xfs_bdwrite(mp, bp);
 error:
        if (need_free)
                kmem_free(in_f);
@@ -2769,51 +2795,48 @@ xlog_recover_do_trans(
        int                     error = 0;
        xlog_recover_item_t     *item, *first_item;
 
-       if ((error = xlog_recover_reorder_trans(trans)))
+       error = xlog_recover_reorder_trans(trans);
+       if (error)
                return error;
+
        first_item = item = trans->r_itemq;
        do {
-               /*
-                * we don't need to worry about the block number being
-                * truncated in > 1 TB buffers because in user-land,
-                * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so
-                * the blknos will get through the user-mode buffer
-                * cache properly.  The only bad case is o32 kernels
-                * where xfs_daddr_t is 32-bits but mount will warn us
-                * off a > 1 TB filesystem before we get here.
-                */
-               if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
-                       if  ((error = xlog_recover_do_buffer_trans(log, item,
-                                                                pass)))
-                               break;
-               } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
-                       if ((error = xlog_recover_do_inode_trans(log, item,
-                                                               pass)))
-                               break;
-               } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
-                       if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
-                                                 pass)))
-                               break;
-               } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
+               switch (ITEM_TYPE(item)) {
+               case XFS_LI_BUF:
+                       error = xlog_recover_do_buffer_trans(log, item, pass);
+                       break;
+               case XFS_LI_INODE:
+                       error = xlog_recover_do_inode_trans(log, item, pass);
+                       break;
+               case XFS_LI_EFI:
+                       error = xlog_recover_do_efi_trans(log, item,
+                                                         trans->r_lsn, pass);
+                       break;
+               case XFS_LI_EFD:
                        xlog_recover_do_efd_trans(log, item, pass);
-               } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
-                       if ((error = xlog_recover_do_dquot_trans(log, item,
-                                                                  pass)))
-                                       break;
-               } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
-                       if ((error = xlog_recover_do_quotaoff_trans(log, item,
-                                                                  pass)))
-                                       break;
-               } else {
-                       xlog_warn("XFS: xlog_recover_do_trans");
+                       error = 0;
+                       break;
+               case XFS_LI_DQUOT:
+                       error = xlog_recover_do_dquot_trans(log, item, pass);
+                       break;
+               case XFS_LI_QUOTAOFF:
+                       error = xlog_recover_do_quotaoff_trans(log, item,
+                                                              pass);
+                       break;
+               default:
+                       xlog_warn(
+       "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
                        ASSERT(0);
                        error = XFS_ERROR(EIO);
                        break;
                }
+
+               if (error)
+                       return error;
                item = item->ri_next;
        } while (first_item != item);
 
-       return error;
+       return 0;
 }
 
 /*
@@ -3490,9 +3513,11 @@ xlog_do_recovery_pass(
                hbp = xlog_get_bp(log, 1);
                if (!hbp)
                        return ENOMEM;
-               if ((error = xlog_bread(log, tail_blk, 1, hbp)))
+
+               error = xlog_bread(log, tail_blk, 1, hbp, &offset);
+               if (error)
                        goto bread_err1;
-               offset = xlog_align(log, tail_blk, 1, hbp);
+
                rhead = (xlog_rec_header_t *)offset;
                error = xlog_valid_rec_header(log, rhead, tail_blk);
                if (error)
@@ -3526,9 +3551,10 @@ xlog_do_recovery_pass(
        memset(rhash, 0, sizeof(rhash));
        if (tail_blk <= head_blk) {
                for (blk_no = tail_blk; blk_no < head_blk; ) {
-                       if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+                       error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+                       if (error)
                                goto bread_err2;
-                       offset = xlog_align(log, blk_no, hblks, hbp);
+
                        rhead = (xlog_rec_header_t *)offset;
                        error = xlog_valid_rec_header(log, rhead, blk_no);
                        if (error)
@@ -3536,10 +3562,11 @@ xlog_do_recovery_pass(
 
                        /* blocks in data section */
                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
-                       error = xlog_bread(log, blk_no + hblks, bblks, dbp);
+                       error = xlog_bread(log, blk_no + hblks, bblks, dbp,
+                                          &offset);
                        if (error)
                                goto bread_err2;
-                       offset = xlog_align(log, blk_no + hblks, bblks, dbp);
+
                        xlog_unpack_data(rhead, offset, log);
                        if ((error = xlog_recover_process_data(log,
                                                rhash, rhead, offset, pass)))
@@ -3562,10 +3589,10 @@ xlog_do_recovery_pass(
                        wrapped_hblks = 0;
                        if (blk_no + hblks <= log->l_logBBsize) {
                                /* Read header in one read */
-                               error = xlog_bread(log, blk_no, hblks, hbp);
+                               error = xlog_bread(log, blk_no, hblks, hbp,
+                                                  &offset);
                                if (error)
                                        goto bread_err2;
-                               offset = xlog_align(log, blk_no, hblks, hbp);
                        } else {
                                /* This LR is split across physical log end */
                                if (blk_no != log->l_logBBsize) {
@@ -3573,12 +3600,13 @@ xlog_do_recovery_pass(
                                        ASSERT(blk_no <= INT_MAX);
                                        split_hblks = log->l_logBBsize - (int)blk_no;
                                        ASSERT(split_hblks > 0);
-                                       if ((error = xlog_bread(log, blk_no,
-                                                       split_hblks, hbp)))
+                                       error = xlog_bread(log, blk_no,
+                                                          split_hblks, hbp,
+                                                          &offset);
+                                       if (error)
                                                goto bread_err2;
-                                       offset = xlog_align(log, blk_no,
-                                                       split_hblks, hbp);
                                }
+
                                /*
                                 * Note: this black magic still works with
                                 * large sector sizes (non-512) only because:
@@ -3596,14 +3624,19 @@ xlog_do_recovery_pass(
                                error = XFS_BUF_SET_PTR(hbp,
                                                bufaddr + BBTOB(split_hblks),
                                                BBTOB(hblks - split_hblks));
-                               if (!error)
-                                       error = xlog_bread(log, 0,
-                                                       wrapped_hblks, hbp);
-                               if (!error)
-                                       error = XFS_BUF_SET_PTR(hbp, bufaddr,
+                               if (error)
+                                       goto bread_err2;
+
+                               error = xlog_bread_noalign(log, 0,
+                                                          wrapped_hblks, hbp);
+                               if (error)
+                                       goto bread_err2;
+
+                               error = XFS_BUF_SET_PTR(hbp, bufaddr,
                                                        BBTOB(hblks));
                                if (error)
                                        goto bread_err2;
+
                                if (!offset)
                                        offset = xlog_align(log, 0,
                                                        wrapped_hblks, hbp);
@@ -3619,10 +3652,10 @@ xlog_do_recovery_pass(
 
                        /* Read in data for log record */
                        if (blk_no + bblks <= log->l_logBBsize) {
-                               error = xlog_bread(log, blk_no, bblks, dbp);
+                               error = xlog_bread(log, blk_no, bblks, dbp,
+                                                  &offset);
                                if (error)
                                        goto bread_err2;
-                               offset = xlog_align(log, blk_no, bblks, dbp);
                        } else {
                                /* This log record is split across the
                                 * physical end of log */
@@ -3636,12 +3669,13 @@ xlog_do_recovery_pass(
                                        split_bblks =
                                                log->l_logBBsize - (int)blk_no;
                                        ASSERT(split_bblks > 0);
-                                       if ((error = xlog_bread(log, blk_no,
-                                                       split_bblks, dbp)))
+                                       error = xlog_bread(log, blk_no,
+                                                       split_bblks, dbp,
+                                                       &offset);
+                                       if (error)
                                                goto bread_err2;
-                                       offset = xlog_align(log, blk_no,
-                                                       split_bblks, dbp);
                                }
+
                                /*
                                 * Note: this black magic still works with
                                 * large sector sizes (non-512) only because:
@@ -3658,15 +3692,19 @@ xlog_do_recovery_pass(
                                error = XFS_BUF_SET_PTR(dbp,
                                                bufaddr + BBTOB(split_bblks),
                                                BBTOB(bblks - split_bblks));
-                               if (!error)
-                                       error = xlog_bread(log, wrapped_hblks,
-                                                       bblks - split_bblks,
-                                                       dbp);
-                               if (!error)
-                                       error = XFS_BUF_SET_PTR(dbp, bufaddr,
-                                                       h_size);
                                if (error)
                                        goto bread_err2;
+
+                               error = xlog_bread_noalign(log, wrapped_hblks,
+                                               bblks - split_bblks,
+                                               dbp);
+                               if (error)
+                                       goto bread_err2;
+
+                               error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
+                               if (error)
+                                       goto bread_err2;
+
                                if (!offset)
                                        offset = xlog_align(log, wrapped_hblks,
                                                bblks - split_bblks, dbp);
@@ -3683,17 +3721,21 @@ xlog_do_recovery_pass(
 
                /* read first part of physical log */
                while (blk_no < head_blk) {
-                       if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+                       error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+                       if (error)
                                goto bread_err2;
-                       offset = xlog_align(log, blk_no, hblks, hbp);
+
                        rhead = (xlog_rec_header_t *)offset;
                        error = xlog_valid_rec_header(log, rhead, blk_no);
                        if (error)
                                goto bread_err2;
+
                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
-                       if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
+                       error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+                                          &offset);
+                       if (error)
                                goto bread_err2;
-                       offset = xlog_align(log, blk_no+hblks, bblks, dbp);
+
                        xlog_unpack_data(rhead, offset, log);
                        if ((error = xlog_recover_process_data(log, rhash,
                                                        rhead, offset, pass)))
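
All of the xfs_log_recover.c hunks above follow one mechanical conversion: xlog_bread() now hands back the aligned data pointer through a new output argument, so the separate xlog_align() call after each read disappears and every error check moves onto its own line. Below is a minimal, self-contained sketch of that calling convention; the names and types (read_blocks, struct buf) are hypothetical stand-ins for illustration, not the real XFS API.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for xfs_buf, for illustration only. */
struct buf {
	char data[1024];
};

/*
 * Sketch of the new convention: the read helper fills the buffer and
 * returns a pointer to the start of the requested blocks through an
 * out parameter, so callers no longer pair each read with an align call.
 */
static int read_blocks(struct buf *bp, int blk_no, int nbblks, char **offset)
{
	if (nbblks <= 0)
		return 1;		/* errno-style: non-zero on failure */
	memset(bp->data, 0, sizeof(bp->data));
	*offset = bp->data;		/* aligned start of block blk_no */
	return 0;
}

int main(void)
{
	struct buf bp;
	char *offset;
	int error;

	/* As in the patch: assign, then test the error on its own line. */
	error = read_blocks(&bp, 0, 1, &offset);
	if (error)
		return 1;
	printf("first byte: %d\n", offset[0]);
	return 0;
}
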
index 35300250e86d55fa2d44c7b3b15173fd8aa93767..b101990df027120632ff4f486713bc9783e32f31 100644 (file)
@@ -45,7 +45,6 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 
-STATIC int     xfs_uuid_mount(xfs_mount_t *);
 STATIC void    xfs_unmountfs_wait(xfs_mount_t *);
 
 
@@ -121,6 +120,84 @@ static const struct {
     { sizeof(xfs_sb_t),                         0 }
 };
 
+static DEFINE_MUTEX(xfs_uuid_table_mutex);
+static int xfs_uuid_table_size;
+static uuid_t *xfs_uuid_table;
+
+/*
+ * See if the UUID is unique among mounted XFS filesystems.
+ * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
+ */
+STATIC int
+xfs_uuid_mount(
+       struct xfs_mount        *mp)
+{
+       uuid_t                  *uuid = &mp->m_sb.sb_uuid;
+       int                     hole, i;
+
+       if (mp->m_flags & XFS_MOUNT_NOUUID)
+               return 0;
+
+       if (uuid_is_nil(uuid)) {
+               cmn_err(CE_WARN,
+                       "XFS: Filesystem %s has nil UUID - can't mount",
+                       mp->m_fsname);
+               return XFS_ERROR(EINVAL);
+       }
+
+       mutex_lock(&xfs_uuid_table_mutex);
+       for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
+               if (uuid_is_nil(&xfs_uuid_table[i])) {
+                       hole = i;
+                       continue;
+               }
+               if (uuid_equal(uuid, &xfs_uuid_table[i]))
+                       goto out_duplicate;
+       }
+
+       if (hole < 0) {
+               xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+                       (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
+                       xfs_uuid_table_size  * sizeof(*xfs_uuid_table),
+                       KM_SLEEP);
+               hole = xfs_uuid_table_size++;
+       }
+       xfs_uuid_table[hole] = *uuid;
+       mutex_unlock(&xfs_uuid_table_mutex);
+
+       return 0;
+
+ out_duplicate:
+       mutex_unlock(&xfs_uuid_table_mutex);
+       cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
+                        mp->m_fsname);
+       return XFS_ERROR(EINVAL);
+}
+
+STATIC void
+xfs_uuid_unmount(
+       struct xfs_mount        *mp)
+{
+       uuid_t                  *uuid = &mp->m_sb.sb_uuid;
+       int                     i;
+
+       if (mp->m_flags & XFS_MOUNT_NOUUID)
+               return;
+
+       mutex_lock(&xfs_uuid_table_mutex);
+       for (i = 0; i < xfs_uuid_table_size; i++) {
+               if (uuid_is_nil(&xfs_uuid_table[i]))
+                       continue;
+               if (!uuid_equal(uuid, &xfs_uuid_table[i]))
+                       continue;
+               memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
+               break;
+       }
+       ASSERT(i < xfs_uuid_table_size);
+       mutex_unlock(&xfs_uuid_table_mutex);
+}
+
+
 /*
  * Free up the resources associated with a mount structure.  Assume that
  * the structure was initially zeroed, so we can tell which fields got
@@ -256,6 +333,22 @@ xfs_mount_validate_sb(
                return XFS_ERROR(ENOSYS);
        }
 
+       /*
+        * Currently only very few inode sizes are supported.
+        */
+       switch (sbp->sb_inodesize) {
+       case 256:
+       case 512:
+       case 1024:
+       case 2048:
+               break;
+       default:
+               xfs_fs_mount_cmn_err(flags,
+                       "inode size of %d bytes not supported",
+                       sbp->sb_inodesize);
+               return XFS_ERROR(ENOSYS);
+       }
+
        if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
            xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
                xfs_fs_mount_cmn_err(flags,
@@ -574,32 +667,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
        mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
        mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
        mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
-       mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
        mp->m_blockmask = sbp->sb_blocksize - 1;
        mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
        mp->m_blockwmask = mp->m_blockwsize - 1;
 
-       /*
-        * Setup for attributes, in case they get created.
-        * This value is for inodes getting attributes for the first time,
-        * the per-inode value is for old attribute values.
-        */
-       ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
-       switch (sbp->sb_inodesize) {
-       case 256:
-               mp->m_attroffset = XFS_LITINO(mp) -
-                                  XFS_BMDR_SPACE_CALC(MINABTPTRS);
-               break;
-       case 512:
-       case 1024:
-       case 2048:
-               mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
-               break;
-       default:
-               ASSERT(0);
-       }
-       ASSERT(mp->m_attroffset < XFS_LITINO(mp));
-
        mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
        mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
        mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
@@ -645,7 +716,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
        for (index = 0; index < agcount; index++) {
                /*
                 * read the agf, then the agi. This gets us
-                * all the inforamtion we need and populates the
+                * all the information we need and populates the
                 * per-ag structures for us.
                 */
                error = xfs_alloc_pagf_init(mp, NULL, index, 0);
@@ -886,8 +957,6 @@ xfs_check_sizes(xfs_mount_t *mp)
 }
 
 /*
- * xfs_mountfs
- *
  * This function does the following on an initial mount of a file system:
  *     - reads the superblock from disk and init the mount struct
  *     - if we're a 32-bit kernel, do a size check on the superblock
@@ -905,7 +974,6 @@ xfs_mountfs(
        xfs_inode_t     *rip;
        __uint64_t      resblks;
        uint            quotamount, quotaflags;
-       int             uuid_mounted = 0;
        int             error = 0;
 
        xfs_mount_common(mp, sbp);
@@ -960,7 +1028,7 @@ xfs_mountfs(
         */
        error = xfs_update_alignment(mp);
        if (error)
-               goto error1;
+               goto out;
 
        xfs_alloc_compute_maxlevels(mp);
        xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
@@ -971,19 +1039,9 @@ xfs_mountfs(
 
        mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
 
-       /*
-        * XFS uses the uuid from the superblock as the unique
-        * identifier for fsid.  We can not use the uuid from the volume
-        * since a single partition filesystem is identical to a single
-        * partition volume/filesystem.
-        */
-       if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
-               if (xfs_uuid_mount(mp)) {
-                       error = XFS_ERROR(EINVAL);
-                       goto error1;
-               }
-               uuid_mounted=1;
-       }
+       error = xfs_uuid_mount(mp);
+       if (error)
+               goto out;
 
        /*
         * Set the minimum read and write sizes
@@ -1007,7 +1065,7 @@ xfs_mountfs(
         */
        error = xfs_check_sizes(mp);
        if (error)
-               goto error1;
+               goto out_remove_uuid;
 
        /*
         * Initialize realtime fields in the mount structure
@@ -1015,7 +1073,7 @@ xfs_mountfs(
        error = xfs_rtmount_init(mp);
        if (error) {
                cmn_err(CE_WARN, "XFS: RT mount failed");
-               goto error1;
+               goto out_remove_uuid;
        }
 
        /*
@@ -1045,26 +1103,26 @@ xfs_mountfs(
        mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
                                  KM_MAYFAIL);
        if (!mp->m_perag)
-               goto error1;
+               goto out_remove_uuid;
 
        mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
 
+       if (!sbp->sb_logblocks) {
+               cmn_err(CE_WARN, "XFS: no log defined");
+               XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
+               error = XFS_ERROR(EFSCORRUPTED);
+               goto out_free_perag;
+       }
+
        /*
         * log's mount-time initialization. Perform 1st part recovery if needed
         */
-       if (likely(sbp->sb_logblocks > 0)) {    /* check for volume case */
-               error = xfs_log_mount(mp, mp->m_logdev_targp,
-                                     XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
-                                     XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
-               if (error) {
-                       cmn_err(CE_WARN, "XFS: log mount failed");
-                       goto error2;
-               }
-       } else {        /* No log has been defined */
-               cmn_err(CE_WARN, "XFS: no log defined");
-               XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
-               error = XFS_ERROR(EFSCORRUPTED);
-               goto error2;
+       error = xfs_log_mount(mp, mp->m_logdev_targp,
+                             XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+                             XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+       if (error) {
+               cmn_err(CE_WARN, "XFS: log mount failed");
+               goto out_free_perag;
        }
 
        /*
@@ -1086,15 +1144,14 @@ xfs_mountfs(
         * If we are currently making the filesystem, the initialisation will
         * fail as the perag data is in an undefined state.
         */
-
        if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
            !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) &&
             !mp->m_sb.sb_inprogress) {
                error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
-               if (error) {
-                       goto error2;
-               }
+               if (error)
+                       goto out_free_perag;
        }
+
        /*
         * Get and sanity-check the root inode.
         * Save the pointer to it in the mount structure.
@@ -1102,7 +1159,7 @@ xfs_mountfs(
        error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
        if (error) {
                cmn_err(CE_WARN, "XFS: failed to read root inode");
-               goto error3;
+               goto out_log_dealloc;
        }
 
        ASSERT(rip != NULL);
@@ -1116,7 +1173,7 @@ xfs_mountfs(
                XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
                                 mp);
                error = XFS_ERROR(EFSCORRUPTED);
-               goto error4;
+               goto out_rele_rip;
        }
        mp->m_rootip = rip;     /* save it */
 
@@ -1131,7 +1188,7 @@ xfs_mountfs(
                 * Free up the root inode.
                 */
                cmn_err(CE_WARN, "XFS: failed to read RT inodes");
-               goto error4;
+               goto out_rele_rip;
        }
 
        /*
@@ -1143,7 +1200,7 @@ xfs_mountfs(
                error = xfs_mount_log_sb(mp, mp->m_update_flags);
                if (error) {
                        cmn_err(CE_WARN, "XFS: failed to write sb changes");
-                       goto error4;
+                       goto out_rtunmount;
                }
        }
 
@@ -1152,7 +1209,7 @@ xfs_mountfs(
         */
        error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
        if (error)
-               goto error4;
+               goto out_rtunmount;
 
        /*
         * Finish recovering the file system.  This part needed to be
@@ -1162,7 +1219,7 @@ xfs_mountfs(
        error = xfs_log_mount_finish(mp);
        if (error) {
                cmn_err(CE_WARN, "XFS: log mount finish failed");
-               goto error4;
+               goto out_rtunmount;
        }
 
        /*
@@ -1170,7 +1227,7 @@ xfs_mountfs(
         */
        error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
        if (error)
-               goto error4;
+               goto out_rtunmount;
 
        /*
         * Now we are mounted, reserve a small amount of unused space for
@@ -1194,18 +1251,17 @@ xfs_mountfs(
 
        return 0;
 
- error4:
-       /*
-        * Free up the root inode.
-        */
+ out_rtunmount:
+       xfs_rtunmount_inodes(mp);
+ out_rele_rip:
        IRELE(rip);
- error3:
-       xfs_log_unmount_dealloc(mp);
- error2:
+ out_log_dealloc:
+       xfs_log_unmount(mp);
+ out_free_perag:
        xfs_free_perag(mp);
- error1:
-       if (uuid_mounted)
-               uuid_table_remove(&mp->m_sb.sb_uuid);
+ out_remove_uuid:
+       xfs_uuid_unmount(mp);
+ out:
        return error;
 }
 
@@ -1226,15 +1282,12 @@ xfs_unmountfs(
         */
        XFS_QM_UNMOUNT(mp);
 
-       if (mp->m_rbmip)
-               IRELE(mp->m_rbmip);
-       if (mp->m_rsumip)
-               IRELE(mp->m_rsumip);
+       xfs_rtunmount_inodes(mp);
        IRELE(mp->m_rootip);
 
        /*
         * We can potentially deadlock here if we have an inode cluster
-        * that has been freed has it's buffer still pinned in memory because
+        * that has been freed has its buffer still pinned in memory because
         * the transaction is still sitting in a iclog. The stale inodes
         * on that buffer will have their flush locks held until the
         * transaction hits the disk and the callbacks run. the inode
@@ -1266,7 +1319,7 @@ xfs_unmountfs(
         * Unreserve any blocks we have so that when we unmount we don't account
         * the reserved free space as used. This is really only necessary for
         * lazy superblock counting because it trusts the incore superblock
-        * counters to be aboslutely correct on clean unmount.
+        * counters to be absolutely correct on clean unmount.
         *
         * We don't bother correcting this elsewhere for lazy superblock
         * counting because on mount of an unclean filesystem we reconstruct the
@@ -1288,10 +1341,9 @@ xfs_unmountfs(
                                "Freespace may not be correct on next mount.");
        xfs_unmountfs_writesb(mp);
        xfs_unmountfs_wait(mp);                 /* wait for async bufs */
-       xfs_log_unmount(mp);                    /* Done! No more fs ops. */
-
-       if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
-               uuid_table_remove(&mp->m_sb.sb_uuid);
+       xfs_log_unmount_write(mp);
+       xfs_log_unmount(mp);
+       xfs_uuid_unmount(mp);
 
 #if defined(DEBUG)
        xfs_errortag_clearall(mp, 0);
@@ -1792,29 +1844,6 @@ xfs_freesb(
        mp->m_sb_bp = NULL;
 }
 
-/*
- * See if the UUID is unique among mounted XFS filesystems.
- * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
- */
-STATIC int
-xfs_uuid_mount(
-       xfs_mount_t     *mp)
-{
-       if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
-               cmn_err(CE_WARN,
-                       "XFS: Filesystem %s has nil UUID - can't mount",
-                       mp->m_fsname);
-               return -1;
-       }
-       if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
-               cmn_err(CE_WARN,
-                       "XFS: Filesystem %s has duplicate UUID - can't mount",
-                       mp->m_fsname);
-               return -1;
-       }
-       return 0;
-}
-
 /*
  * Used to log changes to the superblock unit and width fields which could
  * be altered by the mount options, as well as any potential sb_features2
@@ -1868,7 +1897,7 @@ xfs_mount_log_sb(
  * we disable the per-cpu counter and go through the slow path.
  *
  * The slow path is the current xfs_mod_incore_sb() function.  This means that
- * when we disable a per-cpu counter, we need to drain it's resources back to
+ * when we disable a per-cpu counter, we need to drain its resources back to
  * the global superblock. We do this after disabling the counter to prevent
  * more threads from queueing up on the counter.
  *
index f5e9937f9bdb5266e2702c9c653cb7a683f09a2f..7af44adffc8f308a4b51100a25bd1f0e7dfb1465 100644 (file)
@@ -136,7 +136,6 @@ typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
                        struct xfs_dquot *, struct xfs_dquot *, uint);
 typedef void   (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
 typedef int    (*xfs_dqsync_t)(struct xfs_mount *, int flags);
-typedef int    (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
 
 typedef struct xfs_qmops {
        xfs_qminit_t            xfs_qminit;
@@ -154,7 +153,6 @@ typedef struct xfs_qmops {
        xfs_dqvopchownresv_t    xfs_dqvopchownresv;
        xfs_dqstatvfs_t         xfs_dqstatvfs;
        xfs_dqsync_t            xfs_dqsync;
-       xfs_quotactl_t          xfs_quotactl;
        struct xfs_dqtrxops     *xfs_dqtrxops;
 } xfs_qmops_t;
 
@@ -188,8 +186,6 @@ typedef struct xfs_qmops {
        (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
 #define XFS_QM_DQSYNC(mp, flags) \
        (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
-#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \
-       (*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr)
 
 #ifdef HAVE_PERCPU_SB
 
@@ -273,19 +269,17 @@ typedef struct xfs_mount {
        uint                    m_inobt_mnr[2]; /* min inobt btree records */
        uint                    m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
        uint                    m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
-       uint                    m_in_maxlevels; /* XFS_IN_MAXLEVELS */
+       uint                    m_in_maxlevels; /* max inobt btree levels. */
        struct xfs_perag        *m_perag;       /* per-ag accounting info */
        struct rw_semaphore     m_peraglock;    /* lock for m_perag (pointer) */
        struct mutex            m_growlock;     /* growfs mutex */
        int                     m_fixedfsid[2]; /* unchanged for life of FS */
        uint                    m_dmevmask;     /* DMI events for this FS */
        __uint64_t              m_flags;        /* global mount flags */
-       uint                    m_attroffset;   /* inode attribute offset */
        uint                    m_dir_node_ents; /* #entries in a dir danode */
        uint                    m_attr_node_ents; /* #entries in attr danode */
        int                     m_ialloc_inos;  /* inodes in inode allocation */
        int                     m_ialloc_blks;  /* blocks in inode allocation */
-       int                     m_litino;       /* size of inode union area */
        int                     m_inoalign_mask;/* mask sb_inoalignmt if used */
        uint                    m_qflags;       /* quota status flags */
        xfs_trans_reservations_t m_reservations;/* precomputed res values */
@@ -293,9 +287,6 @@ typedef struct xfs_mount {
        __uint64_t              m_maxioffset;   /* maximum inode offset */
        __uint64_t              m_resblks;      /* total reserved blocks */
        __uint64_t              m_resblks_avail;/* available reserved blocks */
-#if XFS_BIG_INUMS
-       xfs_ino_t               m_inoadd;       /* add value for ino64_offset */
-#endif
        int                     m_dalign;       /* stripe unit */
        int                     m_swidth;       /* stripe width */
        int                     m_sinoalign;    /* stripe unit inode alignment */
@@ -337,7 +328,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC                (1ULL << 0)     /* for nfs - all metadata ops
                                                   must be synchronous except
                                                   for space allocations */
-#define XFS_MOUNT_INO64                (1ULL << 1)
 #define XFS_MOUNT_DMAPI                (1ULL << 2)     /* dmapi is enabled */
 #define XFS_MOUNT_WAS_CLEAN    (1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN  (1ULL << 4)     /* atomic stop of all filesystem
@@ -389,8 +379,8 @@ typedef struct xfs_mount {
  * Synchronous read and write sizes.  This should be
  * better for NFSv2 wsync filesystems.
  */
-#define        XFS_WSYNC_READIO_LOG    15      /* 32K */
-#define        XFS_WSYNC_WRITEIO_LOG   14      /* 16K */
+#define        XFS_WSYNC_READIO_LOG    15      /* 32k */
+#define        XFS_WSYNC_WRITEIO_LOG   14      /* 16k */
 
 /*
  * Allow large block sizes to be reported to userspace programs if the
@@ -500,9 +490,6 @@ typedef struct xfs_mod_sb {
        int64_t         msb_delta;      /* Change to make to specified field */
 } xfs_mod_sb_t;
 
-#define        XFS_MOUNT_ILOCK(mp)     mutex_lock(&((mp)->m_ilock))
-#define        XFS_MOUNT_IUNLOCK(mp)   mutex_unlock(&((mp)->m_ilock))
-
 extern int     xfs_log_sbcount(xfs_mount_t *, uint);
 extern int     xfs_mountfs(xfs_mount_t *mp);
 extern void    xfs_mountfs_check_barriers(xfs_mount_t *mp);
index 27f80581520ad2d19c138419a2e3bca42de86944..e101790ea8e795e3644b838885d8d3db700187b2 100644 (file)
@@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = {
        .xfs_dqvopchownresv     = (xfs_dqvopchownresv_t) fs_noerr,
        .xfs_dqstatvfs          = (xfs_dqstatvfs_t) fs_noval,
        .xfs_dqsync             = (xfs_dqsync_t) fs_noerr,
-       .xfs_quotactl           = (xfs_quotactl_t) fs_nosys,
 };
 
 int
index 48965ecaa1558589e5a1b9db4a829e9c367665e4..f5d1202dde258a85f806472e987e0f44fd9f4989 100644 (file)
@@ -18,6 +18,8 @@
 #ifndef __XFS_QUOTA_H__
 #define __XFS_QUOTA_H__
 
+struct xfs_trans;
+
 /*
  * The ondisk form of a dquot structure.
  */
@@ -185,7 +187,6 @@ typedef struct xfs_qoff_logformat {
  * to a single function. None of these XFS_QMOPT_* flags are meant to have
  * persistent values (ie. their values can and will change between versions)
  */
-#define XFS_QMOPT_DQLOCK       0x0000001 /* dqlock */
 #define XFS_QMOPT_DQALLOC      0x0000002 /* alloc dquot ondisk if needed */
 #define XFS_QMOPT_UQUOTA       0x0000004 /* user dquot requested */
 #define XFS_QMOPT_PQUOTA       0x0000008 /* project dquot requested */
index c5bb86f3ec053e1d47cefe6ff806cf2772aa2114..385f6dceba5db357a8bf7361fc24adde3e1185a2 100644 (file)
@@ -2288,6 +2288,16 @@ xfs_rtmount_inodes(
        return 0;
 }
 
+void
+xfs_rtunmount_inodes(
+       struct xfs_mount        *mp)
+{
+       if (mp->m_rbmip)
+               IRELE(mp->m_rbmip);
+       if (mp->m_rsumip)
+               IRELE(mp->m_rsumip);
+}
+
 /*
  * Pick an extent for allocation at the start of a new realtime file.
  * Use the sequence number stored in the atime field of the bitmap inode.
index 8d8dcd215716262d5a9bdb2a46e6c7b359604769..b2d67adb6a08f4f9f801168337e0c81c4bb00d5d 100644 (file)
@@ -23,8 +23,8 @@ struct xfs_trans;
 
 /* Min and max rt extent sizes, specified in bytes */
 #define        XFS_MAX_RTEXTSIZE       (1024 * 1024 * 1024)    /* 1GB */
-#define        XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64KB */
-#define        XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4KB */
+#define        XFS_DFL_RTEXTSIZE       (64 * 1024)             /* 64kB */
+#define        XFS_MIN_RTEXTSIZE       (4 * 1024)              /* 4kB */
 
 /*
  * Constants for bit manipulations.
@@ -108,6 +108,9 @@ xfs_rtfree_extent(
 int                                    /* error */
 xfs_rtmount_init(
        struct xfs_mount        *mp);   /* file system mount structure */
+void
+xfs_rtunmount_inodes(
+       struct xfs_mount        *mp);
 
 /*
  * Get the bitmap and summary inodes into the mount structure
@@ -146,6 +149,7 @@ xfs_growfs_rt(
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)
 # define xfs_rtmount_init(m)    (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtunmount_inodes(m)
 #endif /* CONFIG_XFS_RT */
 
 #endif /* __KERNEL__ */
index d6fe4a88d79f0557bcdd2a46434a438debd9cb72..775249a54f6f9dc06584e6fe69617c6afa524697 100644 (file)
@@ -292,7 +292,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  * In a write transaction we can allocate a maximum of 2
  * extents.  This gives:
  *    the inode getting the new extents: inode size
- *    the inode\'s bmap btree: max depth * block size
+ *    the inode's bmap btree: max depth * block size
  *    the agfs of the ags from which the extents are allocated: 2 * sector
  *    the superblock free block counter: sector size
  *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
@@ -321,7 +321,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * In truncating a file we free up to two extents at once.  We can modify:
  *    the inode being truncated: inode size
- *    the inode\'s bmap btree: (max depth + 1) * block size
+ *    the inode's bmap btree: (max depth + 1) * block size
  * And the bmap_finish transaction can free the blocks and bmap blocks:
  *    the agf for each of the ags: 4 * sector size
  *    the agfl for each of the ags: 4 * sector size
@@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
          (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
          (128 * 5) + \
          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-          (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+          (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
            XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define        XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
@@ -431,8 +431,8 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  *    the new inode: inode size
  *    the inode btree entry: 1 block
  *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode\'s bmap btree: (max depth + v2) * block size
- *    the blocks for the symlink: 1 KB
+ *    the directory inode's bmap btree: (max depth + v2) * block size
+ *    the blocks for the symlink: 1 kB
  * Or in the first xact we allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
@@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
          (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
         (2 * (mp)->m_sb.sb_sectsize + \
          XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-         XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+         XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-         (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+         (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
           XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define        XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
@@ -463,7 +463,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
  *    the inode btree entry: block size
  *    the superblock for the nlink flag: sector size
  *    the directory btree: (max depth + v2) * dir block size
- *    the directory inode\'s bmap btree: (max depth + v2) * block size
+ *    the directory inode's bmap btree: (max depth + v2) * block size
  * Or in the first xact we allocate some inodes giving:
  *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
  *    the superblock for the nlink flag: sector size
@@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
          (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
         (3 * (mp)->m_sb.sb_sectsize + \
          XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
-         XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+         XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-         (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+         (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
           XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
 
 #define        XFS_CREATE_LOG_RES(mp)  ((mp)->m_reservations.tr_create)
@@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
         MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
         (128 * 5) + \
          XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-         (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+         (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
           XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 
 
@@ -637,7 +637,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 /*
  * Removing the attribute fork of a file
  *    the inode being truncated: inode size
- *    the inode\'s bmap btree: max depth * block size
+ *    the inode's bmap btree: max depth * block size
  * And the bmap_finish transaction can free the blocks and bmap blocks:
  *    the agf for each of the ags: 4 * sector size
  *    the agfl for each of the ags: 4 * sector size
index 2d47f10f8bed4cbe9b2441a29649311c2aa0acae..f31271c30de9bc6edf1b2ac575195585f6f99f3a 100644 (file)
@@ -79,7 +79,7 @@ xfs_trans_ail_tail(
  * the push is run asynchronously in a separate thread, so we return the tail
  * of the log right now instead of the tail after the push. This means we will
  * either continue right away, or we will sleep waiting on the async thread to
- * do it's work.
+ * do its work.
  *
  * We do this unlocked - we only need to know whether there is anything in the
  * AIL at the time we are called. We don't need to access the contents of
@@ -160,7 +160,7 @@ xfs_trans_ail_cursor_next(
 /*
  * Now that the traversal is complete, we need to remove the cursor
  * from the list of traversing cursors. Avoid removing the embedded
- * push cursor, but use the fact it is alway present to make the
+ * push cursor, but use the fact it is always present to make the
  * list deletion simple.
  */
 void
index e110bf57d7f496ddd70ce9f4927da8d67fa20545..eb3fc57f9eef681d73e39e07e3546cb45c350929 100644 (file)
@@ -22,7 +22,7 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
-/* XXX: from here down needed until struct xfs_trans has it's own ailp */
+/* XXX: from here down needed until struct xfs_trans has its own ailp */
 #include "xfs_bit.h"
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
index 4ea2e5074bdd2e68a7ea943191ddd60effdac791..7d2c920dfb9c01e0aac1fdc13576aa8f950bcf7a 100644 (file)
@@ -47,7 +47,7 @@
 #define        XFS_DIRREMOVE_SPACE_RES(mp)     \
        XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define        XFS_IALLOC_SPACE_RES(mp)        \
-       (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
+       (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
 
 /*
  * Space reservation values for various transactions.
index b2f724502f1bfbeed0f38712a72c825fa91b3268..d725428c9df6f64734591ed3366932e7550fe2d9 100644 (file)
 
 #ifdef __KERNEL__
 
-/*
- * POSIX Extensions
- */
-typedef unsigned char          uchar_t;
-typedef unsigned short         ushort_t;
-typedef unsigned int           uint_t;
-typedef unsigned long          ulong_t;
-
 /*
  * Additional type declarations for XFS
  */
index fcc2285d03ed7c9be2c56586d631242e6fe06874..79b9e5ea53590c25f8fd85797202653124ae35f7 100644 (file)
@@ -374,7 +374,7 @@ xfs_truncate_file(
 
        /*
         * Follow the normal truncate locking protocol.  Since we
-        * hold the inode in the transaction, we know that it's number
+        * hold the inode in the transaction, we know that its number
         * of references will stay constant.
         */
        xfs_ilock(ip, XFS_ILOCK_EXCL);
index 0e55c5d7db5fd2bbcc13e88baa38a915c2db94b7..7394c7af5de5ab0822beae7e742929f6dad4742d 100644 (file)
@@ -1136,7 +1136,7 @@ xfs_inactive(
         * If the inode is already free, then there can be nothing
         * to clean up here.
         */
-       if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
+       if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
                ASSERT(ip->i_df.if_real_bytes == 0);
                ASSERT(ip->i_df.if_broot_bytes == 0);
                return VN_INACTIVE_CACHE;
@@ -1387,23 +1387,28 @@ xfs_create(
        xfs_inode_t             **ipp,
        cred_t                  *credp)
 {
-       xfs_mount_t             *mp = dp->i_mount;
-       xfs_inode_t             *ip;
-       xfs_trans_t             *tp;
+       int                     is_dir = S_ISDIR(mode);
+       struct xfs_mount        *mp = dp->i_mount;
+       struct xfs_inode        *ip = NULL;
+       struct xfs_trans        *tp = NULL;
        int                     error;
        xfs_bmap_free_t         free_list;
        xfs_fsblock_t           first_block;
        boolean_t               unlock_dp_on_error = B_FALSE;
-       int                     dm_event_sent = 0;
        uint                    cancel_flags;
        int                     committed;
        xfs_prid_t              prid;
-       struct xfs_dquot        *udqp, *gdqp;
+       struct xfs_dquot        *udqp = NULL;
+       struct xfs_dquot        *gdqp = NULL;
        uint                    resblks;
+       uint                    log_res;
+       uint                    log_count;
 
-       ASSERT(!*ipp);
        xfs_itrace_entry(dp);
 
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
        if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
                                dp, DM_RIGHT_NULL, NULL,
@@ -1412,84 +1417,97 @@ xfs_create(
 
                if (error)
                        return error;
-               dm_event_sent = 1;
        }
 
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       /* Return through std_return after this point. */
-
-       udqp = gdqp = NULL;
        if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
                prid = dp->i_d.di_projid;
        else
-               prid = (xfs_prid_t)dfltprid;
+               prid = dfltprid;
 
        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
        error = XFS_QM_DQVOPALLOC(mp, dp,
                        current_fsuid(), current_fsgid(), prid,
-                       XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
+                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;
 
-       ip = NULL;
+       if (is_dir) {
+               rdev = 0;
+               resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
+               log_res = XFS_MKDIR_LOG_RES(mp);
+               log_count = XFS_MKDIR_LOG_COUNT;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+       } else {
+               resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+               log_res = XFS_CREATE_LOG_RES(mp);
+               log_count = XFS_CREATE_LOG_COUNT;
+               tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+       }
 
-       tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-       resblks = XFS_CREATE_SPACE_RES(mp, name->len);
+
        /*
         * Initially assume that the file does not exist and
         * reserve the resources for that case.  If that is not
         * the case we'll drop the one we have and get a more
         * appropriate transaction later.
         */
-       error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-                       XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+       error = xfs_trans_reserve(tp, resblks, log_res, 0,
+                       XFS_TRANS_PERM_LOG_RES, log_count);
        if (error == ENOSPC) {
                resblks = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
-                               XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+               error = xfs_trans_reserve(tp, 0, log_res, 0,
+                               XFS_TRANS_PERM_LOG_RES, log_count);
        }
        if (error) {
                cancel_flags = 0;
-               goto error_return;
+               goto out_trans_cancel;
        }
 
        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
        unlock_dp_on_error = B_TRUE;
 
-       xfs_bmap_init(&free_list, &first_block);
+       /*
+        * Check for directory link count overflow.
+        */
+       if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) {
+               error = XFS_ERROR(EMLINK);
+               goto out_trans_cancel;
+       }
 
-       ASSERT(ip == NULL);
+       xfs_bmap_init(&free_list, &first_block);
 
        /*
         * Reserve disk quota and the inode.
         */
        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
        if (error)
-               goto error_return;
+               goto out_trans_cancel;
 
        error = xfs_dir_canenter(tp, dp, name, resblks);
        if (error)
-               goto error_return;
-       error = xfs_dir_ialloc(&tp, dp, mode, 1,
-                       rdev, credp, prid, resblks > 0,
-                       &ip, &committed);
+               goto out_trans_cancel;
+
+       /*
+        * A newly created regular or special file just has one directory
+        * entry pointing to them, but a directory also the "." entry
+        * pointing to itself.
+        */
+       error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp,
+                              prid, resblks > 0, &ip, &committed);
        if (error) {
                if (error == ENOSPC)
-                       goto error_return;
-               goto abort_return;
+                       goto out_trans_cancel;
+               goto out_trans_abort;
        }
-       xfs_itrace_ref(ip);
 
        /*
         * At this point, we've gotten a newly allocated inode.
         * It is locked (and joined to the transaction).
         */
-
+       xfs_itrace_ref(ip);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
        /*
@@ -1508,19 +1526,28 @@ xfs_create(
                                        resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
        if (error) {
                ASSERT(error != ENOSPC);
-               goto abort_return;
+               goto out_trans_abort;
        }
        xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
+       if (is_dir) {
+               error = xfs_dir_init(tp, ip, dp);
+               if (error)
+                       goto out_bmap_cancel;
+
+               error = xfs_bumplink(tp, dp);
+               if (error)
+                       goto out_bmap_cancel;
+       }
+
        /*
         * If this is a synchronous mount, make sure that the
         * create transaction goes to disk before returning to
         * the user.
         */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
                xfs_trans_set_sync(tp);
-       }
 
        /*
         * Attach the dquot(s) to the inodes and modify them incore.
@@ -1537,16 +1564,13 @@ xfs_create(
        IHOLD(ip);
 
        error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error) {
-               xfs_bmap_cancel(&free_list);
-               goto abort_rele;
-       }
+       if (error)
+               goto out_abort_rele;
 
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
        if (error) {
                IRELE(ip);
-               tp = NULL;
-               goto error_return;
+               goto out_dqrele;
        }
 
        XFS_QM_DQRELE(mp, udqp);
@@ -1555,26 +1579,22 @@ xfs_create(
        *ipp = ip;
 
        /* Fallthrough to std_return with error = 0  */
-
-std_return:
-       if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
-           DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-               (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-                       dp, DM_RIGHT_NULL,
-                       *ipp ? ip : NULL,
-                       DM_RIGHT_NULL, name->name, NULL,
-                       mode, error, 0);
+ std_return:
+       if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
+               XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL,
+                               ip, DM_RIGHT_NULL, name->name, NULL, mode,
+                               error, 0);
        }
+
        return error;
 
- abort_return:
+ out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+ out_trans_abort:
        cancel_flags |= XFS_TRANS_ABORT;
-       /* FALLTHROUGH */
-
- error_return:
-       if (tp != NULL)
-               xfs_trans_cancel(tp, cancel_flags);
-
+ out_trans_cancel:
+       xfs_trans_cancel(tp, cancel_flags);
+ out_dqrele:
        XFS_QM_DQRELE(mp, udqp);
        XFS_QM_DQRELE(mp, gdqp);
 
@@ -1583,20 +1603,18 @@ std_return:
 
        goto std_return;
 
- abort_rele:
out_abort_rele:
        /*
         * Wait until after the current transaction is aborted to
         * release the inode.  This prevents recursive transactions
         * and deadlocks from xfs_inactive.
         */
+       xfs_bmap_cancel(&free_list);
        cancel_flags |= XFS_TRANS_ABORT;
        xfs_trans_cancel(tp, cancel_flags);
        IRELE(ip);
-
-       XFS_QM_DQRELE(mp, udqp);
-       XFS_QM_DQRELE(mp, gdqp);
-
-       goto std_return;
+       unlock_dp_on_error = B_FALSE;
+       goto out_dqrele;
 }
 
 #ifdef DEBUG
@@ -2004,8 +2022,10 @@ xfs_link(
        /* Return through std_return after this point. */
 
        error = XFS_QM_DQATTACH(mp, sip, 0);
-       if (!error && sip != tdp)
-               error = XFS_QM_DQATTACH(mp, tdp, 0);
+       if (error)
+               goto std_return;
+
+       error = XFS_QM_DQATTACH(mp, tdp, 0);
        if (error)
                goto std_return;
 
@@ -2110,209 +2130,6 @@ std_return:
        goto std_return;
 }
 
-
-int
-xfs_mkdir(
-       xfs_inode_t             *dp,
-       struct xfs_name         *dir_name,
-       mode_t                  mode,
-       xfs_inode_t             **ipp,
-       cred_t                  *credp)
-{
-       xfs_mount_t             *mp = dp->i_mount;
-       xfs_inode_t             *cdp;   /* inode of created dir */
-       xfs_trans_t             *tp;
-       int                     cancel_flags;
-       int                     error;
-       int                     committed;
-       xfs_bmap_free_t         free_list;
-       xfs_fsblock_t           first_block;
-       boolean_t               unlock_dp_on_error = B_FALSE;
-       boolean_t               created = B_FALSE;
-       int                     dm_event_sent = 0;
-       xfs_prid_t              prid;
-       struct xfs_dquot        *udqp, *gdqp;
-       uint                    resblks;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       tp = NULL;
-
-       if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
-               error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
-                                       dp, DM_RIGHT_NULL, NULL,
-                                       DM_RIGHT_NULL, dir_name->name, NULL,
-                                       mode, 0, 0);
-               if (error)
-                       return error;
-               dm_event_sent = 1;
-       }
-
-       /* Return through std_return after this point. */
-
-       xfs_itrace_entry(dp);
-
-       mp = dp->i_mount;
-       udqp = gdqp = NULL;
-       if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
-               prid = dp->i_d.di_projid;
-       else
-               prid = (xfs_prid_t)dfltprid;
-
-       /*
-        * Make sure that we have allocated dquot(s) on disk.
-        */
-       error = XFS_QM_DQVOPALLOC(mp, dp,
-                       current_fsuid(), current_fsgid(), prid,
-                       XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
-       if (error)
-               goto std_return;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-       resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
-       error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
-                                 XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
-       if (error == ENOSPC) {
-               resblks = 0;
-               error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
-                                         XFS_TRANS_PERM_LOG_RES,
-                                         XFS_MKDIR_LOG_COUNT);
-       }
-       if (error) {
-               cancel_flags = 0;
-               goto error_return;
-       }
-
-       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-       unlock_dp_on_error = B_TRUE;
-
-       /*
-        * Check for directory link count overflow.
-        */
-       if (dp->i_d.di_nlink >= XFS_MAXLINK) {
-               error = XFS_ERROR(EMLINK);
-               goto error_return;
-       }
-
-       /*
-        * Reserve disk quota and the inode.
-        */
-       error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
-       if (error)
-               goto error_return;
-
-       error = xfs_dir_canenter(tp, dp, dir_name, resblks);
-       if (error)
-               goto error_return;
-       /*
-        * create the directory inode.
-        */
-       error = xfs_dir_ialloc(&tp, dp, mode, 2,
-                       0, credp, prid, resblks > 0,
-               &cdp, NULL);
-       if (error) {
-               if (error == ENOSPC)
-                       goto error_return;
-               goto abort_return;
-       }
-       xfs_itrace_ref(cdp);
-
-       /*
-        * Now we add the directory inode to the transaction.
-        * We waited until now since xfs_dir_ialloc might start
-        * a new transaction.  Had we joined the transaction
-        * earlier, the locks might have gotten released. An error
-        * from here on will result in the transaction cancel
-        * unlocking dp so don't do it explicitly in the error path.
-        */
-       IHOLD(dp);
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-       unlock_dp_on_error = B_FALSE;
-
-       xfs_bmap_init(&free_list, &first_block);
-
-       error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
-                                       &first_block, &free_list, resblks ?
-                                       resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
-       if (error) {
-               ASSERT(error != ENOSPC);
-               goto error1;
-       }
-       xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-       error = xfs_dir_init(tp, cdp, dp);
-       if (error)
-               goto error2;
-
-       error = xfs_bumplink(tp, dp);
-       if (error)
-               goto error2;
-
-       created = B_TRUE;
-
-       *ipp = cdp;
-       IHOLD(cdp);
-
-       /*
-        * Attach the dquots to the new inode and modify the icount incore.
-        */
-       XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * mkdir transaction goes to disk before returning to
-        * the user.
-        */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-               xfs_trans_set_sync(tp);
-       }
-
-       error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error) {
-               IRELE(cdp);
-               goto error2;
-       }
-
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-       XFS_QM_DQRELE(mp, udqp);
-       XFS_QM_DQRELE(mp, gdqp);
-       if (error) {
-               IRELE(cdp);
-       }
-
-       /* Fall through to std_return with error = 0 or errno from
-        * xfs_trans_commit. */
-
-std_return:
-       if ((created || (error != 0 && dm_event_sent != 0)) &&
-           DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
-               (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
-                                       dp, DM_RIGHT_NULL,
-                                       created ? cdp : NULL,
-                                       DM_RIGHT_NULL,
-                                       dir_name->name, NULL,
-                                       mode, error, 0);
-       }
-       return error;
-
- error2:
- error1:
-       xfs_bmap_cancel(&free_list);
- abort_return:
-       cancel_flags |= XFS_TRANS_ABORT;
- error_return:
-       xfs_trans_cancel(tp, cancel_flags);
-       XFS_QM_DQRELE(mp, udqp);
-       XFS_QM_DQRELE(mp, gdqp);
-
-       if (unlock_dp_on_error)
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-       goto std_return;
-}
-
 int
 xfs_symlink(
        xfs_inode_t             *dp,
@@ -2586,51 +2403,6 @@ std_return:
        goto std_return;
 }
 
-int
-xfs_inode_flush(
-       xfs_inode_t     *ip,
-       int             flags)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       int             error = 0;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       /*
-        * Bypass inodes which have already been cleaned by
-        * the inode flush clustering code inside xfs_iflush
-        */
-       if (xfs_inode_clean(ip))
-               return 0;
-
-       /*
-        * We make this non-blocking if the inode is contended,
-        * return EAGAIN to indicate to the caller that they
-        * did not succeed. This prevents the flush path from
-        * blocking on inodes inside another operation right
-        * now, they get caught later by xfs_sync.
-        */
-       if (flags & FLUSH_SYNC) {
-               xfs_ilock(ip, XFS_ILOCK_SHARED);
-               xfs_iflock(ip);
-       } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-               if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                       return EAGAIN;
-               }
-       } else {
-               return EAGAIN;
-       }
-
-       error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
-                                                   : XFS_IFLUSH_ASYNC_NOBLOCK);
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-       return error;
-}
-
-
 int
 xfs_set_dmattrs(
        xfs_inode_t     *ip,
@@ -2676,7 +2448,7 @@ xfs_reclaim(
        ASSERT(!VN_MAPPED(VFS_I(ip)));
 
        /* bad inode, get out here ASAP */
-       if (VN_BAD(VFS_I(ip))) {
+       if (is_bad_inode(VFS_I(ip))) {
                xfs_ireclaim(ip);
                return 0;
        }
@@ -3090,7 +2862,7 @@ xfs_free_file_space(
 
        /*
         * Need to zero the stuff we're not freeing, on disk.
-        * If its a realtime file & can't use unwritten extents then we
+        * If it's a realtime file & can't use unwritten extents then we
         * actually need to zero the extent edges.  Otherwise xfs_bunmapi
         * will take care of it for us.
         */
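The xfs_create() hunks above fold the old abort_return/error_return paths into one ordered chain of labels (out_bmap_cancel, out_trans_abort, out_trans_cancel, out_dqrele), so each failure site jumps to the first cleanup step it still owes and falls through the rest. The same goto-unwind shape, reduced to a self-contained user-space sketch with made-up resources (illustration only, not XFS code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Acquire resources in order; on failure jump to the first label whose
 * cleanup is still owed, and let later labels fall through to earlier
 * ones -- the same shape as the out_* chain in xfs_create(). */
static int build_record(const char *name, char **out)
{
	char *buf;
	FILE *scratch;
	int error;

	buf = malloc(64);
	if (!buf) {
		error = -1;
		goto out;
	}

	scratch = fopen("/dev/null", "w");	/* stands in for the transaction */
	if (!scratch) {
		error = -2;
		goto out_free_buf;
	}

	if (strlen(name) >= 64) {		/* the "create" step fails */
		error = -3;
		goto out_close;
	}

	strcpy(buf, name);
	fclose(scratch);
	*out = buf;
	return 0;

out_close:
	fclose(scratch);
out_free_buf:
	free(buf);
out:
	return error;
}

int main(void)
{
	char *rec = NULL;
	printf("%d\n", build_record("demo", &rec));
	free(rec);
	return 0;
}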
index 76df328c61b42c16641ed4e7a8a18021efba7253..04373c6c61ff83f71d49df2f3f4250e7eb81f56d 100644 (file)
@@ -31,14 +31,11 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
                struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
                struct xfs_name *target_name);
-int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
-               mode_t mode, struct xfs_inode **ipp, cred_t *credp);
 int xfs_readdir(struct xfs_inode       *dp, void *dirent, size_t bufsize,
                       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
                const char *target_path, mode_t mode, struct xfs_inode **ipp,
                cred_t *credp);
-int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
index 45f6297821bd6c7afee48e247d8a25b027bc7445..5fc2ef8d97fac5851eb2d40f7b7e2d67a3be48e8 100644 (file)
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
 
+/* on architectures without dma-mapping capabilities we need to ensure
+ * that the asynchronous path compiles away
+ */
+#ifdef CONFIG_HAS_DMA
+#define __async_inline
+#else
+#define __async_inline __always_inline
+#endif
+
 /**
  * dma_chan_ref - object used to manage dma channels received from the
  *   dmaengine core.
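The new __async_inline macro lets the async_tx helpers become __always_inline when CONFIG_HAS_DMA is not set, so with a NULL channel the compiler can discard the offload branch entirely. A user-space sketch of the same compile-away trick; HAVE_OFFLOAD, struct chan and do_copy() are inventions of this example, not kernel symbols:

#include <stdio.h>

#ifdef HAVE_OFFLOAD
#define demo_async_inline static
#else
/* no engine configured: force inlining so chan == NULL is visible to
 * the optimiser and the offload branch disappears */
#define demo_async_inline static inline __attribute__((always_inline))
#endif

struct chan { int id; };

demo_async_inline int do_copy(struct chan *chan, char *dst,
			      const char *src, unsigned int n)
{
	if (chan)
		return -1;		/* would submit to the engine here */
	while (n--)			/* synchronous fallback */
		*dst++ = *src++;
	return 0;
}

int main(void)
{
	char out[6];
	do_copy(NULL, out, "hello", 6);
	printf("%s\n", out);
	return 0;
}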
index 8209e08969f9e4265e4fbbcd0c00b5a68467cad5..66ec05a5795558cef24450388e67ffd6bf0743f3 100644 (file)
@@ -139,6 +139,9 @@ struct target_type {
        dm_ioctl_fn ioctl;
        dm_merge_fn merge;
        dm_busy_fn busy;
+
+       /* For internal device-mapper use. */
+       struct list_head list;
 };
 
 struct io_restrictions {
index 600c5fb2daad4c231b53a4654316469895af9b0c..5e8b11d88f6f891a5c56328ca54ecb0fa4e2ce57 100644 (file)
@@ -28,6 +28,9 @@ struct dm_dirty_log_type {
        const char *name;
        struct module *module;
 
+       /* For internal device-mapper use */
+       struct list_head list;
+
        int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti,
                   unsigned argc, char **argv);
        void (*dtr)(struct dm_dirty_log *log);
@@ -113,6 +116,16 @@ struct dm_dirty_log_type {
         */
        int (*status)(struct dm_dirty_log *log, status_type_t status_type,
                      char *result, unsigned maxlen);
+
+       /*
+        * is_remote_recovering is necessary for cluster mirroring. It provides
+        * a way to detect recovery on another node, so we aren't writing
+        * concurrently.  This function is likely to block (when a cluster log
+        * is used).
+        *
+        * Returns: 0, 1
+        */
+       int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region);
 };
 
 int dm_dirty_log_type_register(struct dm_dirty_log_type *type);
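The new is_remote_recovering hook lets the mirror target ask, before issuing a write, whether another cluster node is still recovering the region; a cluster log may block here, and the answer is 0 or 1. A minimal sketch of the hook for a purely local log (the function name is illustrative, not an existing symbol):

#include <linux/dm-dirty-log.h>

/* A single-node (core) log can never be recovering on another node, so
 * it simply reports "not remote recovering" for every region. */
static int core_is_remote_recovering(struct dm_dirty_log *log,
				     region_t region)
{
	return 0;
}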
index af1dab41674bfdf3ceacbf49c6aca7ec24cd606d..1a455f1f86d763da366f178852e7adb652bc86ce 100644 (file)
@@ -11,6 +11,7 @@
 
 #define DMA_PTE_READ (1)
 #define DMA_PTE_WRITE (2)
+#define DMA_PTE_SNP (1 << 11)
 
 struct intel_iommu;
 struct dmar_domain;
index 1956c8d46d326ea98c2286a6f3f47e8e428518a1..2e2aa3df170cfb5f2be5f8097ee9e2324c759143 100644 (file)
@@ -23,9 +23,6 @@
 
 #include <linux/device.h>
 #include <linux/uio.h>
-#include <linux/kref.h>
-#include <linux/completion.h>
-#include <linux/rcupdate.h>
 #include <linux/dma-mapping.h>
 
 /**
@@ -205,6 +202,7 @@ struct dma_async_tx_descriptor {
 /**
  * struct dma_device - info on the entity supplying DMA services
  * @chancnt: how many DMA channels are supported
+ * @privatecnt: how many DMA channels are requested by dma_request_channel
  * @channels: the list of struct dma_chan
  * @global_node: list_head for global dma_device_list
  * @cap_mask: one or more dma_capability flags
@@ -227,6 +225,7 @@ struct dma_async_tx_descriptor {
 struct dma_device {
 
        unsigned int chancnt;
+       unsigned int privatecnt;
        struct list_head channels;
        struct list_head global_node;
        dma_cap_mask_t  cap_mask;
@@ -291,6 +290,24 @@ static inline void net_dmaengine_put(void)
 }
 #endif
 
+#ifdef CONFIG_ASYNC_TX_DMA
+#define async_dmaengine_get()  dmaengine_get()
+#define async_dmaengine_put()  dmaengine_put()
+#define async_dma_find_channel(type) dma_find_channel(type)
+#else
+static inline void async_dmaengine_get(void)
+{
+}
+static inline void async_dmaengine_put(void)
+{
+}
+static inline struct dma_chan *
+async_dma_find_channel(enum dma_transaction_type type)
+{
+       return NULL;
+}
+#endif
+
 dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
        void *dest, void *src, size_t len);
 dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
@@ -337,6 +354,13 @@ __dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp)
        set_bit(tx_type, dstp->bits);
 }
 
+#define dma_cap_clear(tx, mask) __dma_cap_clear((tx), &(mask))
+static inline void
+__dma_cap_clear(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp)
+{
+       clear_bit(tx_type, dstp->bits);
+}
+
 #define dma_cap_zero(mask) __dma_cap_zero(&(mask))
 static inline void __dma_cap_zero(dma_cap_mask_t *dstp)
 {
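dma_cap_clear() completes the mask helpers (dma_cap_zero/dma_cap_set), and the new async_dma_find_channel() wrapper degrades to returning NULL when CONFIG_ASYNC_TX_DMA is off. A short sketch of building a capability mask with the new helper and requesting a matching channel (error handling and channel release are omitted):

#include <linux/dmaengine.h>

static struct dma_chan *grab_memcpy_chan(void)
{
	dma_cap_mask_t mask;

	dma_cap_zero(mask);
	dma_cap_set(DMA_MEMCPY, mask);
	dma_cap_clear(DMA_XOR, mask);	/* explicitly not interested in XOR */

	/* NULL filter: any channel with the requested capabilities will do */
	return dma_request_channel(mask, NULL, NULL);
}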
index d797dde247f763392427cac3d9cad4b1ba5cc595..c8aad713a0460bd1c66c0962f930602501b2f9e7 100644 (file)
@@ -74,4 +74,23 @@ struct dw_dma_slave {
 #define DWC_CFGL_HS_DST_POL    (1 << 18)       /* dst handshake active low */
 #define DWC_CFGL_HS_SRC_POL    (1 << 19)       /* src handshake active low */
 
+/* DMA API extensions */
+struct dw_cyclic_desc {
+       struct dw_desc  **desc;
+       unsigned long   periods;
+       void            (*period_callback)(void *param);
+       void            *period_callback_param;
+};
+
+struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
+               dma_addr_t buf_addr, size_t buf_len, size_t period_len,
+               enum dma_data_direction direction);
+void dw_dma_cyclic_free(struct dma_chan *chan);
+int dw_dma_cyclic_start(struct dma_chan *chan);
+void dw_dma_cyclic_stop(struct dma_chan *chan);
+
+dma_addr_t dw_dma_get_src_addr(struct dma_chan *chan);
+
+dma_addr_t dw_dma_get_dst_addr(struct dma_chan *chan);
+
 #endif /* DW_DMAC_H */
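The dw_dmac extension above adds a cyclic (ring-buffer) mode: prepare once over the whole buffer with a period size, then start and stop explicitly, with an optional callback per completed period. A sketch of driving it on an already-requested channel; the buffer, sizes and callback are invented for the example, and the ERR_PTR-style return from dw_dma_cyclic_prep() is an assumption of this sketch:

#include <linux/dmaengine.h>
#include <linux/dma-mapping.h>
#include <linux/dw_dmac.h>
#include <linux/err.h>

static void period_done(void *param)
{
	/* one period of the ring has been transferred */
}

static int start_tx_ring(struct dma_chan *chan, dma_addr_t buf,
			 size_t buf_len, size_t period_len)
{
	struct dw_cyclic_desc *cdesc;

	cdesc = dw_dma_cyclic_prep(chan, buf, buf_len, period_len,
				   DMA_TO_DEVICE);
	if (IS_ERR(cdesc))
		return PTR_ERR(cdesc);

	cdesc->period_callback = period_done;
	cdesc->period_callback_param = NULL;

	return dw_dma_cyclic_start(chan);
}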
index e263acaa405b8deba36ad227c745dfc2fc4c8fad..634a5e5aba3e219844fbffadce72b0361c794203 100644 (file)
@@ -208,6 +208,7 @@ static inline __u32 ext3_mask_flags(umode_t mode, __u32 flags)
 #define EXT3_STATE_JDATA               0x00000001 /* journaled data exists */
 #define EXT3_STATE_NEW                 0x00000002 /* inode is newly created */
 #define EXT3_STATE_XATTR               0x00000004 /* has in-inode xattrs */
+#define EXT3_STATE_FLUSH_ON_CLOSE      0x00000008
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext3_new_group_input {
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
new file mode 100644 (file)
index 0000000..84d3532
--- /dev/null
@@ -0,0 +1,505 @@
+/* General filesystem caching backing cache interface
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * NOTE!!! See:
+ *
+ *     Documentation/filesystems/caching/backend-api.txt
+ *
+ * for a description of the cache backend interface declared here.
+ */
+
+#ifndef _LINUX_FSCACHE_CACHE_H
+#define _LINUX_FSCACHE_CACHE_H
+
+#include <linux/fscache.h>
+#include <linux/sched.h>
+#include <linux/slow-work.h>
+
+#define NR_MAXCACHES BITS_PER_LONG
+
+struct fscache_cache;
+struct fscache_cache_ops;
+struct fscache_object;
+struct fscache_operation;
+
+/*
+ * cache tag definition
+ */
+struct fscache_cache_tag {
+       struct list_head        link;
+       struct fscache_cache    *cache;         /* cache referred to by this tag */
+       unsigned long           flags;
+#define FSCACHE_TAG_RESERVED   0               /* T if tag is reserved for a cache */
+       atomic_t                usage;
+       char                    name[0];        /* tag name */
+};
+
+/*
+ * cache definition
+ */
+struct fscache_cache {
+       const struct fscache_cache_ops *ops;
+       struct fscache_cache_tag *tag;          /* tag representing this cache */
+       struct kobject          *kobj;          /* system representation of this cache */
+       struct list_head        link;           /* link in list of caches */
+       size_t                  max_index_size; /* maximum size of index data */
+       char                    identifier[36]; /* cache label */
+
+       /* node management */
+       struct work_struct      op_gc;          /* operation garbage collector */
+       struct list_head        object_list;    /* list of data/index objects */
+       struct list_head        op_gc_list;     /* list of ops to be deleted */
+       spinlock_t              object_list_lock;
+       spinlock_t              op_gc_list_lock;
+       atomic_t                object_count;   /* no. of live objects in this cache */
+       struct fscache_object   *fsdef;         /* object for the fsdef index */
+       unsigned long           flags;
+#define FSCACHE_IOERROR                0       /* cache stopped on I/O error */
+#define FSCACHE_CACHE_WITHDRAWN        1       /* cache has been withdrawn */
+};
+
+extern wait_queue_head_t fscache_cache_cleared_wq;
+
+/*
+ * operation to be applied to a cache object
+ * - retrieval initiation operations are done in the context of the process
+ *   that issued them, and not in an async thread pool
+ */
+typedef void (*fscache_operation_release_t)(struct fscache_operation *op);
+typedef void (*fscache_operation_processor_t)(struct fscache_operation *op);
+
+struct fscache_operation {
+       union {
+               struct work_struct fast_work;   /* record for fast ops */
+               struct slow_work slow_work;     /* record for (very) slow ops */
+       };
+       struct list_head        pend_link;      /* link in object->pending_ops */
+       struct fscache_object   *object;        /* object to be operated upon */
+
+       unsigned long           flags;
+#define FSCACHE_OP_TYPE                0x000f  /* operation type */
+#define FSCACHE_OP_FAST                0x0001  /* - fast op, processor may not sleep for disk */
+#define FSCACHE_OP_SLOW                0x0002  /* - (very) slow op, processor may sleep for disk */
+#define FSCACHE_OP_MYTHREAD    0x0003  /* - processing is done by the issuing thread, not pool */
+#define FSCACHE_OP_WAITING     4       /* cleared when op is woken */
+#define FSCACHE_OP_EXCLUSIVE   5       /* exclusive op, other ops must wait */
+#define FSCACHE_OP_DEAD                6       /* op is now dead */
+
+       atomic_t                usage;
+       unsigned                debug_id;       /* debugging ID */
+
+       /* operation processor callback
+        * - can be NULL if FSCACHE_OP_WAITING is going to be used to perform
+        *   the op in a non-pool thread */
+       fscache_operation_processor_t processor;
+
+       /* operation releaser */
+       fscache_operation_release_t release;
+};
+
+extern atomic_t fscache_op_debug_id;
+extern const struct slow_work_ops fscache_op_slow_work_ops;
+
+extern void fscache_enqueue_operation(struct fscache_operation *);
+extern void fscache_put_operation(struct fscache_operation *);
+
+/**
+ * fscache_operation_init - Do basic initialisation of an operation
+ * @op: The operation to initialise
+ * @release: The release function to assign
+ *
+ * Do basic initialisation of an operation.  The caller must still set flags,
+ * object, either fast_work or slow_work if necessary, and processor if needed.
+ */
+static inline void fscache_operation_init(struct fscache_operation *op,
+                                         fscache_operation_release_t release)
+{
+       atomic_set(&op->usage, 1);
+       op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+       op->release = release;
+       INIT_LIST_HEAD(&op->pend_link);
+}
+
+/**
+ * fscache_operation_init_slow - Do additional initialisation of a slow op
+ * @op: The operation to initialise
+ * @processor: The processor function to assign
+ *
+ * Do additional initialisation of an operation as required for slow work.
+ */
+static inline
+void fscache_operation_init_slow(struct fscache_operation *op,
+                                fscache_operation_processor_t processor)
+{
+       op->processor = processor;
+       slow_work_init(&op->slow_work, &fscache_op_slow_work_ops);
+}
+
+/*
+ * data read operation
+ */
+struct fscache_retrieval {
+       struct fscache_operation op;
+       struct address_space    *mapping;       /* netfs pages */
+       fscache_rw_complete_t   end_io_func;    /* function to call on I/O completion */
+       void                    *context;       /* netfs read context (pinned) */
+       struct list_head        to_do;          /* list of things to be done by the backend */
+       unsigned long           start_time;     /* time at which retrieval started */
+};
+
+typedef int (*fscache_page_retrieval_func_t)(struct fscache_retrieval *op,
+                                            struct page *page,
+                                            gfp_t gfp);
+
+typedef int (*fscache_pages_retrieval_func_t)(struct fscache_retrieval *op,
+                                             struct list_head *pages,
+                                             unsigned *nr_pages,
+                                             gfp_t gfp);
+
+/**
+ * fscache_get_retrieval - Get an extra reference on a retrieval operation
+ * @op: The retrieval operation to get a reference on
+ *
+ * Get an extra reference on a retrieval operation.
+ */
+static inline
+struct fscache_retrieval *fscache_get_retrieval(struct fscache_retrieval *op)
+{
+       atomic_inc(&op->op.usage);
+       return op;
+}
+
+/**
+ * fscache_enqueue_retrieval - Enqueue a retrieval operation for processing
+ * @op: The retrieval operation affected
+ *
+ * Enqueue a retrieval operation for processing by the FS-Cache thread pool.
+ */
+static inline void fscache_enqueue_retrieval(struct fscache_retrieval *op)
+{
+       fscache_enqueue_operation(&op->op);
+}
+
+/**
+ * fscache_put_retrieval - Drop a reference to a retrieval operation
+ * @op: The retrieval operation affected
+ *
+ * Drop a reference to a retrieval operation.
+ */
+static inline void fscache_put_retrieval(struct fscache_retrieval *op)
+{
+       fscache_put_operation(&op->op);
+}
+
+/*
+ * cached page storage work item
+ * - used to do three things:
+ *   - batch writes to the cache
+ *   - do cache writes asynchronously
+ *   - defer writes until cache object lookup completion
+ */
+struct fscache_storage {
+       struct fscache_operation op;
+       pgoff_t                 store_limit;    /* don't write more than this */
+};
+
+/*
+ * cache operations
+ */
+struct fscache_cache_ops {
+       /* name of cache provider */
+       const char *name;
+
+       /* allocate an object record for a cookie */
+       struct fscache_object *(*alloc_object)(struct fscache_cache *cache,
+                                              struct fscache_cookie *cookie);
+
+       /* look up the object for a cookie */
+       void (*lookup_object)(struct fscache_object *object);
+
+       /* finished looking up */
+       void (*lookup_complete)(struct fscache_object *object);
+
+       /* increment the usage count on this object (may fail if unmounting) */
+       struct fscache_object *(*grab_object)(struct fscache_object *object);
+
+       /* pin an object in the cache */
+       int (*pin_object)(struct fscache_object *object);
+
+       /* unpin an object in the cache */
+       void (*unpin_object)(struct fscache_object *object);
+
+       /* store the updated auxiliary data on an object */
+       void (*update_object)(struct fscache_object *object);
+
+       /* discard the resources pinned by an object and effect retirement if
+        * necessary */
+       void (*drop_object)(struct fscache_object *object);
+
+       /* dispose of a reference to an object */
+       void (*put_object)(struct fscache_object *object);
+
+       /* sync a cache */
+       void (*sync_cache)(struct fscache_cache *cache);
+
+       /* notification that the attributes of a non-index object (such as
+        * i_size) have changed */
+       int (*attr_changed)(struct fscache_object *object);
+
+       /* reserve space for an object's data and associated metadata */
+       int (*reserve_space)(struct fscache_object *object, loff_t i_size);
+
+       /* request a backing block for a page be read or allocated in the
+        * cache */
+       fscache_page_retrieval_func_t read_or_alloc_page;
+
+       /* request backing blocks for a list of pages be read or allocated in
+        * the cache */
+       fscache_pages_retrieval_func_t read_or_alloc_pages;
+
+       /* request a backing block for a page be allocated in the cache so that
+        * it can be written directly */
+       fscache_page_retrieval_func_t allocate_page;
+
+       /* request backing blocks for pages be allocated in the cache so that
+        * they can be written directly */
+       fscache_pages_retrieval_func_t allocate_pages;
+
+       /* write a page to its backing block in the cache */
+       int (*write_page)(struct fscache_storage *op, struct page *page);
+
+       /* detach backing block from a page (optional)
+        * - must release the cookie lock before returning
+        * - may sleep
+        */
+       void (*uncache_page)(struct fscache_object *object,
+                            struct page *page);
+
+       /* dissociate a cache from all the pages it was backing */
+       void (*dissociate_pages)(struct fscache_cache *cache);
+};
+
+/*
+ * data file or index object cookie
+ * - a file will only appear in one cache
+ * - a request to cache a file may or may not be honoured, subject to
+ *   constraints such as disk space
+ * - indices are created on disk just-in-time
+ */
+struct fscache_cookie {
+       atomic_t                        usage;          /* number of users of this cookie */
+       atomic_t                        n_children;     /* number of children of this cookie */
+       spinlock_t                      lock;
+       struct hlist_head               backing_objects; /* object(s) backing this file/index */
+       const struct fscache_cookie_def *def;           /* definition */
+       struct fscache_cookie           *parent;        /* parent of this entry */
+       void                            *netfs_data;    /* back pointer to netfs */
+       struct radix_tree_root          stores;         /* pages to be stored on this cookie */
+#define FSCACHE_COOKIE_PENDING_TAG     0               /* pages tag: pending write to cache */
+
+       unsigned long                   flags;
+#define FSCACHE_COOKIE_LOOKING_UP      0       /* T if non-index cookie being looked up still */
+#define FSCACHE_COOKIE_CREATING                1       /* T if non-index object being created still */
+#define FSCACHE_COOKIE_NO_DATA_YET     2       /* T if new object with no cached data yet */
+#define FSCACHE_COOKIE_PENDING_FILL    3       /* T if pending initial fill on object */
+#define FSCACHE_COOKIE_FILLING         4       /* T if filling object incrementally */
+#define FSCACHE_COOKIE_UNAVAILABLE     5       /* T if cookie is unavailable (error, etc) */
+};
+
+extern struct fscache_cookie fscache_fsdef_index;
+
+/*
+ * on-disk cache file or index handle
+ */
+struct fscache_object {
+       enum fscache_object_state {
+               FSCACHE_OBJECT_INIT,            /* object in initial unbound state */
+               FSCACHE_OBJECT_LOOKING_UP,      /* looking up object */
+               FSCACHE_OBJECT_CREATING,        /* creating object */
+
+               /* active states */
+               FSCACHE_OBJECT_AVAILABLE,       /* cleaning up object after creation */
+               FSCACHE_OBJECT_ACTIVE,          /* object is usable */
+               FSCACHE_OBJECT_UPDATING,        /* object is updating */
+
+               /* terminal states */
+               FSCACHE_OBJECT_DYING,           /* object waiting for accessors to finish */
+               FSCACHE_OBJECT_LC_DYING,        /* object cleaning up after lookup/create */
+               FSCACHE_OBJECT_ABORT_INIT,      /* abort the init state */
+               FSCACHE_OBJECT_RELEASING,       /* releasing object */
+               FSCACHE_OBJECT_RECYCLING,       /* retiring object */
+               FSCACHE_OBJECT_WITHDRAWING,     /* withdrawing object */
+               FSCACHE_OBJECT_DEAD,            /* object is now dead */
+       } state;
+
+       int                     debug_id;       /* debugging ID */
+       int                     n_children;     /* number of child objects */
+       int                     n_ops;          /* number of ops outstanding on object */
+       int                     n_obj_ops;      /* number of object ops outstanding on object */
+       int                     n_in_progress;  /* number of ops in progress */
+       int                     n_exclusive;    /* number of exclusive ops queued */
+       spinlock_t              lock;           /* state and operations lock */
+
+       unsigned long           lookup_jif;     /* time at which lookup started */
+       unsigned long           event_mask;     /* events this object is interested in */
+       unsigned long           events;         /* events to be processed by this object
+                                                * (order is important - using fls) */
+#define FSCACHE_OBJECT_EV_REQUEUE      0       /* T if object should be requeued */
+#define FSCACHE_OBJECT_EV_UPDATE       1       /* T if object should be updated */
+#define FSCACHE_OBJECT_EV_CLEARED      2       /* T if accessors all gone */
+#define FSCACHE_OBJECT_EV_ERROR                3       /* T if fatal error occurred during processing */
+#define FSCACHE_OBJECT_EV_RELEASE      4       /* T if netfs requested object release */
+#define FSCACHE_OBJECT_EV_RETIRE       5       /* T if netfs requested object retirement */
+#define FSCACHE_OBJECT_EV_WITHDRAW     6       /* T if cache requested object withdrawal */
+
+       unsigned long           flags;
+#define FSCACHE_OBJECT_LOCK            0       /* T if object is busy being processed */
+#define FSCACHE_OBJECT_PENDING_WRITE   1       /* T if object has pending write */
+#define FSCACHE_OBJECT_WAITING         2       /* T if object is waiting on its parent */
+
+       struct list_head        cache_link;     /* link in cache->object_list */
+       struct hlist_node       cookie_link;    /* link in cookie->backing_objects */
+       struct fscache_cache    *cache;         /* cache that supplied this object */
+       struct fscache_cookie   *cookie;        /* netfs's file/index object */
+       struct fscache_object   *parent;        /* parent object */
+       struct slow_work        work;           /* attention scheduling record */
+       struct list_head        dependents;     /* FIFO of dependent objects */
+       struct list_head        dep_link;       /* link in parent's dependents list */
+       struct list_head        pending_ops;    /* unstarted operations on this object */
+       pgoff_t                 store_limit;    /* current storage limit */
+};
+
+extern const char *fscache_object_states[];
+
+#define fscache_object_is_active(obj)                        \
+       (!test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) &&  \
+        (obj)->state >= FSCACHE_OBJECT_AVAILABLE &&          \
+        (obj)->state < FSCACHE_OBJECT_DYING)
+
+extern const struct slow_work_ops fscache_object_slow_work_ops;
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_object_init(struct fscache_object *object,
+                        struct fscache_cookie *cookie,
+                        struct fscache_cache *cache)
+{
+       atomic_inc(&cache->object_count);
+
+       object->state = FSCACHE_OBJECT_INIT;
+       spin_lock_init(&object->lock);
+       INIT_LIST_HEAD(&object->cache_link);
+       INIT_HLIST_NODE(&object->cookie_link);
+       vslow_work_init(&object->work, &fscache_object_slow_work_ops);
+       INIT_LIST_HEAD(&object->dependents);
+       INIT_LIST_HEAD(&object->dep_link);
+       INIT_LIST_HEAD(&object->pending_ops);
+       object->n_children = 0;
+       object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+       object->events = object->event_mask = 0;
+       object->flags = 0;
+       object->store_limit = 0;
+       object->cache = cache;
+       object->cookie = cookie;
+       object->parent = NULL;
+}
+
+extern void fscache_object_lookup_negative(struct fscache_object *object);
+extern void fscache_obtained_object(struct fscache_object *object);
+
+/**
+ * fscache_object_destroyed - Note destruction of an object in a cache
+ * @cache: The cache from which the object came
+ *
+ * Note the destruction and deallocation of an object record in a cache.
+ */
+static inline void fscache_object_destroyed(struct fscache_cache *cache)
+{
+       if (atomic_dec_and_test(&cache->object_count))
+               wake_up_all(&fscache_cache_cleared_wq);
+}
+
+/**
+ * fscache_object_lookup_error - Note an object encountered an error
+ * @object: The object on which the error was encountered
+ *
+ * Note that an object encountered a fatal error (usually an I/O error) and
+ * that it should be withdrawn as soon as possible.
+ */
+static inline void fscache_object_lookup_error(struct fscache_object *object)
+{
+       set_bit(FSCACHE_OBJECT_EV_ERROR, &object->events);
+}
+
+/**
+ * fscache_set_store_limit - Set the maximum size to be stored in an object
+ * @object: The object to set the maximum on
+ * @i_size: The limit to set in bytes
+ *
+ * Set the maximum size an object is permitted to reach, implying the highest
+ * byte that may be written.  Intended to be called by the attr_changed() op.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_set_store_limit(struct fscache_object *object, loff_t i_size)
+{
+       object->store_limit = i_size >> PAGE_SHIFT;
+       if (i_size & ~PAGE_MASK)
+               object->store_limit++;
+}
+
+/**
+ * fscache_end_io - End a retrieval operation on a page
+ * @op: The FS-Cache operation covering the retrieval
+ * @page: The page that was to be fetched
+ * @error: The error code (0 if successful)
+ *
+ * Note the end of an operation to retrieve a page, as covered by a particular
+ * operation record.
+ */
+static inline void fscache_end_io(struct fscache_retrieval *op,
+                                 struct page *page, int error)
+{
+       op->end_io_func(page, op->context, error);
+}
+
+/*
+ * out-of-line cache backend functions
+ */
+extern void fscache_init_cache(struct fscache_cache *cache,
+                              const struct fscache_cache_ops *ops,
+                              const char *idfmt,
+                              ...) __attribute__ ((format (printf, 3, 4)));
+
+extern int fscache_add_cache(struct fscache_cache *cache,
+                            struct fscache_object *fsdef,
+                            const char *tagname);
+extern void fscache_withdraw_cache(struct fscache_cache *cache);
+
+extern void fscache_io_error(struct fscache_cache *cache);
+
+extern void fscache_mark_pages_cached(struct fscache_retrieval *op,
+                                     struct pagevec *pagevec);
+
+extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
+                                              const void *data,
+                                              uint16_t datalen);
+
+#endif /* _LINUX_FSCACHE_CACHE_H */
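Taken together, a cache backend built on this header fills in a struct fscache_cache_ops, initialises its cache and fsdef object, and then registers/withdraws with the out-of-line calls at the bottom. A skeleton of that registration sequence; my_cache_ops, the fsdef object and the tag handling are assumed to be provided elsewhere by the backend:

#include <linux/fscache-cache.h>

static struct fscache_cache my_cache;
static struct fscache_object my_fsdef_object;
extern const struct fscache_cache_ops my_cache_ops;	/* assumed: fully populated */

static int my_backend_bind(const char *tagname)
{
	fscache_init_cache(&my_cache, &my_cache_ops, "mybackend");

	/* object representing the backend's fsdef index; its cookie is
	 * attached by the core when the cache is added (assumption) */
	fscache_object_init(&my_fsdef_object, NULL, &my_cache);

	return fscache_add_cache(&my_cache, &my_fsdef_object, tagname);
}

static void my_backend_unbind(void)
{
	fscache_withdraw_cache(&my_cache);
}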
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
new file mode 100644 (file)
index 0000000..6d8ee46
--- /dev/null
@@ -0,0 +1,618 @@
+/* General filesystem caching interface
+ *
+ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * NOTE!!! See:
+ *
+ *     Documentation/filesystems/caching/netfs-api.txt
+ *
+ * for a description of the network filesystem interface declared here.
+ */
+
+#ifndef _LINUX_FSCACHE_H
+#define _LINUX_FSCACHE_H
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+
+#if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE)
+#define fscache_available() (1)
+#define fscache_cookie_valid(cookie) (cookie)
+#else
+#define fscache_available() (0)
+#define fscache_cookie_valid(cookie) (0)
+#endif
+
+
+/*
+ * overload PG_private_2 to give us PG_fscache - this is used to indicate that
+ * a page is currently backed by a local disk cache
+ */
+#define PageFsCache(page)              PagePrivate2((page))
+#define SetPageFsCache(page)           SetPagePrivate2((page))
+#define ClearPageFsCache(page)         ClearPagePrivate2((page))
+#define TestSetPageFsCache(page)       TestSetPagePrivate2((page))
+#define TestClearPageFsCache(page)     TestClearPagePrivate2((page))
+
+/* pattern used to fill dead space in an index entry */
+#define FSCACHE_INDEX_DEADFILL_PATTERN 0x79
+
+struct pagevec;
+struct fscache_cache_tag;
+struct fscache_cookie;
+struct fscache_netfs;
+
+typedef void (*fscache_rw_complete_t)(struct page *page,
+                                     void *context,
+                                     int error);
+
+/* result of index entry consultation */
+enum fscache_checkaux {
+       FSCACHE_CHECKAUX_OKAY,          /* entry okay as is */
+       FSCACHE_CHECKAUX_NEEDS_UPDATE,  /* entry requires update */
+       FSCACHE_CHECKAUX_OBSOLETE,      /* entry requires deletion */
+};
+
+/*
+ * fscache cookie definition
+ */
+struct fscache_cookie_def {
+       /* name of cookie type */
+       char name[16];
+
+       /* cookie type */
+       uint8_t type;
+#define FSCACHE_COOKIE_TYPE_INDEX      0
+#define FSCACHE_COOKIE_TYPE_DATAFILE   1
+
+       /* select the cache into which to insert an entry in this index
+        * - optional
+        * - should return a cache identifier or NULL to cause the cache to be
+        *   inherited from the parent if possible or the first cache picked
+        *   for a non-index file if not
+        */
+       struct fscache_cache_tag *(*select_cache)(
+               const void *parent_netfs_data,
+               const void *cookie_netfs_data);
+
+       /* get an index key
+        * - should store the key data in the buffer
+        * - should return the amount of data stored
+        * - not permitted to return an error
+        * - the netfs data from the cookie being used as the source is
+        *   presented
+        */
+       uint16_t (*get_key)(const void *cookie_netfs_data,
+                           void *buffer,
+                           uint16_t bufmax);
+
+       /* get certain file attributes from the netfs data
+        * - this function can be absent for an index
+        * - not permitted to return an error
+        * - the netfs data from the cookie being used as the source is
+        *   presented
+        */
+       void (*get_attr)(const void *cookie_netfs_data, uint64_t *size);
+
+       /* get the auxiliary data from netfs data
+        * - this function can be absent if the index carries no state data
+        * - should store the auxiliary data in the buffer
+        * - should return the amount of data stored
+        * - not permitted to return an error
+        * - the netfs data from the cookie being used as the source is
+        *   presented
+        */
+       uint16_t (*get_aux)(const void *cookie_netfs_data,
+                           void *buffer,
+                           uint16_t bufmax);
+
+       /* consult the netfs about the state of an object
+        * - this function can be absent if the index carries no state data
+        * - the netfs data from the cookie being used as the target is
+        *   presented, as is the auxiliary data
+        */
+       enum fscache_checkaux (*check_aux)(void *cookie_netfs_data,
+                                          const void *data,
+                                          uint16_t datalen);
+
+       /* get an extra reference on a read context
+        * - this function can be absent if the completion function doesn't
+        *   require a context
+        */
+       void (*get_context)(void *cookie_netfs_data, void *context);
+
+       /* release an extra reference on a read context
+        * - this function can be absent if the completion function doesn't
+        *   require a context
+        */
+       void (*put_context)(void *cookie_netfs_data, void *context);
+
+       /* indicate pages that now have cache metadata retained
+        * - this function should mark the specified pages as now being cached
+        * - the pages will have been marked with PG_fscache before this is
+        *   called, so this is optional
+        */
+       void (*mark_pages_cached)(void *cookie_netfs_data,
+                                 struct address_space *mapping,
+                                 struct pagevec *cached_pvec);
+
+       /* indicate the cookie is no longer cached
+        * - this function is called when the backing store currently caching
+        *   a cookie is removed
+        * - the netfs should use this to clean up any markers indicating
+        *   cached pages
+        * - this is mandatory for any object that may have data
+        */
+       void (*now_uncached)(void *cookie_netfs_data);
+};
+
+/*
+ * fscache cached network filesystem type
+ * - name, version and ops must be filled in before registration
+ * - all other fields will be set during registration
+ */
+struct fscache_netfs {
+       uint32_t                        version;        /* indexing version */
+       const char                      *name;          /* filesystem name */
+       struct fscache_cookie           *primary_index;
+       struct list_head                link;           /* internal link */
+};
+
+/*
+ * slow-path functions for when there is actually caching available, and the
+ * netfs does actually have a valid token
+ * - these are not to be called directly
+ * - these are undefined symbols when FS-Cache is not configured and the
+ *   optimiser takes care of not using them
+ */
+extern int __fscache_register_netfs(struct fscache_netfs *);
+extern void __fscache_unregister_netfs(struct fscache_netfs *);
+extern struct fscache_cache_tag *__fscache_lookup_cache_tag(const char *);
+extern void __fscache_release_cache_tag(struct fscache_cache_tag *);
+
+extern struct fscache_cookie *__fscache_acquire_cookie(
+       struct fscache_cookie *,
+       const struct fscache_cookie_def *,
+       void *);
+extern void __fscache_relinquish_cookie(struct fscache_cookie *, int);
+extern void __fscache_update_cookie(struct fscache_cookie *);
+extern int __fscache_attr_changed(struct fscache_cookie *);
+extern int __fscache_read_or_alloc_page(struct fscache_cookie *,
+                                       struct page *,
+                                       fscache_rw_complete_t,
+                                       void *,
+                                       gfp_t);
+extern int __fscache_read_or_alloc_pages(struct fscache_cookie *,
+                                        struct address_space *,
+                                        struct list_head *,
+                                        unsigned *,
+                                        fscache_rw_complete_t,
+                                        void *,
+                                        gfp_t);
+extern int __fscache_alloc_page(struct fscache_cookie *, struct page *, gfp_t);
+extern int __fscache_write_page(struct fscache_cookie *, struct page *, gfp_t);
+extern void __fscache_uncache_page(struct fscache_cookie *, struct page *);
+extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *);
+extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *);
+
+/**
+ * fscache_register_netfs - Register a filesystem as desiring caching services
+ * @netfs: The description of the filesystem
+ *
+ * Register a filesystem as desiring caching services if they're available.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+int fscache_register_netfs(struct fscache_netfs *netfs)
+{
+       if (fscache_available())
+               return __fscache_register_netfs(netfs);
+       else
+               return 0;
+}
+
+/**
+ * fscache_unregister_netfs - Indicate that a filesystem no longer desires
+ * caching services
+ * @netfs: The description of the filesystem
+ *
+ * Indicate that a filesystem no longer desires caching services for the
+ * moment.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_unregister_netfs(struct fscache_netfs *netfs)
+{
+       if (fscache_available())
+               __fscache_unregister_netfs(netfs);
+}
+
+/**
+ * fscache_lookup_cache_tag - Look up a cache tag
+ * @name: The name of the tag to search for
+ *
+ * Acquire a specific cache referral tag that can be used to select a specific
+ * cache in which to cache an index.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name)
+{
+       if (fscache_available())
+               return __fscache_lookup_cache_tag(name);
+       else
+               return NULL;
+}
+
+/**
+ * fscache_release_cache_tag - Release a cache tag
+ * @tag: The tag to release
+ *
+ * Release a reference to a cache referral tag previously looked up.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_release_cache_tag(struct fscache_cache_tag *tag)
+{
+       if (fscache_available())
+               __fscache_release_cache_tag(tag);
+}
+
+/**
+ * fscache_acquire_cookie - Acquire a cookie to represent a cache object
+ * @parent: The cookie that's to be the parent of this one
+ * @def: A description of the cache object, including callback operations
+ * @netfs_data: An arbitrary piece of data to be kept in the cookie to
+ * represent the cache object to the netfs
+ *
+ * This function is used to inform FS-Cache about part of an index hierarchy
+ * that can be used to locate files.  This is done by requesting a cookie for
+ * each index in the path to the file.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+struct fscache_cookie *fscache_acquire_cookie(
+       struct fscache_cookie *parent,
+       const struct fscache_cookie_def *def,
+       void *netfs_data)
+{
+       if (fscache_cookie_valid(parent))
+               return __fscache_acquire_cookie(parent, def, netfs_data);
+       else
+               return NULL;
+}
+
+/**
+ * fscache_relinquish_cookie - Return the cookie to the cache, maybe discarding
+ * it
+ * @cookie: The cookie being returned
+ * @retire: True if the cache object the cookie represents is to be discarded
+ *
+ * This function returns a cookie to the cache, forcibly discarding the
+ * associated cache object if retire is set to true.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+{
+       if (fscache_cookie_valid(cookie))
+               __fscache_relinquish_cookie(cookie, retire);
+}
+
+/**
+ * fscache_update_cookie - Request that a cache object be updated
+ * @cookie: The cookie representing the cache object
+ *
+ * Request an update of the index data for the cache object associated with the
+ * cookie.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_update_cookie(struct fscache_cookie *cookie)
+{
+       if (fscache_cookie_valid(cookie))
+               __fscache_update_cookie(cookie);
+}
+
+/**
+ * fscache_pin_cookie - Pin a data-storage cache object in its cache
+ * @cookie: The cookie representing the cache object
+ *
+ * Permit data-storage cache objects to be pinned in the cache.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+int fscache_pin_cookie(struct fscache_cookie *cookie)
+{
+       return -ENOBUFS;
+}
+
+/**
+ * fscache_unpin_cookie - Unpin a data-storage cache object in its cache
+ * @cookie: The cookie representing the cache object
+ *
+ * Permit data-storage cache objects to be unpinned from the cache.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_unpin_cookie(struct fscache_cookie *cookie)
+{
+}
+
+/**
+ * fscache_attr_changed - Notify cache that an object's attributes changed
+ * @cookie: The cookie representing the cache object
+ *
+ * Send a notification to the cache indicating that an object's attributes have
+ * changed.  This includes the data size.  These attributes will be obtained
+ * through the get_attr() cookie definition op.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+int fscache_attr_changed(struct fscache_cookie *cookie)
+{
+       if (fscache_cookie_valid(cookie))
+               return __fscache_attr_changed(cookie);
+       else
+               return -ENOBUFS;
+}
+
+/**
+ * fscache_reserve_space - Reserve data space for a cached object
+ * @cookie: The cookie representing the cache object
+ * @i_size: The amount of space to be reserved
+ *
+ * Reserve an amount of space in the cache for the cache object attached to a
+ * cookie so that a write to that object within the space can always be
+ * honoured.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size)
+{
+       return -ENOBUFS;
+}
+
+/**
+ * fscache_read_or_alloc_page - Read a page from the cache or allocate a block
+ * in which to store it
+ * @cookie: The cookie representing the cache object
+ * @page: The netfs page to fill if possible
+ * @end_io_func: The callback to invoke when and if the page is filled
+ * @context: An arbitrary piece of data to pass on to end_io_func()
+ * @gfp: The conditions under which memory allocation should be made
+ *
+ * Read a page from the cache, or if that's not possible make a potential
+ * one-block reservation in the cache into which the page may be stored once
+ * fetched from the server.
+ *
+ * If the page is not backed by the cache object, or if there's some reason
+ * it can't be, -ENOBUFS will be returned and nothing more will be done for
+ * that page.
+ *
+ * Else, if that page is backed by the cache, a read will be initiated directly
+ * to the netfs's page and 0 will be returned by this function.  The
+ * end_io_func() callback will be invoked when the operation terminates on a
+ * completion or failure.  Note that the callback may be invoked before the
+ * return.
+ *
+ * Else, if the page is unbacked, -ENODATA is returned and a block may have
+ * been allocated in the cache.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+int fscache_read_or_alloc_page(struct fscache_cookie *cookie,
+                              struct page *page,
+                              fscache_rw_complete_t end_io_func,
+                              void *context,
+                              gfp_t gfp)
+{
+       if (fscache_cookie_valid(cookie))
+               return __fscache_read_or_alloc_page(cookie, page, end_io_func,
+                                                   context, gfp);
+       else
+               return -ENOBUFS;
+}
+
+/**
+ * fscache_read_or_alloc_pages - Read pages from the cache and/or allocate
+ * blocks in which to store them
+ * @cookie: The cookie representing the cache object
+ * @mapping: The netfs inode mapping to which the pages will be attached
+ * @pages: A list of potential netfs pages to be filled
+ * @end_io_func: The callback to invoke when and if each page is filled
+ * @context: An arbitrary piece of data to pass on to end_io_func()
+ * @gfp: The conditions under which memory allocation should be made
+ *
+ * Read a set of pages from the cache, or if that's not possible, attempt to
+ * make a potential one-block reservation for each page in the cache into which
+ * that page may be stored once fetched from the server.
+ *
+ * If some pages are not backed by the cache object, or if there's some
+ * reason they can't be, -ENOBUFS will be returned and nothing more will be
+ * done for those pages.
+ *
+ * Else, if some of the pages are backed by the cache, a read will be initiated
+ * directly to the netfs's page and 0 will be returned by this function.  The
+ * end_io_func() callback will be invoked when the operation terminates on a
+ * completion or failure.  Note that the callback may be invoked before the
+ * return.
+ *
+ * Else, if a page is unbacked, -ENODATA is returned and a block may have
+ * been allocated in the cache.
+ *
+ * Because the function may want to return all of -ENOBUFS, -ENODATA and 0 in
+ * regard to different pages, the return values are prioritised in that order.
+ * Any pages submitted for reading are removed from the pages list.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+int fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
+                               struct address_space *mapping,
+                               struct list_head *pages,
+                               unsigned *nr_pages,
+                               fscache_rw_complete_t end_io_func,
+                               void *context,
+                               gfp_t gfp)
+{
+       if (fscache_cookie_valid(cookie))
+               return __fscache_read_or_alloc_pages(cookie, mapping, pages,
+                                                    nr_pages, end_io_func,
+                                                    context, gfp);
+       else
+               return -ENOBUFS;
+}
+
+/**
+ * fscache_alloc_page - Allocate a block in which to store a page
+ * @cookie: The cookie representing the cache object
+ * @page: The netfs page to allocate a page for
+ * @gfp: The conditions under which memory allocation should be made
+ *
+ * Request allocation of a block in the cache in which to store a netfs page
+ * without retrieving any contents from the cache.
+ *
+ * If the page is not backed by a file then -ENOBUFS will be returned and
+ * nothing more will be done, and no reservation will be made.
+ *
+ * Else, a block will be allocated if one wasn't already, and 0 will be
+ * returned.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+int fscache_alloc_page(struct fscache_cookie *cookie,
+                      struct page *page,
+                      gfp_t gfp)
+{
+       if (fscache_cookie_valid(cookie))
+               return __fscache_alloc_page(cookie, page, gfp);
+       else
+               return -ENOBUFS;
+}
+
+/**
+ * fscache_write_page - Request storage of a page in the cache
+ * @cookie: The cookie representing the cache object
+ * @page: The netfs page to store
+ * @gfp: The conditions under which memory allocation should be made
+ *
+ * Request the contents of the netfs page be written into the cache.  This
+ * request may be ignored if no cache block is currently allocated, in which
+ * case it will return -ENOBUFS.
+ *
+ * If a cache block was already allocated, a write will be initiated and 0 will
+ * be returned.  The PG_fscache_write page bit is set immediately and will then
+ * be cleared at the completion of the write to indicate the success or failure
+ * of the operation.  Note that the completion may happen before the return.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+int fscache_write_page(struct fscache_cookie *cookie,
+                      struct page *page,
+                      gfp_t gfp)
+{
+       if (fscache_cookie_valid(cookie))
+               return __fscache_write_page(cookie, page, gfp);
+       else
+               return -ENOBUFS;
+}
+
+/**
+ * fscache_uncache_page - Indicate that caching is no longer required on a page
+ * @cookie: The cookie representing the cache object
+ * @page: The netfs page that was being cached.
+ *
+ * Tell the cache that we no longer want a page to be cached and that it should
+ * remove any knowledge of the netfs page it may have.
+ *
+ * Note that this cannot cancel any outstanding I/O operations between this
+ * page and the cache.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_uncache_page(struct fscache_cookie *cookie,
+                         struct page *page)
+{
+       if (fscache_cookie_valid(cookie))
+               __fscache_uncache_page(cookie, page);
+}
+
+/**
+ * fscache_check_page_write - Ask if a page is being written to the cache
+ * @cookie: The cookie representing the cache object
+ * @page: The netfs page that is being cached.
+ *
+ * Ask the cache if a page is being written to the cache.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+bool fscache_check_page_write(struct fscache_cookie *cookie,
+                             struct page *page)
+{
+       if (fscache_cookie_valid(cookie))
+               return __fscache_check_page_write(cookie, page);
+       return false;
+}
+
+/**
+ * fscache_wait_on_page_write - Wait for a page to complete writing to the cache
+ * @cookie: The cookie representing the cache object
+ * @page: The netfs page that is being cached.
+ *
+ * Ask the cache to wake us up when a page is no longer being written to the
+ * cache.
+ *
+ * See Documentation/filesystems/caching/netfs-api.txt for a complete
+ * description.
+ */
+static inline
+void fscache_wait_on_page_write(struct fscache_cookie *cookie,
+                               struct page *page)
+{
+       if (fscache_cookie_valid(cookie))
+               __fscache_wait_on_page_write(cookie, page);
+}
+
+#endif /* _LINUX_FSCACHE_H */
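
To make the netfs-facing wrappers above more concrete, here is a minimal sketch of a read path a network filesystem could build on them.  The my_* names and the server fallback are hypothetical; only the fscache_* calls and their return-value conventions come from the header above, and the (page, context, error) callback ordering is assumed from fscache_rw_complete_t.

#include <linux/fscache.h>
#include <linux/pagemap.h>

/* Hypothetical completion callback invoked when the cache finishes the read;
 * the (page, context, error) ordering is assumed from fscache_rw_complete_t. */
static void my_read_complete(struct page *page, void *context, int error)
{
	if (!error)
		SetPageUptodate(page);
	unlock_page(page);
}

/* Hypothetical ->readpage() built on fscache_read_or_alloc_page(). */
static int my_readpage(struct file *file, struct page *page)
{
	struct fscache_cookie *cookie = my_cookie_for(page->mapping->host); /* hypothetical */
	int ret;

	ret = fscache_read_or_alloc_page(cookie, page, my_read_complete,
					 NULL, GFP_KERNEL);
	switch (ret) {
	case 0:			/* submitted to the cache; callback completes it */
		return 0;
	case -ENODATA:		/* block allocated; must fetch from the server */
	case -ENOBUFS:		/* no cache backing; fetch from the server */
		return my_read_from_server(page);	/* hypothetical */
	default:
		return ret;
	}
}
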
index ed21bd3dbd2552a9322a45c943e078d0e6a4e2fc..29ee2873f4a878026595fe6ac1014a91ec8109d6 100644 (file)
@@ -1,68 +1,6 @@
 #ifndef _LINUX_HDREG_H
 #define _LINUX_HDREG_H
 
-#ifdef __KERNEL__
-#include <linux/ata.h>
-
-/*
- * This file contains some defines for the AT-hd-controller.
- * Various sources.
- */
-
-/* ide.c has its own port definitions in "ide.h" */
-
-#define HD_IRQ         14
-
-/* Hd controller regs. Ref: IBM AT Bios-listing */
-#define HD_DATA                0x1f0           /* _CTL when writing */
-#define HD_ERROR       0x1f1           /* see err-bits */
-#define HD_NSECTOR     0x1f2           /* nr of sectors to read/write */
-#define HD_SECTOR      0x1f3           /* starting sector */
-#define HD_LCYL                0x1f4           /* starting cylinder */
-#define HD_HCYL                0x1f5           /* high byte of starting cyl */
-#define HD_CURRENT     0x1f6           /* 101dhhhh , d=drive, hhhh=head */
-#define HD_STATUS      0x1f7           /* see status-bits */
-#define HD_FEATURE     HD_ERROR        /* same io address, read=error, write=feature */
-#define HD_PRECOMP     HD_FEATURE      /* obsolete use of this port - predates IDE */
-#define HD_COMMAND     HD_STATUS       /* same io address, read=status, write=cmd */
-
-#define HD_CMD         0x3f6           /* used for resets */
-#define HD_ALTSTATUS   0x3f6           /* same as HD_STATUS but doesn't clear irq */
-
-/* remainder is shared between hd.c, ide.c, ide-cd.c, and the hdparm utility */
-
-/* Bits of HD_STATUS */
-#define ERR_STAT               0x01
-#define INDEX_STAT             0x02
-#define ECC_STAT               0x04    /* Corrected error */
-#define DRQ_STAT               0x08
-#define SEEK_STAT              0x10
-#define SRV_STAT               0x10
-#define WRERR_STAT             0x20
-#define READY_STAT             0x40
-#define BUSY_STAT              0x80
-
-/* Bits for HD_ERROR */
-#define MARK_ERR               0x01    /* Bad address mark */
-#define ILI_ERR                        0x01    /* Illegal Length Indication (ATAPI) */
-#define TRK0_ERR               0x02    /* couldn't find track 0 */
-#define EOM_ERR                        0x02    /* End Of Media (ATAPI) */
-#define ABRT_ERR               0x04    /* Command aborted */
-#define MCR_ERR                        0x08    /* media change request */
-#define ID_ERR                 0x10    /* ID field not found */
-#define MC_ERR                 0x20    /* media changed */
-#define ECC_ERR                        0x40    /* Uncorrectable ECC error */
-#define BBD_ERR                        0x80    /* pre-EIDE meaning:  block marked bad */
-#define ICRC_ERR               0x80    /* new meaning:  CRC error during transfer */
-#define LFS_ERR                        0xf0    /* Last Failed Sense (ATAPI) */
-
-/* Bits of HD_NSECTOR */
-#define CD                     0x01
-#define IO                     0x02
-#define REL                    0x04
-#define TAG_MASK               0xf8
-#endif /* __KERNEL__ */
-
 #include <linux/types.h>
 
 /*
@@ -191,6 +129,7 @@ typedef struct hd_drive_hob_hdr {
 #define TASKFILE_INVALID               0x7fff
 #endif
 
+#ifndef __KERNEL__
 /* ATA/ATAPI Commands pre T13 Spec */
 #define WIN_NOP                                0x00
 /*
@@ -379,6 +318,7 @@ typedef struct hd_drive_hob_hdr {
 #define SECURITY_ERASE_UNIT            0xBD
 #define SECURITY_FREEZE_LOCK           0xBE
 #define SECURITY_DISABLE_PASSWORD      0xBF
+#endif /* __KERNEL__ */
 
 struct hd_geometry {
       unsigned char heads;
@@ -448,6 +388,7 @@ enum {
 
 #define __NEW_HD_DRIVE_ID
 
+#ifndef __KERNEL__
 /*
  * Structure returned by HDIO_GET_IDENTITY, as per ANSI NCITS ATA6 rev.1b spec.
  *
@@ -699,6 +640,7 @@ struct hd_driveid {
                                         *  7:0 Signature
                                         */
 };
+#endif /* __KERNEL__ */
 
 /*
  * IDE "nice" flags. These are used on a per drive basis to determine
index 7ff5c55f9b554120379baadd28be15cebbfe839a..1fcb7126a01f1aa3e7a4c63abd46c4311946c843 100644 (file)
@@ -19,8 +19,21 @@ static inline void flush_kernel_dcache_page(struct page *page)
 }
 #endif
 
-#ifdef CONFIG_HIGHMEM
+#include <asm/kmap_types.h>
+
+#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
+
+void debug_kmap_atomic(enum km_type type);
+
+#else
 
+static inline void debug_kmap_atomic(enum km_type type)
+{
+}
+
+#endif
+
+#ifdef CONFIG_HIGHMEM
 #include <asm/highmem.h>
 
 /* declarations for linux/mm/highmem.c */
@@ -44,8 +57,6 @@ static inline void *kmap(struct page *page)
 
 #define kunmap(page) do { (void) (page); } while (0)
 
-#include <asm/kmap_types.h>
-
 static inline void *kmap_atomic(struct page *page, enum km_type idx)
 {
        pagefault_disable();
@@ -187,16 +198,4 @@ static inline void copy_highpage(struct page *to, struct page *from)
        kunmap_atomic(vto, KM_USER1);
 }
 
-#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
-
-void debug_kmap_atomic(enum km_type type);
-
-#else
-
-static inline void debug_kmap_atomic(enum km_type type)
-{
-}
-
-#endif
-
 #endif /* _LINUX_HIGHMEM_H */
index 8137f660a5cc58a3768d498d6b77ed9dc25ad12f..0dc80ef249752dea6c1d1cb735d6200341f46a9e 100644 (file)
@@ -218,6 +218,53 @@ int twl4030_i2c_read(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes);
 
 /*----------------------------------------------------------------------*/
 
+/* Power bus message definitions */
+
+#define DEV_GRP_NULL           0x0
+#define DEV_GRP_P1             0x1
+#define DEV_GRP_P2             0x2
+#define DEV_GRP_P3             0x4
+
+#define RES_GRP_RES            0x0
+#define RES_GRP_PP             0x1
+#define RES_GRP_RC             0x2
+#define RES_GRP_PP_RC          0x3
+#define RES_GRP_PR             0x4
+#define RES_GRP_PP_PR          0x5
+#define RES_GRP_RC_PR          0x6
+#define RES_GRP_ALL            0x7
+
+#define RES_TYPE2_R0           0x0
+
+#define RES_TYPE_ALL           0x7
+
+#define RES_STATE_WRST         0xF
+#define RES_STATE_ACTIVE       0xE
+#define RES_STATE_SLEEP                0x8
+#define RES_STATE_OFF          0x0
+
+/*
+ * Power Bus Message Format ... these can be sent individually by Linux,
+ * but are usually part of downloaded scripts that are run when various
+ * power events are triggered.
+ *
+ *  Broadcast Message (16 Bits):
+ *    DEV_GRP[15:13] MT[12]  RES_GRP[11:9]  RES_TYPE2[8:7] RES_TYPE[6:4]
+ *    RES_STATE[3:0]
+ *
+ *  Singular Message (16 Bits):
+ *    DEV_GRP[15:13] MT[12]  RES_ID[11:4]  RES_STATE[3:0]
+ */
+
+#define MSG_BROADCAST(devgrp, grp, type, type2, state) \
+       ( (devgrp) << 13 | 1 << 12 | (grp) << 9 | (type2) << 7 \
+       | (type) << 4 | (state))
+
+#define MSG_SINGULAR(devgrp, id, state) \
+       ((devgrp) << 13 | 0 << 12 | (id) << 4 | (state))
+
+/*----------------------------------------------------------------------*/
+
 struct twl4030_bci_platform_data {
        int *battery_tmp_tbl;
        unsigned int tblsize;
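
The broadcast/singular message layouts documented above can be composed directly with the two macros.  A small sketch follows; the resource ID 7 is a placeholder, not a real TWL4030 resource, and the resulting 16-bit values would normally be loaded into a power-event script rather than sent from Linux directly.

static void example_power_msgs(void)
{
	u16 single, bcast;

	/* Singular: put resource 7 into the ACTIVE state for group P1. */
	single = MSG_SINGULAR(DEV_GRP_P1, 7, RES_STATE_ACTIVE);

	/* Broadcast: send every resource type in the RC group to SLEEP. */
	bcast = MSG_BROADCAST(DEV_GRP_P1, RES_GRP_RC, RES_TYPE_ALL,
			      RES_TYPE2_R0, RES_STATE_SLEEP);

	(void)single;
	(void)bcast;
}
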
index 1d6c71d96edeac0d730fb178e91120bebe695300..77214ead1a36343ee4267647c00fd9bd9e3d28c8 100644 (file)
@@ -123,7 +123,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 #define ecap_eim_support(e)    ((e >> 4) & 0x1)
 #define ecap_ir_support(e)     ((e >> 3) & 0x1)
 #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
-
+#define ecap_sc_support(e)     ((e >> 7) & 0x1) /* Snooping Control */
 
 /* IOTLB_REG */
 #define DMA_TLB_FLUSH_GRANU_OFFSET  60
index 8a7bfb1b6ca09c4275b99187b74c807bd6bdb9ae..3af4ffd591b978c4f7f815a5952131a0b673aa13 100644 (file)
@@ -21,6 +21,7 @@
 
 #define IOMMU_READ     (1)
 #define IOMMU_WRITE    (2)
+#define IOMMU_CACHE    (4) /* DMA cache coherency */
 
 struct device;
 
@@ -28,6 +29,8 @@ struct iommu_domain {
        void *priv;
 };
 
+#define IOMMU_CAP_CACHE_COHERENCY      0x1
+
 struct iommu_ops {
        int (*domain_init)(struct iommu_domain *domain);
        void (*domain_destroy)(struct iommu_domain *domain);
@@ -39,6 +42,8 @@ struct iommu_ops {
                      size_t size);
        phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
                                    unsigned long iova);
+       int (*domain_has_cap)(struct iommu_domain *domain,
+                             unsigned long cap);
 };
 
 #ifdef CONFIG_IOMMU_API
@@ -57,6 +62,8 @@ extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
                              size_t size);
 extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
                                      unsigned long iova);
+extern int iommu_domain_has_cap(struct iommu_domain *domain,
+                               unsigned long cap);
 
 #else /* CONFIG_IOMMU_API */
 
@@ -107,6 +114,12 @@ static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
        return 0;
 }
 
+static inline int iommu_domain_has_cap(struct iommu_domain *domain,
+                                unsigned long cap)
+{
+       return 0;
+}
+
 #endif /* CONFIG_IOMMU_API */
 
 #endif /* __LINUX_IOMMU_H */
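
A caller of the extended API would typically probe the new capability before asking for a coherent mapping.  A minimal sketch, assuming iommu_map_range() with a (domain, iova, paddr, size, prot) signature alongside the declarations above:

#include <linux/iommu.h>

/* Request a cache-coherent mapping only when the hardware advertises
 * snooping control; otherwise fall back to a plain mapping. */
static int map_coherently(struct iommu_domain *dom, unsigned long iova,
			  phys_addr_t paddr, size_t size)
{
	int prot = IOMMU_READ | IOMMU_WRITE;

	if (iommu_domain_has_cap(dom, IOMMU_CAP_CACHE_COHERENCY))
		prot |= IOMMU_CACHE;	/* DMA snoops the CPU caches */

	return iommu_map_range(dom, iova, paddr, size, prot);	/* assumed helper */
}
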
index 64246dce5663079b5f69afe9eabc05efc3e1229e..2c6943152c21ae27982f429b26f509b95ce7f34c 100644 (file)
@@ -552,6 +552,11 @@ struct transaction_s
         */
        int t_handle_count;
 
+       /*
+        * This transaction is being forced and some process is
+        * waiting for it to finish.
+        */
+       int t_synchronous_commit:1;
 };
 
 /**
index 4e457256bd33dd18734a0f292c23d2404ee0b60f..3e7615e9087e400c767332dfa3c64e5ec8483356 100644 (file)
@@ -192,5 +192,10 @@ static inline void mmc_signal_sdio_irq(struct mmc_host *host)
        wake_up_process(host->sdio_irq_thread);
 }
 
+struct regulator;
+
+int mmc_regulator_get_ocrmask(struct regulator *supply);
+int mmc_regulator_set_ocr(struct regulator *supply, unsigned short vdd_bit);
+
 #endif
 
index bde2557c2a9cec10613328369aebe64bbf3150c4..fdffb413b19276e7f7c9a2cc49bf7ec84e80a7cf 100644 (file)
@@ -185,6 +185,9 @@ struct nfs_inode {
        fmode_t                  delegation_state;
        struct rw_semaphore     rwsem;
 #endif /* CONFIG_NFS_V4*/
+#ifdef CONFIG_NFS_FSCACHE
+       struct fscache_cookie   *fscache;
+#endif
        struct inode            vfs_inode;
 };
 
@@ -207,6 +210,8 @@ struct nfs_inode {
 #define NFS_INO_ACL_LRU_SET    (2)             /* Inode is on the LRU list */
 #define NFS_INO_MOUNTPOINT     (3)             /* inode is remote mountpoint */
 #define NFS_INO_FLUSHING       (4)             /* inode is flushing out data */
+#define NFS_INO_FSCACHE                (5)             /* inode can be cached by FS-Cache */
+#define NFS_INO_FSCACHE_LOCK   (6)             /* FS-Cache cookie management lock */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
@@ -260,6 +265,11 @@ static inline int NFS_STALE(const struct inode *inode)
        return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
 }
 
+static inline int NFS_FSCACHE(const struct inode *inode)
+{
+       return test_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
+}
+
 static inline __u64 NFS_FILEID(const struct inode *inode)
 {
        return NFS_I(inode)->fileid;
@@ -506,6 +516,8 @@ extern int  nfs_readpages(struct file *, struct address_space *,
                struct list_head *, unsigned);
 extern int  nfs_readpage_result(struct rpc_task *, struct nfs_read_data *);
 extern void nfs_readdata_release(void *data);
+extern int  nfs_readpage_async(struct nfs_open_context *, struct inode *,
+                              struct page *);
 
 /*
  * Allocate nfs_read_data structures
@@ -583,6 +595,7 @@ extern void * nfs_root_data(void);
 #define NFSDBG_CALLBACK                0x0100
 #define NFSDBG_CLIENT          0x0200
 #define NFSDBG_MOUNT           0x0400
+#define NFSDBG_FSCACHE         0x0800
 #define NFSDBG_ALL             0xFFFF
 
 #ifdef __KERNEL__
index 29b1e40dce99cc3f76fea6b55904546af753725e..6ad75948cbf76613fb18c69069c0ccc8efdf580a 100644 (file)
@@ -64,6 +64,10 @@ struct nfs_client {
        char                    cl_ipaddr[48];
        unsigned char           cl_id_uniquifier;
 #endif
+
+#ifdef CONFIG_NFS_FSCACHE
+       struct fscache_cookie   *fscache;       /* client index cache cookie */
+#endif
 };
 
 /*
@@ -96,12 +100,19 @@ struct nfs_server {
        unsigned int            acdirmin;
        unsigned int            acdirmax;
        unsigned int            namelen;
+       unsigned int            options;        /* extra options enabled by mount */
+#define NFS_OPTION_FSCACHE     0x00000001      /* - local caching enabled */
 
        struct nfs_fsid         fsid;
        __u64                   maxfilesize;    /* maximum file size */
        unsigned long           mount_time;     /* when this fs was mounted */
        dev_t                   s_dev;          /* superblock dev numbers */
 
+#ifdef CONFIG_NFS_FSCACHE
+       struct nfs_fscache_key  *fscache_key;   /* unique key for superblock */
+       struct fscache_cookie   *fscache;       /* superblock cookie */
+#endif
+
 #ifdef CONFIG_NFS_V4
        u32                     attr_bitmask[2];/* V4 bitmask representing the set
                                                   of attributes supported on this
index 1cb9a3fed2b3bc953ad3a94a5008dc4a529fe275..68b10f5f8907fe0517d0bc5352686b34dd5be9e3 100644 (file)
@@ -116,4 +116,16 @@ enum nfs_stat_eventcounters {
        __NFSIOS_COUNTSMAX,
 };
 
+/*
+ * NFS local caching servicing counters
+ */
+enum nfs_stat_fscachecounters {
+       NFSIOS_FSCACHE_PAGES_READ_OK,
+       NFSIOS_FSCACHE_PAGES_READ_FAIL,
+       NFSIOS_FSCACHE_PAGES_WRITTEN_OK,
+       NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL,
+       NFSIOS_FSCACHE_PAGES_UNCACHED,
+       __NFSIOS_FSCACHEMAX,
+};
+
 #endif /* _LINUX_NFS_IOSTAT */
index 61df1779b2a52050c89dd68534cd402f9c3f4100..62214c7d2d939e734352bf888e058001e44ccc30 100644 (file)
@@ -82,6 +82,7 @@ enum pageflags {
        PG_arch_1,
        PG_reserved,
        PG_private,             /* If pagecache, has fs-private data */
+       PG_private_2,           /* If pagecache, has fs aux data */
        PG_writeback,           /* Page is under writeback */
 #ifdef CONFIG_PAGEFLAGS_EXTENDED
        PG_head,                /* A head page */
@@ -108,6 +109,12 @@ enum pageflags {
        /* Filesystems */
        PG_checked = PG_owner_priv_1,
 
+       /* Two page bits are conscripted by FS-Cache to maintain local caching
+        * state.  These bits are set on pages belonging to the netfs's inodes
+        * when those inodes are being locally cached.
+        */
+       PG_fscache = PG_private_2,      /* page backed by cache */
+
        /* XEN */
        PG_pinned = PG_owner_priv_1,
        PG_savepinned = PG_dirty,
@@ -182,7 +189,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
 
 struct page;   /* forward declaration */
 
-TESTPAGEFLAG(Locked, locked)
+TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
 PAGEFLAG(Error, error)
 PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
@@ -194,8 +201,6 @@ PAGEFLAG(Checked, checked)          /* Used by some filesystems */
 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)    /* Xen */
 PAGEFLAG(SavePinned, savepinned);                      /* Xen */
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
-PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
-       __SETPAGEFLAG(Private, private)
 PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
 
 __PAGEFLAG(SlobPage, slob_page)
@@ -204,6 +209,16 @@ __PAGEFLAG(SlobFree, slob_free)
 __PAGEFLAG(SlubFrozen, slub_frozen)
 __PAGEFLAG(SlubDebug, slub_debug)
 
+/*
+ * Private page markings that may be used by the filesystem that owns the page
+ * for its own purposes.
+ * - PG_private and PG_private_2 cause releasepage() and co to be invoked
+ */
+PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
+       __CLEARPAGEFLAG(Private, private)
+PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
+PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
+
 /*
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
@@ -384,9 +399,10 @@ static inline void __ClearPageTail(struct page *page)
  * these flags set.  It they are, there is a problem.
  */
 #define PAGE_FLAGS_CHECK_AT_FREE \
-       (1 << PG_lru   | 1 << PG_private   | 1 << PG_locked | \
-        1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \
-        1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active | \
+       (1 << PG_lru     | 1 << PG_locked    | \
+        1 << PG_private | 1 << PG_private_2 | \
+        1 << PG_buddy   | 1 << PG_writeback | 1 << PG_reserved | \
+        1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
         __PG_UNEVICTABLE | __PG_MLOCKED)
 
 /*
@@ -397,4 +413,16 @@ static inline void __ClearPageTail(struct page *page)
 #define PAGE_FLAGS_CHECK_AT_PREP       ((1 << NR_PAGEFLAGS) - 1)
 
 #endif /* !__GENERATING_BOUNDS_H */
+
+/**
+ * page_has_private - Determine if page has private stuff
+ * @page: The page to be checked
+ *
+ * Determine if a page has private stuff, indicating that release routines
+ * should be invoked upon it.
+ */
+#define page_has_private(page)                 \
+       ((page)->flags & ((1 << PG_private) |   \
+                         (1 << PG_private_2)))
+
 #endif /* PAGE_FLAGS_H */
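
For illustration, a filesystem's ->releasepage() helper might combine the new Private2 accessors with page_has_private(); a rough sketch, not tied to any particular filesystem:

#include <linux/mm.h>
#include <linux/page-flags.h>

static int my_releasepage(struct page *page, gfp_t gfp)
{
	/* PG_private_2 (aliased as PG_fscache) means the local cache still
	 * knows about this page; wait for it to be uncached first. */
	if (PagePrivate2(page))
		return 0;

	/* page_has_private() also covers PG_private (fs-private data). */
	if (page_has_private(page))
		return 0;

	return 1;	/* nothing private; the page may be released */
}
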
index 076a7dc67c2bd25c89bec24ce036d719f1886dc3..34da5230faab4617cfe9bba4c43ac83cc083ab12 100644 (file)
@@ -383,6 +383,11 @@ static inline void wait_on_page_writeback(struct page *page)
 
 extern void end_page_writeback(struct page *page);
 
+/*
+ * Add an arbitrary waiter to a page's wait queue
+ */
+extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter);
+
 /*
  * Fault a userspace page into pagetables.  Return non-zero on a fault.
  *
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
deleted file mode 100644 (file)
index 82bea14..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
-   md.h : Multiple Devices driver for Linux
-          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
-          Copyright (C) 1994-96 Marc ZYNGIER
-         <zyngier@ufr-info-p7.ibp.fr> or
-         <maz@gloups.fdn.fr>
-         
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2, or (at your option)
-   any later version.
-   
-   You should have received a copy of the GNU General Public License
-   (for example /usr/src/linux/COPYING); if not, write to the Free
-   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
-*/
-
-#ifndef _MD_H
-#define _MD_H
-
-#include <linux/blkdev.h>
-#include <linux/seq_file.h>
-
-/*
- * 'md_p.h' holds the 'physical' layout of RAID devices
- * 'md_u.h' holds the user <=> kernel API
- *
- * 'md_k.h' holds kernel internal definitions
- */
-
-#include <linux/raid/md_p.h>
-#include <linux/raid/md_u.h>
-#include <linux/raid/md_k.h>
-
-#ifdef CONFIG_MD
-
-/*
- * Different major versions are not compatible.
- * Different minor versions are only downward compatible.
- * Different patchlevel versions are downward and upward compatible.
- */
-#define MD_MAJOR_VERSION                0
-#define MD_MINOR_VERSION                90
-/*
- * MD_PATCHLEVEL_VERSION indicates kernel functionality.
- * >=1 means different superblock formats are selectable using SET_ARRAY_INFO
- *     and major_version/minor_version accordingly
- * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT
- *     in the super status byte
- * >=3 means that bitmap superblock version 4 is supported, which uses
- *     little-ending representation rather than host-endian
- */
-#define MD_PATCHLEVEL_VERSION           3
-
-extern int mdp_major;
-
-extern int register_md_personality(struct mdk_personality *p);
-extern int unregister_md_personality(struct mdk_personality *p);
-extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
-                               mddev_t *mddev, const char *name);
-extern void md_unregister_thread(mdk_thread_t *thread);
-extern void md_wakeup_thread(mdk_thread_t *thread);
-extern void md_check_recovery(mddev_t *mddev);
-extern void md_write_start(mddev_t *mddev, struct bio *bi);
-extern void md_write_end(mddev_t *mddev);
-extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
-extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
-
-extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
-                          sector_t sector, int size, struct page *page);
-extern void md_super_wait(mddev_t *mddev);
-extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
-                       struct page *page, int rw);
-extern void md_do_sync(mddev_t *mddev);
-extern void md_new_event(mddev_t *mddev);
-extern int md_allow_write(mddev_t *mddev);
-extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
-
-#endif /* CONFIG_MD */
-#endif 
-
index 7192035fc4b0680094f3c4f0c2780a79c20dfd1b..fb1abb3367e9520a7e68cf738709e7510abe80f0 100644 (file)
 #ifndef _MD_U_H
 #define _MD_U_H
 
+/*
+ * Different major versions are not compatible.
+ * Different minor versions are only downward compatible.
+ * Different patchlevel versions are downward and upward compatible.
+ */
+#define MD_MAJOR_VERSION                0
+#define MD_MINOR_VERSION                90
+/*
+ * MD_PATCHLEVEL_VERSION indicates kernel functionality.
+ * >=1 means different superblock formats are selectable using SET_ARRAY_INFO
+ *     and major_version/minor_version accordingly
+ * >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT
+ *     in the super status byte
+ * >=3 means that bitmap superblock version 4 is supported, which uses
+ *     little-endian representation rather than host-endian
+ */
+#define MD_PATCHLEVEL_VERSION           3
+
 /* ioctls */
 
 /* status */
 #define STOP_ARRAY_RO          _IO (MD_MAJOR, 0x33)
 #define RESTART_ARRAY_RW       _IO (MD_MAJOR, 0x34)
 
+/* 63 partitions with the alternate major number (mdp) */
+#define MdpMinorShift 6
+#ifdef __KERNEL__
+extern int mdp_major;
+#endif
+
 typedef struct mdu_version_s {
        int major;
        int minor;
@@ -85,6 +109,17 @@ typedef struct mdu_array_info_s {
 
 } mdu_array_info_t;
 
+/* non-obvious values for 'level' */
+#define        LEVEL_MULTIPATH         (-4)
+#define        LEVEL_LINEAR            (-1)
+#define        LEVEL_FAULTY            (-5)
+
+/* we need a value for 'no level specified' and 0
+ * means 'raid0', so we need something else.  This is
+ * for internal use only
+ */
+#define        LEVEL_NONE              (-1000000)
+
 typedef struct mdu_disk_info_s {
        /*
         * configuration/status of one particular disk
similarity index 86%
rename from drivers/md/raid6.h
rename to include/linux/raid/pq.h
index 98dcde88470e4c23b8fc204e46587be0243bfa32..d92480f8285c76bec5b67fb5950fdc12f2d06c3c 100644 (file)
@@ -5,7 +5,7 @@
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- *   Bostom MA 02111-1307, USA; either version 2 of the License, or
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
  *   (at your option) any later version; incorporated herein by reference.
  *
  * ----------------------------------------------------------------------- */
 
 /* Set to 1 to use kernel-wide empty_zero_page */
 #define RAID6_USE_EMPTY_ZERO_PAGE 0
-
-#include <linux/raid/md.h>
-#include <linux/raid/raid5.h>
-
-typedef raid5_conf_t raid6_conf_t; /* Same configuration */
-
-/* Additional compute_parity mode -- updates the parity w/o LOCKING */
-#define UPDATE_PARITY  4
+#include <linux/blkdev.h>
 
 /* We need a pre-zeroed page... if we don't want to use the kernel-provided
    one define it here */
@@ -68,6 +61,10 @@ extern const char raid6_empty_zero_page[PAGE_SIZE];
 #define enable_kernel_altivec()
 #define disable_kernel_altivec()
 
+#define EXPORT_SYMBOL(sym)
+#define MODULE_LICENSE(licence)
+#define subsys_initcall(x)
+#define module_exit(x)
 #endif /* __KERNEL__ */
 
 /* Routine choices */
@@ -98,9 +95,11 @@ extern const u8 raid6_gfinv[256]      __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256]      __attribute__((aligned(256)));
 
 /* Recovery routines */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
+void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+                      void **ptrs);
 void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
-void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
+void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
+                     void **ptrs);
 
 /* Some definitions to allow code to be compiled for testing in userspace */
 #ifndef __KERNEL__
@@ -108,8 +107,11 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs
 # define jiffies       raid6_jiffies()
 # define printk        printf
 # define GFP_KERNEL    0
-# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0))
-# define free_pages(x,y)       munmap((void *)(x), (y)*PAGE_SIZE)
+# define __get_free_pages(x, y)        ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \
+                                                    PROT_READ|PROT_WRITE,   \
+                                                    MAP_PRIVATE|MAP_ANONYMOUS,\
+                                                    0, 0))
+# define free_pages(x, y)      munmap((void *)(x), (y)*PAGE_SIZE)
 
 static inline void cpu_relax(void)
 {
index 3e120587eadac53669053e040db23b747ad4a91f..5a210959e3f8a71f5251ccf325bc52f629df124a 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _XOR_H
 #define _XOR_H
 
-#include <linux/raid/md.h>
-
 #define MAX_XOR_BLOCKS 4
 
 extern void xor_blocks(unsigned int count, unsigned int bytes,
index e84b0a9feda579e64025fdbd67f4b4d5127f9620..a6d014005d49e6df67ded6c9b7cede3acbd40501 100644 (file)
@@ -10,6 +10,8 @@
  *
  */
 
+struct regulator_init_data;
+
 /**
  * bq24022_mach_info - platform data for bq24022
  * @gpio_nce: GPIO line connected to the nCE pin, used to enable / disable charging
@@ -18,4 +20,5 @@
 struct bq24022_mach_info {
        int gpio_nce;
        int gpio_iset2;
+       struct regulator_init_data *init_data;
 };
index 801bf77ff4e2c522f90ffafabc648f8679309cba..277f4b964df531a9f4cde24cbb8b60c3c1a0c8f7 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC.
  *
- * Author: Liam Girdwood <lg@opensource.wolfsonmicro.com>
+ * Author: Liam Girdwood <lrg@slimlogic.co.uk>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -88,6 +88,7 @@
  * FAIL           Regulator output has failed.
  * OVER_TEMP      Regulator over temp.
  * FORCE_DISABLE  Regulator shut down by software.
+ * VOLTAGE_CHANGE Regulator voltage changed.
  *
  * NOTE: These events can be OR'ed together when passed into handler.
  */
@@ -98,6 +99,7 @@
 #define REGULATOR_EVENT_FAIL                   0x08
 #define REGULATOR_EVENT_OVER_TEMP              0x10
 #define REGULATOR_EVENT_FORCE_DISABLE          0x20
+#define REGULATOR_EVENT_VOLTAGE_CHANGE         0x40
 
 struct regulator;
 
@@ -140,6 +142,8 @@ int regulator_bulk_disable(int num_consumers,
 void regulator_bulk_free(int num_consumers,
                         struct regulator_bulk_data *consumers);
 
+int regulator_count_voltages(struct regulator *regulator);
+int regulator_list_voltage(struct regulator *regulator, unsigned selector);
 int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV);
 int regulator_get_voltage(struct regulator *regulator);
 int regulator_set_current_limit(struct regulator *regulator,
index 2dae05705f13e21b9d3410b6c5dfb494a1b9bab8..4848d8dacd903b81edba899dd76d69eefed487a8 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC.
  *
- * Author: Liam Girdwood <lg@opensource.wolfsonmicro.com>
+ * Author: Liam Girdwood <lrg@slimlogic.co.uk>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
 struct regulator_dev;
 struct regulator_init_data;
 
+enum regulator_status {
+       REGULATOR_STATUS_OFF,
+       REGULATOR_STATUS_ON,
+       REGULATOR_STATUS_ERROR,
+       /* fast/normal/idle/standby are flavors of "on" */
+       REGULATOR_STATUS_FAST,
+       REGULATOR_STATUS_NORMAL,
+       REGULATOR_STATUS_IDLE,
+       REGULATOR_STATUS_STANDBY,
+};
+
 /**
  * struct regulator_ops - regulator operations.
  *
- * This struct describes regulator operations which can be implemented by
- * regulator chip drivers.
- *
- * @enable: Enable the regulator.
- * @disable: Disable the regulator.
+ * @enable: Configure the regulator as enabled.
+ * @disable: Configure the regulator as disabled.
  * @is_enabled: Return 1 if the regulator is enabled, 0 otherwise.
  *
  * @set_voltage: Set the voltage for the regulator within the range specified.
  *               The driver should select the voltage closest to min_uV.
  * @get_voltage: Return the currently configured voltage for the regulator.
+ * @list_voltage: Return one of the supported voltages, in microvolts; zero
+ *     if the selector indicates a voltage that is unusable on this system;
+ *     or negative errno.  Selectors range from zero to one less than
+ *     regulator_desc.n_voltages.  Voltages may be reported in any order.
  *
  * @set_current_limit: Configure a limit for a current-limited regulator.
- * @get_current_limit: Get the limit for a current-limited regulator.
+ * @get_current_limit: Get the configured limit for a current-limited regulator.
  *
- * @set_mode: Set the operating mode for the regulator.
- * @get_mode: Get the current operating mode for the regulator.
+ * @get_mode: Get the configured operating mode for the regulator.
+ * @get_status: Return actual (not as-configured) status of regulator, as a
+ *     REGULATOR_STATUS value (or negative errno)
  * @get_optimum_mode: Get the most efficient operating mode for the regulator
  *                    when running with the specified parameters.
  *
@@ -51,9 +64,15 @@ struct regulator_init_data;
  *                       suspended.
  * @set_suspend_mode: Set the operating mode for the regulator when the
  *                    system is suspended.
+ *
+ * This struct describes regulator operations which can be implemented by
+ * regulator chip drivers.
  */
 struct regulator_ops {
 
+       /* enumerate supported voltages */
+       int (*list_voltage) (struct regulator_dev *, unsigned selector);
+
        /* get/set regulator voltage */
        int (*set_voltage) (struct regulator_dev *, int min_uV, int max_uV);
        int (*get_voltage) (struct regulator_dev *);
@@ -72,6 +91,13 @@ struct regulator_ops {
        int (*set_mode) (struct regulator_dev *, unsigned int mode);
        unsigned int (*get_mode) (struct regulator_dev *);
 
+       /* report regulator status ... most other accessors report
+        * control inputs, this reports results of combining inputs
+        * from Linux (and other sources) with the actual load.
+        * returns REGULATOR_STATUS_* or negative errno.
+        */
+       int (*get_status)(struct regulator_dev *);
+
        /* get most efficient regulator operating mode for load */
        unsigned int (*get_optimum_mode) (struct regulator_dev *, int input_uV,
                                          int output_uV, int load_uA);
@@ -106,6 +132,7 @@ enum regulator_type {
  *
  * @name: Identifying name for the regulator.
  * @id: Numerical identifier for the regulator.
+ * @n_voltages: Number of selectors available for ops.list_voltage().
  * @ops: Regulator operations table.
  * @irq: Interrupt number for the regulator.
  * @type: Indicates if the regulator is a voltage or current regulator.
@@ -114,14 +141,48 @@ enum regulator_type {
 struct regulator_desc {
        const char *name;
        int id;
+       unsigned n_voltages;
        struct regulator_ops *ops;
        int irq;
        enum regulator_type type;
        struct module *owner;
 };
 
+/*
+ * struct regulator_dev
+ *
+ * Voltage / Current regulator class device. One for each
+ * regulator.
+ *
+ * This should *not* be used directly by anything except the regulator
+ * core and notification injection (which should take the mutex and do
+ * no other direct access).
+ */
+struct regulator_dev {
+       struct regulator_desc *desc;
+       int use_count;
+
+       /* lists we belong to */
+       struct list_head list; /* list of all regulators */
+       struct list_head slist; /* list of supplied regulators */
+
+       /* lists we own */
+       struct list_head consumer_list; /* consumers we supply */
+       struct list_head supply_list; /* regulators we supply */
+
+       struct blocking_notifier_head notifier;
+       struct mutex mutex; /* consumer lock */
+       struct module *owner;
+       struct device dev;
+       struct regulation_constraints *constraints;
+       struct regulator_dev *supply;   /* for tree */
+
+       void *reg_data;         /* regulator_dev data */
+};
+
 struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
-       struct device *dev, void *driver_data);
+       struct device *dev, struct regulator_init_data *init_data,
+       void *driver_data);
 void regulator_unregister(struct regulator_dev *rdev);
 
 int regulator_notifier_call_chain(struct regulator_dev *rdev,
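
To show how a chip driver might pick up the new n_voltages/list_voltage pair and the extended regulator_register() signature, here is a rough sketch for a hypothetical three-step LDO (all my_ldo names are invented):

#include <linux/kernel.h>
#include <linux/regulator/driver.h>

/* Hypothetical LDO supporting three fixed steps (microvolts). */
static const int my_ldo_uV[] = { 1800000, 2500000, 3300000 };

static int my_ldo_list_voltage(struct regulator_dev *rdev, unsigned selector)
{
	if (selector >= ARRAY_SIZE(my_ldo_uV))
		return -EINVAL;
	return my_ldo_uV[selector];
}

static struct regulator_ops my_ldo_ops = {
	.list_voltage	= my_ldo_list_voltage,
	/* .enable, .disable, .set_voltage, .get_voltage, ... as before */
};

static struct regulator_desc my_ldo_desc = {
	.name		= "my-ldo",
	.id		= 0,
	.n_voltages	= ARRAY_SIZE(my_ldo_uV),
	.ops		= &my_ldo_ops,
	.type		= REGULATOR_VOLTAGE,
	.owner		= THIS_MODULE,
};

/* Registration now takes the board's init_data directly:
 *	rdev = regulator_register(&my_ldo_desc, dev, pdata->init_data, priv);
 */
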
index 1387a5d2190e627801c90ec0308b538265f6d1c6..91b4da31f1b510a0c21bc5d579b0143e753152ec 100644 (file)
 #ifndef __REGULATOR_FIXED_H
 #define __REGULATOR_FIXED_H
 
+struct regulator_init_data;
+
 struct fixed_voltage_config {
        const char *supply_name;
        int microvolts;
+       struct regulator_init_data *init_data;
 };
 
 #endif
index 3794773b23d2cfed344b2c3b00efd9606e1ff41d..bac64fa390f20981c1dd8a1fc8b49657bc2ef30b 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2007, 2008 Wolfson Microelectronics PLC.
  *
- * Author: Liam Girdwood <lg@opensource.wolfsonmicro.com>
+ * Author: Liam Girdwood <lrg@slimlogic.co.uk>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -73,7 +73,9 @@ struct regulator_state {
  *
  * @always_on: Set if the regulator should never be disabled.
  * @boot_on: Set if the regulator is enabled when the system is initially
- *           started.
+ *           started.  If the regulator is not enabled by the hardware or
+ *           bootloader then it will be enabled when the constraints are
+ *           applied.
  * @apply_uV: Apply the voltage constraint when initialising.
  *
  * @input_uV: Input voltage for regulator when supplied by another regulator.
@@ -83,6 +85,7 @@ struct regulator_state {
  * @state_standby: State for regulator when system is suspended in standby
  *                 mode.
  * @initial_state: Suspend state to set by default.
+ * @initial_mode: Mode to set at startup.
  */
 struct regulation_constraints {
 
@@ -111,6 +114,9 @@ struct regulation_constraints {
        struct regulator_state state_standby;
        suspend_state_t initial_state; /* suspend state to set at init */
 
+       /* mode to set on startup */
+       unsigned int initial_mode;
+
        /* constriant flags */
        unsigned always_on:1;   /* regulator never off when system is on */
        unsigned boot_on:1;     /* bootloader/firmware enabled regulator */
@@ -160,4 +166,6 @@ struct regulator_init_data {
 
 int regulator_suspend_prepare(suspend_state_t state);
 
+void regulator_has_full_constraints(void);
+
 #endif
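
On the machine side, the new initial_mode constraint sits alongside the existing boot_on/always_on fields.  A minimal board-file sketch; the .constraints member and REGULATOR_MODE_NORMAL are assumed from the rest of the regulator API rather than shown in this hunk:

#include <linux/regulator/machine.h>

/* Board-level constraints for a hypothetical always-on 3.3V rail. */
static struct regulator_init_data my_vcc_init_data = {
	.constraints = {
		.always_on	= 1,
		.boot_on	= 1,			/* left on by the bootloader */
		.initial_mode	= REGULATOR_MODE_NORMAL,	/* new field */
	},
};

/* Once every supply on the board has been declared: */
static void my_board_regulator_init(void)
{
	regulator_has_full_constraints();
}
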
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
new file mode 100644 (file)
index 0000000..8595827
--- /dev/null
@@ -0,0 +1,95 @@
+/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ *
+ * See Documentation/slow-work.txt
+ */
+
+#ifndef _LINUX_SLOW_WORK_H
+#define _LINUX_SLOW_WORK_H
+
+#ifdef CONFIG_SLOW_WORK
+
+#include <linux/sysctl.h>
+
+struct slow_work;
+
+/*
+ * The operations used to support slow work items
+ */
+struct slow_work_ops {
+       /* get a ref on a work item
+        * - return 0 if successful, -ve if not
+        */
+       int (*get_ref)(struct slow_work *work);
+
+       /* discard a ref to a work item */
+       void (*put_ref)(struct slow_work *work);
+
+       /* execute a work item */
+       void (*execute)(struct slow_work *work);
+};
+
+/*
+ * A slow work item
+ * - A reference is held on the parent object by the thread pool when it is
+ *   queued
+ */
+struct slow_work {
+       unsigned long           flags;
+#define SLOW_WORK_PENDING      0       /* item pending (further) execution */
+#define SLOW_WORK_EXECUTING    1       /* item currently executing */
+#define SLOW_WORK_ENQ_DEFERRED 2       /* item enqueue deferred */
+#define SLOW_WORK_VERY_SLOW    3       /* item is very slow */
+       const struct slow_work_ops *ops; /* operations table for this item */
+       struct list_head        link;   /* link in queue */
+};
+
+/**
+ * slow_work_init - Initialise a slow work item
+ * @work: The work item to initialise
+ * @ops: The operations to use to handle the slow work item
+ *
+ * Initialise a slow work item.
+ */
+static inline void slow_work_init(struct slow_work *work,
+                                 const struct slow_work_ops *ops)
+{
+       work->flags = 0;
+       work->ops = ops;
+       INIT_LIST_HEAD(&work->link);
+}
+
+/**
+ * vslow_work_init - Initialise a very slow work item
+ * @work: The work item to initialise
+ * @ops: The operations to use to handle the slow work item
+ *
+ * Initialise a very slow work item.  This item will be restricted such that
+ * only a certain number of the pool threads will be able to execute items of
+ * this type.
+ */
+static inline void vslow_work_init(struct slow_work *work,
+                                  const struct slow_work_ops *ops)
+{
+       work->flags = 1 << SLOW_WORK_VERY_SLOW;
+       work->ops = ops;
+       INIT_LIST_HEAD(&work->link);
+}
+
+extern int slow_work_enqueue(struct slow_work *work);
+extern int slow_work_register_user(void);
+extern void slow_work_unregister_user(void);
+
+#ifdef CONFIG_SYSCTL
+extern ctl_table slow_work_sysctls[];
+#endif
+
+#endif /* CONFIG_SLOW_WORK */
+#endif /* _LINUX_SLOW_WORK_H */
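
A user of the facility embeds a struct slow_work in its own object and supplies the three ops above.  A minimal sketch built only on the declarations in this header (the my_* names are invented):

#include <linux/slow-work.h>
#include <linux/kref.h>
#include <linux/slab.h>

/* Hypothetical object that embeds a slow work item. */
struct my_object {
	struct kref		ref;
	struct slow_work	work;
};

static void my_object_release(struct kref *ref)
{
	kfree(container_of(ref, struct my_object, ref));
}

static int my_work_get_ref(struct slow_work *work)
{
	kref_get(&container_of(work, struct my_object, work)->ref);
	return 0;
}

static void my_work_put_ref(struct slow_work *work)
{
	kref_put(&container_of(work, struct my_object, work)->ref,
		 my_object_release);
}

static void my_work_execute(struct slow_work *work)
{
	/* Runs in one of the pool threads; may sleep, take locks and do I/O. */
}

static const struct slow_work_ops my_work_ops = {
	.get_ref	= my_work_get_ref,
	.put_ref	= my_work_put_ref,
	.execute	= my_work_execute,
};

/* Setup (typically at module init):
 *	slow_work_register_user();
 * Per object:
 *	slow_work_init(&obj->work, &my_work_ops);
 *	slow_work_enqueue(&obj->work);
 * Teardown:
 *	slow_work_unregister_user();
 */
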
index dd253177f65fe3f53ccc737b4af7bb9dab8e9364..3e08a1c86830cafe031e0eea27e2eb0ee07ba4b7 100644 (file)
@@ -14,7 +14,7 @@ struct timeriomem_rng_data {
        struct completion       completion;
        unsigned int            present:1;
 
-       u32 __iomem             *address;
+       void __iomem            *address;
 
        /* measures in usecs */
        unsigned int            period;
index 1398a14b01919d500c59367ff5cffd61d9f39cad..236a79377b8e382d7fcedd2eb63bbee172141e95 100644 (file)
@@ -1014,6 +1014,18 @@ config MARKERS
 
 source "arch/Kconfig"
 
+config SLOW_WORK
+       default n
+       bool "Enable slow work thread pool"
+       help
+         The slow work thread pool provides a number of dynamically allocated
+         threads that can be used by the kernel to perform operations that
+         take a relatively long time.
+
+         An example of this would be CacheFiles doing a path lookup followed
+         by a series of mkdirs and a create call, all of which have to touch
+         disk.
+
 endmenu                # General setup
 
 config HAVE_GENERIC_DMA_COHERENT
index 9aa968d5432937a3aeb799d87ff42e87dfb5c871..f5b978a9bb92892a5e876ae3ce1338ad8a896e04 100644 (file)
@@ -1,4 +1,5 @@
 #include <linux/kernel.h>
+#include <linux/blkdev.h>
 #include <linux/init.h>
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
index 9bdddbcb3d6a62614b0d43783abc9a265948bdde..69aebbf8fd2dbf4ebeb301cf1fa590d3c864b9c9 100644 (file)
@@ -1,5 +1,6 @@
 #include <linux/delay.h>
-#include <linux/raid/md.h>
+#include <linux/raid/md_u.h>
+#include <linux/raid/md_p.h>
 
 #include "do_mounts.h"
 
@@ -112,8 +113,6 @@ static int __init md_setup(char *str)
        return 1;
 }
 
-#define MdpMinorShift 6
-
 static void __init md_setup_drive(void)
 {
        int minor, i, ent, partitioned;
index e4791b3ba55d45c163f3dd8cc0551f0aa7a1e766..bab1dffe37e94013d19795e33c2754f8ea0222c0 100644 (file)
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_SLOW_WORK) += slow-work.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644 (file)
index 0000000..cf2bc01
--- /dev/null
@@ -0,0 +1,640 @@
+/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ *
+ * See Documentation/slow-work.txt
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)        /* cull threads 5s after running out of
+                                        * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
+                                        * OOM */
+
+static void slow_work_cull_timeout(unsigned long);
+static void slow_work_oom_timeout(unsigned long);
+
+#ifdef CONFIG_SYSCTL
+static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+                                       void __user *, size_t *, loff_t *);
+
+static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+                                       void __user *, size_t *, loff_t *);
+#endif
+
+/*
+ * The pool of threads has at least min threads in it as long as someone is
+ * using the facility, and may have as many as max.
+ *
+ * A portion of the pool may be processing very slow operations.
+ */
+static unsigned slow_work_min_threads = 2;
+static unsigned slow_work_max_threads = 4;
+static unsigned vslow_work_proportion = 50; /* % of threads that may process
+                                            * very slow work */
+
+#ifdef CONFIG_SYSCTL
+static const int slow_work_min_min_threads = 2;
+static int slow_work_max_max_threads = 255;
+static const int slow_work_min_vslow = 1;
+static const int slow_work_max_vslow = 99;
+
+ctl_table slow_work_sysctls[] = {
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "min-threads",
+               .data           = &slow_work_min_threads,
+               .maxlen         = sizeof(unsigned),
+               .mode           = 0644,
+               .proc_handler   = slow_work_min_threads_sysctl,
+               .extra1         = (void *) &slow_work_min_min_threads,
+               .extra2         = &slow_work_max_threads,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "max-threads",
+               .data           = &slow_work_max_threads,
+               .maxlen         = sizeof(unsigned),
+               .mode           = 0644,
+               .proc_handler   = slow_work_max_threads_sysctl,
+               .extra1         = &slow_work_min_threads,
+               .extra2         = (void *) &slow_work_max_max_threads,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "vslow-percentage",
+               .data           = &vslow_work_proportion,
+               .maxlen         = sizeof(unsigned),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .extra1         = (void *) &slow_work_min_vslow,
+               .extra2         = (void *) &slow_work_max_vslow,
+       },
+       { .ctl_name = 0 }
+};
+#endif
+
+/*
+ * The active state of the thread pool
+ */
+static atomic_t slow_work_thread_count;
+static atomic_t vslow_work_executing_count;
+
+static bool slow_work_may_not_start_new_thread;
+static bool slow_work_cull; /* cull a thread due to lack of activity */
+static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
+static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
+static struct slow_work slow_work_new_thread; /* new thread starter */
+
+/*
+ * The queues of work items and the lock governing access to them.  These are
+ * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
+ * as the number of threads bears no relation to the number of CPUs.
+ *
+ * There are two queues of work items: one for slow work items, and one for
+ * very slow work items.
+ */
+static LIST_HEAD(slow_work_queue);
+static LIST_HEAD(vslow_work_queue);
+static DEFINE_SPINLOCK(slow_work_queue_lock);
+
+/*
+ * The thread controls.  A variable used to signal to the threads that they
+ * should exit when the queue is empty, a waitqueue used by the threads to wait
+ * for signals, and a completion set by the last thread to exit.
+ */
+static bool slow_work_threads_should_exit;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
+static DECLARE_COMPLETION(slow_work_last_thread_exited);
+
+/*
+ * The number of users of the thread pool and its lock.  Whilst this is zero we
+ * have no threads hanging around, and when this reaches zero, we wait for all
+ * active or queued work items to complete and kill all the threads we do have.
+ */
+static int slow_work_user_count;
+static DEFINE_MUTEX(slow_work_user_lock);
+
+/*
+ * Calculate the maximum number of active threads in the pool that are
+ * permitted to process very slow work items.
+ *
+ * The answer is rounded up to at least 1, but may not equal or exceed the
+ * maximum number of the threads in the pool.  This means we always have at
+ * least one thread that can process slow work items, and we always have at
+ * least one thread that won't get tied up doing so.
+ */
+static unsigned slow_work_calc_vsmax(void)
+{
+       unsigned vsmax;
+
+       vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
+       vsmax /= 100;
+       vsmax = max(vsmax, 1U);
+       return min(vsmax, slow_work_max_threads - 1);
+}
+
+/*
+ * Attempt to execute stuff queued on a slow thread.  Return true if we managed
+ * it, false if there was nothing to do.
+ */
+static bool slow_work_execute(void)
+{
+       struct slow_work *work = NULL;
+       unsigned vsmax;
+       bool very_slow;
+
+       vsmax = slow_work_calc_vsmax();
+
+       /* see if we can schedule a new thread to be started if we're not
+        * keeping up with the work */
+       if (!waitqueue_active(&slow_work_thread_wq) &&
+           (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
+           atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
+           !slow_work_may_not_start_new_thread)
+               slow_work_enqueue(&slow_work_new_thread);
+
+       /* find something to execute */
+       spin_lock_irq(&slow_work_queue_lock);
+       if (!list_empty(&vslow_work_queue) &&
+           atomic_read(&vslow_work_executing_count) < vsmax) {
+               work = list_entry(vslow_work_queue.next,
+                                 struct slow_work, link);
+               if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+                       BUG();
+               list_del_init(&work->link);
+               atomic_inc(&vslow_work_executing_count);
+               very_slow = true;
+       } else if (!list_empty(&slow_work_queue)) {
+               work = list_entry(slow_work_queue.next,
+                                 struct slow_work, link);
+               if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+                       BUG();
+               list_del_init(&work->link);
+               very_slow = false;
+       } else {
+               very_slow = false; /* avoid the compiler warning */
+       }
+       spin_unlock_irq(&slow_work_queue_lock);
+
+       if (!work)
+               return false;
+
+       if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
+               BUG();
+
+       work->ops->execute(work);
+
+       if (very_slow)
+               atomic_dec(&vslow_work_executing_count);
+       clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
+
+       /* if someone tried to enqueue the item whilst we were executing it,
+        * then it'll be left unenqueued to avoid multiple threads trying to
+        * execute it simultaneously
+        *
+        * there is, however, a race between us testing the pending flag and
+        * getting the spinlock, and between the enqueuer setting the pending
+        * flag and getting the spinlock, so we use a deferral bit to tell us
+        * if the enqueuer got there first
+        */
+       if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
+               spin_lock_irq(&slow_work_queue_lock);
+
+               if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
+                   test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
+                       goto auto_requeue;
+
+               spin_unlock_irq(&slow_work_queue_lock);
+       }
+
+       work->ops->put_ref(work);
+       return true;
+
+auto_requeue:
+       /* we must complete the enqueue operation
+        * - we transfer our ref on the item back to the appropriate queue
+        * - don't wake another thread up as we're awake already
+        */
+       if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+               list_add_tail(&work->link, &vslow_work_queue);
+       else
+               list_add_tail(&work->link, &slow_work_queue);
+       spin_unlock_irq(&slow_work_queue_lock);
+       return true;
+}
+
+/**
+ * slow_work_enqueue - Schedule a slow work item for processing
+ * @work: The work item to queue
+ *
+ * Schedule a slow work item for processing.  If the item is already undergoing
+ * execution, this guarantees not to re-enter the execution routine until the
+ * first execution finishes.
+ *
+ * The item is pinned by this function as it retains a reference to it, managed
+ * through the item operations.  The item is unpinned once it has been
+ * executed.
+ *
+ * An item may hog the thread that is running it for a relatively large amount
+ * of time, sufficient, for example, to perform several lookup, mkdir, create
+ * and setxattr operations.  It may sleep on I/O and may sleep to obtain locks.
+ *
+ * Conversely, if a number of items are awaiting processing, it may take some
+ * time before any given item is given attention.  The number of threads in the
+ * pool may be increased to deal with demand, but only up to a limit.
+ *
+ * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
+ * the very slow queue, from which only a portion of the threads will be
+ * allowed to pick items to execute.  This ensures that very slow items won't
+ * overly block ones that are just ordinarily slow.
+ *
+ * Returns 0 if successful, -EAGAIN if not.
+ */
+int slow_work_enqueue(struct slow_work *work)
+{
+       unsigned long flags;
+
+       BUG_ON(slow_work_user_count <= 0);
+       BUG_ON(!work);
+       BUG_ON(!work->ops);
+       BUG_ON(!work->ops->get_ref);
+
+       /* when honouring an enqueue request, we only promise that we will run
+        * the work function in the future; we do not promise to run it once
+        * per enqueue request
+        *
+        * we use the PENDING bit to merge together repeat requests without
+        * having to disable IRQs and take the spinlock, whilst still
+        * maintaining our promise
+        */
+       if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+               spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+               /* we promise that we will not attempt to execute the work
+                * function in more than one thread simultaneously
+                *
+                * this, however, leaves us with a problem if we're asked to
+                * enqueue the work whilst someone is executing the work
+                * function as simply queueing the work immediately means that
+                * another thread may try executing it whilst it is already
+                * under execution
+                *
+                * to deal with this, we set the ENQ_DEFERRED bit instead of
+                * enqueueing, and the thread currently executing the work
+                * function will enqueue the work item when the work function
+                * returns and it has cleared the EXECUTING bit
+                */
+               if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+                       set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+               } else {
+                       if (work->ops->get_ref(work) < 0)
+                               goto cant_get_ref;
+                       if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+                               list_add_tail(&work->link, &vslow_work_queue);
+                       else
+                               list_add_tail(&work->link, &slow_work_queue);
+                       wake_up(&slow_work_thread_wq);
+               }
+
+               spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+       }
+       return 0;
+
+cant_get_ref:
+       spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+       return -EAGAIN;
+}
+EXPORT_SYMBOL(slow_work_enqueue);
+
+/*
+ * Worker thread culling algorithm
+ */
+static bool slow_work_cull_thread(void)
+{
+       unsigned long flags;
+       bool do_cull = false;
+
+       spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+       if (slow_work_cull) {
+               slow_work_cull = false;
+
+               if (list_empty(&slow_work_queue) &&
+                   list_empty(&vslow_work_queue) &&
+                   atomic_read(&slow_work_thread_count) >
+                   slow_work_min_threads) {
+                       mod_timer(&slow_work_cull_timer,
+                                 jiffies + SLOW_WORK_CULL_TIMEOUT);
+                       do_cull = true;
+               }
+       }
+
+       spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+       return do_cull;
+}
+
+/*
+ * Determine if there is slow work available for dispatch
+ */
+static inline bool slow_work_available(int vsmax)
+{
+       return !list_empty(&slow_work_queue) ||
+               (!list_empty(&vslow_work_queue) &&
+                atomic_read(&vslow_work_executing_count) < vsmax);
+}
+
+/*
+ * Worker thread dispatcher
+ */
+static int slow_work_thread(void *_data)
+{
+       int vsmax;
+
+       DEFINE_WAIT(wait);
+
+       set_freezable();
+       set_user_nice(current, -5);
+
+       for (;;) {
+               vsmax = vslow_work_proportion;
+               vsmax *= atomic_read(&slow_work_thread_count);
+               vsmax /= 100;
+
+               prepare_to_wait(&slow_work_thread_wq, &wait,
+                               TASK_INTERRUPTIBLE);
+               if (!freezing(current) &&
+                   !slow_work_threads_should_exit &&
+                   !slow_work_available(vsmax) &&
+                   !slow_work_cull)
+                       schedule();
+               finish_wait(&slow_work_thread_wq, &wait);
+
+               try_to_freeze();
+
+               vsmax = vslow_work_proportion;
+               vsmax *= atomic_read(&slow_work_thread_count);
+               vsmax /= 100;
+
+               if (slow_work_available(vsmax) && slow_work_execute()) {
+                       cond_resched();
+                       if (list_empty(&slow_work_queue) &&
+                           list_empty(&vslow_work_queue) &&
+                           atomic_read(&slow_work_thread_count) >
+                           slow_work_min_threads)
+                               mod_timer(&slow_work_cull_timer,
+                                         jiffies + SLOW_WORK_CULL_TIMEOUT);
+                       continue;
+               }
+
+               if (slow_work_threads_should_exit)
+                       break;
+
+               if (slow_work_cull && slow_work_cull_thread())
+                       break;
+       }
+
+       if (atomic_dec_and_test(&slow_work_thread_count))
+               complete_and_exit(&slow_work_last_thread_exited, 0);
+       return 0;
+}
+
+/*
+ * Handle thread cull timer expiration
+ */
+static void slow_work_cull_timeout(unsigned long data)
+{
+       slow_work_cull = true;
+       wake_up(&slow_work_thread_wq);
+}
+
+/*
+ * Get a reference on slow work thread starter
+ */
+static int slow_work_new_thread_get_ref(struct slow_work *work)
+{
+       return 0;
+}
+
+/*
+ * Drop a reference on slow work thread starter
+ */
+static void slow_work_new_thread_put_ref(struct slow_work *work)
+{
+}
+
+/*
+ * Start a new slow work thread
+ */
+static void slow_work_new_thread_execute(struct slow_work *work)
+{
+       struct task_struct *p;
+
+       if (slow_work_threads_should_exit)
+               return;
+
+       if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
+               return;
+
+       if (!mutex_trylock(&slow_work_user_lock))
+               return;
+
+       slow_work_may_not_start_new_thread = true;
+       atomic_inc(&slow_work_thread_count);
+       p = kthread_run(slow_work_thread, NULL, "kslowd");
+       if (IS_ERR(p)) {
+               printk(KERN_DEBUG "Slow work thread pool: OOM\n");
+               if (atomic_dec_and_test(&slow_work_thread_count))
+                       BUG(); /* we're running on a slow work thread... */
+               mod_timer(&slow_work_oom_timer,
+                         jiffies + SLOW_WORK_OOM_TIMEOUT);
+       } else {
+               /* ratelimit the starting of new threads */
+               mod_timer(&slow_work_oom_timer, jiffies + 1);
+       }
+
+       mutex_unlock(&slow_work_user_lock);
+}
+
+static const struct slow_work_ops slow_work_new_thread_ops = {
+       .get_ref        = slow_work_new_thread_get_ref,
+       .put_ref        = slow_work_new_thread_put_ref,
+       .execute        = slow_work_new_thread_execute,
+};
+
+/*
+ * post-OOM new thread start suppression expiration
+ */
+static void slow_work_oom_timeout(unsigned long data)
+{
+       slow_work_may_not_start_new_thread = false;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Handle adjustment of the minimum number of threads
+ */
+static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
+                                       struct file *filp, void __user *buffer,
+                                       size_t *lenp, loff_t *ppos)
+{
+       int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       int n;
+
+       if (ret == 0) {
+               mutex_lock(&slow_work_user_lock);
+               if (slow_work_user_count > 0) {
+                       /* see if we need to start or stop threads */
+                       n = atomic_read(&slow_work_thread_count) -
+                               slow_work_min_threads;
+
+                       if (n < 0 && !slow_work_may_not_start_new_thread)
+                               slow_work_enqueue(&slow_work_new_thread);
+                       else if (n > 0)
+                               mod_timer(&slow_work_cull_timer,
+                                         jiffies + SLOW_WORK_CULL_TIMEOUT);
+               }
+               mutex_unlock(&slow_work_user_lock);
+       }
+
+       return ret;
+}
+
+/*
+ * Handle adjustment of the maximum number of threads
+ */
+static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
+                                       struct file *filp, void __user *buffer,
+                                       size_t *lenp, loff_t *ppos)
+{
+       int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+       int n;
+
+       if (ret == 0) {
+               mutex_lock(&slow_work_user_lock);
+               if (slow_work_user_count > 0) {
+                       /* see if we need to stop threads */
+                       n = slow_work_max_threads -
+                               atomic_read(&slow_work_thread_count);
+
+                       if (n < 0)
+                               mod_timer(&slow_work_cull_timer,
+                                         jiffies + SLOW_WORK_CULL_TIMEOUT);
+               }
+               mutex_unlock(&slow_work_user_lock);
+       }
+
+       return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
+/**
+ * slow_work_register_user - Register a user of the facility
+ *
+ * Register a user of the facility, starting up the initial threads if there
+ * aren't any other users at this point.  This will return 0 if successful, or
+ * an error if not.
+ */
+int slow_work_register_user(void)
+{
+       struct task_struct *p;
+       int loop;
+
+       mutex_lock(&slow_work_user_lock);
+
+       if (slow_work_user_count == 0) {
+               printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
+               init_completion(&slow_work_last_thread_exited);
+
+               slow_work_threads_should_exit = false;
+               slow_work_init(&slow_work_new_thread,
+                              &slow_work_new_thread_ops);
+               slow_work_may_not_start_new_thread = false;
+               slow_work_cull = false;
+
+               /* start the minimum number of threads */
+               for (loop = 0; loop < slow_work_min_threads; loop++) {
+                       atomic_inc(&slow_work_thread_count);
+                       p = kthread_run(slow_work_thread, NULL, "kslowd");
+                       if (IS_ERR(p))
+                               goto error;
+               }
+               printk(KERN_NOTICE "Slow work thread pool: Ready\n");
+       }
+
+       slow_work_user_count++;
+       mutex_unlock(&slow_work_user_lock);
+       return 0;
+
+error:
+       if (atomic_dec_and_test(&slow_work_thread_count))
+               complete(&slow_work_last_thread_exited);
+       if (loop > 0) {
+               printk(KERN_ERR "Slow work thread pool:"
+                      " Aborting startup on ENOMEM\n");
+               slow_work_threads_should_exit = true;
+               wake_up_all(&slow_work_thread_wq);
+               wait_for_completion(&slow_work_last_thread_exited);
+               printk(KERN_ERR "Slow work thread pool: Aborted\n");
+       }
+       mutex_unlock(&slow_work_user_lock);
+       return PTR_ERR(p);
+}
+EXPORT_SYMBOL(slow_work_register_user);
+
+/**
+ * slow_work_unregister_user - Unregister a user of the facility
+ *
+ * Unregister a user of the facility, killing all the threads if this was the
+ * last one.
+ */
+void slow_work_unregister_user(void)
+{
+       mutex_lock(&slow_work_user_lock);
+
+       BUG_ON(slow_work_user_count <= 0);
+
+       slow_work_user_count--;
+       if (slow_work_user_count == 0) {
+               printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
+               slow_work_threads_should_exit = true;
+               wake_up_all(&slow_work_thread_wq);
+               wait_for_completion(&slow_work_last_thread_exited);
+               printk(KERN_NOTICE "Slow work thread pool:"
+                      " Shut down complete\n");
+       }
+
+       del_timer_sync(&slow_work_cull_timer);
+
+       mutex_unlock(&slow_work_user_lock);
+}
+EXPORT_SYMBOL(slow_work_unregister_user);
+
+/*
+ * Initialise the slow work facility
+ */
+static int __init init_slow_work(void)
+{
+       unsigned nr_cpus = num_possible_cpus();
+
+       if (slow_work_max_threads < nr_cpus)
+               slow_work_max_threads = nr_cpus;
+#ifdef CONFIG_SYSCTL
+       if (slow_work_max_max_threads < nr_cpus * 2)
+               slow_work_max_max_threads = nr_cpus * 2;
+#endif
+       return 0;
+}
+
+subsys_initcall(init_slow_work);
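
For orientation, here is a minimal, hedged sketch of how a client of this facility would use the interface added above (slow_work_register_user(), slow_work_init(), slow_work_enqueue() and the three item operations).  The struct my_object and the my_* names are invented for illustration and are not part of the patch; the reference handling simply mirrors what get_ref()/put_ref() are documented to do.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/slow-work.h>

/* hypothetical client object with an embedded slow work item */
struct my_object {
        atomic_t                usage;
        struct slow_work        work;
};

static int my_get_ref(struct slow_work *work)
{
        struct my_object *obj = container_of(work, struct my_object, work);

        atomic_inc(&obj->usage);        /* pin the object whilst queued/executing */
        return 0;                       /* <0 would make slow_work_enqueue() fail */
}

static void my_put_ref(struct slow_work *work)
{
        struct my_object *obj = container_of(work, struct my_object, work);

        if (atomic_dec_and_test(&obj->usage))
                kfree(obj);
}

static void my_execute(struct slow_work *work)
{
        /* runs in a kslowd thread: may sleep, take mutexes and do I/O */
}

static const struct slow_work_ops my_work_ops = {
        .get_ref        = my_get_ref,
        .put_ref        = my_put_ref,
        .execute        = my_execute,
};

/* once per user of the facility, e.g. from the client's module_init() */
static int my_client_init(void)
{
        return slow_work_register_user();
}

/* per item: initialise once, enqueue as often as needed; repeat enqueues of
 * an item that is still pending or executing are merged */
static void my_object_init(struct my_object *obj)
{
        atomic_set(&obj->usage, 1);
        slow_work_init(&obj->work, &my_work_ops);
}

static int my_object_kick(struct my_object *obj)
{
        return slow_work_enqueue(&obj->work);
}
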
index 5ec4543dfc06513d20e288f8a19e06068b65e7f9..82350f8f04f61a276cfaedbd0ca7fb5e1ffea378 100644 (file)
@@ -48,6 +48,7 @@
 #include <linux/acpi.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
+#include <linux/slow-work.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -897,6 +898,14 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &scan_unevictable_handler,
        },
 #endif
+#ifdef CONFIG_SLOW_WORK
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "slow-work",
+               .mode           = 0555,
+               .child          = slow_work_sysctls,
+       },
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
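
The slow_work_sysctls table wired in here is defined alongside the handlers shown earlier and is not part of this excerpt.  As a rough, assumed sketch (names, modes and limits are illustrative only), it pairs the tunables under /proc/sys/kernel/slow-work/ with those handlers, so that raising min-threads can start extra threads and lowering max-threads arms the cull timer:

/* illustrative only -- the real table also supplies .extra1/.extra2 bounds
 * for proc_dointvec_minmax() */
static struct ctl_table slow_work_sysctls_sketch[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "min-threads",
                .data           = &slow_work_min_threads,
                .maxlen         = sizeof(unsigned),
                .mode           = 0644,
                .proc_handler   = &slow_work_min_threads_sysctl,
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max-threads",
                .data           = &slow_work_max_threads,
                .maxlen         = sizeof(unsigned),
                .mode           = 0644,
                .proc_handler   = &slow_work_max_threads_sysctl,
        },
        { .ctl_name = 0 }
};
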
index 126d3973b3d1f3be7a3f5e4f42630725c796f31f..fc11974f2bee5ea0d39941f3dfce8b0b84910579 100644 (file)
@@ -564,6 +564,24 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
+/**
+ * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
+ * @page - Page defining the wait queue of interest
+ * @waiter - Waiter to add to the queue
+ *
+ * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ */
+void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+{
+       wait_queue_head_t *q = page_waitqueue(page);
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->lock, flags);
+       __add_wait_queue(q, waiter);
+       spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_page_wait_queue);
+
 /**
  * unlock_page - unlock a locked page
  * @page: the page
@@ -2463,6 +2481,9 @@ EXPORT_SYMBOL(generic_file_aio_write);
  * (presumably at page->private).  If the release was successful, return `1'.
  * Otherwise return zero.
  *
+ * This may also be called if PG_fscache is set on a page, indicating that the
+ * page is known to the local caching routines.
+ *
  * The @gfp_mask argument specifies whether I/O may be performed to release
  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
  *
index a9eff3f092f622a451753a1dc155f5b7fbdd7ab6..068655d8f883a8d2a79ae3f3247ced23f0b5db76 100644 (file)
@@ -250,7 +250,7 @@ out:
  * The number of remaining references must be:
  * 1 for anonymous pages without a mapping
  * 2 for pages with a mapping
- * 3 for pages with a mapping and PagePrivate set.
+ * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
 static int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page)
@@ -270,7 +270,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
        pslot = radix_tree_lookup_slot(&mapping->page_tree,
                                        page_index(page));
 
-       expected_count = 2 + !!PagePrivate(page);
+       expected_count = 2 + !!page_has_private(page);
        if (page_count(page) != expected_count ||
                        (struct page *)radix_tree_deref_slot(pslot) != page) {
                spin_unlock_irq(&mapping->tree_lock);
@@ -386,7 +386,7 @@ EXPORT_SYMBOL(fail_migrate_page);
 
 /*
  * Common logic to directly migrate a single page suitable for
- * pages that do not use PagePrivate.
+ * pages that do not use PagePrivate/PagePrivate2.
  *
  * Pages are locked upon entry and exit.
  */
@@ -522,7 +522,7 @@ static int fallback_migrate_page(struct address_space *mapping,
         * Buffers may be managed in a filesystem specific way.
         * We must have no buffers or drop them.
         */
-       if (PagePrivate(page) &&
+       if (page_has_private(page) &&
            !try_to_release_page(page, GFP_KERNEL))
                return -EAGAIN;
 
@@ -655,7 +655,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
         * free the metadata, so the page can be freed.
         */
        if (!page->mapping) {
-               if (!PageAnon(page) && PagePrivate(page)) {
+               if (!PageAnon(page) && page_has_private(page)) {
                        /*
                         * Go direct to try_to_free_buffers() here because
                         * a) that's what try_to_release_page() would do anyway
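
The PagePrivate() -> page_has_private() conversions in the hunks above, and in the swap, truncate and vmscan hunks below, make a page count as having private data if it carries either buffer-head state (PG_private) or the new local-caching state (PG_private_2, used as PG_fscache elsewhere in this series).  The helper itself is introduced outside this excerpt; an assumed sketch of what it amounts to:

/* assumed sketch -- the real definition is added to the page flags header
 * elsewhere in this series, not in the hunks shown here */
#define PAGE_FLAGS_PRIVATE      (1UL << PG_private | 1UL << PG_private_2)

static inline int page_has_private(struct page *page)
{
        return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
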
index 9ce303d4b8109a32bf198ffc02ba4fe6b77eec49..133b6d525513a886247ce3adc5eff41ef4e91752 100644 (file)
@@ -31,6 +31,42 @@ EXPORT_SYMBOL_GPL(file_ra_state_init);
 
 #define list_to_page(head) (list_entry((head)->prev, struct page, lru))
 
+/*
+ * see if a page needs releasing upon read_cache_pages() failure
+ * - the caller of read_cache_pages() may have set PG_private or PG_fscache
+ *   before calling, such as the NFS fs marking pages that are cached locally
+ *   on disk, thus we need to give the fs a chance to clean up in the event of
+ *   an error
+ */
+static void read_cache_pages_invalidate_page(struct address_space *mapping,
+                                            struct page *page)
+{
+       if (page_has_private(page)) {
+               if (!trylock_page(page))
+                       BUG();
+               page->mapping = mapping;
+               do_invalidatepage(page, 0);
+               page->mapping = NULL;
+               unlock_page(page);
+       }
+       page_cache_release(page);
+}
+
+/*
+ * release a list of pages, invalidating them first if need be
+ */
+static void read_cache_pages_invalidate_pages(struct address_space *mapping,
+                                             struct list_head *pages)
+{
+       struct page *victim;
+
+       while (!list_empty(pages)) {
+               victim = list_to_page(pages);
+               list_del(&victim->lru);
+               read_cache_pages_invalidate_page(mapping, victim);
+       }
+}
+
 /**
  * read_cache_pages - populate an address space with some pages & start reads against them
  * @mapping: the address_space
@@ -52,14 +88,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
                list_del(&page->lru);
                if (add_to_page_cache_lru(page, mapping,
                                        page->index, GFP_KERNEL)) {
-                       page_cache_release(page);
+                       read_cache_pages_invalidate_page(mapping, page);
                        continue;
                }
                page_cache_release(page);
 
                ret = filler(data, page);
                if (unlikely(ret)) {
-                       put_pages_list(pages);
+                       read_cache_pages_invalidate_pages(mapping, pages);
                        break;
                }
                task_io_account_read(PAGE_CACHE_SIZE);
index 6e83084c1f6c3bf2c1a9fa36fb5649ea181e363c..bede23ce64ea28e52d991168f2c93cd2243c893a 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec)
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
 
-               if (PagePrivate(page) && trylock_page(page)) {
-                       if (PagePrivate(page))
+               if (page_has_private(page) && trylock_page(page)) {
+                       if (page_has_private(page))
                                try_to_release_page(page, 0);
                        unlock_page(page);
                }
index 1229211104f84beb0e05e5df891f1780d392e418..55206fab7b994e18fb026ca36269e3feeb2705a1 100644 (file)
@@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
        zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-       if (PagePrivate(page))
+       if (page_has_private(page))
                do_invalidatepage(page, partial);
 }
 
@@ -99,7 +99,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
        if (page->mapping != mapping)
                return;
 
-       if (PagePrivate(page))
+       if (page_has_private(page))
                do_invalidatepage(page, 0);
 
        cancel_dirty_page(page, PAGE_CACHE_SIZE);
@@ -126,7 +126,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
        if (page->mapping != mapping)
                return 0;
 
-       if (PagePrivate(page) && !try_to_release_page(page, 0))
+       if (page_has_private(page) && !try_to_release_page(page, 0))
                return 0;
 
        clear_page_mlock(page);
@@ -348,7 +348,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
        if (page->mapping != mapping)
                return 0;
 
-       if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+       if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                return 0;
 
        spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +356,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
                goto failed;
 
        clear_page_mlock(page);
-       BUG_ON(PagePrivate(page));
+       BUG_ON(page_has_private(page));
        __remove_from_page_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
        page_cache_release(page);       /* pagecache ref */
index 06e72693b4587a6181b002dec4f14ac5a7d4ba06..425244988bb230965c36c02c3901a906348b92ea 100644 (file)
@@ -283,7 +283,7 @@ static inline int page_mapping_inuse(struct page *page)
 
 static inline int is_page_cache_freeable(struct page *page)
 {
-       return page_count(page) - !!PagePrivate(page) == 2;
+       return page_count(page) - !!page_has_private(page) == 2;
 }
 
 static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                 * Some data journaling orphaned pages can have
                 * page->mapping == NULL while being dirty with clean buffers.
                 */
-               if (PagePrivate(page)) {
+               if (page_has_private(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
                                printk("%s: orphaned page\n", __func__);
@@ -727,7 +727,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * process address space (page_count == 1) it can be freed.
                 * Otherwise, leave the page on the LRU so it is swappable.
                 */
-               if (PagePrivate(page)) {
+               if (page_has_private(page)) {
                        if (!try_to_release_page(page, sc->gfp_mask))
                                goto activate_locked;
                        if (!mapping && page_count(page) == 1) {
index 28574ae551703157d6cea9e525c6cf43edb85da8..b1fd48db1640d50320de63ede7e3a7e699794408 100644 (file)
@@ -75,6 +75,10 @@ case "${ARCH}" in
        alpha)
                [ -f "${objtree}/arch/alpha/boot/vmlinux.gz" ] && cp -v -- "${objtree}/arch/alpha/boot/vmlinux.gz" "${tmpdir}/boot/vmlinuz-${KERNELRELEASE}"
                ;;
+       parisc*)
+               [ -f "${KBUILD_IMAGE}" ] && cp -v -- "${KBUILD_IMAGE}" "${tmpdir}/boot/vmlinux-${KERNELRELEASE}"
+               [ -f "${objtree}/lifimage" ] && cp -v -- "${objtree}/lifimage" "${tmpdir}/boot/lifimage-${KERNELRELEASE}"
+               ;;
        vax)
                [ -f "${objtree}/vmlinux.SYS" ] && cp -v -- "${objtree}/vmlinux.SYS" "${tmpdir}/boot/vmlinux-${KERNELRELEASE}.SYS"
                [ -f "${objtree}/vmlinux.dsk" ] && cp -v -- "${objtree}/vmlinux.dsk" "${tmpdir}/boot/vmlinux-${KERNELRELEASE}.dsk"
index 206e53844d2f86b5658d9524b0af9d83ddd41c05..5284255c5cdff9869ac4086c3a8976a4d9ad0a1a 100644 (file)
@@ -445,6 +445,7 @@ int security_inode_create(struct inode *dir, struct dentry *dentry, int mode)
                return 0;
        return security_ops->inode_create(dir, dentry, mode);
 }
+EXPORT_SYMBOL_GPL(security_inode_create);
 
 int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                         struct dentry *new_dentry)
@@ -475,6 +476,7 @@ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, int mode)
                return 0;
        return security_ops->inode_mkdir(dir, dentry, mode);
 }
+EXPORT_SYMBOL_GPL(security_inode_mkdir);
 
 int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
 {