www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
Merge branch 'linus' into x86/core
author Ingo Molnar <mingo@elte.hu>
Sat, 28 Mar 2009 21:27:45 +0000 (22:27 +0100)
committer Ingo Molnar <mingo@elte.hu>
Sat, 28 Mar 2009 21:27:45 +0000 (22:27 +0100)
146 files changed:
Documentation/x86/earlyprintk.txt [new file with mode: 0644]
arch/x86/Kconfig
arch/x86/Kconfig.cpu
arch/x86/Makefile
arch/x86/boot/Makefile
arch/x86/boot/header.S
arch/x86/boot/pm.c
arch/x86/boot/pmjump.S
arch/x86/boot/setup.ld
arch/x86/boot/tools/build.c
arch/x86/boot/video-vga.c
arch/x86/include/asm/apic.h
arch/x86/include/asm/apicdef.h
arch/x86/include/asm/boot.h
arch/x86/include/asm/cacheflush.h
arch/x86/include/asm/cpu_debug.h [new file with mode: 0755]
arch/x86/include/asm/desc.h
arch/x86/include/asm/dmi.h
arch/x86/include/asm/e820.h
arch/x86/include/asm/entry_arch.h
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/highmem.h
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/init.h [new file with mode: 0644]
arch/x86/include/asm/io_apic.h
arch/x86/include/asm/irq.h
arch/x86/include/asm/irq_remapping.h
arch/x86/include/asm/irq_vectors.h
arch/x86/include/asm/kexec.h
arch/x86/include/asm/linkage.h
arch/x86/include/asm/mce.h
arch/x86/include/asm/msidef.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/page_32_types.h
arch/x86/include/asm/page_types.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/pat.h
arch/x86/include/asm/pgtable-2level.h
arch/x86/include/asm/pgtable-3level.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_32.h
arch/x86/include/asm/pgtable_32_types.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/sections.h
arch/x86/include/asm/setup.h
arch/x86/include/asm/uv/uv_hub.h
arch/x86/include/asm/xen/hypercall.h
arch/x86/kernel/Makefile
arch/x86/kernel/alternative.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/apic_flat_64.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/apic/probe_64.c
arch/x86/kernel/apic/x2apic_cluster.c
arch/x86/kernel/apic/x2apic_phys.c
arch/x86/kernel/check.c
arch/x86/kernel/cpu/Makefile
arch/x86/kernel/cpu/addon_cpuid_features.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/centaur.c
arch/x86/kernel/cpu/centaur_64.c [deleted file]
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/cpu.h
arch/x86/kernel/cpu/cpu_debug.c [new file with mode: 0755]
arch/x86/kernel/cpu/cyrix.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/Makefile
arch/x86/kernel/cpu/mcheck/mce_32.c
arch/x86/kernel/cpu/mcheck/mce_64.c
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
arch/x86/kernel/cpu/mcheck/mce_intel_64.c
arch/x86/kernel/cpu/mcheck/threshold.c [new file with mode: 0644]
arch/x86/kernel/cpu/mtrr/Makefile
arch/x86/kernel/cpu/mtrr/cleanup.c [new file with mode: 0644]
arch/x86/kernel/cpu/mtrr/generic.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/cpu/mtrr/mtrr.h
arch/x86/kernel/cpu/transmeta.c
arch/x86/kernel/cpu/umc.c
arch/x86/kernel/e820.c
arch/x86/kernel/early_printk.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/head32.c
arch/x86/kernel/head64.c
arch/x86/kernel/head_32.S
arch/x86/kernel/i8253.c
arch/x86/kernel/io_delay.c
arch/x86/kernel/irq.c
arch/x86/kernel/irqinit_32.c
arch/x86/kernel/irqinit_64.c
arch/x86/kernel/kdebugfs.c
arch/x86/kernel/kprobes.c
arch/x86/kernel/kvm.c
arch/x86/kernel/machine_kexec_32.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/mmconf-fam10h_64.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/pci-nommu.c
arch/x86/kernel/process.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/quirks.c
arch/x86/kernel/relocate_kernel_32.S
arch/x86/kernel/relocate_kernel_64.S
arch/x86/kernel/rtc.c
arch/x86/kernel/setup.c
arch/x86/kernel/signal.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/tlb_uv.c
arch/x86/kernel/topology.c
arch/x86/kernel/uv_time.c [new file with mode: 0644]
arch/x86/kernel/visws_quirks.c
arch/x86/kernel/vmi_32.c
arch/x86/kernel/vmlinux_32.lds.S
arch/x86/kernel/vmlinux_64.lds.S
arch/x86/kernel/vsmp_64.c
arch/x86/lguest/boot.c
arch/x86/lib/memcpy_64.S
arch/x86/mm/highmem_32.c
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/iomap_32.c
arch/x86/mm/ioremap.c
arch/x86/mm/kmmio.c
arch/x86/mm/memtest.c
arch/x86/mm/numa_32.c
arch/x86/mm/pageattr.c
arch/x86/mm/pat.c
arch/x86/mm/pgtable_32.c
arch/x86/mm/tlb.c
arch/x86/pci/common.c
arch/x86/pci/fixup.c
arch/x86/pci/i386.c
arch/x86/xen/mmu.c
drivers/pci/dmar.c
drivers/pci/intel-iommu.c
drivers/pci/intr_remapping.c
include/linux/dmar.h
include/linux/intel-iommu.h
include/linux/mm.h
kernel/sched.c
mm/memory.c

diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt
new file mode 100644 (file)
index 0000000..607b1a0
--- /dev/null
@@ -0,0 +1,101 @@
+
+Mini-HOWTO for using the earlyprintk=dbgp boot option with a
+USB2 Debug port key and a debug cable, on x86 systems.
+
+You need two computers, the 'USB debug key' special gadget and
+two USB cables, connected like this:
+
+  [host/target] <-------> [USB debug key] <-------> [client/console]
+
+1. There are three specific hardware requirements:
+
+ a.) Host/target system needs to have USB debug port capability.
+
+ You can check this capability by looking at a 'Debug port' bit in
+ the lspci -vvv output:
+
+ # lspci -vvv
+ ...
+ 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI])
+         Subsystem: Lenovo ThinkPad T61
+         Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
+         Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
+         Latency: 0
+         Interrupt: pin D routed to IRQ 19
+         Region 0: Memory at fe227000 (32-bit, non-prefetchable) [size=1K]
+         Capabilities: [50] Power Management version 2
+                 Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0+,D1-,D2-,D3hot+,D3cold+)
+                 Status: D0 PME-Enable- DSel=0 DScale=0 PME+
+         Capabilities: [58] Debug port: BAR=1 offset=00a0
+                            ^^^^^^^^^^^ <==================== [ HERE ]
+        Kernel driver in use: ehci_hcd
+         Kernel modules: ehci-hcd
+ ...
+
+( If your system does not list a debug port capability then you probably
+  won't be able to use the USB debug key. )
+
+ b.) You also need a Netchip USB debug cable/key:
+
+        http://www.plxtech.com/products/NET2000/NET20DC/default.asp
+
+     This is a small blue plastic connector with two USB connections;
+     it draws power from those connections.
+
+ c.) Thirdly, you need a second client/console system with a regular USB port.
+
+2. Software requirements:
+
+ a.) On the host/target system:
+
+    You need to enable the following kernel config option:
+
+      CONFIG_EARLY_PRINTK_DBGP=y
+
+    And you need to add the boot command line: "earlyprintk=dbgp".
+    (If you are using Grub, append it to the 'kernel' line in
+     /etc/grub.conf)
+
+    NOTE: normally the earlyprintk console gets turned off once the
+    regular console is alive - use "earlyprintk=dbgp,keep" to keep
+    this channel open beyond early bootup. This can be useful for
+    debugging crashes under Xorg, etc.
+
+ b.) On the client/console system:
+
+    You should enable the following kernel config option:
+
+      CONFIG_USB_SERIAL_DEBUG=y
+
+    On the next bootup with the modified kernel you should
+    get one or more /dev/ttyUSBx devices.
+
+    Now this channel of kernel messages is ready to be used: start
+    your favorite terminal emulator (minicom, etc.) and set
+    it up to use /dev/ttyUSB0 - or simply run 'cat /dev/ttyUSBx' to
+    see the raw output.
+
+ c.) On Nvidia Southbridge based systems the kernel will try to probe
+     and find out which port has a debug device connected.
+
+3. Testing that it works fine:
+
+   You can test the output by using earlyprintk=dbgp,keep and provoking
+   kernel messages on the host/target system. You can provoke a harmless
+   kernel message by, for example, doing:
+
+     echo h > /proc/sysrq-trigger
+
+   On the host/target system you should see this help line in "dmesg" output:
+
+     SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z)
+
+   On the client/console system do:
+
+       cat /dev/ttyUSB0
+
+   And you should see the help line above displayed shortly after you've
+   provoked it on the host system.
+
+If it does not work then please ask about it on the linux-kernel@vger.kernel.org
+mailing list or contact the x86 maintainers.
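(Editorial illustration, not part of the patch: putting the pieces of the
HOWTO above together, a working setup looks roughly like the following.
The vmlinuz path and root= value are placeholders.)

   # host/target: kernel configuration
   CONFIG_EARLY_PRINTK_DBGP=y

   # host/target: GRUB 'kernel' line in /etc/grub.conf (paths are placeholders)
   kernel /boot/vmlinuz root=/dev/sda1 ro earlyprintk=dbgp,keep

   # client/console: kernel configuration, then read the raw stream
   CONFIG_USB_SERIAL_DEBUG=y
   cat /dev/ttyUSB0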
index 06c02c00d7d9b040a95c477570a66555325a3df7..34bc3a89228b58cb31d2aa18c70c1965bb35ace6 100644 (file)
@@ -786,6 +786,11 @@ config X86_MCE_AMD
           Additional support for AMD specific MCE features such as
           the DRAM Error Threshold.
 
+config X86_MCE_THRESHOLD
+       depends on X86_MCE_AMD || X86_MCE_INTEL
+       bool
+       default y
+
 config X86_MCE_NONFATAL
        tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
        depends on X86_32 && X86_MCE
@@ -929,6 +934,12 @@ config X86_CPUID
          with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
          /dev/cpu/31/cpuid.
 
+config X86_CPU_DEBUG
+       tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
+       ---help---
+         If you select this option, various x86 CPU information will be
+         provided through debugfs.
+
 choice
        prompt "High Memory Support"
        default HIGHMEM4G if !X86_NUMAQ
@@ -1121,7 +1132,7 @@ config NUMA_EMU
 
 config NODES_SHIFT
        int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
-       range 1 9   if X86_64
+       range 1 9
        default "9" if MAXSMP
        default "6" if X86_64
        default "4" if X86_NUMAQ
@@ -1429,7 +1440,7 @@ config CRASH_DUMP
 config KEXEC_JUMP
        bool "kexec jump (EXPERIMENTAL)"
        depends on EXPERIMENTAL
-       depends on KEXEC && HIBERNATION && X86_32
+       depends on KEXEC && HIBERNATION
        ---help---
          Jump between original kernel and kexeced kernel and invoke
          code in physical address mode via KEXEC
index a95eaf0e582ada3e141ee0ae35fa5d17eb056f1b..924e156a85abdf61627336e41e97283393e8709f 100644 (file)
@@ -456,24 +456,9 @@ config CPU_SUP_AMD
 
          If unsure, say N.
 
-config CPU_SUP_CENTAUR_32
+config CPU_SUP_CENTAUR
        default y
        bool "Support Centaur processors" if PROCESSOR_SELECT
-       depends on !64BIT
-       ---help---
-         This enables detection, tunings and quirks for Centaur processors
-
-         You need this enabled if you want your kernel to run on a
-         Centaur CPU. Disabling this option on other types of CPUs
-         makes the kernel a tiny bit smaller. Disabling it on a Centaur
-         CPU might render the kernel unbootable.
-
-         If unsure, say N.
-
-config CPU_SUP_CENTAUR_64
-       default y
-       bool "Support Centaur processors" if PROCESSOR_SELECT
-       depends on 64BIT
        ---help---
          This enables detection, tunings and quirks for Centaur processors
 
index 1836191839ee3f6f35f32203cbf6e7556518d97f..f05d8c91d9e51ed29b21a6b99588398182879bb2 100644 (file)
@@ -153,34 +153,23 @@ endif
 
 boot := arch/x86/boot
 
-PHONY += zImage bzImage compressed zlilo bzlilo \
-         zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install
+BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage install
+
+PHONY += bzImage $(BOOT_TARGETS)
 
 # Default kernel to build
 all: bzImage
 
 # KBUILD_IMAGE specify target image being built
-                    KBUILD_IMAGE := $(boot)/bzImage
-zImage zlilo zdisk: KBUILD_IMAGE := $(boot)/zImage
+KBUILD_IMAGE := $(boot)/bzImage
 
-zImage bzImage: vmlinux
+bzImage: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
        $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
        $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
 
-compressed: zImage
-
-zlilo bzlilo: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
-
-zdisk bzdisk: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
-
-fdimage fdimage144 fdimage288 isoimage: vmlinux
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
-
-install:
-       $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
+$(BOOT_TARGETS): vmlinux
+       $(Q)$(MAKE) $(build)=$(boot) $@
 
 PHONY += vdso_install
 vdso_install:
@@ -205,7 +194,3 @@ define archhelp
   echo  '                  FDARGS="..."  arguments for the booted kernel'
   echo  '                  FDINITRD=file initrd for the booted kernel'
 endef
-
-CLEAN_FILES += arch/x86/boot/fdimage \
-              arch/x86/boot/image.iso \
-              arch/x86/boot/mtools.conf
index c70eff69a1fb7b6a8708c5acd193a2f1a8a163e4..fb737ce5888dfbd7213c171bf07ea43f88e66f6f 100644 (file)
@@ -6,26 +6,24 @@
 # for more details.
 #
 # Copyright (C) 1994 by Linus Torvalds
+# Changed by many, many contributors over the years.
 #
 
 # ROOT_DEV specifies the default root-device when making the image.
 # This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
 # the default of FLOPPY is used by 'build'.
 
-ROOT_DEV := CURRENT
+ROOT_DEV       := CURRENT
 
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
 # The number is the same as you would ordinarily press at bootup.
 
-SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
+SVGA_MODE      := -DSVGA_MODE=NORMAL_VGA
 
-# If you want the RAM disk device, define this to be the size in blocks.
-
-#RAMDISK := -DRAMDISK=512
-
-targets                := vmlinux.bin setup.bin setup.elf zImage bzImage
+targets                := vmlinux.bin setup.bin setup.elf bzImage
+targets                += fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-                := compressed
 
 setup-y                += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
@@ -71,17 +69,13 @@ KBUILD_CFLAGS       := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
 KBUILD_CFLAGS +=   $(call cc-option,-m32)
 KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 
-$(obj)/zImage:  asflags-y := $(SVGA_MODE) $(RAMDISK)
-$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
-$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
-$(obj)/bzImage: BUILDFLAGS   := -b
+$(obj)/bzImage: asflags-y  := $(SVGA_MODE)
 
 quiet_cmd_image = BUILD   $@
-cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/setup.bin \
-           $(obj)/vmlinux.bin $(ROOT_DEV) > $@
+cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
+       $(ROOT_DEV) > $@
 
-$(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \
-                             $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
        $(call if_changed,image)
        @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
 
@@ -116,9 +110,11 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
 $(obj)/compressed/vmlinux: FORCE
        $(Q)$(MAKE) $(build)=$(obj)/compressed $@
 
-# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
+# Set this if you want to pass append arguments to the
+# bzdisk/fdimage/isoimage kernel
 FDARGS =
-# Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel
+# Set this if you want an initrd included with the
+# bzdisk/fdimage/isoimage kernel
 FDINITRD =
 
 image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
@@ -127,7 +123,7 @@ $(obj)/mtools.conf: $(src)/mtools.conf.in
        sed -e 's|@OBJ@|$(obj)|g' < $< > $@
 
 # This requires write access to /dev/fd0
-zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
+bzdisk: $(obj)/bzImage $(obj)/mtools.conf
        MTOOLSRC=$(obj)/mtools.conf mformat a:                  ; sync
        syslinux /dev/fd0                                       ; sync
        echo '$(image_cmdline)' | \
@@ -135,10 +131,10 @@ zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
        if [ -f '$(FDINITRD)' ] ; then \
                MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
        fi
-       MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux  ; sync
+       MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux        ; sync
 
 # These require being root or having syslinux 2.02 or higher installed
-fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
+fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
        dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
        MTOOLSRC=$(obj)/mtools.conf mformat v:                  ; sync
        syslinux $(obj)/fdimage                                 ; sync
@@ -147,9 +143,9 @@ fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
        if [ -f '$(FDINITRD)' ] ; then \
                MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
        fi
-       MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux  ; sync
+       MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux        ; sync
 
-fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
+fdimage288: $(obj)/bzImage $(obj)/mtools.conf
        dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
        MTOOLSRC=$(obj)/mtools.conf mformat w:                  ; sync
        syslinux $(obj)/fdimage                                 ; sync
@@ -158,9 +154,9 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
        if [ -f '$(FDINITRD)' ] ; then \
                MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
        fi
-       MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux  ; sync
+       MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux        ; sync
 
-isoimage: $(BOOTIMAGE)
+isoimage: $(obj)/bzImage
        -rm -rf $(obj)/isoimage
        mkdir $(obj)/isoimage
        for i in lib lib64 share end ; do \
@@ -170,7 +166,7 @@ isoimage: $(BOOTIMAGE)
                fi ; \
                if [ $$i = end ] ; then exit 1 ; fi ; \
        done
-       cp $(BOOTIMAGE) $(obj)/isoimage/linux
+       cp $(obj)/bzImage $(obj)/isoimage/linux
        echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
        if [ -f '$(FDINITRD)' ] ; then \
                cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
@@ -181,12 +177,13 @@ isoimage: $(BOOTIMAGE)
        isohybrid $(obj)/image.iso 2>/dev/null || true
        rm -rf $(obj)/isoimage
 
-zlilo: $(BOOTIMAGE)
+bzlilo: $(obj)/bzImage
        if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
        if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
-       cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
+       cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
        cp System.map $(INSTALL_PATH)/
        if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
 
 install:
-       sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
+       sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+               System.map "$(INSTALL_PATH)"
index 7ccff4884a23e03c1a2ef59bf15de9b146401369..5d84d1c74e4c6d2a84666c7b7125641b00a1dcad 100644 (file)
 #include "boot.h"
 #include "offsets.h"
 
-SETUPSECTS     = 4                     /* default nr of setup-sectors */
 BOOTSEG                = 0x07C0                /* original address of boot-sector */
-SYSSEG         = DEF_SYSSEG            /* system loaded at 0x10000 (65536) */
-SYSSIZE                = DEF_SYSSIZE           /* system size: # of 16-byte clicks */
-                                       /* to be loaded */
-ROOT_DEV       = 0                     /* ROOT_DEV is now written by "build" */
+SYSSEG         = 0x1000                /* historical load address >> 4 */
 
 #ifndef SVGA_MODE
 #define SVGA_MODE ASK_VGA
@@ -97,12 +93,12 @@ bugger_off_msg:
        .section ".header", "a"
        .globl  hdr
 hdr:
-setup_sects:   .byte SETUPSECTS
+setup_sects:   .byte 0                 /* Filled in by build.c */
 root_flags:    .word ROOT_RDONLY
-syssize:       .long SYSSIZE
-ram_size:      .word RAMDISK
+syssize:       .long 0                 /* Filled in by build.c */
+ram_size:      .word 0                 /* Obsolete */
 vid_mode:      .word SVGA_MODE
-root_dev:      .word ROOT_DEV
+root_dev:      .word 0                 /* Filled in by build.c */
 boot_flag:     .word 0xAA55
 
        # offset 512, entry point
@@ -123,14 +119,15 @@ _start:
                                        # or else old loadlin-1.5 will fail)
                .globl realmode_swtch
 realmode_swtch:        .word   0, 0            # default_switch, SETUPSEG
-start_sys_seg: .word   SYSSEG
+start_sys_seg: .word   SYSSEG          # obsolete and meaningless, but just
+                                       # in case something decided to "use" it
                .word   kernel_version-512 # pointing to kernel version string
                                        # above section of header is compatible
                                        # with loadlin-1.5 (header v1.5). Don't
                                        # change it.
 
-type_of_loader:        .byte   0               # = 0, old one (LILO, Loadlin,
-                                       #      Bootlin, SYSLX, bootsect...)
+type_of_loader:        .byte   0               # 0 means ancient bootloader, newer
+                                       # bootloaders know to change this.
                                        # See Documentation/i386/boot.txt for
                                        # assigned ids
 
@@ -142,11 +139,7 @@ CAN_USE_HEAP       = 0x80                  # If set, the loader also has set
                                        # space behind setup.S can be used for
                                        # heap purposes.
                                        # Only the loader knows what is free
-#ifndef __BIG_KERNEL__
-               .byte   0
-#else
                .byte   LOADED_HIGH
-#endif
 
 setup_move_size: .word  0x8000         # size to move, when setup is not
                                        # loaded at 0x90000. We will move setup
@@ -157,11 +150,7 @@ setup_move_size: .word  0x8000             # size to move, when setup is not
 
 code32_start:                          # here loaders can put a different
                                        # start address for 32-bit code.
-#ifndef __BIG_KERNEL__
-               .long   0x1000          #   0x1000 = default for zImage
-#else
                .long   0x100000        # 0x100000 = default for big kernel
-#endif
 
 ramdisk_image: .long   0               # address of loaded ramdisk image
                                        # Here the loader puts the 32-bit
index 85a1cd8a8ff8a4daee99ee11d86176bd7a97c166..8062f89152504b19babbd9e293791356ac2791aa 100644 (file)
@@ -32,47 +32,6 @@ static void realmode_switch_hook(void)
        }
 }
 
-/*
- * A zImage kernel is loaded at 0x10000 but wants to run at 0x1000.
- * A bzImage kernel is loaded and runs at 0x100000.
- */
-static void move_kernel_around(void)
-{
-       /* Note: rely on the compile-time option here rather than
-          the LOADED_HIGH flag.  The Qemu kernel loader unconditionally
-          sets the loadflags to zero. */
-#ifndef __BIG_KERNEL__
-       u16 dst_seg, src_seg;
-       u32 syssize;
-
-       dst_seg =  0x1000 >> 4;
-       src_seg = 0x10000 >> 4;
-       syssize = boot_params.hdr.syssize; /* Size in 16-byte paragraphs */
-
-       while (syssize) {
-               int paras  = (syssize >= 0x1000) ? 0x1000 : syssize;
-               int dwords = paras << 2;
-
-               asm volatile("pushw %%es ; "
-                            "pushw %%ds ; "
-                            "movw %1,%%es ; "
-                            "movw %2,%%ds ; "
-                            "xorw %%di,%%di ; "
-                            "xorw %%si,%%si ; "
-                            "rep;movsl ; "
-                            "popw %%ds ; "
-                            "popw %%es"
-                            : "+c" (dwords)
-                            : "r" (dst_seg), "r" (src_seg)
-                            : "esi", "edi");
-
-               syssize -= paras;
-               dst_seg += paras;
-               src_seg += paras;
-       }
-#endif
-}
-
 /*
  * Disable all interrupts at the legacy PIC.
  */
@@ -147,9 +106,6 @@ void go_to_protected_mode(void)
        /* Hook before leaving real mode, also disables interrupts */
        realmode_switch_hook();
 
-       /* Move the kernel/setup to their final resting places */
-       move_kernel_around();
-
        /* Enable the A20 gate */
        if (enable_a20()) {
                puts("A20 gate not responding, unable to boot...\n");
index 019c17a7585115a7cfdb0c27696544b8127599b5..3e0edc6d2a2079892e1a655ef2e7a35a013c4ef3 100644 (file)
@@ -47,6 +47,7 @@ GLOBAL(protected_mode_jump)
 ENDPROC(protected_mode_jump)
 
        .code32
+       .section ".text32","ax"
 GLOBAL(in_pm32)
        # Set up data segments for flat 32-bit mode
        movl    %ecx, %ds
index df9234b3a5e07f6f0dd0bd10a2ad19065306097b..bb8dc2de796936c6a21d0b1bfbd934ca8a3e5877 100644 (file)
@@ -17,7 +17,8 @@ SECTIONS
        .header         : { *(.header) }
        .inittext       : { *(.inittext) }
        .initdata       : { *(.initdata) }
-       .text           : { *(.text*) }
+       .text           : { *(.text) }
+       .text32         : { *(.text32) }
 
        . = ALIGN(16);
        .rodata         : { *(.rodata*) }
index 44dc1923c0e3eb277a90c582addb524cd4163629..ee3a4ea923ace82fc0c1f4aa07e4c1adc32ee318 100644 (file)
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
 
 static void usage(void)
 {
-       die("Usage: build [-b] setup system [rootdev] [> image]");
+       die("Usage: build setup system [rootdev] [> image]");
 }
 
 int main(int argc, char ** argv)
@@ -145,11 +145,6 @@ int main(int argc, char ** argv)
        void *kernel;
        u32 crc = 0xffffffffUL;
 
-       if (argc > 2 && !strcmp(argv[1], "-b"))
-         {
-           is_big_kernel = 1;
-           argc--, argv++;
-         }
        if ((argc < 3) || (argc > 4))
                usage();
        if (argc > 3) {
@@ -216,8 +211,6 @@ int main(int argc, char ** argv)
                die("Unable to mmap '%s': %m", argv[2]);
        /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
        sys_size = (sz + 15 + 4) / 16;
-       if (!is_big_kernel && sys_size > DEF_SYSSIZE)
-               die("System is too big. Try using bzImage or modules.");
 
        /* Patch the setup code with the appropriate size parameters */
        buf[0x1f1] = setup_sectors-1;
index 5d4742ed4aa21dd83d968798a1c7ff92447907a1..95d86ce0421c84dd647f26f34cd65fc9f0143d3e 100644 (file)
@@ -129,41 +129,45 @@ u16 vga_crtc(void)
        return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
 }
 
-static void vga_set_480_scanlines(int end)
+static void vga_set_480_scanlines(int lines)
 {
-       u16 crtc;
-       u8  csel;
+       u16 crtc;               /* CRTC base address */
+       u8  csel;               /* CRTC miscellaneous output register */
+       u8  ovfw;               /* CRTC overflow register */
+       int end = lines-1;
 
        crtc = vga_crtc();
 
+       ovfw = 0x3c | ((end >> (8-1)) & 0x02) | ((end >> (9-6)) & 0x40);
+
        out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
        out_idx(0x0b, crtc, 0x06); /* Vertical total */
-       out_idx(0x3e, crtc, 0x07); /* Vertical overflow */
+       out_idx(ovfw, crtc, 0x07); /* Vertical overflow */
        out_idx(0xea, crtc, 0x10); /* Vertical sync start */
-       out_idx(end, crtc, 0x12); /* Vertical display end */
+       out_idx(end,  crtc, 0x12); /* Vertical display end */
        out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
        out_idx(0x04, crtc, 0x16); /* Vertical blank end */
        csel = inb(0x3cc);
        csel &= 0x0d;
        csel |= 0xe2;
-       outb(csel, 0x3cc);
+       outb(csel, 0x3c2);
 }
 
 static void vga_set_80x30(void)
 {
-       vga_set_480_scanlines(0xdf);
+       vga_set_480_scanlines(30*16);
 }
 
 static void vga_set_80x34(void)
 {
        vga_set_14font();
-       vga_set_480_scanlines(0xdb);
+       vga_set_480_scanlines(34*14);
 }
 
 static void vga_set_80x60(void)
 {
        vga_set_8font();
-       vga_set_480_scanlines(0xdf);
+       vga_set_480_scanlines(60*8);
 }
 
 static int vga_set_mode(struct mode_info *mode)
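(Editorial note on the video-vga.c hunk above: vga_set_480_scanlines() now
computes the CRTC values from the requested scanline count instead of using
hard-coded constants. Bits 0-7 of the 10-bit "vertical display end" value go
into CR12, bit 8 into bit 1 of the overflow register CR07 and bit 9 into
bit 6; the miscellaneous output register is also written through its write
port 0x3c2 rather than the read port at 0x3cc. A minimal user-space C sketch,
not part of the patch, shows that for the 480-line modes this reproduces the
old 0x3e/0xdf constants:)

   #include <stdio.h>

   int main(void)
   {
           int lines = 480;               /* 80x30 and 80x60 text modes */
           int end   = lines - 1;         /* 479 == 0x1df, a 10-bit value */
           unsigned char ovfw, disp_end;

           ovfw = 0x3c | ((end >> (8 - 1)) & 0x02)   /* bit 8 -> CR07 bit 1 */
                       | ((end >> (9 - 6)) & 0x40);  /* bit 9 -> CR07 bit 6 */
           disp_end = end & 0xff;                    /* bits 0-7 -> CR12 */

           /* Prints "CR07 = 0x3e, CR12 = 0xdf", matching the old constants */
           printf("CR07 = 0x%02x, CR12 = 0x%02x\n", ovfw, disp_end);
           return 0;
   }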
index 4ef949c1972e6bfacb80d477fefc427cadfecd8f..df8a300dfe6c380b5fdcf60e379ef9fb855d67ae 100644 (file)
@@ -75,7 +75,7 @@ static inline void default_inquire_remote_apic(int apicid)
 #define setup_secondary_clock setup_secondary_APIC_clock
 #endif
 
-#ifdef CONFIG_X86_VSMP
+#ifdef CONFIG_X86_64
 extern int is_vsmp_box(void);
 #else
 static inline int is_vsmp_box(void)
@@ -108,6 +108,16 @@ extern void native_apic_icr_write(u32 low, u32 id);
 extern u64 native_apic_icr_read(void);
 
 #ifdef CONFIG_X86_X2APIC
+/*
+ * Make previous memory operations globally visible before
+ * sending the IPI through x2apic wrmsr. We need a serializing instruction or
+ * mfence for this.
+ */
+static inline void x2apic_wrmsr_fence(void)
+{
+       asm volatile("mfence" : : : "memory");
+}
+
 static inline void native_apic_msr_write(u32 reg, u32 v)
 {
        if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -184,6 +194,9 @@ static inline int x2apic_enabled(void)
 {
        return 0;
 }
+
+#define        x2apic  0
+
 #endif
 
 extern int get_physical_broadcast(void);
@@ -379,6 +392,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
 
 static inline void ack_APIC_irq(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
        /*
         * ack_APIC_irq() actually gets compiled as a single instruction
         * ... yummie.
@@ -386,6 +400,7 @@ static inline void ack_APIC_irq(void)
 
        /* Docs say use 0 for future compatibility */
        apic_write(APIC_EOI, 0);
+#endif
 }
 
 static inline unsigned default_get_apic_id(unsigned long x)
@@ -474,10 +489,19 @@ static inline int default_apic_id_registered(void)
        return physid_isset(read_apic_id(), phys_cpu_present_map);
 }
 
+static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+       return cpuid_apic >> index_msb;
+}
+
+extern int default_apicid_to_node(int logical_apicid);
+
+#endif
+
 static inline unsigned int
 default_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-       return cpumask_bits(cpumask)[0];
+       return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
 }
 
 static inline unsigned int
@@ -491,15 +515,6 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
        return (unsigned int)(mask1 & mask2 & mask3);
 }
 
-static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
-{
-       return cpuid_apic >> index_msb;
-}
-
-extern int default_apicid_to_node(int logical_apicid);
-
-#endif
-
 static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid)
 {
        return physid_isset(apicid, bitmap);
index 63134e31e8b933acb973ee2e66f62a2f815bb326..bc9514fb3b13f70d75b11f037546eca8e43d07fd 100644 (file)
@@ -53,6 +53,7 @@
 #define                APIC_ESR_SENDILL        0x00020
 #define                APIC_ESR_RECVILL        0x00040
 #define                APIC_ESR_ILLREGA        0x00080
+#define        APIC_LVTCMCI    0x2f0
 #define        APIC_ICR        0x300
 #define                APIC_DEST_SELF          0x40000
 #define                APIC_DEST_ALLINC        0x80000
index 6526cf08b0e4604583b1b3e8c972e0f31fe1b44e..6ba23dd9fc9216de96b9cb52d1954b9bb7ae2b99 100644 (file)
@@ -1,10 +1,6 @@
 #ifndef _ASM_X86_BOOT_H
 #define _ASM_X86_BOOT_H
 
-/* Don't touch these, unless you really know what you're doing. */
-#define DEF_SYSSEG     0x1000
-#define DEF_SYSSIZE    0x7F00
-
 /* Internal svga startup constants */
 #define NORMAL_VGA     0xffff          /* 80x25 mode */
 #define EXTENDED_VGA   0xfffe          /* 80x50 mode */
index 5b301b7ff5f4bcac4854caaed6dc4877a389814b..b3894bf52fcddf68f8d16a38e2705fc1b3ec519d 100644 (file)
@@ -90,6 +90,9 @@ int set_memory_4k(unsigned long addr, int numpages);
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
 int set_memory_array_wb(unsigned long *addr, int addrinarray);
 
+int set_pages_array_uc(struct page **pages, int addrinarray);
+int set_pages_array_wb(struct page **pages, int addrinarray);
+
 /*
  * For legacy compatibility with the old APIs, a few functions
  * are provided that work on a "struct page".
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
new file mode 100755 (executable)
index 0000000..2228020
--- /dev/null
@@ -0,0 +1,226 @@
+#ifndef _ASM_X86_CPU_DEBUG_H
+#define _ASM_X86_CPU_DEBUG_H
+
+/*
+ * CPU x86 architecture debug
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ */
+
+/* Register flags */
+enum cpu_debug_bit {
+/* Model Specific Registers (MSRs)                                     */
+       CPU_MC_BIT,                             /* Machine Check        */
+       CPU_MONITOR_BIT,                        /* Monitor              */
+       CPU_TIME_BIT,                           /* Time                 */
+       CPU_PMC_BIT,                            /* Performance Monitor  */
+       CPU_PLATFORM_BIT,                       /* Platform             */
+       CPU_APIC_BIT,                           /* APIC                 */
+       CPU_POWERON_BIT,                        /* Power-on             */
+       CPU_CONTROL_BIT,                        /* Control              */
+       CPU_FEATURES_BIT,                       /* Features control     */
+       CPU_LBRANCH_BIT,                        /* Last Branch          */
+       CPU_BIOS_BIT,                           /* BIOS                 */
+       CPU_FREQ_BIT,                           /* Frequency            */
+       CPU_MTTR_BIT,                           /* MTRR                 */
+       CPU_PERF_BIT,                           /* Performance          */
+       CPU_CACHE_BIT,                          /* Cache                */
+       CPU_SYSENTER_BIT,                       /* Sysenter             */
+       CPU_THERM_BIT,                          /* Thermal              */
+       CPU_MISC_BIT,                           /* Miscellaneous        */
+       CPU_DEBUG_BIT,                          /* Debug                */
+       CPU_PAT_BIT,                            /* PAT                  */
+       CPU_VMX_BIT,                            /* VMX                  */
+       CPU_CALL_BIT,                           /* System Call          */
+       CPU_BASE_BIT,                           /* BASE Address         */
+       CPU_VER_BIT,                            /* Version ID           */
+       CPU_CONF_BIT,                           /* Configuration        */
+       CPU_SMM_BIT,                            /* System mgmt mode     */
+       CPU_SVM_BIT,                            /*Secure Virtual Machine*/
+       CPU_OSVM_BIT,                           /* OS-Visible Workaround*/
+/* Standard Registers                                                  */
+       CPU_TSS_BIT,                            /* Task Stack Segment   */
+       CPU_CR_BIT,                             /* Control Registers    */
+       CPU_DT_BIT,                             /* Descriptor Table     */
+/* End of Registers flags                                              */
+       CPU_REG_ALL_BIT,                        /* Select all Registers */
+};
+
+#define        CPU_REG_ALL             (~0)            /* Select all Registers */
+
+#define        CPU_MC                  (1 << CPU_MC_BIT)
+#define        CPU_MONITOR             (1 << CPU_MONITOR_BIT)
+#define        CPU_TIME                (1 << CPU_TIME_BIT)
+#define        CPU_PMC                 (1 << CPU_PMC_BIT)
+#define        CPU_PLATFORM            (1 << CPU_PLATFORM_BIT)
+#define        CPU_APIC                (1 << CPU_APIC_BIT)
+#define        CPU_POWERON             (1 << CPU_POWERON_BIT)
+#define        CPU_CONTROL             (1 << CPU_CONTROL_BIT)
+#define        CPU_FEATURES            (1 << CPU_FEATURES_BIT)
+#define        CPU_LBRANCH             (1 << CPU_LBRANCH_BIT)
+#define        CPU_BIOS                (1 << CPU_BIOS_BIT)
+#define        CPU_FREQ                (1 << CPU_FREQ_BIT)
+#define        CPU_MTRR                (1 << CPU_MTTR_BIT)
+#define        CPU_PERF                (1 << CPU_PERF_BIT)
+#define        CPU_CACHE               (1 << CPU_CACHE_BIT)
+#define        CPU_SYSENTER            (1 << CPU_SYSENTER_BIT)
+#define        CPU_THERM               (1 << CPU_THERM_BIT)
+#define        CPU_MISC                (1 << CPU_MISC_BIT)
+#define        CPU_DEBUG               (1 << CPU_DEBUG_BIT)
+#define        CPU_PAT                 (1 << CPU_PAT_BIT)
+#define        CPU_VMX                 (1 << CPU_VMX_BIT)
+#define        CPU_CALL                (1 << CPU_CALL_BIT)
+#define        CPU_BASE                (1 << CPU_BASE_BIT)
+#define        CPU_VER                 (1 << CPU_VER_BIT)
+#define        CPU_CONF                (1 << CPU_CONF_BIT)
+#define        CPU_SMM                 (1 << CPU_SMM_BIT)
+#define        CPU_SVM                 (1 << CPU_SVM_BIT)
+#define        CPU_OSVM                (1 << CPU_OSVM_BIT)
+#define        CPU_TSS                 (1 << CPU_TSS_BIT)
+#define        CPU_CR                  (1 << CPU_CR_BIT)
+#define        CPU_DT                  (1 << CPU_DT_BIT)
+
+/* Register file flags */
+enum cpu_file_bit {
+       CPU_INDEX_BIT,                          /* index                */
+       CPU_VALUE_BIT,                          /* value                */
+};
+
+#define        CPU_FILE_VALUE                  (1 << CPU_VALUE_BIT)
+
+/*
+ * DisplayFamily_DisplayModel  Processor Families/Processor Number Series
+ * --------------------------  ------------------------------------------
+ * 05_01, 05_02, 05_04         Pentium, Pentium with MMX
+ *
+ * 06_01                       Pentium Pro
+ * 06_03, 06_05                        Pentium II Xeon, Pentium II
+ * 06_07, 06_08, 06_0A, 06_0B  Pentium III Xeon, Pentium III
+ *
+ * 06_09, 06_0D                        Pentium M
+ *
+ * 06_0E                       Core Duo, Core Solo
+ *
+ * 06_0F                       Xeon 3000, 3200, 5100, 5300, 7300 series,
+ *                             Core 2 Quad, Core 2 Extreme, Core 2 Duo,
+ *                             Pentium dual-core
+ * 06_17                       Xeon 5200, 5400 series, Core 2 Quad Q9650
+ *
+ * 06_1C                       Atom
+ *
+ * 0F_00, 0F_01, 0F_02         Xeon, Xeon MP, Pentium 4
+ * 0F_03, 0F_04                        Xeon, Xeon MP, Pentium 4, Pentium D
+ *
+ * 0F_06                       Xeon 7100, 5000 Series, Xeon MP,
+ *                             Pentium 4, Pentium D
+ */
+
+/* Register processors bits */
+enum cpu_processor_bit {
+       CPU_NONE,
+/* Intel */
+       CPU_INTEL_PENTIUM_BIT,
+       CPU_INTEL_P6_BIT,
+       CPU_INTEL_PENTIUM_M_BIT,
+       CPU_INTEL_CORE_BIT,
+       CPU_INTEL_CORE2_BIT,
+       CPU_INTEL_ATOM_BIT,
+       CPU_INTEL_XEON_P4_BIT,
+       CPU_INTEL_XEON_MP_BIT,
+/* AMD */
+       CPU_AMD_K6_BIT,
+       CPU_AMD_K7_BIT,
+       CPU_AMD_K8_BIT,
+       CPU_AMD_0F_BIT,
+       CPU_AMD_10_BIT,
+       CPU_AMD_11_BIT,
+};
+
+#define        CPU_INTEL_PENTIUM       (1 << CPU_INTEL_PENTIUM_BIT)
+#define        CPU_INTEL_P6            (1 << CPU_INTEL_P6_BIT)
+#define        CPU_INTEL_PENTIUM_M     (1 << CPU_INTEL_PENTIUM_M_BIT)
+#define        CPU_INTEL_CORE          (1 << CPU_INTEL_CORE_BIT)
+#define        CPU_INTEL_CORE2         (1 << CPU_INTEL_CORE2_BIT)
+#define        CPU_INTEL_ATOM          (1 << CPU_INTEL_ATOM_BIT)
+#define        CPU_INTEL_XEON_P4       (1 << CPU_INTEL_XEON_P4_BIT)
+#define        CPU_INTEL_XEON_MP       (1 << CPU_INTEL_XEON_MP_BIT)
+
+#define        CPU_INTEL_PX            (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
+#define        CPU_INTEL_COREX         (CPU_INTEL_CORE | CPU_INTEL_CORE2)
+#define        CPU_INTEL_XEON          (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
+#define        CPU_CO_AT               (CPU_INTEL_CORE | CPU_INTEL_ATOM)
+#define        CPU_C2_AT               (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
+#define        CPU_CX_AT               (CPU_INTEL_COREX | CPU_INTEL_ATOM)
+#define        CPU_CX_XE               (CPU_INTEL_COREX | CPU_INTEL_XEON)
+#define        CPU_P6_XE               (CPU_INTEL_P6 | CPU_INTEL_XEON)
+#define        CPU_PM_CO_AT            (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
+#define        CPU_C2_AT_XE            (CPU_C2_AT | CPU_INTEL_XEON)
+#define        CPU_CX_AT_XE            (CPU_CX_AT | CPU_INTEL_XEON)
+#define        CPU_P6_CX_AT            (CPU_INTEL_P6 | CPU_CX_AT)
+#define        CPU_P6_CX_XE            (CPU_P6_XE | CPU_INTEL_COREX)
+#define        CPU_P6_CX_AT_XE         (CPU_INTEL_P6 | CPU_CX_AT_XE)
+#define        CPU_PM_CX_AT_XE         (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
+#define        CPU_PM_CX_AT            (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
+#define        CPU_PM_CX_XE            (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
+#define        CPU_PX_CX_AT            (CPU_INTEL_PX | CPU_CX_AT)
+#define        CPU_PX_CX_AT_XE         (CPU_INTEL_PX | CPU_CX_AT_XE)
+
+/* Select all supported Intel CPUs */
+#define        CPU_INTEL_ALL           (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
+
+#define        CPU_AMD_K6              (1 << CPU_AMD_K6_BIT)
+#define        CPU_AMD_K7              (1 << CPU_AMD_K7_BIT)
+#define        CPU_AMD_K8              (1 << CPU_AMD_K8_BIT)
+#define        CPU_AMD_0F              (1 << CPU_AMD_0F_BIT)
+#define        CPU_AMD_10              (1 << CPU_AMD_10_BIT)
+#define        CPU_AMD_11              (1 << CPU_AMD_11_BIT)
+
+#define        CPU_K10_PLUS            (CPU_AMD_10 | CPU_AMD_11)
+#define        CPU_K0F_PLUS            (CPU_AMD_0F | CPU_K10_PLUS)
+#define        CPU_K8_PLUS             (CPU_AMD_K8 | CPU_K0F_PLUS)
+#define        CPU_K7_PLUS             (CPU_AMD_K7 | CPU_K8_PLUS)
+
+/* Select all supported AMD CPUs */
+#define        CPU_AMD_ALL             (CPU_AMD_K6 | CPU_K7_PLUS)
+
+/* Select all supported CPUs */
+#define        CPU_ALL                 (CPU_INTEL_ALL | CPU_AMD_ALL)
+
+#define MAX_CPU_FILES          512
+
+struct cpu_private {
+       unsigned                cpu;
+       unsigned                type;
+       unsigned                reg;
+       unsigned                file;
+};
+
+struct cpu_debug_base {
+       char                    *name;          /* Register name        */
+       unsigned                flag;           /* Register flag        */
+       unsigned                write;          /* Register write flag  */
+};
+
+/*
+ * Currently it looks similar to cpu_debug_base but once we add more files
+ * cpu_file_base will go in a different direction
+ */
+struct cpu_file_base {
+       char                    *name;          /* Register file name   */
+       unsigned                flag;           /* Register file flag   */
+       unsigned                write;          /* Register write flag  */
+};
+
+struct cpu_cpuX_base {
+       struct dentry           *dentry;        /* Register dentry      */
+       int                     init;           /* Register index file  */
+};
+
+struct cpu_debug_range {
+       unsigned                min;            /* Register range min   */
+       unsigned                max;            /* Register range max   */
+       unsigned                flag;           /* Supported flags      */
+       unsigned                model;          /* Supported models     */
+};
+
+#endif /* _ASM_X86_CPU_DEBUG_H */
index dc27705f5443268cc69857590fde7e1d369f97a2..5623c50d67b268a00c10dc60f8a5013f04bb28aa 100644 (file)
@@ -91,7 +91,6 @@ static inline int desc_empty(const void *ptr)
 #define store_gdt(dtr) native_store_gdt(dtr)
 #define store_idt(dtr) native_store_idt(dtr)
 #define store_tr(tr) (tr = native_store_tr())
-#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
 
 #define load_TLS(t, cpu) native_load_tls(t, cpu)
 #define set_ldt native_set_ldt
@@ -112,6 +111,8 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
 }
 #endif /* CONFIG_PARAVIRT */
 
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
 static inline void native_write_idt_entry(gate_desc *idt, int entry,
                                          const gate_desc *gate)
 {
index bc68212c6bc0097f66ae21b936c0af59e4c3f705..fd8f9e2ca35f16ffdac6b9e609be0086ae055803 100644 (file)
@@ -1,22 +1,15 @@
 #ifndef _ASM_X86_DMI_H
 #define _ASM_X86_DMI_H
 
-#include <asm/io.h>
-
-#define DMI_MAX_DATA 2048
+#include <linux/compiler.h>
+#include <linux/init.h>
 
-extern int dmi_alloc_index;
-extern char dmi_alloc_data[DMI_MAX_DATA];
+#include <asm/io.h>
+#include <asm/setup.h>
 
-/* This is so early that there is no good way to allocate dynamic memory.
-   Allocate data in an BSS array. */
-static inline void *dmi_alloc(unsigned len)
+static __always_inline __init void *dmi_alloc(unsigned len)
 {
-       int idx = dmi_alloc_index;
-       if ((dmi_alloc_index + len) > DMI_MAX_DATA)
-               return NULL;
-       dmi_alloc_index += len;
-       return dmi_alloc_data + idx;
+       return extend_brk(len, sizeof(int));
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
index 00d41ce4c84453b4cbdf3659ab91a2d49176eba1..7ecba4d85089d58400ad8694c6b4024d85029eee 100644 (file)
@@ -72,7 +72,7 @@ extern int e820_all_mapped(u64 start, u64 end, unsigned type);
 extern void e820_add_region(u64 start, u64 size, int type);
 extern void e820_print_map(char *who);
 extern int
-sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
+sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map);
 extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
                               unsigned new_type);
 extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
index 854d538ae85797a27601df6ff051ac3c44fe38f5..c2e6bedaf25873b75df79f6a0f809b4ad9070787 100644 (file)
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
                 smp_invalidate_interrupt)
 #endif
 
+BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
+
 /*
  * every pentium local APIC has two 'local interrupts', with a
  * soft-definable vector attached to both interrupts, one of
index 176f058e715947cbd7b803b90654e14b307dc8ea..039db6aa8e0271951706ee20bd9b819098f8002a 100644 (file)
@@ -12,6 +12,7 @@ typedef struct {
        unsigned int apic_timer_irqs;   /* arch dependent */
        unsigned int irq_spurious_count;
 #endif
+       unsigned int generic_irqs;      /* arch dependent */
 #ifdef CONFIG_SMP
        unsigned int irq_resched_count;
        unsigned int irq_call_count;
index bf9276bea6602fca7bf2b93e6aa640a083e57f99..014c2b85ae45eae61201dcf19da9e04f2940e721 100644 (file)
@@ -63,6 +63,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
 void *kmap_atomic(struct page *page, enum km_type type);
 void kunmap_atomic(void *kvaddr, enum km_type type);
 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 struct page *kmap_atomic_to_page(void *ptr);
 
 #ifndef CONFIG_PARAVIRT
index 370e1c83bb49a27048d4641546bef5f76efd0faa..b762ea49bd703ab3b28958cf83b6908e825e57a4 100644 (file)
@@ -27,6 +27,7 @@
 
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
+extern void generic_interrupt(void);
 extern void error_interrupt(void);
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644 (file)
index 0000000..36fb1a6
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_INIT_32_H
+#define _ASM_X86_INIT_32_H
+
+#ifdef CONFIG_X86_32
+extern void __init early_ioremap_page_table_range_init(void);
+#endif
+
+extern unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+                            unsigned long end,
+                            unsigned long page_size_mask);
+
+
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
+
+#endif /* _ASM_X86_INIT_32_H */
index 59cb4a1317b774b0f89a87890355bf70157a016b..373cc2bbcad2cdfb17ef4e0448921da892691d0e 100644 (file)
@@ -162,7 +162,8 @@ extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
 #ifdef CONFIG_X86_64
-extern int save_mask_IO_APIC_setup(void);
+extern int save_IO_APIC_setup(void);
+extern void mask_IO_APIC_setup(void);
 extern void restore_IO_APIC_setup(void);
 extern void reinit_intr_remapped_IO_APIC(int);
 #endif
@@ -172,7 +173,7 @@ extern void probe_nr_irqs_gsi(void);
 extern int setup_ioapic_entry(int apic, int irq,
                              struct IO_APIC_route_entry *entry,
                              unsigned int destination, int trigger,
-                             int polarity, int vector);
+                             int polarity, int vector, int pin);
 extern void ioapic_write_entry(int apic, int pin,
                               struct IO_APIC_route_entry e);
 #else  /* !CONFIG_X86_IO_APIC */
index 107eb2196691134cd4f8bc2944e953a47e58805d..f38481bcd45538f26d640a2c460a3510246510d2 100644 (file)
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
 extern void fixup_irqs(void);
 #endif
 
+extern void (*generic_interrupt_extension)(void);
 extern void init_IRQ(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
index 20e1fd588dbfbfefc30b567b35625098002f105c..0396760fccb85be05de0d5666c6019316d386d55 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _ASM_X86_IRQ_REMAPPING_H
 #define _ASM_X86_IRQ_REMAPPING_H
 
-extern int x2apic;
-
 #define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
 
 #endif /* _ASM_X86_IRQ_REMAPPING_H */
index 8a285f356f8aef9e0e64b1210303c7d82acdfa19..3cbd79bbb47c82613341fca01ee6e174319342fe 100644 (file)
  */
 #define LOCAL_PERF_VECTOR              0xee
 
+/*
+ * Generic system vector for platform specific use
+ */
+#define GENERIC_INTERRUPT_VECTOR       0xed
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
index 0ceb6d19ed30a3285f6536799ef1cabe4af1cba9..317ff1703d0b0a5e7eaf17b94fd93640b8e62997 100644 (file)
@@ -9,13 +9,13 @@
 # define PAGES_NR              4
 #else
 # define PA_CONTROL_PAGE       0
-# define PA_TABLE_PAGE         1
-# define PAGES_NR              2
+# define VA_CONTROL_PAGE       1
+# define PA_TABLE_PAGE         2
+# define PA_SWAP_PAGE          3
+# define PAGES_NR              4
 #endif
 
-#ifdef CONFIG_X86_32
 # define KEXEC_CONTROL_CODE_MAX_SIZE   2048
-#endif
 
 #ifndef __ASSEMBLY__
 
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
                unsigned int has_pae,
                unsigned int preserve_context);
 #else
-NORET_TYPE void
+unsigned long
 relocate_kernel(unsigned long indirection_page,
                unsigned long page_list,
-               unsigned long start_address) ATTRIB_NORET;
+               unsigned long start_address,
+               unsigned int preserve_context);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
index 9320e2a8a26a46e99aa2c5eae5a6bd381cc90610..12d55e773eb605ff85c67c8cbf8d453697e01871 100644 (file)
@@ -1,14 +1,11 @@
 #ifndef _ASM_X86_LINKAGE_H
 #define _ASM_X86_LINKAGE_H
 
+#include <linux/stringify.h>
+
 #undef notrace
 #define notrace __attribute__((no_instrument_function))
 
-#ifdef CONFIG_X86_64
-#define __ALIGN .p2align 4,,15
-#define __ALIGN_STR ".p2align 4,,15"
-#endif
-
 #ifdef CONFIG_X86_32
 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
 /*
        __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
                              "g" (arg4), "g" (arg5), "g" (arg6))
 
-#endif
+#endif /* CONFIG_X86_32 */
+
+#ifdef __ASSEMBLY__
 
 #define GLOBAL(name)   \
        .globl name;    \
        name:
 
-#ifdef CONFIG_X86_ALIGNMENT_16
-#define __ALIGN .align 16,0x90
-#define __ALIGN_STR ".align 16,0x90"
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
+#define __ALIGN                .p2align 4, 0x90
+#define __ALIGN_STR    __stringify(__ALIGN)
 #endif
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ASM_X86_LINKAGE_H */
 
index 32c6e17b960b7aed994078dc1d5c1fb4e547b2a4..563933e06a35fa48f6c828d3884f9b35886ab4eb 100644 (file)
@@ -11,6 +11,8 @@
  */
 
 #define MCG_CTL_P       (1UL<<8)   /* MCG_CAP register available */
+#define MCG_EXT_P       (1ULL<<9)   /* Extended registers available */
+#define MCG_CMCI_P      (1ULL<<10)  /* CMCI supported */
 
 #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
 
 #include <asm/atomic.h>
 
+void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
 DECLARE_PER_CPU(struct sys_device, device_mce);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
+/*
+ * Supporting more than 128 banks would require escaping the
+ * predefined Linux-defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
 #endif
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
 
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+extern int mce_available(struct cpuinfo_x86 *c);
+
+void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
 
 extern void do_machine_check(struct pt_regs *, long);
+
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
+enum mcp_flags {
+       MCP_TIMESTAMP = (1 << 0),       /* log time stamp */
+       MCP_UC = (1 << 1),              /* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
 extern int mce_notify_user(void);
 
 #endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
 #else
 #define mcheck_init(c) do { } while (0)
 #endif
-extern void stop_mce(void);
-extern void restart_mce(void);
+
+extern void (*mce_threshold_vector)(void);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
index 6706b3006f13296ee02c5a4455268a967ca1c5f3..4cc48af23fefa3e9bec1da781d13ea5beaebd379 100644 (file)
@@ -47,6 +47,7 @@
 #define         MSI_ADDR_DEST_ID_MASK          0x00ffff0
 #define  MSI_ADDR_DEST_ID(dest)                (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
                                         MSI_ADDR_DEST_ID_MASK)
+#define MSI_ADDR_EXT_DEST_ID(dest)     ((dest) & 0xffffff00)
 
 #define MSI_ADDR_IR_EXT_INT            (1 << 4)
 #define MSI_ADDR_IR_SHV                        (1 << 3)
index f4e505f286bc85cf828679dd91b4a743c582edde..ec41fc16c167cb19ef617f9a5e7ec2dda6f88bd1 100644 (file)
 #define MSR_IA32_MC0_ADDR              0x00000402
 #define MSR_IA32_MC0_MISC              0x00000403
 
+/* These are consecutive and not in the normal 4-register MCE bank block */
+#define MSR_IA32_MC0_CTL2              0x00000280
+#define CMCI_EN                        (1ULL << 30)
+#define CMCI_THRESHOLD_MASK            0xffffULL
+
 #define MSR_P6_PERFCTR0                        0x000000c1
 #define MSR_P6_PERFCTR1                        0x000000c2
 #define MSR_P6_EVNTSEL0                        0x00000186
index f1e4a79a6e415702ab6930cedce937afc3e71d54..0f915ae649a717e99ebdfc079652b25fda315e72 100644 (file)
 #define __VIRTUAL_MASK_SHIFT   32
 #endif /* CONFIG_X86_PAE */
 
+/*
+ * Kernel image size is limited to 512 MB (see arch/x86/kernel/head_32.S)
+ */
+#define KERNEL_IMAGE_SIZE      (512 * 1024 * 1024)
+
 #ifndef __ASSEMBLY__
 
 /*
index 2d625da6603c36958cd51a130f6ae41ce2df0f3d..826ad37006ab1a075993ddbe69bd3281412b7f09 100644 (file)
 
 #ifndef __ASSEMBLY__
 
-struct pgprot;
-
 extern int page_is_ram(unsigned long pagenr);
 extern int devmem_is_allowed(unsigned long pagenr);
-extern void map_devmem(unsigned long pfn, unsigned long size,
-                      struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
-                        struct pgprot vma_prot);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
index 0617d5cc9712d00a3ca61ad80f515484cb7798e5..7727aa8b7dda99087ca5ced1046a92046ab7c4d5 100644 (file)
@@ -317,8 +317,6 @@ struct pv_mmu_ops {
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
        void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
-       void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
-                               pte_t *ptep, pte_t pte);
        void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
                          pte_t *ptep);
        void (*pmd_clear)(pmd_t *pmdp);
@@ -389,7 +387,7 @@ extern struct pv_lock_ops pv_lock_ops;
 
 #define paravirt_type(op)                              \
        [paravirt_typenum] "i" (PARAVIRT_PATCH(op)),    \
-       [paravirt_opptr] "m" (op)
+       [paravirt_opptr] "i" (&(op))
 #define paravirt_clobber(clobber)              \
        [paravirt_clobber] "i" (clobber)
 
@@ -443,7 +441,7 @@ int paravirt_disable_iospace(void);
  * offset into the paravirt_patch_template structure, and can therefore be
  * freely converted back into a structure offset.
  */
-#define PARAVIRT_CALL  "call *%[paravirt_opptr];"
+#define PARAVIRT_CALL  "call *%c[paravirt_opptr];"
 
 /*
  * These macros are intended to wrap calls through one of the paravirt
@@ -1365,13 +1363,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
                    pte.pte, pte.pte >> 32);
 }
 
-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
-                                  pte_t *ptep, pte_t pte)
-{
-       /* 5 arg words */
-       pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
-}
-
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
                             pte_t *ptep)
 {
@@ -1388,12 +1379,6 @@ static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
        set_pte(ptep, pte);
 }
 
-static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
-                                  pte_t *ptep, pte_t pte)
-{
-       set_pte(ptep, pte);
-}
-
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
                             pte_t *ptep)
 {
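The paravirt hunks above switch [paravirt_opptr] from an "m" memory operand to an "i" immediate holding the operation's address, and PARAVIRT_CALL now picks it up with the %c operand modifier. A minimal standalone sketch of the same trick (example_ops and save_fl are illustrative, not the kernel's structures): %c prints the constant address without the usual '$' immediate prefix, so the indirect call assembles as "call *example_ops" and the call site stays patchable.

struct example_pv_ops {
	void (*save_fl)(void);
};
extern struct example_pv_ops example_ops;

static inline void example_indirect_call(void)
{
	/* "i" makes the slot address a link-time constant; %c drops the '$' */
	asm volatile("call *%c[opptr];"
		     : : [opptr] "i" (&example_ops.save_fl)
		     : "memory");
}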
index b0e70056838e9d49c4e530f0396926c91abef1fb..2cd07b9422f49cbe8b87e561e140f0f31c28e77f 100644 (file)
@@ -2,6 +2,7 @@
 #define _ASM_X86_PAT_H
 
 #include <linux/types.h>
+#include <asm/pgtable_types.h>
 
 #ifdef CONFIG_X86_PAT
 extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
 
 extern int kernel_map_sync_memtype(u64 base, unsigned long size,
                unsigned long flag);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+                      struct pgprot vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+                        struct pgprot vma_prot);
 
 #endif /* _ASM_X86_PAT_H */
index c1774ac9da7a90f30dd47da6ac99e7b9b98494fd..2334982b339ebcb08e92846c54e2aeb375917984 100644 (file)
@@ -26,13 +26,6 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
        native_set_pte(ptep, pte);
 }
 
-static inline void native_set_pte_present(struct mm_struct *mm,
-                                         unsigned long addr,
-                                         pte_t *ptep, pte_t pte)
-{
-       native_set_pte(ptep, pte);
-}
-
 static inline void native_pmd_clear(pmd_t *pmdp)
 {
        native_set_pmd(pmdp, __pmd(0));
index 3f13cdf61156b4ed330ca60ea717b2044bc9faa6..177b0165ea01fa8aeeb1f8749252d422de52b3a6 100644 (file)
@@ -31,23 +31,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
        ptep->pte_low = pte.pte_low;
 }
 
-/*
- * Since this is only called on user PTEs, and the page fault handler
- * must handle the already racy situation of simultaneous page faults,
- * we are justified in merely clearing the PTE present bit, followed
- * by a set.  The ordering here is important.
- */
-static inline void native_set_pte_present(struct mm_struct *mm,
-                                         unsigned long addr,
-                                         pte_t *ptep, pte_t pte)
-{
-       ptep->pte_low = 0;
-       smp_wmb();
-       ptep->pte_high = pte.pte_high;
-       smp_wmb();
-       ptep->pte_low = pte.pte_low;
-}
-
 static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
        set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
index d0812e155f1d60da04c614bee8b97e2e33869876..29d96d168bc097195e9cbe66ab723c90929e1116 100644 (file)
@@ -31,8 +31,6 @@ extern struct list_head pgd_list;
 #define set_pte(ptep, pte)             native_set_pte(ptep, pte)
 #define set_pte_at(mm, addr, ptep, pte)        native_set_pte_at(mm, addr, ptep, pte)
 
-#define set_pte_present(mm, addr, ptep, pte)                           \
-       native_set_pte_present(mm, addr, ptep, pte)
 #define set_pte_atomic(ptep, pte)                                      \
        native_set_pte_atomic(ptep, pte)
 
index 97612fc7632f0ef95ebb3afecd3886bfe1b176ae..31bd120cf2a2ec10469929c43a110ea559f2adf7 100644 (file)
@@ -42,9 +42,6 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
  */
 #undef TEST_ACCESS_OK
 
-/* The boot page tables (all created as a single array) */
-extern unsigned long pg0[];
-
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else
index bd8df3b2fe04fc61fc0912569ebbf498969a028f..2733fad45f989bfd5a775c99e5decb35cfe8014a 100644 (file)
  * area for the same reason. ;)
  */
 #define VMALLOC_OFFSET (8 * 1024 * 1024)
+
+#ifndef __ASSEMBLER__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
 #define VMALLOC_START  ((unsigned long)high_memory + VMALLOC_OFFSET)
 #ifdef CONFIG_X86_PAE
 #define LAST_PKMAP 512
index 4d258ad76a0fc04925ac484c1fa9b0bfc47a1cb9..b8238dc8786d9f6b4291c2d7f912abbc0ea04e18 100644 (file)
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
 
 extern pteval_t __supported_pte_mask;
 extern int nx_enabled;
+extern void set_nx(void);
 
 #define pgprot_writecombine    pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
index 76139506c3e4f489d6252edd272e98d0e8637ead..ae85a8d66a30601a1a22c3b6b84f3df6d3056477 100644 (file)
@@ -75,9 +75,9 @@ struct cpuinfo_x86 {
 #else
        /* Number of 4K pages in DTLB/ITLB combined(in pages): */
        int                     x86_tlbsize;
+#endif
        __u8                    x86_virt_bits;
        __u8                    x86_phys_bits;
-#endif
        /* CPUID returned core id bits: */
        __u8                    x86_coreid_bits;
        /* Max extended CPUID function supported: */
@@ -391,6 +391,9 @@ DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
 DECLARE_INIT_PER_CPU(irq_stack_union);
 
 DECLARE_PER_CPU(char *, irq_stack_ptr);
+DECLARE_PER_CPU(unsigned int, irq_count);
+extern unsigned long kernel_eflags;
+extern asmlinkage void ignore_sysret(void);
 #else  /* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 DECLARE_PER_CPU(unsigned long, stack_canary);
index 2b8c5160388fb4863f05c33c15e42e22c53fc7a2..1b7ee5d673c23552514e7f6d6466bbcaa10ebeda 100644 (file)
@@ -1 +1,8 @@
+#ifndef _ASM_X86_SECTIONS_H
+#define _ASM_X86_SECTIONS_H
+
 #include <asm-generic/sections.h>
+
+extern char __brk_base[], __brk_limit[];
+
+#endif /* _ASM_X86_SECTIONS_H */
index 05c6f6b11fd5d25354ddcd7d3d07cf7cf7bf3fe5..bdc2ada05ae06056ad95e79bd8d6434d73510f5a 100644 (file)
@@ -64,7 +64,7 @@ extern void x86_quirk_time_init(void);
 #include <asm/bootparam.h>
 
 /* Interrupt control for vSMPowered x86_64 systems */
-#ifdef CONFIG_X86_VSMP
+#ifdef CONFIG_X86_64
 void vsmp_init(void);
 #else
 static inline void vsmp_init(void) { }
@@ -100,20 +100,51 @@ extern struct boot_params boot_params;
  */
 #define LOWMEMSIZE()   (0x9f000)
 
+/* exceedingly early brk-like allocator */
+extern unsigned long _brk_end;
+void *extend_brk(size_t size, size_t align);
+
+/*
+ * Reserve space in the brk section.  The name must be unique within
+ * the file, and somewhat descriptive.  The size is in bytes.  Must be
+ * used at file scope.
+ *
+ * (This uses a temp function to wrap the asm so we can pass it the
+ * size parameter; otherwise we wouldn't be able to.  We can't use a
+ * "section" attribute on a normal variable because it always ends up
+ * being @progbits, which ends up allocating space in the vmlinux
+ * executable.)
+ */
+#define RESERVE_BRK(name,sz)                                           \
+       static void __section(.discard) __used                          \
+       __brk_reservation_fn_##name##__(void) {                         \
+               asm volatile (                                          \
+                       ".pushsection .brk_reservation,\"aw\",@nobits;" \
+                       ".brk." #name ":"                               \
+                       " 1:.skip %c0;"                                 \
+                       " .size .brk." #name ", . - 1b;"                \
+                       " .popsection"                                  \
+                       : : "i" (sz));                                  \
+       }
+
 #ifdef __i386__
 
 void __init i386_start_kernel(void);
 extern void probe_roms(void);
 
-extern unsigned long init_pg_tables_start;
-extern unsigned long init_pg_tables_end;
-
 #else
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
 
 #endif /* __i386__ */
 #endif /* _SETUP */
+#else
+#define RESERVE_BRK(name,sz)                           \
+       .pushsection .brk_reservation,"aw",@nobits;     \
+.brk.name:                                             \
+1:     .skip sz;                                       \
+       .size .brk.name,.-1b;                           \
+       .popsection
 #endif /* __ASSEMBLY__ */
 #endif  /*  __KERNEL__  */
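
A hedged usage sketch of the brk reservation interface documented above; early_buf, the 8 KB size and early_buf_alloc() are illustrative names, not part of this patch. RESERVE_BRK() sits at file scope, and extend_brk() then parcels the reserved space out during early boot.

#include <linux/init.h>
#include <linux/types.h>
#include <asm/setup.h>

RESERVE_BRK(early_buf, 8192);		/* reserve 8 KB of brk space */

static void * __init early_buf_alloc(size_t size)
{
	/* hand back a word-aligned chunk of the reservation */
	return extend_brk(size, sizeof(long));
}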
 
index 777327ef05c19f0887c0b9b089a6f71135395e94..9f4dfba33b2899a17ee672c19a4891b09ac816dc 100644 (file)
@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define SCIR_CPU_ACTIVITY      0x02    /* not idle */
 #define SCIR_CPU_HB_INTERVAL   (HZ)    /* once per second */
 
+/* Loop through all installed blades */
+#define for_each_possible_blade(bid)           \
+       for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
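A hedged usage sketch of the new iterator; count_blades() is illustrative, while uv_num_possible_blades() is assumed to come from this same header.

static int count_blades(void)
{
	int bid, nblades = 0;

	for_each_possible_blade(bid)
		nblades++;
	return nblades;
}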
+
 /*
  * Macros for converting between kernel virtual addresses, socket local physical
  * addresses, and UV global physical addresses.
index 5e79ca694326cdeb85886d371565ad506fe6f0af..9c371e4a9fa6e122a25b6cf745fde11b027ba313 100644 (file)
@@ -296,6 +296,8 @@ HYPERVISOR_get_debugreg(int reg)
 static inline int
 HYPERVISOR_update_descriptor(u64 ma, u64 desc)
 {
+       if (sizeof(u64) == sizeof(long))
+               return _hypercall2(int, update_descriptor, ma, desc);
        return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
 }
 
index 95f216bbfaf16f6b8572c40c4cdfa756758f6be5..6e9c1f320acfae807d5bd8bf546030713c6a5f2c 100644 (file)
@@ -70,7 +70,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER)   += ftrace.o
 obj-$(CONFIG_KEXEC)            += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)            += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)       += crash_dump_$(BITS).o
-obj-$(CONFIG_X86_VSMP)         += vsmp_64.o
 obj-$(CONFIG_KPROBES)          += kprobes.o
 obj-$(CONFIG_MODULES)          += module_$(BITS).o
 obj-$(CONFIG_EFI)              += efi.o efi_$(BITS).o efi_stub_$(BITS).o
@@ -111,7 +110,7 @@ obj-$(CONFIG_SWIOTLB)                       += pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-       obj-$(CONFIG_X86_UV)            += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o
+       obj-$(CONFIG_X86_UV)            += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
        obj-$(CONFIG_X86_PM_TIMER)      += pmtimer_64.o
        obj-$(CONFIG_AUDIT)             += audit_64.o
 
@@ -120,4 +119,5 @@ ifeq ($(CONFIG_X86_64),y)
        obj-$(CONFIG_AMD_IOMMU)         += amd_iommu_init.o amd_iommu.o
 
        obj-$(CONFIG_PCI_MMCONFIG)      += mmconf-fam10h_64.o
+       obj-y                           += vsmp_64.o
 endif
index 6907b8e85d52c580083d47971e4fa9192259c7f9..4c80f15574335d6b02bc93a7ed797c088c1da6c2 100644 (file)
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
           that might execute the to be patched code.
           Other CPUs are not running. */
        stop_nmi();
-#ifdef CONFIG_X86_MCE
-       stop_mce();
-#endif
+
+       /*
+        * Don't stop machine check exceptions while patching.
+        * MCEs only happen when something got corrupted and in this
+        * case we must do something about the corruption.
+        * Ignoring it is worse than an unlikely patching race.
+        * Also, machine checks tend to be broadcast and if one CPU
+        * goes into machine check the others follow quickly, so we don't
+        * expect a machine check to cause undue problems during code
+        * patching.
+        */
 
        apply_alternatives(__alt_instructions, __alt_instructions_end);
 
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
                                (unsigned long)__smp_locks_end);
 
        restart_nmi();
-#ifdef CONFIG_X86_MCE
-       restart_mce();
-#endif
 }
 
 /**
index f9cecdfd05c5523cb6298a27a3d03706e92fb21a..85eb8e100818d2dfef0bae935684e6fe0253c888 100644 (file)
@@ -46,6 +46,7 @@
 #include <asm/idle.h>
 #include <asm/mtrr.h>
 #include <asm/smp.h>
+#include <asm/mce.h>
 
 unsigned int num_processors;
 
@@ -808,7 +809,7 @@ void clear_local_APIC(void)
        u32 v;
 
        /* APIC hasn't been mapped yet */
-       if (!apic_phys)
+       if (!x2apic && !apic_phys)
                return;
 
        maxlvt = lapic_get_maxlvt();
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
                apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
        }
 #endif
+#ifdef CONFIG_X86_MCE_INTEL
+       if (maxlvt >= 6) {
+               v = apic_read(APIC_LVTCMCI);
+               if (!(v & APIC_LVT_MASKED))
+                       apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+       }
+#endif
+
        /*
         * Clean APIC state for other OSs:
         */
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
        apic_write(APIC_LVT1, value);
 
        preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+       /* Recheck CMCI information after local APIC is up on CPU #0 */
+       if (smp_processor_id() == 0)
+               cmci_recheck();
+#endif
 }
 
 void __cpuinit end_local_APIC_setup(void)
@@ -1319,15 +1334,16 @@ void __init enable_IR_x2apic(void)
                return;
        }
 
-       local_irq_save(flags);
-       mask_8259A();
-
-       ret = save_mask_IO_APIC_setup();
+       ret = save_IO_APIC_setup();
        if (ret) {
                pr_info("Saving IO-APIC state failed: %d\n", ret);
                goto end;
        }
 
+       local_irq_save(flags);
+       mask_IO_APIC_setup();
+       mask_8259A();
+
        ret = enable_intr_remapping(1);
 
        if (ret && x2apic_preenabled) {
@@ -1352,10 +1368,10 @@ end_restore:
        else
                reinit_intr_remapped_IO_APIC(x2apic_preenabled);
 
-end:
        unmask_8259A();
        local_irq_restore(flags);
 
+end:
        if (!ret) {
                if (!x2apic_preenabled)
                        pr_info("Enabled x2apic and interrupt-remapping\n");
@@ -1508,12 +1524,10 @@ void __init early_init_lapic_mapping(void)
  */
 void __init init_apic_mappings(void)
 {
-#ifdef CONFIG_X86_X2APIC
        if (x2apic) {
                boot_cpu_physical_apicid = read_apic_id();
                return;
        }
-#endif
 
        /*
         * If no local APIC can be found then set up a fake all
@@ -1957,12 +1971,9 @@ static int lapic_resume(struct sys_device *dev)
 
        local_irq_save(flags);
 
-#ifdef CONFIG_X86_X2APIC
        if (x2apic)
                enable_x2apic();
-       else
-#endif
-       {
+       else {
                /*
                 * Make sure the APICBASE points to the right address
                 *
index f933822dba188fa8cac62e5b404ead77becf4b78..0014714ea97b6e58f8ceeb012aa0a1ae0d9e0975 100644 (file)
@@ -159,20 +159,6 @@ static int flat_apic_id_registered(void)
        return physid_isset(read_xapic_id(), phys_cpu_present_map);
 }
 
-static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
-{
-       return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
-}
-
-static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
-                                               const struct cpumask *andmask)
-{
-       unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
-       unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
-
-       return mask1 & mask2;
-}
-
 static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
 {
        return hard_smp_processor_id() >> index_msb;
@@ -213,8 +199,8 @@ struct apic apic_flat =  {
        .set_apic_id                    = set_apic_id,
        .apic_id_mask                   = 0xFFu << 24,
 
-       .cpu_mask_to_apicid             = flat_cpu_mask_to_apicid,
-       .cpu_mask_to_apicid_and         = flat_cpu_mask_to_apicid_and,
+       .cpu_mask_to_apicid             = default_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and         = default_cpu_mask_to_apicid_and,
 
        .send_IPI_mask                  = flat_send_IPI_mask,
        .send_IPI_mask_allbutself       = flat_send_IPI_mask_allbutself,
index 00e6071cefc436a792a71acf571860fe324a1ff6..da99ffcdfde62a006bcbcf84a496c4f4ea8de755 100644 (file)
@@ -389,6 +389,8 @@ struct io_apic {
        unsigned int index;
        unsigned int unused[3];
        unsigned int data;
+       unsigned int unused2[11];
+       unsigned int eoi;
 };
 
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
@@ -397,6 +399,12 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
                + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
 }
 
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       writel(vector, &io_apic->eoi);
+}
+
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 {
        struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -546,16 +554,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
 
                apic = entry->apic;
                pin = entry->pin;
-#ifdef CONFIG_INTR_REMAP
                /*
                 * With interrupt-remapping, destination information comes
                 * from interrupt-remapping table entry.
                 */
                if (!irq_remapped(irq))
                        io_apic_write(apic, 0x11 + pin*2, dest);
-#else
-               io_apic_write(apic, 0x11 + pin*2, dest);
-#endif
                reg = io_apic_read(apic, 0x10 + pin*2);
                reg &= ~IO_APIC_REDIR_VECTOR_MASK;
                reg |= vector;
@@ -588,10 +592,12 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
        if (assign_irq_vector(irq, cfg, mask))
                return BAD_APICID;
 
-       cpumask_and(desc->affinity, cfg->domain, mask);
+       /* do this check before desc->affinity gets updated */
        set_extra_move_desc(desc, mask);
 
-       return apic->cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
+       cpumask_copy(desc->affinity, mask);
+
+       return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
 }
 
 static void
@@ -849,9 +855,9 @@ __setup("pirq=", ioapic_pirq_setup);
 static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
 
 /*
- * Saves and masks all the unmasked IO-APIC RTE's
+ * Saves all the IO-APIC RTE's
  */
-int save_mask_IO_APIC_setup(void)
+int save_IO_APIC_setup(void)
 {
        union IO_APIC_reg_01 reg_01;
        unsigned long flags;
@@ -876,16 +882,9 @@ int save_mask_IO_APIC_setup(void)
        }
 
        for (apic = 0; apic < nr_ioapics; apic++)
-               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-                       struct IO_APIC_route_entry entry;
-
-                       entry = early_ioapic_entries[apic][pin] =
+               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+                       early_ioapic_entries[apic][pin] =
                                ioapic_read_entry(apic, pin);
-                       if (!entry.mask) {
-                               entry.mask = 1;
-                               ioapic_write_entry(apic, pin, entry);
-                       }
-               }
 
        return 0;
 
@@ -898,6 +897,25 @@ nomem:
        return -ENOMEM;
 }
 
+void mask_IO_APIC_setup(void)
+{
+       int apic, pin;
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+               if (!early_ioapic_entries[apic])
+                       break;
+               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+                       struct IO_APIC_route_entry entry;
+
+                       entry = early_ioapic_entries[apic][pin];
+                       if (!entry.mask) {
+                               entry.mask = 1;
+                               ioapic_write_entry(apic, pin, entry);
+                       }
+               }
+       }
+}
+
 void restore_IO_APIC_setup(void)
 {
        int apic, pin;
@@ -1411,9 +1429,7 @@ void __setup_vector_irq(int cpu)
 }
 
 static struct irq_chip ioapic_chip;
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip ir_ioapic_chip;
-#endif
 
 #define IOAPIC_AUTO     -1
 #define IOAPIC_EDGE     0
@@ -1452,7 +1468,6 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
        else
                desc->status &= ~IRQ_LEVEL;
 
-#ifdef CONFIG_INTR_REMAP
        if (irq_remapped(irq)) {
                desc->status |= IRQ_MOVE_PCNTXT;
                if (trigger)
@@ -1464,7 +1479,7 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
                                                      handle_edge_irq, "edge");
                return;
        }
-#endif
+
        if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
            trigger == IOAPIC_LEVEL)
                set_irq_chip_and_handler_name(irq, &ioapic_chip,
@@ -1478,14 +1493,13 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
 int setup_ioapic_entry(int apic_id, int irq,
                       struct IO_APIC_route_entry *entry,
                       unsigned int destination, int trigger,
-                      int polarity, int vector)
+                      int polarity, int vector, int pin)
 {
        /*
         * add it to the IO-APIC irq-routing table:
         */
        memset(entry,0,sizeof(*entry));
 
-#ifdef CONFIG_INTR_REMAP
        if (intr_remapping_enabled) {
                struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
                struct irte irte;
@@ -1504,7 +1518,14 @@ int setup_ioapic_entry(int apic_id, int irq,
 
                irte.present = 1;
                irte.dst_mode = apic->irq_dest_mode;
-               irte.trigger_mode = trigger;
+               /*
+                * Trigger mode in the IRTE will always be edge, and the
+                * actual level or edge trigger will be set up in the IO-APIC
+                * RTE. This will help simplify level triggered irq migration.
+                * For more details, see the comments above explaining IO-APIC
+                * irq migration in the presence of interrupt-remapping.
+                */
+               irte.trigger_mode = 0;
                irte.dlvry_mode = apic->irq_delivery_mode;
                irte.vector = vector;
                irte.dest_id = IRTE_DEST(destination);
@@ -1515,18 +1536,21 @@ int setup_ioapic_entry(int apic_id, int irq,
                ir_entry->zero = 0;
                ir_entry->format = 1;
                ir_entry->index = (index & 0x7fff);
-       } else
-#endif
-       {
+               /*
+                * IO-APIC RTE will be configured with virtual vector.
+                * irq handler will do the explicit EOI to the io-apic.
+                */
+               ir_entry->vector = pin;
+       } else {
                entry->delivery_mode = apic->irq_delivery_mode;
                entry->dest_mode = apic->irq_dest_mode;
                entry->dest = destination;
+               entry->vector = vector;
        }
 
        entry->mask = 0;                                /* enable IRQ */
        entry->trigger = trigger;
        entry->polarity = polarity;
-       entry->vector = vector;
 
        /* Mask level triggered irqs.
         * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
@@ -1561,7 +1585,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
 
 
        if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
-                              dest, trigger, polarity, cfg->vector)) {
+                              dest, trigger, polarity, cfg->vector, pin)) {
                printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
                       mp_ioapics[apic_id].apicid, pin);
                __clear_irq_vector(irq, cfg);
@@ -1642,10 +1666,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
 {
        struct IO_APIC_route_entry entry;
 
-#ifdef CONFIG_INTR_REMAP
        if (intr_remapping_enabled)
                return;
-#endif
 
        memset(&entry, 0, sizeof(entry));
 
@@ -2040,8 +2062,13 @@ void disable_IO_APIC(void)
         * If the i8259 is routed through an IOAPIC
         * Put that IOAPIC in virtual wire mode
         * so legacy interrupts can be delivered.
+        *
+        * With interrupt-remapping, for now we will use virtual wire A mode,
+        * as virtual wire B is a little more complex (both the IOAPIC RTE
+        * as well as the interrupt-remapping table entry would need to be
+        * configured).
+        * As this gets called during crash dump, keep this simple for now.
         */
-       if (ioapic_i8259.pin != -1) {
+       if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
                struct IO_APIC_route_entry entry;
 
                memset(&entry, 0, sizeof(entry));
@@ -2061,7 +2088,10 @@ void disable_IO_APIC(void)
                ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
        }
 
-       disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+       /*
+        * Use virtual wire A mode when interrupt remapping is enabled.
+        */
+       disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1);
 }
 
 #ifdef CONFIG_X86_32
@@ -2303,37 +2333,24 @@ static int ioapic_retrigger_irq(unsigned int irq)
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_INTR_REMAP
-static void ir_irq_migration(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
 
 /*
  * Migrate the IO-APIC irq in the presence of intr-remapping.
  *
- * For edge triggered, irq migration is a simple atomic update(of vector
- * and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we need to modify the io-apic RTE aswell with the update
- * vector information, along with modifying IRTE with vector and destination.
- * So irq migration for level triggered is little  bit more complex compared to
- * edge triggered migration. But the good news is, we use the same algorithm
- * for level triggered migration as we have today, only difference being,
- * we now initiate the irq migration from process context instead of the
- * interrupt context.
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update (of vector and cpu destination) of the IRTE and a flush of the
+ * hardware cache.
  *
- * In future, when we do a directed EOI (combined with cpu EOI broadcast
- * suppression) to the IO-APIC, level triggered irq migration will also be
- * as simple as edge triggered migration and we can do the irq migration
- * with a simple atomic update to IO-APIC RTE.
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information) by using a virtual vector (the io-apic pin
+ * number).  The real vector used to interrupt the cpu comes from the
+ * interrupt-remapping table entry.
  */
 static void
 migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg;
        struct irte irte;
-       int modify_ioapic_rte;
        unsigned int dest;
-       unsigned long flags;
        unsigned int irq;
 
        if (!cpumask_intersects(mask, cpu_online_mask))
@@ -2351,13 +2368,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 
        dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
 
-       modify_ioapic_rte = desc->status & IRQ_LEVEL;
-       if (modify_ioapic_rte) {
-               spin_lock_irqsave(&ioapic_lock, flags);
-               __target_IO_APIC_irq(irq, dest, cfg);
-               spin_unlock_irqrestore(&ioapic_lock, flags);
-       }
-
        irte.vector = cfg->vector;
        irte.dest_id = IRTE_DEST(dest);
 
@@ -2372,73 +2382,12 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
        cpumask_copy(desc->affinity, mask);
 }
 
-static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
-{
-       int ret = -1;
-       struct irq_cfg *cfg = desc->chip_data;
-
-       mask_IO_APIC_irq_desc(desc);
-
-       if (io_apic_level_ack_pending(cfg)) {
-               /*
-                * Interrupt in progress. Migrating irq now will change the
-                * vector information in the IO-APIC RTE and that will confuse
-                * the EOI broadcast performed by cpu.
-                * So, delay the irq migration to the next instance.
-                */
-               schedule_delayed_work(&ir_migration_work, 1);
-               goto unmask;
-       }
-
-       /* everthing is clear. we have right of way */
-       migrate_ioapic_irq_desc(desc, desc->pending_mask);
-
-       ret = 0;
-       desc->status &= ~IRQ_MOVE_PENDING;
-       cpumask_clear(desc->pending_mask);
-
-unmask:
-       unmask_IO_APIC_irq_desc(desc);
-
-       return ret;
-}
-
-static void ir_irq_migration(struct work_struct *work)
-{
-       unsigned int irq;
-       struct irq_desc *desc;
-
-       for_each_irq_desc(irq, desc) {
-               if (desc->status & IRQ_MOVE_PENDING) {
-                       unsigned long flags;
-
-                       spin_lock_irqsave(&desc->lock, flags);
-                       if (!desc->chip->set_affinity ||
-                           !(desc->status & IRQ_MOVE_PENDING)) {
-                               desc->status &= ~IRQ_MOVE_PENDING;
-                               spin_unlock_irqrestore(&desc->lock, flags);
-                               continue;
-                       }
-
-                       desc->chip->set_affinity(irq, desc->pending_mask);
-                       spin_unlock_irqrestore(&desc->lock, flags);
-               }
-       }
-}
-
 /*
  * Migrates the IRQ destination in the process context.
  */
 static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
                                            const struct cpumask *mask)
 {
-       if (desc->status & IRQ_LEVEL) {
-               desc->status |= IRQ_MOVE_PENDING;
-               cpumask_copy(desc->pending_mask, mask);
-               migrate_irq_remapped_level_desc(desc);
-               return;
-       }
-
        migrate_ioapic_irq_desc(desc, mask);
 }
 static void set_ir_ioapic_affinity_irq(unsigned int irq,
@@ -2448,6 +2397,11 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
 
        set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
+#else
+static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+                                                  const struct cpumask *mask)
+{
+}
 #endif
 
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
@@ -2461,6 +2415,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
        me = smp_processor_id();
        for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
                unsigned int irq;
+               unsigned int irr;
                struct irq_desc *desc;
                struct irq_cfg *cfg;
                irq = __get_cpu_var(vector_irq)[vector];
@@ -2480,6 +2435,18 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
                if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
                        goto unlock;
 
+               irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+               /*
+                * Check if the vector that needs to be cleaned up is
+                * registered in the cpu's IRR. If so, then this is not
+                * the best time to clean it up. Let's clean it up in the
+                * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+                * to myself.
+                */
+               if (irr  & (1 << (vector % 32))) {
+                       apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+                       goto unlock;
+               }
                __get_cpu_var(vector_irq)[vector] = -1;
                cfg->move_cleanup_count--;
 unlock:
@@ -2529,9 +2496,44 @@ static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 
 #ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+       int apic, pin;
+       struct irq_pin_list *entry;
+
+       entry = cfg->irq_2_pin;
+       for (;;) {
+
+               if (!entry)
+                       break;
+
+               apic = entry->apic;
+               pin = entry->pin;
+               io_apic_eoi(apic, pin);
+               entry = entry->next;
+       }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+       struct irq_cfg *cfg;
+       unsigned long flags;
+       unsigned int irq;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __eoi_ioapic_irq(irq, cfg);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 static void ack_x2apic_level(unsigned int irq)
 {
+       struct irq_desc *desc = irq_to_desc(irq);
        ack_x2APIC_irq();
+       eoi_ioapic_irq(desc);
 }
 
 static void ack_x2apic_edge(unsigned int irq)
@@ -2662,20 +2664,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
        .retrigger      = ioapic_retrigger_irq,
 };
 
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip ir_ioapic_chip __read_mostly = {
        .name           = "IR-IO-APIC",
        .startup        = startup_ioapic_irq,
        .mask           = mask_IO_APIC_irq,
        .unmask         = unmask_IO_APIC_irq,
+#ifdef CONFIG_INTR_REMAP
        .ack            = ack_x2apic_edge,
        .eoi            = ack_x2apic_level,
 #ifdef CONFIG_SMP
        .set_affinity   = set_ir_ioapic_affinity_irq,
+#endif
 #endif
        .retrigger      = ioapic_retrigger_irq,
 };
-#endif
 
 static inline void init_IO_APIC_traps(void)
 {
@@ -2901,10 +2903,8 @@ static inline void __init check_timer(void)
         * 8259A.
         */
        if (pin1 == -1) {
-#ifdef CONFIG_INTR_REMAP
                if (intr_remapping_enabled)
                        panic("BIOS bug: timer not connected to IO-APIC");
-#endif
                pin1 = pin2;
                apic1 = apic2;
                no_pin1 = 1;
@@ -2940,10 +2940,8 @@ static inline void __init check_timer(void)
                                clear_IO_APIC_pin(0, pin1);
                        goto out;
                }
-#ifdef CONFIG_INTR_REMAP
                if (intr_remapping_enabled)
                        panic("timer doesn't work through Interrupt-remapped IO-APIC");
-#endif
                local_irq_disable();
                clear_IO_APIC_pin(apic1, pin1);
                if (!no_pin1)
@@ -3237,9 +3235,7 @@ void destroy_irq(unsigned int irq)
        if (desc)
                desc->chip_data = cfg;
 
-#ifdef CONFIG_INTR_REMAP
        free_irte(irq);
-#endif
        spin_lock_irqsave(&vector_lock, flags);
        __clear_irq_vector(irq, cfg);
        spin_unlock_irqrestore(&vector_lock, flags);
@@ -3265,7 +3261,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 
        dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
 
-#ifdef CONFIG_INTR_REMAP
        if (irq_remapped(irq)) {
                struct irte irte;
                int ir_index;
@@ -3291,10 +3286,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
                                  MSI_ADDR_IR_SHV |
                                  MSI_ADDR_IR_INDEX1(ir_index) |
                                  MSI_ADDR_IR_INDEX2(ir_index);
-       } else
-#endif
-       {
-               msg->address_hi = MSI_ADDR_BASE_HI;
+       } else {
+               if (x2apic_enabled())
+                       msg->address_hi = MSI_ADDR_BASE_HI |
+                                         MSI_ADDR_EXT_DEST_ID(dest);
+               else
+                       msg->address_hi = MSI_ADDR_BASE_HI;
+
                msg->address_lo =
                        MSI_ADDR_BASE_LO |
                        ((apic->irq_dest_mode == 0) ?
@@ -3394,14 +3392,15 @@ static struct irq_chip msi_chip = {
        .retrigger      = ioapic_retrigger_irq,
 };
 
-#ifdef CONFIG_INTR_REMAP
 static struct irq_chip msi_ir_chip = {
        .name           = "IR-PCI-MSI",
        .unmask         = unmask_msi_irq,
        .mask           = mask_msi_irq,
+#ifdef CONFIG_INTR_REMAP
        .ack            = ack_x2apic_edge,
 #ifdef CONFIG_SMP
        .set_affinity   = ir_set_msi_irq_affinity,
+#endif
 #endif
        .retrigger      = ioapic_retrigger_irq,
 };
@@ -3432,7 +3431,6 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
        }
        return index;
 }
-#endif
 
 static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
@@ -3446,7 +3444,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
        set_irq_msi(irq, msidesc);
        write_msi_msg(irq, &msg);
 
-#ifdef CONFIG_INTR_REMAP
        if (irq_remapped(irq)) {
                struct irq_desc *desc = irq_to_desc(irq);
                /*
@@ -3455,7 +3452,6 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
                desc->status |= IRQ_MOVE_PCNTXT;
                set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
        } else
-#endif
                set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
 
        dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
@@ -3469,11 +3465,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
        int ret, sub_handle;
        struct msi_desc *msidesc;
        unsigned int irq_want;
-
-#ifdef CONFIG_INTR_REMAP
-       struct intel_iommu *iommu = 0;
+       struct intel_iommu *iommu = NULL;
        int index = 0;
-#endif
 
        irq_want = nr_irqs_gsi;
        sub_handle = 0;
@@ -3482,7 +3475,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
                if (irq == 0)
                        return -1;
                irq_want = irq + 1;
-#ifdef CONFIG_INTR_REMAP
                if (!intr_remapping_enabled)
                        goto no_ir;
 
@@ -3510,7 +3502,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
                        set_irte_irq(irq, iommu, index, sub_handle);
                }
 no_ir:
-#endif
                ret = setup_msi_irq(dev, msidesc, irq);
                if (ret < 0)
                        goto error;
@@ -3528,7 +3519,7 @@ void arch_teardown_msi_irq(unsigned int irq)
        destroy_irq(irq);
 }
 
-#ifdef CONFIG_DMAR
+#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
 #ifdef CONFIG_SMP
 static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
@@ -3609,7 +3600,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 #endif /* CONFIG_SMP */
 
-struct irq_chip hpet_msi_type = {
+static struct irq_chip hpet_msi_type = {
        .name = "HPET_MSI",
        .unmask = hpet_msi_unmask,
        .mask = hpet_msi_mask,
@@ -4045,11 +4036,9 @@ void __init setup_ioapic_dest(void)
                        else
                                mask = apic->target_cpus();
 
-#ifdef CONFIG_INTR_REMAP
                        if (intr_remapping_enabled)
                                set_ir_ioapic_affinity_irq_desc(desc, mask);
                        else
-#endif
                                set_ioapic_affinity_irq_desc(desc, mask);
                }
 
@@ -4142,9 +4131,12 @@ static int __init ioapic_insert_resources(void)
        struct resource *r = ioapic_resources;
 
        if (!r) {
-               printk(KERN_ERR
-                      "IO APIC resources could be not be allocated.\n");
-               return -1;
+               if (nr_ioapics > 0) {
+                       printk(KERN_ERR
+                               "IO APIC resources couldn't be allocated.\n");
+                       return -1;
+               }
+               return 0;
        }
 
        for (i = 0; i < nr_ioapics; i++) {
index 8d7748efe6a801859e2cda4c96354a57d69237ea..1783652bb0e56c8140e411a4ba5643a64ab4d0fc 100644 (file)
@@ -68,6 +68,13 @@ void __init default_setup_apic_routing(void)
                        apic = &apic_physflat;
                printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
        }
+
+        * Now that the apic routing model is selected, configure the
+        * Now that apic routing model is selected, configure the
+        * fault handling for intr remapping.
+        */
+       if (intr_remapping_enabled)
+               enable_drhd_fault_handling();
 }
 
 /* Same for both flat and physical. */
index 8fb87b6dd63330c193890ac3ae5d704eda096ba2..4a903e2f0d179d68d6a30f5afe7c1b0543a45870 100644 (file)
@@ -57,6 +57,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
        unsigned long query_cpu;
        unsigned long flags;
 
+       x2apic_wrmsr_fence();
+
        local_irq_save(flags);
        for_each_cpu(query_cpu, mask) {
                __x2apic_send_IPI_dest(
@@ -73,6 +75,8 @@ static void
        unsigned long query_cpu;
        unsigned long flags;
 
+       x2apic_wrmsr_fence();
+
        local_irq_save(flags);
        for_each_cpu(query_cpu, mask) {
                if (query_cpu == this_cpu)
@@ -90,6 +94,8 @@ static void x2apic_send_IPI_allbutself(int vector)
        unsigned long query_cpu;
        unsigned long flags;
 
+       x2apic_wrmsr_fence();
+
        local_irq_save(flags);
        for_each_online_cpu(query_cpu) {
                if (query_cpu == this_cpu)
index 23625b9f98b28530657ca7d572cf4b7a4a60a378..a284359627e7fe23ed8a84ab00be47e0a24ae9bf 100644 (file)
@@ -58,6 +58,8 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
        unsigned long query_cpu;
        unsigned long flags;
 
+       x2apic_wrmsr_fence();
+
        local_irq_save(flags);
        for_each_cpu(query_cpu, mask) {
                __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
@@ -73,6 +75,8 @@ static void
        unsigned long query_cpu;
        unsigned long flags;
 
+       x2apic_wrmsr_fence();
+
        local_irq_save(flags);
        for_each_cpu(query_cpu, mask) {
                if (query_cpu != this_cpu)
@@ -89,6 +93,8 @@ static void x2apic_send_IPI_allbutself(int vector)
        unsigned long query_cpu;
        unsigned long flags;
 
+       x2apic_wrmsr_fence();
+
        local_irq_save(flags);
        for_each_online_cpu(query_cpu) {
                if (query_cpu == this_cpu)
index 2ac0ab71412a59f9c9e0b025d88f0acf9e7e39ff..fc999e6fc46a34cf1417cd600c11a0e07af1acfd 100644 (file)
@@ -83,15 +83,15 @@ void __init setup_bios_corruption_check(void)
                u64 size;
                addr = find_e820_area_size(addr, &size, PAGE_SIZE);
 
-               if (addr == 0)
+               if (!(addr + 1))
+                       break;
+
+               if (addr >= corruption_check_size)
                        break;
 
                if ((addr + size) > corruption_check_size)
                        size = corruption_check_size - addr;
 
-               if (size == 0)
-                       break;
-
                e820_update_range(addr, size, E820_RAM, E820_RESERVED);
                scan_areas[num_scan_areas].addr = addr;
                scan_areas[num_scan_areas].size = size;
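For clarity on the new exit test above: this assumes find_e820_area_size() reports "no area found" by returning all-ones (~0ULL), so addr + 1 wraps to zero; the old addr == 0 check never fired for that value. A small illustration, with e820_area_not_found() being an illustrative helper only:

#include <linux/types.h>

static inline bool e820_area_not_found(u64 addr)
{
	return !(addr + 1);	/* true exactly when addr == ~0ULL */
}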
index 82db7f45e2de655430009b58fb776c11fc9b74de..4e242f9a06e43a3bc479e40237997b3c0e1a53c4 100644 (file)
@@ -14,11 +14,12 @@ obj-y                       += vmware.o hypervisor.o
 obj-$(CONFIG_X86_32)   += bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)   += bugs_64.o
 
+obj-$(CONFIG_X86_CPU_DEBUG)            += cpu_debug.o
+
 obj-$(CONFIG_CPU_SUP_INTEL)            += intel.o
 obj-$(CONFIG_CPU_SUP_AMD)              += amd.o
 obj-$(CONFIG_CPU_SUP_CYRIX_32)         += cyrix.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_32)       += centaur.o
-obj-$(CONFIG_CPU_SUP_CENTAUR_64)       += centaur_64.o
+obj-$(CONFIG_CPU_SUP_CENTAUR)          += centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)     += transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)           += umc.o
 
index 6882a735d9c0ad70b0c6386d11f53d5d50673d40..8220ae69849d4aa3e5405a412a6bca2b03c4b958 100644 (file)
@@ -29,7 +29,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
        u32 regs[4];
        const struct cpuid_bit *cb;
 
-       static const struct cpuid_bit cpuid_bits[] = {
+       static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
                { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
                { 0, 0, 0, 0 }
        };
index 25423a5b80ed28a7058bd01cf3f5f0358fb167b4..7e4a459daa644f30309f50e9ca65ece4be85fad7 100644 (file)
@@ -5,6 +5,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
        }
 }
 
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+       /* is this call coming from identify_secondary_cpu()? */
+       if (c->cpu_index == boot_cpu_id)
+               return;
+
+       /*
+        * Certain Athlons might work (for various values of 'work') in SMP
+        * but they are not certified as MP capable.
+        */
+       /* Athlon 660/661 is valid. */
+       if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+           (c->x86_mask == 1)))
+               goto valid_k7;
+
+       /* Duron 670 is valid */
+       if ((c->x86_model == 7) && (c->x86_mask == 0))
+               goto valid_k7;
+
+       /*
+        * Athlon 662, Duron 671, and Athlon >model 7 have the MP capability
+        * bit. It's worth noting that the A5 stepping (662) of some
+        * Athlon XP's have the MP bit set.
+        * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+        * more.
+        */
+       if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+           ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+            (c->x86_model > 7))
+               if (cpu_has_mp)
+                       goto valid_k7;
+
+       /* If we get here, not a certified SMP capable AMD system. */
+
+       /*
+        * Don't taint if we are running SMP kernel on a single non-MP
+        * approved Athlon
+        */
+       WARN_ONCE(1, "WARNING: This combination of AMD "
+               "processors is not suitable for SMP.\n");
+       if (!test_taint(TAINT_UNSAFE_SMP))
+               add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+       ;
+#endif
+}
+
 static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 {
        u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
        }
 
        set_cpu_cap(c, X86_FEATURE_K7);
+
+       amd_k7_smp_check(c);
 }
 #endif
 
@@ -450,7 +502,7 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
 }
 #endif
 
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
        .c_vendor       = "AMD",
        .c_ident        = { "AuthenticAMD" },
 #ifdef CONFIG_X86_32
index 89bfdd9cacc69a363fc0d9f024dc45b97310bf9e..c95e831bb0954d8c5c50520c27aa4c0e4819f548 100644 (file)
@@ -1,11 +1,11 @@
+#include <linux/bitops.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <linux/bitops.h>
 
 #include <asm/processor.h>
-#include <asm/msr.h>
 #include <asm/e820.h>
 #include <asm/mtrr.h>
+#include <asm/msr.h>
 
 #include "cpu.h"
 
@@ -276,7 +276,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
                 */
                c->x86_capability[5] = cpuid_edx(0xC0000001);
        }
-
+#ifdef CONFIG_X86_32
        /* Cyrix III family needs CX8 & PGE explicitly enabled. */
        if (c->x86_model >= 6 && c->x86_model <= 9) {
                rdmsr(MSR_VIA_FCR, lo, hi);
@@ -288,6 +288,11 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
        /* Before Nehemiah, the C3's had 3dNOW! */
        if (c->x86_model >= 6 && c->x86_model < 9)
                set_cpu_cap(c, X86_FEATURE_3DNOW);
+#endif
+       if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+               c->x86_cache_alignment = c->x86_clflush_size * 2;
+               set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+       }
 
        display_cacheinfo(c);
 }
@@ -316,16 +321,25 @@ enum {
 static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
 {
        switch (c->x86) {
+#ifdef CONFIG_X86_32
        case 5:
                /* Emulate MTRRs using Centaur's MCR. */
                set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
                break;
+#endif
+       case 6:
+               if (c->x86_model >= 0xf)
+                       set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+               break;
        }
+#ifdef CONFIG_X86_64
+       set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
 }
 
 static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
-
+#ifdef CONFIG_X86_32
        char *name;
        u32  fcr_set = 0;
        u32  fcr_clr = 0;
@@ -337,8 +351,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
         * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
         */
        clear_cpu_cap(c, 0*32+31);
-
+#endif
+       early_init_centaur(c);
        switch (c->x86) {
+#ifdef CONFIG_X86_32
        case 5:
                switch (c->x86_model) {
                case 4:
@@ -442,16 +458,20 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
                }
                sprintf(c->x86_model_id, "WinChip %s", name);
                break;
-
+#endif
        case 6:
                init_c3(c);
                break;
        }
+#ifdef CONFIG_X86_64
+       set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
 }
 
 static unsigned int __cpuinit
 centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 {
+#ifdef CONFIG_X86_32
        /* VIA C3 CPUs (670-68F) need further shifting. */
        if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
                size >>= 8;
@@ -464,11 +484,11 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
        if ((c->x86 == 6) && (c->x86_model == 9) &&
                                (c->x86_mask == 1) && (size == 65))
                size -= 1;
-
+#endif
        return size;
 }
 
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
        .c_vendor       = "Centaur",
        .c_ident        = { "CentaurHauls" },
        .c_early_init   = early_init_centaur,
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
deleted file mode 100644 (file)
index a1625f5..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
-       if (c->x86 == 0x6 && c->x86_model >= 0xf)
-               set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
-       set_cpu_cap(c, X86_FEATURE_SYSENTER32);
-}
-
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
-{
-       early_init_centaur(c);
-
-       if (c->x86 == 0x6 && c->x86_model >= 0xf) {
-               c->x86_cache_alignment = c->x86_clflush_size * 2;
-               set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-       }
-       set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-}
-
-static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
-       .c_vendor       = "Centaur",
-       .c_ident        = { "CentaurHauls" },
-       .c_early_init   = early_init_centaur,
-       .c_init         = init_centaur,
-       .c_x86_vendor   = X86_VENDOR_CENTAUR,
-};
-
-cpu_dev_register(centaur_cpu_dev);
-
index 826d5c87627826911ac393f215c0cecb03f50ef9..e2962cc1e27b742965f6af45a8cfdcf9b4c1a0b8 100644 (file)
@@ -1,52 +1,52 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
 #include <linux/bootmem.h>
+#include <linux/linkage.h>
 #include <linux/bitops.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
 #include <linux/smp.h>
-#include <linux/percpu.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/linkage.h>
+#include <linux/io.h>
+
+#include <asm/stackprotector.h>
 #include <asm/mmu_context.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/topology.h>
+#include <asm/cpumask.h>
+#include <asm/pgtable.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
 #include <asm/mtrr.h>
+#include <asm/numa.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
 #include <asm/mce.h>
+#include <asm/msr.h>
 #include <asm/pat.h>
-#include <asm/asm.h>
-#include <asm/numa.h>
 #include <asm/smp.h>
-#include <asm/cpu.h>
-#include <asm/cpumask.h>
-#include <asm/apic.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/uv/uv.h>
 #endif
 
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/hypervisor.h>
-#include <asm/stackprotector.h>
-
 #include "cpu.h"
 
 #ifdef CONFIG_X86_64
 
 /* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_callin_mask;
-cpumask_var_t cpu_callout_mask;
 cpumask_var_t cpu_initialized_mask;
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
 
 /* representing cpus for which sibling maps can be computed */
 cpumask_var_t cpu_sibling_setup_mask;
@@ -62,15 +62,15 @@ void __init setup_cpu_local_masks(void)
 
 #else /* CONFIG_X86_32 */
 
-cpumask_t cpu_callin_map;
+cpumask_t cpu_sibling_setup_map;
 cpumask_t cpu_callout_map;
 cpumask_t cpu_initialized;
-cpumask_t cpu_sibling_setup_map;
+cpumask_t cpu_callin_map;
 
 #endif /* CONFIG_X86_32 */
 
 
-static struct cpu_dev *this_cpu __cpuinitdata;
+static const struct cpu_dev *this_cpu __cpuinitdata;
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
@@ -79,48 +79,48 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
         * IRET will check the segment types  kkeil 2000/10/28
         * Also sysret mandates a special GDT layout
         *
-        * The TLS descriptors are currently at a different place compared to i386.
+        * TLS descriptors are currently at a different place compared to i386.
         * Hopefully nobody expects them at a fixed place (Wine?)
         */
-       [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-       [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-       [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-       [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-       [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-       [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+       [GDT_ENTRY_KERNEL32_CS]         = { { { 0x0000ffff, 0x00cf9b00 } } },
+       [GDT_ENTRY_KERNEL_CS]           = { { { 0x0000ffff, 0x00af9b00 } } },
+       [GDT_ENTRY_KERNEL_DS]           = { { { 0x0000ffff, 0x00cf9300 } } },
+       [GDT_ENTRY_DEFAULT_USER32_CS]   = { { { 0x0000ffff, 0x00cffb00 } } },
+       [GDT_ENTRY_DEFAULT_USER_DS]     = { { { 0x0000ffff, 0x00cff300 } } },
+       [GDT_ENTRY_DEFAULT_USER_CS]     = { { { 0x0000ffff, 0x00affb00 } } },
 #else
-       [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
-       [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
-       [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
-       [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
+       [GDT_ENTRY_KERNEL_CS]           = { { { 0x0000ffff, 0x00cf9a00 } } },
+       [GDT_ENTRY_KERNEL_DS]           = { { { 0x0000ffff, 0x00cf9200 } } },
+       [GDT_ENTRY_DEFAULT_USER_CS]     = { { { 0x0000ffff, 0x00cffa00 } } },
+       [GDT_ENTRY_DEFAULT_USER_DS]     = { { { 0x0000ffff, 0x00cff200 } } },
        /*
         * Segments used for calling PnP BIOS have byte granularity.
         * The code segments and data segments have fixed 64k limits,
         * the transfer segment sizes are set at run time.
         */
        /* 32-bit code */
-       [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
+       [GDT_ENTRY_PNPBIOS_CS32]        = { { { 0x0000ffff, 0x00409a00 } } },
        /* 16-bit code */
-       [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
+       [GDT_ENTRY_PNPBIOS_CS16]        = { { { 0x0000ffff, 0x00009a00 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_DS]          = { { { 0x0000ffff, 0x00009200 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_TS1]         = { { { 0x00000000, 0x00009200 } } },
        /* 16-bit data */
-       [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
+       [GDT_ENTRY_PNPBIOS_TS2]         = { { { 0x00000000, 0x00009200 } } },
        /*
         * The APM segments have byte granularity and their bases
         * are set at run time.  All have 64k limits.
         */
        /* 32-bit code */
-       [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
+       [GDT_ENTRY_APMBIOS_BASE]        = { { { 0x0000ffff, 0x00409a00 } } },
        /* 16-bit code */
-       [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
+       [GDT_ENTRY_APMBIOS_BASE+1]      = { { { 0x0000ffff, 0x00009a00 } } },
        /* data */
-       [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
+       [GDT_ENTRY_APMBIOS_BASE+2]      = { { { 0x0000ffff, 0x00409200 } } },
 
-       [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-       [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
+       [GDT_ENTRY_ESPFIX_SS]           = { { { 0x00000000, 0x00c09200 } } },
+       [GDT_ENTRY_PERCPU]              = { { { 0x0000ffff, 0x00cf9200 } } },
        GDT_STACK_CANARY_INIT
 #endif
 } };
@@ -164,16 +164,17 @@ static inline int flag_is_changeable_p(u32 flag)
         * the CPUID. Add "volatile" to not allow gcc to
         * optimize the subsequent calls to this function.
         */
-       asm volatile ("pushfl\n\t"
-                     "pushfl\n\t"
-                     "popl %0\n\t"
-                     "movl %0,%1\n\t"
-                     "xorl %2,%0\n\t"
-                     "pushl %0\n\t"
-                     "popfl\n\t"
-                     "pushfl\n\t"
-                     "popl %0\n\t"
-                     "popfl\n\t"
+       asm volatile ("pushfl           \n\t"
+                     "pushfl           \n\t"
+                     "popl %0          \n\t"
+                     "movl %0, %1      \n\t"
+                     "xorl %2, %0      \n\t"
+                     "pushl %0         \n\t"
+                     "popfl            \n\t"
+                     "pushfl           \n\t"
+                     "popl %0          \n\t"
+                     "popfl            \n\t"
+
                      : "=&r" (f1), "=&r" (f2)
                      : "ir" (flag));
 
@@ -188,18 +189,22 @@ static int __cpuinit have_cpuid_p(void)
 
 static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 {
-       if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
-               /* Disable processor serial number */
-               unsigned long lo, hi;
-               rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-               lo |= 0x200000;
-               wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-               printk(KERN_NOTICE "CPU serial number disabled.\n");
-               clear_cpu_cap(c, X86_FEATURE_PN);
-
-               /* Disabling the serial number may affect the cpuid level */
-               c->cpuid_level = cpuid_eax(0);
-       }
+       unsigned long lo, hi;
+
+       if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+               return;
+
+       /* Disable processor serial number: */
+
+       rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+       lo |= 0x200000;
+       wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+       printk(KERN_NOTICE "CPU serial number disabled.\n");
+       clear_cpu_cap(c, X86_FEATURE_PN);
+
+       /* Disabling the serial number may affect the cpuid level */
+       c->cpuid_level = cpuid_eax(0);
 }
 
 static int __init x86_serial_nr_setup(char *s)
@@ -232,6 +237,7 @@ struct cpuid_dependent_feature {
        u32 feature;
        u32 level;
 };
+
 static const struct cpuid_dependent_feature __cpuinitconst
 cpuid_dependent_features[] = {
        { X86_FEATURE_MWAIT,            0x00000005 },
@@ -243,7 +249,11 @@ cpuid_dependent_features[] = {
 static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
 {
        const struct cpuid_dependent_feature *df;
+
        for (df = cpuid_dependent_features; df->feature; df++) {
+
+               if (!cpu_has(c, df->feature))
+                       continue;
                /*
                 * Note: cpuid_level is set to -1 if unavailable, but
                 * extended_cpuid_level is set to 0 if unavailable
@@ -251,32 +261,32 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
                 * when signed; hence the weird messing around with
                 * signs here...
                 */
-               if (cpu_has(c, df->feature) &&
-                   ((s32)df->level < 0 ?
+               if (!((s32)df->level < 0 ?
                     (u32)df->level > (u32)c->extended_cpuid_level :
-                    (s32)df->level > (s32)c->cpuid_level)) {
-                       clear_cpu_cap(c, df->feature);
-                       if (warn)
-                               printk(KERN_WARNING
-                                      "CPU: CPU feature %s disabled "
-                                      "due to lack of CPUID level 0x%x\n",
-                                      x86_cap_flags[df->feature],
-                                      df->level);
-               }
+                    (s32)df->level > (s32)c->cpuid_level))
+                       continue;
+
+               clear_cpu_cap(c, df->feature);
+               if (!warn)
+                       continue;
+
+               printk(KERN_WARNING
+                      "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+                               x86_cap_flags[df->feature], df->level);
        }
 }
 
 /*
  * Naming convention should be: <Name> [(<Codename>)]
 * This table is only used if init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
  */
 
 /* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
 {
-       struct cpu_model_info *info;
+       const struct cpu_model_info *info;
 
        if (c->x86_model >= 16)
                return NULL;    /* Range check */
@@ -307,8 +317,10 @@ void load_percpu_segment(int cpu)
        load_stack_canary_segment();
 }
 
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
 void switch_to_new_gdt(int cpu)
 {
        struct desc_ptr gdt_descr;
@@ -321,7 +333,7 @@ void switch_to_new_gdt(int cpu)
        load_percpu_segment(cpu);
 }
 
-static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
 
 static void __cpuinit default_init(struct cpuinfo_x86 *c)
 {
@@ -340,7 +352,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
 #endif
 }
 
-static struct cpu_dev __cpuinitdata default_cpu = {
+static const struct cpu_dev __cpuinitconst default_cpu = {
        .c_init = default_init,
        .c_vendor = "Unknown",
        .c_x86_vendor = X86_VENDOR_UNKNOWN,
@@ -354,22 +366,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
        if (c->extended_cpuid_level < 0x80000004)
                return;
 
-       v = (unsigned int *) c->x86_model_id;
+       v = (unsigned int *)c->x86_model_id;
        cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
        cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
        cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
        c->x86_model_id[48] = 0;
 
-       /* Intel chips right-justify this string for some dumb reason;
-          undo that brain damage */
+       /*
+        * Intel chips right-justify this string for some dumb reason;
+        * undo that brain damage:
+        */
        p = q = &c->x86_model_id[0];
        while (*p == ' ')
-            p++;
+               p++;
        if (p != q) {
-            while (*p)
-                 *q++ = *p++;
-            while (q <= &c->x86_model_id[48])
-                 *q++ = '\0';  /* Zero-pad the rest */
+               while (*p)
+                       *q++ = *p++;
+               while (q <= &c->x86_model_id[48])
+                       *q++ = '\0';    /* Zero-pad the rest */
        }
 }
 
@@ -438,27 +452,30 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
        if (smp_num_siblings == 1) {
                printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-       } else if (smp_num_siblings > 1) {
+               goto out;
+       }
 
-               if (smp_num_siblings > nr_cpu_ids) {
-                       printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
-                                       smp_num_siblings);
-                       smp_num_siblings = 1;
-                       return;
-               }
+       if (smp_num_siblings <= 1)
+               goto out;
+
+       if (smp_num_siblings > nr_cpu_ids) {
+               pr_warning("CPU: Unsupported number of siblings %d",
+                          smp_num_siblings);
+               smp_num_siblings = 1;
+               return;
+       }
 
-               index_msb = get_count_order(smp_num_siblings);
-               c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+       index_msb = get_count_order(smp_num_siblings);
+       c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
 
-               smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+       smp_num_siblings = smp_num_siblings / c->x86_max_cores;
 
-               index_msb = get_count_order(smp_num_siblings);
+       index_msb = get_count_order(smp_num_siblings);
 
-               core_bits = get_count_order(c->x86_max_cores);
+       core_bits = get_count_order(c->x86_max_cores);
 
-               c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
-                                              ((1 << core_bits) - 1);
-       }
+       c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+                                      ((1 << core_bits) - 1);
 
 out:
        if ((c->x86_max_cores * smp_num_siblings) > 1) {
@@ -473,8 +490,8 @@ out:
 static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
        char *v = c->x86_vendor_id;
-       int i;
        static int printed;
+       int i;
 
        for (i = 0; i < X86_VENDOR_NUM; i++) {
                if (!cpu_devs[i])
@@ -483,6 +500,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
                if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
                    (cpu_devs[i]->c_ident[1] &&
                     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
                        this_cpu = cpu_devs[i];
                        c->x86_vendor = this_cpu->c_x86_vendor;
                        return;
@@ -491,7 +509,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 
        if (!printed) {
                printed++;
-               printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+               printk(KERN_ERR
+                   "CPU: vendor_id '%s' unknown, using generic init.\n", v);
+
                printk(KERN_ERR "CPU: Your system may be unstable.\n");
        }
 
@@ -511,14 +531,17 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
        /* Intel-defined flags: level 0x00000001 */
        if (c->cpuid_level >= 0x00000001) {
                u32 junk, tfms, cap0, misc;
+
                cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
                c->x86 = (tfms >> 8) & 0xf;
                c->x86_model = (tfms >> 4) & 0xf;
                c->x86_mask = tfms & 0xf;
+
                if (c->x86 == 0xf)
                        c->x86 += (tfms >> 20) & 0xff;
                if (c->x86 >= 0x6)
                        c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
                if (cap0 & (1<<19)) {
                        c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
                        c->x86_cache_alignment = c->x86_clflush_size;
@@ -534,6 +557,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
        /* Intel-defined flags: level 0x00000001 */
        if (c->cpuid_level >= 0x00000001) {
                u32 capability, excap;
+
                cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
                c->x86_capability[0] = capability;
                c->x86_capability[4] = excap;
@@ -542,6 +566,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
        /* AMD-defined flags: level 0x80000001 */
        xlvl = cpuid_eax(0x80000000);
        c->extended_cpuid_level = xlvl;
+
        if ((xlvl & 0xffff0000) == 0x80000000) {
                if (xlvl >= 0x80000001) {
                        c->x86_capability[1] = cpuid_edx(0x80000001);
@@ -549,13 +574,15 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
                }
        }
 
-#ifdef CONFIG_X86_64
        if (c->extended_cpuid_level >= 0x80000008) {
                u32 eax = cpuid_eax(0x80000008);
 
                c->x86_virt_bits = (eax >> 8) & 0xff;
                c->x86_phys_bits = eax & 0xff;
        }
+#ifdef CONFIG_X86_32
+       else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+               c->x86_phys_bits = 36;
 #endif
 
        if (c->extended_cpuid_level >= 0x80000007)
@@ -602,8 +629,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_X86_64
        c->x86_clflush_size = 64;
+       c->x86_phys_bits = 36;
+       c->x86_virt_bits = 48;
 #else
        c->x86_clflush_size = 32;
+       c->x86_phys_bits = 32;
+       c->x86_virt_bits = 32;
 #endif
        c->x86_cache_alignment = c->x86_clflush_size;
 
@@ -634,12 +665,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 void __init early_cpu_init(void)
 {
-       struct cpu_dev **cdev;
+       const struct cpu_dev *const *cdev;
        int count = 0;
 
-       printk("KERNEL supported cpus:\n");
+       printk(KERN_INFO "KERNEL supported cpus:\n");
        for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
-               struct cpu_dev *cpudev = *cdev;
+               const struct cpu_dev *cpudev = *cdev;
                unsigned int j;
 
                if (count >= X86_VENDOR_NUM)
@@ -650,7 +681,7 @@ void __init early_cpu_init(void)
                for (j = 0; j < 2; j++) {
                        if (!cpudev->c_ident[j])
                                continue;
-                       printk("  %s %s\n", cpudev->c_vendor,
+                       printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
                                cpudev->c_ident[j]);
                }
        }
@@ -726,9 +757,13 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
        c->x86_coreid_bits = 0;
 #ifdef CONFIG_X86_64
        c->x86_clflush_size = 64;
+       c->x86_phys_bits = 36;
+       c->x86_virt_bits = 48;
 #else
        c->cpuid_level = -1;    /* CPUID not detected */
        c->x86_clflush_size = 32;
+       c->x86_phys_bits = 32;
+       c->x86_virt_bits = 32;
 #endif
        c->x86_cache_alignment = c->x86_clflush_size;
        memset(&c->x86_capability, 0, sizeof c->x86_capability);
@@ -759,8 +794,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
        squash_the_stupid_serial_number(c);
 
        /*
-        * The vendor-specific functions might have changed features.  Now
-        * we do "generic changes."
+        * The vendor-specific functions might have changed features.
+        * Now we do "generic changes."
         */
 
        /* Filter out anything that depends on CPUID levels we don't have */
@@ -768,7 +803,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 
        /* If the model name is still unset, do table lookup. */
        if (!c->x86_model_id[0]) {
-               char *p;
+               const char *p;
                p = table_lookup_model(c);
                if (p)
                        strcpy(c->x86_model_id, p);
@@ -843,11 +878,11 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
 }
 
 struct msr_range {
-       unsigned min;
-       unsigned max;
+       unsigned        min;
+       unsigned        max;
 };
 
-static struct msr_range msr_range_array[] __cpuinitdata = {
+static const struct msr_range msr_range_array[] __cpuinitconst = {
        { 0x00000000, 0x00000418},
        { 0xc0000000, 0xc000040b},
        { 0xc0010000, 0xc0010142},
@@ -856,14 +891,15 @@ static struct msr_range msr_range_array[] __cpuinitdata = {
 
 static void __cpuinit print_cpu_msr(void)
 {
+       unsigned index_min, index_max;
        unsigned index;
        u64 val;
        int i;
-       unsigned index_min, index_max;
 
        for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
                index_min = msr_range_array[i].min;
                index_max = msr_range_array[i].max;
+
                for (index = index_min; index < index_max; index++) {
                        if (rdmsrl_amd_safe(index, &val))
                                continue;
@@ -873,6 +909,7 @@ static void __cpuinit print_cpu_msr(void)
 }
 
 static int show_msr __cpuinitdata;
+
 static __init int setup_show_msr(char *arg)
 {
        int num;
@@ -894,12 +931,14 @@ __setup("noclflush", setup_noclflush);
 
 void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
-       char *vendor = NULL;
+       const char *vendor = NULL;
 
-       if (c->x86_vendor < X86_VENDOR_NUM)
+       if (c->x86_vendor < X86_VENDOR_NUM) {
                vendor = this_cpu->c_vendor;
-       else if (c->cpuid_level >= 0)
-               vendor = c->x86_vendor_id;
+       } else {
+               if (c->cpuid_level >= 0)
+                       vendor = c->x86_vendor_id;
+       }
 
        if (vendor && !strstr(c->x86_model_id, vendor))
                printk(KERN_CONT "%s ", vendor);
@@ -926,10 +965,12 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 static __init int setup_disablecpuid(char *arg)
 {
        int bit;
+
        if (get_option(&arg, &bit) && bit < NCAPINTS*32)
                setup_clear_cpu_cap(bit);
        else
                return 0;
+
        return 1;
 }
 __setup("clearcpuid=", setup_disablecpuid);
@@ -939,6 +980,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
                     irq_stack_union) __aligned(PAGE_SIZE);
+
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
        init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
 
@@ -948,12 +990,21 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+         [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
+         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
+};
+
 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
        __aligned(PAGE_SIZE);
 
-extern asmlinkage void ignore_sysret(void);
-
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
@@ -983,7 +1034,7 @@ unsigned long kernel_eflags;
  */
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
-#else  /* x86_64 */
+#else  /* CONFIG_X86_64 */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU(unsigned long, stack_canary);
@@ -995,9 +1046,26 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
        memset(regs, 0, sizeof(struct pt_regs));
        regs->fs = __KERNEL_PERCPU;
        regs->gs = __KERNEL_STACK_CANARY;
+
        return regs;
 }
-#endif /* x86_64 */
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Clear all 6 debug registers:
+ */
+static void clear_all_debug_regs(void)
+{
+       int i;
+
+       for (i = 0; i < 8; i++) {
+               /* Ignore db4, db5 */
+               if ((i == 4) || (i == 5))
+                       continue;
+
+               set_debugreg(0, i);
+       }
+}
 
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
@@ -1007,15 +1075,20 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
  * A lot of state is already set up in PDA init for 64 bit
  */
 #ifdef CONFIG_X86_64
+
 void __cpuinit cpu_init(void)
 {
-       int cpu = stack_smp_processor_id();
-       struct tss_struct *t = &per_cpu(init_tss, cpu);
-       struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-       unsigned long v;
+       struct orig_ist *orig_ist;
        struct task_struct *me;
+       struct tss_struct *t;
+       unsigned long v;
+       int cpu;
        int i;
 
+       cpu = stack_smp_processor_id();
+       t = &per_cpu(init_tss, cpu);
+       orig_ist = &per_cpu(orig_ist, cpu);
+
 #ifdef CONFIG_NUMA
        if (cpu != 0 && percpu_read(node_number) == 0 &&
            cpu_to_node(cpu) != NUMA_NO_NODE)
@@ -1056,19 +1129,17 @@ void __cpuinit cpu_init(void)
         * set up and load the per-CPU TSS
         */
        if (!orig_ist->ist[0]) {
-               static const unsigned int sizes[N_EXCEPTION_STACKS] = {
-                 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
-                 [DEBUG_STACK - 1] = DEBUG_STKSZ
-               };
                char *estacks = per_cpu(exception_stacks, cpu);
+
                for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-                       estacks += sizes[v];
+                       estacks += exception_stack_sizes[v];
                        orig_ist->ist[v] = t->x86_tss.ist[v] =
                                        (unsigned long)estacks;
                }
        }
 
        t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
        /*
         * <= is required because the CPU will access up to
         * 8 bits beyond the end of the IO permission bitmap.
@@ -1078,8 +1149,7 @@ void __cpuinit cpu_init(void)
 
        atomic_inc(&init_mm.mm_count);
        me->active_mm = &init_mm;
-       if (me->mm)
-               BUG();
+       BUG_ON(me->mm);
        enter_lazy_tlb(&init_mm, me);
 
        load_sp0(t, &current->thread);
@@ -1098,17 +1168,7 @@ void __cpuinit cpu_init(void)
                arch_kgdb_ops.correct_hw_break();
        else
 #endif
-       {
-               /*
-                * Clear all 6 debug registers:
-                */
-               set_debugreg(0UL, 0);
-               set_debugreg(0UL, 1);
-               set_debugreg(0UL, 2);
-               set_debugreg(0UL, 3);
-               set_debugreg(0UL, 6);
-               set_debugreg(0UL, 7);
-       }
+               clear_all_debug_regs();
 
        fpu_init();
 
@@ -1129,7 +1189,8 @@ void __cpuinit cpu_init(void)
 
        if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
                printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-               for (;;) local_irq_enable();
+               for (;;)
+                       local_irq_enable();
        }
 
        printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@ -1145,8 +1206,7 @@ void __cpuinit cpu_init(void)
         */
        atomic_inc(&init_mm.mm_count);
        curr->active_mm = &init_mm;
-       if (curr->mm)
-               BUG();
+       BUG_ON(curr->mm);
        enter_lazy_tlb(&init_mm, curr);
 
        load_sp0(t, thread);
@@ -1159,13 +1219,7 @@ void __cpuinit cpu_init(void)
        __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif
 
-       /* Clear all 6 debug registers: */
-       set_debugreg(0, 0);
-       set_debugreg(0, 1);
-       set_debugreg(0, 2);
-       set_debugreg(0, 3);
-       set_debugreg(0, 6);
-       set_debugreg(0, 7);
+       clear_all_debug_regs();
 
        /*
         * Force FPU initialization:
@@ -1185,6 +1239,4 @@ void __cpuinit cpu_init(void)
 
        xsave_init();
 }
-
-
 #endif
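
A minimal userspace sketch of the signed/unsigned level comparison that filter_cpuid_features() performs above; the level_ok() helper and the sample values are illustrative only, not part of the kernel:

/*
 * Basic CPUID levels compare as signed so that cpuid_level == -1
 * ("no CPUID") rejects everything; extended levels (0x80000000+)
 * are negative as s32 and therefore compare as unsigned against
 * extended_cpuid_level, which is 0 when unavailable.
 */
#include <stdio.h>
#include <stdint.h>

static int level_ok(int32_t level, int32_t cpuid_level,
		    uint32_t extended_cpuid_level)
{
	if (level < 0)	/* extended leaf, e.g. 0x80000001 */
		return (uint32_t)level <= extended_cpuid_level;
	return level <= cpuid_level;	/* basic leaf */
}

int main(void)
{
	/* CPU without CPUID at all: nothing is usable */
	printf("%d %d\n", level_ok(0x00000005, -1, 0),
	       level_ok((int32_t)0x80000001, -1, 0));		/* 0 0 */

	/* CPU reporting basic level 0xa and extended level 0x80000008 */
	printf("%d %d\n", level_ok(0x00000005, 0xa, 0x80000008),
	       level_ok((int32_t)0x80000001, 0xa, 0x80000008));	/* 1 1 */

	return 0;
}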
index de4094a3921071b94b9b71a1bcd2d85158c6c957..6de9a908e4008bf6d99e56cb8a9ef909b1f3949c 100644 (file)
@@ -3,33 +3,34 @@
 #define ARCH_X86_CPU_H
 
 struct cpu_model_info {
-       int vendor;
-       int family;
-       char *model_names[16];
+       int             vendor;
+       int             family;
+       const char      *model_names[16];
 };
 
 /* attempt to consolidate cpu attributes */
 struct cpu_dev {
-       char    * c_vendor;
+       const char      *c_vendor;
 
        /* some have two possibilities for cpuid string */
-       char    * c_ident[2];   
+       const char      *c_ident[2];
 
        struct          cpu_model_info c_models[4];
 
-       void            (*c_early_init)(struct cpuinfo_x86 *c);
-       void            (*c_init)(struct cpuinfo_x86 * c);
-       void            (*c_identify)(struct cpuinfo_x86 * c);
-       unsigned int    (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
-       int     c_x86_vendor;
+       void            (*c_early_init)(struct cpuinfo_x86 *);
+       void            (*c_init)(struct cpuinfo_x86 *);
+       void            (*c_identify)(struct cpuinfo_x86 *);
+       unsigned int    (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
+       int             c_x86_vendor;
 };
 
 #define cpu_dev_register(cpu_devX) \
-       static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+       static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
        __attribute__((__section__(".x86_cpu_dev.init"))) = \
        &cpu_devX;
 
-extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
+extern const struct cpu_dev *const __x86_cpu_dev_start[],
+                           *const __x86_cpu_dev_end[];
 
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
new file mode 100755 (executable)
index 0000000..46e29ab
--- /dev/null
@@ -0,0 +1,901 @@
+/*
+ * CPU x86 architecture debug code
+ *
+ * Copyright(C) 2009 Jaswinder Singh Rajput
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+
+#include <asm/cpu_debug.h>
+#include <asm/paravirt.h>
+#include <asm/system.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
+static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(unsigned, cpu_modelflag);
+static DEFINE_PER_CPU(int, cpu_priv_count);
+static DEFINE_PER_CPU(unsigned, cpu_model);
+
+static DEFINE_MUTEX(cpu_debug_lock);
+
+static struct dentry *cpu_debugfs_dir;
+
+static struct cpu_debug_base cpu_base[] = {
+       { "mc",         CPU_MC,         0       },
+       { "monitor",    CPU_MONITOR,    0       },
+       { "time",       CPU_TIME,       0       },
+       { "pmc",        CPU_PMC,        1       },
+       { "platform",   CPU_PLATFORM,   0       },
+       { "apic",       CPU_APIC,       0       },
+       { "poweron",    CPU_POWERON,    0       },
+       { "control",    CPU_CONTROL,    0       },
+       { "features",   CPU_FEATURES,   0       },
+       { "lastbranch", CPU_LBRANCH,    0       },
+       { "bios",       CPU_BIOS,       0       },
+       { "freq",       CPU_FREQ,       0       },
+       { "mtrr",       CPU_MTRR,       0       },
+       { "perf",       CPU_PERF,       0       },
+       { "cache",      CPU_CACHE,      0       },
+       { "sysenter",   CPU_SYSENTER,   0       },
+       { "therm",      CPU_THERM,      0       },
+       { "misc",       CPU_MISC,       0       },
+       { "debug",      CPU_DEBUG,      0       },
+       { "pat",        CPU_PAT,        0       },
+       { "vmx",        CPU_VMX,        0       },
+       { "call",       CPU_CALL,       0       },
+       { "base",       CPU_BASE,       0       },
+       { "ver",        CPU_VER,        0       },
+       { "conf",       CPU_CONF,       0       },
+       { "smm",        CPU_SMM,        0       },
+       { "svm",        CPU_SVM,        0       },
+       { "osvm",       CPU_OSVM,       0       },
+       { "tss",        CPU_TSS,        0       },
+       { "cr",         CPU_CR,         0       },
+       { "dt",         CPU_DT,         0       },
+       { "registers",  CPU_REG_ALL,    0       },
+};
+
+static struct cpu_file_base cpu_file[] = {
+       { "index",      CPU_REG_ALL,    0       },
+       { "value",      CPU_REG_ALL,    1       },
+};
+
+/* Intel Registers Range */
+static struct cpu_debug_range cpu_intel_range[] = {
+       { 0x00000000, 0x00000001, CPU_MC,       CPU_INTEL_ALL           },
+       { 0x00000006, 0x00000007, CPU_MONITOR,  CPU_CX_AT_XE            },
+       { 0x00000010, 0x00000010, CPU_TIME,     CPU_INTEL_ALL           },
+       { 0x00000011, 0x00000013, CPU_PMC,      CPU_INTEL_PENTIUM       },
+       { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE         },
+       { 0x0000001B, 0x0000001B, CPU_APIC,     CPU_P6_CX_AT_XE         },
+
+       { 0x0000002A, 0x0000002A, CPU_POWERON,  CPU_PX_CX_AT_XE         },
+       { 0x0000002B, 0x0000002B, CPU_POWERON,  CPU_INTEL_XEON          },
+       { 0x0000002C, 0x0000002C, CPU_FREQ,     CPU_INTEL_XEON          },
+       { 0x0000003A, 0x0000003A, CPU_CONTROL,  CPU_CX_AT_XE            },
+
+       { 0x00000040, 0x00000043, CPU_LBRANCH,  CPU_PM_CX_AT_XE         },
+       { 0x00000044, 0x00000047, CPU_LBRANCH,  CPU_PM_CO_AT            },
+       { 0x00000060, 0x00000063, CPU_LBRANCH,  CPU_C2_AT               },
+       { 0x00000064, 0x00000067, CPU_LBRANCH,  CPU_INTEL_ATOM          },
+
+       { 0x00000079, 0x00000079, CPU_BIOS,     CPU_P6_CX_AT_XE         },
+       { 0x00000088, 0x0000008A, CPU_CACHE,    CPU_INTEL_P6            },
+       { 0x0000008B, 0x0000008B, CPU_BIOS,     CPU_P6_CX_AT_XE         },
+       { 0x0000009B, 0x0000009B, CPU_MONITOR,  CPU_INTEL_XEON          },
+
+       { 0x000000C1, 0x000000C2, CPU_PMC,      CPU_P6_CX_AT            },
+       { 0x000000CD, 0x000000CD, CPU_FREQ,     CPU_CX_AT               },
+       { 0x000000E7, 0x000000E8, CPU_PERF,     CPU_CX_AT               },
+       { 0x000000FE, 0x000000FE, CPU_MTRR,     CPU_P6_CX_XE            },
+
+       { 0x00000116, 0x00000116, CPU_CACHE,    CPU_INTEL_P6            },
+       { 0x00000118, 0x00000118, CPU_CACHE,    CPU_INTEL_P6            },
+       { 0x00000119, 0x00000119, CPU_CACHE,    CPU_INTEL_PX            },
+       { 0x0000011A, 0x0000011B, CPU_CACHE,    CPU_INTEL_P6            },
+       { 0x0000011E, 0x0000011E, CPU_CACHE,    CPU_PX_CX_AT            },
+
+       { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE         },
+       { 0x00000179, 0x0000017A, CPU_MC,       CPU_PX_CX_AT_XE         },
+       { 0x0000017B, 0x0000017B, CPU_MC,       CPU_P6_XE               },
+       { 0x00000186, 0x00000187, CPU_PMC,      CPU_P6_CX_AT            },
+       { 0x00000198, 0x00000199, CPU_PERF,     CPU_PM_CX_AT_XE         },
+       { 0x0000019A, 0x0000019A, CPU_TIME,     CPU_PM_CX_AT_XE         },
+       { 0x0000019B, 0x0000019D, CPU_THERM,    CPU_PM_CX_AT_XE         },
+       { 0x000001A0, 0x000001A0, CPU_MISC,     CPU_PM_CX_AT_XE         },
+
+       { 0x000001C9, 0x000001C9, CPU_LBRANCH,  CPU_PM_CX_AT            },
+       { 0x000001D7, 0x000001D8, CPU_LBRANCH,  CPU_INTEL_XEON          },
+       { 0x000001D9, 0x000001D9, CPU_DEBUG,    CPU_CX_AT_XE            },
+       { 0x000001DA, 0x000001DA, CPU_LBRANCH,  CPU_INTEL_XEON          },
+       { 0x000001DB, 0x000001DB, CPU_LBRANCH,  CPU_P6_XE               },
+       { 0x000001DC, 0x000001DC, CPU_LBRANCH,  CPU_INTEL_P6            },
+       { 0x000001DD, 0x000001DE, CPU_LBRANCH,  CPU_PX_CX_AT_XE         },
+       { 0x000001E0, 0x000001E0, CPU_LBRANCH,  CPU_INTEL_P6            },
+
+       { 0x00000200, 0x0000020F, CPU_MTRR,     CPU_P6_CX_XE            },
+       { 0x00000250, 0x00000250, CPU_MTRR,     CPU_P6_CX_XE            },
+       { 0x00000258, 0x00000259, CPU_MTRR,     CPU_P6_CX_XE            },
+       { 0x00000268, 0x0000026F, CPU_MTRR,     CPU_P6_CX_XE            },
+       { 0x00000277, 0x00000277, CPU_PAT,      CPU_C2_AT_XE            },
+       { 0x000002FF, 0x000002FF, CPU_MTRR,     CPU_P6_CX_XE            },
+
+       { 0x00000300, 0x00000308, CPU_PMC,      CPU_INTEL_XEON          },
+       { 0x00000309, 0x0000030B, CPU_PMC,      CPU_C2_AT_XE            },
+       { 0x0000030C, 0x00000311, CPU_PMC,      CPU_INTEL_XEON          },
+       { 0x00000345, 0x00000345, CPU_PMC,      CPU_C2_AT               },
+       { 0x00000360, 0x00000371, CPU_PMC,      CPU_INTEL_XEON          },
+       { 0x0000038D, 0x00000390, CPU_PMC,      CPU_C2_AT               },
+       { 0x000003A0, 0x000003BE, CPU_PMC,      CPU_INTEL_XEON          },
+       { 0x000003C0, 0x000003CD, CPU_PMC,      CPU_INTEL_XEON          },
+       { 0x000003E0, 0x000003E1, CPU_PMC,      CPU_INTEL_XEON          },
+       { 0x000003F0, 0x000003F0, CPU_PMC,      CPU_INTEL_XEON          },
+       { 0x000003F1, 0x000003F1, CPU_PMC,      CPU_C2_AT_XE            },
+       { 0x000003F2, 0x000003F2, CPU_PMC,      CPU_INTEL_XEON          },
+
+       { 0x00000400, 0x00000402, CPU_MC,       CPU_PM_CX_AT_XE         },
+       { 0x00000403, 0x00000403, CPU_MC,       CPU_INTEL_XEON          },
+       { 0x00000404, 0x00000406, CPU_MC,       CPU_PM_CX_AT_XE         },
+       { 0x00000407, 0x00000407, CPU_MC,       CPU_INTEL_XEON          },
+       { 0x00000408, 0x0000040A, CPU_MC,       CPU_PM_CX_AT_XE         },
+       { 0x0000040B, 0x0000040B, CPU_MC,       CPU_INTEL_XEON          },
+       { 0x0000040C, 0x0000040E, CPU_MC,       CPU_PM_CX_XE            },
+       { 0x0000040F, 0x0000040F, CPU_MC,       CPU_INTEL_XEON          },
+       { 0x00000410, 0x00000412, CPU_MC,       CPU_PM_CX_AT_XE         },
+       { 0x00000413, 0x00000417, CPU_MC,       CPU_CX_AT_XE            },
+       { 0x00000480, 0x0000048B, CPU_VMX,      CPU_CX_AT_XE            },
+
+       { 0x00000600, 0x00000600, CPU_DEBUG,    CPU_PM_CX_AT_XE         },
+       { 0x00000680, 0x0000068F, CPU_LBRANCH,  CPU_INTEL_XEON          },
+       { 0x000006C0, 0x000006CF, CPU_LBRANCH,  CPU_INTEL_XEON          },
+
+       { 0x000107CC, 0x000107D3, CPU_PMC,      CPU_INTEL_XEON_MP       },
+
+       { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON          },
+       { 0xC0000081, 0xC0000082, CPU_CALL,     CPU_INTEL_XEON          },
+       { 0xC0000084, 0xC0000084, CPU_CALL,     CPU_INTEL_XEON          },
+       { 0xC0000100, 0xC0000102, CPU_BASE,     CPU_INTEL_XEON          },
+};
+
+/* AMD Registers Range */
+static struct cpu_debug_range cpu_amd_range[] = {
+       { 0x00000000, 0x00000001, CPU_MC,       CPU_K10_PLUS,           },
+       { 0x00000010, 0x00000010, CPU_TIME,     CPU_K8_PLUS,            },
+       { 0x0000001B, 0x0000001B, CPU_APIC,     CPU_K8_PLUS,            },
+       { 0x0000002A, 0x0000002A, CPU_POWERON,  CPU_K7_PLUS             },
+       { 0x0000008B, 0x0000008B, CPU_VER,      CPU_K8_PLUS             },
+       { 0x000000FE, 0x000000FE, CPU_MTRR,     CPU_K8_PLUS,            },
+
+       { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS,            },
+       { 0x00000179, 0x0000017B, CPU_MC,       CPU_K8_PLUS,            },
+       { 0x000001D9, 0x000001D9, CPU_DEBUG,    CPU_K8_PLUS,            },
+       { 0x000001DB, 0x000001DE, CPU_LBRANCH,  CPU_K8_PLUS,            },
+
+       { 0x00000200, 0x0000020F, CPU_MTRR,     CPU_K8_PLUS,            },
+       { 0x00000250, 0x00000250, CPU_MTRR,     CPU_K8_PLUS,            },
+       { 0x00000258, 0x00000259, CPU_MTRR,     CPU_K8_PLUS,            },
+       { 0x00000268, 0x0000026F, CPU_MTRR,     CPU_K8_PLUS,            },
+       { 0x00000277, 0x00000277, CPU_PAT,      CPU_K8_PLUS,            },
+       { 0x000002FF, 0x000002FF, CPU_MTRR,     CPU_K8_PLUS,            },
+
+       { 0x00000400, 0x00000413, CPU_MC,       CPU_K8_PLUS,            },
+
+       { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL,            },
+       { 0xC0000081, 0xC0000084, CPU_CALL,     CPU_K8_PLUS,            },
+       { 0xC0000100, 0xC0000102, CPU_BASE,     CPU_K8_PLUS,            },
+       { 0xC0000103, 0xC0000103, CPU_TIME,     CPU_K10_PLUS,           },
+
+       { 0xC0010000, 0xC0010007, CPU_PMC,      CPU_K8_PLUS,            },
+       { 0xC0010010, 0xC0010010, CPU_CONF,     CPU_K7_PLUS,            },
+       { 0xC0010015, 0xC0010015, CPU_CONF,     CPU_K7_PLUS,            },
+       { 0xC0010016, 0xC001001A, CPU_MTRR,     CPU_K8_PLUS,            },
+       { 0xC001001D, 0xC001001D, CPU_MTRR,     CPU_K8_PLUS,            },
+       { 0xC001001F, 0xC001001F, CPU_CONF,     CPU_K8_PLUS,            },
+       { 0xC0010030, 0xC0010035, CPU_BIOS,     CPU_K8_PLUS,            },
+       { 0xC0010044, 0xC0010048, CPU_MC,       CPU_K8_PLUS,            },
+       { 0xC0010050, 0xC0010056, CPU_SMM,      CPU_K0F_PLUS,           },
+       { 0xC0010058, 0xC0010058, CPU_CONF,     CPU_K10_PLUS,           },
+       { 0xC0010060, 0xC0010060, CPU_CACHE,    CPU_AMD_11,             },
+       { 0xC0010061, 0xC0010068, CPU_SMM,      CPU_K10_PLUS,           },
+       { 0xC0010069, 0xC001006B, CPU_SMM,      CPU_AMD_11,             },
+       { 0xC0010070, 0xC0010071, CPU_SMM,      CPU_K10_PLUS,           },
+       { 0xC0010111, 0xC0010113, CPU_SMM,      CPU_K8_PLUS,            },
+       { 0xC0010114, 0xC0010118, CPU_SVM,      CPU_K10_PLUS,           },
+       { 0xC0010140, 0xC0010141, CPU_OSVM,     CPU_K10_PLUS,           },
+       { 0xC0011022, 0xC0011023, CPU_CONF,     CPU_K10_PLUS,           },
+};
+
+
+/* Intel */
+static int get_intel_modelflag(unsigned model)
+{
+       int flag;
+
+       switch (model) {
+       case 0x0501:
+       case 0x0502:
+       case 0x0504:
+               flag = CPU_INTEL_PENTIUM;
+               break;
+       case 0x0601:
+       case 0x0603:
+       case 0x0605:
+       case 0x0607:
+       case 0x0608:
+       case 0x060A:
+       case 0x060B:
+               flag = CPU_INTEL_P6;
+               break;
+       case 0x0609:
+       case 0x060D:
+               flag = CPU_INTEL_PENTIUM_M;
+               break;
+       case 0x060E:
+               flag = CPU_INTEL_CORE;
+               break;
+       case 0x060F:
+       case 0x0617:
+               flag = CPU_INTEL_CORE2;
+               break;
+       case 0x061C:
+               flag = CPU_INTEL_ATOM;
+               break;
+       case 0x0F00:
+       case 0x0F01:
+       case 0x0F02:
+       case 0x0F03:
+       case 0x0F04:
+               flag = CPU_INTEL_XEON_P4;
+               break;
+       case 0x0F06:
+               flag = CPU_INTEL_XEON_MP;
+               break;
+       default:
+               flag = CPU_NONE;
+               break;
+       }
+
+       return flag;
+}
+
+/* AMD */
+static int get_amd_modelflag(unsigned model)
+{
+       int flag;
+
+       switch (model >> 8) {
+       case 0x6:
+               flag = CPU_AMD_K6;
+               break;
+       case 0x7:
+               flag = CPU_AMD_K7;
+               break;
+       case 0x8:
+               flag = CPU_AMD_K8;
+               break;
+       case 0xf:
+               flag = CPU_AMD_0F;
+               break;
+       case 0x10:
+               flag = CPU_AMD_10;
+               break;
+       case 0x11:
+               flag = CPU_AMD_11;
+               break;
+       default:
+               flag = CPU_NONE;
+               break;
+       }
+
+       return flag;
+}
+
+static int get_cpu_modelflag(unsigned cpu)
+{
+       int flag;
+
+       flag = per_cpu(cpu_model, cpu);
+
+       switch (flag >> 16) {
+       case X86_VENDOR_INTEL:
+               flag = get_intel_modelflag(flag);
+               break;
+       case X86_VENDOR_AMD:
+               flag = get_amd_modelflag(flag & 0xffff);
+               break;
+       default:
+               flag = CPU_NONE;
+               break;
+       }
+
+       return flag;
+}
+
+static int get_cpu_range_count(unsigned cpu)
+{
+       int index;
+
+       switch (per_cpu(cpu_model, cpu) >> 16) {
+       case X86_VENDOR_INTEL:
+               index = ARRAY_SIZE(cpu_intel_range);
+               break;
+       case X86_VENDOR_AMD:
+               index = ARRAY_SIZE(cpu_amd_range);
+               break;
+       default:
+               index = 0;
+               break;
+       }
+
+       return index;
+}
+
+static int is_typeflag_valid(unsigned cpu, unsigned flag)
+{
+       unsigned vendor, modelflag;
+       int i, index;
+
+       /* Standard registers should always be valid */
+       if (flag >= CPU_TSS)
+               return 1;
+
+       modelflag = per_cpu(cpu_modelflag, cpu);
+       vendor = per_cpu(cpu_model, cpu) >> 16;
+       index = get_cpu_range_count(cpu);
+
+       for (i = 0; i < index; i++) {
+               switch (vendor) {
+               case X86_VENDOR_INTEL:
+                       if ((cpu_intel_range[i].model & modelflag) &&
+                           (cpu_intel_range[i].flag & flag))
+                               return 1;
+                       break;
+               case X86_VENDOR_AMD:
+                       if ((cpu_amd_range[i].model & modelflag) &&
+                           (cpu_amd_range[i].flag & flag))
+                               return 1;
+                       break;
+               }
+       }
+
+       /* Invalid */
+       return 0;
+}
+
+static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
+                             int index, unsigned flag)
+{
+       unsigned modelflag;
+
+       modelflag = per_cpu(cpu_modelflag, cpu);
+       *max = 0;
+       switch (per_cpu(cpu_model, cpu) >> 16) {
+       case X86_VENDOR_INTEL:
+               if ((cpu_intel_range[index].model & modelflag) &&
+                   (cpu_intel_range[index].flag & flag)) {
+                       *min = cpu_intel_range[index].min;
+                       *max = cpu_intel_range[index].max;
+               }
+               break;
+       case X86_VENDOR_AMD:
+               if ((cpu_amd_range[index].model & modelflag) &&
+                   (cpu_amd_range[index].flag & flag)) {
+                       *min = cpu_amd_range[index].min;
+                       *max = cpu_amd_range[index].max;
+               }
+               break;
+       }
+
+       return *max;
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_cpu_data(struct seq_file *seq, unsigned type,
+                          u32 low, u32 high)
+{
+       struct cpu_private *priv;
+       u64 val = high;
+
+       if (seq) {
+               priv = seq->private;
+               if (priv->file) {
+                       val = (val << 32) | low;
+                       seq_printf(seq, "0x%llx\n", val);
+               } else
+                       seq_printf(seq, " %08x: %08x_%08x\n",
+                                  type, high, low);
+       } else
+               printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
+}
+
+/* This function can also be called with seq = NULL for printk */
+static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
+{
+       unsigned msr, msr_min, msr_max;
+       struct cpu_private *priv;
+       u32 low, high;
+       int i, range;
+
+       if (seq) {
+               priv = seq->private;
+               if (priv->file) {
+                       if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
+                                              &low, &high))
+                               print_cpu_data(seq, priv->reg, low, high);
+                       return;
+               }
+       }
+
+       range = get_cpu_range_count(cpu);
+
+       for (i = 0; i < range; i++) {
+               if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
+                       continue;
+
+               for (msr = msr_min; msr <= msr_max; msr++) {
+                       if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
+                               continue;
+                       print_cpu_data(seq, msr, low, high);
+               }
+       }
+}
+
+static void print_tss(void *arg)
+{
+       struct pt_regs *regs = task_pt_regs(current);
+       struct seq_file *seq = arg;
+       unsigned int seg;
+
+       seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
+       seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
+       seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
+       seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
+
+       seq_printf(seq, " RSI\t: %016lx\n", regs->si);
+       seq_printf(seq, " RDI\t: %016lx\n", regs->di);
+       seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
+       seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
+
+#ifdef CONFIG_X86_64
+       seq_printf(seq, " R08\t: %016lx\n", regs->r8);
+       seq_printf(seq, " R09\t: %016lx\n", regs->r9);
+       seq_printf(seq, " R10\t: %016lx\n", regs->r10);
+       seq_printf(seq, " R11\t: %016lx\n", regs->r11);
+       seq_printf(seq, " R12\t: %016lx\n", regs->r12);
+       seq_printf(seq, " R13\t: %016lx\n", regs->r13);
+       seq_printf(seq, " R14\t: %016lx\n", regs->r14);
+       seq_printf(seq, " R15\t: %016lx\n", regs->r15);
+#endif
+
+       asm("movl %%cs,%0" : "=r" (seg));
+       seq_printf(seq, " CS\t:             %04x\n", seg);
+       asm("movl %%ds,%0" : "=r" (seg));
+       seq_printf(seq, " DS\t:             %04x\n", seg);
+       seq_printf(seq, " SS\t:             %04lx\n", regs->ss & 0xffff);
+       asm("movl %%es,%0" : "=r" (seg));
+       seq_printf(seq, " ES\t:             %04x\n", seg);
+       asm("movl %%fs,%0" : "=r" (seg));
+       seq_printf(seq, " FS\t:             %04x\n", seg);
+       asm("movl %%gs,%0" : "=r" (seg));
+       seq_printf(seq, " GS\t:             %04x\n", seg);
+
+       seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
+
+       seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
+}
+
+static void print_cr(void *arg)
+{
+       struct seq_file *seq = arg;
+
+       seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
+       seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
+       seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
+       seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
+#ifdef CONFIG_X86_64
+       seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
+#endif
+}
+
+static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
+{
+       seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
+}
+
+static void print_dt(void *seq)
+{
+       struct desc_ptr dt;
+       unsigned long ldt;
+
+       /* IDT */
+       store_idt((struct desc_ptr *)&dt);
+       print_desc_ptr("IDT", seq, dt);
+
+       /* GDT */
+       store_gdt((struct desc_ptr *)&dt);
+       print_desc_ptr("GDT", seq, dt);
+
+       /* LDT */
+       store_ldt(ldt);
+       seq_printf(seq, " LDT\t: %016lx\n", ldt);
+
+       /* TR */
+       store_tr(ldt);
+       seq_printf(seq, " TR\t: %016lx\n", ldt);
+}
+
+static void print_dr(void *arg)
+{
+       struct seq_file *seq = arg;
+       unsigned long dr;
+       int i;
+
+       for (i = 0; i < 8; i++) {
+               /* Ignore db4, db5 */
+               if ((i == 4) || (i == 5))
+                       continue;
+               get_debugreg(dr, i);
+               seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
+       }
+
+       seq_printf(seq, "\n MSR\t:\n");
+}
+
+static void print_apic(void *arg)
+{
+       struct seq_file *seq = arg;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       seq_printf(seq, " LAPIC\t:\n");
+       seq_printf(seq, " ID\t\t: %08x\n",  apic_read(APIC_ID) >> 24);
+       seq_printf(seq, " LVR\t\t: %08x\n",  apic_read(APIC_LVR));
+       seq_printf(seq, " TASKPRI\t: %08x\n",  apic_read(APIC_TASKPRI));
+       seq_printf(seq, " ARBPRI\t\t: %08x\n",  apic_read(APIC_ARBPRI));
+       seq_printf(seq, " PROCPRI\t: %08x\n",  apic_read(APIC_PROCPRI));
+       seq_printf(seq, " LDR\t\t: %08x\n",  apic_read(APIC_LDR));
+       seq_printf(seq, " DFR\t\t: %08x\n",  apic_read(APIC_DFR));
+       seq_printf(seq, " SPIV\t\t: %08x\n",  apic_read(APIC_SPIV));
+       seq_printf(seq, " ISR\t\t: %08x\n",  apic_read(APIC_ISR));
+       seq_printf(seq, " ESR\t\t: %08x\n",  apic_read(APIC_ESR));
+       seq_printf(seq, " ICR\t\t: %08x\n",  apic_read(APIC_ICR));
+       seq_printf(seq, " ICR2\t\t: %08x\n",  apic_read(APIC_ICR2));
+       seq_printf(seq, " LVTT\t\t: %08x\n",  apic_read(APIC_LVTT));
+       seq_printf(seq, " LVTTHMR\t: %08x\n",  apic_read(APIC_LVTTHMR));
+       seq_printf(seq, " LVTPC\t\t: %08x\n",  apic_read(APIC_LVTPC));
+       seq_printf(seq, " LVT0\t\t: %08x\n",  apic_read(APIC_LVT0));
+       seq_printf(seq, " LVT1\t\t: %08x\n",  apic_read(APIC_LVT1));
+       seq_printf(seq, " LVTERR\t\t: %08x\n",  apic_read(APIC_LVTERR));
+       seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));
+       seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));
+       seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR));
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+       seq_printf(seq, "\n MSR\t:\n");
+}
+
+static int cpu_seq_show(struct seq_file *seq, void *v)
+{
+       struct cpu_private *priv = seq->private;
+
+       if (priv == NULL)
+               return -EINVAL;
+
+       switch (cpu_base[priv->type].flag) {
+       case CPU_TSS:
+               smp_call_function_single(priv->cpu, print_tss, seq, 1);
+               break;
+       case CPU_CR:
+               smp_call_function_single(priv->cpu, print_cr, seq, 1);
+               break;
+       case CPU_DT:
+               smp_call_function_single(priv->cpu, print_dt, seq, 1);
+               break;
+       case CPU_DEBUG:
+               if (priv->file == CPU_INDEX_BIT)
+                       smp_call_function_single(priv->cpu, print_dr, seq, 1);
+               print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+               break;
+       case CPU_APIC:
+               if (priv->file == CPU_INDEX_BIT)
+                       smp_call_function_single(priv->cpu, print_apic, seq, 1);
+               print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+               break;
+
+       default:
+               print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
+               break;
+       }
+       seq_printf(seq, "\n");
+
+       return 0;
+}
+
+static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       if (*pos == 0) /* One time is enough ;-) */
+               return seq;
+
+       return NULL;
+}
+
+static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       (*pos)++;
+
+       return cpu_seq_start(seq, pos);
+}
+
+static void cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations cpu_seq_ops = {
+       .start          = cpu_seq_start,
+       .next           = cpu_seq_next,
+       .stop           = cpu_seq_stop,
+       .show           = cpu_seq_show,
+};
+
+static int cpu_seq_open(struct inode *inode, struct file *file)
+{
+       struct cpu_private *priv = inode->i_private;
+       struct seq_file *seq;
+       int err;
+
+       err = seq_open(file, &cpu_seq_ops);
+       if (!err) {
+               seq = file->private_data;
+               seq->private = priv;
+       }
+
+       return err;
+}
+
+static int write_msr(struct cpu_private *priv, u64 val)
+{
+       u32 low, high;
+
+       high = (val >> 32) & 0xffffffff;
+       low = val & 0xffffffff;
+
+       if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
+               return 0;
+
+       return -EPERM;
+}
+
+static int write_cpu_register(struct cpu_private *priv, const char *buf)
+{
+       int ret = -EPERM;
+       u64 val;
+
+       ret = strict_strtoull(buf, 0, &val);
+       if (ret < 0)
+               return ret;
+
+       /* Supporting only MSRs */
+       if (priv->type < CPU_TSS_BIT)
+               return write_msr(priv, val);
+
+       return ret;
+}
+
+static ssize_t cpu_write(struct file *file, const char __user *ubuf,
+                            size_t count, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct cpu_private *priv = seq->private;
+       char buf[19];
+
+       if ((priv == NULL) || (count >= sizeof(buf)))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, count))
+               return -EFAULT;
+
+       buf[count] = 0;
+
+       if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
+               if (!write_cpu_register(priv, buf))
+                       return count;
+
+       return -EACCES;
+}
+
+static const struct file_operations cpu_fops = {
+       .owner          = THIS_MODULE,
+       .open           = cpu_seq_open,
+       .read           = seq_read,
+       .write          = cpu_write,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
+static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
+                          unsigned file, struct dentry *dentry)
+{
+       struct cpu_private *priv = NULL;
+
+       /* Already intialized */
+       if (file == CPU_INDEX_BIT)
+               if (per_cpu(cpu_arr[type].init, cpu))
+                       return 0;
+
+       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+       if (priv == NULL)
+               return -ENOMEM;
+
+       priv->cpu = cpu;
+       priv->type = type;
+       priv->reg = reg;
+       priv->file = file;
+       mutex_lock(&cpu_debug_lock);
+       per_cpu(priv_arr[type], cpu) = priv;
+       per_cpu(cpu_priv_count, cpu)++;
+       mutex_unlock(&cpu_debug_lock);
+
+       if (file)
+               debugfs_create_file(cpu_file[file].name, S_IRUGO,
+                                   dentry, (void *)priv, &cpu_fops);
+       else {
+               debugfs_create_file(cpu_base[type].name, S_IRUGO,
+                                   per_cpu(cpu_arr[type].dentry, cpu),
+                                   (void *)priv, &cpu_fops);
+               mutex_lock(&cpu_debug_lock);
+               per_cpu(cpu_arr[type].init, cpu) = 1;
+               mutex_unlock(&cpu_debug_lock);
+       }
+
+       return 0;
+}
+
+static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
+                            struct dentry *dentry)
+{
+       unsigned file;
+       int err = 0;
+
+       for (file = 0; file <  ARRAY_SIZE(cpu_file); file++) {
+               err = cpu_create_file(cpu, type, reg, file, dentry);
+               if (err)
+                       return err;
+       }
+
+       return err;
+}
+
+static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
+{
+       struct dentry *cpu_dentry = NULL;
+       unsigned reg, reg_min, reg_max;
+       int i, range, err = 0;
+       char reg_dir[12];
+       u32 low, high;
+
+       range = get_cpu_range_count(cpu);
+
+       for (i = 0; i < range; i++) {
+               if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
+                                  cpu_base[type].flag))
+                       continue;
+
+               for (reg = reg_min; reg <= reg_max; reg++) {
+                       if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
+                               continue;
+
+                       sprintf(reg_dir, "0x%x", reg);
+                       cpu_dentry = debugfs_create_dir(reg_dir, dentry);
+                       err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
+                       if (err)
+                               return err;
+               }
+       }
+
+       return err;
+}
+
+static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
+{
+       struct dentry *cpu_dentry = NULL;
+       unsigned type;
+       int err = 0;
+
+       for (type = 0; type <  ARRAY_SIZE(cpu_base) - 1; type++) {
+               if (!is_typeflag_valid(cpu, cpu_base[type].flag))
+                       continue;
+               cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
+               per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
+
+               if (type < CPU_TSS_BIT)
+                       err = cpu_init_msr(cpu, type, cpu_dentry);
+               else
+                       err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
+                                             cpu_dentry);
+               if (err)
+                       return err;
+       }
+
+       return err;
+}
+
+static int cpu_init_cpu(void)
+{
+       struct dentry *cpu_dentry = NULL;
+       struct cpuinfo_x86 *cpui;
+       char cpu_dir[12];
+       unsigned cpu;
+       int err = 0;
+
+       for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+               cpui = &cpu_data(cpu);
+               if (!cpu_has(cpui, X86_FEATURE_MSR))
+                       continue;
+               per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
+                                          (cpui->x86 << 8) |
+                                          (cpui->x86_model));
+               per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
+
+               sprintf(cpu_dir, "cpu%d", cpu);
+               cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
+               err = cpu_init_allreg(cpu, cpu_dentry);
+
+               pr_info("cpu%d(%d) debug files %d\n",
+                       cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
+               if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
+                       pr_err("Register files count %d exceeds limit %d\n",
+                               per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
+                       per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
+                       err = -ENFILE;
+               }
+               if (err)
+                       return err;
+       }
+
+       return err;
+}
+
+static int __init cpu_debug_init(void)
+{
+       cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
+
+       return cpu_init_cpu();
+}
+
+static void __exit cpu_debug_exit(void)
+{
+       int i, cpu;
+
+       if (cpu_debugfs_dir)
+               debugfs_remove_recursive(cpu_debugfs_dir);
+
+       for (cpu = 0; cpu <  nr_cpu_ids; cpu++)
+               for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
+                       kfree(per_cpu(priv_arr[i], cpu));
+}
+
+module_init(cpu_debug_init);
+module_exit(cpu_debug_exit);
+
+MODULE_AUTHOR("Jaswinder Singh Rajput");
+MODULE_DESCRIPTION("CPU Debug module");
+MODULE_LICENSE("GPL");
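For context on how the tree built above is meant to be consumed: cpu_init_cpu() creates one cpu%d directory per CPU, cpu_init_msr() one 0x%x directory per readable MSR, and cpu_create_file() the leaf files, all under the arch debugfs directory. A minimal userspace sketch of reading one of those files follows; it assumes debugfs is mounted at /sys/kernel/debug, and the "msr" and "value" path components are stand-ins for cpu_base[].name and cpu_file[].name, which are defined earlier in cpu_debug.c and not visible in this hunk.

/*
 * Illustrative only: read one MSR value through the files created by
 * cpu_create_file() above.  "msr" and "value" are placeholder names.
 */
#include <stdio.h>

int main(void)
{
        char path[128], buf[64];
        unsigned int cpu = 0, reg = 0x1b;       /* IA32_APIC_BASE, as an example */
        FILE *f;

        /* "cpu%d" and "0x%x" mirror the sprintf() calls in cpu_init_cpu()
         * and cpu_init_msr() above. */
        snprintf(path, sizeof(path),
                 "/sys/kernel/debug/x86/cpu/cpu%u/msr/0x%x/value", cpu, reg);

        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("cpu%u msr 0x%x: %s", cpu, reg, buf);
        fclose(f);
        return 0;
}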
index ffd0f5ed071a705e5f55e279ac12f0c2d0cf9419..593171e967ef2009a8f109e8a1f9f81f63c469c5 100644 (file)
@@ -61,23 +61,23 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
  */
 static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
 
-static char Cx86_model[][9] __cpuinitdata = {
+static const char __cpuinitconst Cx86_model[][9] = {
        "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
        "M II ", "Unknown"
 };
-static char Cx486_name[][5] __cpuinitdata = {
+static const char __cpuinitconst Cx486_name[][5] = {
        "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
        "SRx2", "DRx2"
 };
-static char Cx486S_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486S_name[][4] = {
        "S", "S2", "Se", "S2e"
 };
-static char Cx486D_name[][4] __cpuinitdata = {
+static const char __cpuinitconst Cx486D_name[][4] = {
        "DX", "DX2", "?", "?", "?", "DX4"
 };
 static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __cpuinitdata = "12??43";
-static char cyrix_model_mult2[] __cpuinitdata = "12233445";
+static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
+static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
 
 /*
  * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -435,7 +435,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
        }
 }
 
-static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
        .c_vendor       = "Cyrix",
        .c_ident        = { "CyrixInstead" },
        .c_early_init   = early_init_cyrix,
@@ -446,7 +446,7 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
 
 cpu_dev_register(cyrix_cpu_dev);
 
-static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
        .c_vendor       = "NSC",
        .c_ident        = { "Geode by NSC" },
        .c_init         = init_nsc,
index 1a89a2b68d1539a92939e4d33747a1bcd916390c..7437fa133c02dfdde137ca7201fb9734888a85bc 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/uaccess.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/topology.h>
@@ -54,6 +55,11 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
                c->x86_cache_alignment = 128;
 #endif
 
+       /* CPUID workaround for 0F33/0F34 CPU */
+       if (c->x86 == 0xF && c->x86_model == 0x3
+           && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+               c->x86_phys_bits = 36;
+
        /*
         * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
         * with P/T states and does not stop in deep C-states.
@@ -116,6 +122,28 @@ static void __cpuinit trap_init_f00f_bug(void)
 }
 #endif
 
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+       /* is this call from identify_secondary_cpu()? */
+       if (c->cpu_index == boot_cpu_id)
+               return;
+
+       /*
+        * Mask B, Pentium, but not Pentium MMX
+        */
+       if (c->x86 == 5 &&
+           c->x86_mask >= 1 && c->x86_mask <= 4 &&
+           c->x86_model <= 3) {
+               /*
+                * Remember we have B step Pentia with bugs
+                */
+               WARN_ONCE(1, "WARNING: SMP operation may be unreliable "
+                                   "with B stepping processors.\n");
+       }
+#endif
+}
+
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 {
        unsigned long lo, hi;
@@ -192,6 +220,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_NUMAQ
        numaq_tsc_disable();
 #endif
+
+       intel_smp_check(c);
 }
 #else
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -391,7 +421,7 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
 }
 #endif
 
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
        .c_vendor       = "Intel",
        .c_ident        = { "GenuineIntel" },
 #ifdef CONFIG_X86_32
index 7293508d8f5c82e4cd4f196c59e28ca332747bad..c471eb1a389cc02c4f788d5a828f3f4bfe56c93d 100644 (file)
@@ -32,7 +32,7 @@ struct _cache_table
 };
 
 /* all the cache descriptor types we care about (no TLB or trace cache entries) */
-static struct _cache_table cache_table[] __cpuinitdata =
+static const struct _cache_table __cpuinitconst cache_table[] =
 {
        { 0x06, LVL_1_INST, 8 },        /* 4-way set assoc, 32 byte line size */
        { 0x08, LVL_1_INST, 16 },       /* 4-way set assoc, 32 byte line size */
@@ -206,15 +206,15 @@ union l3_cache {
        unsigned val;
 };
 
-static unsigned short assocs[] __cpuinitdata = {
+static const unsigned short __cpuinitconst assocs[] = {
        [1] = 1, [2] = 2, [4] = 4, [6] = 8,
        [8] = 16, [0xa] = 32, [0xb] = 48,
        [0xc] = 64,
        [0xf] = 0xffff // ??
 };
 
-static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
-static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
+static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
+static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
 
 static void __cpuinit
 amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
index d7d2323bbb6976ffa603246bd06845d280150371..b2f89829bbe824d2cecfd74d8c09b2923406493b 100644 (file)
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32)            += k7.o p4.o p5.o p6.o winchip.o
 obj-$(CONFIG_X86_MCE_INTEL)    += mce_intel_64.o
 obj-$(CONFIG_X86_MCE_AMD)      += mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
index dfaebce3633e3f9b8415e79f29ad9f012276825a..3552119b091da51e65ce933c9852eee694920b9b 100644 (file)
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
        }
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-       old_cr4 = read_cr4();
-       clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-       if (old_cr4 & X86_CR4_MCE)
-               set_in_cr4(X86_CR4_MCE);
-}
-
 static int __init mcheck_disable(char *str)
 {
        mce_disabled = 1;
index fe79985ce0f2f6fb4ae23f3bc92a791994ed9f62..ca14604611ec7ccc6217afc008e50d88af6ad88f 100644 (file)
@@ -3,6 +3,8 @@
  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  * Rest from unknown author(s).
  * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -24,6 +26,9 @@
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -32,7 +37,6 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
 
 atomic_t mce_entry;
 
@@ -47,7 +51,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+       [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/* Do basic initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+       memset(m, 0, sizeof(struct mce));
+       m->cpu = smp_processor_id();
+       rdtscll(m->tsc);
+}
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
                        print_symbol("{%s}", m->ip);
                printk("\n");
        }
-       printk(KERN_EMERG "TSC %Lx ", m->tsc);
+       printk(KERN_EMERG "TSC %llx ", m->tsc);
        if (m->addr)
-               printk("ADDR %Lx ", m->addr);
+               printk("ADDR %llx ", m->addr);
        if (m->misc)
-               printk("MISC %Lx ", m->misc);
+               printk("MISC %llx ", m->misc);
        printk("\n");
        printk(KERN_EMERG "This is not a software problem!\n");
        printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
        panic(msg);
 }
 
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
 {
+       if (mce_dont_init)
+               return 0;
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 }
 
 /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+       struct mce m;
+       int i;
+
+       mce_setup(&m);
+
+       rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+       for (i = 0; i < banks; i++) {
+               if (!bank[i] || !test_bit(i, *b))
+                       continue;
+
+               m.misc = 0;
+               m.addr = 0;
+               m.bank = i;
+               m.tsc = 0;
+
+               barrier();
+               rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+               if (!(m.status & MCI_STATUS_VAL))
+                       continue;
+
+               /*
+                * Uncorrected events are handled by the exception handler
+                * when it is enabled. But when the exception is disabled, log
+                * everything.
+                *
+                * TBD do the same check for MCI_STATUS_EN here?
+                */
+               if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+                       continue;
+
+               if (m.status & MCI_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+               if (m.status & MCI_STATUS_ADDRV)
+                       rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+               if (!(flags & MCP_TIMESTAMP))
+                       m.tsc = 0;
+               /*
+                * Don't get the IP here because it's unlikely to
+                * have anything to do with the actual error location.
+                */
+
+               mce_log(&m);
+               add_taint(TAINT_MACHINE_CHECK);
+
+               /*
+                * Clear state for this bank.
+                */
+               wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       }
+
+       /*
+        * Don't clear MCG_STATUS here because it's only defined for
+        * exceptions.
+        */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
  */
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
         * error.
         */
        int kill_it = 0;
+       DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 
        atomic_inc(&mce_entry);
 
-       if ((regs
-            && notify_die(DIE_NMI, "machine check", regs, error_code,
+       if (notify_die(DIE_NMI, "machine check", regs, error_code,
                           18, SIGKILL) == NOTIFY_STOP)
-           || !banks)
+               goto out2;
+       if (!banks)
                goto out2;
 
-       memset(&m, 0, sizeof(struct mce));
-       m.cpu = smp_processor_id();
+       mce_setup(&m);
+
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        /* if the restart IP is not valid, we're done for */
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
        barrier();
 
        for (i = 0; i < banks; i++) {
-               if (i < NR_SYSFS_BANKS && !bank[i])
+               __clear_bit(i, toclear);
+               if (!bank[i])
                        continue;
 
                m.misc = 0;
                m.addr = 0;
                m.bank = i;
-               m.tsc = 0;
 
                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;
 
+               /*
+                * Errors without the UC bit set are corrected errors; they
+                * are handled by machine_check_poll(). Leave them alone.
+                */
+               if ((m.status & MCI_STATUS_UC) == 0)
+                       continue;
+
+               /*
+                * Set taint even when machine check was not enabled.
+                */
+               add_taint(TAINT_MACHINE_CHECK);
+
+               __set_bit(i, toclear);
+
                if (m.status & MCI_STATUS_EN) {
                        /* if PCC was set, there's no way out */
                        no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                                        no_way_out = 1;
                                kill_it = 1;
                        }
+               } else {
+                       /*
+                        * Machine check event was not enabled. Clear, but
+                        * ignore.
+                        */
+                       continue;
                }
 
                if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
                mce_get_rip(&m, regs);
-               if (error_code >= 0)
-                       rdtscll(m.tsc);
-               if (error_code != -2)
-                       mce_log(&m);
+               mce_log(&m);
 
                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
                        panicm = m;
                        panicm_found = 1;
                }
-
-               add_taint(TAINT_MACHINE_CHECK);
        }
 
-       /* Never do anything final in the polling timer */
-       if (!regs)
-               goto out;
-
        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);
 
- out:
        /* the last thing we do is clear state */
-       for (i = 0; i < banks; i++)
-               wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       for (i = 0; i < banks; i++) {
+               if (test_bit(i, toclear))
+                       wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+       }
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
        atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  * and historically has been the register value of the
  * MSR_IA32_THERMAL_STATUS (Intel) msr.
  */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
 {
        struct mce m;
 
-       memset(&m, 0, sizeof(m));
-       m.cpu = cpu;
+       mce_setup(&m);
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
-       rdtscll(m.tsc);
        mce_log(&m);
 }
 #endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 
 static int check_interval = 5 * 60; /* 5 minutes */
 static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
 {
-       if (mce_available(&current_cpu_data))
-               do_machine_check(NULL, 0);
-}
+       struct timer_list *t = &per_cpu(mce_timer, data);
 
-static void mcheck_timer(struct work_struct *work)
-{
-       on_each_cpu(mcheck_check_cpu, NULL, 1);
+       WARN_ON(smp_processor_id() != data);
+
+       if (mce_available(&current_cpu_data))
+               machine_check_poll(MCP_TIMESTAMP,
+                               &__get_cpu_var(mce_poll_banks));
 
        /*
         * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
                                (int)round_jiffies_relative(check_interval*HZ));
        }
 
-       schedule_delayed_work(&mcheck_work, next_interval);
+       t->expires = jiffies + next_interval;
+       add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+       call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 }
 
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
 /*
- * This is only called from process context.  This is where we do
- * anything we need to alert userspace about new MCEs.  This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
  */
 int mce_notify_user(void)
 {
+       /* Not more than two messages every minute */
+       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
        clear_thread_flag(TIF_MCE_NOTIFY);
        if (test_and_clear_bit(0, &notify_user)) {
-               static unsigned long last_print;
-               unsigned long now = jiffies;
-
                wake_up_interruptible(&mce_wait);
-               if (trigger[0])
-                       call_usermodehelper(trigger, trigger_argv, NULL,
-                                               UMH_NO_WAIT);
 
-               if (time_after_eq(now, last_print + (check_interval*HZ))) {
-                       last_print = now;
+               /*
+                * There is no risk of missing notifications because
+                * work_pending is always cleared before the function is
+                * executed.
+                */
+               if (trigger[0] && !work_pending(&mce_trigger_work))
+                       schedule_work(&mce_trigger_work);
+
+               if (__ratelimit(&ratelimit))
                        printk(KERN_INFO "Machine check events logged\n");
-               }
 
                return 1;
        }
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
 
 static __init int periodic_mcheck_init(void)
 {
-       next_interval = check_interval * HZ;
-       if (next_interval)
-               schedule_delayed_work(&mcheck_work,
-                                     round_jiffies_relative(next_interval));
-       idle_notifier_register(&mce_idle_notifier);
-       return 0;
+       idle_notifier_register(&mce_idle_notifier);
+       return 0;
 }
 __initcall(periodic_mcheck_init);
 
-
 /*
  * Initialize Machine Checks for a CPU.
  */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
 {
        u64 cap;
-       int i;
+       unsigned b;
 
        rdmsrl(MSR_IA32_MCG_CAP, cap);
-       banks = cap & 0xff;
-       if (banks > MCE_EXTENDED_BANK) {
-               banks = MCE_EXTENDED_BANK;
-               printk(KERN_INFO "MCE: warning: using only %d banks\n",
-                      MCE_EXTENDED_BANK);
+       b = cap & 0xff;
+       if (b > MAX_NR_BANKS) {
+               printk(KERN_WARNING
+                      "MCE: Using only %u machine check banks out of %u\n",
+                       MAX_NR_BANKS, b);
+               b = MAX_NR_BANKS;
        }
+
+       /* Don't support asymmetric configurations today */
+       WARN_ON(banks != 0 && b != banks);
+       banks = b;
+       if (!bank) {
+               bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+               if (!bank)
+                       return -ENOMEM;
+               memset(bank, 0xff, banks * sizeof(u64));
+       }
+
        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;
 
-       /* Log the machine checks left over from the previous reset.
-          This also clears all registers */
-       do_machine_check(NULL, mce_bootlog ? -1 : -2);
+       return 0;
+}
+
+static void mce_init(void *dummy)
+{
+       u64 cap;
+       int i;
+       mce_banks_t all_banks;
+
+       /*
+        * Log the machine checks left over from the previous reset.
+        */
+       bitmap_fill(all_banks, MAX_NR_BANKS);
+       machine_check_poll(MCP_UC, &all_banks);
 
        set_in_cr4(X86_CR4_MCE);
 
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
        for (i = 0; i < banks; i++) {
-               if (i < NR_SYSFS_BANKS)
-                       wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-               else
-                       wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+               wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
 }
 
 /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
-               if(c->x86 == 15)
+               if (c->x86 == 15 && banks > 4)
                        /* disable GART TBL walk error reporting, which trips off
                           incorrectly with the IOMMU & 3ware & Cerberus. */
-                       clear_bit(10, &bank[4]);
+                       clear_bit(10, (unsigned long *)&bank[4]);
                if(c->x86 <= 17 && mce_bootlog < 0)
                        /* Lots of broken BIOS around that don't clear them
                           by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
        }
 }
 
+static void mce_init_timer(void)
+{
+       struct timer_list *t = &__get_cpu_var(mce_timer);
+
+       /* data race harmless because everyone sets to the same value */
+       if (!next_interval)
+               next_interval = check_interval * HZ;
+       if (!next_interval)
+               return;
+       setup_timer(t, mcheck_timer, smp_processor_id());
+       t->expires = round_jiffies(jiffies + next_interval);
+       add_timer(t);
+}
+
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off.
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-       mce_cpu_quirks(c);
+       if (!mce_available(c))
+               return;
 
-       if (mce_dont_init ||
-           !mce_available(c))
+       if (mce_cap_init() < 0) {
+               mce_dont_init = 1;
                return;
+       }
+       mce_cpu_quirks(c);
 
        mce_init(NULL);
        mce_cpu_features(c);
+       mce_init_timer();
 }
 
 /*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 {
        unsigned long *cpu_tsc;
        static DEFINE_MUTEX(mce_read_mutex);
-       unsigned next;
+       unsigned prev, next;
        char __user *buf = ubuf;
        int i, err;
 
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
        }
 
        err = 0;
-       for (i = 0; i < next; i++) {
-               unsigned long start = jiffies;
-
-               while (!mcelog.entry[i].finished) {
-                       if (time_after_eq(jiffies, start + 2)) {
-                               memset(mcelog.entry + i,0, sizeof(struct mce));
-                               goto timeout;
+       prev = 0;
+       do {
+               for (i = prev; i < next; i++) {
+                       unsigned long start = jiffies;
+
+                       while (!mcelog.entry[i].finished) {
+                               if (time_after_eq(jiffies, start + 2)) {
+                                       memset(mcelog.entry + i, 0,
+                                              sizeof(struct mce));
+                                       goto timeout;
+                               }
+                               cpu_relax();
                        }
-                       cpu_relax();
+                       smp_rmb();
+                       err |= copy_to_user(buf, mcelog.entry + i,
+                                           sizeof(struct mce));
+                       buf += sizeof(struct mce);
+timeout:
+                       ;
                }
-               smp_rmb();
-               err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-               buf += sizeof(struct mce);
- timeout:
-               ;
-       }
 
-       memset(mcelog.entry, 0, next * sizeof(struct mce));
-       mcelog.next = 0;
+               memset(mcelog.entry + prev, 0,
+                      (next - prev) * sizeof(struct mce));
+               prev = next;
+               next = cmpxchg(&mcelog.next, prev, 0);
+       } while (next != prev);
 
        synchronize_sched();
 
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
        &mce_chrdev_ops,
 };
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-       old_cr4 = read_cr4();
-       clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-       if (old_cr4 & X86_CR4_MCE)
-               set_in_cr4(X86_CR4_MCE);
-}
-
 /*
  * Old style boot options parsing. Only for compatibility.
  */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
        return 1;
 }
 
-/* mce=off disables machine check. Note you can re-enable it later
-   using sysfs.
+/* mce=off disables machine check.
    mce=TOLERANCELEVEL (number, see above)
    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
    mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
  * Sysfs support
  */
 
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+       int i;
+
+       for (i = 0; i < banks; i++)
+               wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+       return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+       return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+       return mce_disable();
+}
+
 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
    Only one CPU is active at this time, the others get readded later using
    CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
        return 0;
 }
 
+static void mce_cpu_restart(void *data)
+{
+       del_timer_sync(&__get_cpu_var(mce_timer));
+       if (mce_available(&current_cpu_data))
+               mce_init(NULL);
+       mce_init_timer();
+}
+
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-       if (next_interval)
-               cancel_delayed_work(&mcheck_work);
-       /* Timer race is harmless here */
-       on_each_cpu(mce_init, NULL, 1);
        next_interval = check_interval * HZ;
-       if (next_interval)
-               schedule_delayed_work(&mcheck_work,
-                                     round_jiffies_relative(next_interval));
+       on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
 static struct sysdev_class mce_sysclass = {
+       .suspend = mce_suspend,
+       .shutdown = mce_shutdown,
        .resume = mce_resume,
        .name = "machinecheck",
 };
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
        }                                                               \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+                        char *buf)
+{
+       u64 b = bank[attr - bank_attrs];
+       return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+                       const char *buf, size_t siz)
+{
+       char *end;
+       u64 new = simple_strtoull(buf, &end, 0);
+       if (end == buf)
+               return -EINVAL;
+       bank[attr - bank_attrs] = new;
+       mce_restart();
+       return end-buf;
+}
 
 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
                                char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
-       &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-       &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
        &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
        NULL
 };
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
                if (err)
                        goto error;
        }
+       for (i = 0; i < banks; i++) {
+               err = sysdev_create_file(&per_cpu(device_mce, cpu),
+                                       &bank_attrs[i]);
+               if (err)
+                       goto error2;
+       }
        cpu_set(cpu, mce_device_initialized);
 
        return 0;
+error2:
+       while (--i >= 0) {
+               sysdev_remove_file(&per_cpu(device_mce, cpu),
+                                       &bank_attrs[i]);
+       }
 error:
-       while (i--) {
+       while (--i >= 0) {
                sysdev_remove_file(&per_cpu(device_mce,cpu),
                                   mce_attributes[i]);
        }
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
        for (i = 0; mce_attributes[i]; i++)
                sysdev_remove_file(&per_cpu(device_mce,cpu),
                        mce_attributes[i]);
+       for (i = 0; i < banks; i++)
+               sysdev_remove_file(&per_cpu(device_mce, cpu),
+                       &bank_attrs[i]);
        sysdev_unregister(&per_cpu(device_mce,cpu));
        cpu_clear(cpu, mce_device_initialized);
 }
 
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+       int i;
+       unsigned long action = *(unsigned long *)h;
+
+       if (!mce_available(&current_cpu_data))
+               return;
+       if (!(action & CPU_TASKS_FROZEN))
+               cmci_clear();
+       for (i = 0; i < banks; i++)
+               wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+       int i;
+       unsigned long action = *(unsigned long *)h;
+
+       if (!mce_available(&current_cpu_data))
+               return;
+       if (!(action & CPU_TASKS_FROZEN))
+               cmci_reenable();
+       for (i = 0; i < banks; i++)
+               wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
                                      unsigned long action, void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
+       struct timer_list *t = &per_cpu(mce_timer, cpu);
 
        switch (action) {
        case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
                        threshold_cpu_callback(action, cpu);
                mce_remove_device(cpu);
                break;
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               del_timer_sync(t);
+               smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+               break;
+       case CPU_DOWN_FAILED:
+       case CPU_DOWN_FAILED_FROZEN:
+               t->expires = round_jiffies(jiffies + next_interval);
+               add_timer_on(t, cpu);
+               smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+               break;
+       case CPU_POST_DEAD:
+               /* intentionally ignoring frozen here */
+               cmci_rediscover(cpu);
+               break;
        }
        return NOTIFY_OK;
 }
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
        .notifier_call = mce_cpu_callback,
 };
 
+static __init int mce_init_banks(void)
+{
+       int i;
+
+       bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+                               GFP_KERNEL);
+       if (!bank_attrs)
+               return -ENOMEM;
+
+       for (i = 0; i < banks; i++) {
+               struct sysdev_attribute *a = &bank_attrs[i];
+               a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+               if (!a->attr.name)
+                       goto nomem;
+               a->attr.mode = 0644;
+               a->show = show_bank;
+               a->store = set_bank;
+       }
+       return 0;
+
+nomem:
+       while (--i >= 0)
+               kfree(bank_attrs[i].attr.name);
+       kfree(bank_attrs);
+       bank_attrs = NULL;
+       return -ENOMEM;
+}
+
 static __init int mce_init_device(void)
 {
        int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
 
        if (!mce_available(&boot_cpu_data))
                return -EIO;
+
+       err = mce_init_banks();
+       if (err)
+               return err;
+
        err = sysdev_class_register(&mce_sysclass);
        if (err)
                return err;
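With mce_init_banks() wired into mce_init_device() above, each CPU's machinecheck sysfs directory (typically /sys/devices/system/machinecheck/machinecheck<N>, depending on how sysfs is laid out) carries one bank<i> attribute per bank reported by MCG_CAP, instead of the old fixed six bank*ctl files. Reads go through show_bank(); writes are parsed by simple_strtoull() in set_bank(), so hex masks are accepted, and take effect via mce_restart().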
index 9817506dd4698c6578646f04100836dd6ad435c8..7d01be868870d1a7922c7ec18fc7a8b666ddb25b 100644 (file)
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
 
 static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
 
+static void amd_threshold_interrupt(void);
+
 /*
  * CPU Initialization
  */
@@ -90,7 +92,8 @@ struct thresh_restart {
 };
 
 /* must be called with correct cpu affinity */
-static long threshold_restart_bank(void *_tr)
+/* Called via smp_call_function_single() */
+static void threshold_restart_bank(void *_tr)
 {
        struct thresh_restart *tr = _tr;
        u32 mci_misc_hi, mci_misc_lo;
@@ -117,7 +120,6 @@ static long threshold_restart_bank(void *_tr)
 
        mci_misc_hi |= MASK_COUNT_EN_HI;
        wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
-       return 0;
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                        tr.reset = 0;
                        tr.old_limit = 0;
                        threshold_restart_bank(&tr);
+
+                       mce_threshold_vector = amd_threshold_interrupt;
                }
        }
 }
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
  * the interrupt goes off when error_count reaches threshold_limit.
  * the handler will simply log mcelog w/ software defined bank number.
  */
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
 {
        unsigned int bank, block;
        struct mce m;
        u32 low = 0, high = 0, address = 0;
 
-       ack_APIC_irq();
-       exit_idle();
-       irq_enter();
-
-       memset(&m, 0, sizeof(m));
-       rdtscll(m.tsc);
-       m.cpu = smp_processor_id();
+       mce_setup(&m);
 
        /* assume first bank caused it */
        for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
 
                        /* Log the machine check that caused the threshold
                           event. */
-                       do_machine_check(NULL, 0);
+                       machine_check_poll(MCP_TIMESTAMP,
+                                       &__get_cpu_var(mce_poll_banks));
 
                        if (high & MASK_OVERFLOW_HI) {
                                rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
                                       + bank * NR_BLOCKS
                                       + block;
                                mce_log(&m);
-                               goto out;
+                               return;
                        }
                }
        }
-out:
-       inc_irq_stat(irq_threshold_count);
-       irq_exit();
 }
 
 /*
@@ -283,7 +279,7 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
        tr.b = b;
        tr.reset = 0;
        tr.old_limit = 0;
-       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+       smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
        return end - buf;
 }
@@ -305,23 +301,32 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
        tr.b = b;
        tr.reset = 0;
 
-       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+       smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
 
        return end - buf;
 }
 
-static long local_error_count(void *_b)
+struct threshold_block_cross_cpu {
+       struct threshold_block *tb;
+       long retval;
+};
+
+static void local_error_count_handler(void *_tbcc)
 {
-       struct threshold_block *b = _b;
+       struct threshold_block_cross_cpu *tbcc = _tbcc;
+       struct threshold_block *b = tbcc->tb;
        u32 low, high;
 
        rdmsr(b->address, low, high);
-       return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+       tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
 }
 
 static ssize_t show_error_count(struct threshold_block *b, char *buf)
 {
-       return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
+       struct threshold_block_cross_cpu tbcc = { .tb = b, };
+
+       smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
+       return sprintf(buf, "%lx\n", tbcc.retval);
 }
 
 static ssize_t store_error_count(struct threshold_block *b,
@@ -329,7 +334,7 @@ static ssize_t store_error_count(struct threshold_block *b,
 {
        struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
 
-       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
+       smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
        return 1;
 }
 
@@ -398,7 +403,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
        if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
                return 0;
 
-       if (rdmsr_safe(address, &low, &high))
+       if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
                return 0;
 
        if (!(high & MASK_VALID_HI)) {
@@ -462,12 +467,11 @@ out_free:
        return err;
 }
 
-static __cpuinit long local_allocate_threshold_blocks(void *_bank)
+static __cpuinit long
+local_allocate_threshold_blocks(int cpu, unsigned int bank)
 {
-       unsigned int *bank = _bank;
-
-       return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
-                                        MSR_IA32_MC0_MISC + *bank * 4);
+       return allocate_threshold_blocks(cpu, bank, 0,
+                                        MSR_IA32_MC0_MISC + bank * 4);
 }
 
 /* symlinks sibling shared banks to first core.  first core owns dir/files. */
@@ -530,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
        per_cpu(threshold_banks, cpu)[bank] = b;
 
-       err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
+       err = local_allocate_threshold_blocks(cpu, bank);
        if (err)
                goto out_free;
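Note the recurring change in this file: work_on_cpu() could return a value from the remote CPU, while smp_call_function_single() callbacks return void. That is why threshold_restart_bank() loses its return value and show_error_count() now passes the count back through the on-stack struct threshold_block_cross_cpu.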
 
index aa5e287c98e01334565a241d7b89e768c403985f..57df3d383470bb158a7d64bccb7948d45ec9e4ac 100644 (file)
@@ -1,6 +1,8 @@
 /*
  * Intel specific MCE features.
  * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -13,6 +15,7 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
+#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
 
        rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
        if (therm_throt_process(msr_val & 1))
-               mce_log_therm_throt_event(smp_processor_id(), msr_val);
+               mce_log_therm_throt_event(msr_val);
 
        inc_irq_stat(irq_thermal_count);
        irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
        return;
 }
 
+/*
+ * Support for Intel Corrected Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static int cmci_supported(int *banks)
+{
+       u64 cap;
+
+       /*
+        * Vendor check is not strictly needed, but the initialization
+        * is vendor keyed and this makes sure none of the backdoors
+        * are entered otherwise.
+        */
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return 0;
+       if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+               return 0;
+       rdmsrl(MSR_IA32_MCG_CAP, cap);
+       *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+       return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+       machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+       mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+       if (*hdr == 0)
+               printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+       *hdr = 1;
+       printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks, int boot)
+{
+       unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+       int hdr = 0;
+       int i;
+
+       spin_lock(&cmci_discover_lock);
+       for (i = 0; i < banks; i++) {
+               u64 val;
+
+               if (test_bit(i, owned))
+                       continue;
+
+               rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+               /* Already owned by someone else? */
+               if (val & CMCI_EN) {
+                       if (test_and_clear_bit(i, owned) || boot)
+                               print_update("SHD", &hdr, i);
+                       __clear_bit(i, __get_cpu_var(mce_poll_banks));
+                       continue;
+               }
+
+               val |= CMCI_EN | CMCI_THRESHOLD;
+               wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+               rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+               /* Did the enable bit stick? -- the bank supports CMCI */
+               if (val & CMCI_EN) {
+                       if (!test_and_set_bit(i, owned) || boot)
+                               print_update("CMCI", &hdr, i);
+                       __clear_bit(i, __get_cpu_var(mce_poll_banks));
+               } else {
+                       WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+               }
+       }
+       spin_unlock(&cmci_discover_lock);
+       if (hdr)
+               printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization, check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+       unsigned long flags;
+       int banks;
+
+       if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+               return;
+       local_irq_save(flags);
+       machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+       local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+       int i;
+       int banks;
+       u64 val;
+
+       if (!cmci_supported(&banks))
+               return;
+       spin_lock(&cmci_discover_lock);
+       for (i = 0; i < banks; i++) {
+               if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+                       continue;
+               /* Disable CMCI */
+               rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+               val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+               wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+               __clear_bit(i, __get_cpu_var(mce_banks_owned));
+       }
+       spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down, cycle through all the others and rediscover
+ * the banks. Must run in process context.
+ */
+void cmci_rediscover(int dying)
+{
+       int banks;
+       int cpu;
+       cpumask_var_t old;
+
+       if (!cmci_supported(&banks))
+               return;
+       if (!alloc_cpumask_var(&old, GFP_KERNEL))
+               return;
+       cpumask_copy(old, &current->cpus_allowed);
+
+       for_each_online_cpu (cpu) {
+               if (cpu == dying)
+                       continue;
+               if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+                       continue;
+               /* Recheck banks in case CPUs don't all have the same number */
+               if (cmci_supported(&banks))
+                       cmci_discover(banks, 0);
+       }
+
+       set_cpus_allowed_ptr(current, old);
+       free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+       int banks;
+       if (cmci_supported(&banks))
+               cmci_discover(banks, 0);
+}
+
+static void intel_init_cmci(void)
+{
+       int banks;
+
+       if (!cmci_supported(&banks))
+               return;
+
+       mce_threshold_vector = intel_threshold_interrupt;
+       cmci_discover(banks, 1);
+       /*
+        * For CPU #0 this runs while the APIC is still disabled, but that's
+        * ok because only the vector is set up. We still do another
+        * check for the banks later for CPU #0 just to make sure
+        * to not miss any events.
+        */
+       apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+       cmci_recheck();
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
        intel_init_thermal(c);
+       intel_init_cmci();
 }
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644 (file)
index 0000000..23ee9e7
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+
+static void default_threshold_interrupt(void)
+{
+       printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+                        THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+       exit_idle();
+       irq_enter();
+       inc_irq_stat(irq_threshold_count);
+       mce_threshold_vector();
+       irq_exit();
+       /* Ack only at the end to avoid potential reentry */
+       ack_APIC_irq();
+}
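This new stub owns the low-level plumbing for the threshold interrupt (exit_idle()/irq_enter(), the irq_threshold_count statistic, and the late ack_APIC_irq()) and dispatches through the mce_threshold_vector pointer. The AMD code above installs amd_threshold_interrupt into it and the Intel CMCI code installs intel_threshold_interrupt, so neither handler duplicates the APIC handling any more.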
index 191fc05336494bcb30c38c1a3e76d452030cd5d7..f4361b56f8e92efab75d05b1f361f1416d9aec03 100644 (file)
@@ -1,3 +1,3 @@
-obj-y          := main.o if.o generic.o state.o
+obj-y          := main.o if.o generic.o state.o cleanup.o
 obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
 
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644 (file)
index 0000000..ce0fe4b
--- /dev/null
@@ -0,0 +1,1101 @@
+/*  MTRR (Memory Type Range Register) cleanup
+
+    Copyright (C) 2009 Yinghai Lu
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public
+    License along with this library; if not, write to the Free
+    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/sort.h>
+
+#include <asm/e820.h>
+#include <asm/mtrr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/kvm_para.h>
+#include "mtrr.h"
+
+/* should be related to the number of MTRR_VAR_RANGES */
+#define RANGE_NUM 256
+
+struct res_range {
+       unsigned long start;
+       unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+                             unsigned long end)
+{
+       /* out of slots */
+       if (nr_range >= RANGE_NUM)
+               return nr_range;
+
+       range[nr_range].start = start;
+       range[nr_range].end = end;
+
+       nr_range++;
+
+       return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+                             unsigned long end)
+{
+       int i;
+
+       /* try to merge it with old one */
+       for (i = 0; i < nr_range; i++) {
+               unsigned long final_start, final_end;
+               unsigned long common_start, common_end;
+
+               if (!range[i].end)
+                       continue;
+
+               common_start = max(range[i].start, start);
+               common_end = min(range[i].end, end);
+               if (common_start > common_end + 1)
+                       continue;
+
+               final_start = min(range[i].start, start);
+               final_end = max(range[i].end, end);
+
+               range[i].start = final_start;
+               range[i].end =  final_end;
+               return nr_range;
+       }
+
+       /* need to add that */
+       return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+       int i, j;
+
+       for (j = 0; j < RANGE_NUM; j++) {
+               if (!range[j].end)
+                       continue;
+
+               if (start <= range[j].start && end >= range[j].end) {
+                       range[j].start = 0;
+                       range[j].end = 0;
+                       continue;
+               }
+
+               if (start <= range[j].start && end < range[j].end &&
+                   range[j].start < end + 1) {
+                       range[j].start = end + 1;
+                       continue;
+               }
+
+
+               if (start > range[j].start && end >= range[j].end &&
+                   range[j].end > start - 1) {
+                       range[j].end = start - 1;
+                       continue;
+               }
+
+               if (start > range[j].start && end < range[j].end) {
+                       /* find the new spare */
+                       for (i = 0; i < RANGE_NUM; i++) {
+                               if (range[i].end == 0)
+                                       break;
+                       }
+                       if (i < RANGE_NUM) {
+                               range[i].end = range[j].end;
+                               range[i].start = end + 1;
+                       } else {
+                               printk(KERN_ERR "ran out of slots in ranges\n");
+                       }
+                       range[j].end = start - 1;
+                       continue;
+               }
+       }
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+       const struct res_range *r1 = x1;
+       const struct res_range *r2 = x2;
+       long start1, start2;
+
+       start1 = r1->start;
+       start2 = r2->start;
+
+       return start1 - start2;
+}
+
+struct var_mtrr_range_state {
+       unsigned long base_pfn;
+       unsigned long size_pfn;
+       mtrr_type type;
+};
+
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
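+/*
+ * Build the list of page-frame ranges mapped write-back by the variable
+ * MTRRs in range_state, subtract the UC/WP ranges and the optional extra
+ * removal window, then sort the result.
+ */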
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+                      unsigned long extra_remove_base,
+                      unsigned long extra_remove_size)
+{
+       unsigned long base, size;
+       mtrr_type type;
+       int i;
+
+       for (i = 0; i < num_var_ranges; i++) {
+               type = range_state[i].type;
+               if (type != MTRR_TYPE_WRBACK)
+                       continue;
+               base = range_state[i].base_pfn;
+               size = range_state[i].size_pfn;
+               nr_range = add_range_with_merge(range, nr_range, base,
+                                               base + size - 1);
+       }
+       if (debug_print) {
+               printk(KERN_DEBUG "After WB checking\n");
+               for (i = 0; i < nr_range; i++)
+                       printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+                                range[i].start, range[i].end + 1);
+       }
+
+       /* take out UC ranges */
+       for (i = 0; i < num_var_ranges; i++) {
+               type = range_state[i].type;
+               if (type != MTRR_TYPE_UNCACHABLE &&
+                   type != MTRR_TYPE_WRPROT)
+                       continue;
+               size = range_state[i].size_pfn;
+               if (!size)
+                       continue;
+               base = range_state[i].base_pfn;
+               if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
+                   (mtrr_state.enabled & 1)) {
+                       /* Var MTRR contains UC entry below 1M? Skip it: */
+                       printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d "
+                               "contains strange UC entry under 1M, check "
+                               "with your system vendor!\n", i);
+                       if (base + size <= (1<<(20-PAGE_SHIFT)))
+                               continue;
+                       size -= (1<<(20-PAGE_SHIFT)) - base;
+                       base = 1<<(20-PAGE_SHIFT);
+               }
+               subtract_range(range, base, base + size - 1);
+       }
+       if (extra_remove_size)
+               subtract_range(range, extra_remove_base,
+                                extra_remove_base + extra_remove_size  - 1);
+
+       /* recount the remaining ranges */
+       nr_range = 0;
+       for (i = 0; i < RANGE_NUM; i++) {
+               if (!range[i].end)
+                       continue;
+               nr_range++;
+       }
+       if  (debug_print) {
+               printk(KERN_DEBUG "After UC checking\n");
+               for (i = 0; i < nr_range; i++)
+                       printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+                                range[i].start, range[i].end + 1);
+       }
+
+       /* sort the ranges */
+       sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+       if  (debug_print) {
+               printk(KERN_DEBUG "After sorting\n");
+               for (i = 0; i < nr_range; i++)
+                       printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+                                range[i].start, range[i].end + 1);
+       }
+
+       /* clear out the unused entries */
+       for (i = nr_range; i < RANGE_NUM; i++)
+               memset(&range[i], 0, sizeof(range[i]));
+
+       return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
+
+#ifdef CONFIG_MTRR_SANITIZER
+
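+/* Total number of page frames covered by the given ranges. */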
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+       unsigned long sum;
+       int i;
+
+       sum = 0;
+       for (i = 0; i < nr_range; i++)
+               sum += range[i].end + 1 - range[i].start;
+
+       return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+       CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+       enable_mtrr_cleanup = 0;
+       return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+       enable_mtrr_cleanup = 1;
+       return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+       debug_print = 1;
+       return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
+struct var_mtrr_state {
+       unsigned long   range_startk;
+       unsigned long   range_sizek;
+       unsigned long   chunk_sizek;
+       unsigned long   gran_sizek;
+       unsigned int    reg;
+};
+
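+/*
+ * Compute the base/mask register pair for a basek/sizek region of the
+ * given type and record it via fill_mtrr_var_range().
+ */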
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+               unsigned char type, unsigned int address_bits)
+{
+       u32 base_lo, base_hi, mask_lo, mask_hi;
+       u64 base, mask;
+
+       if (!sizek) {
+               fill_mtrr_var_range(reg, 0, 0, 0, 0);
+               return;
+       }
+
+       mask = (1ULL << address_bits) - 1;
+       mask &= ~((((u64)sizek) << 10) - 1);
+
+       base  = ((u64)basek) << 10;
+
+       base |= type;
+       mask |= 0x800;
+
+       base_lo = base & ((1ULL<<32) - 1);
+       base_hi = base >> 32;
+
+       mask_lo = mask & ((1ULL<<32) - 1);
+       mask_hi = mask >> 32;
+
+       fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+               unsigned char type)
+{
+       range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+       range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+       range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+       unsigned long basek, sizek;
+       unsigned char type;
+       unsigned int reg;
+
+       for (reg = 0; reg < num_var_ranges; reg++) {
+               basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+               sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+               type = range_state[reg].type;
+
+               set_var_mtrr(reg, basek, sizek, type, address_bits);
+       }
+}
+
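+/*
+ * Scale sizek down to K/M/G units for printing; returns the scaled value
+ * and stores the unit letter in *factorp.
+ */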
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+       char factor;
+       unsigned long base = sizek;
+
+       if (base & ((1<<10) - 1)) {
+               /* not MB alignment */
+               factor = 'K';
+       } else if (base & ((1<<20) - 1)) {
+               factor = 'M';
+               base >>= 10;
+       } else {
+               factor = 'G';
+               base >>= 20;
+       }
+
+       *factorp = factor;
+
+       return base;
+}
+
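+/*
+ * Split a range into naturally aligned power-of-two blocks and record
+ * one variable MTRR of the given type for each block.
+ */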
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+             unsigned long range_sizek, unsigned char type)
+{
+       if (!range_sizek || (reg >= num_var_ranges))
+               return reg;
+
+       while (range_sizek) {
+               unsigned long max_align, align;
+               unsigned long sizek;
+
+               /* Compute the maximum size I can make a range */
+               if (range_startk)
+                       max_align = ffs(range_startk) - 1;
+               else
+                       max_align = 32;
+               align = fls(range_sizek) - 1;
+               if (align > max_align)
+                       align = max_align;
+
+               sizek = 1 << align;
+               if (debug_print) {
+                       char start_factor = 'K', size_factor = 'K';
+                       unsigned long start_base, size_base;
+
+                       start_base = to_size_factor(range_startk,
+                                                        &start_factor),
+                       size_base = to_size_factor(sizek, &size_factor),
+
+                       printk(KERN_DEBUG "Setting variable MTRR %d, "
+                               "base: %ld%cB, range: %ld%cB, type %s\n",
+                               reg, start_base, start_factor,
+                               size_base, size_factor,
+                               (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+                                  ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
+                               );
+               }
+               save_var_mtrr(reg++, range_startk, sizek, type);
+               range_startk += sizek;
+               range_sizek -= sizek;
+               if (reg >= num_var_ranges)
+                       break;
+       }
+       return reg;
+}
+
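+/*
+ * Cover one accumulated range with WB variable MTRRs, rounding to the
+ * gran/chunk sizes.  Any overshoot is either covered by a UC "hole" MTRR
+ * or handled by retrying with a smaller WB block; the return value is the
+ * amount of the following range that ends up covered already.
+ */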
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+                       unsigned long sizek)
+{
+       unsigned long hole_basek, hole_sizek;
+       unsigned long second_basek, second_sizek;
+       unsigned long range0_basek, range0_sizek;
+       unsigned long range_basek, range_sizek;
+       unsigned long chunk_sizek;
+       unsigned long gran_sizek;
+
+       hole_basek = 0;
+       hole_sizek = 0;
+       second_basek = 0;
+       second_sizek = 0;
+       chunk_sizek = state->chunk_sizek;
+       gran_sizek = state->gran_sizek;
+
+       /* align to gran size, to prevent small blocks from using up MTRRs */
+       range_basek = ALIGN(state->range_startk, gran_sizek);
+       if ((range_basek > basek) && basek)
+               return second_sizek;
+       state->range_sizek -= (range_basek - state->range_startk);
+       range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+       while (range_sizek > state->range_sizek) {
+               range_sizek -= gran_sizek;
+               if (!range_sizek)
+                       return 0;
+       }
+       state->range_sizek = range_sizek;
+
+       /* try to round up to chunk size, possibly appending a small hole */
+       range0_basek = state->range_startk;
+       range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+       /* no increase */
+       if (range0_sizek == state->range_sizek) {
+               if (debug_print)
+                       printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+                               range0_basek<<10,
+                               (range0_basek + state->range_sizek)<<10);
+               state->reg = range_to_mtrr(state->reg, range0_basek,
+                               state->range_sizek, MTRR_TYPE_WRBACK);
+               return 0;
+       }
+
+       /* only cut back when it is not the last range */
+       if (sizek) {
+               while (range0_basek + range0_sizek > (basek + sizek)) {
+                       if (range0_sizek >= chunk_sizek)
+                               range0_sizek -= chunk_sizek;
+                       else
+                               range0_sizek = 0;
+
+                       if (!range0_sizek)
+                               break;
+               }
+       }
+
+second_try:
+       range_basek = range0_basek + range0_sizek;
+
+       /* one hole in the middle */
+       if (range_basek > basek && range_basek <= (basek + sizek))
+               second_sizek = range_basek - basek;
+
+       if (range0_sizek > state->range_sizek) {
+
+               /* one hole in middle or at end */
+               hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+               /* hole size should be less than half of range0 size */
+               if (hole_sizek >= (range0_sizek >> 1) &&
+                   range0_sizek >= chunk_sizek) {
+                       range0_sizek -= chunk_sizek;
+                       second_sizek = 0;
+                       hole_sizek = 0;
+
+                       goto second_try;
+               }
+       }
+
+       if (range0_sizek) {
+               if (debug_print)
+                       printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+                               range0_basek<<10,
+                               (range0_basek + range0_sizek)<<10);
+               state->reg = range_to_mtrr(state->reg, range0_basek,
+                               range0_sizek, MTRR_TYPE_WRBACK);
+       }
+
+       if (range0_sizek < state->range_sizek) {
+               /* need to handle left over */
+               range_sizek = state->range_sizek - range0_sizek;
+
+               if (debug_print)
+                       printk(KERN_DEBUG "range: %016lx - %016lx\n",
+                                range_basek<<10,
+                                (range_basek + range_sizek)<<10);
+               state->reg = range_to_mtrr(state->reg, range_basek,
+                                range_sizek, MTRR_TYPE_WRBACK);
+       }
+
+       if (hole_sizek) {
+               hole_basek = range_basek - hole_sizek - second_sizek;
+               if (debug_print)
+                       printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+                                hole_basek<<10,
+                                (hole_basek + hole_sizek)<<10);
+               state->reg = range_to_mtrr(state->reg, hole_basek,
+                                hole_sizek, MTRR_TYPE_UNCACHABLE);
+       }
+
+       return second_sizek;
+}
+
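+/*
+ * Accumulate contiguous WB ranges in *state; when a discontiguous range
+ * arrives, flush the accumulated range to MTRRs via range_to_mtrr_with_hole().
+ */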
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+                  unsigned long size_pfn)
+{
+       unsigned long basek, sizek;
+       unsigned long second_sizek = 0;
+
+       if (state->reg >= num_var_ranges)
+               return;
+
+       basek = base_pfn << (PAGE_SHIFT - 10);
+       sizek = size_pfn << (PAGE_SHIFT - 10);
+
+       /* See if I can merge with the last range */
+       if ((basek <= 1024) ||
+           (state->range_startk + state->range_sizek == basek)) {
+               unsigned long endk = basek + sizek;
+               state->range_sizek = endk - state->range_startk;
+               return;
+       }
+       /* Write the range mtrrs */
+       if (state->range_sizek != 0)
+               second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+       /* Allocate an msr */
+       state->range_startk = basek + second_sizek;
+       state->range_sizek  = sizek - second_sizek;
+}
+
+/* minimum size of an MTRR block that can take a hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+       if (!p)
+               return -EINVAL;
+       mtrr_chunk_size = memparse(p, &p);
+       return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granularity of an MTRR block */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+       if (!p)
+               return -EINVAL;
+       mtrr_gran_size = memparse(p, &p);
+       return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =
+                                CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+       if (arg)
+               nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+       return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
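+/*
+ * Convert the sorted memory range list into variable MTRR settings in
+ * range_state, using the given chunk/gran sizes; returns the number of
+ * MTRR registers used.
+ */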
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+                   u64 chunk_size, u64 gran_size)
+{
+       struct var_mtrr_state var_state;
+       int i;
+       int num_reg;
+
+       var_state.range_startk  = 0;
+       var_state.range_sizek   = 0;
+       var_state.reg           = 0;
+       var_state.chunk_sizek   = chunk_size >> 10;
+       var_state.gran_sizek    = gran_size >> 10;
+
+       memset(range_state, 0, sizeof(range_state));
+
+       /* Write the range etc */
+       for (i = 0; i < nr_range; i++)
+               set_var_mtrr_range(&var_state, range[i].start,
+                                  range[i].end - range[i].start + 1);
+
+       /* Write the last range */
+       if (var_state.range_sizek != 0)
+               range_to_mtrr_with_hole(&var_state, 0, 0);
+
+       num_reg = var_state.reg;
+       /* Clear out the extra MTRRs */
+       while (var_state.reg < num_var_ranges) {
+               save_var_mtrr(var_state.reg, 0, 0, 0);
+               var_state.reg++;
+       }
+
+       return num_reg;
+}
+
+struct mtrr_cleanup_result {
+       unsigned long gran_sizek;
+       unsigned long chunk_sizek;
+       unsigned long lose_cover_sizek;
+       unsigned int num_reg;
+       int bad;
+};
+
+/*
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need at most 16 + 15 + ... + 1 = (1+16)*16/2 = 136 results
+ */
+#define NUM_RESULT     136
+#define PSHIFT         (PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static void __init print_out_mtrr_range_state(void)
+{
+       int i;
+       char start_factor = 'K', size_factor = 'K';
+       unsigned long start_base, size_base;
+       mtrr_type type;
+
+       for (i = 0; i < num_var_ranges; i++) {
+
+               size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+               if (!size_base)
+                       continue;
+
+               size_base = to_size_factor(size_base, &size_factor),
+               start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+               start_base = to_size_factor(start_base, &start_factor),
+               type = range_state[i].type;
+
+               printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+                       i, start_base, start_factor,
+                       size_base, size_factor,
+                       (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+                           ((type == MTRR_TYPE_WRPROT) ? "WP" :
+                            ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+                       );
+       }
+}
+
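+/*
+ * Cleanup is only worthwhile when there is at least one UC entry and all
+ * non-empty variable MTRRs are of WB or UC type (WP is treated as UC).
+ */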
+static int __init mtrr_need_cleanup(void)
+{
+       int i;
+       mtrr_type type;
+       unsigned long size;
+       /* one extra slot for the unused (size 0) entries */
+       int num[MTRR_NUM_TYPES + 1];
+
+       /* count the entries of each type */
+       memset(num, 0, sizeof(num));
+       for (i = 0; i < num_var_ranges; i++) {
+               type = range_state[i].type;
+               size = range_state[i].size_pfn;
+               if (type >= MTRR_NUM_TYPES)
+                       continue;
+               if (!size)
+                       type = MTRR_NUM_TYPES;
+               if (type == MTRR_TYPE_WRPROT)
+                       type = MTRR_TYPE_UNCACHABLE;
+               num[type]++;
+       }
+
+       /* check if we got UC entries */
+       if (!num[MTRR_TYPE_UNCACHABLE])
+               return 0;
+
+       /* check if we only had WB and UC */
+       if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+               num_var_ranges - num[MTRR_NUM_TYPES])
+               return 0;
+
+       return 1;
+}
+
+static unsigned long __initdata range_sums;
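+/*
+ * Try one chunk_size/gran_size combination: rebuild the variable MTRR
+ * layout, recompute the covered RAM and record the lost coverage (if any)
+ * in result[i].
+ */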
+static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+                                        unsigned long extra_remove_base,
+                                        unsigned long extra_remove_size,
+                                        int i)
+{
+       int num_reg;
+       static struct res_range range_new[RANGE_NUM];
+       static int nr_range_new;
+       unsigned long range_sums_new;
+
+       /* convert ranges to var ranges state */
+       num_reg = x86_setup_var_mtrrs(range, nr_range,
+                                               chunk_size, gran_size);
+
+       /* we now have a new setting in range_state; check it */
+       memset(range_new, 0, sizeof(range_new));
+       nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+                               extra_remove_base, extra_remove_size);
+       range_sums_new = sum_ranges(range_new, nr_range_new);
+
+       result[i].chunk_sizek = chunk_size >> 10;
+       result[i].gran_sizek = gran_size >> 10;
+       result[i].num_reg = num_reg;
+       if (range_sums < range_sums_new) {
+               result[i].lose_cover_sizek =
+                       (range_sums_new - range_sums) << PSHIFT;
+               result[i].bad = 1;
+       } else
+               result[i].lose_cover_sizek =
+                       (range_sums - range_sums_new) << PSHIFT;
+
+       /* double check it */
+       if (!result[i].bad && !result[i].lose_cover_sizek) {
+               if (nr_range_new != nr_range ||
+                       memcmp(range, range_new, sizeof(range)))
+                               result[i].bad = 1;
+       }
+
+       if (!result[i].bad && (range_sums - range_sums_new <
+                               min_loss_pfn[num_reg])) {
+               min_loss_pfn[num_reg] =
+                       range_sums - range_sums_new;
+       }
+}
+
+static void __init mtrr_print_out_one_result(int i)
+{
+       char gran_factor, chunk_factor, lose_factor;
+       unsigned long gran_base, chunk_base, lose_base;
+
+       gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+       chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+       lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+       printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+                       result[i].bad ? "*BAD*" : " ",
+                       gran_base, gran_factor, chunk_base, chunk_factor);
+       printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ld%c\n",
+                       result[i].num_reg, result[i].bad ? "-" : "",
+                       lose_base, lose_factor);
+}
+
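+/*
+ * Pick the smallest register count that loses no coverage within the
+ * spare-register budget, and return the index of a matching result.
+ */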
+static int __init mtrr_search_optimal_index(void)
+{
+       int i;
+       int num_reg_good;
+       int index_good;
+
+       if (nr_mtrr_spare_reg >= num_var_ranges)
+               nr_mtrr_spare_reg = num_var_ranges - 1;
+       num_reg_good = -1;
+       for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+               if (!min_loss_pfn[i])
+                       num_reg_good = i;
+       }
+
+       index_good = -1;
+       if (num_reg_good != -1) {
+               for (i = 0; i < NUM_RESULT; i++) {
+                       if (!result[i].bad &&
+                           result[i].num_reg == num_reg_good &&
+                           !result[i].lose_cover_sizek) {
+                               index_good = i;
+                               break;
+                       }
+               }
+       }
+
+       return index_good;
+}
+
+
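+/*
+ * Main entry point: read the current variable MTRRs, decide whether a
+ * cleanup is needed, try the user-supplied chunk/gran sizes first and
+ * otherwise search all combinations for an optimal layout to program.
+ */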
+int __init mtrr_cleanup(unsigned address_bits)
+{
+       unsigned long extra_remove_base, extra_remove_size;
+       unsigned long base, size, def, dummy;
+       mtrr_type type;
+       u64 chunk_size, gran_size;
+       int index_good;
+       int i;
+
+       if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+               return 0;
+       rdmsr(MTRRdefType_MSR, def, dummy);
+       def &= 0xff;
+       if (def != MTRR_TYPE_UNCACHABLE)
+               return 0;
+
+       /* get it and store it aside */
+       memset(range_state, 0, sizeof(range_state));
+       for (i = 0; i < num_var_ranges; i++) {
+               mtrr_if->get(i, &base, &size, &type);
+               range_state[i].base_pfn = base;
+               range_state[i].size_pfn = size;
+               range_state[i].type = type;
+       }
+
+       /* check whether we need to handle it and can handle it */
+       if (!mtrr_need_cleanup())
+               return 0;
+
+       /* print the original variable MTRRs first, for debugging: */
+       printk(KERN_DEBUG "original variable MTRRs\n");
+       print_out_mtrr_range_state();
+
+       memset(range, 0, sizeof(range));
+       extra_remove_size = 0;
+       extra_remove_base = 1 << (32 - PAGE_SHIFT);
+       if (mtrr_tom2)
+               extra_remove_size =
+                       (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+       nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+                                         extra_remove_size);
+       /*
+        * [0, 1M) should always be covered by a WB variable MTRR, and the
+        * fixed MTRRs take effect before the variable MTRRs for that region
+        */
+       nr_range = add_range_with_merge(range, nr_range, 0,
+                                       (1ULL<<(20 - PAGE_SHIFT)) - 1);
+       /* sort the ranges */
+       sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
+       range_sums = sum_ranges(range, nr_range);
+       printk(KERN_INFO "total RAM covered: %ldM\n",
+              range_sums >> (20 - PAGE_SHIFT));
+
+       if (mtrr_chunk_size && mtrr_gran_size) {
+               i = 0;
+               mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+                                     extra_remove_base, extra_remove_size, i);
+
+               mtrr_print_out_one_result(i);
+
+               if (!result[i].bad) {
+                       set_var_mtrr_all(address_bits);
+                       printk(KERN_DEBUG "New variable MTRRs\n");
+                       print_out_mtrr_range_state();
+                       return 1;
+               }
+               printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+                      "will search for an optimal one\n");
+       }
+
+       i = 0;
+       memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+       memset(result, 0, sizeof(result));
+       for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+
+               for (chunk_size = gran_size; chunk_size < (1ULL<<32);
+                    chunk_size <<= 1) {
+
+                       if (i >= NUM_RESULT)
+                               continue;
+
+                       mtrr_calc_range_state(chunk_size, gran_size,
+                                     extra_remove_base, extra_remove_size, i);
+                       if (debug_print) {
+                               mtrr_print_out_one_result(i);
+                               printk(KERN_INFO "\n");
+                       }
+
+                       i++;
+               }
+       }
+
+       /* try to find the optimal index */
+       index_good = mtrr_search_optimal_index();
+
+       if (index_good != -1) {
+               printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+               i = index_good;
+               mtrr_print_out_one_result(i);
+
+               /* convert ranges to var ranges state */
+               chunk_size = result[i].chunk_sizek;
+               chunk_size <<= 10;
+               gran_size = result[i].gran_sizek;
+               gran_size <<= 10;
+               x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+               set_var_mtrr_all(address_bits);
+               printk(KERN_DEBUG "New variable MTRRs\n");
+               print_out_mtrr_range_state();
+               return 1;
+       } else {
+               /* print out all */
+               for (i = 0; i < NUM_RESULT; i++)
+                       mtrr_print_out_one_result(i);
+       }
+
+       printk(KERN_INFO "mtrr_cleanup: cannot find an optimal value\n");
+       printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+       return 0;
+}
+#else
+int __init mtrr_cleanup(unsigned address_bits)
+{
+       return 0;
+}
+#endif
+
+static int disable_mtrr_trim;
+
+static int __init disable_mtrr_trim_setup(char *str)
+{
+       disable_mtrr_trim = 1;
+       return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check whether the MTRRs below 4GB (where the magic bit
+ * doesn't apply) are wrong, but so far we don't know of any such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+int __init amd_special_default_mtrr(void)
+{
+       u32 l, h;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+               return 0;
+       if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+               return 0;
+       /* In case some hypervisor doesn't pass SYSCFG through */
+       if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+               return 0;
+       /*
+        * Memory between 4GB and top of mem is forced WB by this magic bit.
+        * Reserved before K8RevF, but should be zero there.
+        */
+       if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+                (Tom2Enabled | Tom2ForceMemTypeWB))
+               return 1;
+       return 0;
+}
+
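+/*
+ * Convert the [start_pfn, limit_pfn) span from RAM to reserved in the
+ * e820 map; returns the number of bytes actually changed.
+ */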
+static u64 __init real_trim_memory(unsigned long start_pfn,
+                                  unsigned long limit_pfn)
+{
+       u64 trim_start, trim_size;
+       trim_start = start_pfn;
+       trim_start <<= PAGE_SHIFT;
+       trim_size = limit_pfn;
+       trim_size <<= PAGE_SHIFT;
+       trim_size -= trim_start;
+
+       return e820_update_range(trim_start, trim_size, E820_RAM,
+                               E820_RESERVED);
+}
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
+ *
+ * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
+ * memory configurations.  This routine checks that the highest MTRR matches
+ * the end of memory, to make sure the MTRRs having a write back type cover
+ * all of the memory the kernel is intending to use. If not, it'll trim any
+ * memory off the end by adjusting end_pfn, removing it from the kernel's
+ * allocation pools, and warning the user with an obnoxious message.
+ */
+int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+       unsigned long i, base, size, highest_pfn = 0, def, dummy;
+       mtrr_type type;
+       u64 total_trim_size;
+
+       /* one extra slot for the unused (size 0) entries */
+       int num[MTRR_NUM_TYPES + 1];
+       /*
+        * Make sure we only trim uncachable memory on machines that
+        * support the Intel MTRR architecture:
+        */
+       if (!is_cpu(INTEL) || disable_mtrr_trim)
+               return 0;
+       rdmsr(MTRRdefType_MSR, def, dummy);
+       def &= 0xff;
+       if (def != MTRR_TYPE_UNCACHABLE)
+               return 0;
+
+       /* get it and store it aside */
+       memset(range_state, 0, sizeof(range_state));
+       for (i = 0; i < num_var_ranges; i++) {
+               mtrr_if->get(i, &base, &size, &type);
+               range_state[i].base_pfn = base;
+               range_state[i].size_pfn = size;
+               range_state[i].type = type;
+       }
+
+       /* Find highest cached pfn */
+       for (i = 0; i < num_var_ranges; i++) {
+               type = range_state[i].type;
+               if (type != MTRR_TYPE_WRBACK)
+                       continue;
+               base = range_state[i].base_pfn;
+               size = range_state[i].size_pfn;
+               if (highest_pfn < base + size)
+                       highest_pfn = base + size;
+       }
+
+       /* kvm/qemu doesn't set the MTRRs up right; don't trim all memory away */
+       if (!highest_pfn) {
+               printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+               return 0;
+       }
+
+       /* count the entries of each type */
+       memset(num, 0, sizeof(num));
+       for (i = 0; i < num_var_ranges; i++) {
+               type = range_state[i].type;
+               if (type >= MTRR_NUM_TYPES)
+                       continue;
+               size = range_state[i].size_pfn;
+               if (!size)
+                       type = MTRR_NUM_TYPES;
+               num[type]++;
+       }
+
+       /* no entry for WB? */
+       if (!num[MTRR_TYPE_WRBACK])
+               return 0;
+
+       /* check if we only had WB and UC */
+       if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+               num_var_ranges - num[MTRR_NUM_TYPES])
+               return 0;
+
+       memset(range, 0, sizeof(range));
+       nr_range = 0;
+       if (mtrr_tom2) {
+               range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+               range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+               if (highest_pfn < range[nr_range].end + 1)
+                       highest_pfn = range[nr_range].end + 1;
+               nr_range++;
+       }
+       nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+       total_trim_size = 0;
+       /* check the head */
+       if (range[0].start)
+               total_trim_size += real_trim_memory(0, range[0].start);
+       /* check the holes */
+       for (i = 0; i < nr_range - 1; i++) {
+               if (range[i].end + 1 < range[i+1].start)
+                       total_trim_size += real_trim_memory(range[i].end + 1,
+                                                           range[i+1].start);
+       }
+       /* check the top */
+       i = nr_range - 1;
+       if (range[i].end + 1 < end_pfn)
+               total_trim_size += real_trim_memory(range[i].end + 1,
+                                                        end_pfn);
+
+       if (total_trim_size) {
+               printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
+                       " all of memory, losing %lluMB of RAM.\n",
+                       total_trim_size >> 20);
+
+               if (!changed_by_mtrr_cleanup)
+                       WARN_ON(1);
+
+               printk(KERN_INFO "update e820 for mtrr\n");
+               update_e820();
+
+               return 1;
+       }
+
+       return 0;
+}
+
index 0c0a455fe95c0b9468c6f59e350603ba30f32e68..37f28fc7cf959674151839b73a600a01e6a3e27f 100644 (file)
@@ -33,13 +33,31 @@ u64 mtrr_tom2;
 struct mtrr_state_type mtrr_state = {};
 EXPORT_SYMBOL_GPL(mtrr_state);
 
-static int __initdata mtrr_show;
-static int __init mtrr_debug(char *opt)
+/**
+ * BIOS is expected to clear MtrrFixDramModEn bit, see for example
+ * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
+ * Opteron Processors" (26094 Rev. 3.30 February 2006), section
+ * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
+ * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
+ * 0 for operation."
+ */
+static inline void k8_check_syscfg_dram_mod_en(void)
 {
-       mtrr_show = 1;
-       return 0;
+       u32 lo, hi;
+
+       if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+             (boot_cpu_data.x86 >= 0x0f)))
+               return;
+
+       rdmsr(MSR_K8_SYSCFG, lo, hi);
+       if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
+               printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+                      " not cleared by BIOS, clearing this bit\n",
+                      smp_processor_id());
+               lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
+               mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+       }
 }
-early_param("mtrr.show", mtrr_debug);
 
 /*
  * Returns the effective MTRR type for the region
@@ -174,6 +192,8 @@ get_fixed_ranges(mtrr_type * frs)
        unsigned int *p = (unsigned int *) frs;
        int i;
 
+       k8_check_syscfg_dram_mod_en();
+
        rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
 
        for (i = 0; i < 2; i++)
@@ -188,18 +208,94 @@ void mtrr_save_fixed_ranges(void *info)
                get_fixed_ranges(mtrr_state.fixed_ranges);
 }
 
-static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
+static unsigned __initdata last_fixed_start;
+static unsigned __initdata last_fixed_end;
+static mtrr_type __initdata last_fixed_type;
+
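+/* Print the fixed-range segment merged so far, then reset it. */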
+static void __init print_fixed_last(void)
+{
+       if (!last_fixed_end)
+               return;
+
+       printk(KERN_DEBUG "  %05X-%05X %s\n", last_fixed_start,
+               last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+
+       last_fixed_end = 0;
+}
+
+static void __init update_fixed_last(unsigned base, unsigned end,
+                                      mtrr_type type)
+{
+       last_fixed_start = base;
+       last_fixed_end = end;
+       last_fixed_type = type;
+}
+
+static void __init print_fixed(unsigned base, unsigned step,
+                              const mtrr_type *types)
 {
        unsigned i;
 
-       for (i = 0; i < 8; ++i, ++types, base += step)
-               printk(KERN_INFO "MTRR %05X-%05X %s\n",
-                       base, base + step - 1, mtrr_attrib_to_str(*types));
+       for (i = 0; i < 8; ++i, ++types, base += step) {
+               if (last_fixed_end == 0) {
+                       update_fixed_last(base, base + step, *types);
+                       continue;
+               }
+               if (last_fixed_end == base && last_fixed_type == *types) {
+                       last_fixed_end = base + step;
+                       continue;
+               }
+               /* new segments: gap or different type */
+               print_fixed_last();
+               update_fixed_last(base, base + step, *types);
+       }
 }
 
 static void prepare_set(void);
 static void post_set(void);
 
+static void __init print_mtrr_state(void)
+{
+       unsigned int i;
+       int high_width;
+
+       printk(KERN_DEBUG "MTRR default type: %s\n",
+                        mtrr_attrib_to_str(mtrr_state.def_type));
+       if (mtrr_state.have_fixed) {
+               printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n",
+                      mtrr_state.enabled & 1 ? "en" : "dis");
+               print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+               for (i = 0; i < 2; ++i)
+                       print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
+               for (i = 0; i < 8; ++i)
+                       print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
+
+               /* tail */
+               print_fixed_last();
+       }
+       printk(KERN_DEBUG "MTRR variable ranges %sabled:\n",
+              mtrr_state.enabled & 2 ? "en" : "dis");
+       high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
+       for (i = 0; i < num_var_ranges; ++i) {
+               if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
+                       printk(KERN_DEBUG "  %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+                              i,
+                              high_width,
+                              mtrr_state.var_ranges[i].base_hi,
+                              mtrr_state.var_ranges[i].base_lo >> 12,
+                              high_width,
+                              mtrr_state.var_ranges[i].mask_hi,
+                              mtrr_state.var_ranges[i].mask_lo >> 12,
+                              mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+               else
+                       printk(KERN_DEBUG "  %u disabled\n", i);
+       }
+       if (mtrr_tom2) {
+               printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n",
+                                 mtrr_tom2, mtrr_tom2>>20);
+       }
+}
+
 /*  Grab all of the MTRR state for this CPU into *state  */
 void __init get_mtrr_state(void)
 {
@@ -231,41 +327,9 @@ void __init get_mtrr_state(void)
                mtrr_tom2 |= low;
                mtrr_tom2 &= 0xffffff800000ULL;
        }
-       if (mtrr_show) {
-               int high_width;
-
-               printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
-               if (mtrr_state.have_fixed) {
-                       printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
-                              mtrr_state.enabled & 1 ? "en" : "dis");
-                       print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
-                       for (i = 0; i < 2; ++i)
-                               print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
-                       for (i = 0; i < 8; ++i)
-                               print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
-               }
-               printk(KERN_INFO "MTRR variable ranges %sabled:\n",
-                      mtrr_state.enabled & 2 ? "en" : "dis");
-               high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
-               for (i = 0; i < num_var_ranges; ++i) {
-                       if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
-                               printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
-                                      i,
-                                      high_width,
-                                      mtrr_state.var_ranges[i].base_hi,
-                                      mtrr_state.var_ranges[i].base_lo >> 12,
-                                      high_width,
-                                      mtrr_state.var_ranges[i].mask_hi,
-                                      mtrr_state.var_ranges[i].mask_lo >> 12,
-                                      mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
-                       else
-                               printk(KERN_INFO "MTRR %u disabled\n", i);
-               }
-               if (mtrr_tom2) {
-                       printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
-                                         mtrr_tom2, mtrr_tom2>>20);
-               }
-       }
+
+       print_mtrr_state();
+
        mtrr_state_set = 1;
 
        /* PAT setup for BP. We need to go through sync steps here */
@@ -307,28 +371,11 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
                        smp_processor_id(), msr, a, b);
 }
 
-/**
- * Enable and allow read/write of extended fixed-range MTRR bits on K8 CPUs
- * see AMD publication no. 24593, chapter 3.2.1 for more information
- */
-static inline void k8_enable_fixed_iorrs(void)
-{
-       unsigned lo, hi;
-
-       rdmsr(MSR_K8_SYSCFG, lo, hi);
-       mtrr_wrmsr(MSR_K8_SYSCFG, lo
-                               | K8_MTRRFIXRANGE_DRAM_ENABLE
-                               | K8_MTRRFIXRANGE_DRAM_MODIFY, hi);
-}
-
 /**
  * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have
  * @msr: MSR address of the MTTR which should be checked and updated
  * @changed: pointer which indicates whether the MTRR needed to be changed
  * @msrwords: pointer to the MSR values which the MSR should have
- *
- * If K8 extentions are wanted, update the K8 SYSCFG MSR also.
- * See AMD publication no. 24593, chapter 7.8.1, page 233 for more information.
  */
 static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 {
@@ -337,10 +384,6 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
        rdmsr(msr, lo, hi);
 
        if (lo != msrwords[0] || hi != msrwords[1]) {
-               if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-                   (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
-                   ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
-                       k8_enable_fixed_iorrs();
                mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
                *changed = true;
        }
@@ -376,22 +419,31 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 {
        unsigned int mask_lo, mask_hi, base_lo, base_hi;
        unsigned int tmp, hi;
+       int cpu;
+
+       /*
+        * get_mtrr doesn't need to update mtrr_state; it can also be called
+        * from any CPU, so print the result out directly here.
+        */
+       cpu = get_cpu();
 
        rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
+
        if ((mask_lo & 0x800) == 0) {
                /*  Invalid (i.e. free) range  */
                *base = 0;
                *size = 0;
                *type = 0;
-               return;
+               goto out_put_cpu;
        }
 
        rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
 
-       /* Work out the shifted address mask. */
+       /* Work out the shifted address mask: */
        tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
        mask_lo = size_or_mask | tmp;
-       /* Expand tmp with high bits to all 1s*/
+
+       /* Expand tmp with high bits to all 1s: */
        hi = fls(tmp);
        if (hi > 0) {
                tmp |= ~((1<<(hi - 1)) - 1);
@@ -402,11 +454,19 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
                }
        }
 
-       /* This works correctly if size is a power of two, i.e. a
-          contiguous range. */
+       /*
+        * This works correctly if size is a power of two, i.e. a
+        * contiguous range:
+        */
        *size = -mask_lo;
        *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
        *type = base_lo & 0xff;
+
+       printk(KERN_DEBUG "  get_mtrr: cpu%d reg%02d base=%010lx size=%010lx %s\n",
+                       cpu, reg, *base, *size,
+                       mtrr_attrib_to_str(*type & 0xff));
+out_put_cpu:
+       put_cpu();
 }
 
 /**
@@ -419,6 +479,8 @@ static int set_fixed_ranges(mtrr_type * frs)
        bool changed = false;
        int block=-1, range;
 
+       k8_check_syscfg_dram_mod_en();
+
        while (fixed_range_blocks[++block].ranges)
            for (range=0; range < fixed_range_blocks[block].ranges; range++)
                set_fixed_range(fixed_range_blocks[block].base_msr + range,
index 236a401b82597b0d48217db77ea5c10e692e9b2e..03cda01f57c7125173a8b348d92105d5bcd5dd28 100644 (file)
@@ -574,7 +574,7 @@ struct mtrr_value {
        unsigned long   lsize;
 };
 
-static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
+static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
 
 static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 {
@@ -582,9 +582,9 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 
        for (i = 0; i < num_var_ranges; i++) {
                mtrr_if->get(i,
-                            &mtrr_state[i].lbase,
-                            &mtrr_state[i].lsize,
-                            &mtrr_state[i].ltype);
+                            &mtrr_value[i].lbase,
+                            &mtrr_value[i].lsize,
+                            &mtrr_value[i].ltype);
        }
        return 0;
 }
@@ -594,11 +594,11 @@ static int mtrr_restore(struct sys_device * sysdev)
        int i;
 
        for (i = 0; i < num_var_ranges; i++) {
-               if (mtrr_state[i].lsize) 
+               if (mtrr_value[i].lsize)
                        set_mtrr(i,
-                                mtrr_state[i].lbase,
-                                mtrr_state[i].lsize,
-                                mtrr_state[i].ltype);
+                                mtrr_value[i].lbase,
+                                mtrr_value[i].lsize,
+                                mtrr_value[i].ltype);
        }
        return 0;
 }
@@ -610,1058 +610,7 @@ static struct sysdev_driver mtrr_sysdev_driver = {
        .resume         = mtrr_restore,
 };
 
-/* should be related to MTRR_VAR_RANGES nums */
-#define RANGE_NUM 256
-
-struct res_range {
-       unsigned long start;
-       unsigned long end;
-};
-
-static int __init
-add_range(struct res_range *range, int nr_range, unsigned long start,
-                             unsigned long end)
-{
-       /* out of slots */
-       if (nr_range >= RANGE_NUM)
-               return nr_range;
-
-       range[nr_range].start = start;
-       range[nr_range].end = end;
-
-       nr_range++;
-
-       return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
-                             unsigned long end)
-{
-       int i;
-
-       /* try to merge it with old one */
-       for (i = 0; i < nr_range; i++) {
-               unsigned long final_start, final_end;
-               unsigned long common_start, common_end;
-
-               if (!range[i].end)
-                       continue;
-
-               common_start = max(range[i].start, start);
-               common_end = min(range[i].end, end);
-               if (common_start > common_end + 1)
-                       continue;
-
-               final_start = min(range[i].start, start);
-               final_end = max(range[i].end, end);
-
-               range[i].start = final_start;
-               range[i].end =  final_end;
-               return nr_range;
-       }
-
-       /* need to add that */
-       return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
-       int i, j;
-
-       for (j = 0; j < RANGE_NUM; j++) {
-               if (!range[j].end)
-                       continue;
-
-               if (start <= range[j].start && end >= range[j].end) {
-                       range[j].start = 0;
-                       range[j].end = 0;
-                       continue;
-               }
-
-               if (start <= range[j].start && end < range[j].end &&
-                   range[j].start < end + 1) {
-                       range[j].start = end + 1;
-                       continue;
-               }
-
-
-               if (start > range[j].start && end >= range[j].end &&
-                   range[j].end > start - 1) {
-                       range[j].end = start - 1;
-                       continue;
-               }
-
-               if (start > range[j].start && end < range[j].end) {
-                       /* find the new spare */
-                       for (i = 0; i < RANGE_NUM; i++) {
-                               if (range[i].end == 0)
-                                       break;
-                       }
-                       if (i < RANGE_NUM) {
-                               range[i].end = range[j].end;
-                               range[i].start = end + 1;
-                       } else {
-                               printk(KERN_ERR "run of slot in ranges\n");
-                       }
-                       range[j].end = start - 1;
-                       continue;
-               }
-       }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
-       const struct res_range *r1 = x1;
-       const struct res_range *r2 = x2;
-       long start1, start2;
-
-       start1 = r1->start;
-       start2 = r2->start;
-
-       return start1 - start2;
-}
-
-struct var_mtrr_range_state {
-       unsigned long base_pfn;
-       unsigned long size_pfn;
-       mtrr_type type;
-};
-
-static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
-
-static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
-                      unsigned long extra_remove_base,
-                      unsigned long extra_remove_size)
-{
-       unsigned long i, base, size;
-       mtrr_type type;
-
-       for (i = 0; i < num_var_ranges; i++) {
-               type = range_state[i].type;
-               if (type != MTRR_TYPE_WRBACK)
-                       continue;
-               base = range_state[i].base_pfn;
-               size = range_state[i].size_pfn;
-               nr_range = add_range_with_merge(range, nr_range, base,
-                                               base + size - 1);
-       }
-       if (debug_print) {
-               printk(KERN_DEBUG "After WB checking\n");
-               for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-                                range[i].start, range[i].end + 1);
-       }
-
-       /* take out UC ranges */
-       for (i = 0; i < num_var_ranges; i++) {
-               type = range_state[i].type;
-               if (type != MTRR_TYPE_UNCACHABLE &&
-                   type != MTRR_TYPE_WRPROT)
-                       continue;
-               size = range_state[i].size_pfn;
-               if (!size)
-                       continue;
-               base = range_state[i].base_pfn;
-               subtract_range(range, base, base + size - 1);
-       }
-       if (extra_remove_size)
-               subtract_range(range, extra_remove_base,
-                                extra_remove_base + extra_remove_size  - 1);
-
-       /* get new range num */
-       nr_range = 0;
-       for (i = 0; i < RANGE_NUM; i++) {
-               if (!range[i].end)
-                       continue;
-               nr_range++;
-       }
-       if  (debug_print) {
-               printk(KERN_DEBUG "After UC checking\n");
-               for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-                                range[i].start, range[i].end + 1);
-       }
-
-       /* sort the ranges */
-       sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-       if  (debug_print) {
-               printk(KERN_DEBUG "After sorting\n");
-               for (i = 0; i < nr_range; i++)
-                       printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
-                                range[i].start, range[i].end + 1);
-       }
-
-       /* clear those is not used */
-       for (i = nr_range; i < RANGE_NUM; i++)
-               memset(&range[i], 0, sizeof(range[i]));
-
-       return nr_range;
-}
-
-static struct res_range __initdata range[RANGE_NUM];
-static int __initdata nr_range;
-
-#ifdef CONFIG_MTRR_SANITIZER
-
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
-{
-       unsigned long sum;
-       int i;
-
-       sum = 0;
-       for (i = 0; i < nr_range; i++)
-               sum += range[i].end + 1 - range[i].start;
-
-       return sum;
-}
-
-static int enable_mtrr_cleanup __initdata =
-       CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
-
-static int __init disable_mtrr_cleanup_setup(char *str)
-{
-       enable_mtrr_cleanup = 0;
-       return 0;
-}
-early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
-
-static int __init enable_mtrr_cleanup_setup(char *str)
-{
-       enable_mtrr_cleanup = 1;
-       return 0;
-}
-early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
-       debug_print = 1;
-       return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
-struct var_mtrr_state {
-       unsigned long   range_startk;
-       unsigned long   range_sizek;
-       unsigned long   chunk_sizek;
-       unsigned long   gran_sizek;
-       unsigned int    reg;
-};
-
-static void __init
-set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
-               unsigned char type, unsigned int address_bits)
-{
-       u32 base_lo, base_hi, mask_lo, mask_hi;
-       u64 base, mask;
-
-       if (!sizek) {
-               fill_mtrr_var_range(reg, 0, 0, 0, 0);
-               return;
-       }
-
-       mask = (1ULL << address_bits) - 1;
-       mask &= ~((((u64)sizek) << 10) - 1);
-
-       base  = ((u64)basek) << 10;
-
-       base |= type;
-       mask |= 0x800;
-
-       base_lo = base & ((1ULL<<32) - 1);
-       base_hi = base >> 32;
-
-       mask_lo = mask & ((1ULL<<32) - 1);
-       mask_hi = mask >> 32;
-
-       fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
-}
-
-static void __init
-save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
-               unsigned char type)
-{
-       range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
-       range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
-       range_state[reg].type = type;
-}
-
-static void __init
-set_var_mtrr_all(unsigned int address_bits)
-{
-       unsigned long basek, sizek;
-       unsigned char type;
-       unsigned int reg;
-
-       for (reg = 0; reg < num_var_ranges; reg++) {
-               basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
-               sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
-               type = range_state[reg].type;
-
-               set_var_mtrr(reg, basek, sizek, type, address_bits);
-       }
-}
-
-static unsigned long to_size_factor(unsigned long sizek, char *factorp)
-{
-       char factor;
-       unsigned long base = sizek;
-
-       if (base & ((1<<10) - 1)) {
-               /* not MB alignment */
-               factor = 'K';
-       } else if (base & ((1<<20) - 1)){
-               factor = 'M';
-               base >>= 10;
-       } else {
-               factor = 'G';
-               base >>= 20;
-       }
-
-       *factorp = factor;
-
-       return base;
-}
-
-static unsigned int __init
-range_to_mtrr(unsigned int reg, unsigned long range_startk,
-             unsigned long range_sizek, unsigned char type)
-{
-       if (!range_sizek || (reg >= num_var_ranges))
-               return reg;
-
-       while (range_sizek) {
-               unsigned long max_align, align;
-               unsigned long sizek;
-
-               /* Compute the maximum size I can make a range */
-               if (range_startk)
-                       max_align = ffs(range_startk) - 1;
-               else
-                       max_align = 32;
-               align = fls(range_sizek) - 1;
-               if (align > max_align)
-                       align = max_align;
-
-               sizek = 1 << align;
-               if (debug_print) {
-                       char start_factor = 'K', size_factor = 'K';
-                       unsigned long start_base, size_base;
-
-                       start_base = to_size_factor(range_startk, &start_factor);
-                       size_base = to_size_factor(sizek, &size_factor);
-
-                       printk(KERN_DEBUG "Setting variable MTRR %d, "
-                               "base: %ld%cB, range: %ld%cB, type %s\n",
-                               reg, start_base, start_factor,
-                               size_base, size_factor,
-                               (type == MTRR_TYPE_UNCACHABLE)?"UC":
-                                   ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
-                               );
-               }
-               save_var_mtrr(reg++, range_startk, sizek, type);
-               range_startk += sizek;
-               range_sizek -= sizek;
-               if (reg >= num_var_ranges)
-                       break;
-       }
-       return reg;
-}
-
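Editor's sketch of how range_to_mtrr() above carves a range into power-of-two blocks via ffs()/fls(); the 3.5GB range is a made-up example.

/*
 * range_startk = 0, range_sizek = 0x380000 KB (3.5GB of WB):
 *   pass 1: max_align = 32, align = fls(0x380000) - 1 = 21 -> 2GB at 0
 *   pass 2: max_align = ffs(0x200000) - 1 = 21, align = 20 -> 1GB at 2GB
 *   pass 3: remaining 0x80000 KB                           -> 512MB at 3GB
 * Result: three variable MTRRs (2GB + 1GB + 512MB).
 */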
-static unsigned __init
-range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
-                       unsigned long sizek)
-{
-       unsigned long hole_basek, hole_sizek;
-       unsigned long second_basek, second_sizek;
-       unsigned long range0_basek, range0_sizek;
-       unsigned long range_basek, range_sizek;
-       unsigned long chunk_sizek;
-       unsigned long gran_sizek;
-
-       hole_basek = 0;
-       hole_sizek = 0;
-       second_basek = 0;
-       second_sizek = 0;
-       chunk_sizek = state->chunk_sizek;
-       gran_sizek = state->gran_sizek;
-
-       /* align to the granule size, to prevent small blocks from using up MTRRs */
-       range_basek = ALIGN(state->range_startk, gran_sizek);
-       if ((range_basek > basek) && basek)
-               return second_sizek;
-       state->range_sizek -= (range_basek - state->range_startk);
-       range_sizek = ALIGN(state->range_sizek, gran_sizek);
-
-       while (range_sizek > state->range_sizek) {
-               range_sizek -= gran_sizek;
-               if (!range_sizek)
-                       return 0;
-       }
-       state->range_sizek = range_sizek;
-
-       /* try to append some small hole */
-       range0_basek = state->range_startk;
-       range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
-
-       /* no increase */
-       if (range0_sizek == state->range_sizek) {
-               if (debug_print)
-                       printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
-                               range0_basek<<10,
-                               (range0_basek + state->range_sizek)<<10);
-               state->reg = range_to_mtrr(state->reg, range0_basek,
-                               state->range_sizek, MTRR_TYPE_WRBACK);
-               return 0;
-       }
-
-       /* only cut back when this is not the last range */
-       if (sizek) {
-               while (range0_basek + range0_sizek > (basek + sizek)) {
-                       if (range0_sizek >= chunk_sizek)
-                               range0_sizek -= chunk_sizek;
-                       else
-                               range0_sizek = 0;
-
-                       if (!range0_sizek)
-                               break;
-               }
-       }
-
-second_try:
-       range_basek = range0_basek + range0_sizek;
-
-       /* one hole in the middle */
-       if (range_basek > basek && range_basek <= (basek + sizek))
-               second_sizek = range_basek - basek;
-
-       if (range0_sizek > state->range_sizek) {
-
-               /* one hole in middle or at end */
-               hole_sizek = range0_sizek - state->range_sizek - second_sizek;
-
-               /* hole size should be less than half of range0 size */
-               if (hole_sizek >= (range0_sizek >> 1) &&
-                   range0_sizek >= chunk_sizek) {
-                       range0_sizek -= chunk_sizek;
-                       second_sizek = 0;
-                       hole_sizek = 0;
-
-                       goto second_try;
-               }
-       }
-
-       if (range0_sizek) {
-               if (debug_print)
-                       printk(KERN_DEBUG "range0: %016lx - %016lx\n",
-                               range0_basek<<10,
-                               (range0_basek + range0_sizek)<<10);
-               state->reg = range_to_mtrr(state->reg, range0_basek,
-                               range0_sizek, MTRR_TYPE_WRBACK);
-       }
-
-       if (range0_sizek < state->range_sizek) {
-               /* need to handle left over */
-               range_sizek = state->range_sizek - range0_sizek;
-
-               if (debug_print)
-                       printk(KERN_DEBUG "range: %016lx - %016lx\n",
-                                range_basek<<10,
-                                (range_basek + range_sizek)<<10);
-               state->reg = range_to_mtrr(state->reg, range_basek,
-                                range_sizek, MTRR_TYPE_WRBACK);
-       }
-
-       if (hole_sizek) {
-               hole_basek = range_basek - hole_sizek - second_sizek;
-               if (debug_print)
-                       printk(KERN_DEBUG "hole: %016lx - %016lx\n",
-                                hole_basek<<10,
-                                (hole_basek + hole_sizek)<<10);
-               state->reg = range_to_mtrr(state->reg, hole_basek,
-                                hole_sizek, MTRR_TYPE_UNCACHABLE);
-       }
-
-       return second_sizek;
-}
-
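Editor's sketch of the effect of the hole logic in range_to_mtrr_with_hole(); the 1008MB range and the 256MB chunk size are assumed for illustration.

/*
 * Covering [0, 1008MB) with chunk_sizek = 256MB:
 *   range0_sizek = ALIGN(1008MB, 256MB) = 1024MB
 *   hole_sizek   = 1024MB - 1008MB      = 16MB   (< range0/2, so kept)
 * Emitted: WB [0, 1024MB)      -> 1 register
 *          UC [1008MB, 1024MB) -> 1 register
 * versus six WB registers (512+256+128+64+32+16 MB) without the hole.
 */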
-static void __init
-set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
-                  unsigned long size_pfn)
-{
-       unsigned long basek, sizek;
-       unsigned long second_sizek = 0;
-
-       if (state->reg >= num_var_ranges)
-               return;
-
-       basek = base_pfn << (PAGE_SHIFT - 10);
-       sizek = size_pfn << (PAGE_SHIFT - 10);
-
-       /* See if I can merge with the last range */
-       if ((basek <= 1024) ||
-           (state->range_startk + state->range_sizek == basek)) {
-               unsigned long endk = basek + sizek;
-               state->range_sizek = endk - state->range_startk;
-               return;
-       }
-       /* Write the range mtrrs */
-       if (state->range_sizek != 0)
-               second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
-
-       /* Allocate an msr */
-       state->range_startk = basek + second_sizek;
-       state->range_sizek  = sizek - second_sizek;
-}
-
-/* minimum size of an MTRR block that can contain a hole */
-static u64 mtrr_chunk_size __initdata = (256ULL<<20);
-
-static int __init parse_mtrr_chunk_size_opt(char *p)
-{
-       if (!p)
-               return -EINVAL;
-       mtrr_chunk_size = memparse(p, &p);
-       return 0;
-}
-early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
-
-/* granularity of an MTRR block */
-static u64 mtrr_gran_size __initdata;
-
-static int __init parse_mtrr_gran_size_opt(char *p)
-{
-       if (!p)
-               return -EINVAL;
-       mtrr_gran_size = memparse(p, &p);
-       return 0;
-}
-early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
-
-static int nr_mtrr_spare_reg __initdata =
-                                CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
-
-static int __init parse_mtrr_spare_reg(char *arg)
-{
-       if (arg)
-               nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
-       return 0;
-}
-
-early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
-
-static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
-                   u64 chunk_size, u64 gran_size)
-{
-       struct var_mtrr_state var_state;
-       int i;
-       int num_reg;
-
-       var_state.range_startk  = 0;
-       var_state.range_sizek   = 0;
-       var_state.reg           = 0;
-       var_state.chunk_sizek   = chunk_size >> 10;
-       var_state.gran_sizek    = gran_size >> 10;
-
-       memset(range_state, 0, sizeof(range_state));
-
-       /* Write the range etc */
-       for (i = 0; i < nr_range; i++)
-               set_var_mtrr_range(&var_state, range[i].start,
-                                  range[i].end - range[i].start + 1);
-
-       /* Write the last range */
-       if (var_state.range_sizek != 0)
-               range_to_mtrr_with_hole(&var_state, 0, 0);
-
-       num_reg = var_state.reg;
-       /* Clear out the extra MTRRs */
-       while (var_state.reg < num_var_ranges) {
-               save_var_mtrr(var_state.reg, 0, 0, 0);
-               var_state.reg++;
-       }
-
-       return num_reg;
-}
-
-struct mtrr_cleanup_result {
-       unsigned long gran_sizek;
-       unsigned long chunk_sizek;
-       unsigned long lose_cover_sizek;
-       unsigned int num_reg;
-       int bad;
-};
-
-/*
- * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G (16 values)
- * chunk_size: gran_size, ..., 2G
- * so we need 16 + 15 + ... + 1 = (1 + 16) * 16 / 2 = (1 + 16) * 8 = 136 results
- */
-#define NUM_RESULT     136
-#define PSHIFT         (PAGE_SHIFT - 10)
-
-static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
-static unsigned long __initdata min_loss_pfn[RANGE_NUM];
-
-static void __init print_out_mtrr_range_state(void)
-{
-       int i;
-       char start_factor = 'K', size_factor = 'K';
-       unsigned long start_base, size_base;
-       mtrr_type type;
-
-       for (i = 0; i < num_var_ranges; i++) {
-
-               size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
-               if (!size_base)
-                       continue;
-
-               size_base = to_size_factor(size_base, &size_factor);
-               start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
-               start_base = to_size_factor(start_base, &start_factor);
-               type = range_state[i].type;
-
-               printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
-                       i, start_base, start_factor,
-                       size_base, size_factor,
-                       (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
-                           ((type == MTRR_TYPE_WRPROT) ? "WP" :
-                            ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
-                       );
-       }
-}
-
-static int __init mtrr_need_cleanup(void)
-{
-       int i;
-       mtrr_type type;
-       unsigned long size;
-       /* one extra slot to count zero-sized (unused) entries */
-       int num[MTRR_NUM_TYPES + 1];
-
-       /* count the entries of each type */
-       memset(num, 0, sizeof(num));
-       for (i = 0; i < num_var_ranges; i++) {
-               type = range_state[i].type;
-               size = range_state[i].size_pfn;
-               if (type >= MTRR_NUM_TYPES)
-                       continue;
-               if (!size)
-                       type = MTRR_NUM_TYPES;
-               if (type == MTRR_TYPE_WRPROT)
-                       type = MTRR_TYPE_UNCACHABLE;
-               num[type]++;
-       }
-
-       /* check if we got UC entries */
-       if (!num[MTRR_TYPE_UNCACHABLE])
-               return 0;
-
-       /* check if we only had WB and UC */
-       if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
-               num_var_ranges - num[MTRR_NUM_TYPES])
-               return 0;
-
-       return 1;
-}
-
-static unsigned long __initdata range_sums;
-static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
-                                        unsigned long extra_remove_base,
-                                        unsigned long extra_remove_size,
-                                        int i)
-{
-       int num_reg;
-       static struct res_range range_new[RANGE_NUM];
-       static int nr_range_new;
-       unsigned long range_sums_new;
-
-       /* convert ranges to var ranges state */
-       num_reg = x86_setup_var_mtrrs(range, nr_range,
-                                               chunk_size, gran_size);
-
-       /* we now have a new setting in range_state; check it */
-       memset(range_new, 0, sizeof(range_new));
-       nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
-                               extra_remove_base, extra_remove_size);
-       range_sums_new = sum_ranges(range_new, nr_range_new);
-
-       result[i].chunk_sizek = chunk_size >> 10;
-       result[i].gran_sizek = gran_size >> 10;
-       result[i].num_reg = num_reg;
-       if (range_sums < range_sums_new) {
-               result[i].lose_cover_sizek =
-                       (range_sums_new - range_sums) << PSHIFT;
-               result[i].bad = 1;
-       } else
-               result[i].lose_cover_sizek =
-                       (range_sums - range_sums_new) << PSHIFT;
-
-       /* double check it */
-       if (!result[i].bad && !result[i].lose_cover_sizek) {
-               if (nr_range_new != nr_range ||
-                       memcmp(range, range_new, sizeof(range)))
-                               result[i].bad = 1;
-       }
-
-       if (!result[i].bad && (range_sums - range_sums_new <
-                               min_loss_pfn[num_reg])) {
-               min_loss_pfn[num_reg] =
-                       range_sums - range_sums_new;
-       }
-}
-
-static void __init mtrr_print_out_one_result(int i)
-{
-       char gran_factor, chunk_factor, lose_factor;
-       unsigned long gran_base, chunk_base, lose_base;
-
-       gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
-       chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
-       lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
-       printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
-                       result[i].bad ? "*BAD*" : " ",
-                       gran_base, gran_factor, chunk_base, chunk_factor);
-       printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ld%c\n",
-                       result[i].num_reg, result[i].bad ? "-" : "",
-                       lose_base, lose_factor);
-}
-
-static int __init mtrr_search_optimal_index(void)
-{
-       int i;
-       int num_reg_good;
-       int index_good;
-
-       if (nr_mtrr_spare_reg >= num_var_ranges)
-               nr_mtrr_spare_reg = num_var_ranges - 1;
-       num_reg_good = -1;
-       for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
-               if (!min_loss_pfn[i])
-                       num_reg_good = i;
-       }
-
-       index_good = -1;
-       if (num_reg_good != -1) {
-               for (i = 0; i < NUM_RESULT; i++) {
-                       if (!result[i].bad &&
-                           result[i].num_reg == num_reg_good &&
-                           !result[i].lose_cover_sizek) {
-                               index_good = i;
-                               break;
-                       }
-               }
-       }
-
-       return index_good;
-}
-
-
-static int __init mtrr_cleanup(unsigned address_bits)
-{
-       unsigned long extra_remove_base, extra_remove_size;
-       unsigned long base, size, def, dummy;
-       mtrr_type type;
-       u64 chunk_size, gran_size;
-       int index_good;
-       int i;
-
-       if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
-               return 0;
-       rdmsr(MTRRdefType_MSR, def, dummy);
-       def &= 0xff;
-       if (def != MTRR_TYPE_UNCACHABLE)
-               return 0;
-
-       /* read the current variable MTRRs and store them aside */
-       memset(range_state, 0, sizeof(range_state));
-       for (i = 0; i < num_var_ranges; i++) {
-               mtrr_if->get(i, &base, &size, &type);
-               range_state[i].base_pfn = base;
-               range_state[i].size_pfn = size;
-               range_state[i].type = type;
-       }
-
-       /* check whether we need to handle it and can handle it */
-       if (!mtrr_need_cleanup())
-               return 0;
-
-       /* print the original variable MTRRs first, for debugging: */
-       printk(KERN_DEBUG "original variable MTRRs\n");
-       print_out_mtrr_range_state();
-
-       memset(range, 0, sizeof(range));
-       extra_remove_size = 0;
-       extra_remove_base = 1 << (32 - PAGE_SHIFT);
-       if (mtrr_tom2)
-               extra_remove_size =
-                       (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
-       nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
-                                         extra_remove_size);
-       /*
-        * [0, 1M) should always be covered by a variable MTRR with WB,
-        * and the fixed MTRRs take precedence over the variable MTRRs there
-        */
-       nr_range = add_range_with_merge(range, nr_range, 0,
-                                       (1ULL<<(20 - PAGE_SHIFT)) - 1);
-       /* sort the ranges */
-       sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
-       range_sums = sum_ranges(range, nr_range);
-       printk(KERN_INFO "total RAM covered: %ldM\n",
-              range_sums >> (20 - PAGE_SHIFT));
-
-       if (mtrr_chunk_size && mtrr_gran_size) {
-               i = 0;
-               mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
-                                     extra_remove_base, extra_remove_size, i);
-
-               mtrr_print_out_one_result(i);
-
-               if (!result[i].bad) {
-                       set_var_mtrr_all(address_bits);
-                       return 1;
-               }
-               printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
-                      "will search for an optimal one\n");
-       }
-
-       i = 0;
-       memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
-       memset(result, 0, sizeof(result));
-       for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
-
-               for (chunk_size = gran_size; chunk_size < (1ULL<<32);
-                    chunk_size <<= 1) {
-
-                       if (i >= NUM_RESULT)
-                               continue;
-
-                       mtrr_calc_range_state(chunk_size, gran_size,
-                                     extra_remove_base, extra_remove_size, i);
-                       if (debug_print) {
-                               mtrr_print_out_one_result(i);
-                               printk(KERN_INFO "\n");
-                       }
-
-                       i++;
-               }
-       }
-
-       /* try to find the optimal index */
-       index_good = mtrr_search_optimal_index();
-
-       if (index_good != -1) {
-               printk(KERN_INFO "Found optimal setting for mtrr cleanup\n");
-               i = index_good;
-               mtrr_print_out_one_result(i);
-
-               /* convert ranges to var ranges state */
-               chunk_size = result[i].chunk_sizek;
-               chunk_size <<= 10;
-               gran_size = result[i].gran_sizek;
-               gran_size <<= 10;
-               x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
-               set_var_mtrr_all(address_bits);
-               printk(KERN_DEBUG "New variable MTRRs\n");
-               print_out_mtrr_range_state();
-               return 1;
-       } else {
-               /* print out all */
-               for (i = 0; i < NUM_RESULT; i++)
-                       mtrr_print_out_one_result(i);
-       }
-
-       printk(KERN_INFO "mtrr_cleanup: cannot find an optimal value\n");
-       printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
-
-       return 0;
-}
-#else
-static int __init mtrr_cleanup(unsigned address_bits)
-{
-       return 0;
-}
-#endif
-
-static int __initdata changed_by_mtrr_cleanup;
-
-static int disable_mtrr_trim;
-
-static int __init disable_mtrr_trim_setup(char *str)
-{
-       disable_mtrr_trim = 1;
-       return 0;
-}
-early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
-
-/*
- * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
- * for memory >4GB. Check for that here.
- * Note this won't check whether the MTRRs below 4GB (where the magic bit
- * doesn't apply) are wrong, but so far we don't know of any such case
- * in the wild.
- */
-#define Tom2Enabled (1U << 21)
-#define Tom2ForceMemTypeWB (1U << 22)
-
-int __init amd_special_default_mtrr(void)
-{
-       u32 l, h;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-               return 0;
-       if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
-               return 0;
-       /* In case some hypervisor doesn't pass SYSCFG through */
-       if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
-               return 0;
-       /*
-        * Memory between 4GB and top of mem is forced WB by this magic bit.
-        * Reserved before K8RevF, but should be zero there.
-        */
-       if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
-                (Tom2Enabled | Tom2ForceMemTypeWB))
-               return 1;
-       return 0;
-}
-
-static u64 __init real_trim_memory(unsigned long start_pfn,
-                                  unsigned long limit_pfn)
-{
-       u64 trim_start, trim_size;
-       trim_start = start_pfn;
-       trim_start <<= PAGE_SHIFT;
-       trim_size = limit_pfn;
-       trim_size <<= PAGE_SHIFT;
-       trim_size -= trim_start;
-
-       return e820_update_range(trim_start, trim_size, E820_RAM,
-                               E820_RESERVED);
-}
-/**
- * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
- * @end_pfn: ending page frame number
- *
- * Some buggy BIOSes don't set up the MTRRs properly for systems with certain
- * memory configurations.  This routine checks that the highest MTRR matches
- * the end of memory, to make sure the MTRRs of write-back type cover all of
- * the memory the kernel intends to use.  If not, it trims any uncovered
- * memory off the end by adjusting end_pfn, removing it from the kernel's
- * allocation pools and warning the user with an obnoxious message.
- */
-int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
-{
-       unsigned long i, base, size, highest_pfn = 0, def, dummy;
-       mtrr_type type;
-       u64 total_trim_size;
-
-       /* one extra slot to count zero-sized (unused) entries */
-       int num[MTRR_NUM_TYPES + 1];
-       /*
-        * Make sure we only trim uncachable memory on machines that
-        * support the Intel MTRR architecture:
-        */
-       if (!is_cpu(INTEL) || disable_mtrr_trim)
-               return 0;
-       rdmsr(MTRRdefType_MSR, def, dummy);
-       def &= 0xff;
-       if (def != MTRR_TYPE_UNCACHABLE)
-               return 0;
-
-       /* read the current variable MTRRs and store them aside */
-       memset(range_state, 0, sizeof(range_state));
-       for (i = 0; i < num_var_ranges; i++) {
-               mtrr_if->get(i, &base, &size, &type);
-               range_state[i].base_pfn = base;
-               range_state[i].size_pfn = size;
-               range_state[i].type = type;
-       }
-
-       /* Find highest cached pfn */
-       for (i = 0; i < num_var_ranges; i++) {
-               type = range_state[i].type;
-               if (type != MTRR_TYPE_WRBACK)
-                       continue;
-               base = range_state[i].base_pfn;
-               size = range_state[i].size_pfn;
-               if (highest_pfn < base + size)
-                       highest_pfn = base + size;
-       }
-
-       /* KVM/QEMU doesn't set the MTRRs up; don't trim all of memory in that case */
-       if (!highest_pfn) {
-               printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
-               return 0;
-       }
-
-       /* count the entries of each type */
-       memset(num, 0, sizeof(num));
-       for (i = 0; i < num_var_ranges; i++) {
-               type = range_state[i].type;
-               if (type >= MTRR_NUM_TYPES)
-                       continue;
-               size = range_state[i].size_pfn;
-               if (!size)
-                       type = MTRR_NUM_TYPES;
-               num[type]++;
-       }
-
-       /* no entry for WB? */
-       if (!num[MTRR_TYPE_WRBACK])
-               return 0;
-
-       /* check if we only had WB and UC */
-       if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
-               num_var_ranges - num[MTRR_NUM_TYPES])
-               return 0;
-
-       memset(range, 0, sizeof(range));
-       nr_range = 0;
-       if (mtrr_tom2) {
-               range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
-               range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
-               if (highest_pfn < range[nr_range].end + 1)
-                       highest_pfn = range[nr_range].end + 1;
-               nr_range++;
-       }
-       nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
-
-       total_trim_size = 0;
-       /* check the head */
-       if (range[0].start)
-               total_trim_size += real_trim_memory(0, range[0].start);
-       /* check the holes */
-       for (i = 0; i < nr_range - 1; i++) {
-               if (range[i].end + 1 < range[i+1].start)
-                       total_trim_size += real_trim_memory(range[i].end + 1,
-                                                           range[i+1].start);
-       }
-       /* check the top */
-       i = nr_range - 1;
-       if (range[i].end + 1 < end_pfn)
-               total_trim_size += real_trim_memory(range[i].end + 1,
-                                                        end_pfn);
-
-       if (total_trim_size) {
-               printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
-                       " all of memory, losing %lluMB of RAM.\n",
-                       total_trim_size >> 20);
-
-               if (!changed_by_mtrr_cleanup)
-                       WARN_ON(1);
-
-               printk(KERN_INFO "update e820 for mtrr\n");
-               update_e820();
-
-               return 1;
-       }
-
-       return 0;
-}
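Editor's sketch of the trim path above on a hypothetical system; the page frame numbers are invented for illustration.

/*
 * end_pfn = 0x100000 (4GB of RAM), highest WB MTRR ends at pfn 0xFF000:
 *   real_trim_memory(0xFF000, 0x100000)
 *     -> e820_update_range(0xFF000000, 0x1000000,
 *                          E820_RAM, E820_RESERVED)
 *   total_trim_size = 16MB, so the kernel warns:
 *     "WARNING: BIOS bug: CPU MTRRs don't cover all of memory,
 *      losing 16MB of RAM."
 */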
+int __initdata changed_by_mtrr_cleanup;
 
 /**
  * mtrr_bp_init - initialize mtrrs on the boot CPU
index ffd60409cc6d410ee16eac82d76f4f83badab39e..77f67f7b347a3a97a093b1094c5da86a2ca9ad35 100644 (file)
@@ -79,6 +79,7 @@ extern struct mtrr_ops * mtrr_if;
 
 extern unsigned int num_var_ranges;
 extern u64 mtrr_tom2;
+extern struct mtrr_state_type mtrr_state;
 
 void mtrr_state_warn(void);
 const char *mtrr_attrib_to_str(int x);
@@ -88,3 +89,6 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
 int amd_init_mtrr(void);
 int cyrix_init_mtrr(void);
 int centaur_init_mtrr(void);
+
+extern int changed_by_mtrr_cleanup;
+extern int mtrr_cleanup(unsigned address_bits);
index 52b3fefbd5af13c78b037b70f0e6bbeb6bbc780a..bb62b3e5caadca0faba5e489fb874bf563e0073c 100644 (file)
@@ -98,7 +98,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 #endif
 }
 
-static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
        .c_vendor       = "Transmeta",
        .c_ident        = { "GenuineTMx86", "TransmetaCPU" },
        .c_early_init   = early_init_transmeta,
index e777f79e0960d29fdaae20d800098569746450fe..fd2c37bf7acb6c6cc9a322600dc2c9dade028dbb 100644 (file)
@@ -8,7 +8,7 @@
  * so no special init takes place.
  */
 
-static struct cpu_dev umc_cpu_dev __cpuinitdata = {
+static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
        .c_vendor       = "UMC",
        .c_ident        = { "UMC UMC UMC" },
        .c_models = {
index 508bec1cee27af03844337a32497cd84a29f80ce..ef2c3563357d52c6f7d314a1df2cd018bbdca000 100644 (file)
@@ -110,19 +110,50 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 /*
  * Add a memory region to the kernel e820 map.
  */
-void __init e820_add_region(u64 start, u64 size, int type)
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+                                        int type)
 {
-       int x = e820.nr_map;
+       int x = e820x->nr_map;
 
-       if (x == ARRAY_SIZE(e820.map)) {
+       if (x == ARRAY_SIZE(e820x->map)) {
                printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
                return;
        }
 
-       e820.map[x].addr = start;
-       e820.map[x].size = size;
-       e820.map[x].type = type;
-       e820.nr_map++;
+       e820x->map[x].addr = start;
+       e820x->map[x].size = size;
+       e820x->map[x].type = type;
+       e820x->nr_map++;
+}
+
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+       __e820_add_region(&e820, start, size, type);
+}
+
+static void __init e820_print_type(u32 type)
+{
+       switch (type) {
+       case E820_RAM:
+       case E820_RESERVED_KERN:
+               printk(KERN_CONT "(usable)");
+               break;
+       case E820_RESERVED:
+               printk(KERN_CONT "(reserved)");
+               break;
+       case E820_ACPI:
+               printk(KERN_CONT "(ACPI data)");
+               break;
+       case E820_NVS:
+               printk(KERN_CONT "(ACPI NVS)");
+               break;
+       case E820_UNUSABLE:
+               printk(KERN_CONT "(unusable)");
+               break;
+       default:
+               printk(KERN_CONT "type %u", type);
+               break;
+       }
 }
 
 void __init e820_print_map(char *who)
@@ -134,27 +165,8 @@ void __init e820_print_map(char *who)
                       (unsigned long long) e820.map[i].addr,
                       (unsigned long long)
                       (e820.map[i].addr + e820.map[i].size));
-               switch (e820.map[i].type) {
-               case E820_RAM:
-               case E820_RESERVED_KERN:
-                       printk(KERN_CONT "(usable)\n");
-                       break;
-               case E820_RESERVED:
-                       printk(KERN_CONT "(reserved)\n");
-                       break;
-               case E820_ACPI:
-                       printk(KERN_CONT "(ACPI data)\n");
-                       break;
-               case E820_NVS:
-                       printk(KERN_CONT "(ACPI NVS)\n");
-                       break;
-               case E820_UNUSABLE:
-                       printk("(unusable)\n");
-                       break;
-               default:
-                       printk(KERN_CONT "type %u\n", e820.map[i].type);
-                       break;
-               }
+               e820_print_type(e820.map[i].type);
+               printk(KERN_CONT "\n");
        }
 }
 
@@ -221,7 +233,7 @@ void __init e820_print_map(char *who)
  */
 
 int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
-                               int *pnr_map)
+                            u32 *pnr_map)
 {
        struct change_member {
                struct e820entry *pbios; /* pointer to original bios entry */
@@ -417,11 +429,12 @@ static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
        return __append_e820_map(biosmap, nr_map);
 }
 
-static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
                                        u64 size, unsigned old_type,
                                        unsigned new_type)
 {
-       int i;
+       u64 end;
+       unsigned int i;
        u64 real_updated_size = 0;
 
        BUG_ON(old_type == new_type);
@@ -429,27 +442,55 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
        if (size > (ULLONG_MAX - start))
                size = ULLONG_MAX - start;
 
-       for (i = 0; i < e820.nr_map; i++) {
+       end = start + size;
+       printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
+                      (unsigned long long) start,
+                      (unsigned long long) end);
+       e820_print_type(old_type);
+       printk(KERN_CONT " ==> ");
+       e820_print_type(new_type);
+       printk(KERN_CONT "\n");
+
+       for (i = 0; i < e820x->nr_map; i++) {
                struct e820entry *ei = &e820x->map[i];
                u64 final_start, final_end;
+               u64 ei_end;
+
                if (ei->type != old_type)
                        continue;
-               /* totally covered? */
-               if (ei->addr >= start &&
-                   (ei->addr + ei->size) <= (start + size)) {
+
+               ei_end = ei->addr + ei->size;
+               /* totally covered by new range? */
+               if (ei->addr >= start && ei_end <= end) {
                        ei->type = new_type;
                        real_updated_size += ei->size;
                        continue;
                }
+
+               /* new range is totally covered? */
+               if (ei->addr < start && ei_end > end) {
+                       __e820_add_region(e820x, start, size, new_type);
+                       __e820_add_region(e820x, end, ei_end - end, ei->type);
+                       ei->size = start - ei->addr;
+                       real_updated_size += size;
+                       continue;
+               }
+
                /* partially covered */
                final_start = max(start, ei->addr);
-               final_end = min(start + size, ei->addr + ei->size);
+               final_end = min(end, ei_end);
                if (final_start >= final_end)
                        continue;
-               e820_add_region(final_start, final_end - final_start,
-                                        new_type);
+
+               __e820_add_region(e820x, final_start, final_end - final_start,
+                                 new_type);
+
                real_updated_size += final_end - final_start;
 
+               /*
+                * The leftover part of the entry could be its head or its
+                * tail, so update the size first.
+                */
                ei->size -= final_end - final_start;
                if (ei->addr < final_start)
                        continue;
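Editor's sketch of the new "new range is totally covered" branch added above; the addresses are hypothetical.

/*
 * Existing entry: [0x100000000, 0x200000000) E820_RAM
 * Update request: [0x180000000, 0x180001000) RAM -> RESERVED
 *   1. [0x180000000, 0x180001000) is added as E820_RESERVED
 *   2. [0x180001000, 0x200000000) is re-added as E820_RAM
 *   3. the original entry is shrunk to [0x100000000, 0x180000000)
 */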
@@ -461,13 +502,13 @@ static u64 __init e820_update_range_map(struct e820map *e820x, u64 start,
 u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
                             unsigned new_type)
 {
-       return e820_update_range_map(&e820, start, size, old_type, new_type);
+       return __e820_update_range(&e820, start, size, old_type, new_type);
 }
 
 static u64 __init e820_update_range_saved(u64 start, u64 size,
                                          unsigned old_type, unsigned new_type)
 {
-       return e820_update_range_map(&e820_saved, start, size, old_type,
+       return __e820_update_range(&e820_saved, start, size, old_type,
                                     new_type);
 }
 
@@ -511,7 +552,7 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 
 void __init update_e820(void)
 {
-       int nr_map;
+       u32 nr_map;
 
        nr_map = e820.nr_map;
        if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
@@ -522,7 +563,7 @@ void __init update_e820(void)
 }
 static void __init update_e820_saved(void)
 {
-       int nr_map;
+       u32 nr_map;
 
        nr_map = e820_saved.nr_map;
        if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
@@ -1020,8 +1061,8 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
                        continue;
                return addr;
        }
-       return -1UL;
 
+       return -1ULL;
 }
 
 /*
@@ -1034,13 +1075,22 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
        u64 start;
 
        start = startt;
-       while (size < sizet)
+       while (size < sizet && (start + 1))
                start = find_e820_area_size(start, &size, align);
 
        if (size < sizet)
                return 0;
 
+#ifdef CONFIG_X86_32
+       if (start >= MAXMEM)
+               return 0;
+       if (start + size > MAXMEM)
+               size = MAXMEM - start;
+#endif
+
        addr = round_down(start + size - sizet, align);
+       if (addr < start)
+               return 0;
        e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
        e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
        printk(KERN_INFO "update e820 for early_reserve_e820\n");
@@ -1253,7 +1303,7 @@ early_param("memmap", parse_memmap_opt);
 void __init finish_e820_parsing(void)
 {
        if (userdef) {
-               int nr = e820.nr_map;
+               u32 nr = e820.nr_map;
 
                if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
                        early_panic("Invalid user supplied memory map");
@@ -1336,7 +1386,7 @@ void __init e820_reserve_resources_late(void)
 char *__init default_machine_specific_memory_setup(void)
 {
        char *who = "BIOS-e820";
-       int new_nr;
+       u32 new_nr;
        /*
         * Try to copy the BIOS-supplied E820-map.
         *
index 639ad98238a2b395344b680e07ad6aa717ed9f45..335f049d110f5b095793cb6832a3571ee1d2586c 100644 (file)
@@ -250,7 +250,7 @@ static int dbgp_wait_until_complete(void)
        return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
 }
 
-static void dbgp_mdelay(int ms)
+static void __init dbgp_mdelay(int ms)
 {
        int i;
 
@@ -311,7 +311,7 @@ static void dbgp_set_data(const void *buf, int size)
        writel(hi, &ehci_debug->data47);
 }
 
-static void dbgp_get_data(void *buf, int size)
+static void __init dbgp_get_data(void *buf, int size)
 {
        unsigned char *bytes = buf;
        u32 lo, hi;
@@ -355,7 +355,7 @@ static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
        return ret;
 }
 
-static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
                                 int size)
 {
        u32 pids, addr, ctrl;
@@ -386,8 +386,8 @@ static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
        return ret;
 }
 
-static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
-       int value, int index, void *data, int size)
+static int __init dbgp_control_msg(unsigned devnum, int requesttype,
+       int request, int value, int index, void *data, int size)
 {
        u32 pids, addr, ctrl;
        struct usb_ctrlrequest req;
@@ -489,7 +489,7 @@ static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
        return 0;
 }
 
-static int ehci_reset_port(int port)
+static int __init ehci_reset_port(int port)
 {
        u32 portsc;
        u32 delay_time, delay;
@@ -532,7 +532,7 @@ static int ehci_reset_port(int port)
        return -EBUSY;
 }
 
-static int ehci_wait_for_port(int port)
+static int __init ehci_wait_for_port(int port)
 {
        u32 status;
        int ret, reps;
@@ -557,13 +557,13 @@ static inline void dbgp_printk(const char *fmt, ...) { }
 
 typedef void (*set_debug_port_t)(int port);
 
-static void default_set_debug_port(int port)
+static void __init default_set_debug_port(int port)
 {
 }
 
-static set_debug_port_t set_debug_port = default_set_debug_port;
+static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
 
-static void nvidia_set_debug_port(int port)
+static void __init nvidia_set_debug_port(int port)
 {
        u32 dword;
        dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
index 899e8938e79f3d571cb207d6205b14800b2aa713..c929add475c9caf073f97c11b15ff6d645226126 100644 (file)
@@ -442,8 +442,7 @@ sysenter_past_esp:
 
        GET_THREAD_INFO(%ebp)
 
-       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-       testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
        jnz sysenter_audit
 sysenter_do_call:
        cmpl $(nr_syscalls), %eax
@@ -454,7 +453,7 @@ sysenter_do_call:
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
-       testw $_TIF_ALLWORK_MASK, %cx
+       testl $_TIF_ALLWORK_MASK, %ecx
        jne sysexit_audit
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
@@ -468,7 +467,7 @@ sysenter_exit:
 
 #ifdef CONFIG_AUDITSYSCALL
 sysenter_audit:
-       testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
        jnz syscall_trace_entry
        addl $4,%esp
        CFI_ADJUST_CFA_OFFSET -4
@@ -485,7 +484,7 @@ sysenter_audit:
        jmp sysenter_do_call
 
 sysexit_audit:
-       testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+       testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
        jne syscall_exit_work
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_ANY)
@@ -498,7 +497,7 @@ sysexit_audit:
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
-       testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+       testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
        jne syscall_exit_work
        movl PT_EAX(%esp),%eax  /* reload syscall return value */
        jmp sysenter_exit
@@ -523,8 +522,7 @@ ENTRY(system_call)
        SAVE_ALL
        GET_THREAD_INFO(%ebp)
                                        # system call tracing in operation / emulation
-       /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-       testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
        jnz syscall_trace_entry
        cmpl $(nr_syscalls), %eax
        jae syscall_badsys
@@ -538,7 +536,7 @@ syscall_exit:
                                        # between sampling and the iret
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
-       testw $_TIF_ALLWORK_MASK, %cx   # current->work
+       testl $_TIF_ALLWORK_MASK, %ecx  # current->work
        jne syscall_exit_work
 
 restore_all:
@@ -673,7 +671,7 @@ END(syscall_trace_entry)
        # perform syscall exit tracing
        ALIGN
 syscall_exit_work:
-       testb $_TIF_WORK_SYSCALL_EXIT, %cl
+       testl $_TIF_WORK_SYSCALL_EXIT, %ecx
        jz work_pending
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_ANY)     # could let syscall_trace_leave() call
index 83d1836b9467e74069d71805a65a53dbbcbce1b2..a331ec38af9ebb5d186685623448ac2e5263fd31 100644 (file)
@@ -368,6 +368,7 @@ ENTRY(save_rest)
 END(save_rest)
 
 /* save complete stack frame */
+       .pushsection .kprobes.text, "ax"
 ENTRY(save_paranoid)
        XCPT_FRAME 1 RDI+8
        cld
@@ -396,6 +397,7 @@ ENTRY(save_paranoid)
 1:     ret
        CFI_ENDPROC
 END(save_paranoid)
+       .popsection
 
 /*
  * A newly forked process directly context switches into this address.
@@ -416,7 +418,6 @@ ENTRY(ret_from_fork)
 
        GET_THREAD_INFO(%rcx)
 
-       CFI_REMEMBER_STATE
        RESTORE_REST
 
        testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
@@ -428,7 +429,6 @@ ENTRY(ret_from_fork)
        RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
        jmp ret_from_sys_call                   # go to the SYSRET fastpath
 
-       CFI_RESTORE_STATE
        CFI_ENDPROC
 END(ret_from_fork)
 
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
 #endif
 apicinterrupt LOCAL_TIMER_VECTOR \
        apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+       generic_interrupt smp_generic_interrupt
 
 #ifdef CONFIG_SMP
 apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
index ac108d1fe182a44abcd0da75f18aae5fde32e1c0..3f8579f8d42cfa52710cc2331494542b1d1942dc 100644 (file)
@@ -18,7 +18,7 @@ void __init i386_start_kernel(void)
 {
        reserve_trampoline_memory();
 
-       reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+       reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
        /* Reserve INITRD */
@@ -29,9 +29,6 @@ void __init i386_start_kernel(void)
                reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
        }
 #endif
-       reserve_early(init_pg_tables_start, init_pg_tables_end,
-                       "INIT_PG_TABLE");
-
        reserve_ebda_region();
 
        /*
index f5b2722476907bcd3fe7ace735238330c604b9e9..70eaa852c732a2db3ae5c809f56f27b9c0f42dc1 100644 (file)
@@ -100,7 +100,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
 
        reserve_trampoline_memory();
 
-       reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+       reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
        /* Reserve INITRD */
index c32ca19d591a5de88057d2d9534168332fa7bf77..30683883e0cdcf0bd669cb439206fefaf74301f2 100644 (file)
 #define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
 
 /*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
+ * This is how much memory in addition to the memory covered up to
+ * and including _end we need mapped initially.
  * We need:
- *  - one bit for each possible page, but only in low memory, which means
- *     2^32/4096/8 = 128K worst case (4G/4G split.)
- *  - enough space to map all low memory, which means
- *     (2^32/4096) / 1024 pages (worst case, non PAE)
- *     (2^32/4096) / 512 + 4 pages (worst case for PAE)
- *  - a few pages for allocator use before the kernel pagetable has
- *     been set up
+ *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
  *
  * Modulo rounding, each megabyte assigned here requires a kilobyte of
  * memory, which is currently unreclaimed.
  *
  * This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and smaller than max_low_pfn, otherwise some page table entries are wasted
  */
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#ifdef CONFIG_DEBUG_PAGEALLOC
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
 
 #if PTRS_PER_PMD > 1
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
 #else
-PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
 #endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
-ALLOCATOR_SLOP = 4
 
-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+/* Enough space to fit pagetables for the low memory linear map */
+MAPPING_BEYOND_END = \
+       PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
+
+/*
+ * Worst-case size of the kernel mapping we need to make:
+ * the worst-case size of the kernel itself, plus the extra we need
+ * to map for the linear map.
+ */
+KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
 
 /*
  * 32-bit kernel entrypoint; only used by the boot CPU.  On entry,
@@ -166,10 +164,10 @@ num_subarch_entries = (. - subarch_entries) / 4
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond _end.  The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond __brk_base.  The variable
+ * _brk_end is set up to point to the first "safe" location.
  * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end.
  *
  * Note that the stack is not yet set up!
  */
@@ -190,8 +188,7 @@ default_entry:
 
        xorl %ebx,%ebx                          /* %ebx is kept at zero */
 
-       movl $pa(pg0), %edi
-       movl %edi, pa(init_pg_tables_start)
+       movl $pa(__brk_base), %edi
        movl $pa(swapper_pg_pmd), %edx
        movl $PTE_IDENT_ATTR, %eax
 10:
@@ -209,14 +206,14 @@ default_entry:
        loop 11b
 
        /*
-        * End condition: we must map up to and including INIT_MAP_BEYOND_END
-        * bytes beyond the end of our own page tables.
+        * End condition: we must map up to the end + MAPPING_BEYOND_END.
         */
-       leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+       movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
        cmpl %ebp,%eax
        jb 10b
 1:
-       movl %edi,pa(init_pg_tables_end)
+       addl $__PAGE_OFFSET, %edi
+       movl %edi, pa(_brk_end)
        shrl $12, %eax
        movl %eax, pa(max_pfn_mapped)
 
@@ -227,8 +224,7 @@ default_entry:
 
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
-       movl $pa(pg0), %edi
-       movl %edi, pa(init_pg_tables_start)
+       movl $pa(__brk_base), %edi
        movl $pa(swapper_pg_dir), %edx
        movl $PTE_IDENT_ATTR, %eax
 10:
@@ -242,14 +238,13 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
        addl $0x1000,%eax
        loop 11b
        /*
-        * End condition: we must map up to and including INIT_MAP_BEYOND_END
-        * bytes beyond the end of our own page tables; the +0x007 is
-        * the attribute bits
+        * End condition: we must map up to the end + MAPPING_BEYOND_END.
         */
-       leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+       movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
        cmpl %ebp,%eax
        jb 10b
-       movl %edi,pa(init_pg_tables_end)
+       addl $__PAGE_OFFSET, %edi
+       movl %edi, pa(_brk_end)
        shrl $12, %eax
        movl %eax, pa(max_pfn_mapped)
 
@@ -636,6 +631,7 @@ swapper_pg_fixmap:
        .fill 1024,4,0
 ENTRY(empty_zero_page)
        .fill 4096,1,0
+
 /*
  * This starts the data section.
  */
index 10f92fb532f34e1c0e0c6a93059fa845eed7049c..3475440baa549b00c9666a67da874d47efa23ef9 100644 (file)
@@ -3,17 +3,17 @@
  *
  */
 #include <linux/clockchips.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/spinlock.h>
 #include <linux/jiffies.h>
 #include <linux/module.h>
-#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
 
-#include <asm/smp.h>
-#include <asm/delay.h>
 #include <asm/i8253.h>
-#include <asm/io.h>
 #include <asm/hpet.h>
+#include <asm/smp.h>
 
 DEFINE_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
@@ -40,7 +40,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 {
        spin_lock(&i8253_lock);
 
-       switch(mode) {
+       switch (mode) {
        case CLOCK_EVT_MODE_PERIODIC:
                /* binary, mode 2, LSB/MSB, ch 0 */
                outb_pit(0x34, PIT_MODE);
@@ -95,7 +95,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
  * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
  * !using_apic_timer decisions in do_timer_interrupt_hook()
  */
-static struct clock_event_device pit_clockevent = {
+static struct clock_event_device pit_ce = {
        .name           = "pit",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .set_mode       = init_pit_timer,
@@ -114,15 +114,13 @@ void __init setup_pit_timer(void)
         * Start pit with the boot cpu mask and make it global after the
         * IO_APIC has been initialized.
         */
-       pit_clockevent.cpumask = cpumask_of(smp_processor_id());
-       pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
-                                    pit_clockevent.shift);
-       pit_clockevent.max_delta_ns =
-               clockevent_delta2ns(0x7FFF, &pit_clockevent);
-       pit_clockevent.min_delta_ns =
-               clockevent_delta2ns(0xF, &pit_clockevent);
-       clockevents_register_device(&pit_clockevent);
-       global_clock_event = &pit_clockevent;
+       pit_ce.cpumask = cpumask_of(smp_processor_id());
+       pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
+       pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
+       pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
+
+       clockevents_register_device(&pit_ce);
+       global_clock_event = &pit_ce;
 }
 
 #ifndef CONFIG_X86_64
@@ -133,11 +131,11 @@ void __init setup_pit_timer(void)
  */
 static cycle_t pit_read(void)
 {
+       static int old_count;
+       static u32 old_jifs;
        unsigned long flags;
        int count;
        u32 jifs;
-       static int old_count;
-       static u32 old_jifs;
 
        spin_lock_irqsave(&i8253_lock, flags);
        /*
@@ -179,9 +177,9 @@ static cycle_t pit_read(void)
         * Previous attempts to handle these cases intelligently were
         * buggy, so we just do the simple thing now.
         */
-       if (count > old_count && jifs == old_jifs) {
+       if (count > old_count && jifs == old_jifs)
                count = old_count;
-       }
+
        old_count = count;
        old_jifs = jifs;
 
@@ -192,13 +190,13 @@ static cycle_t pit_read(void)
        return (cycle_t)(jifs * LATCH) + count;
 }
 
-static struct clocksource clocksource_pit = {
-       .name   = "pit",
-       .rating = 110,
-       .read   = pit_read,
-       .mask   = CLOCKSOURCE_MASK(32),
-       .mult   = 0,
-       .shift  = 20,
+static struct clocksource pit_cs = {
+       .name           = "pit",
+       .rating         = 110,
+       .read           = pit_read,
+       .mask           = CLOCKSOURCE_MASK(32),
+       .mult           = 0,
+       .shift          = 20,
 };
 
 static void pit_disable_clocksource(void)
@@ -206,9 +204,9 @@ static void pit_disable_clocksource(void)
        /*
         * Use mult to check whether it is registered or not
         */
-       if (clocksource_pit.mult) {
-               clocksource_unregister(&clocksource_pit);
-               clocksource_pit.mult = 0;
+       if (pit_cs.mult) {
+               clocksource_unregister(&pit_cs);
+               pit_cs.mult = 0;
        }
 }
 
@@ -222,13 +220,13 @@ static int __init init_pit_clocksource(void)
          * - when local APIC timer is active (PIT is switched off)
          */
        if (num_possible_cpus() > 1 || is_hpet_enabled() ||
-           pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+           pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
                return 0;
 
-       clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE,
-                                                  clocksource_pit.shift);
-       return clocksource_register(&clocksource_pit);
+       pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
+
+       return clocksource_register(&pit_cs);
 }
 arch_initcall(init_pit_clocksource);
 
-#endif
+#endif /* !CONFIG_X86_64 */
index 720d2607aacbc2bee985b00289b8bc5c70cb4fb5..a979b5bd2fc06ee5d07e2eb6a292caa773113f1a 100644 (file)
@@ -7,10 +7,10 @@
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/init.h>
 #include <linux/dmi.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
 
@@ -47,8 +47,7 @@ EXPORT_SYMBOL(native_io_delay);
 static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
 {
        if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
-               printk(KERN_NOTICE "%s: using 0xed I/O delay port\n",
-                       id->ident);
+               pr_notice("%s: using 0xed I/O delay port\n", id->ident);
                io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
        }
 
@@ -64,40 +63,40 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
                .callback       = dmi_io_delay_0xed_port,
                .ident          = "Compaq Presario V6000",
                .matches        = {
-                       DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-                       DMI_MATCH(DMI_BOARD_NAME, "30B7")
+                       DMI_MATCH(DMI_BOARD_VENDOR,     "Quanta"),
+                       DMI_MATCH(DMI_BOARD_NAME,       "30B7")
                }
        },
        {
                .callback       = dmi_io_delay_0xed_port,
                .ident          = "HP Pavilion dv9000z",
                .matches        = {
-                       DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-                       DMI_MATCH(DMI_BOARD_NAME, "30B9")
+                       DMI_MATCH(DMI_BOARD_VENDOR,     "Quanta"),
+                       DMI_MATCH(DMI_BOARD_NAME,       "30B9")
                }
        },
        {
                .callback       = dmi_io_delay_0xed_port,
                .ident          = "HP Pavilion dv6000",
                .matches        = {
-                       DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-                       DMI_MATCH(DMI_BOARD_NAME, "30B8")
+                       DMI_MATCH(DMI_BOARD_VENDOR,     "Quanta"),
+                       DMI_MATCH(DMI_BOARD_NAME,       "30B8")
                }
        },
        {
                .callback       = dmi_io_delay_0xed_port,
                .ident          = "HP Pavilion tx1000",
                .matches        = {
-                       DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-                       DMI_MATCH(DMI_BOARD_NAME, "30BF")
+                       DMI_MATCH(DMI_BOARD_VENDOR,     "Quanta"),
+                       DMI_MATCH(DMI_BOARD_NAME,       "30BF")
                }
        },
        {
                .callback       = dmi_io_delay_0xed_port,
                .ident          = "Presario F700",
                .matches        = {
-                       DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
-                       DMI_MATCH(DMI_BOARD_NAME, "30D3")
+                       DMI_MATCH(DMI_BOARD_VENDOR,     "Quanta"),
+                       DMI_MATCH(DMI_BOARD_NAME,       "30D3")
                }
        },
        { }
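
The table above is only data; the matching happens when io_delay.c (in code outside this hunk) hands it to dmi_check_system(), which runs .callback for every entry whose DMI_MATCH() strings appear in the firmware-reported board fields. A minimal sketch of that call site, assuming io_delay_override is the existing command-line flag defined in this file:

void __init io_delay_init(void)
{
        /* Respect an explicit "io_delay=" boot parameter, if one was given. */
        if (!io_delay_override)
                dmi_check_system(io_delay_0xed_port_dmi_table);
}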
index f13ca1650aafce84b1ecf5c1e665f1bbf4a9f5c6..3aaf7b9e3a8bd19405575502a2fbe9faaffba5b2 100644 (file)
@@ -15,6 +15,9 @@
 
 atomic_t irq_err_count;
 
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
@@ -42,55 +45,60 @@ void ack_bad_irq(unsigned int irq)
 /*
  * /proc/interrupts printing:
  */
-static int show_other_interrupts(struct seq_file *p)
+static int show_other_interrupts(struct seq_file *p, int prec)
 {
        int j;
 
-       seq_printf(p, "NMI: ");
+       seq_printf(p, "%*s: ", prec, "NMI");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
        seq_printf(p, "  Non-maskable interrupts\n");
 #ifdef CONFIG_X86_LOCAL_APIC
-       seq_printf(p, "LOC: ");
+       seq_printf(p, "%*s: ", prec, "LOC");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
        seq_printf(p, "  Local timer interrupts\n");
+
+       seq_printf(p, "%*s: ", prec, "SPU");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+       seq_printf(p, "  Spurious interrupts\n");
 #endif
+       if (generic_interrupt_extension) {
+               seq_printf(p, "PLT: ");
+               for_each_online_cpu(j)
+                       seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+               seq_printf(p, "  Platform interrupts\n");
+       }
 #ifdef CONFIG_SMP
-       seq_printf(p, "RES: ");
+       seq_printf(p, "%*s: ", prec, "RES");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
        seq_printf(p, "  Rescheduling interrupts\n");
-       seq_printf(p, "CAL: ");
+       seq_printf(p, "%*s: ", prec, "CAL");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
        seq_printf(p, "  Function call interrupts\n");
-       seq_printf(p, "TLB: ");
+       seq_printf(p, "%*s: ", prec, "TLB");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
        seq_printf(p, "  TLB shootdowns\n");
 #endif
 #ifdef CONFIG_X86_MCE
-       seq_printf(p, "TRM: ");
+       seq_printf(p, "%*s: ", prec, "TRM");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
        seq_printf(p, "  Thermal event interrupts\n");
 # ifdef CONFIG_X86_64
-       seq_printf(p, "THR: ");
+       seq_printf(p, "%*s: ", prec, "THR");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
        seq_printf(p, "  Threshold APIC interrupts\n");
 # endif
 #endif
-#ifdef CONFIG_X86_LOCAL_APIC
-       seq_printf(p, "SPU: ");
-       for_each_online_cpu(j)
-               seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
-       seq_printf(p, "  Spurious interrupts\n");
-#endif
-       seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+       seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
-       seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+       seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
 #endif
        return 0;
 }
@@ -98,19 +106,22 @@ static int show_other_interrupts(struct seq_file *p)
 int show_interrupts(struct seq_file *p, void *v)
 {
        unsigned long flags, any_count = 0;
-       int i = *(loff_t *) v, j;
+       int i = *(loff_t *) v, j, prec;
        struct irqaction *action;
        struct irq_desc *desc;
 
        if (i > nr_irqs)
                return 0;
 
+       for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+               j *= 10;
+
        if (i == nr_irqs)
-               return show_other_interrupts(p);
+               return show_other_interrupts(p, prec);
 
        /* print header */
        if (i == 0) {
-               seq_printf(p, "           ");
+               seq_printf(p, "%*s", prec + 8, "");
                for_each_online_cpu(j)
                        seq_printf(p, "CPU%-8d", j);
                seq_putc(p, '\n');
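
The prec loop above picks a field width large enough for the biggest IRQ number: it starts at 3 and adds one digit for every factor of ten by which nr_irqs exceeds 999, capping at 10. A small userspace illustration of how that width then lines up the "%*d" and "%*s" columns:

#include <stdio.h>

int main(void)
{
        int nr_irqs = 4096;     /* example value; the real one comes from the kernel */
        int prec, j;

        for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
                j *= 10;        /* prec ends up as 4 for nr_irqs = 4096 */

        printf("%*d:  <per-CPU counts>\n", prec, 17);     /* numeric IRQ rows */
        printf("%*s:  <per-CPU counts>\n", prec, "NMI");  /* label rows       */
        return 0;
}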
@@ -121,23 +132,15 @@ int show_interrupts(struct seq_file *p, void *v)
                return 0;
 
        spin_lock_irqsave(&desc->lock, flags);
-#ifndef CONFIG_SMP
-       any_count = kstat_irqs(i);
-#else
        for_each_online_cpu(j)
                any_count |= kstat_irqs_cpu(i, j);
-#endif
        action = desc->action;
        if (!action && !any_count)
                goto out;
 
-       seq_printf(p, "%3d: ", i);
-#ifndef CONFIG_SMP
-       seq_printf(p, "%10u ", kstat_irqs(i));
-#else
+       seq_printf(p, "%*d: ", prec, i);
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#endif
        seq_printf(p, " %8s", desc->chip->name);
        seq_printf(p, "-%-8s", desc->name);
 
@@ -162,7 +165,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 
 #ifdef CONFIG_X86_LOCAL_APIC
        sum += irq_stats(cpu)->apic_timer_irqs;
+       sum += irq_stats(cpu)->irq_spurious_count;
 #endif
+       if (generic_interrupt_extension)
+               sum += irq_stats(cpu)->generic_irqs;
 #ifdef CONFIG_SMP
        sum += irq_stats(cpu)->irq_resched_count;
        sum += irq_stats(cpu)->irq_call_count;
@@ -173,9 +179,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 # ifdef CONFIG_X86_64
        sum += irq_stats(cpu)->irq_threshold_count;
 #endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-       sum += irq_stats(cpu)->irq_spurious_count;
 #endif
        return sum;
 }
@@ -226,4 +229,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
        return 1;
 }
 
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+       struct pt_regs *old_regs = set_irq_regs(regs);
+
+       ack_APIC_irq();
+
+       exit_idle();
+
+       irq_enter();
+
+       inc_irq_stat(generic_irqs);
+
+       if (generic_interrupt_extension)
+               generic_interrupt_extension();
+
+       irq_exit();
+
+       set_irq_regs(old_regs);
+}
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
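
The new generic_interrupt_extension pointer is the whole platform-facing interface here: a driver that owns GENERIC_INTERRUPT_VECTOR stores its handler in it, and smp_generic_interrupt() above invokes that handler with the APIC already acked and the per-CPU statistics updated. A hedged sketch of such a client follows; the handler and init function names are purely illustrative and not part of this patch.

/* Hypothetical platform driver hooking the generic vector. */
static void example_platform_handler(void)
{
        /* service the platform-specific event; runs in hard-IRQ context */
}

static int __init example_platform_init(void)
{
        if (generic_interrupt_extension)
                return -EBUSY;          /* vector already claimed */

        generic_interrupt_extension = example_platform_handler;
        return 0;
}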
index 50b8c3a3006cb6f22ae0aa0f0cf3a50071475154..bc132610544829de0f38a12dd8461d73104b39a2 100644 (file)
@@ -175,6 +175,9 @@ void __init native_init_IRQ(void)
        /* self generated IPI for local APIC timer */
        alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+       /* generic IPI for platform specific use */
+       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
        /* IPI vectors for APIC spurious and error interrupts */
        alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
        alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
index da481a1e3f303f8fe3bf10995e8e218cc556f2d4..c7a49e0ffbfbc0b4b2b0cd12b01d5294ca73745c 100644 (file)
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
        /* self generated IPI for local APIC timer */
        alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+       /* generic IPI for platform specific use */
+       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
        /* IPI vectors for APIC spurious and error interrupts */
        alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
        alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
index ff7d3b0124f1db07bbb0750ec0fe0e9237f66574..e444357375ce1421e137d6731adc3edba30def28 100644 (file)
@@ -8,11 +8,11 @@
  */
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
-#include <linux/stat.h>
+#include <linux/module.h>
 #include <linux/init.h>
+#include <linux/stat.h>
 #include <linux/io.h>
 #include <linux/mm.h>
-#include <linux/module.h>
 
 #include <asm/setup.h>
 
@@ -26,9 +26,8 @@ struct setup_data_node {
        u32 len;
 };
 
-static ssize_t
-setup_data_read(struct file *file, char __user *user_buf, size_t count,
-               loff_t *ppos)
+static ssize_t setup_data_read(struct file *file, char __user *user_buf,
+                              size_t count, loff_t *ppos)
 {
        struct setup_data_node *node = file->private_data;
        unsigned long remain;
@@ -39,20 +38,21 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
 
        if (pos < 0)
                return -EINVAL;
+
        if (pos >= node->len)
                return 0;
 
        if (count > node->len - pos)
                count = node->len - pos;
+
        pa = node->paddr + sizeof(struct setup_data) + pos;
        pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
        if (PageHighMem(pg)) {
                p = ioremap_cache(pa, count);
                if (!p)
                        return -ENXIO;
-       } else {
+       } else
                p = __va(pa);
-       }
 
        remain = copy_to_user(user_buf, p, count);
 
@@ -70,12 +70,13 @@ setup_data_read(struct file *file, char __user *user_buf, size_t count,
 static int setup_data_open(struct inode *inode, struct file *file)
 {
        file->private_data = inode->i_private;
+
        return 0;
 }
 
 static const struct file_operations fops_setup_data = {
-       .read =         setup_data_read,
-       .open =         setup_data_open,
+       .read           = setup_data_read,
+       .open           = setup_data_open,
 };
 
 static int __init
@@ -84,57 +85,50 @@ create_setup_data_node(struct dentry *parent, int no,
 {
        struct dentry *d, *type, *data;
        char buf[16];
-       int error;
 
        sprintf(buf, "%d", no);
        d = debugfs_create_dir(buf, parent);
-       if (!d) {
-               error = -ENOMEM;
-               goto err_return;
-       }
+       if (!d)
+               return -ENOMEM;
+
        type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
-       if (!type) {
-               error = -ENOMEM;
+       if (!type)
                goto err_dir;
-       }
+
        data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
-       if (!data) {
-               error = -ENOMEM;
+       if (!data)
                goto err_type;
-       }
+
        return 0;
 
 err_type:
        debugfs_remove(type);
 err_dir:
        debugfs_remove(d);
-err_return:
-       return error;
+       return -ENOMEM;
 }
 
 static int __init create_setup_data_nodes(struct dentry *parent)
 {
        struct setup_data_node *node;
        struct setup_data *data;
-       int error, no = 0;
+       int error = -ENOMEM;
        struct dentry *d;
        struct page *pg;
        u64 pa_data;
+       int no = 0;
 
        d = debugfs_create_dir("setup_data", parent);
-       if (!d) {
-               error = -ENOMEM;
-               goto err_return;
-       }
+       if (!d)
+               return -ENOMEM;
 
        pa_data = boot_params.hdr.setup_data;
 
        while (pa_data) {
                node = kmalloc(sizeof(*node), GFP_KERNEL);
-               if (!node) {
-                       error = -ENOMEM;
+               if (!node)
                        goto err_dir;
-               }
+
                pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
                if (PageHighMem(pg)) {
                        data = ioremap_cache(pa_data, sizeof(*data));
@@ -143,9 +137,8 @@ static int __init create_setup_data_nodes(struct dentry *parent)
                                error = -ENXIO;
                                goto err_dir;
                        }
-               } else {
+               } else
                        data = __va(pa_data);
-               }
 
                node->paddr = pa_data;
                node->type = data->type;
@@ -159,11 +152,11 @@ static int __init create_setup_data_nodes(struct dentry *parent)
                        goto err_dir;
                no++;
        }
+
        return 0;
 
 err_dir:
        debugfs_remove(d);
-err_return:
        return error;
 }
 
@@ -175,28 +168,26 @@ static struct debugfs_blob_wrapper boot_params_blob = {
 static int __init boot_params_kdebugfs_init(void)
 {
        struct dentry *dbp, *version, *data;
-       int error;
+       int error = -ENOMEM;
 
        dbp = debugfs_create_dir("boot_params", NULL);
-       if (!dbp) {
-               error = -ENOMEM;
-               goto err_return;
-       }
+       if (!dbp)
+               return -ENOMEM;
+
        version = debugfs_create_x16("version", S_IRUGO, dbp,
                                     &boot_params.hdr.version);
-       if (!version) {
-               error = -ENOMEM;
+       if (!version)
                goto err_dir;
-       }
+
        data = debugfs_create_blob("data", S_IRUGO, dbp,
                                   &boot_params_blob);
-       if (!data) {
-               error = -ENOMEM;
+       if (!data)
                goto err_version;
-       }
+
        error = create_setup_data_nodes(dbp);
        if (error)
                goto err_data;
+
        return 0;
 
 err_data:
@@ -205,10 +196,9 @@ err_version:
        debugfs_remove(version);
 err_dir:
        debugfs_remove(dbp);
-err_return:
        return error;
 }
-#endif
+#endif /* CONFIG_DEBUG_BOOT_PARAMS */
 
 static int __init arch_kdebugfs_init(void)
 {
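
The rewrite above is a straight application of the usual kernel error-unwind idiom: return immediately while nothing has been created, and otherwise jump to labels that undo work in reverse order, so a single -ENOMEM covers every failure path. A compilable stand-alone sketch of the same shape, with malloc/free standing in for the debugfs objects:

#include <stdlib.h>

int create_three(void **dir, void **type, void **data)
{
        *dir = malloc(1);
        if (!*dir)
                return -1;      /* first failure: nothing to unwind yet */

        *type = malloc(1);
        if (!*type)
                goto err_dir;

        *data = malloc(1);
        if (!*data)
                goto err_type;

        return 0;

err_type:                       /* reached only after *type succeeded */
        free(*type);
err_dir:
        free(*dir);
        return -1;
}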
index 4558dd3918cf3ec66263d7ac8e705acb4a5ea5a9..55b94614e34845cbab6e340754295e0d74462fa8 100644 (file)
@@ -193,7 +193,7 @@ static int __kprobes can_boost(kprobe_opcode_t *opcodes)
        kprobe_opcode_t opcode;
        kprobe_opcode_t *orig_opcodes = opcodes;
 
-       if (search_exception_tables(opcodes))
+       if (search_exception_tables((unsigned long)opcodes))
                return 0;       /* Page fault may occur on this address. */
 
 retry:
index 478bca986eca0d5213f3fe992cdb409db3670ce7..33019ddb56b452ce736f4b70fcb5d4efe8d0ae9d 100644 (file)
@@ -138,12 +138,6 @@ static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
        kvm_mmu_write(ptep, pte_val(pte));
 }
 
-static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
-                               pte_t *ptep, pte_t pte)
-{
-       kvm_mmu_write(ptep, pte_val(pte));
-}
-
 static void kvm_pte_clear(struct mm_struct *mm,
                          unsigned long addr, pte_t *ptep)
 {
@@ -220,7 +214,6 @@ static void paravirt_ops_setup(void)
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
                pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
-               pv_mmu_ops.set_pte_present = kvm_set_pte_present;
                pv_mmu_ops.pte_clear = kvm_pte_clear;
                pv_mmu_ops.pmd_clear = kvm_pmd_clear;
 #endif
index f5fc8c781a62f5dcc41d5ef7213b24203d273213..e7368c1da01dfa26eeedc563390e8191eb6eb08f 100644 (file)
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
 #include <linux/gfp.h>
+#include <linux/io.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
 #include <asm/apic.h>
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
                "\tmovl %%eax,%%fs\n"
                "\tmovl %%eax,%%gs\n"
                "\tmovl %%eax,%%ss\n"
-               ::: "eax", "memory");
+               : : : "eax", "memory");
 #undef STR
 #undef __STR
 }
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
 
        if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
-               /* We need to put APICs in legacy mode so that we can
+               /*
+                * We need to put APICs in legacy mode so that we can
                 * get timer interrupts in second kernel. kexec/kdump
                 * paths already have calls to disable_IO_APIC() in
                 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);
 
-       /* The segment registers are funny things, they have both a
+       /*
+        * The segment registers are funny things, they have both a
         * visible and an invisible part.  Whenever the visible part is
         * set to a specific selector, the invisible part is loaded
         * with from a table in memory.  At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
         * segments, before I zap the gdt with an invalid value.
         */
        load_segments();
-       /* The gdt & idt are now invalid.
+       /*
+        * The gdt & idt are now invalid.
         * If you want to load them you must set up your own idt & gdt.
         */
-       set_gdt(phys_to_virt(0),0);
-       set_idt(phys_to_virt(0),0);
+       set_gdt(phys_to_virt(0), 0);
+       set_idt(phys_to_virt(0), 0);
 
        /* now call it */
        image->start = relocate_kernel_ptr((unsigned long)image->head,
index 6993d51b7fd819bd72fae83545490c870538f50e..89cea4d44679641146411bbce9e3e94b9b2bff84 100644 (file)
 #include <linux/reboot.h>
 #include <linux/numa.h>
 #include <linux/ftrace.h>
+#include <linux/io.h>
+#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
+
+static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
+                               unsigned long addr)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+       struct page *page;
+       int result = -ENOMEM;
+
+       addr &= PMD_MASK;
+       pgd += pgd_index(addr);
+       if (!pgd_present(*pgd)) {
+               page = kimage_alloc_control_pages(image, 0);
+               if (!page)
+                       goto out;
+               pud = (pud_t *)page_address(page);
+               memset(pud, 0, PAGE_SIZE);
+               set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+       }
+       pud = pud_offset(pgd, addr);
+       if (!pud_present(*pud)) {
+               page = kimage_alloc_control_pages(image, 0);
+               if (!page)
+                       goto out;
+               pmd = (pmd_t *)page_address(page);
+               memset(pmd, 0, PAGE_SIZE);
+               set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+       }
+       pmd = pmd_offset(pud, addr);
+       if (!pmd_present(*pmd))
+               set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+       result = 0;
+out:
+       return result;
+}
 
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
@@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
                }
                level3p = (pud_t *)page_address(page);
                result = init_level3_page(image, level3p, addr, last_addr);
-               if (result) {
+               if (result)
                        goto out;
-               }
                set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
                addr += PGDIR_SIZE;
        }
@@ -154,6 +189,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
        int result;
        level4p = (pgd_t *)__va(start_pgtable);
        result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
+       if (result)
+               return result;
+       /*
+        * image->start may be outside 0 ~ max_pfn, for example when
+        * jumping back to the original kernel from the kexeced kernel
+        */
+       result = init_one_level2_page(image, level4p, image->start);
        if (result)
                return result;
        return init_transition_pgtable(image, level4p);
@@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image)
 {
        unsigned long page_list[PAGES_NR];
        void *control_page;
+       int save_ftrace_enabled;
 
-       tracer_disable();
+#ifdef CONFIG_KEXEC_JUMP
+       if (kexec_image->preserve_context)
+               save_processor_state();
+#endif
+
+       save_ftrace_enabled = __ftrace_enabled_save();
 
        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
 
+       if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+               /*
+                * We need to put APICs in legacy mode so that we can
+                * get timer interrupts in second kernel. kexec/kdump
+                * paths already have calls to disable_IO_APIC() in
+                * one form or other. kexec jump path also need
+                * one.
+                */
+               disable_IO_APIC();
+#endif
+       }
+
        control_page = page_address(image->control_code_page) + PAGE_SIZE;
-       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+       memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
 
        page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+       page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
        page_list[PA_TABLE_PAGE] =
          (unsigned long)__pa(page_address(image->control_code_page));
 
-       /* The segment registers are funny things, they have both a
+       if (image->type == KEXEC_TYPE_DEFAULT)
+               page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+                                               << PAGE_SHIFT);
+
+       /*
+        * The segment registers are funny things, they have both a
         * visible and an invisible part.  Whenever the visible part is
         * set to a specific selector, the invisible part is loaded
         * with from a table in memory.  At no other time is the
@@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image)
         * segments, before I zap the gdt with an invalid value.
         */
        load_segments();
-       /* The gdt & idt are now invalid.
+       /*
+        * The gdt & idt are now invalid.
         * If you want to load them you must set up your own idt & gdt.
         */
-       set_gdt(phys_to_virt(0),0);
-       set_idt(phys_to_virt(0),0);
+       set_gdt(phys_to_virt(0), 0);
+       set_idt(phys_to_virt(0), 0);
 
        /* now call it */
-       relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-                       image->start);
+       image->start = relocate_kernel((unsigned long)image->head,
+                                      (unsigned long)page_list,
+                                      image->start,
+                                      image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+       if (kexec_image->preserve_context)
+               restore_processor_state();
+#endif
+
+       __ftrace_enabled_restore(save_ftrace_enabled);
 }
 
 void arch_crash_save_vmcoreinfo(void)
index 666e43df51f99ce58b92c844149e1f8efcfa258f..712d15fdc416f9cd84139208434644ee887400a1 100644 (file)
@@ -226,7 +226,7 @@ static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
         return 0;
 }
 
-static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
+static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
         {
                 .callback = set_check_enable_amd_mmconf,
                 .ident = "Sun Microsystems Machine",
index 37cb1bda1baf98fc9fd591aeb6d9ad26049517a1..dce99dca6cf876401149e3108e0ea9c34f8ef388 100644 (file)
@@ -109,9 +109,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
        } else
                printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
-#endif
-
-#ifdef CONFIG_X86_IO_APIC
 
 static int bad_ioapic(unsigned long address)
 {
@@ -224,8 +221,12 @@ static void __init MP_intsrc_info(struct mpc_intsrc *m)
        if (++mp_irq_entries == MAX_IRQ_SOURCES)
                panic("Max # of irq sources exceeded!!\n");
 }
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
+#endif /* CONFIG_X86_IO_APIC */
 
-#endif
 
 static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
 {
@@ -275,6 +276,20 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
        return 1;
 }
 
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+       *ptr += size;
+       *count += size;
+}
+
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+       printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
+               "type %x\n", *mpt);
+       print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
+                       1, mpc, mpc->length, 1);
+}
+
 static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 {
        char str[16];
@@ -310,61 +325,30 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
        while (count < mpc->length) {
                switch (*mpt) {
                case MP_PROCESSOR:
-                       {
-                               struct mpc_cpu *m = (struct mpc_cpu *)mpt;
-                               /* ACPI may have already provided this data */
-                               if (!acpi_lapic)
-                                       MP_processor_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
+                       /* ACPI may have already provided this data */
+                       if (!acpi_lapic)
+                               MP_processor_info((struct mpc_cpu *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+                       break;
                case MP_BUS:
-                       {
-                               struct mpc_bus *m = (struct mpc_bus *)mpt;
-#ifdef CONFIG_X86_IO_APIC
-                               MP_bus_info(m);
-#endif
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
+                       MP_bus_info((struct mpc_bus *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+                       break;
                case MP_IOAPIC:
-                       {
-#ifdef CONFIG_X86_IO_APIC
-                               struct mpc_ioapic *m = (struct mpc_ioapic *)mpt;
-                               MP_ioapic_info(m);
-#endif
-                               mpt += sizeof(struct mpc_ioapic);
-                               count += sizeof(struct mpc_ioapic);
-                               break;
-                       }
+                       MP_ioapic_info((struct mpc_ioapic *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+                       break;
                case MP_INTSRC:
-                       {
-#ifdef CONFIG_X86_IO_APIC
-                               struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
-                               MP_intsrc_info(m);
-#endif
-                               mpt += sizeof(struct mpc_intsrc);
-                               count += sizeof(struct mpc_intsrc);
-                               break;
-                       }
+                       MP_intsrc_info((struct mpc_intsrc *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+                       break;
                case MP_LINTSRC:
-                       {
-                               struct mpc_lintsrc *m =
-                                   (struct mpc_lintsrc *)mpt;
-                               MP_lintsrc_info(m);
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
+                       MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+                       break;
                default:
                        /* wrong mptable */
-                       printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
-                       printk(KERN_ERR "type %x\n", *mpt);
-                       print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
-                                       1, mpc, mpc->length, 1);
+                       smp_dump_mptable(mpc, mpt);
                        count = mpc->length;
                        break;
                }
@@ -558,6 +542,68 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 
 static struct mpf_intel *mpf_found;
 
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+       struct mpc_table *mpc;
+       unsigned long size;
+
+       mpc = early_ioremap(physptr, PAGE_SIZE);
+       size = mpc->length;
+       early_iounmap(mpc, PAGE_SIZE);
+       apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
+
+       return size;
+}
+
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+       struct mpc_table *mpc;
+       unsigned long size;
+
+       size = get_mpc_size(mpf->physptr);
+       mpc = early_ioremap(mpf->physptr, size);
+       /*
+        * Read the physical hardware table.  Anything here will
+        * override the defaults.
+        */
+       if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+               smp_found_config = 0;
+#endif
+               printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
+                       "... disabling SMP support. (tell your hw vendor)\n");
+               early_iounmap(mpc, size);
+               return -1;
+       }
+       early_iounmap(mpc, size);
+
+       if (early)
+               return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+       /*
+        * If there are no explicit MP IRQ entries, then we are
+        * broken.  We set up most of the low 16 IO-APIC pins to
+        * ISA defaults and hope it will work.
+        */
+       if (!mp_irq_entries) {
+               struct mpc_bus bus;
+
+               printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+                      "using default mptable. (tell your hw vendor)\n");
+
+               bus.type = MP_BUS;
+               bus.busid = 0;
+               memcpy(bus.bustype, "ISA   ", 6);
+               MP_bus_info(&bus);
+
+               construct_default_ioirq_mptable(0);
+       }
+#endif
+
+       return 0;
+}
+
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
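
get_mpc_size() and check_physptr() above rely on the early_ioremap() two-pass pattern: map a single page just to read the header field that holds the table length, unmap it, then map exactly that many bytes for the real parse. A condensed sketch of the pattern with an illustrative header type (blob_header and parse_blob are made-up names, not kernel identifiers):

struct blob_header {
        unsigned long length;   /* total table size, header included */
        /* ... entries follow ... */
};

static int __init parse_blob(unsigned long phys)
{
        struct blob_header *hdr;
        unsigned long size;

        hdr  = early_ioremap(phys, PAGE_SIZE);  /* enough to see the header */
        size = hdr->length;
        early_iounmap(hdr, PAGE_SIZE);

        hdr = early_ioremap(phys, size);        /* now map the whole table */
        /* ... walk the entries between hdr + 1 and (void *)hdr + size ... */
        early_iounmap(hdr, size);

        return 0;
}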
@@ -611,45 +657,8 @@ static void __init __get_smp_config(unsigned int early)
                construct_default_ISA_mptable(mpf->feature1);
 
        } else if (mpf->physptr) {
-
-               /*
-                * Read the physical hardware table.  Anything here will
-                * override the defaults.
-                */
-               if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
-#ifdef CONFIG_X86_LOCAL_APIC
-                       smp_found_config = 0;
-#endif
-                       printk(KERN_ERR
-                              "BIOS bug, MP table errors detected!...\n");
-                       printk(KERN_ERR "... disabling SMP support. "
-                              "(tell your hw vendor)\n");
-                       return;
-               }
-
-               if (early)
+               if (check_physptr(mpf, early))
                        return;
-#ifdef CONFIG_X86_IO_APIC
-               /*
-                * If there are no explicit MP IRQ entries, then we are
-                * broken.  We set up most of the low 16 IO-APIC pins to
-                * ISA defaults and hope it will work.
-                */
-               if (!mp_irq_entries) {
-                       struct mpc_bus bus;
-
-                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
-                              "using default mptable. "
-                              "(tell your hw vendor)\n");
-
-                       bus.type = MP_BUS;
-                       bus.busid = 0;
-                       memcpy(bus.bustype, "ISA   ", 6);
-                       MP_bus_info(&bus);
-
-                       construct_default_ioirq_mptable(0);
-               }
-#endif
        } else
                BUG();
 
@@ -670,6 +679,31 @@ void __init get_smp_config(void)
        __get_smp_config(0);
 }
 
+static void smp_reserve_bootmem(struct mpf_intel *mpf)
+{
+       unsigned long size = get_mpc_size(mpf->physptr);
+#ifdef CONFIG_X86_32
+       /*
+        * We cannot access the MPC table to compute its size yet, as
+        * only a few megabytes from the bottom are mapped now.
+        * PC-9800's MPC table sits at the very end of physical memory,
+        * so simply reserving PAGE_SIZE from mpf->physptr would trigger
+        * BUG() in reserve_bootmem.
+        * Also make sure physptr is below max_low_pfn; there is no need
+        * to reserve the area above max_low_pfn.
+        */
+       unsigned long end = max_low_pfn * PAGE_SIZE;
+
+       if (mpf->physptr < end) {
+               if (mpf->physptr + size > end)
+                       size = end - mpf->physptr;
+               reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+       }
+#else
+       reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
+#endif
+}
+
 static int __init smp_scan_config(unsigned long base, unsigned long length,
                                  unsigned reserve)
 {
@@ -697,36 +731,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 
                        if (!reserve)
                                return 1;
-                       reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
-                                       BOOTMEM_DEFAULT);
-                       if (mpf->physptr) {
-                               unsigned long size = PAGE_SIZE;
-#ifdef CONFIG_X86_32
-                               /*
-                                * We cannot access to MPC table to compute
-                                * table size yet, as only few megabytes from
-                                * the bottom is mapped now.
-                                * PC-9800's MPC table places on the very last
-                                * of physical memory; so that simply reserving
-                                * PAGE_SIZE from mpf->physptr yields BUG()
-                                * in reserve_bootmem.
-                                * also need to make sure physptr is below than
-                                * max_low_pfn
-                                * we don't need reserve the area above max_low_pfn
-                                */
-                               unsigned long end = max_low_pfn * PAGE_SIZE;
-
-                               if (mpf->physptr < end) {
-                                       if (mpf->physptr + size > end)
-                                               size = end - mpf->physptr;
-                                       reserve_bootmem_generic(mpf->physptr, size,
-                                                       BOOTMEM_DEFAULT);
-                               }
-#else
-                               reserve_bootmem_generic(mpf->physptr, size,
+                       reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
                                                BOOTMEM_DEFAULT);
-#endif
-                       }
+                       if (mpf->physptr)
+                               smp_reserve_bootmem(mpf);
 
                        return 1;
                }
@@ -829,7 +837,57 @@ static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)
 #define SPARE_SLOT_NUM 20
 
 static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
-#endif
+
+static void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+       int i;
+
+       apic_printk(APIC_VERBOSE, "OLD ");
+       print_MP_intsrc_info(m);
+
+       i = get_MP_intsrc_index(m);
+       if (i > 0) {
+               assign_to_mpc_intsrc(&mp_irqs[i], m);
+               apic_printk(APIC_VERBOSE, "NEW ");
+               print_mp_irq_info(&mp_irqs[i]);
+               return;
+       }
+       if (!i) {
+               /* legacy, do nothing */
+               return;
+       }
+       if (*nr_m_spare < SPARE_SLOT_NUM) {
+               /*
+                * not found (-1), or duplicated (-2) are invalid entries,
+                * we need to use the slot later
+                */
+               m_spare[*nr_m_spare] = m;
+               *nr_m_spare += 1;
+       }
+}
+#else /* CONFIG_X86_IO_APIC */
+static inline void check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
+                     int count)
+{
+       if (!mpc_new_phys) {
+               pr_info("No spare slots, try to append...take your risk, "
+                       "new mpc_length %x\n", count);
+       } else {
+               if (count <= mpc_new_length)
+                       pr_info("No spare slots, try to append..., "
+                               "new mpc_length %x\n", count);
+               else {
+                       pr_err("mpc_new_length %lx is too small\n",
+                               mpc_new_length);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
 
 static int  __init replace_intsrc_all(struct mpc_table *mpc,
                                        unsigned long mpc_new_phys,
@@ -837,77 +895,33 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 {
 #ifdef CONFIG_X86_IO_APIC
        int i;
-       int nr_m_spare = 0;
 #endif
-
        int count = sizeof(*mpc);
+       int nr_m_spare = 0;
        unsigned char *mpt = ((unsigned char *)mpc) + count;
 
        printk(KERN_INFO "mpc_length %x\n", mpc->length);
        while (count < mpc->length) {
                switch (*mpt) {
                case MP_PROCESSOR:
-                       {
-                               struct mpc_cpu *m = (struct mpc_cpu *)mpt;
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
+                       skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+                       break;
                case MP_BUS:
-                       {
-                               struct mpc_bus *m = (struct mpc_bus *)mpt;
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
+                       skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+                       break;
                case MP_IOAPIC:
-                       {
-                               mpt += sizeof(struct mpc_ioapic);
-                               count += sizeof(struct mpc_ioapic);
-                               break;
-                       }
+                       skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+                       break;
                case MP_INTSRC:
-                       {
-#ifdef CONFIG_X86_IO_APIC
-                               struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
-
-                               printk(KERN_INFO "OLD ");
-                               print_MP_intsrc_info(m);
-                               i = get_MP_intsrc_index(m);
-                               if (i > 0) {
-                                       assign_to_mpc_intsrc(&mp_irqs[i], m);
-                                       printk(KERN_INFO "NEW ");
-                                       print_mp_irq_info(&mp_irqs[i]);
-                               } else if (!i) {
-                                       /* legacy, do nothing */
-                               } else if (nr_m_spare < SPARE_SLOT_NUM) {
-                                       /*
-                                        * not found (-1), or duplicated (-2)
-                                        * are invalid entries,
-                                        * we need to use the slot  later
-                                        */
-                                       m_spare[nr_m_spare] = m;
-                                       nr_m_spare++;
-                               }
-#endif
-                               mpt += sizeof(struct mpc_intsrc);
-                               count += sizeof(struct mpc_intsrc);
-                               break;
-                       }
+                       check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+                       break;
                case MP_LINTSRC:
-                       {
-                               struct mpc_lintsrc *m =
-                                   (struct mpc_lintsrc *)mpt;
-                               mpt += sizeof(*m);
-                               count += sizeof(*m);
-                               break;
-                       }
+                       skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+                       break;
                default:
                        /* wrong mptable */
-                       printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
-                       printk(KERN_ERR "type %x\n", *mpt);
-                       print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
-                                       1, mpc, mpc->length, 1);
+                       smp_dump_mptable(mpc, mpt);
                        goto out;
                }
        }
@@ -924,23 +938,15 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
                        continue;
 
                if (nr_m_spare > 0) {
-                       printk(KERN_INFO "*NEW* found ");
+                       apic_printk(APIC_VERBOSE, "*NEW* found\n");
                        nr_m_spare--;
                        assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
                        m_spare[nr_m_spare] = NULL;
                } else {
                        struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
                        count += sizeof(struct mpc_intsrc);
-                       if (!mpc_new_phys) {
-                               printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
-                       } else {
-                               if (count <= mpc_new_length)
-                                       printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
-                               else {
-                                       printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
-                                       goto out;
-                               }
-                       }
+                       if (!check_slot(mpc_new_phys, mpc_new_length, count))
+                               goto out;
                        assign_to_mpc_intsrc(&mp_irqs[i], m);
                        mpc->length = count;
                        mpt += sizeof(struct mpc_intsrc);
index 63dd358d8ee13be0708c0139bf348d5d237aecfb..8e45f4464880ccdba671ea5248c5e2bea4f9b297 100644 (file)
@@ -470,7 +470,6 @@ struct pv_mmu_ops pv_mmu_ops = {
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
        .set_pte_atomic = native_set_pte_atomic,
-       .set_pte_present = native_set_pte_present,
        .pte_clear = native_pte_clear,
        .pmd_clear = native_pmd_clear,
 #endif
index c70ab5a5d4c886a0856de8e0f313a97048d2c328..8b02a3936d4280ede99fd907e3f87e7a8845ae2e 100644 (file)
@@ -1,14 +1,14 @@
 /* Fallback functions when the main IOMMU code is not compiled in. This
    code is roughly equivalent to i386. */
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/string.h>
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/mm.h>
 
-#include <asm/iommu.h>
 #include <asm/processor.h>
+#include <asm/iommu.h>
 #include <asm/dma.h>
 
 static int
@@ -79,11 +79,11 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
 }
 
 struct dma_mapping_ops nommu_dma_ops = {
-       .alloc_coherent = dma_generic_alloc_coherent,
-       .free_coherent = nommu_free_coherent,
-       .map_single = nommu_map_single,
-       .map_sg = nommu_map_sg,
-       .is_phys = 1,
+       .alloc_coherent = dma_generic_alloc_coherent,
+       .free_coherent  = nommu_free_coherent,
+       .map_single     = nommu_map_single,
+       .map_sg         = nommu_map_sg,
+       .is_phys        = 1,
 };
 
 void __init no_iommu_init(void)
index 6afa5232dbb739f99c07ff0641cc265428db74eb..156f87582c6cd5e0fcb677b7c685d92c68443d72 100644 (file)
@@ -65,11 +65,11 @@ void exit_thread(void)
 {
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;
+       unsigned long *bp = t->io_bitmap_ptr;
 
-       if (me->thread.io_bitmap_ptr) {
+       if (bp) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
 
-               kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
@@ -78,6 +78,7 @@ void exit_thread(void)
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
+               kfree(bp);
        }
 
        ds_exit_thread(current);
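
The reordering above follows the "unpublish first, free last" rule: the bitmap pointer is copied into a local, the thread and TSS state that reference it are cleared while the CPU is still held, and only then is the memory returned. A generic illustration of that ordering (widget and its fields are made-up names, not kernel structures):

#include <stdlib.h>

struct widget {
        void   *buf;
        size_t  buf_len;
};

void widget_release_buf(struct widget *w)
{
        void *buf = w->buf;     /* take a private copy of the pointer */

        if (!buf)
                return;

        w->buf = NULL;          /* unpublish the pointer ...             */
        w->buf_len = 0;         /* ... and any state derived from it     */

        free(buf);              /* free last, once nothing refers to it  */
}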
index 3d9672e59c16aeb0b16d0be4d0aae5f1b362f61c..19378715f4157b5202d823f9d751dd1f07c08b38 100644 (file)
@@ -685,9 +685,8 @@ static int ptrace_bts_config(struct task_struct *child,
                if (!cfg.signal)
                        return -EINVAL;
 
-               return -EOPNOTSUPP;
-
                child->thread.bts_ovfl_signal = cfg.signal;
+               return -EOPNOTSUPP;
        }
 
        if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
index 697d1b78cfbf702eb6ab9cc556bb9e94bc6d357b..e95022e4f5d5e19c6048edd188a29942e1dce49c 100644 (file)
@@ -74,8 +74,7 @@ static void ich_force_hpet_resume(void)
        if (!force_hpet_address)
                return;
 
-       if (rcba_base == NULL)
-               BUG();
+       BUG_ON(rcba_base == NULL);
 
        /* read the Function Disable register, dword mode only */
        val = readl(rcba_base + 0x3404);
index 2064d0aa8d28842f235600428c3c2958f8313723..41235531b11ca894c809f9ae97f4d3c4f14791a4 100644 (file)
@@ -17,7 +17,8 @@
 
 #define PTR(x) (x << 2)
 
-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
  * ~ control_page + PAGE_SIZE are used as data storage and stack for
  * jumping back
  */
@@ -76,8 +77,10 @@ relocate_kernel:
        movl    %eax, CP_PA_SWAP_PAGE(%edi)
        movl    %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
 
-       /* get physical address of control page now */
-       /* this is impossible after page table switch */
+       /*
+        * get physical address of control page now
+        * this is impossible after page table switch
+        */
        movl    PTR(PA_CONTROL_PAGE)(%ebp), %edi
 
        /* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
        /* store the start address on the stack */
        pushl   %edx
 
-       /* Set cr0 to a known state:
+       /*
+        * Set cr0 to a known state:
         *  - Paging disabled
         *  - Alignment check disabled
         *  - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
        /* clear cr4 if applicable */
        testl   %ecx, %ecx
        jz      1f
-       /* Set cr4 to a known state:
+       /*
+        * Set cr4 to a known state:
         * Setting everything to zero seems safe.
         */
        xorl    %eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
        call    swap_pages
        addl    $8, %esp
 
-       /* To be certain of avoiding problems with self-modifying code
+       /*
+        * To be certain of avoiding problems with self-modifying code
         * I need to execute a serializing instruction here.
         * So I flush the TLB, it's handy, and not processor dependent.
         */
        xorl    %eax, %eax
        movl    %eax, %cr3
 
-       /* set all of the registers to known values */
-       /* leave %esp alone */
+       /*
+        * set all of the registers to known values
+        * leave %esp alone
+        */
 
        testl   %esi, %esi
        jnz 1f
index d32cfb27a479ecbf32be5dca3c87e5a220e59261..4de8f5b3d476ca0aa544e025568cf00cb6c25ebb 100644 (file)
 #define PTR(x) (x << 3)
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset)           (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP                    DATA(0x0)
+#define CR0                    DATA(0x8)
+#define CR3                    DATA(0x10)
+#define CR4                    DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE       DATA(0x20)
+#define CP_PA_SWAP_PAGE                DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+
        .text
        .align PAGE_SIZE
        .code64
        .globl relocate_kernel
 relocate_kernel:
-       /* %rdi indirection_page
+       /*
+        * %rdi indirection_page
         * %rsi page_list
         * %rdx start address
+        * %rcx preserve_context
         */
 
+       /* Save the CPU context, used for jumping back */
+       pushq %rbx
+       pushq %rbp
+       pushq %r12
+       pushq %r13
+       pushq %r14
+       pushq %r15
+       pushf
+
+       movq    PTR(VA_CONTROL_PAGE)(%rsi), %r11
+       movq    %rsp, RSP(%r11)
+       movq    %cr0, %rax
+       movq    %rax, CR0(%r11)
+       movq    %cr3, %rax
+       movq    %rax, CR3(%r11)
+       movq    %cr4, %rax
+       movq    %rax, CR4(%r11)
+
        /* zero out flags, and disable interrupts */
        pushq $0
        popfq
 
-       /* get physical address of control page now */
-       /* this is impossible after page table switch */
+       /*
+        * get physical address of control page now
+        * this is impossible after page table switch
+        */
        movq    PTR(PA_CONTROL_PAGE)(%rsi), %r8
 
        /* get physical address of page table now too */
-       movq    PTR(PA_TABLE_PAGE)(%rsi), %rcx
+       movq    PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+       /* get physical address of swap page now */
+       movq    PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+       /* save some information for jumping back */
+       movq    %r9, CP_PA_TABLE_PAGE(%r11)
+       movq    %r10, CP_PA_SWAP_PAGE(%r11)
+       movq    %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
 
        /* Switch to the identity mapped page tables */
-       movq    %rcx, %cr3
+       movq    %r9, %cr3
 
        /* setup a new stack at the end of the physical control page */
        lea     PAGE_SIZE(%r8), %rsp
@@ -55,7 +103,8 @@ identity_mapped:
        /* store the start address on the stack */
        pushq   %rdx
 
-       /* Set cr0 to a known state:
+       /*
+        * Set cr0 to a known state:
         *  - Paging enabled
         *  - Alignment check disabled
         *  - Write protect disabled
@@ -68,7 +117,8 @@ identity_mapped:
        orl     $(X86_CR0_PG | X86_CR0_PE), %eax
        movq    %rax, %cr0
 
-       /* Set cr4 to a known state:
+       /*
+        * Set cr4 to a known state:
         *  - physical address extension enabled
         */
        movq    $X86_CR4_PAE, %rax
@@ -78,9 +128,87 @@ identity_mapped:
 1:
 
        /* Flush the TLB (needed?) */
-       movq    %rcx, %cr3
+       movq    %r9, %cr3
+
+       movq    %rcx, %r11
+       call    swap_pages
+
+       /*
+        * To be certain of avoiding problems with self-modifying code
+        * I need to execute a serializing instruction here.
+        * So I flush the TLB by reloading %cr3 here, it's handy,
+        * and not processor dependent.
+        */
+       movq    %cr3, %rax
+       movq    %rax, %cr3
+
+       /*
+        * set all of the registers to known values
+        * leave %rsp alone
+        */
+
+       testq   %r11, %r11
+       jnz 1f
+       xorq    %rax, %rax
+       xorq    %rbx, %rbx
+       xorq    %rcx, %rcx
+       xorq    %rdx, %rdx
+       xorq    %rsi, %rsi
+       xorq    %rdi, %rdi
+       xorq    %rbp, %rbp
+       xorq    %r8,  %r8
+       xorq    %r9,  %r9
+       xorq    %r10, %r10
+       xorq    %r11, %r11
+       xorq    %r12, %r12
+       xorq    %r13, %r13
+       xorq    %r14, %r14
+       xorq    %r15, %r15
+
+       ret
+
+1:
+       popq    %rdx
+       leaq    PAGE_SIZE(%r10), %rsp
+       call    *%rdx
+
+       /* get the re-entry point of the peer system */
+       movq    0(%rsp), %rbp
+       call    1f
+1:
+       popq    %r8
+       subq    $(1b - relocate_kernel), %r8
+       movq    CP_PA_SWAP_PAGE(%r8), %r10
+       movq    CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+       movq    CP_PA_TABLE_PAGE(%r8), %rax
+       movq    %rax, %cr3
+       lea     PAGE_SIZE(%r8), %rsp
+       call    swap_pages
+       movq    $virtual_mapped, %rax
+       pushq   %rax
+       ret
+
+virtual_mapped:
+       movq    RSP(%r8), %rsp
+       movq    CR4(%r8), %rax
+       movq    %rax, %cr4
+       movq    CR3(%r8), %rax
+       movq    CR0(%r8), %r8
+       movq    %rax, %cr3
+       movq    %r8, %cr0
+       movq    %rbp, %rax
+
+       popf
+       popq    %r15
+       popq    %r14
+       popq    %r13
+       popq    %r12
+       popq    %rbp
+       popq    %rbx
+       ret
 
        /* Do the copies */
+swap_pages:
        movq    %rdi, %rcx      /* Put the page_list in %rcx */
        xorq    %rdi, %rdi
        xorq    %rsi, %rsi
@@ -112,36 +240,27 @@ identity_mapped:
        movq    %rcx,   %rsi  /* For ever source page do a copy */
        andq    $0xfffffffffffff000, %rsi
 
+       movq    %rdi, %rdx
+       movq    %rsi, %rax
+
+       movq    %r10, %rdi
        movq    $512,   %rcx
        rep ; movsq
-       jmp     0b
-3:
-
-       /* To be certain of avoiding problems with self-modifying code
-        * I need to execute a serializing instruction here.
-        * So I flush the TLB by reloading %cr3 here, it's handy,
-        * and not processor dependent.
-        */
-       movq    %cr3, %rax
-       movq    %rax, %cr3
 
-       /* set all of the registers to known values */
-       /* leave %rsp alone */
+       movq    %rax, %rdi
+       movq    %rdx, %rsi
+       movq    $512,   %rcx
+       rep ; movsq
 
-       xorq    %rax, %rax
-       xorq    %rbx, %rbx
-       xorq    %rcx, %rcx
-       xorq    %rdx, %rdx
-       xorq    %rsi, %rsi
-       xorq    %rdi, %rdi
-       xorq    %rbp, %rbp
-       xorq    %r8,  %r8
-       xorq    %r9,  %r9
-       xorq    %r10, %r9
-       xorq    %r11, %r11
-       xorq    %r12, %r12
-       xorq    %r13, %r13
-       xorq    %r14, %r14
-       xorq    %r15, %r15
+       movq    %rdx, %rdi
+       movq    %r10, %rsi
+       movq    $512,   %rcx
+       rep ; movsq
 
+       lea     PAGE_SIZE(%rax), %rsi
+       jmp     0b
+3:
        ret
+
+       .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
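
The swap_pages routine above now performs, for every page in the indirection list, a three-way copy through the swap page; that is what preserves the overwritten pages so a preserve_context kexec can jump back. The equivalent logic in C, purely as an illustration of the three rep;movsq copies, not code from this patch:

#include <string.h>

#define PAGE_SIZE 4096

/*
 * dst/src/swap are page-sized buffers; after the call dst and src have
 * exchanged contents and swap holds a scratch copy of the old src.
 */
void swap_one_page(void *dst, void *src, void *swap)
{
        memcpy(swap, src, PAGE_SIZE);   /* 1st rep;movsq: save the source   */
        memcpy(src,  dst, PAGE_SIZE);   /* 2nd rep;movsq: dest -> source    */
        memcpy(dst,  swap, PAGE_SIZE);  /* 3rd rep;movsq: old src -> dest   */
}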
index dd6f2b71561bfbdc245e78cae8cfac30d44d425e..5d465b207e72c3bfae75b29f6d0e1dcb3365e6b4 100644 (file)
@@ -1,14 +1,14 @@
 /*
  * RTC related functions
  */
+#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h>
 #include <linux/acpi.h>
 #include <linux/bcd.h>
-#include <linux/mc146818rtc.h>
-#include <linux/platform_device.h>
 #include <linux/pnp.h>
 
-#include <asm/time.h>
 #include <asm/vsyscall.h>
+#include <asm/time.h>
 
 #ifdef CONFIG_X86_32
 /*
@@ -16,9 +16,9 @@
  * register we are working with.  It is required for NMI access to the
  * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
  */
-volatile unsigned long cmos_lock = 0;
+volatile unsigned long cmos_lock;
 EXPORT_SYMBOL(cmos_lock);
-#endif
+#endif /* CONFIG_X86_32 */
 
 /* For two digit years assume time is always after that */
 #define CMOS_YEARS_OFFS 2000
@@ -38,9 +38,9 @@ EXPORT_SYMBOL(rtc_lock);
  */
 int mach_set_rtc_mmss(unsigned long nowtime)
 {
-       int retval = 0;
        int real_seconds, real_minutes, cmos_minutes;
        unsigned char save_control, save_freq_select;
+       int retval = 0;
 
         /* tell the clock it's being set */
        save_control = CMOS_READ(RTC_CONTROL);
@@ -72,8 +72,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
                        real_seconds = bin2bcd(real_seconds);
                        real_minutes = bin2bcd(real_minutes);
                }
-               CMOS_WRITE(real_seconds,RTC_SECONDS);
-               CMOS_WRITE(real_minutes,RTC_MINUTES);
+               CMOS_WRITE(real_seconds, RTC_SECONDS);
+               CMOS_WRITE(real_minutes, RTC_MINUTES);
        } else {
                printk(KERN_WARNING
                       "set_rtc_mmss: can't update from %d to %d\n",
@@ -151,6 +151,7 @@ unsigned char rtc_cmos_read(unsigned char addr)
        outb(addr, RTC_PORT(0));
        val = inb(RTC_PORT(1));
        lock_cmos_suffix(addr);
+
        return val;
 }
 EXPORT_SYMBOL(rtc_cmos_read);
@@ -166,8 +167,8 @@ EXPORT_SYMBOL(rtc_cmos_write);
 
 static int set_rtc_mmss(unsigned long nowtime)
 {
-       int retval;
        unsigned long flags;
+       int retval;
 
        spin_lock_irqsave(&rtc_lock, flags);
        retval = set_wallclock(nowtime);
@@ -242,6 +243,7 @@ static __init int add_rtc_cmos(void)
        platform_device_register(&rtc_device);
        dev_info(&rtc_device.dev,
                 "registered platform RTC device (no PNP device found)\n");
+
        return 0;
 }
 device_initcall(add_rtc_cmos);
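
mach_set_rtc_mmss() above converts the minutes and seconds to BCD before writing them when the RTC is not in binary mode. A quick standalone illustration of what the kernel's bin2bcd() helper computes (logic reproduced here only for illustration):

#include <stdio.h>

/* Pack a two-digit binary value into binary-coded decimal:
 * tens digit in the high nibble, units digit in the low nibble. */
static unsigned char bin2bcd_demo(unsigned char bin)
{
        return (unsigned char)(((bin / 10) << 4) | (bin % 10));
}

int main(void)
{
        printf("42 -> %#x\n", bin2bcd_demo(42));   /* prints 0x42 */
        return 0;
}
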
index b746deb9ebc649685c4c167f50e525541b8da292..a0d26237d7cf141462a2d20941f8bd99d4195e57 100644 (file)
 #define ARCH_SETUP
 #endif
 
+RESERVE_BRK(dmi_alloc, 65536);
+
 unsigned int boot_cpu_id __read_mostly;
 
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
 #ifdef CONFIG_X86_64
 int default_cpu_present_to_apicid(int mps_cpu)
 {
@@ -158,12 +163,6 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
-/* This value is set up by the early boot code to point to the value
-   immediately after the boot time page tables.  It contains a *physical*
-   address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_start __initdata = ~0UL;
-unsigned long init_pg_tables_end __initdata = ~0UL;
-
 static struct resource video_ram_resource = {
        .name   = "Video RAM area",
        .start  = 0xa0000,
@@ -202,7 +201,9 @@ struct ist_info ist_info;
 #endif
 
 #else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+       .x86_phys_bits = MAX_PHYSMEM_BITS,
+};
 EXPORT_SYMBOL(boot_cpu_data);
 #endif
 
@@ -216,12 +217,6 @@ unsigned long mmu_cr4_features = X86_CR4_PAE;
 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
 int bootloader_type;
 
-/*
- * Early DMI memory
- */
-int dmi_alloc_index;
-char dmi_alloc_data[DMI_MAX_DATA];
-
 /*
  * Setup options
  */
@@ -267,6 +262,35 @@ static inline void copy_edd(void)
 }
 #endif
 
+void * __init extend_brk(size_t size, size_t align)
+{
+       size_t mask = align - 1;
+       void *ret;
+
+       BUG_ON(_brk_start == 0);
+       BUG_ON(align & mask);
+
+       _brk_end = (_brk_end + mask) & ~mask;
+       BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+       ret = (void *)_brk_end;
+       _brk_end += size;
+
+       memset(ret, 0, size);
+
+       return ret;
+}
+
+static void __init reserve_brk(void)
+{
+       if (_brk_end > _brk_start)
+               reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK");
+
+       /* Mark brk area as locked down and no longer taking any
+          new allocations */
+       _brk_start = 0;
+}
+
 #ifdef CONFIG_BLK_DEV_INITRD
 
 #ifdef CONFIG_X86_32
@@ -715,11 +739,7 @@ void __init setup_arch(char **cmdline_p)
        init_mm.start_code = (unsigned long) _text;
        init_mm.end_code = (unsigned long) _etext;
        init_mm.end_data = (unsigned long) _edata;
-#ifdef CONFIG_X86_32
-       init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
-#else
-       init_mm.brk = (unsigned long) &_end;
-#endif
+       init_mm.brk = _brk_end;
 
        code_resource.start = virt_to_phys(_text);
        code_resource.end = virt_to_phys(_etext)-1;
@@ -840,6 +860,8 @@ void __init setup_arch(char **cmdline_p)
        setup_bios_corruption_check();
 #endif
 
+       reserve_brk();
+
        /* max_pfn_mapped is updated here */
        max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
        max_pfn_mapped = max_low_pfn_mapped;
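
The extend_brk()/reserve_brk() pair above implements a tiny early-boot allocator: build-time users reserve worst-case space in the .brk section, then carve aligned, zeroed chunks out of it before reserve_brk() locks the area down. A hypothetical caller might look like this (name and sizes invented for illustration):

/* Grow the .brk_reservation section by 4 KB for this (hypothetical) user. */
RESERVE_BRK(my_early_buf, 4096);

static void __init my_early_setup(void)
{
        /* 256 bytes, 16-byte aligned, already zeroed by extend_brk().
         * Must run before reserve_brk(), i.e. early in setup_arch(). */
        void *buf = extend_brk(256, 16);

        /* ... buf remains valid for the life of the kernel ... */
}
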
index d2cc6428c5875a6103a00757eeeec9ff0831508f..dfcc74ab0ab64f922c12717ada6a248408593a75 100644 (file)
@@ -211,31 +211,27 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 {
        /* Default to using normal stack */
        unsigned long sp = regs->sp;
+       int onsigstack = on_sig_stack(sp);
 
 #ifdef CONFIG_X86_64
        /* redzone */
        sp -= 128;
 #endif /* CONFIG_X86_64 */
 
-       /*
-        * If we are on the alternate signal stack and would overflow it, don't.
-        * Return an always-bogus address instead so we will die with SIGSEGV.
-        */
-       if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
-               return (void __user *) -1L;
-
-       /* This is the X/Open sanctioned signal stack switching.  */
-       if (ka->sa.sa_flags & SA_ONSTACK) {
-               if (sas_ss_flags(sp) == 0)
-                       sp = current->sas_ss_sp + current->sas_ss_size;
-       } else {
+       if (!onsigstack) {
+               /* This is the X/Open sanctioned signal stack switching.  */
+               if (ka->sa.sa_flags & SA_ONSTACK) {
+                       if (sas_ss_flags(sp) == 0)
+                               sp = current->sas_ss_sp + current->sas_ss_size;
+               } else {
 #ifdef CONFIG_X86_32
-               /* This is the legacy signal stack switching. */
-               if ((regs->ss & 0xffff) != __USER_DS &&
-                       !(ka->sa.sa_flags & SA_RESTORER) &&
-                               ka->sa.sa_restorer)
-                       sp = (unsigned long) ka->sa.sa_restorer;
+                       /* This is the legacy signal stack switching. */
+                       if ((regs->ss & 0xffff) != __USER_DS &&
+                               !(ka->sa.sa_flags & SA_RESTORER) &&
+                                       ka->sa.sa_restorer)
+                               sp = (unsigned long) ka->sa.sa_restorer;
 #endif /* CONFIG_X86_32 */
+               }
        }
 
        if (used_math()) {
@@ -244,12 +240,22 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
                sp = round_down(sp, 64);
 #endif /* CONFIG_X86_64 */
                *fpstate = (void __user *)sp;
-
-               if (save_i387_xstate(*fpstate) < 0)
-                       return (void __user *)-1L;
        }
 
-       return (void __user *)align_sigframe(sp - frame_size);
+       sp = align_sigframe(sp - frame_size);
+
+       /*
+        * If we are on the alternate signal stack and would overflow it, don't.
+        * Return an always-bogus address instead so we will die with SIGSEGV.
+        */
+       if (onsigstack && !likely(on_sig_stack(sp)))
+               return (void __user *)-1L;
+
+       /* save i387 state */
+       if (used_math() && save_i387_xstate(*fpstate) < 0)
+               return (void __user *)-1L;
+
+       return (void __user *)sp;
 }
 
 #ifdef CONFIG_X86_32
index 249334f5080a16d2e5a6e2c4fb67a8c19be88891..ef7d10170c30d1e98291cfdb0b4d0f17eda49b28 100644 (file)
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 atomic_t init_deasserted;
 
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
-
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
 
 /* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
        cpumask_set_cpu(cpuid, cpu_callin_mask);
 }
 
-static int __cpuinitdata unsafe_smp;
-
 /*
  * Activate a secondary processor.
  */
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
        cpu_idle();
 }
 
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
-{
-       /*
-        * Mask B, Pentium, but not Pentium MMX
-        */
-       if (c->x86_vendor == X86_VENDOR_INTEL &&
-           c->x86 == 5 &&
-           c->x86_mask >= 1 && c->x86_mask <= 4 &&
-           c->x86_model <= 3)
-               /*
-                * Remember we have B step Pentia with bugs
-                */
-               smp_b_stepping = 1;
-
-       /*
-        * Certain Athlons might work (for various values of 'work') in SMP
-        * but they are not certified as MP capable.
-        */
-       if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
-               if (num_possible_cpus() == 1)
-                       goto valid_k7;
-
-               /* Athlon 660/661 is valid. */
-               if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
-                   (c->x86_mask == 1)))
-                       goto valid_k7;
-
-               /* Duron 670 is valid */
-               if ((c->x86_model == 7) && (c->x86_mask == 0))
-                       goto valid_k7;
-
-               /*
-                * Athlon 662, Duron 671, and Athlon >model 7 have capability
-                * bit. It's worth noting that the A5 stepping (662) of some
-                * Athlon XP's have the MP bit set.
-                * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
-                * more.
-                */
-               if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
-                   ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
-                    (c->x86_model > 7))
-                       if (cpu_has_mp)
-                               goto valid_k7;
-
-               /* If we get here, not a certified SMP capable AMD system. */
-               unsafe_smp = 1;
-       }
-
-valid_k7:
-       ;
-}
-
-static void __cpuinit smp_checks(void)
-{
-       if (smp_b_stepping)
-               printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
-                                   "with B stepping processors.\n");
-
-       /*
-        * Don't taint if we are running SMP kernel on a single non-MP
-        * approved Athlon
-        */
-       if (unsafe_smp && num_online_cpus() > 1) {
-               printk(KERN_INFO "WARNING: This combination of AMD"
-                       "processors is not suitable for SMP.\n");
-               add_taint(TAINT_UNSAFE_SMP);
-       }
-}
-
 /*
  * The bootstrap kernel entry code has set these up. Save them for
  * a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
        c->cpu_index = id;
        if (id != 0)
                identify_secondary_cpu(c);
-       smp_apply_quirks(c);
 }
 
 
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
        pr_debug("Boot done.\n");
 
        impress_friends();
-       smp_checks();
 #ifdef CONFIG_X86_IO_APIC
        setup_ioapic_dest();
 #endif
index d038b9c45cf89597e10ab05d1814b2557b606994..79c073247284cbdb334cf2a03b4e7ffaae8d0bcd 100644 (file)
@@ -750,7 +750,7 @@ static int __init uv_bau_init(void)
        int node;
        int nblades;
        int last_blade;
-       int cur_cpu = 0;
+       int cur_cpu;
 
        if (!is_uv_system())
                return 0;
@@ -760,6 +760,7 @@ static int __init uv_bau_init(void)
        uv_mmask = (1UL << uv_hub_info->n_val) - 1;
        nblades = 0;
        last_blade = -1;
+       cur_cpu = 0;
        for_each_online_node(node) {
                blade = uv_node_to_blade_id(node);
                if (blade == last_blade)
index 0fcc95a354f7c28d980bf45fffd777c01a3ace25..7e4515957a1c89ad4980f7aa119120ee4d7cfa2a 100644 (file)
  *
  * Send feedback to <colpatch@us.ibm.com>
  */
-#include <linux/init.h>
-#include <linux/smp.h>
 #include <linux/nodemask.h>
 #include <linux/mmzone.h>
+#include <linux/init.h>
+#include <linux/smp.h>
 #include <asm/cpu.h>
 
 static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
@@ -47,6 +47,7 @@ int __ref arch_register_cpu(int num)
         */
        if (num)
                per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
+
        return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
 EXPORT_SYMBOL(arch_register_cpu);
@@ -56,12 +57,13 @@ void arch_unregister_cpu(int num)
        unregister_cpu(&per_cpu(cpu_devices, num).cpu);
 }
 EXPORT_SYMBOL(arch_unregister_cpu);
-#else
+#else /* CONFIG_HOTPLUG_CPU */
+
 static int __init arch_register_cpu(int num)
 {
        return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
-#endif /*CONFIG_HOTPLUG_CPU*/
+#endif /* CONFIG_HOTPLUG_CPU */
 
 static int __init topology_init(void)
 {
@@ -70,11 +72,11 @@ static int __init topology_init(void)
 #ifdef CONFIG_NUMA
        for_each_online_node(i)
                register_one_node(i);
-#endif /* CONFIG_NUMA */
+#endif
 
        for_each_present_cpu(i)
                arch_register_cpu(i);
+
        return 0;
 }
-
 subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644 (file)
index 0000000..2ffb6c5
--- /dev/null
@@ -0,0 +1,393 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (c) 2009 Silicon Graphics, Inc.  All Rights Reserved.
+ *  Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+
+#define RTC_NAME               "sgi_rtc"
+
+static cycle_t uv_read_rtc(void);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static void uv_rtc_timer_setup(enum clock_event_mode,
+                               struct clock_event_device *);
+
+static struct clocksource clocksource_uv = {
+       .name           = RTC_NAME,
+       .rating         = 400,
+       .read           = uv_read_rtc,
+       .mask           = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
+       .shift          = 10,
+       .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+       .name           = RTC_NAME,
+       .features       = CLOCK_EVT_FEAT_ONESHOT,
+       .shift          = 20,
+       .rating         = 400,
+       .irq            = -1,
+       .set_next_event = uv_rtc_next_event,
+       .set_mode       = uv_rtc_timer_setup,
+       .event_handler  = NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+       spinlock_t      lock;
+       /* next cpu waiting for timer, local node relative: */
+       int             next_cpu;
+       /* number of cpus on this node: */
+       int             ncpus;
+       struct {
+               int     lcpu;           /* systemwide logical cpu number */
+               u64     expires;        /* next timer expiration for this cpu */
+       } cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head                **blade_info __read_mostly;
+
+static int                             uv_rtc_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+       unsigned long apicid, val;
+       int pnode;
+
+       apicid = cpu_physical_id(cpu);
+       pnode = uv_apicid_to_pnode(apicid);
+       val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+             (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+             (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+       uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+       return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+               UVH_EVENT_OCCURRED0_RTC1_MASK;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+       u64 val;
+       int pnode = uv_cpu_to_pnode(cpu);
+
+       uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+               UVH_RTC1_INT_CONFIG_M_MASK);
+       uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+       uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+               UVH_EVENT_OCCURRED0_RTC1_MASK);
+
+       val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+               ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+       /* Set configuration */
+       uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+       /* Initialize comparator value */
+       uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+       return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+       int bid;
+
+       for_each_possible_blade(bid) {
+               kfree(blade_info[bid]);
+       }
+       kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+       int cpu;
+
+       blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
+       if (!blade_info)
+               return -ENOMEM;
+       memset(blade_info, 0, uv_possible_blades * sizeof(void *));
+
+       for_each_present_cpu(cpu) {
+               int nid = cpu_to_node(cpu);
+               int bid = uv_cpu_to_blade_id(cpu);
+               int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+               struct uv_rtc_timer_head *head = blade_info[bid];
+
+               if (!head) {
+                       head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+                               (uv_blade_nr_possible_cpus(bid) *
+                                       2 * sizeof(u64)),
+                               GFP_KERNEL, nid);
+                       if (!head) {
+                               uv_rtc_deallocate_timers();
+                               return -ENOMEM;
+                       }
+                       spin_lock_init(&head->lock);
+                       head->ncpus = uv_blade_nr_possible_cpus(bid);
+                       head->next_cpu = -1;
+                       blade_info[bid] = head;
+               }
+
+               head->cpu[bcpu].lcpu = cpu;
+               head->cpu[bcpu].expires = ULLONG_MAX;
+       }
+
+       return 0;
+}
+
+/* Find and set the next expiring timer.  */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+       u64 lowest = ULLONG_MAX;
+       int c, bcpu = -1;
+
+       head->next_cpu = -1;
+       for (c = 0; c < head->ncpus; c++) {
+               u64 exp = head->cpu[c].expires;
+               if (exp < lowest) {
+                       bcpu = c;
+                       lowest = exp;
+               }
+       }
+       if (bcpu >= 0) {
+               head->next_cpu = bcpu;
+               c = head->cpu[bcpu].lcpu;
+               if (uv_setup_intr(c, lowest))
+                       /* If we didn't set it up in time, trigger */
+                       uv_rtc_send_IPI(c);
+       } else {
+               uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+                       UVH_RTC1_INT_CONFIG_M_MASK);
+       }
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+       int pnode = uv_cpu_to_pnode(cpu);
+       int bid = uv_cpu_to_blade_id(cpu);
+       struct uv_rtc_timer_head *head = blade_info[bid];
+       int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+       u64 *t = &head->cpu[bcpu].expires;
+       unsigned long flags;
+       int next_cpu;
+
+       spin_lock_irqsave(&head->lock, flags);
+
+       next_cpu = head->next_cpu;
+       *t = expires;
+       /* Will this one be next to go off? */
+       if (next_cpu < 0 || bcpu == next_cpu ||
+                       expires < head->cpu[next_cpu].expires) {
+               head->next_cpu = bcpu;
+               if (uv_setup_intr(cpu, expires)) {
+                       *t = ULLONG_MAX;
+                       uv_rtc_find_next_timer(head, pnode);
+                       spin_unlock_irqrestore(&head->lock, flags);
+                       return 1;
+               }
+       }
+
+       spin_unlock_irqrestore(&head->lock, flags);
+       return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu)
+{
+       int pnode = uv_cpu_to_pnode(cpu);
+       int bid = uv_cpu_to_blade_id(cpu);
+       struct uv_rtc_timer_head *head = blade_info[bid];
+       int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+       u64 *t = &head->cpu[bcpu].expires;
+       unsigned long flags;
+       int rc = 0;
+
+       spin_lock_irqsave(&head->lock, flags);
+
+       if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+               rc = 1;
+
+       *t = ULLONG_MAX;
+
+       /* Was the hardware setup for this timer? */
+       if (head->next_cpu == bcpu)
+               uv_rtc_find_next_timer(head, pnode);
+
+       spin_unlock_irqrestore(&head->lock, flags);
+
+       return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ */
+static cycle_t uv_read_rtc(void)
+{
+       return (cycle_t)uv_read_local_mmr(UVH_RTC);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+                            struct clock_event_device *ced)
+{
+       int ced_cpu = cpumask_first(ced->cpumask);
+
+       return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+}
+
+/*
+ * Setup the RTC timer in oneshot mode
+ */
+static void uv_rtc_timer_setup(enum clock_event_mode mode,
+                              struct clock_event_device *evt)
+{
+       int ced_cpu = cpumask_first(evt->cpumask);
+
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+       case CLOCK_EVT_MODE_ONESHOT:
+       case CLOCK_EVT_MODE_RESUME:
+               /* Nothing to do here yet */
+               break;
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               uv_rtc_unset_timer(ced_cpu);
+               break;
+       }
+}
+
+static void uv_rtc_interrupt(void)
+{
+       struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+       int cpu = smp_processor_id();
+
+       if (!ced || !ced->event_handler)
+               return;
+
+       if (uv_rtc_unset_timer(cpu) != 1)
+               return;
+
+       ced->event_handler(ced);
+}
+
+static int __init uv_enable_rtc(char *str)
+{
+       uv_rtc_enable = 1;
+
+       return 1;
+}
+__setup("uvrtc", uv_enable_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+       struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+
+       *ced = clock_event_device_uv;
+       ced->cpumask = cpumask_of(smp_processor_id());
+       clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+       int rc;
+
+       if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+               return -ENODEV;
+
+       generic_interrupt_extension = uv_rtc_interrupt;
+
+       clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+                               clocksource_uv.shift);
+
+       rc = clocksource_register(&clocksource_uv);
+       if (rc) {
+               generic_interrupt_extension = NULL;
+               return rc;
+       }
+
+       /* Setup and register clockevents */
+       rc = uv_rtc_allocate_timers();
+       if (rc) {
+               clocksource_unregister(&clocksource_uv);
+               generic_interrupt_extension = NULL;
+               return rc;
+       }
+
+       clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+                               NSEC_PER_SEC, clock_event_device_uv.shift);
+
+       clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+                                               sn_rtc_cycles_per_second;
+
+       clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+                               (NSEC_PER_SEC / sn_rtc_cycles_per_second);
+
+       rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+       if (rc) {
+               clocksource_unregister(&clocksource_uv);
+               generic_interrupt_extension = NULL;
+               uv_rtc_deallocate_timers();
+       }
+
+       return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
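
The mult/shift pair registered for clocksource_uv above is what converts raw RTC cycle counts into nanoseconds: mult is chosen so that ns = (cycles * mult) >> shift. A small userspace sketch of that arithmetic, assuming a sample RTC frequency (the real rate comes from the UV BIOS as sn_rtc_cycles_per_second) and approximating what clocksource_hz2mult() computes:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        uint64_t hz    = 25000000ULL;   /* assumed 25 MHz RTC rate */
        unsigned shift = 10;            /* same shift as clocksource_uv */

        /* Roughly what clocksource_hz2mult(hz, shift) returns. */
        uint64_t mult  = (NSEC_PER_SEC << shift) / hz;

        uint64_t cycles = 12345;
        uint64_t ns     = (cycles * mult) >> shift;   /* cycle delta -> ns */

        printf("mult=%llu, %llu cycles ~ %llu ns\n",
               (unsigned long long)mult,
               (unsigned long long)cycles,
               (unsigned long long)ns);
        return 0;
}
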
index 191a876e9e879cf2e63a374605421cb0f223628e..31ffc24eec4d6f7e3a84cd7d5a11cc8a5b6e18d2 100644 (file)
@@ -578,7 +578,7 @@ static struct irq_chip piix4_virtual_irq_type = {
 static irqreturn_t piix4_master_intr(int irq, void *dev_id)
 {
        int realirq;
-       irq_desc_t *desc;
+       struct irq_desc *desc;
        unsigned long flags;
 
        spin_lock_irqsave(&i8259A_lock, flags);
index 2cc4a90e2cb3ae35e0b81425ef3f41e159f0a1a3..95deb9f2211e98decb5572e4757bc2cc48f3da18 100644 (file)
@@ -395,11 +395,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
        vmi_ops.update_pte(ptep, VMI_PAGE_PT);
 }
 
-static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
-       vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
-}
-
 static void vmi_set_pud(pud_t *pudp, pud_t pudval)
 {
        /* Um, eww */
@@ -750,7 +745,6 @@ static inline int __init activate_vmi(void)
                pv_mmu_ops.set_pmd = vmi_set_pmd;
 #ifdef CONFIG_X86_PAE
                pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
-               pv_mmu_ops.set_pte_present = vmi_set_pte_present;
                pv_mmu_ops.set_pud = vmi_set_pud;
                pv_mmu_ops.pte_clear = vmi_pte_clear;
                pv_mmu_ops.pmd_clear = vmi_pmd_clear;
index 0d860963f268f5a79c1198a6dffba0fee22f897d..62ad500d55f3e18bb2a405624a5890836f805398 100644 (file)
@@ -189,15 +189,24 @@ SECTIONS
        *(.bss)
        . = ALIGN(4);
        __bss_stop = .;
-       _end = . ;
-       /* This is where the kernel creates the early boot page tables */
+  }
+
+  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
        . = ALIGN(PAGE_SIZE);
-       pg0 = . ;
+       __brk_base = . ;
+       . += 64 * 1024 ;        /* 64k alignment slop space */
+       *(.brk_reservation)     /* areas brk users have reserved */
+       __brk_limit = . ;
+  }
+
+  .end : AT(ADDR(.end) - LOAD_OFFSET) {
+       _end = . ;
   }
 
   /* Sections to be discarded */
   /DISCARD/ : {
        *(.exitcall.exit)
+       *(.discard)
        }
 
   STABS_DEBUG
@@ -205,6 +214,12 @@ SECTIONS
   DWARF_DEBUG
 }
 
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+       "kernel image bigger than KERNEL_IMAGE_SIZE")
+
 #ifdef CONFIG_KEXEC
 /* Link time checks */
 #include <asm/kexec.h>
index fbfced6f6800c11052a7f640fd4edd1d32a2b3a5..c8742507b030bec22bb90fcdcfc12d5dcdb35da7 100644 (file)
@@ -29,8 +29,8 @@ SECTIONS
 {
   . = __START_KERNEL;
   phys_startup_64 = startup_64 - LOAD_OFFSET;
-  _text = .;                   /* Text and read-only data */
   .text :  AT(ADDR(.text) - LOAD_OFFSET) {
+       _text = .;                      /* Text and read-only data */
        /* First the code that has to be first for bootstrapping */
        *(.text.head)
        _stext = .;
@@ -61,13 +61,13 @@ SECTIONS
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
        DATA_DATA
        CONSTRUCTORS
+       _edata = .;                     /* End of data section */
        } :data
 
-  _edata = .;                  /* End of data section */
 
-  . = ALIGN(PAGE_SIZE);
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
   .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+       . = ALIGN(PAGE_SIZE);
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
        *(.data.cacheline_aligned)
   }
   . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
@@ -125,29 +125,29 @@ SECTIONS
 #undef VVIRT_OFFSET
 #undef VVIRT
 
-  . = ALIGN(THREAD_SIZE);      /* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+       . = ALIGN(THREAD_SIZE); /* init_task */
        *(.data.init_task)
   }:data.init
 
-  . = ALIGN(PAGE_SIZE);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+       . = ALIGN(PAGE_SIZE);
        *(.data.page_aligned)
   }
 
-  /* might get freed after init */
-  . = ALIGN(PAGE_SIZE);
-  __smp_alt_begin = .;
-  __smp_locks = .;
   .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+       /* might get freed after init */
+       . = ALIGN(PAGE_SIZE);
+       __smp_alt_begin = .;
+       __smp_locks = .;
        *(.smp_locks)
+       __smp_locks_end = .;
+       . = ALIGN(PAGE_SIZE);
+       __smp_alt_end = .;
   }
-  __smp_locks_end = .;
-  . = ALIGN(PAGE_SIZE);
-  __smp_alt_end = .;
 
   . = ALIGN(PAGE_SIZE);                /* Init code and data */
-  __init_begin = .;
+  __init_begin = .;    /* paired with __init_end */
   .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
        _sinittext = .;
        INIT_TEXT
@@ -159,40 +159,42 @@ SECTIONS
        __initdata_end = .;
    }
 
-  . = ALIGN(16);
-  __setup_start = .;
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
-  __setup_end = .;
-  __initcall_start = .;
+  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+       . = ALIGN(16);
+       __setup_start = .;
+       *(.init.setup)
+       __setup_end = .;
+  }
   .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+       __initcall_start = .;
        INITCALLS
+       __initcall_end = .;
   }
-  __initcall_end = .;
-  __con_initcall_start = .;
   .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+       __con_initcall_start = .;
        *(.con_initcall.init)
+       __con_initcall_end = .;
   }
-  __con_initcall_end = .;
-  __x86_cpu_dev_start = .;
   .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+       __x86_cpu_dev_start = .;
        *(.x86_cpu_dev.init)
+       __x86_cpu_dev_end = .;
   }
-  __x86_cpu_dev_end = .;
   SECURITY_INIT
 
   . = ALIGN(8);
   .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-  __parainstructions = .;
+       __parainstructions = .;
        *(.parainstructions)
-  __parainstructions_end = .;
+       __parainstructions_end = .;
   }
 
-  . = ALIGN(8);
-  __alt_instructions = .;
   .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+       . = ALIGN(8);
+       __alt_instructions = .;
        *(.altinstructions)
+       __alt_instructions_end = .;
   }
-  __alt_instructions_end = .;
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
        *(.altinstr_replacement)
   }
@@ -207,9 +209,11 @@ SECTIONS
 
 #ifdef CONFIG_BLK_DEV_INITRD
   . = ALIGN(PAGE_SIZE);
-  __initramfs_start = .;
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
-  __initramfs_end = .;
+  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+       __initramfs_start = .;
+       *(.init.ramfs)
+       __initramfs_end = .;
+  }
 #endif
 
 #ifdef CONFIG_SMP
@@ -229,20 +233,29 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   __init_end = .;
 
-  . = ALIGN(PAGE_SIZE);
-  __nosave_begin = .;
   .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-      *(.data.nosave)
+       . = ALIGN(PAGE_SIZE);
+       __nosave_begin = .;
+       *(.data.nosave)
+       . = ALIGN(PAGE_SIZE);
+       __nosave_end = .;
   } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-  . = ALIGN(PAGE_SIZE);
-  __nosave_end = .;
 
-  __bss_start = .;             /* BSS */
   .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+       . = ALIGN(PAGE_SIZE);
+       __bss_start = .;                /* BSS */
        *(.bss.page_aligned)
        *(.bss)
-       }
-  __bss_stop = .;
+       __bss_stop = .;
+  }
+
+  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+       . = ALIGN(PAGE_SIZE);
+       __brk_base = . ;
+       . += 64 * 1024 ;        /* 64k alignment slop space */
+       *(.brk_reservation)     /* areas brk users have reserved */
+       __brk_limit = . ;
+  }
 
   _end = . ;
 
@@ -250,6 +263,7 @@ SECTIONS
   /DISCARD/ : {
        *(.exitcall.exit)
        *(.eh_frame)
+       *(.discard)
        }
 
   STABS_DEBUG
@@ -275,3 +289,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
 ASSERT((per_cpu__irq_stack_union == 0),
         "irq_stack_union is not at start of per-cpu area");
 #endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
index 74de562812ccc0e3e4886c18d945d669d282c48c..a1d804bcd48357554cc017f4b63c217eb2b84eaa 100644 (file)
@@ -22,7 +22,7 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 
-#ifdef CONFIG_PARAVIRT
+#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
 /*
  * Interrupt control on vSMPowered systems:
  * ~AC is a shadow of IF.  If IF is 'on' AC should be 'off'
@@ -114,6 +114,7 @@ static void __init set_vsmp_pv_ops(void)
 }
 #endif
 
+#ifdef CONFIG_PCI
 static int is_vsmp = -1;
 
 static void __init detect_vsmp_box(void)
@@ -139,6 +140,15 @@ int is_vsmp_box(void)
        }
 }
 
+#else
+static void __init detect_vsmp_box(void)
+{
+}
+int is_vsmp_box(void)
+{
+       return 0;
+}
+#endif
 void __init vsmp_init(void)
 {
        detect_vsmp_box();
index 9fe4ddaa8f6ff1fc53bbe09443d32d733876131c..90e44a10e68a6b5eb252655490be4b1e6c6d4941 100644 (file)
@@ -1058,14 +1058,6 @@ __init void lguest_init(void)
         * lguest_init() where the rest of the fairly chaotic boot setup
         * occurs. */
 
-       /* The native boot code sets up initial page tables immediately after
-        * the kernel itself, and sets init_pg_tables_end so they're not
-        * clobbered.  The Launcher places our initial pagetables somewhere at
-        * the top of our physical memory, so we don't need extra space: set
-        * init_pg_tables_end to the end of the kernel. */
-       init_pg_tables_start = __pa(pg0);
-       init_pg_tables_end = __pa(pg0);
-
        /* As described in head_32.S, we map the first 128M of memory. */
        max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
index c22981fa2f3a95240f3a4c99d00da41d1b7392ec..ad5441ed1b57fdfffe6822dede52cad626ae2b39 100644 (file)
@@ -1,30 +1,38 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
 #include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
 
 /*
  * memcpy - Copy a memory block.
  *
- * Input:      
- * rdi destination
- * rsi source
- * rdx count
- * 
+ * Input:
+ *  rdi destination
+ *  rsi source
+ *  rdx count
+ *
  * Output:
  * rax original destination
- */    
+ */
 
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
        ALIGN
 memcpy_c:
        CFI_STARTPROC
-       movq %rdi,%rax
-       movl %edx,%ecx
-       shrl $3,%ecx
-       andl $7,%edx
+       movq %rdi, %rax
+
+       movl %edx, %ecx
+       shrl $3, %ecx
+       andl $7, %edx
        rep movsq
-       movl %edx,%ecx
+       movl %edx, %ecx
        rep movsb
        ret
        CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
 ENTRY(__memcpy)
 ENTRY(memcpy)
        CFI_STARTPROC
-       pushq %rbx
-       CFI_ADJUST_CFA_OFFSET 8
-       CFI_REL_OFFSET rbx, 0
-       movq %rdi,%rax
 
-       movl %edx,%ecx
-       shrl $6,%ecx
+       /*
+        * Put the number of full 64-byte blocks into %ecx.
+        * Tail portion is handled at the end:
+        */
+       movq %rdi, %rax
+       movl %edx, %ecx
+       shrl   $6, %ecx
        jz .Lhandle_tail
 
        .p2align 4
 .Lloop_64:
+       /*
+        * We decrement the loop index here - and the zero-flag is
+        * checked at the end of the loop (instructions in between do
+        * not change the zero flag):
+        */
        decl %ecx
 
-       movq (%rsi),%r11
-       movq 8(%rsi),%r8
+       /*
+        * Move in blocks of 4x16 bytes:
+        */
+       movq 0*8(%rsi),         %r11
+       movq 1*8(%rsi),         %r8
+       movq %r11,              0*8(%rdi)
+       movq %r8,               1*8(%rdi)
 
-       movq %r11,(%rdi)
-       movq %r8,1*8(%rdi)
+       movq 2*8(%rsi),         %r9
+       movq 3*8(%rsi),         %r10
+       movq %r9,               2*8(%rdi)
+       movq %r10,              3*8(%rdi)
 
-       movq 2*8(%rsi),%r9
-       movq 3*8(%rsi),%r10
+       movq 4*8(%rsi),         %r11
+       movq 5*8(%rsi),         %r8
+       movq %r11,              4*8(%rdi)
+       movq %r8,               5*8(%rdi)
 
-       movq %r9,2*8(%rdi)
-       movq %r10,3*8(%rdi)
+       movq 6*8(%rsi),         %r9
+       movq 7*8(%rsi),         %r10
+       movq %r9,               6*8(%rdi)
+       movq %r10,              7*8(%rdi)
 
-       movq 4*8(%rsi),%r11
-       movq 5*8(%rsi),%r8
+       leaq 64(%rsi), %rsi
+       leaq 64(%rdi), %rdi
 
-       movq %r11,4*8(%rdi)
-       movq %r8,5*8(%rdi)
-
-       movq 6*8(%rsi),%r9
-       movq 7*8(%rsi),%r10
-
-       movq %r9,6*8(%rdi)
-       movq %r10,7*8(%rdi)
-
-       leaq 64(%rsi),%rsi
-       leaq 64(%rdi),%rdi
        jnz  .Lloop_64
 
 .Lhandle_tail:
-       movl %edx,%ecx
-       andl $63,%ecx
-       shrl $3,%ecx
+       movl %edx, %ecx
+       andl  $63, %ecx
+       shrl   $3, %ecx
        jz   .Lhandle_7
+
        .p2align 4
 .Lloop_8:
        decl %ecx
-       movq (%rsi),%r8
-       movq %r8,(%rdi)
-       leaq 8(%rdi),%rdi
-       leaq 8(%rsi),%rsi
+       movq (%rsi),            %r8
+       movq %r8,               (%rdi)
+       leaq 8(%rdi),           %rdi
+       leaq 8(%rsi),           %rsi
        jnz  .Lloop_8
 
 .Lhandle_7:
-       movl %edx,%ecx
-       andl $7,%ecx
-       jz .Lende
+       movl %edx, %ecx
+       andl $7, %ecx
+       jz .Lend
+
        .p2align 4
 .Lloop_1:
-       movb (%rsi),%r8b
-       movb %r8b,(%rdi)
+       movb (%rsi), %r8b
+       movb %r8b, (%rdi)
        incq %rdi
        incq %rsi
        decl %ecx
        jnz .Lloop_1
 
-.Lende:
-       popq %rbx
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_RESTORE rbx
+.Lend:
        ret
-.Lfinal:
        CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
-       /* Some CPUs run faster using the string copy instructions.
-          It is also a lot simpler. Use this when possible */
+       /*
+        * Some CPUs run faster using the string copy instructions.
+        * It is also a lot simpler. Use this when possible:
+        */
 
-       .section .altinstr_replacement,"ax"
+       .section .altinstr_replacement, "ax"
 1:     .byte 0xeb                              /* jmp <disp8> */
        .byte (memcpy_c - memcpy) - (2f - 1b)   /* offset */
 2:
        .previous
-       .section .altinstructions,"a"
+
+       .section .altinstructions, "a"
        .align 8
        .quad memcpy
        .quad 1b
        .byte X86_FEATURE_REP_GOOD
-       /* Replace only beginning, memcpy is used to apply alternatives, so it
-        * is silly to overwrite itself with nops - reboot is only outcome... */
+
+       /*
+        * Replace only beginning, memcpy is used to apply alternatives,
+        * so it is silly to overwrite itself with nops - reboot is the
+        * only outcome...
+        */
        .byte 2b - 1b
        .byte 2b - 1b
        .previous
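
The two .altinstructions quads and three bytes emitted above form one patch record; at boot, apply_alternatives() walks these records and, when the named CPU feature bit is set, overwrites the original location with the replacement bytes (here, a jmp into memcpy_c). A rough C view of that record layout, with field names assumed for illustration:

struct alt_instr_sketch {
        const void    *instr;          /* .quad memcpy  - site to patch         */
        const void    *replacement;    /* .quad 1b      - bytes to patch in     */
        unsigned char  cpuid;          /* .byte X86_FEATURE_REP_GOOD            */
        unsigned char  instrlen;       /* .byte 2b - 1b - length at the site    */
        unsigned char  replacementlen; /* .byte 2b - 1b - length of replacement */
};
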
index 00f127c80b0e38fa50eec531e8e85c8d1f18fd6d..522db5e3d0bf973e54ca71e33201454c4aadefda 100644 (file)
@@ -121,22 +121,13 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
        pagefault_enable();
 }
 
-/* This is the same as kmap_atomic() but can map memory that doesn't
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
  * have a struct page associated with it.
  */
 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 {
-       enum fixed_addresses idx;
-       unsigned long vaddr;
-
-       pagefault_disable();
-
-       idx = type + KM_TYPE_NR*smp_processor_id();
-       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-       set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
-       arch_flush_lazy_mmu_mode();
-
-       return (void*) vaddr;
+       return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
 }
 EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
 
@@ -158,7 +149,6 @@ EXPORT_SYMBOL(kunmap);
 EXPORT_SYMBOL(kmap_atomic);
 EXPORT_SYMBOL(kunmap_atomic);
 
-#ifdef CONFIG_NUMA
 void __init set_highmem_pages_init(void)
 {
        struct zone *zone;
@@ -182,11 +172,3 @@ void __init set_highmem_pages_init(void)
        }
        totalram_pages += totalhigh_pages;
 }
-#else
-void __init set_highmem_pages_init(void)
-{
-       add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
-       totalram_pages += totalhigh_pages;
-}
-#endif /* CONFIG_NUMA */
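
With the change above, kmap_atomic_pfn() simply wraps kmap_atomic_prot_pfn() with the default kmap_prot. A hypothetical caller, shown only to illustrate the intended use (briefly mapping a frame that may have no struct page, in non-sleeping context); the slot name KM_USER0 is an assumption:

static u32 peek_word(unsigned long pfn, unsigned long offset)
{
        void *va  = kmap_atomic_pfn(pfn, KM_USER0);   /* assumed km_type slot */
        u32   val = *(u32 *)((char *)va + offset);

        kunmap_atomic(va, KM_USER0);
        return val;
}
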
index ce6a722587d8fa6cfd8343c3931866ce93eda13b..fd3da1dda1c9e5033af44cbdaf432239784558f2 100644 (file)
@@ -1,8 +1,345 @@
+#include <linux/ioport.h>
 #include <linux/swap.h>
+
 #include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
 #include <asm/page.h>
+#include <asm/page_types.h>
 #include <asm/sections.h>
 #include <asm/system.h>
+#include <asm/tlbflush.h>
+
+unsigned long __initdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+                               = 1
+#endif
+;
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+                                         int use_gbpages)
+{
+       unsigned long puds, pmds, ptes, tables, start;
+
+       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+       tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+       if (use_gbpages) {
+               unsigned long extra;
+
+               extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+               pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+       } else
+               pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+       tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+       if (use_pse) {
+               unsigned long extra;
+
+               extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+               extra += PMD_SIZE;
+#endif
+               ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       } else
+               ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+       tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+       /* for fixmap */
+       tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+       /*
+        * RED-PEN putting page tables only on node 0 could
+        * cause a hotspot and fill up ZONE_DMA. The page tables
+        * need roughly 0.5KB per GB.
+        */
+#ifdef CONFIG_X86_32
+       start = 0x7000;
+       e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+                                       tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+       start = 0x8000;
+       e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
+       if (e820_table_start == -1UL)
+               panic("Cannot find space for the kernel page tables");
+
+       e820_table_start >>= PAGE_SHIFT;
+       e820_table_end = e820_table_start;
+       e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+
+       printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+               end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+       unsigned long start;
+       unsigned long end;
+       unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+                            unsigned long start_pfn, unsigned long end_pfn,
+                            unsigned long page_size_mask)
+{
+       if (start_pfn < end_pfn) {
+               if (nr_range >= NR_RANGE_MR)
+                       panic("run out of range for init_memory_mapping\n");
+               mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+               mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
+               mr[nr_range].page_size_mask = page_size_mask;
+               nr_range++;
+       }
+
+       return nr_range;
+}
+
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+       if (direct_gbpages && cpu_has_gbpages)
+               printk(KERN_INFO "Using GB pages for direct mapping\n");
+       else
+               direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+                                              unsigned long end)
+{
+       unsigned long page_size_mask = 0;
+       unsigned long start_pfn, end_pfn;
+       unsigned long ret = 0;
+       unsigned long pos;
+
+       struct map_range mr[NR_RANGE_MR];
+       int nr_range, i;
+       int use_pse, use_gbpages;
+
+       printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+       if (!after_bootmem)
+               init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+        * This will simplify cpa(), which otherwise needs to support splitting
+        * large pages into small in interrupt context, etc.
+        */
+       use_pse = use_gbpages = 0;
+#else
+       use_pse = cpu_has_pse;
+       use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+       set_nx();
+       if (nx_enabled)
+               printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+       /* Enable PSE if available */
+       if (cpu_has_pse)
+               set_in_cr4(X86_CR4_PSE);
+
+       /* Enable PGE if available */
+       if (cpu_has_pge) {
+               set_in_cr4(X86_CR4_PGE);
+               __supported_pte_mask |= _PAGE_GLOBAL;
+       }
+#endif
+
+       if (use_gbpages)
+               page_size_mask |= 1 << PG_LEVEL_1G;
+       if (use_pse)
+               page_size_mask |= 1 << PG_LEVEL_2M;
+
+       memset(mr, 0, sizeof(mr));
+       nr_range = 0;
+
+       /* head if not big page alignment ? */
+       start_pfn = start >> PAGE_SHIFT;
+       pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+       /*
+        * Don't use a large page for the first 2/4MB of memory
+        * because there are often fixed size MTRRs in there
+        * and overlapping MTRRs into large pages can cause
+        * slowdowns.
+        */
+       if (pos == 0)
+               end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+       else
+               end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+                                << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+       end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+                       << (PMD_SHIFT - PAGE_SHIFT);
+#endif
+       if (end_pfn > (end >> PAGE_SHIFT))
+               end_pfn = end >> PAGE_SHIFT;
+       if (start_pfn < end_pfn) {
+               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+               pos = end_pfn << PAGE_SHIFT;
+       }
+
+       /* big page (2M) range */
+       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+                        << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+       end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+       end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+                        << (PUD_SHIFT - PAGE_SHIFT);
+       if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+               end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+       if (start_pfn < end_pfn) {
+               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+                               page_size_mask & (1<<PG_LEVEL_2M));
+               pos = end_pfn << PAGE_SHIFT;
+       }
+
+#ifdef CONFIG_X86_64
+       /* big page (1G) range */
+       start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+                        << (PUD_SHIFT - PAGE_SHIFT);
+       end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+       if (start_pfn < end_pfn) {
+               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+                               page_size_mask &
+                                ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+               pos = end_pfn << PAGE_SHIFT;
+       }
+
+       /* tail is not big page (1G) alignment */
+       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+                        << (PMD_SHIFT - PAGE_SHIFT);
+       end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+       if (start_pfn < end_pfn) {
+               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+                               page_size_mask & (1<<PG_LEVEL_2M));
+               pos = end_pfn << PAGE_SHIFT;
+       }
+#endif
+
+       /* tail is not big page (2M) alignment */
+       start_pfn = pos>>PAGE_SHIFT;
+       end_pfn = end>>PAGE_SHIFT;
+       nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+       /* try to merge same page size and continuous */
+       for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+               unsigned long old_start;
+               if (mr[i].end != mr[i+1].start ||
+                   mr[i].page_size_mask != mr[i+1].page_size_mask)
+                       continue;
+               /* move it */
+               old_start = mr[i].start;
+               memmove(&mr[i], &mr[i+1],
+                       (nr_range - 1 - i) * sizeof(struct map_range));
+               mr[i--].start = old_start;
+               nr_range--;
+       }
+
+       for (i = 0; i < nr_range; i++)
+               printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+                               mr[i].start, mr[i].end,
+                       (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+                        (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+       /*
+        * Find space for the kernel direct mapping tables.
+        *
+        * Later we should allocate these tables in the local node of the
+        * memory mapped. Unfortunately this is done currently before the
+        * nodes are discovered.
+        */
+       if (!after_bootmem)
+               find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+       for (i = 0; i < nr_range; i++)
+               kernel_physical_mapping_init(mr[i].start, mr[i].end,
+                                            mr[i].page_size_mask);
+       ret = end;
+#else /* CONFIG_X86_64 */
+       for (i = 0; i < nr_range; i++)
+               ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+                                                  mr[i].page_size_mask);
+#endif
+
+#ifdef CONFIG_X86_32
+       early_ioremap_page_table_range_init();
+
+       load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
+       if (!after_bootmem)
+               mmu_cr4_features = read_cr4();
+#endif
+       __flush_tlb_all();
+
+       if (!after_bootmem && e820_table_end > e820_table_start)
+               reserve_early(e820_table_start << PAGE_SHIFT,
+                                e820_table_end << PAGE_SHIFT, "PGTABLE");
+
+       if (!after_bootmem)
+               early_memtest(start, end);
+
+       return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+       if (pagenr <= 256)
+               return 1;
+       if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+               return 0;
+       if (!page_is_ram(pagenr))
+               return 1;
+       return 0;
+}
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
@@ -47,3 +384,10 @@ void free_initmem(void)
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
 }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+       free_init_pages("initrd memory", start, end);
+}
+#endif
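
init_memory_mapping() above splits the requested physical range into a head that cannot use large pages, a large-page body, and a small-page tail (plus an additional 1G tier on 64-bit). A toy userspace illustration of the 2 MB split, using an assumed sample range:

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE (1ULL << 21)   /* one 2 MB large page */

int main(void)
{
        uint64_t start = 0x00100000;   /* assumed: 1 MB          */
        uint64_t end   = 0x07f65000;   /* assumed: ~127.4 MB     */

        /* Round the interior up/down to 2 MB boundaries. */
        uint64_t big_start = (start + PMD_SIZE - 1) & ~(PMD_SIZE - 1);
        uint64_t big_end   = end & ~(PMD_SIZE - 1);

        printf("head (4k pages): %#010llx - %#010llx\n",
               (unsigned long long)start, (unsigned long long)big_start);
        printf("body (2M pages): %#010llx - %#010llx\n",
               (unsigned long long)big_start, (unsigned long long)big_end);
        printf("tail (4k pages): %#010llx - %#010llx\n",
               (unsigned long long)big_end, (unsigned long long)end);
        return 0;
}
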
index 47df0e1bbeb9aca5a30c9b2d858431518263ad11..db81e9a8556b3b0bba854aeba5740bd0cc039039 100644 (file)
@@ -49,6 +49,7 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/init.h>
 
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
@@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
-
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
-static int __initdata after_init_bootmem;
+bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-       unsigned long pfn = table_end++;
+       unsigned long pfn = e820_table_end++;
        void *adr;
 
-       if (pfn >= table_top)
+       if (pfn >= e820_table_top)
                panic("alloc_low_page: ran out of memory");
 
        adr = __va(pfn * PAGE_SIZE);
@@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
        if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-               if (after_init_bootmem)
+               if (after_bootmem)
                        pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
                else
                        pmd_table = (pmd_t *)alloc_low_page();
@@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
        if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
                pte_t *page_table = NULL;
 
-               if (after_init_bootmem) {
+               if (after_bootmem) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
                        page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
@@ -168,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
        if (pmd_idx_kmap_begin != pmd_idx_kmap_end
            && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
            && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-           && ((__pa(pte) >> PAGE_SHIFT) < table_start
-               || (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+           && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+               || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
                pte_t *newpte;
                int i;
 
-               BUG_ON(after_init_bootmem);
+               BUG_ON(after_bootmem);
                newpte = alloc_low_page();
                for (i = 0; i < PTRS_PER_PTE; i++)
                        set_pte(newpte + i, pte[i]);
@@ -242,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
-                                               unsigned long start_pfn,
-                                               unsigned long end_pfn,
-                                               int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+                            unsigned long end,
+                            unsigned long page_size_mask)
 {
+       int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+       unsigned long start_pfn, end_pfn;
+       pgd_t *pgd_base = swapper_pg_dir;
        int pgd_idx, pmd_idx, pte_ofs;
        unsigned long pfn;
        pgd_t *pgd;
@@ -255,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
        unsigned pages_2m, pages_4k;
        int mapping_iter;
 
+       start_pfn = start >> PAGE_SHIFT;
+       end_pfn = end >> PAGE_SHIFT;
+
        /*
         * First iteration will setup identity mapping using large/small pages
         * based on use_pse, with other attributes same as set by
@@ -369,26 +371,6 @@ repeat:
                mapping_iter = 2;
                goto repeat;
        }
-}
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
-       if (pagenr <= 256)
-               return 1;
-       if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
-               return 0;
-       if (!page_is_ram(pagenr))
-               return 1;
        return 0;
 }
 
@@ -545,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+void __init early_ioremap_page_table_range_init(void)
 {
+       pgd_t *pgd_base = swapper_pg_dir;
        unsigned long vaddr, end;
 
        /*
@@ -641,7 +624,7 @@ static int __init noexec_setup(char *str)
 }
 early_param("noexec", noexec_setup);
 
-static void __init set_nx(void)
+void __init set_nx(void)
 {
        unsigned int v[4], l, h;
 
@@ -793,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
 #ifdef CONFIG_FLATMEM
        max_mapnr = num_physpages;
 #endif
+       __vmalloc_start_set = true;
+
        printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
                        pages_to_mb(max_low_pfn));
 
@@ -814,176 +799,66 @@ static void __init zone_sizes_init(void)
        free_area_init_nodes(max_zone_pfns);
 }
 
+static unsigned long __init setup_node_bootmem(int nodeid,
+                                unsigned long start_pfn,
+                                unsigned long end_pfn,
+                                unsigned long bootmap)
+{
+       unsigned long bootmap_size;
+
+       /* don't touch min_low_pfn */
+       bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+                                        bootmap >> PAGE_SHIFT,
+                                        start_pfn, end_pfn);
+       printk(KERN_INFO "  node %d low ram: %08lx - %08lx\n",
+               nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+       printk(KERN_INFO "  node %d bootmap %08lx - %08lx\n",
+                nodeid, bootmap, bootmap + bootmap_size);
+       free_bootmem_with_active_regions(nodeid, end_pfn);
+       early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+       return bootmap + bootmap_size;
+}
+
 void __init setup_bootmem_allocator(void)
 {
-       int i;
+       int nodeid;
        unsigned long bootmap_size, bootmap;
        /*
         * Initialize the boot-time allocator (with low memory only):
         */
        bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
-       bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
-                                max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+       bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
                                 PAGE_SIZE);
        if (bootmap == -1L)
                panic("Cannot find bootmem map of size %ld\n", bootmap_size);
        reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 
-       /* don't touch min_low_pfn */
-       bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
-                                        min_low_pfn, max_low_pfn);
        printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
                 max_pfn_mapped<<PAGE_SHIFT);
-       printk(KERN_INFO "  low ram: %08lx - %08lx\n",
-                min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
-       printk(KERN_INFO "  bootmap %08lx - %08lx\n",
-                bootmap, bootmap + bootmap_size);
-       for_each_online_node(i)
-               free_bootmem_with_active_regions(i, max_low_pfn);
-       early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-
-       after_init_bootmem = 1;
-}
-
-static void __init find_early_table_space(unsigned long end, int use_pse)
-{
-       unsigned long puds, pmds, ptes, tables, start;
+       printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
-       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-       tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+       for_each_online_node(nodeid) {
+                unsigned long start_pfn, end_pfn;
 
-       pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-       tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-       if (use_pse) {
-               unsigned long extra;
-
-               extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-               extra += PMD_SIZE;
-               ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       } else
-               ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
-       tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-       /* for fixmap */
-       tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-
-       /*
-        * RED-PEN putting page tables only on node 0 could
-        * cause a hotspot and fill up ZONE_DMA. The page tables
-        * need roughly 0.5KB per GB.
-        */
-       start = 0x7000;
-       table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-                                       tables, PAGE_SIZE);
-       if (table_start == -1UL)
-               panic("Cannot find space for the kernel page tables");
-
-       table_start >>= PAGE_SHIFT;
-       table_end = table_start;
-       table_top = table_start + (tables>>PAGE_SHIFT);
-
-       printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-               end, table_start << PAGE_SHIFT,
-               (table_start << PAGE_SHIFT) + tables);
-}
-
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-                                               unsigned long end)
-{
-       pgd_t *pgd_base = swapper_pg_dir;
-       unsigned long start_pfn, end_pfn;
-       unsigned long big_page_start;
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       /*
-        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-        * This will simplify cpa(), which otherwise needs to support splitting
-        * large pages into small in interrupt context, etc.
-        */
-       int use_pse = 0;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+               start_pfn = node_start_pfn[nodeid];
+               end_pfn = node_end_pfn[nodeid];
+               if (start_pfn > max_low_pfn)
+                       continue;
+               if (end_pfn > max_low_pfn)
+                       end_pfn = max_low_pfn;
 #else
-       int use_pse = cpu_has_pse;
+               start_pfn = 0;
+               end_pfn = max_low_pfn;
 #endif
-
-       /*
-        * Find space for the kernel direct mapping tables.
-        */
-       if (!after_init_bootmem)
-               find_early_table_space(end, use_pse);
-
-#ifdef CONFIG_X86_PAE
-       set_nx();
-       if (nx_enabled)
-               printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
-
-       /* Enable PSE if available */
-       if (cpu_has_pse)
-               set_in_cr4(X86_CR4_PSE);
-
-       /* Enable PGE if available */
-       if (cpu_has_pge) {
-               set_in_cr4(X86_CR4_PGE);
-               __supported_pte_mask |= _PAGE_GLOBAL;
-       }
-
-       /*
-        * Don't use a large page for the first 2/4MB of memory
-        * because there are often fixed size MTRRs in there
-        * and overlapping MTRRs into large pages can cause
-        * slowdowns.
-        */
-       big_page_start = PMD_SIZE;
-
-       if (start < big_page_start) {
-               start_pfn = start >> PAGE_SHIFT;
-               end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
-       } else {
-               /* head is not big page alignment ? */
-               start_pfn = start >> PAGE_SHIFT;
-               end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
-                                << (PMD_SHIFT - PAGE_SHIFT);
+               bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+                                                bootmap);
        }
-       if (start_pfn < end_pfn)
-               kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
-
-       /* big page range */
-       start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
-       if (start_pfn < (big_page_start >> PAGE_SHIFT))
-               start_pfn =  big_page_start >> PAGE_SHIFT;
-       end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-       if (start_pfn < end_pfn)
-               kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
-                                            use_pse);
-
-       /* tail is not big page alignment ? */
-       start_pfn = end_pfn;
-       if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
-               end_pfn = end >> PAGE_SHIFT;
-               if (start_pfn < end_pfn)
-                       kernel_physical_mapping_init(pgd_base, start_pfn,
-                                                        end_pfn, 0);
-       }
-
-       early_ioremap_page_table_range_init(pgd_base);
 
-       load_cr3(swapper_pg_dir);
-
-       __flush_tlb_all();
-
-       if (!after_init_bootmem)
-               reserve_early(table_start << PAGE_SHIFT,
-                                table_end << PAGE_SHIFT, "PGTABLE");
-
-       if (!after_init_bootmem)
-               early_memtest(start, end);
-
-       return end >> PAGE_SHIFT;
+       after_bootmem = 1;
 }
 
-
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
@@ -1217,13 +1092,6 @@ void mark_rodata_ro(void)
 }
 #endif
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_init_pages("initrd memory", start, end);
-}
-#endif
-
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
                                   int flags)
 {
index 07f44d491df16d9b392fdaa09e3bdf4143f3c236..54efa57d1c039b648d6eb77bef1878c10e9cdb0b 100644
@@ -48,6 +48,7 @@
 #include <asm/kdebug.h>
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
+#include <asm/init.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-                               = 1
-#endif
-;
-
 static int __init parse_direct_gbpages_off(char *arg)
 {
        direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-int after_bootmem;
-
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
 
 /*
  * noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
                return -EINVAL;
        if (!strncmp(str, "on", 2)) {
                __supported_pte_mask |= _PAGE_NX;
-               do_not_nx = 0;
+               disable_nx = 0;
        } else if (!strncmp(str, "off", 3)) {
-               do_not_nx = 1;
+               disable_nx = 1;
                __supported_pte_mask &= ~_PAGE_NX;
        }
        return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
        unsigned long efer;
 
        rdmsrl(MSR_EFER, efer);
-       if (!(efer & EFER_NX) || do_not_nx)
+       if (!(efer & EFER_NX) || disable_nx)
                __supported_pte_mask &= ~_PAGE_NX;
 }
 
@@ -325,13 +318,9 @@ void __init cleanup_highmap(void)
        }
 }
 
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-       unsigned long pfn = table_end++;
+       unsigned long pfn = e820_table_end++;
        void *adr;
 
        if (after_bootmem) {
@@ -341,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
                return adr;
        }
 
-       if (pfn >= table_top)
+       if (pfn >= e820_table_top)
                panic("alloc_low_page: ran out of memory");
 
        adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -581,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
        return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-static void __init find_early_table_space(unsigned long end, int use_pse,
-                                         int use_gbpages)
-{
-       unsigned long puds, pmds, ptes, tables, start;
-
-       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-       tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-       if (use_gbpages) {
-               unsigned long extra;
-               extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
-               pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-       } else
-               pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-       tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-       if (use_pse) {
-               unsigned long extra;
-               extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-               ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       } else
-               ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-       /*
-        * RED-PEN putting page tables only on node 0 could
-        * cause a hotspot and fill up ZONE_DMA. The page tables
-        * need roughly 0.5KB per GB.
-        */
-       start = 0x8000;
-       table_start = find_e820_area(start, end, tables, PAGE_SIZE);
-       if (table_start == -1UL)
-               panic("Cannot find space for the kernel page tables");
-
-       table_start >>= PAGE_SHIFT;
-       table_end = table_start;
-       table_top = table_start + (tables >> PAGE_SHIFT);
-
-       printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-               end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init init_gbpages(void)
-{
-       if (direct_gbpages && cpu_has_gbpages)
-               printk(KERN_INFO "Using GB pages for direct mapping\n");
-       else
-               direct_gbpages = 0;
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
-                                               unsigned long end,
-                                               unsigned long page_size_mask)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+                            unsigned long end,
+                            unsigned long page_size_mask)
 {
 
        unsigned long next, last_map_addr = end;
@@ -669,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
        return last_map_addr;
 }
 
-struct map_range {
-       unsigned long start;
-       unsigned long end;
-       unsigned page_size_mask;
-};
-
-#define NR_RANGE_MR 5
-
-static int save_mr(struct map_range *mr, int nr_range,
-                  unsigned long start_pfn, unsigned long end_pfn,
-                  unsigned long page_size_mask)
-{
-
-       if (start_pfn < end_pfn) {
-               if (nr_range >= NR_RANGE_MR)
-                       panic("run out of range for init_memory_mapping\n");
-               mr[nr_range].start = start_pfn<<PAGE_SHIFT;
-               mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
-               mr[nr_range].page_size_mask = page_size_mask;
-               nr_range++;
-       }
-
-       return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-                                              unsigned long end)
-{
-       unsigned long last_map_addr = 0;
-       unsigned long page_size_mask = 0;
-       unsigned long start_pfn, end_pfn;
-       unsigned long pos;
-
-       struct map_range mr[NR_RANGE_MR];
-       int nr_range, i;
-       int use_pse, use_gbpages;
-
-       printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
-       /*
-        * Find space for the kernel direct mapping tables.
-        *
-        * Later we should allocate these tables in the local node of the
-        * memory mapped. Unfortunately this is done currently before the
-        * nodes are discovered.
-        */
-       if (!after_bootmem)
-               init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       /*
-        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-        * This will simplify cpa(), which otherwise needs to support splitting
-        * large pages into small in interrupt context, etc.
-        */
-       use_pse = use_gbpages = 0;
-#else
-       use_pse = cpu_has_pse;
-       use_gbpages = direct_gbpages;
-#endif
-
-       if (use_gbpages)
-               page_size_mask |= 1 << PG_LEVEL_1G;
-       if (use_pse)
-               page_size_mask |= 1 << PG_LEVEL_2M;
-
-       memset(mr, 0, sizeof(mr));
-       nr_range = 0;
-
-       /* head if not big page alignment ?*/
-       start_pfn = start >> PAGE_SHIFT;
-       pos = start_pfn << PAGE_SHIFT;
-       end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-                       << (PMD_SHIFT - PAGE_SHIFT);
-       if (end_pfn > (end >> PAGE_SHIFT))
-               end_pfn = end >> PAGE_SHIFT;
-       if (start_pfn < end_pfn) {
-               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-               pos = end_pfn << PAGE_SHIFT;
-       }
-
-       /* big page (2M) range*/
-       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
-       end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-                        << (PUD_SHIFT - PAGE_SHIFT);
-       if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-               end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
-       if (start_pfn < end_pfn) {
-               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-                               page_size_mask & (1<<PG_LEVEL_2M));
-               pos = end_pfn << PAGE_SHIFT;
-       }
-
-       /* big page (1G) range */
-       start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-                        << (PUD_SHIFT - PAGE_SHIFT);
-       end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-       if (start_pfn < end_pfn) {
-               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-                               page_size_mask &
-                                ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-               pos = end_pfn << PAGE_SHIFT;
-       }
-
-       /* tail is not big page (1G) alignment */
-       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-                        << (PMD_SHIFT - PAGE_SHIFT);
-       end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-       if (start_pfn < end_pfn) {
-               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-                               page_size_mask & (1<<PG_LEVEL_2M));
-               pos = end_pfn << PAGE_SHIFT;
-       }
-
-       /* tail is not big page (2M) alignment */
-       start_pfn = pos>>PAGE_SHIFT;
-       end_pfn = end>>PAGE_SHIFT;
-       nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
-       /* try to merge same page size and continuous */
-       for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
-               unsigned long old_start;
-               if (mr[i].end != mr[i+1].start ||
-                   mr[i].page_size_mask != mr[i+1].page_size_mask)
-                       continue;
-               /* move it */
-               old_start = mr[i].start;
-               memmove(&mr[i], &mr[i+1],
-                        (nr_range - 1 - i) * sizeof (struct map_range));
-               mr[i--].start = old_start;
-               nr_range--;
-       }
-
-       for (i = 0; i < nr_range; i++)
-               printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-                               mr[i].start, mr[i].end,
-                       (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
-                        (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
-       if (!after_bootmem)
-               find_early_table_space(end, use_pse, use_gbpages);
-
-       for (i = 0; i < nr_range; i++)
-               last_map_addr = kernel_physical_mapping_init(
-                                       mr[i].start, mr[i].end,
-                                       mr[i].page_size_mask);
-
-       if (!after_bootmem)
-               mmu_cr4_features = read_cr4();
-       __flush_tlb_all();
-
-       if (!after_bootmem && table_end > table_start)
-               reserve_early(table_start << PAGE_SHIFT,
-                                table_end << PAGE_SHIFT, "PGTABLE");
-
-       printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
-                        last_map_addr, end);
-
-       if (!after_bootmem)
-               early_memtest(start, end);
-
-       return last_map_addr >> PAGE_SHIFT;
-}
-
 #ifndef CONFIG_NUMA
 void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -910,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
-       if (pagenr <= 256)
-               return 1;
-       if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
-               return 0;
-       if (!page_is_ram(pagenr))
-               return 1;
-       return 0;
-}
-
-
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
                         kcore_modules, kcore_vsyscall;
 
@@ -1019,13 +768,6 @@ void mark_rodata_ro(void)
 
 #endif
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_init_pages("initrd memory", start, end);
-}
-#endif
-
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
                                   int flags)
 {
index 04102d42ff4250873a519620958855e9b50770ff..699c9b2895ae8cc1225d14dd15eac4fe68384151 100644
@@ -31,16 +31,27 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size)
 }
 EXPORT_SYMBOL_GPL(is_io_mapping_possible);
 
-/* Map 'pfn' using fixed map 'type' and protections 'prot'
- */
-void *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 {
        enum fixed_addresses idx;
        unsigned long vaddr;
 
        pagefault_disable();
 
+       idx = type + KM_TYPE_NR * smp_processor_id();
+       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
+       arch_flush_lazy_mmu_mode();
+
+       return (void *)vaddr;
+}
+
+/*
+ * Map 'pfn' using fixed map 'type' and protections 'prot'
+ */
+void *
+iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
+{
        /*
         * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
         * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
@@ -50,12 +61,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
        if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
                prot = PAGE_KERNEL_UC_MINUS;
 
-       idx = type + KM_TYPE_NR*smp_processor_id();
-       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-       set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
-       arch_flush_lazy_mmu_mode();
-
-       return (void*) vaddr;
+       return kmap_atomic_prot_pfn(pfn, type, prot);
 }
 EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
 
index 433f7bd4648af4206fbc4afb0b2fd6b6dab59b6e..0dfa09d69e80236ee47a94d6de8e7f8b8f0663c3 100644
 #include <asm/pgalloc.h>
 #include <asm/pat.h>
 
-#ifdef CONFIG_X86_64
-
-static inline int phys_addr_valid(unsigned long addr)
+static inline int phys_addr_valid(resource_size_t addr)
 {
-       return addr < (1UL << boot_cpu_data.x86_phys_bits);
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+       return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+       return 1;
+#endif
 }
 
+#ifdef CONFIG_X86_64
+
 unsigned long __phys_addr(unsigned long x)
 {
        if (x >= __START_KERNEL_map) {
@@ -38,8 +42,7 @@ unsigned long __phys_addr(unsigned long x)
        } else {
                VIRTUAL_BUG_ON(x < PAGE_OFFSET);
                x -= PAGE_OFFSET;
-               VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
-                                       !phys_addr_valid(x));
+               VIRTUAL_BUG_ON(!phys_addr_valid(x));
        }
        return x;
 }
@@ -56,10 +59,8 @@ bool __virt_addr_valid(unsigned long x)
                if (x < PAGE_OFFSET)
                        return false;
                x -= PAGE_OFFSET;
-               if (system_state == SYSTEM_BOOTING ?
-                               x > MAXMEM : !phys_addr_valid(x)) {
+               if (!phys_addr_valid(x))
                        return false;
-               }
        }
 
        return pfn_valid(x >> PAGE_SHIFT);
@@ -68,18 +69,12 @@ EXPORT_SYMBOL(__virt_addr_valid);
 
 #else
 
-static inline int phys_addr_valid(unsigned long addr)
-{
-       return 1;
-}
-
 #ifdef CONFIG_DEBUG_VIRTUAL
 unsigned long __phys_addr(unsigned long x)
 {
-       /* VMALLOC_* aren't constants; not available at the boot time */
+       /* VMALLOC_* aren't constants  */
        VIRTUAL_BUG_ON(x < PAGE_OFFSET);
-       VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
-               is_vmalloc_addr((void *) x));
+       VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
        return x - PAGE_OFFSET;
 }
 EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +84,9 @@ bool __virt_addr_valid(unsigned long x)
 {
        if (x < PAGE_OFFSET)
                return false;
-       if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+       if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+               return false;
+       if (x >= FIXADDR_START)
                return false;
        return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
 }
@@ -508,13 +505,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
        return &bm_pte[pte_index(addr)];
 }
 
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
 void __init early_ioremap_init(void)
 {
        pmd_t *pmd;
+       int i;
 
        if (early_ioremap_debug)
                printk(KERN_INFO "early_ioremap_init()\n");
 
+       for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+               slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
        pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
        memset(bm_pte, 0, sizeof(bm_pte));
        pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -581,6 +584,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
 
 static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
 static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
 static int __init check_early_ioremap_leak(void)
 {
        int count = 0;
@@ -602,7 +606,8 @@ static int __init check_early_ioremap_leak(void)
 }
 late_initcall(check_early_ioremap_leak);
 
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
 {
        unsigned long offset, last_addr;
        unsigned int nrpages;
@@ -668,9 +673,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
                --nrpages;
        }
        if (early_ioremap_debug)
-               printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+               printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
 
-       prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+       prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
        return prev_map[slot];
 }
 
@@ -738,8 +743,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
        }
        prev_map[slot] = NULL;
 }
-
-void __this_fixmap_does_not_exist(void)
-{
-       WARN_ON(1);
-}
index 6a518dd08a36511661df2842815781c94a680fbd..4f115e00486bc1fe50fc8c1987298df3b8ac1266 100644
@@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
        struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
        if (!ctx->active) {
-               pr_warning("kmmio: spurious debug trap on CPU %d.\n",
+               pr_debug("kmmio: spurious debug trap on CPU %d.\n",
                                                        smp_processor_id());
                goto out;
        }
index 0bcd7883d036d68e1943f49529c3e96c7a70fd62..605c8be06217b0da36344abd5f23d06326bcd87a 100644
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
 {
        if (arg)
                memtest_pattern = simple_strtoul(arg, NULL, 0);
+       else
+               memtest_pattern = ARRAY_SIZE(patterns);
+
        return 0;
 }
 
index 451fe95a0352b3d63634231ea31c4eea016b2550..3daefa04ace535f693b07e5700e0470e17ca93b8 100644
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
        for_each_online_node(nid)
                propagate_e820_map_node(nid);
 
-       for_each_online_node(nid)
+       for_each_online_node(nid) {
                memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+               NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+       }
 
-       NODE_DATA(0)->bdata = &bootmem_node_data[0];
        setup_bootmem_allocator();
 }
 
index 9c4294986af779ed62ab088af0dae0f0048eb0c9..d71e1b636ce69167515c0a6bd3b47d3807b49175 100644
@@ -16,6 +16,7 @@
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/setup.h>
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/proto.h>
@@ -33,6 +34,7 @@ struct cpa_data {
        unsigned long   pfn;
        unsigned        force_split : 1;
        int             curpage;
+       struct page     **pages;
 };
 
 /*
@@ -45,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock);
 
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -95,7 +98,7 @@ static inline unsigned long highmap_start_pfn(void)
 
 static inline unsigned long highmap_end_pfn(void)
 {
-       return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
+       return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
 }
 
 #endif
@@ -201,10 +204,10 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
        }
 }
 
-static void cpa_flush_array(unsigned long *start, int numpages, int cache)
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+                           int in_flags, struct page **pages)
 {
        unsigned int i, level;
-       unsigned long *addr;
 
        BUG_ON(irqs_disabled());
 
@@ -225,14 +228,22 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache)
         * will cause all other CPUs to flush the same
         * cachelines:
         */
-       for (i = 0, addr = start; i < numpages; i++, addr++) {
-               pte_t *pte = lookup_address(*addr, &level);
+       for (i = 0; i < numpages; i++) {
+               unsigned long addr;
+               pte_t *pte;
+
+               if (in_flags & CPA_PAGES_ARRAY)
+                       addr = (unsigned long)page_address(pages[i]);
+               else
+                       addr = start[i];
+
+               pte = lookup_address(addr, &level);
 
                /*
                 * Only flush present addresses:
                 */
                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
-                       clflush_cache_range((void *) *addr, PAGE_SIZE);
+                       clflush_cache_range((void *)addr, PAGE_SIZE);
        }
 }
 
@@ -584,7 +595,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
        unsigned int level;
        pte_t *kpte, old_pte;
 
-       if (cpa->flags & CPA_ARRAY)
+       if (cpa->flags & CPA_PAGES_ARRAY)
+               address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+       else if (cpa->flags & CPA_ARRAY)
                address = cpa->vaddr[cpa->curpage];
        else
                address = *cpa->vaddr;
@@ -687,7 +700,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
         * No need to redo, when the primary call touched the direct
         * mapping already:
         */
-       if (cpa->flags & CPA_ARRAY)
+       if (cpa->flags & CPA_PAGES_ARRAY)
+               vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
+       else if (cpa->flags & CPA_ARRAY)
                vaddr = cpa->vaddr[cpa->curpage];
        else
                vaddr = *cpa->vaddr;
@@ -698,7 +713,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
                alias_cpa = *cpa;
                temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
                alias_cpa.vaddr = &temp_cpa_vaddr;
-               alias_cpa.flags &= ~CPA_ARRAY;
+               alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
 
 
                ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -711,7 +726,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
         * No need to redo, when the primary call touched the high
         * mapping already:
         */
-       if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
+       if (within(vaddr, (unsigned long) _text, _brk_end))
                return 0;
 
        /*
@@ -724,7 +739,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
        alias_cpa = *cpa;
        temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
        alias_cpa.vaddr = &temp_cpa_vaddr;
-       alias_cpa.flags &= ~CPA_ARRAY;
+       alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
 
        /*
         * The high mapping range is imprecise, so ignore the return value.
@@ -745,7 +760,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
                 */
                cpa->numpages = numpages;
                /* for array changes, we can't use large page */
-               if (cpa->flags & CPA_ARRAY)
+               if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
                        cpa->numpages = 1;
 
                if (!debug_pagealloc)
@@ -769,7 +784,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
                 */
                BUG_ON(cpa->numpages > numpages);
                numpages -= cpa->numpages;
-               if (cpa->flags & CPA_ARRAY)
+               if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
                        cpa->curpage++;
                else
                        *cpa->vaddr += cpa->numpages * PAGE_SIZE;
@@ -786,7 +801,8 @@ static inline int cache_attr(pgprot_t attr)
 
 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
                                    pgprot_t mask_set, pgprot_t mask_clr,
-                                   int force_split, int array)
+                                   int force_split, int in_flag,
+                                   struct page **pages)
 {
        struct cpa_data cpa;
        int ret, cache, checkalias;
@@ -801,15 +817,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
                return 0;
 
        /* Ensure we are PAGE_SIZE aligned */
-       if (!array) {
-               if (*addr & ~PAGE_MASK) {
-                       *addr &= PAGE_MASK;
-                       /*
-                        * People should not be passing in unaligned addresses:
-                        */
-                       WARN_ON_ONCE(1);
-               }
-       } else {
+       if (in_flag & CPA_ARRAY) {
                int i;
                for (i = 0; i < numpages; i++) {
                        if (addr[i] & ~PAGE_MASK) {
@@ -817,6 +825,18 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
                                WARN_ON_ONCE(1);
                        }
                }
+       } else if (!(in_flag & CPA_PAGES_ARRAY)) {
+               /*
+                * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+                * No need to check in that case
+                */
+               if (*addr & ~PAGE_MASK) {
+                       *addr &= PAGE_MASK;
+                       /*
+                        * People should not be passing in unaligned addresses:
+                        */
+                       WARN_ON_ONCE(1);
+               }
        }
 
        /* Must avoid aliasing mappings in the highmem code */
@@ -832,6 +852,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
        arch_flush_lazy_mmu_mode();
 
        cpa.vaddr = addr;
+       cpa.pages = pages;
        cpa.numpages = numpages;
        cpa.mask_set = mask_set;
        cpa.mask_clr = mask_clr;
@@ -839,8 +860,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
        cpa.curpage = 0;
        cpa.force_split = force_split;
 
-       if (array)
-               cpa.flags |= CPA_ARRAY;
+       if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+               cpa.flags |= in_flag;
 
        /* No alias checking for _NX bit modifications */
        checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -866,9 +887,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
         * wbindv):
         */
        if (!ret && cpu_has_clflush) {
-               if (cpa.flags & CPA_ARRAY)
-                       cpa_flush_array(addr, numpages, cache);
-               else
+               if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+                       cpa_flush_array(addr, numpages, cache,
+                                       cpa.flags, pages);
+               } else
                        cpa_flush_range(*addr, numpages, cache);
        } else
                cpa_flush_all(cache);
@@ -888,14 +910,28 @@ static inline int change_page_attr_set(unsigned long *addr, int numpages,
                                       pgprot_t mask, int array)
 {
        return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
-               array);
+               (array ? CPA_ARRAY : 0), NULL);
 }
 
 static inline int change_page_attr_clear(unsigned long *addr, int numpages,
                                         pgprot_t mask, int array)
 {
        return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
-               array);
+               (array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+                                      pgprot_t mask)
+{
+       return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+               CPA_PAGES_ARRAY, pages);
+}
+
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+                                        pgprot_t mask)
+{
+       return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+               CPA_PAGES_ARRAY, pages);
 }
 
 int _set_memory_uc(unsigned long addr, int numpages)
@@ -1043,7 +1079,7 @@ int set_memory_np(unsigned long addr, int numpages)
 int set_memory_4k(unsigned long addr, int numpages)
 {
        return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
-                                       __pgprot(0), 1, 0);
+                                       __pgprot(0), 1, 0, NULL);
 }
 
 int set_pages_uc(struct page *page, int numpages)
@@ -1054,6 +1090,35 @@ int set_pages_uc(struct page *page, int numpages)
 }
 EXPORT_SYMBOL(set_pages_uc);
 
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+       unsigned long start;
+       unsigned long end;
+       int i;
+       int free_idx;
+
+       for (i = 0; i < addrinarray; i++) {
+               start = (unsigned long)page_address(pages[i]);
+               end = start + PAGE_SIZE;
+               if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
+                       goto err_out;
+       }
+
+       if (cpa_set_pages_array(pages, addrinarray,
+                       __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) {
+               return 0; /* Success */
+       }
+err_out:
+       free_idx = i;
+       for (i = 0; i < free_idx; i++) {
+               start = (unsigned long)page_address(pages[i]);
+               end = start + PAGE_SIZE;
+               free_memtype(start, end);
+       }
+       return -EINVAL;
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
 int set_pages_wb(struct page *page, int numpages)
 {
        unsigned long addr = (unsigned long)page_address(page);
@@ -1062,6 +1127,26 @@ int set_pages_wb(struct page *page, int numpages)
 }
 EXPORT_SYMBOL(set_pages_wb);
 
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+       int retval;
+       unsigned long start;
+       unsigned long end;
+       int i;
+
+       retval = cpa_clear_pages_array(pages, addrinarray,
+                       __pgprot(_PAGE_CACHE_MASK));
+
+       for (i = 0; i < addrinarray; i++) {
+               start = (unsigned long)page_address(pages[i]);
+               end = start + PAGE_SIZE;
+               free_memtype(start, end);
+       }
+
+       return retval;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
 int set_pages_x(struct page *page, int numpages)
 {
        unsigned long addr = (unsigned long)page_address(page);
index 2ed37158012dc0017879730c9cd4d00e4a45d8cf..640339ee4fb2ae7ab79c02302cf26d0647966344 100644
@@ -677,10 +677,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
        is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 
        /*
-        * reserve_pfn_range() doesn't support RAM pages.
+        * reserve_pfn_range() doesn't support RAM pages. Maintain the current
+        * behavior with RAM pages by returning success.
         */
        if (is_ram != 0)
-               return -EINVAL;
+               return 0;
 
        ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
        if (ret)
index f2e477c91c1b85d72bb3d7ec0659e647c67c968c..46c8834aedc0fe3c16e5c88d70012f62f436a3c3 100644
@@ -50,7 +50,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
        }
        pte = pte_offset_kernel(pmd, vaddr);
        if (pte_val(pteval))
-               set_pte_present(&init_mm, vaddr, pte, pteval);
+               set_pte_at(&init_mm, vaddr, pte, pteval);
        else
                pte_clear(&init_mm, vaddr, pte);
 
index a654d59e448327e640d239698625afaf9944d34e..821e97017e954a8a8e74a607950c583061412d7a 100644
@@ -186,11 +186,6 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
        cpumask_andnot(to_cpumask(f->flush_cpumask),
                       cpumask, cpumask_of(smp_processor_id()));
 
-       /*
-        * Make the above memory operations globally visible before
-        * sending the IPI.
-        */
-       smp_mb();
        /*
         * We have to send the IPI only to
         * CPUs affected.
index 82d22fc601ae3acc0ebb83df9d116e95ec66ea96..8c362b96b644953f018b2001541714dbbfab0c29 100644
@@ -90,7 +90,7 @@ static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
        return 0;
 }
 
-static struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitdata = {
+static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = {
 /*
  * Systems where PCI IO resource ISA alignment can be skipped
  * when the ISA enable bit in the bridge control is not set
@@ -183,7 +183,7 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d)
 }
 #endif
 
-static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
+static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
 #ifdef __i386__
 /*
  * Laptops which need pci=assign-busses to see Cardbus cards
index 7d388d5cf54852136da06d0ca08b04c646fc7199..9c49919e4d1c08d6d098aa34feb8cd46fb97e623 100644
@@ -356,7 +356,7 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev)
 DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video);
 
 
-static struct dmi_system_id __devinitdata msi_k8t_dmi_table[] = {
+static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = {
        {
                .ident = "MSI-K8T-Neo2Fir",
                .matches = {
@@ -413,7 +413,7 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
  */
 static u16 toshiba_line_size;
 
-static struct dmi_system_id __devinitdata toshiba_ohci1394_dmi_table[] = {
+static const struct dmi_system_id __devinitconst toshiba_ohci1394_dmi_table[] = {
        {
                .ident = "Toshiba PS5 based laptop",
                .matches = {
index 5ead808dd70c3ec34dbd34da39600851b9c30619..f234a37bd428c73aa0e8623079bfbe5e06923948 100644
@@ -319,6 +319,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                        return -EINVAL;
                }
                flags = new_flags;
+               vma->vm_page_prot = __pgprot(
+                       (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK) |
+                       flags);
        }
 
        if (((vma->vm_pgoff < max_low_pfn_mapped) ||
index cb6afa4ec95c524ce8e95fa20b47b88a2f65dac9..db3802fb7b8470082aac4ba2884e6a588bda694d 100644
@@ -1723,9 +1723,9 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
 {
        pmd_t *kernel_pmd;
 
-       init_pg_tables_start = __pa(pgd);
-       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-       max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+       max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+                                 xen_start_info->nr_pt_frames * PAGE_SIZE +
+                                 512*1024);
 
        kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
        memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
@@ -1870,7 +1870,6 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 #ifdef CONFIG_X86_PAE
        .set_pte_atomic = xen_set_pte_atomic,
-       .set_pte_present = xen_set_pte_at,
        .pte_clear = xen_pte_clear,
        .pmd_clear = xen_pmd_clear,
 #endif /* CONFIG_X86_PAE */
index 5f333403c2ea0fdb4434942e555f3373871869ec..d313039e2fdf8179dbc3324e7778b7e2e6bed964 100644
@@ -31,6 +31,8 @@
 #include <linux/iova.h>
 #include <linux/intel-iommu.h>
 #include <linux/timer.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
 
 #undef PREFIX
 #define PREFIX "DMAR:"
@@ -509,6 +511,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
                return -ENOMEM;
 
        iommu->seq_id = iommu_allocated++;
+       sprintf (iommu->name, "dmar%d", iommu->seq_id);
 
        iommu->reg = ioremap(drhd->reg_base_addr, VTD_PAGE_SIZE);
        if (!iommu->reg) {
@@ -750,6 +753,42 @@ int qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
        return qi_submit_sync(&desc, iommu);
 }
 
+/*
+ * Disable Queued Invalidation interface.
+ */
+void dmar_disable_qi(struct intel_iommu *iommu)
+{
+       unsigned long flags;
+       u32 sts;
+       cycles_t start_time = get_cycles();
+
+       if (!ecap_qis(iommu->ecap))
+               return;
+
+       spin_lock_irqsave(&iommu->register_lock, flags);
+
+       sts =  dmar_readq(iommu->reg + DMAR_GSTS_REG);
+       if (!(sts & DMA_GSTS_QIES))
+               goto end;
+
+       /*
+        * Give a chance to HW to complete the pending invalidation requests.
+        */
+       while ((readl(iommu->reg + DMAR_IQT_REG) !=
+               readl(iommu->reg + DMAR_IQH_REG)) &&
+               (DMAR_OPERATION_TIMEOUT > (get_cycles() - start_time)))
+               cpu_relax();
+
+       iommu->gcmd &= ~DMA_GCMD_QIE;
+
+       writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+       IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl,
+                     !(sts & DMA_GSTS_QIES), sts);
+end:
+       spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
 /*
  * Enable Queued Invalidation interface. This is a must to support
  * interrupt-remapping. Also used by DMA-remapping, which replaces
@@ -770,20 +809,20 @@ int dmar_enable_qi(struct intel_iommu *iommu)
        if (iommu->qi)
                return 0;
 
-       iommu->qi = kmalloc(sizeof(*qi), GFP_KERNEL);
+       iommu->qi = kmalloc(sizeof(*qi), GFP_ATOMIC);
        if (!iommu->qi)
                return -ENOMEM;
 
        qi = iommu->qi;
 
-       qi->desc = (void *)(get_zeroed_page(GFP_KERNEL));
+       qi->desc = (void *)(get_zeroed_page(GFP_ATOMIC));
        if (!qi->desc) {
                kfree(qi);
                iommu->qi = 0;
                return -ENOMEM;
        }
 
-       qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_KERNEL);
+       qi->desc_status = kmalloc(QI_LENGTH * sizeof(int), GFP_ATOMIC);
        if (!qi->desc_status) {
                free_page((unsigned long) qi->desc);
                kfree(qi);
@@ -812,3 +851,254 @@ int dmar_enable_qi(struct intel_iommu *iommu)
 
        return 0;
 }
+
+/* iommu interrupt handling. Most of it is MSI-like. */
+
+enum faulttype {
+       DMA_REMAP,
+       INTR_REMAP,
+       UNKNOWN,
+};
+
+static const char *dma_remap_fault_reasons[] =
+{
+       "Software",
+       "Present bit in root entry is clear",
+       "Present bit in context entry is clear",
+       "Invalid context entry",
+       "Access beyond MGAW",
+       "PTE Write access is not set",
+       "PTE Read access is not set",
+       "Next page table ptr is invalid",
+       "Root table address invalid",
+       "Context table ptr is invalid",
+       "non-zero reserved fields in RTP",
+       "non-zero reserved fields in CTP",
+       "non-zero reserved fields in PTE",
+};
+
+static const char *intr_remap_fault_reasons[] =
+{
+       "Detected reserved fields in the decoded interrupt-remapped request",
+       "Interrupt index exceeded the interrupt-remapping table size",
+       "Present field in the IRTE entry is clear",
+       "Error accessing interrupt-remapping table pointed by IRTA_REG",
+       "Detected reserved fields in the IRTE entry",
+       "Blocked a compatibility format interrupt request",
+       "Blocked an interrupt request due to source-id verification failure",
+};
+
+#define MAX_FAULT_REASON_IDX   (ARRAY_SIZE(fault_reason_strings) - 1)
+
+const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
+{
+       if (fault_reason >= 0x20 && (fault_reason <= 0x20 +
+                                    ARRAY_SIZE(intr_remap_fault_reasons))) {
+               *fault_type = INTR_REMAP;
+               return intr_remap_fault_reasons[fault_reason - 0x20];
+       } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) {
+               *fault_type = DMA_REMAP;
+               return dma_remap_fault_reasons[fault_reason];
+       } else {
+               *fault_type = UNKNOWN;
+               return "Unknown";
+       }
+}
+
+void dmar_msi_unmask(unsigned int irq)
+{
+       struct intel_iommu *iommu = get_irq_data(irq);
+       unsigned long flag;
+
+       /* unmask it */
+       spin_lock_irqsave(&iommu->register_lock, flag);
+       writel(0, iommu->reg + DMAR_FECTL_REG);
+       /* Read a reg to force flush the post write */
+       readl(iommu->reg + DMAR_FECTL_REG);
+       spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_mask(unsigned int irq)
+{
+       unsigned long flag;
+       struct intel_iommu *iommu = get_irq_data(irq);
+
+       /* mask it */
+       spin_lock_irqsave(&iommu->register_lock, flag);
+       writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
+       /* Read a reg to force flush the post write */
+       readl(iommu->reg + DMAR_FECTL_REG);
+       spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_write(int irq, struct msi_msg *msg)
+{
+       struct intel_iommu *iommu = get_irq_data(irq);
+       unsigned long flag;
+
+       spin_lock_irqsave(&iommu->register_lock, flag);
+       writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
+       writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
+       writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
+       spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+void dmar_msi_read(int irq, struct msi_msg *msg)
+{
+       struct intel_iommu *iommu = get_irq_data(irq);
+       unsigned long flag;
+
+       spin_lock_irqsave(&iommu->register_lock, flag);
+       msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
+       msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
+       msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
+       spin_unlock_irqrestore(&iommu->register_lock, flag);
+}
+
+static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
+               u8 fault_reason, u16 source_id, unsigned long long addr)
+{
+       const char *reason;
+       int fault_type;
+
+       reason = dmar_get_fault_reason(fault_reason, &fault_type);
+
+       if (fault_type == INTR_REMAP)
+               printk(KERN_ERR "INTR-REMAP: Request device [[%02x:%02x.%d] "
+                      "fault index %llx\n"
+                       "INTR-REMAP:[fault reason %02d] %s\n",
+                       (source_id >> 8), PCI_SLOT(source_id & 0xFF),
+                       PCI_FUNC(source_id & 0xFF), addr >> 48,
+                       fault_reason, reason);
+       else
+               printk(KERN_ERR
+                      "DMAR:[%s] Request device [%02x:%02x.%d] "
+                      "fault addr %llx \n"
+                      "DMAR:[fault reason %02d] %s\n",
+                      (type ? "DMA Read" : "DMA Write"),
+                      (source_id >> 8), PCI_SLOT(source_id & 0xFF),
+                      PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
+       return 0;
+}
+
+#define PRIMARY_FAULT_REG_LEN (16)
+irqreturn_t dmar_fault(int irq, void *dev_id)
+{
+       struct intel_iommu *iommu = dev_id;
+       int reg, fault_index;
+       u32 fault_status;
+       unsigned long flag;
+
+       spin_lock_irqsave(&iommu->register_lock, flag);
+       fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+       if (fault_status)
+               printk(KERN_ERR "DRHD: handling fault status reg %x\n",
+                      fault_status);
+
+       /* TBD: ignore advanced fault log currently */
+       if (!(fault_status & DMA_FSTS_PPF))
+               goto clear_rest;
+
+       fault_index = dma_fsts_fault_record_index(fault_status);
+       reg = cap_fault_reg_offset(iommu->cap);
+       while (1) {
+               u8 fault_reason;
+               u16 source_id;
+               u64 guest_addr;
+               int type;
+               u32 data;
+
+               /* highest 32 bits */
+               data = readl(iommu->reg + reg +
+                               fault_index * PRIMARY_FAULT_REG_LEN + 12);
+               if (!(data & DMA_FRCD_F))
+                       break;
+
+               fault_reason = dma_frcd_fault_reason(data);
+               type = dma_frcd_type(data);
+
+               data = readl(iommu->reg + reg +
+                               fault_index * PRIMARY_FAULT_REG_LEN + 8);
+               source_id = dma_frcd_source_id(data);
+
+               guest_addr = dmar_readq(iommu->reg + reg +
+                               fault_index * PRIMARY_FAULT_REG_LEN);
+               guest_addr = dma_frcd_page_addr(guest_addr);
+               /* clear the fault */
+               writel(DMA_FRCD_F, iommu->reg + reg +
+                       fault_index * PRIMARY_FAULT_REG_LEN + 12);
+
+               spin_unlock_irqrestore(&iommu->register_lock, flag);
+
+               dmar_fault_do_one(iommu, type, fault_reason,
+                               source_id, guest_addr);
+
+               fault_index++;
+               if (fault_index >= cap_num_fault_regs(iommu->cap))
+                       fault_index = 0;
+               spin_lock_irqsave(&iommu->register_lock, flag);
+       }
+clear_rest:
+       /* clear all the other faults */
+       fault_status = readl(iommu->reg + DMAR_FSTS_REG);
+       writel(fault_status, iommu->reg + DMAR_FSTS_REG);
+
+       spin_unlock_irqrestore(&iommu->register_lock, flag);
+       return IRQ_HANDLED;
+}
+
+int dmar_set_interrupt(struct intel_iommu *iommu)
+{
+       int irq, ret;
+
+       /*
+        * Check if the fault interrupt is already initialized.
+        */
+       if (iommu->irq)
+               return 0;
+
+       irq = create_irq();
+       if (!irq) {
+               printk(KERN_ERR "IOMMU: no free vectors\n");
+               return -EINVAL;
+       }
+
+       set_irq_data(irq, iommu);
+       iommu->irq = irq;
+
+       ret = arch_setup_dmar_msi(irq);
+       if (ret) {
+               set_irq_data(irq, NULL);
+               iommu->irq = 0;
+               destroy_irq(irq);
+               return 0;
+       }
+
+       ret = request_irq(irq, dmar_fault, 0, iommu->name, iommu);
+       if (ret)
+               printk(KERN_ERR "IOMMU: can't request irq\n");
+       return ret;
+}
+
+int __init enable_drhd_fault_handling(void)
+{
+       struct dmar_drhd_unit *drhd;
+
+       /*
+        * Enable fault control interrupt.
+        */
+       for_each_drhd_unit(drhd) {
+               int ret;
+               struct intel_iommu *iommu = drhd->iommu;
+               ret = dmar_set_interrupt(iommu);
+
+               if (ret) {
+                       printk(KERN_ERR "DRHD %Lx: failed to enable fault, "
+                              " interrupt, ret %d\n",
+                              (unsigned long long)drhd->reg_base_addr, ret);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
index f3f686581a9026685dc3e6c994b00e88ed88486c..ef167b8b047da4844b57b8e3b9ae356d23eb057b 100644 (file)
@@ -1004,194 +1004,6 @@ static int iommu_disable_translation(struct intel_iommu *iommu)
        return 0;
 }
 
-/* iommu interrupt handling. Most stuff are MSI-like. */
-
-static const char *fault_reason_strings[] =
-{
-       "Software",
-       "Present bit in root entry is clear",
-       "Present bit in context entry is clear",
-       "Invalid context entry",
-       "Access beyond MGAW",
-       "PTE Write access is not set",
-       "PTE Read access is not set",
-       "Next page table ptr is invalid",
-       "Root table address invalid",
-       "Context table ptr is invalid",
-       "non-zero reserved fields in RTP",
-       "non-zero reserved fields in CTP",
-       "non-zero reserved fields in PTE",
-};
-#define MAX_FAULT_REASON_IDX   (ARRAY_SIZE(fault_reason_strings) - 1)
-
-const char *dmar_get_fault_reason(u8 fault_reason)
-{
-       if (fault_reason > MAX_FAULT_REASON_IDX)
-               return "Unknown";
-       else
-               return fault_reason_strings[fault_reason];
-}
-
-void dmar_msi_unmask(unsigned int irq)
-{
-       struct intel_iommu *iommu = get_irq_data(irq);
-       unsigned long flag;
-
-       /* unmask it */
-       spin_lock_irqsave(&iommu->register_lock, flag);
-       writel(0, iommu->reg + DMAR_FECTL_REG);
-       /* Read a reg to force flush the post write */
-       readl(iommu->reg + DMAR_FECTL_REG);
-       spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_mask(unsigned int irq)
-{
-       unsigned long flag;
-       struct intel_iommu *iommu = get_irq_data(irq);
-
-       /* mask it */
-       spin_lock_irqsave(&iommu->register_lock, flag);
-       writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
-       /* Read a reg to force flush the post write */
-       readl(iommu->reg + DMAR_FECTL_REG);
-       spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_write(int irq, struct msi_msg *msg)
-{
-       struct intel_iommu *iommu = get_irq_data(irq);
-       unsigned long flag;
-
-       spin_lock_irqsave(&iommu->register_lock, flag);
-       writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
-       writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
-       writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
-       spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-void dmar_msi_read(int irq, struct msi_msg *msg)
-{
-       struct intel_iommu *iommu = get_irq_data(irq);
-       unsigned long flag;
-
-       spin_lock_irqsave(&iommu->register_lock, flag);
-       msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
-       msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
-       msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
-       spin_unlock_irqrestore(&iommu->register_lock, flag);
-}
-
-static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
-               u8 fault_reason, u16 source_id, unsigned long long addr)
-{
-       const char *reason;
-
-       reason = dmar_get_fault_reason(fault_reason);
-
-       printk(KERN_ERR
-               "DMAR:[%s] Request device [%02x:%02x.%d] "
-               "fault addr %llx \n"
-               "DMAR:[fault reason %02d] %s\n",
-               (type ? "DMA Read" : "DMA Write"),
-               (source_id >> 8), PCI_SLOT(source_id & 0xFF),
-               PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
-       return 0;
-}
-
-#define PRIMARY_FAULT_REG_LEN (16)
-static irqreturn_t iommu_page_fault(int irq, void *dev_id)
-{
-       struct intel_iommu *iommu = dev_id;
-       int reg, fault_index;
-       u32 fault_status;
-       unsigned long flag;
-
-       spin_lock_irqsave(&iommu->register_lock, flag);
-       fault_status = readl(iommu->reg + DMAR_FSTS_REG);
-
-       /* TBD: ignore advanced fault log currently */
-       if (!(fault_status & DMA_FSTS_PPF))
-               goto clear_overflow;
-
-       fault_index = dma_fsts_fault_record_index(fault_status);
-       reg = cap_fault_reg_offset(iommu->cap);
-       while (1) {
-               u8 fault_reason;
-               u16 source_id;
-               u64 guest_addr;
-               int type;
-               u32 data;
-
-               /* highest 32 bits */
-               data = readl(iommu->reg + reg +
-                               fault_index * PRIMARY_FAULT_REG_LEN + 12);
-               if (!(data & DMA_FRCD_F))
-                       break;
-
-               fault_reason = dma_frcd_fault_reason(data);
-               type = dma_frcd_type(data);
-
-               data = readl(iommu->reg + reg +
-                               fault_index * PRIMARY_FAULT_REG_LEN + 8);
-               source_id = dma_frcd_source_id(data);
-
-               guest_addr = dmar_readq(iommu->reg + reg +
-                               fault_index * PRIMARY_FAULT_REG_LEN);
-               guest_addr = dma_frcd_page_addr(guest_addr);
-               /* clear the fault */
-               writel(DMA_FRCD_F, iommu->reg + reg +
-                       fault_index * PRIMARY_FAULT_REG_LEN + 12);
-
-               spin_unlock_irqrestore(&iommu->register_lock, flag);
-
-               iommu_page_fault_do_one(iommu, type, fault_reason,
-                               source_id, guest_addr);
-
-               fault_index++;
-               if (fault_index > cap_num_fault_regs(iommu->cap))
-                       fault_index = 0;
-               spin_lock_irqsave(&iommu->register_lock, flag);
-       }
-clear_overflow:
-       /* clear primary fault overflow */
-       fault_status = readl(iommu->reg + DMAR_FSTS_REG);
-       if (fault_status & DMA_FSTS_PFO)
-               writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
-
-       spin_unlock_irqrestore(&iommu->register_lock, flag);
-       return IRQ_HANDLED;
-}
-
-int dmar_set_interrupt(struct intel_iommu *iommu)
-{
-       int irq, ret;
-
-       irq = create_irq();
-       if (!irq) {
-               printk(KERN_ERR "IOMMU: no free vectors\n");
-               return -EINVAL;
-       }
-
-       set_irq_data(irq, iommu);
-       iommu->irq = irq;
-
-       ret = arch_setup_dmar_msi(irq);
-       if (ret) {
-               set_irq_data(irq, NULL);
-               iommu->irq = 0;
-               destroy_irq(irq);
-               return 0;
-       }
-
-       /* Force fault register is cleared */
-       iommu_page_fault(irq, iommu);
-
-       ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
-       if (ret)
-               printk(KERN_ERR "IOMMU: can't request irq\n");
-       return ret;
-}
 
 static int iommu_init_domains(struct intel_iommu *iommu)
 {
@@ -1987,7 +1799,7 @@ static int __init init_dmars(void)
        struct dmar_rmrr_unit *rmrr;
        struct pci_dev *pdev;
        struct intel_iommu *iommu;
-       int i, ret, unit = 0;
+       int i, ret;
 
        /*
         * for each drhd
@@ -2043,11 +1855,40 @@ static int __init init_dmars(void)
                }
        }
 
+       /*
+        * Start from a sane IOMMU hardware state.
+        */
+       for_each_drhd_unit(drhd) {
+               if (drhd->ignored)
+                       continue;
+
+               iommu = drhd->iommu;
+
+               /*
+                * If queued invalidation was already initialized by us
+                * (for example, while enabling interrupt remapping), then
+                * things are already rolling from a sane state.
+                */
+               if (iommu->qi)
+                       continue;
+
+               /*
+                * Clear any previous faults.
+                */
+               dmar_fault(-1, iommu);
+               /*
+                * Disable queued invalidation if supported and already enabled
+                * before OS handover.
+                */
+               dmar_disable_qi(iommu);
+       }
+
        for_each_drhd_unit(drhd) {
                if (drhd->ignored)
                        continue;
 
                iommu = drhd->iommu;
+
                if (dmar_enable_qi(iommu)) {
                        /*
                         * Queued Invalidate not enabled, use Register Based
@@ -2109,7 +1950,6 @@ static int __init init_dmars(void)
                if (drhd->ignored)
                        continue;
                iommu = drhd->iommu;
-               sprintf (iommu->name, "dmar%d", unit++);
 
                iommu_flush_write_buffer(iommu);
 
index 9d07a05d26f1125227fa8f37a73ac0c948ed32d2..b041a409f4a70f84e4a76ed4e0011c758aa31d71 100644 (file)
@@ -117,21 +117,22 @@ int get_irte(int irq, struct irte *entry)
 {
        int index;
        struct irq_2_iommu *irq_iommu;
+       unsigned long flags;
 
        if (!entry)
                return -1;
 
-       spin_lock(&irq_2_ir_lock);
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
        irq_iommu = valid_irq_2_iommu(irq);
        if (!irq_iommu) {
-               spin_unlock(&irq_2_ir_lock);
+               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                return -1;
        }
 
        index = irq_iommu->irte_index + irq_iommu->sub_handle;
        *entry = *(irq_iommu->iommu->ir_table->base + index);
 
-       spin_unlock(&irq_2_ir_lock);
+       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
        return 0;
 }
 
@@ -141,6 +142,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
        struct irq_2_iommu *irq_iommu;
        u16 index, start_index;
        unsigned int mask = 0;
+       unsigned long flags;
        int i;
 
        if (!count)
@@ -170,7 +172,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
                return -1;
        }
 
-       spin_lock(&irq_2_ir_lock);
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
        do {
                for (i = index; i < index + count; i++)
                        if  (table->base[i].present)
@@ -182,7 +184,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
                index = (index + count) % INTR_REMAP_TABLE_ENTRIES;
 
                if (index == start_index) {
-                       spin_unlock(&irq_2_ir_lock);
+                       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                        printk(KERN_ERR "can't allocate an IRTE\n");
                        return -1;
                }
@@ -193,7 +195,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
 
        irq_iommu = irq_2_iommu_alloc(irq);
        if (!irq_iommu) {
-               spin_unlock(&irq_2_ir_lock);
+               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                printk(KERN_ERR "can't allocate irq_2_iommu\n");
                return -1;
        }
@@ -203,7 +205,7 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
        irq_iommu->sub_handle = 0;
        irq_iommu->irte_mask = mask;
 
-       spin_unlock(&irq_2_ir_lock);
+       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
 
        return index;
 }
@@ -223,30 +225,32 @@ int map_irq_to_irte_handle(int irq, u16 *sub_handle)
 {
        int index;
        struct irq_2_iommu *irq_iommu;
+       unsigned long flags;
 
-       spin_lock(&irq_2_ir_lock);
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
        irq_iommu = valid_irq_2_iommu(irq);
        if (!irq_iommu) {
-               spin_unlock(&irq_2_ir_lock);
+               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                return -1;
        }
 
        *sub_handle = irq_iommu->sub_handle;
        index = irq_iommu->irte_index;
-       spin_unlock(&irq_2_ir_lock);
+       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
        return index;
 }
 
 int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
 {
        struct irq_2_iommu *irq_iommu;
+       unsigned long flags;
 
-       spin_lock(&irq_2_ir_lock);
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
 
        irq_iommu = irq_2_iommu_alloc(irq);
 
        if (!irq_iommu) {
-               spin_unlock(&irq_2_ir_lock);
+               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                printk(KERN_ERR "can't allocate irq_2_iommu\n");
                return -1;
        }
@@ -256,7 +260,7 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
        irq_iommu->sub_handle = subhandle;
        irq_iommu->irte_mask = 0;
 
-       spin_unlock(&irq_2_ir_lock);
+       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
 
        return 0;
 }
@@ -264,11 +268,12 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
 int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
 {
        struct irq_2_iommu *irq_iommu;
+       unsigned long flags;
 
-       spin_lock(&irq_2_ir_lock);
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
        irq_iommu = valid_irq_2_iommu(irq);
        if (!irq_iommu) {
-               spin_unlock(&irq_2_ir_lock);
+               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                return -1;
        }
 
@@ -277,7 +282,7 @@ int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
        irq_iommu->sub_handle = 0;
        irq_2_iommu(irq)->irte_mask = 0;
 
-       spin_unlock(&irq_2_ir_lock);
+       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
 
        return 0;
 }
@@ -289,11 +294,12 @@ int modify_irte(int irq, struct irte *irte_modified)
        struct irte *irte;
        struct intel_iommu *iommu;
        struct irq_2_iommu *irq_iommu;
+       unsigned long flags;
 
-       spin_lock(&irq_2_ir_lock);
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
        irq_iommu = valid_irq_2_iommu(irq);
        if (!irq_iommu) {
-               spin_unlock(&irq_2_ir_lock);
+               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                return -1;
        }
 
@@ -302,11 +308,11 @@ int modify_irte(int irq, struct irte *irte_modified)
        index = irq_iommu->irte_index + irq_iommu->sub_handle;
        irte = &iommu->ir_table->base[index];
 
-       set_64bit((unsigned long *)irte, irte_modified->low | (1 << 1));
+       set_64bit((unsigned long *)irte, irte_modified->low);
        __iommu_flush_cache(iommu, irte, sizeof(*irte));
 
        rc = qi_flush_iec(iommu, index, 0);
-       spin_unlock(&irq_2_ir_lock);
+       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
 
        return rc;
 }
@@ -317,11 +323,12 @@ int flush_irte(int irq)
        int index;
        struct intel_iommu *iommu;
        struct irq_2_iommu *irq_iommu;
+       unsigned long flags;
 
-       spin_lock(&irq_2_ir_lock);
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
        irq_iommu = valid_irq_2_iommu(irq);
        if (!irq_iommu) {
-               spin_unlock(&irq_2_ir_lock);
+               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                return -1;
        }
 
@@ -330,7 +337,7 @@ int flush_irte(int irq)
        index = irq_iommu->irte_index + irq_iommu->sub_handle;
 
        rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
-       spin_unlock(&irq_2_ir_lock);
+       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
 
        return rc;
 }
@@ -363,11 +370,12 @@ int free_irte(int irq)
        struct irte *irte;
        struct intel_iommu *iommu;
        struct irq_2_iommu *irq_iommu;
+       unsigned long flags;
 
-       spin_lock(&irq_2_ir_lock);
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
        irq_iommu = valid_irq_2_iommu(irq);
        if (!irq_iommu) {
-               spin_unlock(&irq_2_ir_lock);
+               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
                return -1;
        }
 
@@ -378,7 +386,7 @@ int free_irte(int irq)
 
        if (!irq_iommu->sub_handle) {
                for (i = 0; i < (1 << irq_iommu->irte_mask); i++)
-                       set_64bit((unsigned long *)irte, 0);
+                       set_64bit((unsigned long *)(irte + i), 0);
                rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
        }
 
@@ -387,7 +395,7 @@ int free_irte(int irq)
        irq_iommu->sub_handle = 0;
        irq_iommu->irte_mask = 0;
 
-       spin_unlock(&irq_2_ir_lock);
+       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
 
        return rc;
 }
@@ -439,12 +447,12 @@ static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
        struct page *pages;
 
        ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table),
-                                            GFP_KERNEL);
+                                            GFP_ATOMIC);
 
        if (!iommu->ir_table)
                return -ENOMEM;
 
-       pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, INTR_REMAP_PAGE_ORDER);
+       pages = alloc_pages(GFP_ATOMIC | __GFP_ZERO, INTR_REMAP_PAGE_ORDER);
 
        if (!pages) {
                printk(KERN_ERR "failed to allocate pages of order %d\n",
@@ -459,11 +467,55 @@ static int setup_intr_remapping(struct intel_iommu *iommu, int mode)
        return 0;
 }
 
+/*
+ * Disable Interrupt Remapping.
+ */
+static void disable_intr_remapping(struct intel_iommu *iommu)
+{
+       unsigned long flags;
+       u32 sts;
+
+       if (!ecap_ir_support(iommu->ecap))
+               return;
+
+       spin_lock_irqsave(&iommu->register_lock, flags);
+
+       sts = readl(iommu->reg + DMAR_GSTS_REG);
+       if (!(sts & DMA_GSTS_IRES))
+               goto end;
+
+       iommu->gcmd &= ~DMA_GCMD_IRE;
+       writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
+
+       IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
+                     readl, !(sts & DMA_GSTS_IRES), sts);
+
+end:
+       spin_unlock_irqrestore(&iommu->register_lock, flags);
+}
+
 int __init enable_intr_remapping(int eim)
 {
        struct dmar_drhd_unit *drhd;
        int setup = 0;
 
+       for_each_drhd_unit(drhd) {
+               struct intel_iommu *iommu = drhd->iommu;
+
+               /*
+                * Clear previous faults.
+                */
+               dmar_fault(-1, iommu);
+
+               /*
+                * Disable intr remapping and queued invalidation, if already
+                * enabled prior to OS handover.
+                */
+               disable_intr_remapping(iommu);
+
+               dmar_disable_qi(iommu);
+       }
+
        /*
         * check for the Interrupt-remapping support
         */
index f28440784cf0861eed96d57dada953f8d3113bae..2f34274689564e3bffb72739ac754081e4ad66f7 100644 (file)
 #include <linux/acpi.h>
 #include <linux/types.h>
 #include <linux/msi.h>
+#include <linux/irqreturn.h>
 
-#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP)
 struct intel_iommu;
-
+#if defined(CONFIG_DMAR) || defined(CONFIG_INTR_REMAP)
 struct dmar_drhd_unit {
        struct list_head list;          /* list of drhd units   */
        struct  acpi_dmar_header *hdr;  /* ACPI header          */
@@ -49,7 +49,7 @@ extern int dmar_dev_scope_init(void);
 
 /* Intel IOMMU detection */
 extern void detect_intel_iommu(void);
-
+extern int enable_drhd_fault_handling(void);
 
 extern int parse_ioapics_under_ir(void);
 extern int alloc_iommu(struct dmar_drhd_unit *);
@@ -63,12 +63,12 @@ static inline int dmar_table_init(void)
 {
        return -ENODEV;
 }
+static inline int enable_drhd_fault_handling(void)
+{
+       return -1;
+}
 #endif /* !CONFIG_DMAR && !CONFIG_INTR_REMAP */
 
-#ifdef CONFIG_INTR_REMAP
-extern int intr_remapping_enabled;
-extern int enable_intr_remapping(int);
-
 struct irte {
        union {
                struct {
@@ -97,6 +97,10 @@ struct irte {
                __u64 high;
        };
 };
+#ifdef CONFIG_INTR_REMAP
+extern int intr_remapping_enabled;
+extern int enable_intr_remapping(int);
+
 extern int get_irte(int irq, struct irte *entry);
 extern int modify_irte(int irq, struct irte *irte_modified);
 extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
@@ -111,14 +115,40 @@ extern int irq_remapped(int irq);
 extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
 extern struct intel_iommu *map_ioapic_to_ir(int apic);
 #else
+static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
+{
+       return -1;
+}
+static inline int modify_irte(int irq, struct irte *irte_modified)
+{
+       return -1;
+}
+static inline int free_irte(int irq)
+{
+       return -1;
+}
+static inline int map_irq_to_irte_handle(int irq, u16 *sub_handle)
+{
+       return -1;
+}
+static inline int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
+                              u16 sub_handle)
+{
+       return -1;
+}
+static inline struct intel_iommu *map_dev_to_ir(struct pci_dev *dev)
+{
+       return NULL;
+}
+static inline struct intel_iommu *map_ioapic_to_ir(int apic)
+{
+       return NULL;
+}
 #define irq_remapped(irq)              (0)
 #define enable_intr_remapping(mode)    (-1)
 #define intr_remapping_enabled         (0)
 #endif
 
-#ifdef CONFIG_DMAR
-extern const char *dmar_get_fault_reason(u8 fault_reason);
-
 /* Can't use the common MSI interrupt functions
  * since DMAR is not a pci device
  */
@@ -127,8 +157,10 @@ extern void dmar_msi_mask(unsigned int irq);
 extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);
+extern irqreturn_t dmar_fault(int irq, void *dev_id);
 extern int arch_setup_dmar_msi(unsigned int irq);
 
+#ifdef CONFIG_DMAR
 extern int iommu_detected, no_iommu;
 extern struct list_head dmar_rmrr_units;
 struct dmar_rmrr_unit {
index d2e3cbfba14f45ea30b34bc78e36735c5db2755d..78c1262e870473c57914092012607ef45369e3c2 100644 (file)
@@ -292,6 +292,8 @@ struct intel_iommu {
        spinlock_t      register_lock; /* protect register handling */
        int             seq_id; /* sequence id of the iommu */
        int             agaw; /* agaw of this iommu */
+       unsigned int    irq;
+       unsigned char   name[13];    /* Device Name */
 
 #ifdef CONFIG_DMAR
        unsigned long   *domain_ids; /* bitmap of domains */
@@ -299,8 +301,6 @@ struct intel_iommu {
        spinlock_t      lock; /* protect context, domain ids */
        struct root_entry *root_entry; /* virtual address */
 
-       unsigned int irq;
-       unsigned char name[7];    /* Device Name */
        struct iommu_flush flush;
 #endif
        struct q_inval  *qi;            /* Queued invalidation info */
@@ -321,6 +321,7 @@ extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
 extern int alloc_iommu(struct dmar_drhd_unit *drhd);
 extern void free_iommu(struct intel_iommu *iommu);
 extern int dmar_enable_qi(struct intel_iommu *iommu);
+extern void dmar_disable_qi(struct intel_iommu *iommu);
 extern void qi_global_iec(struct intel_iommu *iommu);
 
 extern int qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
index 065cdf8c09fb50fcce6ff67bcae848ac555c6b7a..b1ea37fc7a24598956b766e33654253f156e8125 100644 (file)
@@ -104,6 +104,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_CAN_NONLINEAR 0x08000000    /* Has ->fault & does nonlinear pages */
 #define VM_MIXEDMAP    0x10000000      /* Can contain "struct page" and pure PFN pages */
 #define VM_SAO         0x20000000      /* Strong Access Ordering (powerpc) */
+#define VM_PFN_AT_MMAP 0x40000000      /* PFNMAP vma that is fully mapped at mmap time */
 
 #ifndef VM_STACK_DEFAULT_FLAGS         /* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -145,7 +146,7 @@ extern pgprot_t protection_map[16];
  */
 static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
 {
-       return ((vma->vm_flags & VM_PFNMAP) && vma->vm_pgoff);
+       return (vma->vm_flags & VM_PFN_AT_MMAP);
 }
 
 static inline int is_pfn_mapping(struct vm_area_struct *vma)
index f4c413bdd38d790e10957e970edc02657c245670..2246591f3711f3717a2228b64995bebf1b81d38a 100644 (file)
@@ -10001,10 +10001,11 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
        cpu = task_cpu(tsk);
        ca = task_ca(tsk);
 
-       for (; ca; ca = ca->parent) {
+       do {
                u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
-       }
+               ca = ca->parent;
+       } while (ca);
 }
 
 struct cgroup_subsys cpuacct_subsys = {
index baa999e87cd21d2f9e67c8e3b1093bb6831513fb..2032ad2fc34b82742ad273a4327394e76a5b187e 100644 (file)
@@ -1665,9 +1665,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         */
-       if (addr == vma->vm_start && end == vma->vm_end)
+       if (addr == vma->vm_start && end == vma->vm_end) {
                vma->vm_pgoff = pfn;
-       else if (is_cow_mapping(vma->vm_flags))
+               vma->vm_flags |= VM_PFN_AT_MMAP;
+       } else if (is_cow_mapping(vma->vm_flags))
                return -EINVAL;
 
        vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
@@ -1679,6 +1680,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                 * needed from higher level routine calling unmap_vmas
                 */
                vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
+               vma->vm_flags &= ~VM_PFN_AT_MMAP;
                return -EINVAL;
        }