2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
43 #define ROOT_SIZE VTD_PAGE_SIZE
44 #define CONTEXT_SIZE VTD_PAGE_SIZE
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
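/*
 * pdev->class packs the base class, sub-class and prog-if into 24 bits,
 * so for example a VGA controller (class 0x030000) gives
 * (0x030000 >> 16) == PCI_BASE_CLASS_DISPLAY and matches IS_GFX_DEVICE().
 */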
49 #define IOAPIC_RANGE_START (0xfee00000)
50 #define IOAPIC_RANGE_END (0xfeefffff)
51 #define IOVA_START_ADDR (0x1000)
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
57 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
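/*
 * Worked examples for the macros above, assuming the usual 4KiB pages
 * (PAGE_SHIFT == 12): DOMAIN_MAX_ADDR(48) == 0xffffffffffff, and
 * DMA_32BIT_PFN == IOVA_PFN(0xffffffff) == 0xfffff, i.e. the last page
 * frame below 4GiB.
 */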
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
64 static int rwbf_quirk;
69 * 12-63: Context Ptr (12 - (haw-1))
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
77 static inline bool root_present(struct root_entry *root)
79 return (root->val & 1);
81 static inline void set_root_present(struct root_entry *root)
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
87 root->val |= value & VTD_PAGE_MASK;
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
93 return (struct context_entry *)
94 (root_present(root)?phys_to_virt(
95 root->val & VTD_PAGE_MASK) :
102 * 1: fault processing disable
103 * 2-3: translation type
104 * 12-63: address space root
110 struct context_entry {
115 static inline bool context_present(struct context_entry *context)
117 return (context->lo & 1);
119 static inline void context_set_present(struct context_entry *context)
124 static inline void context_set_fault_enable(struct context_entry *context)
126 context->lo &= (((u64)-1) << 2) | 1;
129 #define CONTEXT_TT_MULTI_LEVEL 0
131 static inline void context_set_translation_type(struct context_entry *context,
134 context->lo &= (((u64)-1) << 4) | 3;
135 context->lo |= (value & 3) << 2;
138 static inline void context_set_address_root(struct context_entry *context,
141 context->lo |= value & VTD_PAGE_MASK;
144 static inline void context_set_address_width(struct context_entry *context,
147 context->hi |= value & 7;
150 static inline void context_set_domain_id(struct context_entry *context,
153 context->hi |= (value & ((1 << 16) - 1)) << 8;
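/*
 * Putting the setters above together (a sketch, not text from the
 * original source): a fully programmed context entry ends up with
 * lo = address-space-root | (translation type << 2) | present, with
 * bit 1 cleared so fault processing stays enabled, and
 * hi = (domain id << 8) | address width.
 */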
156 static inline void context_clear_entry(struct context_entry *context)
169 * 12-63: Host physical address
175 static inline void dma_clear_pte(struct dma_pte *pte)
180 static inline void dma_set_pte_readable(struct dma_pte *pte)
182 pte->val |= DMA_PTE_READ;
185 static inline void dma_set_pte_writable(struct dma_pte *pte)
187 pte->val |= DMA_PTE_WRITE;
190 static inline void dma_set_pte_snp(struct dma_pte *pte)
192 pte->val |= DMA_PTE_SNP;
195 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
197 pte->val = (pte->val & ~3) | (prot & 3);
200 static inline u64 dma_pte_addr(struct dma_pte *pte)
202 return (pte->val & VTD_PAGE_MASK);
205 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
207 pte->val |= (addr & VTD_PAGE_MASK);
210 static inline bool dma_pte_present(struct dma_pte *pte)
212 return (pte->val & 3) != 0;
215 /* devices under the same p2p bridge are owned in one domain */
216 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
218 /* domain represents a virtual machine; more than one device
219 * across iommus may be owned by one domain, e.g. a kvm guest.
221 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
224 int id; /* domain id */
225 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
227 struct list_head devices; /* all devices' list */
228 struct iova_domain iovad; /* iova's that belong to this domain */
230 struct dma_pte *pgd; /* virtual address */
231 spinlock_t mapping_lock; /* page table lock */
232 int gaw; /* max guest address width */
234 /* adjusted guest address width, 0 is level 2 30-bit */
237 int flags; /* flags to find out type of domain */
239 int iommu_coherency;/* indicate coherency of iommu access */
240 int iommu_snooping; /* indicate snooping control feature*/
241 int iommu_count; /* reference count of iommu */
242 spinlock_t iommu_lock; /* protect iommu set in domain */
243 u64 max_addr; /* maximum mapped address */
246 /* PCI domain-device relationship */
247 struct device_domain_info {
248 struct list_head link; /* link to domain siblings */
249 struct list_head global; /* link to global list */
250 u8 bus; /* PCI bus number */
251 u8 devfn; /* PCI devfn number */
252 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
253 struct dmar_domain *domain; /* pointer to domain */
256 static void flush_unmaps_timeout(unsigned long data);
258 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
260 #define HIGH_WATER_MARK 250
261 struct deferred_flush_tables {
263 struct iova *iova[HIGH_WATER_MARK];
264 struct dmar_domain *domain[HIGH_WATER_MARK];
267 static struct deferred_flush_tables *deferred_flush;
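/*
 * These tables implement batched unmapping: rather than flushing the
 * IOTLB on every unmap, up to HIGH_WATER_MARK iova/domain pairs are
 * queued per iommu (see add_unmap() below) and released together from
 * flush_unmaps(), either when the timer fires or when the list fills up.
 */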
269 /* number of intel_iommus, used to size the per-iommu arrays */
270 static int g_num_of_iommus;
272 static DEFINE_SPINLOCK(async_umap_flush_lock);
273 static LIST_HEAD(unmaps_to_do);
276 static long list_size;
278 static void domain_remove_dev_info(struct dmar_domain *domain);
280 #ifdef CONFIG_DMAR_DEFAULT_ON
281 int dmar_disabled = 0;
283 int dmar_disabled = 1;
284 #endif /*CONFIG_DMAR_DEFAULT_ON*/
286 static int __initdata dmar_map_gfx = 1;
287 static int dmar_forcedac;
288 static int intel_iommu_strict;
290 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
291 static DEFINE_SPINLOCK(device_domain_lock);
292 static LIST_HEAD(device_domain_list);
294 static struct iommu_ops intel_iommu_ops;
296 static int __init intel_iommu_setup(char *str)
301 if (!strncmp(str, "on", 2)) {
303 printk(KERN_INFO "Intel-IOMMU: enabled\n");
304 } else if (!strncmp(str, "off", 3)) {
306 printk(KERN_INFO "Intel-IOMMU: disabled\n");
307 } else if (!strncmp(str, "igfx_off", 8)) {
310 "Intel-IOMMU: disable GFX device mapping\n");
311 } else if (!strncmp(str, "forcedac", 8)) {
313 "Intel-IOMMU: Forcing DAC for PCI devices\n");
315 } else if (!strncmp(str, "strict", 6)) {
317 "Intel-IOMMU: disable batched IOTLB flush\n");
318 intel_iommu_strict = 1;
321 str += strcspn(str, ",");
327 __setup("intel_iommu=", intel_iommu_setup);
329 static struct kmem_cache *iommu_domain_cache;
330 static struct kmem_cache *iommu_devinfo_cache;
331 static struct kmem_cache *iommu_iova_cache;
333 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
338 /* trying to avoid low memory issues */
339 flags = current->flags & PF_MEMALLOC;
340 current->flags |= PF_MEMALLOC;
341 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
342 current->flags &= (~PF_MEMALLOC | flags);
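/*
 * The line above restores PF_MEMALLOC to its saved state: if the flag
 * was already set, (~PF_MEMALLOC | flags) is all ones and the AND
 * changes nothing; if it was clear, the AND clears it again.
 */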
347 static inline void *alloc_pgtable_page(void)
352 /* trying to avoid low memory issues */
353 flags = current->flags & PF_MEMALLOC;
354 current->flags |= PF_MEMALLOC;
355 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
356 current->flags &= (~PF_MEMALLOC | flags);
360 static inline void free_pgtable_page(void *vaddr)
362 free_page((unsigned long)vaddr);
365 static inline void *alloc_domain_mem(void)
367 return iommu_kmem_cache_alloc(iommu_domain_cache);
370 static void free_domain_mem(void *vaddr)
372 kmem_cache_free(iommu_domain_cache, vaddr);
375 static inline void * alloc_devinfo_mem(void)
377 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
380 static inline void free_devinfo_mem(void *vaddr)
382 kmem_cache_free(iommu_devinfo_cache, vaddr);
385 struct iova *alloc_iova_mem(void)
387 return iommu_kmem_cache_alloc(iommu_iova_cache);
390 void free_iova_mem(struct iova *iova)
392 kmem_cache_free(iommu_iova_cache, iova);
396 static inline int width_to_agaw(int width);
398 /* calculate agaw for each iommu.
399 * "SAGAW" may be different across iommus, use a default agaw, and
400 * get a supported less agaw for iommus that don't support the default agaw.
402 int iommu_calculate_agaw(struct intel_iommu *iommu)
407 sagaw = cap_sagaw(iommu->cap);
408 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
410 if (test_bit(agaw, &sagaw))
417 /* in the native case, each domain is related to only one iommu */
418 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
422 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
424 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
425 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
428 return g_iommus[iommu_id];
431 static void domain_update_iommu_coherency(struct dmar_domain *domain)
435 domain->iommu_coherency = 1;
437 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
438 for (; i < g_num_of_iommus; ) {
439 if (!ecap_coherent(g_iommus[i]->ecap)) {
440 domain->iommu_coherency = 0;
443 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
447 static void domain_update_iommu_snooping(struct dmar_domain *domain)
451 domain->iommu_snooping = 1;
453 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
454 for (; i < g_num_of_iommus; ) {
455 if (!ecap_sc_support(g_iommus[i]->ecap)) {
456 domain->iommu_snooping = 0;
459 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
463 /* Some capabilities may be different across iommus */
464 static void domain_update_iommu_cap(struct dmar_domain *domain)
466 domain_update_iommu_coherency(domain);
467 domain_update_iommu_snooping(domain);
470 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
472 struct dmar_drhd_unit *drhd = NULL;
475 for_each_drhd_unit(drhd) {
479 for (i = 0; i < drhd->devices_cnt; i++)
480 if (drhd->devices[i] &&
481 drhd->devices[i]->bus->number == bus &&
482 drhd->devices[i]->devfn == devfn)
485 if (drhd->include_all)
492 static void domain_flush_cache(struct dmar_domain *domain,
493 void *addr, int size)
495 if (!domain->iommu_coherency)
496 clflush_cache_range(addr, size);
499 /* Gets context entry for a given bus and devfn */
500 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
503 struct root_entry *root;
504 struct context_entry *context;
505 unsigned long phy_addr;
508 spin_lock_irqsave(&iommu->lock, flags);
509 root = &iommu->root_entry[bus];
510 context = get_context_addr_from_root(root);
512 context = (struct context_entry *)alloc_pgtable_page();
514 spin_unlock_irqrestore(&iommu->lock, flags);
517 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
518 phy_addr = virt_to_phys((void *)context);
519 set_root_value(root, phy_addr);
520 set_root_present(root);
521 __iommu_flush_cache(iommu, root, sizeof(*root));
523 spin_unlock_irqrestore(&iommu->lock, flags);
524 return &context[devfn];
527 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
529 struct root_entry *root;
530 struct context_entry *context;
534 spin_lock_irqsave(&iommu->lock, flags);
535 root = &iommu->root_entry[bus];
536 context = get_context_addr_from_root(root);
541 ret = context_present(&context[devfn]);
543 spin_unlock_irqrestore(&iommu->lock, flags);
547 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
549 struct root_entry *root;
550 struct context_entry *context;
553 spin_lock_irqsave(&iommu->lock, flags);
554 root = &iommu->root_entry[bus];
555 context = get_context_addr_from_root(root);
557 context_clear_entry(&context[devfn]);
558 __iommu_flush_cache(iommu, &context[devfn], \
561 spin_unlock_irqrestore(&iommu->lock, flags);
564 static void free_context_table(struct intel_iommu *iommu)
566 struct root_entry *root;
569 struct context_entry *context;
571 spin_lock_irqsave(&iommu->lock, flags);
572 if (!iommu->root_entry) {
575 for (i = 0; i < ROOT_ENTRY_NR; i++) {
576 root = &iommu->root_entry[i];
577 context = get_context_addr_from_root(root);
579 free_pgtable_page(context);
581 free_pgtable_page(iommu->root_entry);
582 iommu->root_entry = NULL;
584 spin_unlock_irqrestore(&iommu->lock, flags);
587 /* page table handling */
588 #define LEVEL_STRIDE (9)
589 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
591 static inline int agaw_to_level(int agaw)
596 static inline int agaw_to_width(int agaw)
598 return 30 + agaw * LEVEL_STRIDE;
602 static inline int width_to_agaw(int width)
604 return (width - 30) / LEVEL_STRIDE;
607 static inline unsigned int level_to_offset_bits(int level)
609 return (12 + (level - 1) * LEVEL_STRIDE);
612 static inline int address_level_offset(u64 addr, int level)
614 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
617 static inline u64 level_mask(int level)
619 return ((u64)-1 << level_to_offset_bits(level));
622 static inline u64 level_size(int level)
624 return ((u64)1 << level_to_offset_bits(level));
627 static inline u64 align_to_level(u64 addr, int level)
629 return ((addr + level_size(level) - 1) & level_mask(level));
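/*
 * Worked example for the conversions above (agaw_to_level() is elided
 * here; in the mainline driver it is believed to return agaw + 2): a
 * 48-bit adjusted width gives agaw = (48 - 30) / 9 = 2, i.e. a 4-level
 * page table; level_to_offset_bits(1) = 12, level_to_offset_bits(2) = 21,
 * level_size(2) = 2MiB, so align_to_level(addr, 2) rounds addr up to the
 * next 2MiB boundary.
 */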
632 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
634 int addr_width = agaw_to_width(domain->agaw);
635 struct dma_pte *parent, *pte = NULL;
636 int level = agaw_to_level(domain->agaw);
640 BUG_ON(!domain->pgd);
642 addr &= (((u64)1) << addr_width) - 1;
643 parent = domain->pgd;
645 spin_lock_irqsave(&domain->mapping_lock, flags);
649 offset = address_level_offset(addr, level);
650 pte = &parent[offset];
654 if (!dma_pte_present(pte)) {
655 tmp_page = alloc_pgtable_page();
658 spin_unlock_irqrestore(&domain->mapping_lock,
662 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
663 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
665 * higher level tables always set r/w; the last level page
666 * table controls read/write
668 dma_set_pte_readable(pte);
669 dma_set_pte_writable(pte);
670 domain_flush_cache(domain, pte, sizeof(*pte));
672 parent = phys_to_virt(dma_pte_addr(pte));
676 spin_unlock_irqrestore(&domain->mapping_lock, flags);
680 /* return address's pte at specific level */
681 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
684 struct dma_pte *parent, *pte = NULL;
685 int total = agaw_to_level(domain->agaw);
688 parent = domain->pgd;
689 while (level <= total) {
690 offset = address_level_offset(addr, total);
691 pte = &parent[offset];
695 if (!dma_pte_present(pte))
697 parent = phys_to_virt(dma_pte_addr(pte));
703 /* clear one page's page table */
704 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
706 struct dma_pte *pte = NULL;
708 /* get last level pte */
709 pte = dma_addr_level_pte(domain, addr, 1);
713 domain_flush_cache(domain, pte, sizeof(*pte));
717 /* clear last level pte; a tlb flush should follow */
718 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
720 int addr_width = agaw_to_width(domain->agaw);
722 start &= (((u64)1) << addr_width) - 1;
723 end &= (((u64)1) << addr_width) - 1;
724 /* in case it's a partial page */
725 start = PAGE_ALIGN(start);
728 /* we don't need lock here, nobody else touches the iova range */
729 while (start < end) {
730 dma_pte_clear_one(domain, start);
731 start += VTD_PAGE_SIZE;
735 /* free page table pages. last level pte should already be cleared */
736 static void dma_pte_free_pagetable(struct dmar_domain *domain,
739 int addr_width = agaw_to_width(domain->agaw);
741 int total = agaw_to_level(domain->agaw);
745 start &= (((u64)1) << addr_width) - 1;
746 end &= (((u64)1) << addr_width) - 1;
748 /* we don't need lock here, nobody else touches the iova range */
750 while (level <= total) {
751 tmp = align_to_level(start, level);
752 if (tmp >= end || (tmp + level_size(level) > end))
756 pte = dma_addr_level_pte(domain, tmp, level);
759 phys_to_virt(dma_pte_addr(pte)));
761 domain_flush_cache(domain, pte, sizeof(*pte));
763 tmp += level_size(level);
768 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
769 free_pgtable_page(domain->pgd);
775 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
777 struct root_entry *root;
780 root = (struct root_entry *)alloc_pgtable_page();
784 __iommu_flush_cache(iommu, root, ROOT_SIZE);
786 spin_lock_irqsave(&iommu->lock, flags);
787 iommu->root_entry = root;
788 spin_unlock_irqrestore(&iommu->lock, flags);
793 static void iommu_set_root_entry(struct intel_iommu *iommu)
799 addr = iommu->root_entry;
801 spin_lock_irqsave(&iommu->register_lock, flag);
802 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
804 cmd = iommu->gcmd | DMA_GCMD_SRTP;
805 writel(cmd, iommu->reg + DMAR_GCMD_REG);
807 /* Make sure hardware completes it */
808 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
809 readl, (sts & DMA_GSTS_RTPS), sts);
811 spin_unlock_irqrestore(&iommu->register_lock, flag);
814 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
819 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
821 val = iommu->gcmd | DMA_GCMD_WBF;
823 spin_lock_irqsave(&iommu->register_lock, flag);
824 writel(val, iommu->reg + DMAR_GCMD_REG);
826 /* Make sure hardware completes it */
827 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
828 readl, (!(val & DMA_GSTS_WBFS)), val);
830 spin_unlock_irqrestore(&iommu->register_lock, flag);
833 /* return value determines whether we need a write buffer flush */
834 static int __iommu_flush_context(struct intel_iommu *iommu,
835 u16 did, u16 source_id, u8 function_mask, u64 type,
836 int non_present_entry_flush)
842 * In the non-present entry flush case, if hardware doesn't cache
843 * non-present entries we do nothing, and if hardware does cache non-present
844 * entries, we flush entries of domain 0 (that domain id is used to cache
845 * any non-present entries)
847 if (non_present_entry_flush) {
848 if (!cap_caching_mode(iommu->cap))
855 case DMA_CCMD_GLOBAL_INVL:
856 val = DMA_CCMD_GLOBAL_INVL;
858 case DMA_CCMD_DOMAIN_INVL:
859 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
861 case DMA_CCMD_DEVICE_INVL:
862 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
863 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
870 spin_lock_irqsave(&iommu->register_lock, flag);
871 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
873 /* Make sure hardware completes it */
874 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
875 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
877 spin_unlock_irqrestore(&iommu->register_lock, flag);
879 /* flush context entry will implicitly flush write buffer */
883 /* return value determines whether we need a write buffer flush */
884 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
885 u64 addr, unsigned int size_order, u64 type,
886 int non_present_entry_flush)
888 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
889 u64 val = 0, val_iva = 0;
893 * In the non-present entry flush case, if hardware doesn't cache
894 * non-present entries we do nothing, and if hardware does cache non-present
895 * entries, we flush entries of domain 0 (that domain id is used to cache
896 * any non-present entries)
898 if (non_present_entry_flush) {
899 if (!cap_caching_mode(iommu->cap))
906 case DMA_TLB_GLOBAL_FLUSH:
907 /* global flush doesn't need to set IVA_REG */
908 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
910 case DMA_TLB_DSI_FLUSH:
911 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
913 case DMA_TLB_PSI_FLUSH:
914 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
915 /* Note: always flush non-leaf currently */
916 val_iva = size_order | addr;
921 /* Note: set drain read/write */
924 * This is probably only here to be extra safe; it looks like we could
925 * ignore it without any impact.
927 if (cap_read_drain(iommu->cap))
928 val |= DMA_TLB_READ_DRAIN;
930 if (cap_write_drain(iommu->cap))
931 val |= DMA_TLB_WRITE_DRAIN;
933 spin_lock_irqsave(&iommu->register_lock, flag);
934 /* Note: Only uses first TLB reg currently */
936 dmar_writeq(iommu->reg + tlb_offset, val_iva);
937 dmar_writeq(iommu->reg + tlb_offset + 8, val);
939 /* Make sure hardware completes it */
940 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
941 dmar_readq, (!(val & DMA_TLB_IVT)), val);
943 spin_unlock_irqrestore(&iommu->register_lock, flag);
945 /* check IOTLB invalidation granularity */
946 if (DMA_TLB_IAIG(val) == 0)
947 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
948 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
949 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
950 (unsigned long long)DMA_TLB_IIRG(type),
951 (unsigned long long)DMA_TLB_IAIG(val));
952 /* flush iotlb entry will implicitly flush write buffer */
956 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
957 u64 addr, unsigned int pages, int non_present_entry_flush)
961 BUG_ON(addr & (~VTD_PAGE_MASK));
964 /* Fallback to domain selective flush if no PSI support */
965 if (!cap_pgsel_inv(iommu->cap))
966 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
968 non_present_entry_flush);
971 * PSI requires the number of pages to be a power of two, with the base
972 * address naturally aligned to that size
974 mask = ilog2(__roundup_pow_of_two(pages));
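/*
 * e.g. a request to flush 3 pages is rounded up to 4, giving mask = 2,
 * and hardware invalidates a 16KiB-aligned, 4-page region containing addr.
 */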
975 /* Fallback to domain selective flush if size is too big */
976 if (mask > cap_max_amask_val(iommu->cap))
977 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
978 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
980 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
982 non_present_entry_flush);
985 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
990 spin_lock_irqsave(&iommu->register_lock, flags);
991 pmen = readl(iommu->reg + DMAR_PMEN_REG);
992 pmen &= ~DMA_PMEN_EPM;
993 writel(pmen, iommu->reg + DMAR_PMEN_REG);
995 /* wait for the protected region status bit to clear */
996 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
997 readl, !(pmen & DMA_PMEN_PRS), pmen);
999 spin_unlock_irqrestore(&iommu->register_lock, flags);
1002 static int iommu_enable_translation(struct intel_iommu *iommu)
1005 unsigned long flags;
1007 spin_lock_irqsave(&iommu->register_lock, flags);
1008 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1010 /* Make sure hardware completes it */
1011 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1012 readl, (sts & DMA_GSTS_TES), sts);
1014 iommu->gcmd |= DMA_GCMD_TE;
1015 spin_unlock_irqrestore(&iommu->register_lock, flags);
1019 static int iommu_disable_translation(struct intel_iommu *iommu)
1024 spin_lock_irqsave(&iommu->register_lock, flag);
1025 iommu->gcmd &= ~DMA_GCMD_TE;
1026 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1028 /* Make sure hardware completes it */
1029 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1030 readl, (!(sts & DMA_GSTS_TES)), sts);
1032 spin_unlock_irqrestore(&iommu->register_lock, flag);
1036 /* iommu interrupt handling. Most of it is MSI-like. */
1038 static const char *fault_reason_strings[] =
1041 "Present bit in root entry is clear",
1042 "Present bit in context entry is clear",
1043 "Invalid context entry",
1044 "Access beyond MGAW",
1045 "PTE Write access is not set",
1046 "PTE Read access is not set",
1047 "Next page table ptr is invalid",
1048 "Root table address invalid",
1049 "Context table ptr is invalid",
1050 "non-zero reserved fields in RTP",
1051 "non-zero reserved fields in CTP",
1052 "non-zero reserved fields in PTE",
1054 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1056 const char *dmar_get_fault_reason(u8 fault_reason)
1058 if (fault_reason > MAX_FAULT_REASON_IDX)
1061 return fault_reason_strings[fault_reason];
1064 void dmar_msi_unmask(unsigned int irq)
1066 struct intel_iommu *iommu = get_irq_data(irq);
1070 spin_lock_irqsave(&iommu->register_lock, flag);
1071 writel(0, iommu->reg + DMAR_FECTL_REG);
1072 /* Read a reg to force flush the post write */
1073 readl(iommu->reg + DMAR_FECTL_REG);
1074 spin_unlock_irqrestore(&iommu->register_lock, flag);
1077 void dmar_msi_mask(unsigned int irq)
1080 struct intel_iommu *iommu = get_irq_data(irq);
1083 spin_lock_irqsave(&iommu->register_lock, flag);
1084 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1085 /* Read a reg to force flush the post write */
1086 readl(iommu->reg + DMAR_FECTL_REG);
1087 spin_unlock_irqrestore(&iommu->register_lock, flag);
1090 void dmar_msi_write(int irq, struct msi_msg *msg)
1092 struct intel_iommu *iommu = get_irq_data(irq);
1095 spin_lock_irqsave(&iommu->register_lock, flag);
1096 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1097 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1098 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1099 spin_unlock_irqrestore(&iommu->register_lock, flag);
1102 void dmar_msi_read(int irq, struct msi_msg *msg)
1104 struct intel_iommu *iommu = get_irq_data(irq);
1107 spin_lock_irqsave(&iommu->register_lock, flag);
1108 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1109 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1110 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1111 spin_unlock_irqrestore(&iommu->register_lock, flag);
1114 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1115 u8 fault_reason, u16 source_id, unsigned long long addr)
1119 reason = dmar_get_fault_reason(fault_reason);
1122 "DMAR:[%s] Request device [%02x:%02x.%d] "
1123 "fault addr %llx \n"
1124 "DMAR:[fault reason %02d] %s\n",
1125 (type ? "DMA Read" : "DMA Write"),
1126 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1127 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1131 #define PRIMARY_FAULT_REG_LEN (16)
1132 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1134 struct intel_iommu *iommu = dev_id;
1135 int reg, fault_index;
1139 spin_lock_irqsave(&iommu->register_lock, flag);
1140 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1142 /* TBD: ignore advanced fault log currently */
1143 if (!(fault_status & DMA_FSTS_PPF))
1144 goto clear_overflow;
1146 fault_index = dma_fsts_fault_record_index(fault_status);
1147 reg = cap_fault_reg_offset(iommu->cap);
1155 /* highest 32 bits */
1156 data = readl(iommu->reg + reg +
1157 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1158 if (!(data & DMA_FRCD_F))
1161 fault_reason = dma_frcd_fault_reason(data);
1162 type = dma_frcd_type(data);
1164 data = readl(iommu->reg + reg +
1165 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1166 source_id = dma_frcd_source_id(data);
1168 guest_addr = dmar_readq(iommu->reg + reg +
1169 fault_index * PRIMARY_FAULT_REG_LEN);
1170 guest_addr = dma_frcd_page_addr(guest_addr);
1171 /* clear the fault */
1172 writel(DMA_FRCD_F, iommu->reg + reg +
1173 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1175 spin_unlock_irqrestore(&iommu->register_lock, flag);
1177 iommu_page_fault_do_one(iommu, type, fault_reason,
1178 source_id, guest_addr);
1181 if (fault_index > cap_num_fault_regs(iommu->cap))
1183 spin_lock_irqsave(&iommu->register_lock, flag);
1186 /* clear primary fault overflow */
1187 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1188 if (fault_status & DMA_FSTS_PFO)
1189 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1191 spin_unlock_irqrestore(&iommu->register_lock, flag);
1195 int dmar_set_interrupt(struct intel_iommu *iommu)
1201 printk(KERN_ERR "IOMMU: no free vectors\n");
1205 set_irq_data(irq, iommu);
1208 ret = arch_setup_dmar_msi(irq);
1210 set_irq_data(irq, NULL);
1216 /* Force the fault registers to be cleared */
1217 iommu_page_fault(irq, iommu);
1219 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1221 printk(KERN_ERR "IOMMU: can't request irq\n");
1225 static int iommu_init_domains(struct intel_iommu *iommu)
1227 unsigned long ndomains;
1228 unsigned long nlongs;
1230 ndomains = cap_ndoms(iommu->cap);
1231 pr_debug("Number of Domains supported <%ld>\n", ndomains);
1232 nlongs = BITS_TO_LONGS(ndomains);
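/*
 * e.g. an iommu whose capability register reports 256 domain ids needs
 * BITS_TO_LONGS(256) = 4 longs for the bitmap on a 64-bit kernel.
 */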
1234 /* TBD: there might be 64K domains,
1235 * consider a different allocation scheme for future chips
1237 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1238 if (!iommu->domain_ids) {
1239 printk(KERN_ERR "Allocating domain id array failed\n");
1242 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1244 if (!iommu->domains) {
1245 printk(KERN_ERR "Allocating domain array failed\n");
1246 kfree(iommu->domain_ids);
1250 spin_lock_init(&iommu->lock);
1253 * if Caching mode is set, then invalid translations are tagged
1254 * with domainid 0. Hence we need to pre-allocate it.
1256 if (cap_caching_mode(iommu->cap))
1257 set_bit(0, iommu->domain_ids);
1262 static void domain_exit(struct dmar_domain *domain);
1263 static void vm_domain_exit(struct dmar_domain *domain);
1265 void free_dmar_iommu(struct intel_iommu *iommu)
1267 struct dmar_domain *domain;
1269 unsigned long flags;
1271 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1272 for (; i < cap_ndoms(iommu->cap); ) {
1273 domain = iommu->domains[i];
1274 clear_bit(i, iommu->domain_ids);
1276 spin_lock_irqsave(&domain->iommu_lock, flags);
1277 if (--domain->iommu_count == 0) {
1278 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1279 vm_domain_exit(domain);
1281 domain_exit(domain);
1283 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1285 i = find_next_bit(iommu->domain_ids,
1286 cap_ndoms(iommu->cap), i+1);
1289 if (iommu->gcmd & DMA_GCMD_TE)
1290 iommu_disable_translation(iommu);
1293 set_irq_data(iommu->irq, NULL);
1294 /* This will mask the irq */
1295 free_irq(iommu->irq, iommu);
1296 destroy_irq(iommu->irq);
1299 kfree(iommu->domains);
1300 kfree(iommu->domain_ids);
1302 g_iommus[iommu->seq_id] = NULL;
1304 /* if all iommus are freed, free g_iommus */
1305 for (i = 0; i < g_num_of_iommus; i++) {
1310 if (i == g_num_of_iommus)
1313 /* free context mapping */
1314 free_context_table(iommu);
1317 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1320 unsigned long ndomains;
1321 struct dmar_domain *domain;
1322 unsigned long flags;
1324 domain = alloc_domain_mem();
1328 ndomains = cap_ndoms(iommu->cap);
1330 spin_lock_irqsave(&iommu->lock, flags);
1331 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1332 if (num >= ndomains) {
1333 spin_unlock_irqrestore(&iommu->lock, flags);
1334 free_domain_mem(domain);
1335 printk(KERN_ERR "IOMMU: no free domain ids\n");
1339 set_bit(num, iommu->domain_ids);
1341 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1342 set_bit(iommu->seq_id, &domain->iommu_bmp);
1344 iommu->domains[num] = domain;
1345 spin_unlock_irqrestore(&iommu->lock, flags);
1350 static void iommu_free_domain(struct dmar_domain *domain)
1352 unsigned long flags;
1353 struct intel_iommu *iommu;
1355 iommu = domain_get_iommu(domain);
1357 spin_lock_irqsave(&iommu->lock, flags);
1358 clear_bit(domain->id, iommu->domain_ids);
1359 spin_unlock_irqrestore(&iommu->lock, flags);
1362 static struct iova_domain reserved_iova_list;
1363 static struct lock_class_key reserved_alloc_key;
1364 static struct lock_class_key reserved_rbtree_key;
1366 static void dmar_init_reserved_ranges(void)
1368 struct pci_dev *pdev = NULL;
1373 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1375 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1376 &reserved_alloc_key);
1377 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1378 &reserved_rbtree_key);
1380 /* IOAPIC ranges shouldn't be accessed by DMA */
1381 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1382 IOVA_PFN(IOAPIC_RANGE_END));
1384 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1386 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1387 for_each_pci_dev(pdev) {
1390 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1391 r = &pdev->resource[i];
1392 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1396 size = r->end - addr;
1397 size = PAGE_ALIGN(size);
1398 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1399 IOVA_PFN(size + addr) - 1);
1401 printk(KERN_ERR "Reserve iova failed\n");
1407 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1409 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1412 static inline int guestwidth_to_adjustwidth(int gaw)
1415 int r = (gaw - 12) % 9;
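/*
 * The remainder of this helper is not shown; the intent is that the
 * adjusted width must land on a page-table level boundary (30, 39, 48, ...
 * bits), so e.g. a guest width of 36 (r = (36 - 12) % 9 = 6) would be
 * rounded up to 39, while 48 (r = 0) is kept as-is.
 */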
1426 static int domain_init(struct dmar_domain *domain, int guest_width)
1428 struct intel_iommu *iommu;
1429 int adjust_width, agaw;
1430 unsigned long sagaw;
1432 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1433 spin_lock_init(&domain->mapping_lock);
1434 spin_lock_init(&domain->iommu_lock);
1436 domain_reserve_special_ranges(domain);
1438 /* calculate AGAW */
1439 iommu = domain_get_iommu(domain);
1440 if (guest_width > cap_mgaw(iommu->cap))
1441 guest_width = cap_mgaw(iommu->cap);
1442 domain->gaw = guest_width;
1443 adjust_width = guestwidth_to_adjustwidth(guest_width);
1444 agaw = width_to_agaw(adjust_width);
1445 sagaw = cap_sagaw(iommu->cap);
1446 if (!test_bit(agaw, &sagaw)) {
1447 /* hardware doesn't support it, choose a bigger one */
1448 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1449 agaw = find_next_bit(&sagaw, 5, agaw);
1453 domain->agaw = agaw;
1454 INIT_LIST_HEAD(&domain->devices);
1456 if (ecap_coherent(iommu->ecap))
1457 domain->iommu_coherency = 1;
1459 domain->iommu_coherency = 0;
1461 if (ecap_sc_support(iommu->ecap))
1462 domain->iommu_snooping = 1;
1464 domain->iommu_snooping = 0;
1466 domain->iommu_count = 1;
1468 /* always allocate the top pgd */
1469 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1472 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1476 static void domain_exit(struct dmar_domain *domain)
1480 /* Domain 0 is reserved, so don't process it */
1484 domain_remove_dev_info(domain);
1486 put_iova_domain(&domain->iovad);
1487 end = DOMAIN_MAX_ADDR(domain->gaw);
1488 end = end & (~PAGE_MASK);
1491 dma_pte_clear_range(domain, 0, end);
1493 /* free page tables */
1494 dma_pte_free_pagetable(domain, 0, end);
1496 iommu_free_domain(domain);
1497 free_domain_mem(domain);
1500 static int domain_context_mapping_one(struct dmar_domain *domain,
1503 struct context_entry *context;
1504 unsigned long flags;
1505 struct intel_iommu *iommu;
1506 struct dma_pte *pgd;
1508 unsigned long ndomains;
1512 pr_debug("Set context mapping for %02x:%02x.%d\n",
1513 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1514 BUG_ON(!domain->pgd);
1516 iommu = device_to_iommu(bus, devfn);
1520 context = device_to_context_entry(iommu, bus, devfn);
1523 spin_lock_irqsave(&iommu->lock, flags);
1524 if (context_present(context)) {
1525 spin_unlock_irqrestore(&iommu->lock, flags);
1532 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1535 /* find an available domain id for this device in iommu */
1536 ndomains = cap_ndoms(iommu->cap);
1537 num = find_first_bit(iommu->domain_ids, ndomains);
1538 for (; num < ndomains; ) {
1539 if (iommu->domains[num] == domain) {
1544 num = find_next_bit(iommu->domain_ids,
1545 cap_ndoms(iommu->cap), num+1);
1549 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1550 if (num >= ndomains) {
1551 spin_unlock_irqrestore(&iommu->lock, flags);
1552 printk(KERN_ERR "IOMMU: no free domain ids\n");
1556 set_bit(num, iommu->domain_ids);
1557 iommu->domains[num] = domain;
1561 /* Skip top levels of page tables for
1562 * iommu which has less agaw than default.
1564 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1565 pgd = phys_to_virt(dma_pte_addr(pgd));
1566 if (!dma_pte_present(pgd)) {
1567 spin_unlock_irqrestore(&iommu->lock, flags);
1573 context_set_domain_id(context, id);
1574 context_set_address_width(context, iommu->agaw);
1575 context_set_address_root(context, virt_to_phys(pgd));
1576 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1577 context_set_fault_enable(context);
1578 context_set_present(context);
1579 domain_flush_cache(domain, context, sizeof(*context));
1581 /* it's a non-present to present mapping */
1582 if (iommu->flush.flush_context(iommu, domain->id,
1583 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1584 DMA_CCMD_DEVICE_INVL, 1))
1585 iommu_flush_write_buffer(iommu);
1587 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1589 spin_unlock_irqrestore(&iommu->lock, flags);
1591 spin_lock_irqsave(&domain->iommu_lock, flags);
1592 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1593 domain->iommu_count++;
1594 domain_update_iommu_cap(domain);
1596 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1601 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1604 struct pci_dev *tmp, *parent;
1606 ret = domain_context_mapping_one(domain, pdev->bus->number,
1611 /* dependent device mapping */
1612 tmp = pci_find_upstream_pcie_bridge(pdev);
1615 /* Secondary interface's bus number and devfn 0 */
1616 parent = pdev->bus->self;
1617 while (parent != tmp) {
1618 ret = domain_context_mapping_one(domain, parent->bus->number,
1622 parent = parent->bus->self;
1624 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1625 return domain_context_mapping_one(domain,
1626 tmp->subordinate->number, 0);
1627 else /* this is a legacy PCI bridge */
1628 return domain_context_mapping_one(domain,
1629 tmp->bus->number, tmp->devfn);
1632 static int domain_context_mapped(struct pci_dev *pdev)
1635 struct pci_dev *tmp, *parent;
1636 struct intel_iommu *iommu;
1638 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1642 ret = device_context_mapped(iommu,
1643 pdev->bus->number, pdev->devfn);
1646 /* dependent device mapping */
1647 tmp = pci_find_upstream_pcie_bridge(pdev);
1650 /* Secondary interface's bus number and devfn 0 */
1651 parent = pdev->bus->self;
1652 while (parent != tmp) {
1653 ret = device_context_mapped(iommu, parent->bus->number,
1657 parent = parent->bus->self;
1660 return device_context_mapped(iommu,
1661 tmp->subordinate->number, 0);
1663 return device_context_mapped(iommu,
1664 tmp->bus->number, tmp->devfn);
1668 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1669 u64 hpa, size_t size, int prot)
1671 u64 start_pfn, end_pfn;
1672 struct dma_pte *pte;
1674 int addr_width = agaw_to_width(domain->agaw);
1676 hpa &= (((u64)1) << addr_width) - 1;
1678 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1681 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1682 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1684 while (start_pfn < end_pfn) {
1685 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1688 /* We don't need lock here, nobody else
1689 * touches the iova range
1691 BUG_ON(dma_pte_addr(pte));
1692 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1693 dma_set_pte_prot(pte, prot);
1694 if (prot & DMA_PTE_SNP)
1695 dma_set_pte_snp(pte);
1696 domain_flush_cache(domain, pte, sizeof(*pte));
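/*
 * Example: hpa = 0x10000 and size = 0x3000 give start_pfn = 0x10 and
 * end_pfn = 0x13, so the loop above writes three last-level PTEs
 * covering iova, iova + 4KiB and iova + 8KiB.
 */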
1703 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1708 clear_context_table(iommu, bus, devfn);
1709 iommu->flush.flush_context(iommu, 0, 0, 0,
1710 DMA_CCMD_GLOBAL_INVL, 0);
1711 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1712 DMA_TLB_GLOBAL_FLUSH, 0);
1715 static void domain_remove_dev_info(struct dmar_domain *domain)
1717 struct device_domain_info *info;
1718 unsigned long flags;
1719 struct intel_iommu *iommu;
1721 spin_lock_irqsave(&device_domain_lock, flags);
1722 while (!list_empty(&domain->devices)) {
1723 info = list_entry(domain->devices.next,
1724 struct device_domain_info, link);
1725 list_del(&info->link);
1726 list_del(&info->global);
1728 info->dev->dev.archdata.iommu = NULL;
1729 spin_unlock_irqrestore(&device_domain_lock, flags);
1731 iommu = device_to_iommu(info->bus, info->devfn);
1732 iommu_detach_dev(iommu, info->bus, info->devfn);
1733 free_devinfo_mem(info);
1735 spin_lock_irqsave(&device_domain_lock, flags);
1737 spin_unlock_irqrestore(&device_domain_lock, flags);
1742 * Note: struct pci_dev->dev.archdata.iommu stores the info
1744 static struct dmar_domain *
1745 find_domain(struct pci_dev *pdev)
1747 struct device_domain_info *info;
1749 /* No lock here, assumes no domain exit in normal case */
1750 info = pdev->dev.archdata.iommu;
1752 return info->domain;
1756 /* domain is initialized */
1757 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1759 struct dmar_domain *domain, *found = NULL;
1760 struct intel_iommu *iommu;
1761 struct dmar_drhd_unit *drhd;
1762 struct device_domain_info *info, *tmp;
1763 struct pci_dev *dev_tmp;
1764 unsigned long flags;
1765 int bus = 0, devfn = 0;
1767 domain = find_domain(pdev);
1771 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1773 if (dev_tmp->is_pcie) {
1774 bus = dev_tmp->subordinate->number;
1777 bus = dev_tmp->bus->number;
1778 devfn = dev_tmp->devfn;
1780 spin_lock_irqsave(&device_domain_lock, flags);
1781 list_for_each_entry(info, &device_domain_list, global) {
1782 if (info->bus == bus && info->devfn == devfn) {
1783 found = info->domain;
1787 spin_unlock_irqrestore(&device_domain_lock, flags);
1788 /* the pcie-pci bridge already has a domain; use it */
1795 /* Allocate new domain for the device */
1796 drhd = dmar_find_matched_drhd_unit(pdev);
1798 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1802 iommu = drhd->iommu;
1804 domain = iommu_alloc_domain(iommu);
1808 if (domain_init(domain, gaw)) {
1809 domain_exit(domain);
1813 /* register pcie-to-pci device */
1815 info = alloc_devinfo_mem();
1817 domain_exit(domain);
1821 info->devfn = devfn;
1823 info->domain = domain;
1824 /* This domain is shared by devices under p2p bridge */
1825 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1827 /* the pcie-to-pci bridge already has a domain; use it */
1829 spin_lock_irqsave(&device_domain_lock, flags);
1830 list_for_each_entry(tmp, &device_domain_list, global) {
1831 if (tmp->bus == bus && tmp->devfn == devfn) {
1832 found = tmp->domain;
1837 free_devinfo_mem(info);
1838 domain_exit(domain);
1841 list_add(&info->link, &domain->devices);
1842 list_add(&info->global, &device_domain_list);
1844 spin_unlock_irqrestore(&device_domain_lock, flags);
1848 info = alloc_devinfo_mem();
1851 info->bus = pdev->bus->number;
1852 info->devfn = pdev->devfn;
1854 info->domain = domain;
1855 spin_lock_irqsave(&device_domain_lock, flags);
1856 /* somebody is fast */
1857 found = find_domain(pdev);
1858 if (found != NULL) {
1859 spin_unlock_irqrestore(&device_domain_lock, flags);
1860 if (found != domain) {
1861 domain_exit(domain);
1864 free_devinfo_mem(info);
1867 list_add(&info->link, &domain->devices);
1868 list_add(&info->global, &device_domain_list);
1869 pdev->dev.archdata.iommu = info;
1870 spin_unlock_irqrestore(&device_domain_lock, flags);
1873 /* recheck it here, maybe others set it */
1874 return find_domain(pdev);
1877 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1878 unsigned long long start,
1879 unsigned long long end)
1881 struct dmar_domain *domain;
1883 unsigned long long base;
1887 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1888 pci_name(pdev), start, end);
1889 /* page table init */
1890 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1894 /* The address might not be aligned */
1895 base = start & PAGE_MASK;
1897 size = PAGE_ALIGN(size);
1898 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1899 IOVA_PFN(base + size) - 1)) {
1900 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1905 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1906 size, base, pci_name(pdev));
1908 * RMRR range might overlap with the physical memory range; clear it first
1911 dma_pte_clear_range(domain, base, base + size);
1913 ret = domain_page_mapping(domain, base, base, size,
1914 DMA_PTE_READ|DMA_PTE_WRITE);
1918 /* context entry init */
1919 ret = domain_context_mapping(domain, pdev);
1923 domain_exit(domain);
1928 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1929 struct pci_dev *pdev)
1931 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1933 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1934 rmrr->end_address + 1);
1937 #ifdef CONFIG_DMAR_GFX_WA
1938 struct iommu_prepare_data {
1939 struct pci_dev *pdev;
1943 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1944 unsigned long end_pfn, void *datax)
1946 struct iommu_prepare_data *data;
1948 data = (struct iommu_prepare_data *)datax;
1950 data->ret = iommu_prepare_identity_map(data->pdev,
1951 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1956 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1959 struct iommu_prepare_data data;
1964 for_each_online_node(nid) {
1965 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1972 static void __init iommu_prepare_gfx_mapping(void)
1974 struct pci_dev *pdev = NULL;
1977 for_each_pci_dev(pdev) {
1978 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1979 !IS_GFX_DEVICE(pdev))
1981 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1983 ret = iommu_prepare_with_active_regions(pdev);
1985 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1988 #else /* !CONFIG_DMAR_GFX_WA */
1989 static inline void iommu_prepare_gfx_mapping(void)
1995 #ifdef CONFIG_DMAR_FLOPPY_WA
1996 static inline void iommu_prepare_isa(void)
1998 struct pci_dev *pdev;
2001 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2005 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
2006 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2009 printk("IOMMU: Failed to create 0-16M identity map, "
2010 "floppy might not work\n");
2014 static inline void iommu_prepare_isa(void)
2018 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2020 static int __init init_dmars(void)
2022 struct dmar_drhd_unit *drhd;
2023 struct dmar_rmrr_unit *rmrr;
2024 struct pci_dev *pdev;
2025 struct intel_iommu *iommu;
2026 int i, ret, unit = 0;
2031 * initialize and program root entry to not present
2034 for_each_drhd_unit(drhd) {
2037 * lock not needed as this is only incremented in the single
2038 * threaded kernel __init code path; all other accesses are reads
2043 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2046 printk(KERN_ERR "Allocating global iommu array failed\n");
2051 deferred_flush = kzalloc(g_num_of_iommus *
2052 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2053 if (!deferred_flush) {
2059 for_each_drhd_unit(drhd) {
2063 iommu = drhd->iommu;
2064 g_iommus[iommu->seq_id] = iommu;
2066 ret = iommu_init_domains(iommu);
2072 * we could share the same root & context tables
2073 * among all IOMMUs. Need to split it later.
2075 ret = iommu_alloc_root_entry(iommu);
2077 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2082 for_each_drhd_unit(drhd) {
2086 iommu = drhd->iommu;
2087 if (dmar_enable_qi(iommu)) {
2089 * Queued Invalidate not enabled, use Register Based
2092 iommu->flush.flush_context = __iommu_flush_context;
2093 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2094 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2096 (unsigned long long)drhd->reg_base_addr);
2098 iommu->flush.flush_context = qi_flush_context;
2099 iommu->flush.flush_iotlb = qi_flush_iotlb;
2100 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2102 (unsigned long long)drhd->reg_base_addr);
2108 * for each dev attached to rmrr
2110 * locate drhd for dev, alloc domain for dev
2111 * allocate free domain
2112 * allocate page table entries for rmrr
2113 * if context not allocated for bus
2114 * allocate and init context
2115 * set present in root table for this bus
2116 * init context with domain, translation etc
2120 for_each_rmrr_units(rmrr) {
2121 for (i = 0; i < rmrr->devices_cnt; i++) {
2122 pdev = rmrr->devices[i];
2123 /* some BIOSes list non-existent devices in the DMAR table */
2126 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2129 "IOMMU: mapping reserved region failed\n");
2133 iommu_prepare_gfx_mapping();
2135 iommu_prepare_isa();
2140 * global invalidate context cache
2141 * global invalidate iotlb
2142 * enable translation
2144 for_each_drhd_unit(drhd) {
2147 iommu = drhd->iommu;
2148 sprintf(iommu->name, "dmar%d", unit++);
2150 iommu_flush_write_buffer(iommu);
2152 ret = dmar_set_interrupt(iommu);
2156 iommu_set_root_entry(iommu);
2158 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2160 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2162 iommu_disable_protect_mem_regions(iommu);
2164 ret = iommu_enable_translation(iommu);
2171 for_each_drhd_unit(drhd) {
2174 iommu = drhd->iommu;
2181 static inline u64 aligned_size(u64 host_addr, size_t size)
2184 addr = (host_addr & (~PAGE_MASK)) + size;
2185 return PAGE_ALIGN(addr);
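/*
 * Example: aligned_size(0x1001, 0x2000) adds the in-page offset of 1 to
 * the 0x2000-byte length and rounds up, giving 0x3000: three pages must
 * be mapped even though the buffer itself is only 8KiB long.
 */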
2189 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2193 /* Make sure it's in range */
2194 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2195 if (!size || (IOVA_START_ADDR + size > end))
2198 piova = alloc_iova(&domain->iovad,
2199 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2203 static struct iova *
2204 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2205 size_t size, u64 dma_mask)
2207 struct pci_dev *pdev = to_pci_dev(dev);
2208 struct iova *iova = NULL;
2210 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2211 iova = iommu_alloc_iova(domain, size, dma_mask);
2214 * First try to allocate an io virtual address in
2215 * DMA_32BIT_MASK and, if that fails, fall back to the device's full dma_mask
2218 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2220 iova = iommu_alloc_iova(domain, size, dma_mask);
2224 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
2231 static struct dmar_domain *
2232 get_valid_domain_for_dev(struct pci_dev *pdev)
2234 struct dmar_domain *domain;
2237 domain = get_domain_for_dev(pdev,
2238 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2241 "Allocating domain for %s failed", pci_name(pdev));
2245 /* make sure context mapping is ok */
2246 if (unlikely(!domain_context_mapped(pdev))) {
2247 ret = domain_context_mapping(domain, pdev);
2250 "Domain context map for %s failed",
2259 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2260 size_t size, int dir, u64 dma_mask)
2262 struct pci_dev *pdev = to_pci_dev(hwdev);
2263 struct dmar_domain *domain;
2264 phys_addr_t start_paddr;
2268 struct intel_iommu *iommu;
2270 BUG_ON(dir == DMA_NONE);
2271 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2274 domain = get_valid_domain_for_dev(pdev);
2278 iommu = domain_get_iommu(domain);
2279 size = aligned_size((u64)paddr, size);
2281 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2285 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2288 * Check if DMAR supports zero-length reads on write-only mappings
2291 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2292 !cap_zlr(iommu->cap))
2293 prot |= DMA_PTE_READ;
2294 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2295 prot |= DMA_PTE_WRITE;
2297 * paddr .. (paddr + size) might cover only a partial page; we should map the whole
2298 * page. Note: if two parts of one page are mapped separately, we
2299 * might have two guest_addr mapping to the same host paddr, but this
2300 * is not a big problem
2302 ret = domain_page_mapping(domain, start_paddr,
2303 ((u64)paddr) & PAGE_MASK, size, prot);
2307 /* it's a non-present to present mapping */
2308 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2309 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2311 iommu_flush_write_buffer(iommu);
2313 return start_paddr + ((u64)paddr & (~PAGE_MASK));
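/*
 * Example: for paddr = 0x12345678 and size = 0x100, aligned_size()
 * yields one page; the page at 0x12345000 is mapped at a page-aligned
 * start_paddr and the caller gets back start_paddr + 0x678.
 */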
2317 __free_iova(&domain->iovad, iova);
2318 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2319 pci_name(pdev), size, (unsigned long long)paddr, dir);
2323 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2324 size_t size, int dir)
2326 return __intel_map_single(hwdev, paddr, size, dir,
2327 to_pci_dev(hwdev)->dma_mask);
2330 static void flush_unmaps(void)
2336 /* just flush them all */
2337 for (i = 0; i < g_num_of_iommus; i++) {
2338 struct intel_iommu *iommu = g_iommus[i];
2342 if (deferred_flush[i].next) {
2343 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2344 DMA_TLB_GLOBAL_FLUSH, 0);
2345 for (j = 0; j < deferred_flush[i].next; j++) {
2346 __free_iova(&deferred_flush[i].domain[j]->iovad,
2347 deferred_flush[i].iova[j]);
2349 deferred_flush[i].next = 0;
2356 static void flush_unmaps_timeout(unsigned long data)
2358 unsigned long flags;
2360 spin_lock_irqsave(&async_umap_flush_lock, flags);
2362 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2365 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2367 unsigned long flags;
2369 struct intel_iommu *iommu;
2371 spin_lock_irqsave(&async_umap_flush_lock, flags);
2372 if (list_size == HIGH_WATER_MARK)
2375 iommu = domain_get_iommu(dom);
2376 iommu_id = iommu->seq_id;
2378 next = deferred_flush[iommu_id].next;
2379 deferred_flush[iommu_id].domain[next] = dom;
2380 deferred_flush[iommu_id].iova[next] = iova;
2381 deferred_flush[iommu_id].next++;
2384 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2388 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2391 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2394 struct pci_dev *pdev = to_pci_dev(dev);
2395 struct dmar_domain *domain;
2396 unsigned long start_addr;
2398 struct intel_iommu *iommu;
2400 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2402 domain = find_domain(pdev);
2405 iommu = domain_get_iommu(domain);
2407 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2411 start_addr = iova->pfn_lo << PAGE_SHIFT;
2412 size = aligned_size((u64)dev_addr, size);
2414 pr_debug("Device %s unmapping: %zx@%llx\n",
2415 pci_name(pdev), size, (unsigned long long)start_addr);
2417 /* clear the whole page */
2418 dma_pte_clear_range(domain, start_addr, start_addr + size);
2419 /* free page tables */
2420 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2421 if (intel_iommu_strict) {
2422 if (iommu_flush_iotlb_psi(iommu,
2423 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2424 iommu_flush_write_buffer(iommu);
2426 __free_iova(&domain->iovad, iova);
2428 add_unmap(domain, iova);
2430 * queue up the release of the unmap to save the 1/6th of the
2431 * cpu used up by the iotlb flush operation...
2436 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2437 dma_addr_t *dma_handle, gfp_t flags)
2442 size = PAGE_ALIGN(size);
2443 order = get_order(size);
2444 flags &= ~(GFP_DMA | GFP_DMA32);
2446 vaddr = (void *)__get_free_pages(flags, order);
2449 memset(vaddr, 0, size);
2451 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2453 hwdev->coherent_dma_mask);
2456 free_pages((unsigned long)vaddr, order);
2460 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2461 dma_addr_t dma_handle)
2465 size = PAGE_ALIGN(size);
2466 order = get_order(size);
2468 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2469 free_pages((unsigned long)vaddr, order);
2472 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2473 int nelems, int dir)
2476 struct pci_dev *pdev = to_pci_dev(hwdev);
2477 struct dmar_domain *domain;
2478 unsigned long start_addr;
2482 struct scatterlist *sg;
2483 struct intel_iommu *iommu;
2485 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2488 domain = find_domain(pdev);
2491 iommu = domain_get_iommu(domain);
2493 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2496 for_each_sg(sglist, sg, nelems, i) {
2497 addr = page_to_phys(sg_page(sg)) + sg->offset;
2498 size += aligned_size((u64)addr, sg->length);
2501 start_addr = iova->pfn_lo << PAGE_SHIFT;
2503 /* clear the whole page */
2504 dma_pte_clear_range(domain, start_addr, start_addr + size);
2505 /* free page tables */
2506 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2508 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2509 size >> VTD_PAGE_SHIFT, 0))
2510 iommu_flush_write_buffer(iommu);
2513 __free_iova(&domain->iovad, iova);
2516 static int intel_nontranslate_map_sg(struct device *hddev,
2517 struct scatterlist *sglist, int nelems, int dir)
2520 struct scatterlist *sg;
2522 for_each_sg(sglist, sg, nelems, i) {
2523 BUG_ON(!sg_page(sg));
2524 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2525 sg->dma_length = sg->length;
2530 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2535 struct pci_dev *pdev = to_pci_dev(hwdev);
2536 struct dmar_domain *domain;
2540 struct iova *iova = NULL;
2542 struct scatterlist *sg;
2543 unsigned long start_addr;
2544 struct intel_iommu *iommu;
2546 BUG_ON(dir == DMA_NONE);
2547 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2548 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2550 domain = get_valid_domain_for_dev(pdev);
2554 iommu = domain_get_iommu(domain);
2556 for_each_sg(sglist, sg, nelems, i) {
2557 addr = page_to_phys(sg_page(sg)) + sg->offset;
2558 size += aligned_size((u64)addr, sg->length);
2561 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2563 sglist->dma_length = 0;
2568 /* Check if DMAR supports zero-length reads on write-only mappings */
2571 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2572 !cap_zlr(iommu->cap))
2573 prot |= DMA_PTE_READ;
2574 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2575 prot |= DMA_PTE_WRITE;
2577 start_addr = iova->pfn_lo << PAGE_SHIFT;
2579 for_each_sg(sglist, sg, nelems, i) {
2580 addr = page_to_phys(sg_page(sg)) + sg->offset;
2581 size = aligned_size((u64)addr, sg->length);
2582 ret = domain_page_mapping(domain, start_addr + offset,
2583 ((u64)addr) & PAGE_MASK,
2586 /* clear the page */
2587 dma_pte_clear_range(domain, start_addr,
2588 start_addr + offset);
2589 /* free page tables */
2590 dma_pte_free_pagetable(domain, start_addr,
2591 start_addr + offset);
2593 __free_iova(&domain->iovad, iova);
2596 sg->dma_address = start_addr + offset +
2597 ((u64)addr & (~PAGE_MASK));
2598 sg->dma_length = sg->length;
2602 /* it's a non-present to present mapping */
2603 if (iommu_flush_iotlb_psi(iommu, domain->id,
2604 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2605 iommu_flush_write_buffer(iommu);
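/*
 * Illustrative sketch (not part of this driver): how a driver exercises the
 * scatter-gather path above.  dma_map_sg() lands in intel_map_sg(), which
 * allocates a single IOVA range sized for the whole list and maps every
 * segment contiguously inside it; dma_unmap_sg() lands in intel_unmap_sg().
 * "example_sg_dma", "my_pdev" and "pages" are hypothetical names.
 */
static int example_sg_dma(struct pci_dev *my_pdev, struct page **pages)
{
	struct scatterlist sgl[4];
	struct scatterlist *sg;
	int i, nents;

	sg_init_table(sgl, ARRAY_SIZE(sgl));
	for (i = 0; i < ARRAY_SIZE(sgl); i++)
		sg_set_page(&sgl[i], pages[i], PAGE_SIZE, 0);

	nents = dma_map_sg(&my_pdev->dev, sgl, ARRAY_SIZE(sgl),
			   DMA_FROM_DEVICE);
	if (!nents)
		return -ENOMEM;

	/* the device sees the bus addresses, not page_to_phys() */
	for_each_sg(sgl, sg, nents, i)
		pr_debug("seg %d: %llx len %u\n", i,
			 (unsigned long long)sg_dma_address(sg),
			 sg_dma_len(sg));

	dma_unmap_sg(&my_pdev->dev, sgl, ARRAY_SIZE(sgl), DMA_FROM_DEVICE);
	return 0;
}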
2609 static struct dma_mapping_ops intel_dma_ops = {
2610 .alloc_coherent = intel_alloc_coherent,
2611 .free_coherent = intel_free_coherent,
2612 .map_single = intel_map_single,
2613 .unmap_single = intel_unmap_single,
2614 .map_sg = intel_map_sg,
2615 .unmap_sg = intel_unmap_sg,
2618 static inline int iommu_domain_cache_init(void)
2622 iommu_domain_cache = kmem_cache_create("iommu_domain",
2623 sizeof(struct dmar_domain),
2628 if (!iommu_domain_cache) {
2629 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2636 static inline int iommu_devinfo_cache_init(void)
2640 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2641 sizeof(struct device_domain_info),
2645 if (!iommu_devinfo_cache) {
2646 printk(KERN_ERR "Couldn't create devinfo cache\n");
2653 static inline int iommu_iova_cache_init(void)
2657 iommu_iova_cache = kmem_cache_create("iommu_iova",
2658 sizeof(struct iova),
2662 if (!iommu_iova_cache) {
2663 printk(KERN_ERR "Couldn't create iova cache\n");
2670 static int __init iommu_init_mempool(void)
2673 ret = iommu_iova_cache_init();
2677 ret = iommu_domain_cache_init();
2681 ret = iommu_devinfo_cache_init();
2685 kmem_cache_destroy(iommu_domain_cache);
2687 kmem_cache_destroy(iommu_iova_cache);
2692 static void __init iommu_exit_mempool(void)
2694 kmem_cache_destroy(iommu_devinfo_cache);
2695 kmem_cache_destroy(iommu_domain_cache);
2696 kmem_cache_destroy(iommu_iova_cache);
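/*
 * Illustrative sketch (not part of this driver): the slab-cache pattern used
 * by the three *_cache_init() helpers above.  One kmem_cache per fixed-size
 * object type gives cheap, cache-friendly allocation, and iommu_exit_mempool()
 * tears the caches down in the reverse order of creation.  In this driver the
 * alloc_domain_mem()/free_domain_mem() style wrappers sit on top of
 * kmem_cache_alloc()/kmem_cache_free().  "example_obj" and "example_cache"
 * are hypothetical names.
 */
struct example_obj {
	struct list_head link;
	u64 payload;
};

static struct kmem_cache *example_cache;

static int __init example_cache_init(void)
{
	example_cache = kmem_cache_create("example_obj",
					  sizeof(struct example_obj), 0,
					  SLAB_HWCACHE_ALIGN, NULL);
	return example_cache ? 0 : -ENOMEM;
}

static void example_cache_use(void)
{
	struct example_obj *obj = kmem_cache_alloc(example_cache, GFP_KERNEL);

	if (obj)
		kmem_cache_free(example_cache, obj);
}

static void example_cache_exit(void)
{
	kmem_cache_destroy(example_cache);
}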
2700 static void __init init_no_remapping_devices(void)
2702 struct dmar_drhd_unit *drhd;
2704 for_each_drhd_unit(drhd) {
2705 if (!drhd->include_all) {
2707 for (i = 0; i < drhd->devices_cnt; i++)
2708 if (drhd->devices[i] != NULL)
2710 /* ignore DMAR unit if no pci devices exist */
2711 if (i == drhd->devices_cnt)
2719 for_each_drhd_unit(drhd) {
2721 if (drhd->ignored || drhd->include_all)
2724 for (i = 0; i < drhd->devices_cnt; i++)
2725 if (drhd->devices[i] &&
2726 !IS_GFX_DEVICE(drhd->devices[i]))
2729 if (i < drhd->devices_cnt)
2732 /* bypass IOMMU if it is just for gfx devices */
2734 for (i = 0; i < drhd->devices_cnt; i++) {
2735 if (!drhd->devices[i])
2737 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
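/*
 * Illustrative sketch (not part of this driver): the DUMMY_DEVICE_DOMAIN_INFO
 * tag planted above is what every map/unmap entry point tests before doing
 * any translation work (see intel_map_sg() and intel_unmap_single()).  A
 * hypothetical helper wrapping that repeated test could look like this:
 */
static int example_iommu_bypassed(struct pci_dev *pdev)
{
	/* set by init_no_remapping_devices() for gfx-only DMAR units */
	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}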
2742 int __init intel_iommu_init(void)
2746 if (dmar_table_init())
2749 if (dmar_dev_scope_init())
2753 /* Check the need for DMA-remapping initialization now.
2754  * Above initialization will also be used by Interrupt-remapping. */
2756 if (no_iommu || swiotlb || dmar_disabled)
2759 iommu_init_mempool();
2760 dmar_init_reserved_ranges();
2762 init_no_remapping_devices();
2766 printk(KERN_ERR "IOMMU: dmar init failed\n");
2767 put_iova_domain(&reserved_iova_list);
2768 iommu_exit_mempool();
2772 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2774 init_timer(&unmap_timer);
2776 dma_ops = &intel_dma_ops;
2778 register_iommu(&intel_iommu_ops);
2783 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2784 struct pci_dev *pdev)
2786 struct device_domain_info *info;
2787 unsigned long flags;
2789 info = alloc_devinfo_mem();
2793 info->bus = pdev->bus->number;
2794 info->devfn = pdev->devfn;
2796 info->domain = domain;
2798 spin_lock_irqsave(&device_domain_lock, flags);
2799 list_add(&info->link, &domain->devices);
2800 list_add(&info->global, &device_domain_list);
2801 pdev->dev.archdata.iommu = info;
2802 spin_unlock_irqrestore(&device_domain_lock, flags);
2807 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
2808 struct pci_dev *pdev)
2810 struct pci_dev *tmp, *parent;
2812 if (!iommu || !pdev)
2815 /* dependent device detach */
2816 tmp = pci_find_upstream_pcie_bridge(pdev);
2817 /* Secondary interface's bus number and devfn 0 */
2819 parent = pdev->bus->self;
2820 while (parent != tmp) {
2821 iommu_detach_dev(iommu, parent->bus->number,
2823 parent = parent->bus->self;
2825 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
2826 iommu_detach_dev(iommu,
2827 tmp->subordinate->number, 0);
2828 else /* this is a legacy PCI bridge */
2829 iommu_detach_dev(iommu,
2830 tmp->bus->number, tmp->devfn);
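/*
 * Illustrative sketch (not part of this driver): the same upstream walk that
 * iommu_detach_dependent_devices() performs, shown on its own.  It climbs
 * from the device through each bridge (bus->self) until it reaches the
 * bridge reported by pci_find_upstream_pcie_bridge().  "example_*" names are
 * hypothetical.
 */
static void example_print_upstream_path(struct pci_dev *pdev)
{
	struct pci_dev *tmp = pci_find_upstream_pcie_bridge(pdev);
	struct pci_dev *parent;

	if (!tmp)	/* no PCIe-to-PCI bridge above this device */
		return;

	for (parent = pdev->bus->self; parent != tmp;
	     parent = parent->bus->self)
		printk(KERN_DEBUG "bridge %s on the path up\n",
		       pci_name(parent));
	printk(KERN_DEBUG "topmost bridge %s\n", pci_name(tmp));
}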
2834 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2835 struct pci_dev *pdev)
2837 struct device_domain_info *info;
2838 struct intel_iommu *iommu;
2839 unsigned long flags;
2841 struct list_head *entry, *tmp;
2843 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2847 spin_lock_irqsave(&device_domain_lock, flags);
2848 list_for_each_safe(entry, tmp, &domain->devices) {
2849 info = list_entry(entry, struct device_domain_info, link);
2850 if (info->bus == pdev->bus->number &&
2851 info->devfn == pdev->devfn) {
2852 list_del(&info->link);
2853 list_del(&info->global);
2855 info->dev->dev.archdata.iommu = NULL;
2856 spin_unlock_irqrestore(&device_domain_lock, flags);
2858 iommu_detach_dev(iommu, info->bus, info->devfn);
2859 iommu_detach_dependent_devices(iommu, pdev);
2860 free_devinfo_mem(info);
2862 spin_lock_irqsave(&device_domain_lock, flags);
2870 /* if there are no other devices under the same iommu
2871  * owned by this domain, clear this iommu in iommu_bmp and
2872  * update the iommu count and coherency */
2874 if (device_to_iommu(info->bus, info->devfn) == iommu)
2879 unsigned long tmp_flags;
2880 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2881 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2882 domain->iommu_count--;
2883 domain_update_iommu_cap(domain);
2884 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2887 spin_unlock_irqrestore(&device_domain_lock, flags);
2890 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2892 struct device_domain_info *info;
2893 struct intel_iommu *iommu;
2894 unsigned long flags1, flags2;
2896 spin_lock_irqsave(&device_domain_lock, flags1);
2897 while (!list_empty(&domain->devices)) {
2898 info = list_entry(domain->devices.next,
2899 struct device_domain_info, link);
2900 list_del(&info->link);
2901 list_del(&info->global);
2903 info->dev->dev.archdata.iommu = NULL;
2905 spin_unlock_irqrestore(&device_domain_lock, flags1);
2907 iommu = device_to_iommu(info->bus, info->devfn);
2908 iommu_detach_dev(iommu, info->bus, info->devfn);
2909 iommu_detach_dependent_devices(iommu, info->dev);
2911 /* clear this iommu in iommu_bmp, update iommu count and capabilities */
2914 spin_lock_irqsave(&domain->iommu_lock, flags2);
2915 if (test_and_clear_bit(iommu->seq_id,
2916 &domain->iommu_bmp)) {
2917 domain->iommu_count--;
2918 domain_update_iommu_cap(domain);
2920 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2922 free_devinfo_mem(info);
2923 spin_lock_irqsave(&device_domain_lock, flags1);
2925 spin_unlock_irqrestore(&device_domain_lock, flags1);
2928 /* domain id for a virtual machine; it is never programmed into a context entry */
2929 static unsigned long vm_domid;
2931 static int vm_domain_min_agaw(struct dmar_domain *domain)
2934 int min_agaw = domain->agaw;
2936 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2937 for (; i < g_num_of_iommus; ) {
2938 if (min_agaw > g_iommus[i]->agaw)
2939 min_agaw = g_iommus[i]->agaw;
2941 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2947 static struct dmar_domain *iommu_alloc_vm_domain(void)
2949 struct dmar_domain *domain;
2951 domain = alloc_domain_mem();
2955 domain->id = vm_domid++;
2956 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2957 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2962 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2966 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2967 spin_lock_init(&domain->mapping_lock);
2968 spin_lock_init(&domain->iommu_lock);
2970 domain_reserve_special_ranges(domain);
2972 /* calculate AGAW */
2973 domain->gaw = guest_width;
2974 adjust_width = guestwidth_to_adjustwidth(guest_width);
2975 domain->agaw = width_to_agaw(adjust_width);
2977 INIT_LIST_HEAD(&domain->devices);
2979 domain->iommu_count = 0;
2980 domain->iommu_coherency = 0;
2981 domain->max_addr = 0;
2983 /* always allocate the top pgd */
2984 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2987 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2991 static void iommu_free_vm_domain(struct dmar_domain *domain)
2993 unsigned long flags;
2994 struct dmar_drhd_unit *drhd;
2995 struct intel_iommu *iommu;
2997 unsigned long ndomains;
2999 for_each_drhd_unit(drhd) {
3002 iommu = drhd->iommu;
3004 ndomains = cap_ndoms(iommu->cap);
3005 i = find_first_bit(iommu->domain_ids, ndomains);
3006 for (; i < ndomains; ) {
3007 if (iommu->domains[i] == domain) {
3008 spin_lock_irqsave(&iommu->lock, flags);
3009 clear_bit(i, iommu->domain_ids);
3010 iommu->domains[i] = NULL;
3011 spin_unlock_irqrestore(&iommu->lock, flags);
3014 i = find_next_bit(iommu->domain_ids, ndomains, i+1);
3019 static void vm_domain_exit(struct dmar_domain *domain)
3023 /* Domain 0 is reserved, so don't process it */
3027 vm_domain_remove_all_dev_info(domain);
3029 put_iova_domain(&domain->iovad);
3030 end = DOMAIN_MAX_ADDR(domain->gaw);
3031 end = end & (~VTD_PAGE_MASK);
3034 dma_pte_clear_range(domain, 0, end);
3036 /* free page tables */
3037 dma_pte_free_pagetable(domain, 0, end);
3039 iommu_free_vm_domain(domain);
3040 free_domain_mem(domain);
3043 static int intel_iommu_domain_init(struct iommu_domain *domain)
3045 struct dmar_domain *dmar_domain;
3047 dmar_domain = iommu_alloc_vm_domain();
3050 "intel_iommu_domain_init: dmar_domain == NULL\n");
3053 if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3055 "intel_iommu_domain_init() failed\n");
3056 vm_domain_exit(dmar_domain);
3059 domain->priv = dmar_domain;
3064 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3066 struct dmar_domain *dmar_domain = domain->priv;
3068 domain->priv = NULL;
3069 vm_domain_exit(dmar_domain);
3072 static int intel_iommu_attach_device(struct iommu_domain *domain,
3075 struct dmar_domain *dmar_domain = domain->priv;
3076 struct pci_dev *pdev = to_pci_dev(dev);
3077 struct intel_iommu *iommu;
3082 /* normally pdev is not mapped */
3083 if (unlikely(domain_context_mapped(pdev))) {
3084 struct dmar_domain *old_domain;
3086 old_domain = find_domain(pdev);
3088 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3089 vm_domain_remove_one_dev_info(old_domain, pdev);
3091 domain_remove_dev_info(old_domain);
3095 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3099 /* check if this iommu agaw is sufficient for max mapped address */
3100 addr_width = agaw_to_width(iommu->agaw);
3101 end = DOMAIN_MAX_ADDR(addr_width);
3102 end = end & VTD_PAGE_MASK;
3103 if (end < dmar_domain->max_addr) {
3104 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3105 "sufficient for the mapped address (%llx)\n",
3106 __func__, iommu->agaw, dmar_domain->max_addr);
3110 ret = domain_context_mapping(dmar_domain, pdev);
3114 ret = vm_domain_add_dev_info(dmar_domain, pdev);
3118 static void intel_iommu_detach_device(struct iommu_domain *domain,
3121 struct dmar_domain *dmar_domain = domain->priv;
3122 struct pci_dev *pdev = to_pci_dev(dev);
3124 vm_domain_remove_one_dev_info(dmar_domain, pdev);
3127 static int intel_iommu_map_range(struct iommu_domain *domain,
3128 unsigned long iova, phys_addr_t hpa,
3129 size_t size, int iommu_prot)
3131 struct dmar_domain *dmar_domain = domain->priv;
3137 if (iommu_prot & IOMMU_READ)
3138 prot |= DMA_PTE_READ;
3139 if (iommu_prot & IOMMU_WRITE)
3140 prot |= DMA_PTE_WRITE;
3141 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3142 prot |= DMA_PTE_SNP;
3144 max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3145 if (dmar_domain->max_addr < max_addr) {
3149 /* check if minimum agaw is sufficient for mapped address */
3150 min_agaw = vm_domain_min_agaw(dmar_domain);
3151 addr_width = agaw_to_width(min_agaw);
3152 end = DOMAIN_MAX_ADDR(addr_width);
3153 end = end & VTD_PAGE_MASK;
3154 if (end < max_addr) {
3155 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3156 "sufficient for the mapped address (%llx)\n",
3157 __func__, min_agaw, max_addr);
3160 dmar_domain->max_addr = max_addr;
3163 ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3167 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3168 unsigned long iova, size_t size)
3170 struct dmar_domain *dmar_domain = domain->priv;
3173 /* The address might not be aligned */
3174 base = iova & VTD_PAGE_MASK;
3175 size = VTD_PAGE_ALIGN(size);
3176 dma_pte_clear_range(dmar_domain, base, base + size);
3178 if (dmar_domain->max_addr == base + size)
3179 dmar_domain->max_addr = base;
3182 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3185 struct dmar_domain *dmar_domain = domain->priv;
3186 struct dma_pte *pte;
3189 pte = addr_to_dma_pte(dmar_domain, iova);
3191 phys = dma_pte_addr(pte);
3196 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3199 struct dmar_domain *dmar_domain = domain->priv;
3201 if (cap == IOMMU_CAP_CACHE_COHERENCY)
3202 return dmar_domain->iommu_snooping;
3207 static struct iommu_ops intel_iommu_ops = {
3208 .domain_init = intel_iommu_domain_init,
3209 .domain_destroy = intel_iommu_domain_destroy,
3210 .attach_dev = intel_iommu_attach_device,
3211 .detach_dev = intel_iommu_detach_device,
3212 .map = intel_iommu_map_range,
3213 .unmap = intel_iommu_unmap_range,
3214 .iova_to_phys = intel_iommu_iova_to_phys,
3215 .domain_has_cap = intel_iommu_domain_has_cap,
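/*
 * Illustrative sketch (not part of this driver): how a consumer such as KVM
 * device assignment would drive intel_iommu_ops through the generic IOMMU
 * API registered by intel_iommu_init().  The wrapper names are assumed to be
 * the iommu_domain_alloc()/iommu_attach_device()/iommu_map_range() generation
 * of that API (they changed in later kernels); "example_assign_device",
 * "assigned_dev", "gpa" and "hpa" are hypothetical.
 */
static int example_assign_device(struct pci_dev *assigned_dev,
				 unsigned long gpa, phys_addr_t hpa)
{
	struct iommu_domain *dom;
	int ret;

	dom = iommu_domain_alloc();		/* -> intel_iommu_domain_init() */
	if (!dom)
		return -ENOMEM;

	ret = iommu_attach_device(dom, &assigned_dev->dev);
	if (ret)
		goto out_free;

	/* map one guest page at gpa to the host page at hpa, read/write */
	ret = iommu_map_range(dom, gpa, hpa, PAGE_SIZE,
			      IOMMU_READ | IOMMU_WRITE);
	if (ret)
		goto out_detach;
	return 0;

out_detach:
	iommu_detach_device(dom, &assigned_dev->dev);
out_free:
	iommu_domain_free(dom);			/* -> intel_iommu_domain_destroy() */
	return ret;
}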
3218 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3221 /* Mobile 4 Series Chipset neglects to set the RWBF capability, but needs it */
3224 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);