2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/sysdev.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
43 #define ROOT_SIZE VTD_PAGE_SIZE
44 #define CONTEXT_SIZE VTD_PAGE_SIZE
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49 #define IOAPIC_RANGE_START (0xfee00000)
50 #define IOAPIC_RANGE_END (0xfeefffff)
51 #define IOVA_START_ADDR (0x1000)
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
58 static void flush_unmaps_timeout(unsigned long data);
60 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
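/*
 * Deferred unmap bookkeeping: instead of flushing the IOTLB on every
 * unmap, freed IOVAs are queued per IOMMU in deferred_flush[] and
 * released in bulk, either when the queue fills up towards
 * HIGH_WATER_MARK or when unmap_timer expires (see add_unmap() and
 * flush_unmaps() below).
 */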
62 #define HIGH_WATER_MARK 250
63 struct deferred_flush_tables {
65 struct iova *iova[HIGH_WATER_MARK];
66 struct dmar_domain *domain[HIGH_WATER_MARK];
69 static struct deferred_flush_tables *deferred_flush;
71 /* bitmap for indexing intel_iommus */
72 static int g_num_of_iommus;
74 static DEFINE_SPINLOCK(async_umap_flush_lock);
75 static LIST_HEAD(unmaps_to_do);
78 static long list_size;
80 static void domain_remove_dev_info(struct dmar_domain *domain);
83 static int __initdata dmar_map_gfx = 1;
84 static int dmar_forcedac;
85 static int intel_iommu_strict;
87 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
88 static DEFINE_SPINLOCK(device_domain_lock);
89 static LIST_HEAD(device_domain_list);
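/*
 * Boot-time options, comma-separated on the "intel_iommu=" command line:
 * "off" disables the driver, "igfx_off" skips mapping of graphics
 * devices, "forcedac" skips the 32-bit IOVA attempt and always allocates
 * from the device's full DMA mask, and "strict" disables batched IOTLB
 * flushing.  For example:  intel_iommu=igfx_off,strict
 */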
91 static int __init intel_iommu_setup(char *str)
96 if (!strncmp(str, "off", 3)) {
98 printk(KERN_INFO"Intel-IOMMU: disabled\n");
99 } else if (!strncmp(str, "igfx_off", 8)) {
102 "Intel-IOMMU: disable GFX device mapping\n");
103 } else if (!strncmp(str, "forcedac", 8)) {
105 "Intel-IOMMU: Forcing DAC for PCI devices\n");
107 } else if (!strncmp(str, "strict", 6)) {
109 "Intel-IOMMU: disable batched IOTLB flush\n");
110 intel_iommu_strict = 1;
113 str += strcspn(str, ",");
119 __setup("intel_iommu=", intel_iommu_setup);
121 static struct kmem_cache *iommu_domain_cache;
122 static struct kmem_cache *iommu_devinfo_cache;
123 static struct kmem_cache *iommu_iova_cache;
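/*
 * The allocation helpers below temporarily set PF_MEMALLOC so that these
 * small GFP_ATOMIC allocations may dip into emergency reserves and are
 * less likely to fail under memory pressure.
 */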
125 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
130 /* trying to avoid low memory issues */
131 flags = current->flags & PF_MEMALLOC;
132 current->flags |= PF_MEMALLOC;
133 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
134 current->flags &= (~PF_MEMALLOC | flags);
139 static inline void *alloc_pgtable_page(void)
144 /* trying to avoid low memory issues */
145 flags = current->flags & PF_MEMALLOC;
146 current->flags |= PF_MEMALLOC;
147 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
148 current->flags &= (~PF_MEMALLOC | flags);
152 static inline void free_pgtable_page(void *vaddr)
154 free_page((unsigned long)vaddr);
157 static inline void *alloc_domain_mem(void)
159 return iommu_kmem_cache_alloc(iommu_domain_cache);
162 static void free_domain_mem(void *vaddr)
164 kmem_cache_free(iommu_domain_cache, vaddr);
167 static inline void * alloc_devinfo_mem(void)
169 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
172 static inline void free_devinfo_mem(void *vaddr)
174 kmem_cache_free(iommu_devinfo_cache, vaddr);
177 struct iova *alloc_iova_mem(void)
179 return iommu_kmem_cache_alloc(iommu_iova_cache);
182 void free_iova_mem(struct iova *iova)
184 kmem_cache_free(iommu_iova_cache, iova);
187 /* Gets context entry for a given bus and devfn */
188 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
191 struct root_entry *root;
192 struct context_entry *context;
193 unsigned long phy_addr;
196 spin_lock_irqsave(&iommu->lock, flags);
197 root = &iommu->root_entry[bus];
198 context = get_context_addr_from_root(root);
200 context = (struct context_entry *)alloc_pgtable_page();
202 spin_unlock_irqrestore(&iommu->lock, flags);
205 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
206 phy_addr = virt_to_phys((void *)context);
207 set_root_value(root, phy_addr);
208 set_root_present(root);
209 __iommu_flush_cache(iommu, root, sizeof(*root));
211 spin_unlock_irqrestore(&iommu->lock, flags);
212 return &context[devfn];
215 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
217 struct root_entry *root;
218 struct context_entry *context;
222 spin_lock_irqsave(&iommu->lock, flags);
223 root = &iommu->root_entry[bus];
224 context = get_context_addr_from_root(root);
229 ret = context_present(context[devfn]);
231 spin_unlock_irqrestore(&iommu->lock, flags);
235 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
237 struct root_entry *root;
238 struct context_entry *context;
241 spin_lock_irqsave(&iommu->lock, flags);
242 root = &iommu->root_entry[bus];
243 context = get_context_addr_from_root(root);
245 context_clear_entry(context[devfn]);
246 __iommu_flush_cache(iommu, &context[devfn], \
249 spin_unlock_irqrestore(&iommu->lock, flags);
252 static void free_context_table(struct intel_iommu *iommu)
254 struct root_entry *root;
257 struct context_entry *context;
259 spin_lock_irqsave(&iommu->lock, flags);
260 if (!iommu->root_entry) {
263 for (i = 0; i < ROOT_ENTRY_NR; i++) {
264 root = &iommu->root_entry[i];
265 context = get_context_addr_from_root(root);
267 free_pgtable_page(context);
269 free_pgtable_page(iommu->root_entry);
270 iommu->root_entry = NULL;
272 spin_unlock_irqrestore(&iommu->lock, flags);
275 /* page table handling */
276 #define LEVEL_STRIDE (9)
277 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
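/*
 * Each page-table level decodes LEVEL_STRIDE (9) bits of the DMA address
 * above the 4KB page offset: level_to_offset_bits(1) == 12,
 * level_to_offset_bits(2) == 21, and so on.  For example, the level-1
 * index of address 0x12345000 is (0x12345000 >> 12) & 0x1ff == 0x145.
 */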
279 static inline int agaw_to_level(int agaw)
284 static inline int agaw_to_width(int agaw)
286 return 30 + agaw * LEVEL_STRIDE;
290 static inline int width_to_agaw(int width)
292 return (width - 30) / LEVEL_STRIDE;
295 static inline unsigned int level_to_offset_bits(int level)
297 return (12 + (level - 1) * LEVEL_STRIDE);
300 static inline int address_level_offset(u64 addr, int level)
302 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
305 static inline u64 level_mask(int level)
307 return ((u64)-1 << level_to_offset_bits(level));
310 static inline u64 level_size(int level)
312 return ((u64)1 << level_to_offset_bits(level));
315 static inline u64 align_to_level(u64 addr, int level)
317 return ((addr + level_size(level) - 1) & level_mask(level));
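/*
 * addr_to_dma_pte() walks the page-table hierarchy for the given address,
 * allocating any missing intermediate tables on the way down, and returns
 * a pointer to the leaf PTE.  Newly written entries are flushed with
 * __iommu_flush_cache() in case the IOMMU is not cache coherent.
 */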
320 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
322 int addr_width = agaw_to_width(domain->agaw);
323 struct dma_pte *parent, *pte = NULL;
324 int level = agaw_to_level(domain->agaw);
328 BUG_ON(!domain->pgd);
330 addr &= (((u64)1) << addr_width) - 1;
331 parent = domain->pgd;
333 spin_lock_irqsave(&domain->mapping_lock, flags);
337 offset = address_level_offset(addr, level);
338 pte = &parent[offset];
342 if (!dma_pte_present(*pte)) {
343 tmp_page = alloc_pgtable_page();
346 spin_unlock_irqrestore(&domain->mapping_lock,
350 __iommu_flush_cache(domain->iommu, tmp_page,
352 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
354 * higher-level tables always set r/w; the last-level page
355 * table controls read/write
357 dma_set_pte_readable(*pte);
358 dma_set_pte_writable(*pte);
359 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
361 parent = phys_to_virt(dma_pte_addr(*pte));
365 spin_unlock_irqrestore(&domain->mapping_lock, flags);
369 /* return address's pte at specific level */
370 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
373 struct dma_pte *parent, *pte = NULL;
374 int total = agaw_to_level(domain->agaw);
377 parent = domain->pgd;
378 while (level <= total) {
379 offset = address_level_offset(addr, total);
380 pte = &parent[offset];
384 if (!dma_pte_present(*pte))
386 parent = phys_to_virt(dma_pte_addr(*pte));
392 /* clear one page's page table */
393 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
395 struct dma_pte *pte = NULL;
397 /* get last level pte */
398 pte = dma_addr_level_pte(domain, addr, 1);
402 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
406 /* clear last level pte, a tlb flush should be followed */
407 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
409 int addr_width = agaw_to_width(domain->agaw);
411 start &= (((u64)1) << addr_width) - 1;
412 end &= (((u64)1) << addr_width) - 1;
413 /* in case it's a partial page */
414 start = PAGE_ALIGN(start);
417 /* we don't need lock here, nobody else touches the iova range */
418 while (start < end) {
419 dma_pte_clear_one(domain, start);
420 start += VTD_PAGE_SIZE;
424 /* free page table pages. last level pte should already be cleared */
425 static void dma_pte_free_pagetable(struct dmar_domain *domain,
428 int addr_width = agaw_to_width(domain->agaw);
430 int total = agaw_to_level(domain->agaw);
434 start &= (((u64)1) << addr_width) - 1;
435 end &= (((u64)1) << addr_width) - 1;
437 /* we don't need lock here, nobody else touches the iova range */
439 while (level <= total) {
440 tmp = align_to_level(start, level);
441 if (tmp >= end || (tmp + level_size(level) > end))
445 pte = dma_addr_level_pte(domain, tmp, level);
448 phys_to_virt(dma_pte_addr(*pte)));
450 __iommu_flush_cache(domain->iommu,
453 tmp += level_size(level);
458 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
459 free_pgtable_page(domain->pgd);
465 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
467 struct root_entry *root;
470 root = (struct root_entry *)alloc_pgtable_page();
474 __iommu_flush_cache(iommu, root, ROOT_SIZE);
476 spin_lock_irqsave(&iommu->lock, flags);
477 iommu->root_entry = root;
478 spin_unlock_irqrestore(&iommu->lock, flags);
483 static void iommu_set_root_entry(struct intel_iommu *iommu)
489 addr = iommu->root_entry;
491 spin_lock_irqsave(&iommu->register_lock, flag);
492 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
494 cmd = iommu->gcmd | DMA_GCMD_SRTP;
495 writel(cmd, iommu->reg + DMAR_GCMD_REG);
497 /* Make sure hardware completes it */
498 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
499 readl, (sts & DMA_GSTS_RTPS), sts);
501 spin_unlock_irqrestore(&iommu->register_lock, flag);
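/*
 * Write-buffer flushing is only needed on hardware that advertises the
 * RWBF capability; it drains internally buffered writes so the IOMMU
 * sees up-to-date root/context/page-table entries in memory.
 */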
504 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
509 if (!cap_rwbf(iommu->cap))
511 val = iommu->gcmd | DMA_GCMD_WBF;
513 spin_lock_irqsave(&iommu->register_lock, flag);
514 writel(val, iommu->reg + DMAR_GCMD_REG);
516 /* Make sure hardware completes it */
517 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
518 readl, (!(val & DMA_GSTS_WBFS)), val);
520 spin_unlock_irqrestore(&iommu->register_lock, flag);
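/*
 * Context-cache invalidation supports three granularities, selected by
 * the type argument below: global, domain-selective (keyed by did), and
 * device-selective (additionally keyed by source_id/function_mask).
 */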
523 /* return value determines whether we need a write-buffer flush */
524 static int __iommu_flush_context(struct intel_iommu *iommu,
525 u16 did, u16 source_id, u8 function_mask, u64 type,
526 int non_present_entry_flush)
532 * In the non-present entry flush case, if the hardware doesn't cache
533 * non-present entries we do nothing; if it does cache non-present
534 * entries, we flush the entries of domain 0 (that domain id is used
535 * to tag any cached non-present entries)
537 if (non_present_entry_flush) {
538 if (!cap_caching_mode(iommu->cap))
545 case DMA_CCMD_GLOBAL_INVL:
546 val = DMA_CCMD_GLOBAL_INVL;
548 case DMA_CCMD_DOMAIN_INVL:
549 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
551 case DMA_CCMD_DEVICE_INVL:
552 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
553 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
560 spin_lock_irqsave(&iommu->register_lock, flag);
561 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
563 /* Make sure hardware completes it */
564 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
565 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
567 spin_unlock_irqrestore(&iommu->register_lock, flag);
569 /* flush context entry will implicitly flush write buffer */
573 /* return value determines whether we need a write-buffer flush */
574 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
575 u64 addr, unsigned int size_order, u64 type,
576 int non_present_entry_flush)
578 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
579 u64 val = 0, val_iva = 0;
583 * In the non-present entry flush case, if the hardware doesn't cache
584 * non-present entries we do nothing; if it does cache non-present
585 * entries, we flush the entries of domain 0 (that domain id is used
586 * to tag any cached non-present entries)
588 if (non_present_entry_flush) {
589 if (!cap_caching_mode(iommu->cap))
596 case DMA_TLB_GLOBAL_FLUSH:
597 /* a global flush doesn't need to set IVA_REG */
598 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
600 case DMA_TLB_DSI_FLUSH:
601 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
603 case DMA_TLB_PSI_FLUSH:
604 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
605 /* Note: always flush non-leaf currently */
606 val_iva = size_order | addr;
611 /* Note: set drain read/write */
614 * This is probably just to be extra safe; it looks like we can
615 * ignore it without any impact.
617 if (cap_read_drain(iommu->cap))
618 val |= DMA_TLB_READ_DRAIN;
620 if (cap_write_drain(iommu->cap))
621 val |= DMA_TLB_WRITE_DRAIN;
623 spin_lock_irqsave(&iommu->register_lock, flag);
624 /* Note: Only uses first TLB reg currently */
626 dmar_writeq(iommu->reg + tlb_offset, val_iva);
627 dmar_writeq(iommu->reg + tlb_offset + 8, val);
629 /* Make sure hardware completes it */
630 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
631 dmar_readq, (!(val & DMA_TLB_IVT)), val);
633 spin_unlock_irqrestore(&iommu->register_lock, flag);
635 /* check IOTLB invalidation granularity */
636 if (DMA_TLB_IAIG(val) == 0)
637 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
638 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
639 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
640 (unsigned long long)DMA_TLB_IIRG(type),
641 (unsigned long long)DMA_TLB_IAIG(val));
642 /* flush iotlb entry will implicitly flush write buffer */
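/*
 * Page-selective invalidation: the address mask is the log2 of the page
 * count rounded up to a power of two, e.g. flushing 3 pages gives
 * mask = ilog2(4) = 2, i.e. an invalidation covering 4 pages (16KB).
 */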
646 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
647 u64 addr, unsigned int pages, int non_present_entry_flush)
651 BUG_ON(addr & (~VTD_PAGE_MASK));
654 /* Fallback to domain selective flush if no PSI support */
655 if (!cap_pgsel_inv(iommu->cap))
656 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
658 non_present_entry_flush);
661 * PSI requires the number of pages to be a power of two, and the base
662 * address to be naturally aligned to the flushed size
664 mask = ilog2(__roundup_pow_of_two(pages));
665 /* Fallback to domain selective flush if size is too big */
666 if (mask > cap_max_amask_val(iommu->cap))
667 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
668 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
670 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
672 non_present_entry_flush);
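/*
 * Clear the Enable Protected Memory Region bit and wait for the status
 * bit to follow.  init_dmars() calls this right before enabling
 * translation, so any firmware-established protected regions (if present)
 * stop blocking DMA once the page tables take over.
 */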
675 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
680 spin_lock_irqsave(&iommu->register_lock, flags);
681 pmen = readl(iommu->reg + DMAR_PMEN_REG);
682 pmen &= ~DMA_PMEN_EPM;
683 writel(pmen, iommu->reg + DMAR_PMEN_REG);
685 /* wait for the protected region status bit to clear */
686 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
687 readl, !(pmen & DMA_PMEN_PRS), pmen);
689 spin_unlock_irqrestore(&iommu->register_lock, flags);
692 static int iommu_enable_translation(struct intel_iommu *iommu)
697 spin_lock_irqsave(&iommu->register_lock, flags);
698 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
700 /* Make sure hardware completes it */
701 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
702 readl, (sts & DMA_GSTS_TES), sts);
704 iommu->gcmd |= DMA_GCMD_TE;
705 spin_unlock_irqrestore(&iommu->register_lock, flags);
709 static int iommu_disable_translation(struct intel_iommu *iommu)
714 spin_lock_irqsave(&iommu->register_lock, flag);
715 iommu->gcmd &= ~DMA_GCMD_TE;
716 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
718 /* Make sure hardware completes it */
719 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
720 readl, (!(sts & DMA_GSTS_TES)), sts);
722 spin_unlock_irqrestore(&iommu->register_lock, flag);
726 /* iommu interrupt handling. Most of it is MSI-like. */
728 static const char *fault_reason_strings[] =
731 "Present bit in root entry is clear",
732 "Present bit in context entry is clear",
733 "Invalid context entry",
734 "Access beyond MGAW",
735 "PTE Write access is not set",
736 "PTE Read access is not set",
737 "Next page table ptr is invalid",
738 "Root table address invalid",
739 "Context table ptr is invalid",
740 "non-zero reserved fields in RTP",
741 "non-zero reserved fields in CTP",
742 "non-zero reserved fields in PTE",
744 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
746 const char *dmar_get_fault_reason(u8 fault_reason)
748 if (fault_reason > MAX_FAULT_REASON_IDX)
751 return fault_reason_strings[fault_reason];
754 void dmar_msi_unmask(unsigned int irq)
756 struct intel_iommu *iommu = get_irq_data(irq);
760 spin_lock_irqsave(&iommu->register_lock, flag);
761 writel(0, iommu->reg + DMAR_FECTL_REG);
762 /* Read back a register to force the posted write to be flushed */
763 readl(iommu->reg + DMAR_FECTL_REG);
764 spin_unlock_irqrestore(&iommu->register_lock, flag);
767 void dmar_msi_mask(unsigned int irq)
770 struct intel_iommu *iommu = get_irq_data(irq);
773 spin_lock_irqsave(&iommu->register_lock, flag);
774 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
775 /* Read back a register to force the posted write to be flushed */
776 readl(iommu->reg + DMAR_FECTL_REG);
777 spin_unlock_irqrestore(&iommu->register_lock, flag);
780 void dmar_msi_write(int irq, struct msi_msg *msg)
782 struct intel_iommu *iommu = get_irq_data(irq);
785 spin_lock_irqsave(&iommu->register_lock, flag);
786 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
787 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
788 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
789 spin_unlock_irqrestore(&iommu->register_lock, flag);
792 void dmar_msi_read(int irq, struct msi_msg *msg)
794 struct intel_iommu *iommu = get_irq_data(irq);
797 spin_lock_irqsave(&iommu->register_lock, flag);
798 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
799 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
800 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
801 spin_unlock_irqrestore(&iommu->register_lock, flag);
804 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
805 u8 fault_reason, u16 source_id, unsigned long long addr)
809 reason = dmar_get_fault_reason(fault_reason);
812 "DMAR:[%s] Request device [%02x:%02x.%d] "
814 "DMAR:[fault reason %02d] %s\n",
815 (type ? "DMA Read" : "DMA Write"),
816 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
817 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
821 #define PRIMARY_FAULT_REG_LEN (16)
822 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
824 struct intel_iommu *iommu = dev_id;
825 int reg, fault_index;
829 spin_lock_irqsave(&iommu->register_lock, flag);
830 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
832 /* TBD: ignore advanced fault log currently */
833 if (!(fault_status & DMA_FSTS_PPF))
836 fault_index = dma_fsts_fault_record_index(fault_status);
837 reg = cap_fault_reg_offset(iommu->cap);
845 /* highest 32 bits */
846 data = readl(iommu->reg + reg +
847 fault_index * PRIMARY_FAULT_REG_LEN + 12);
848 if (!(data & DMA_FRCD_F))
851 fault_reason = dma_frcd_fault_reason(data);
852 type = dma_frcd_type(data);
854 data = readl(iommu->reg + reg +
855 fault_index * PRIMARY_FAULT_REG_LEN + 8);
856 source_id = dma_frcd_source_id(data);
858 guest_addr = dmar_readq(iommu->reg + reg +
859 fault_index * PRIMARY_FAULT_REG_LEN);
860 guest_addr = dma_frcd_page_addr(guest_addr);
861 /* clear the fault */
862 writel(DMA_FRCD_F, iommu->reg + reg +
863 fault_index * PRIMARY_FAULT_REG_LEN + 12);
865 spin_unlock_irqrestore(&iommu->register_lock, flag);
867 iommu_page_fault_do_one(iommu, type, fault_reason,
868 source_id, guest_addr);
871 if (fault_index > cap_num_fault_regs(iommu->cap))
873 spin_lock_irqsave(&iommu->register_lock, flag);
876 /* clear primary fault overflow */
877 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
878 if (fault_status & DMA_FSTS_PFO)
879 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
881 spin_unlock_irqrestore(&iommu->register_lock, flag);
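/*
 * Allocate an irq/vector for the DMAR fault event, program it through the
 * MSI-like fault event registers, clear any faults that are already
 * pending, and install iommu_page_fault() as the handler.
 */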
885 int dmar_set_interrupt(struct intel_iommu *iommu)
891 printk(KERN_ERR "IOMMU: no free vectors\n");
895 set_irq_data(irq, iommu);
898 ret = arch_setup_dmar_msi(irq);
900 set_irq_data(irq, NULL);
906 /* Clear any faults that are already pending before the handler is registered */
907 iommu_page_fault(irq, iommu);
909 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
911 printk(KERN_ERR "IOMMU: can't request irq\n");
915 static int iommu_init_domains(struct intel_iommu *iommu)
917 unsigned long ndomains;
918 unsigned long nlongs;
920 ndomains = cap_ndoms(iommu->cap);
921 pr_debug("Number of Domains supported <%ld>\n", ndomains);
922 nlongs = BITS_TO_LONGS(ndomains);
924 /* TBD: there might be 64K domains,
925 * consider other allocation for future chip
927 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
928 if (!iommu->domain_ids) {
929 printk(KERN_ERR "Allocating domain id array failed\n");
932 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
934 if (!iommu->domains) {
935 printk(KERN_ERR "Allocating domain array failed\n");
936 kfree(iommu->domain_ids);
940 spin_lock_init(&iommu->lock);
943 * if Caching mode is set, then invalid translations are tagged
944 * with domain id 0. Hence we need to pre-allocate it.
946 if (cap_caching_mode(iommu->cap))
947 set_bit(0, iommu->domain_ids);
952 static void domain_exit(struct dmar_domain *domain);
954 void free_dmar_iommu(struct intel_iommu *iommu)
956 struct dmar_domain *domain;
959 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
960 for (; i < cap_ndoms(iommu->cap); ) {
961 domain = iommu->domains[i];
962 clear_bit(i, iommu->domain_ids);
964 i = find_next_bit(iommu->domain_ids,
965 cap_ndoms(iommu->cap), i+1);
968 if (iommu->gcmd & DMA_GCMD_TE)
969 iommu_disable_translation(iommu);
972 set_irq_data(iommu->irq, NULL);
973 /* This will mask the irq */
974 free_irq(iommu->irq, iommu);
975 destroy_irq(iommu->irq);
978 kfree(iommu->domains);
979 kfree(iommu->domain_ids);
981 /* free context mapping */
982 free_context_table(iommu);
985 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
988 unsigned long ndomains;
989 struct dmar_domain *domain;
992 domain = alloc_domain_mem();
996 ndomains = cap_ndoms(iommu->cap);
998 spin_lock_irqsave(&iommu->lock, flags);
999 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1000 if (num >= ndomains) {
1001 spin_unlock_irqrestore(&iommu->lock, flags);
1002 free_domain_mem(domain);
1003 printk(KERN_ERR "IOMMU: no free domain ids\n");
1007 set_bit(num, iommu->domain_ids);
1009 domain->iommu = iommu;
1010 iommu->domains[num] = domain;
1011 spin_unlock_irqrestore(&iommu->lock, flags);
1016 static void iommu_free_domain(struct dmar_domain *domain)
1018 unsigned long flags;
1020 spin_lock_irqsave(&domain->iommu->lock, flags);
1021 clear_bit(domain->id, domain->iommu->domain_ids);
1022 spin_unlock_irqrestore(&domain->iommu->lock, flags);
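/*
 * reserved_iova_list holds IOVA ranges that must never be handed out to
 * devices: the IOAPIC MMIO window and every PCI MMIO resource (so an
 * IOVA can never alias a peer device's BAR).  It is copied into each new
 * domain by domain_reserve_special_ranges().
 */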
1025 static struct iova_domain reserved_iova_list;
1026 static struct lock_class_key reserved_alloc_key;
1027 static struct lock_class_key reserved_rbtree_key;
1029 static void dmar_init_reserved_ranges(void)
1031 struct pci_dev *pdev = NULL;
1036 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1038 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1039 &reserved_alloc_key);
1040 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1041 &reserved_rbtree_key);
1043 /* IOAPIC ranges shouldn't be accessed by DMA */
1044 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1045 IOVA_PFN(IOAPIC_RANGE_END));
1047 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1049 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1050 for_each_pci_dev(pdev) {
1053 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1054 r = &pdev->resource[i];
1055 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1059 size = r->end - addr;
1060 size = PAGE_ALIGN(size);
1061 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1062 IOVA_PFN(size + addr) - 1);
1064 printk(KERN_ERR "Reserve iova failed\n");
1070 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1072 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
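/*
 * Guest address widths are rounded up so that the width above the 12-bit
 * page offset is a multiple of LEVEL_STRIDE, matching what the page-table
 * levels can actually express (e.g. 39 or 48 bits).
 */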
1075 static inline int guestwidth_to_adjustwidth(int gaw)
1078 int r = (gaw - 12) % 9;
1089 static int domain_init(struct dmar_domain *domain, int guest_width)
1091 struct intel_iommu *iommu;
1092 int adjust_width, agaw;
1093 unsigned long sagaw;
1095 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1096 spin_lock_init(&domain->mapping_lock);
1098 domain_reserve_special_ranges(domain);
1100 /* calculate AGAW */
1101 iommu = domain->iommu;
1102 if (guest_width > cap_mgaw(iommu->cap))
1103 guest_width = cap_mgaw(iommu->cap);
1104 domain->gaw = guest_width;
1105 adjust_width = guestwidth_to_adjustwidth(guest_width);
1106 agaw = width_to_agaw(adjust_width);
1107 sagaw = cap_sagaw(iommu->cap);
1108 if (!test_bit(agaw, &sagaw)) {
1109 /* hardware doesn't support it, choose a bigger one */
1110 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1111 agaw = find_next_bit(&sagaw, 5, agaw);
1115 domain->agaw = agaw;
1116 INIT_LIST_HEAD(&domain->devices);
1118 /* always allocate the top pgd */
1119 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1122 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1126 static void domain_exit(struct dmar_domain *domain)
1130 /* Domain 0 is reserved, so don't process it */
1134 domain_remove_dev_info(domain);
1136 put_iova_domain(&domain->iovad);
1137 end = DOMAIN_MAX_ADDR(domain->gaw);
1138 end = end & (~PAGE_MASK);
1141 dma_pte_clear_range(domain, 0, end);
1143 /* free page tables */
1144 dma_pte_free_pagetable(domain, 0, end);
1146 iommu_free_domain(domain);
1147 free_domain_mem(domain);
1150 static int domain_context_mapping_one(struct dmar_domain *domain,
1153 struct context_entry *context;
1154 struct intel_iommu *iommu = domain->iommu;
1155 unsigned long flags;
1157 pr_debug("Set context mapping for %02x:%02x.%d\n",
1158 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1159 BUG_ON(!domain->pgd);
1160 context = device_to_context_entry(iommu, bus, devfn);
1163 spin_lock_irqsave(&iommu->lock, flags);
1164 if (context_present(*context)) {
1165 spin_unlock_irqrestore(&iommu->lock, flags);
1169 context_set_domain_id(*context, domain->id);
1170 context_set_address_width(*context, domain->agaw);
1171 context_set_address_root(*context, virt_to_phys(domain->pgd));
1172 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1173 context_set_fault_enable(*context);
1174 context_set_present(*context);
1175 __iommu_flush_cache(iommu, context, sizeof(*context));
1177 /* it's a non-present to present mapping */
1178 if (iommu->flush.flush_context(iommu, domain->id,
1179 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1180 DMA_CCMD_DEVICE_INVL, 1))
1181 iommu_flush_write_buffer(iommu);
1183 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1185 spin_unlock_irqrestore(&iommu->lock, flags);
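/*
 * Besides the device itself, context entries are also set up for every
 * bridge on the path to the topmost PCIe-to-PCI bridge, since DMA from
 * devices behind such a bridge can carry the bridge's source-id.
 */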
1190 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1193 struct pci_dev *tmp, *parent;
1195 ret = domain_context_mapping_one(domain, pdev->bus->number,
1200 /* dependent device mapping */
1201 tmp = pci_find_upstream_pcie_bridge(pdev);
1204 /* Secondary interface's bus number and devfn 0 */
1205 parent = pdev->bus->self;
1206 while (parent != tmp) {
1207 ret = domain_context_mapping_one(domain, parent->bus->number,
1211 parent = parent->bus->self;
1213 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1214 return domain_context_mapping_one(domain,
1215 tmp->subordinate->number, 0);
1216 else /* this is a legacy PCI bridge */
1217 return domain_context_mapping_one(domain,
1218 tmp->bus->number, tmp->devfn);
1221 static int domain_context_mapped(struct dmar_domain *domain,
1222 struct pci_dev *pdev)
1225 struct pci_dev *tmp, *parent;
1227 ret = device_context_mapped(domain->iommu,
1228 pdev->bus->number, pdev->devfn);
1231 /* dependent device mapping */
1232 tmp = pci_find_upstream_pcie_bridge(pdev);
1235 /* Secondary interface's bus number and devfn 0 */
1236 parent = pdev->bus->self;
1237 while (parent != tmp) {
1238 ret = device_context_mapped(domain->iommu, parent->bus->number,
1242 parent = parent->bus->self;
1245 return device_context_mapped(domain->iommu,
1246 tmp->subordinate->number, 0);
1248 return device_context_mapped(domain->iommu,
1249 tmp->bus->number, tmp->devfn);
1253 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1254 u64 hpa, size_t size, int prot)
1256 u64 start_pfn, end_pfn;
1257 struct dma_pte *pte;
1259 int addr_width = agaw_to_width(domain->agaw);
1261 hpa &= (((u64)1) << addr_width) - 1;
1263 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1266 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1267 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1269 while (start_pfn < end_pfn) {
1270 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1273 /* We don't need lock here, nobody else
1274 * touches the iova range
1276 BUG_ON(dma_pte_addr(*pte));
1277 dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1278 dma_set_pte_prot(*pte, prot);
1279 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1286 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1288 clear_context_table(domain->iommu, bus, devfn);
1289 domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1290 DMA_CCMD_GLOBAL_INVL, 0);
1291 domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1292 DMA_TLB_GLOBAL_FLUSH, 0);
1295 static void domain_remove_dev_info(struct dmar_domain *domain)
1297 struct device_domain_info *info;
1298 unsigned long flags;
1300 spin_lock_irqsave(&device_domain_lock, flags);
1301 while (!list_empty(&domain->devices)) {
1302 info = list_entry(domain->devices.next,
1303 struct device_domain_info, link);
1304 list_del(&info->link);
1305 list_del(&info->global);
1307 info->dev->dev.archdata.iommu = NULL;
1308 spin_unlock_irqrestore(&device_domain_lock, flags);
1310 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1311 free_devinfo_mem(info);
1313 spin_lock_irqsave(&device_domain_lock, flags);
1315 spin_unlock_irqrestore(&device_domain_lock, flags);
1320 * Note: struct pci_dev->dev.archdata.iommu stores the per-device domain info
1322 static struct dmar_domain *
1323 find_domain(struct pci_dev *pdev)
1325 struct device_domain_info *info;
1327 /* No lock here, assumes no domain exit in normal case */
1328 info = pdev->dev.archdata.iommu;
1330 return info->domain;
1334 /* domain is initialized */
1335 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1337 struct dmar_domain *domain, *found = NULL;
1338 struct intel_iommu *iommu;
1339 struct dmar_drhd_unit *drhd;
1340 struct device_domain_info *info, *tmp;
1341 struct pci_dev *dev_tmp;
1342 unsigned long flags;
1343 int bus = 0, devfn = 0;
1345 domain = find_domain(pdev);
1349 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1351 if (dev_tmp->is_pcie) {
1352 bus = dev_tmp->subordinate->number;
1355 bus = dev_tmp->bus->number;
1356 devfn = dev_tmp->devfn;
1358 spin_lock_irqsave(&device_domain_lock, flags);
1359 list_for_each_entry(info, &device_domain_list, global) {
1360 if (info->bus == bus && info->devfn == devfn) {
1361 found = info->domain;
1365 spin_unlock_irqrestore(&device_domain_lock, flags);
1366 /* pcie-pci bridge already has a domain, use it */
1373 /* Allocate new domain for the device */
1374 drhd = dmar_find_matched_drhd_unit(pdev);
1376 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1380 iommu = drhd->iommu;
1382 domain = iommu_alloc_domain(iommu);
1386 if (domain_init(domain, gaw)) {
1387 domain_exit(domain);
1391 /* register pcie-to-pci device */
1393 info = alloc_devinfo_mem();
1395 domain_exit(domain);
1399 info->devfn = devfn;
1401 info->domain = domain;
1402 /* This domain is shared by devices under p2p bridge */
1403 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1405 /* pcie-to-pci bridge already has a domain, use it */
1407 spin_lock_irqsave(&device_domain_lock, flags);
1408 list_for_each_entry(tmp, &device_domain_list, global) {
1409 if (tmp->bus == bus && tmp->devfn == devfn) {
1410 found = tmp->domain;
1415 free_devinfo_mem(info);
1416 domain_exit(domain);
1419 list_add(&info->link, &domain->devices);
1420 list_add(&info->global, &device_domain_list);
1422 spin_unlock_irqrestore(&device_domain_lock, flags);
1426 info = alloc_devinfo_mem();
1429 info->bus = pdev->bus->number;
1430 info->devfn = pdev->devfn;
1432 info->domain = domain;
1433 spin_lock_irqsave(&device_domain_lock, flags);
1434 /* somebody is fast */
1435 found = find_domain(pdev);
1436 if (found != NULL) {
1437 spin_unlock_irqrestore(&device_domain_lock, flags);
1438 if (found != domain) {
1439 domain_exit(domain);
1442 free_devinfo_mem(info);
1445 list_add(&info->link, &domain->devices);
1446 list_add(&info->global, &device_domain_list);
1447 pdev->dev.archdata.iommu = info;
1448 spin_unlock_irqrestore(&device_domain_lock, flags);
1451 /* recheck it here, maybe others set it */
1452 return find_domain(pdev);
1455 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1456 unsigned long long start,
1457 unsigned long long end)
1459 struct dmar_domain *domain;
1461 unsigned long long base;
1465 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1466 pci_name(pdev), start, end);
1467 /* page table init */
1468 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1472 /* The address might not be aligned */
1473 base = start & PAGE_MASK;
1475 size = PAGE_ALIGN(size);
1476 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1477 IOVA_PFN(base + size) - 1)) {
1478 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1483 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1484 size, base, pci_name(pdev));
1486 * The RMRR range might overlap with a physical memory range,
1489 dma_pte_clear_range(domain, base, base + size);
1491 ret = domain_page_mapping(domain, base, base, size,
1492 DMA_PTE_READ|DMA_PTE_WRITE);
1496 /* context entry init */
1497 ret = domain_context_mapping(domain, pdev);
1501 domain_exit(domain);
1506 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1507 struct pci_dev *pdev)
1509 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1511 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1512 rmrr->end_address + 1);
1515 #ifdef CONFIG_DMAR_GFX_WA
1516 struct iommu_prepare_data {
1517 struct pci_dev *pdev;
1521 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1522 unsigned long end_pfn, void *datax)
1524 struct iommu_prepare_data *data;
1526 data = (struct iommu_prepare_data *)datax;
1528 data->ret = iommu_prepare_identity_map(data->pdev,
1529 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1534 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1537 struct iommu_prepare_data data;
1542 for_each_online_node(nid) {
1543 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1550 static void __init iommu_prepare_gfx_mapping(void)
1552 struct pci_dev *pdev = NULL;
1555 for_each_pci_dev(pdev) {
1556 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1557 !IS_GFX_DEVICE(pdev))
1559 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1561 ret = iommu_prepare_with_active_regions(pdev);
1563 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1568 #ifdef CONFIG_DMAR_FLOPPY_WA
1569 static inline void iommu_prepare_isa(void)
1571 struct pci_dev *pdev;
1574 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1578 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1579 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1582 printk("IOMMU: Failed to create 0-16M identity map, "
1583 "floppy might not work\n");
1587 static inline void iommu_prepare_isa(void)
1591 #endif /* !CONFIG_DMAR_FLOPPY_WA */
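/*
 * init_dmars() brings up every DMAR unit: allocate the domain-id bitmap
 * and root entry per IOMMU, pick queued vs. register-based invalidation,
 * set up identity maps for RMRRs, graphics (CONFIG_DMAR_GFX_WA) and the
 * ISA/floppy range (CONFIG_DMAR_FLOPPY_WA), then program the root
 * pointer, flush the caches and enable translation.
 */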
1593 int __init init_dmars(void)
1595 struct dmar_drhd_unit *drhd;
1596 struct dmar_rmrr_unit *rmrr;
1597 struct pci_dev *pdev;
1598 struct intel_iommu *iommu;
1599 int i, ret, unit = 0;
1604 * initialize and program root entry to not present
1607 for_each_drhd_unit(drhd) {
1610 * lock not needed as this is only incremented in the single-
1611 * threaded kernel __init code path; all other accesses are read-only
1616 deferred_flush = kzalloc(g_num_of_iommus *
1617 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1618 if (!deferred_flush) {
1623 for_each_drhd_unit(drhd) {
1627 iommu = drhd->iommu;
1629 ret = iommu_init_domains(iommu);
1635 * we could share the same root & context tables
1636 * among all IOMMUs. Need to split it later.
1638 ret = iommu_alloc_root_entry(iommu);
1640 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1645 for_each_drhd_unit(drhd) {
1649 iommu = drhd->iommu;
1650 if (dmar_enable_qi(iommu)) {
1652 * Queued Invalidate not enabled, use Register Based
1655 iommu->flush.flush_context = __iommu_flush_context;
1656 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1657 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1658 "invalidation\n", drhd->reg_base_addr);
1660 iommu->flush.flush_context = qi_flush_context;
1661 iommu->flush.flush_iotlb = qi_flush_iotlb;
1662 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1663 "invalidation\n", drhd->reg_base_addr);
1669 * for each dev attached to rmrr
1671 * locate drhd for dev, alloc domain for dev
1672 * allocate free domain
1673 * allocate page table entries for rmrr
1674 * if context not allocated for bus
1675 * allocate and init context
1676 * set present in root table for this bus
1677 * init context with domain, translation etc
1681 for_each_rmrr_units(rmrr) {
1682 for (i = 0; i < rmrr->devices_cnt; i++) {
1683 pdev = rmrr->devices[i];
1684 /* some BIOSes list non-existent devices in the DMAR table */
1687 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1690 "IOMMU: mapping reserved region failed\n");
1694 iommu_prepare_gfx_mapping();
1696 iommu_prepare_isa();
1701 * global invalidate context cache
1702 * global invalidate iotlb
1703 * enable translation
1705 for_each_drhd_unit(drhd) {
1708 iommu = drhd->iommu;
1709 sprintf (iommu->name, "dmar%d", unit++);
1711 iommu_flush_write_buffer(iommu);
1713 ret = dmar_set_interrupt(iommu);
1717 iommu_set_root_entry(iommu);
1719 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1721 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1723 iommu_disable_protect_mem_regions(iommu);
1725 ret = iommu_enable_translation(iommu);
1732 for_each_drhd_unit(drhd) {
1735 iommu = drhd->iommu;
1741 static inline u64 aligned_size(u64 host_addr, size_t size)
1744 addr = (host_addr & (~PAGE_MASK)) + size;
1745 return PAGE_ALIGN(addr);
1749 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1753 /* Make sure it's in range */
1754 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1755 if (!size || (IOVA_START_ADDR + size > end))
1758 piova = alloc_iova(&domain->iovad,
1759 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1763 static struct iova *
1764 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1765 size_t size, u64 dma_mask)
1767 struct pci_dev *pdev = to_pci_dev(dev);
1768 struct iova *iova = NULL;
1770 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1771 iova = iommu_alloc_iova(domain, size, dma_mask);
1774 * First try to allocate an io virtual address in
1775 * DMA_32BIT_MASK and if that fails then try allocating
1778 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1780 iova = iommu_alloc_iova(domain, size, dma_mask);
1784 printk(KERN_ERR"Allocating iova for %s failed\n", pci_name(pdev));
1791 static struct dmar_domain *
1792 get_valid_domain_for_dev(struct pci_dev *pdev)
1794 struct dmar_domain *domain;
1797 domain = get_domain_for_dev(pdev,
1798 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1801 "Allocating domain for %s failed", pci_name(pdev));
1805 /* make sure context mapping is ok */
1806 if (unlikely(!domain_context_mapped(domain, pdev))) {
1807 ret = domain_context_mapping(domain, pdev);
1810 "Domain context map for %s failed",
1819 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1820 size_t size, int dir, u64 dma_mask)
1822 struct pci_dev *pdev = to_pci_dev(hwdev);
1823 struct dmar_domain *domain;
1824 phys_addr_t start_paddr;
1829 BUG_ON(dir == DMA_NONE);
1830 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1833 domain = get_valid_domain_for_dev(pdev);
1837 size = aligned_size((u64)paddr, size);
1839 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1843 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1846 * Check if DMAR supports zero-length reads on write only
1849 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1850 !cap_zlr(domain->iommu->cap))
1851 prot |= DMA_PTE_READ;
1852 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1853 prot |= DMA_PTE_WRITE;
1855 * paddr - (paddr + size) might cover only part of a page, so we map the
1856 * whole page. Note: if two parts of one page are mapped separately, we
1857 * might have two guest addresses mapping to the same host paddr, but
1858 * this is not a big problem
1860 ret = domain_page_mapping(domain, start_paddr,
1861 ((u64)paddr) & PAGE_MASK, size, prot);
1865 /* it's a non-present to present mapping */
1866 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1867 start_paddr, size >> VTD_PAGE_SHIFT, 1);
1869 iommu_flush_write_buffer(domain->iommu);
1871 return start_paddr + ((u64)paddr & (~PAGE_MASK));
1875 __free_iova(&domain->iovad, iova);
1876 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1877 pci_name(pdev), size, (unsigned long long)paddr, dir);
1881 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
1882 size_t size, int dir)
1884 return __intel_map_single(hwdev, paddr, size, dir,
1885 to_pci_dev(hwdev)->dma_mask);
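/*
 * flush_unmaps() is called with async_umap_flush_lock held; it issues
 * one global IOTLB flush per IOMMU that has pending entries and then
 * frees all of the queued IOVAs.
 */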
1888 static void flush_unmaps(void)
1894 /* just flush them all */
1895 for (i = 0; i < g_num_of_iommus; i++) {
1896 if (deferred_flush[i].next) {
1897 struct intel_iommu *iommu =
1898 deferred_flush[i].domain[0]->iommu;
1900 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1901 DMA_TLB_GLOBAL_FLUSH, 0);
1902 for (j = 0; j < deferred_flush[i].next; j++) {
1903 __free_iova(&deferred_flush[i].domain[j]->iovad,
1904 deferred_flush[i].iova[j]);
1906 deferred_flush[i].next = 0;
1913 static void flush_unmaps_timeout(unsigned long data)
1915 unsigned long flags;
1917 spin_lock_irqsave(&async_umap_flush_lock, flags);
1919 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1922 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1924 unsigned long flags;
1927 spin_lock_irqsave(&async_umap_flush_lock, flags);
1928 if (list_size == HIGH_WATER_MARK)
1931 iommu_id = dom->iommu->seq_id;
1933 next = deferred_flush[iommu_id].next;
1934 deferred_flush[iommu_id].domain[next] = dom;
1935 deferred_flush[iommu_id].iova[next] = iova;
1936 deferred_flush[iommu_id].next++;
1939 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1943 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
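/*
 * Unmap path: the PTEs and page tables are always torn down immediately;
 * with intel_iommu=strict the IOTLB is also flushed synchronously and the
 * IOVA freed right away, otherwise the IOVA is queued via add_unmap() and
 * reclaimed lazily by the deferred flush machinery above.
 */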
1946 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
1949 struct pci_dev *pdev = to_pci_dev(dev);
1950 struct dmar_domain *domain;
1951 unsigned long start_addr;
1954 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1956 domain = find_domain(pdev);
1959 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1963 start_addr = iova->pfn_lo << PAGE_SHIFT;
1964 size = aligned_size((u64)dev_addr, size);
1966 pr_debug("Device %s unmapping: %lx@%llx\n",
1967 pci_name(pdev), size, (unsigned long long)start_addr);
1969 /* clear the whole page */
1970 dma_pte_clear_range(domain, start_addr, start_addr + size);
1971 /* free page tables */
1972 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1973 if (intel_iommu_strict) {
1974 if (iommu_flush_iotlb_psi(domain->iommu,
1975 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
1976 iommu_flush_write_buffer(domain->iommu);
1978 __free_iova(&domain->iovad, iova);
1980 add_unmap(domain, iova);
1982 * queue up the release of the unmap to save the roughly 1/6th of the
1983 * CPU time otherwise spent in the iotlb flush operation...
1988 void *intel_alloc_coherent(struct device *hwdev, size_t size,
1989 dma_addr_t *dma_handle, gfp_t flags)
1994 size = PAGE_ALIGN(size);
1995 order = get_order(size);
1996 flags &= ~(GFP_DMA | GFP_DMA32);
1998 vaddr = (void *)__get_free_pages(flags, order);
2001 memset(vaddr, 0, size);
2003 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2005 hwdev->coherent_dma_mask);
2008 free_pages((unsigned long)vaddr, order);
2012 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2013 dma_addr_t dma_handle)
2017 size = PAGE_ALIGN(size);
2018 order = get_order(size);
2020 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2021 free_pages((unsigned long)vaddr, order);
2024 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2026 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2027 int nelems, int dir)
2030 struct pci_dev *pdev = to_pci_dev(hwdev);
2031 struct dmar_domain *domain;
2032 unsigned long start_addr;
2036 struct scatterlist *sg;
2038 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2041 domain = find_domain(pdev);
2043 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2046 for_each_sg(sglist, sg, nelems, i) {
2047 addr = SG_ENT_VIRT_ADDRESS(sg);
2048 size += aligned_size((u64)addr, sg->length);
2051 start_addr = iova->pfn_lo << PAGE_SHIFT;
2053 /* clear the whole page */
2054 dma_pte_clear_range(domain, start_addr, start_addr + size);
2055 /* free page tables */
2056 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2058 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2059 size >> VTD_PAGE_SHIFT, 0))
2060 iommu_flush_write_buffer(domain->iommu);
2063 __free_iova(&domain->iovad, iova);
2066 static int intel_nontranslate_map_sg(struct device *hddev,
2067 struct scatterlist *sglist, int nelems, int dir)
2070 struct scatterlist *sg;
2072 for_each_sg(sglist, sg, nelems, i) {
2073 BUG_ON(!sg_page(sg));
2074 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2075 sg->dma_length = sg->length;
2080 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2085 struct pci_dev *pdev = to_pci_dev(hwdev);
2086 struct dmar_domain *domain;
2090 struct iova *iova = NULL;
2092 struct scatterlist *sg;
2093 unsigned long start_addr;
2095 BUG_ON(dir == DMA_NONE);
2096 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2097 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2099 domain = get_valid_domain_for_dev(pdev);
2103 for_each_sg(sglist, sg, nelems, i) {
2104 addr = SG_ENT_VIRT_ADDRESS(sg);
2105 addr = (void *)virt_to_phys(addr);
2106 size += aligned_size((u64)addr, sg->length);
2109 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2111 sglist->dma_length = 0;
2116 * Check if DMAR supports zero-length reads on write only
2119 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2120 !cap_zlr(domain->iommu->cap))
2121 prot |= DMA_PTE_READ;
2122 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2123 prot |= DMA_PTE_WRITE;
2125 start_addr = iova->pfn_lo << PAGE_SHIFT;
2127 for_each_sg(sglist, sg, nelems, i) {
2128 addr = SG_ENT_VIRT_ADDRESS(sg);
2129 addr = (void *)virt_to_phys(addr);
2130 size = aligned_size((u64)addr, sg->length);
2131 ret = domain_page_mapping(domain, start_addr + offset,
2132 ((u64)addr) & PAGE_MASK,
2135 /* clear the page */
2136 dma_pte_clear_range(domain, start_addr,
2137 start_addr + offset);
2138 /* free page tables */
2139 dma_pte_free_pagetable(domain, start_addr,
2140 start_addr + offset);
2142 __free_iova(&domain->iovad, iova);
2145 sg->dma_address = start_addr + offset +
2146 ((u64)addr & (~PAGE_MASK));
2147 sg->dma_length = sg->length;
2151 /* it's a non-present to present mapping */
2152 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2153 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2154 iommu_flush_write_buffer(domain->iommu);
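/*
 * The dma_mapping_ops below is what PCI drivers end up using through the
 * generic DMA API once intel_iommu_init() installs it as dma_ops; e.g. a
 * dma_map_single() call from a driver is dispatched to intel_map_single()
 * above.
 */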
2158 static struct dma_mapping_ops intel_dma_ops = {
2159 .alloc_coherent = intel_alloc_coherent,
2160 .free_coherent = intel_free_coherent,
2161 .map_single = intel_map_single,
2162 .unmap_single = intel_unmap_single,
2163 .map_sg = intel_map_sg,
2164 .unmap_sg = intel_unmap_sg,
2167 static inline int iommu_domain_cache_init(void)
2171 iommu_domain_cache = kmem_cache_create("iommu_domain",
2172 sizeof(struct dmar_domain),
2177 if (!iommu_domain_cache) {
2178 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2185 static inline int iommu_devinfo_cache_init(void)
2189 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2190 sizeof(struct device_domain_info),
2194 if (!iommu_devinfo_cache) {
2195 printk(KERN_ERR "Couldn't create devinfo cache\n");
2202 static inline int iommu_iova_cache_init(void)
2206 iommu_iova_cache = kmem_cache_create("iommu_iova",
2207 sizeof(struct iova),
2211 if (!iommu_iova_cache) {
2212 printk(KERN_ERR "Couldn't create iova cache\n");
2219 static int __init iommu_init_mempool(void)
2222 ret = iommu_iova_cache_init();
2226 ret = iommu_domain_cache_init();
2230 ret = iommu_devinfo_cache_init();
2234 kmem_cache_destroy(iommu_domain_cache);
2236 kmem_cache_destroy(iommu_iova_cache);
2241 static void __init iommu_exit_mempool(void)
2243 kmem_cache_destroy(iommu_devinfo_cache);
2244 kmem_cache_destroy(iommu_domain_cache);
2245 kmem_cache_destroy(iommu_iova_cache);
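/*
 * Mark as ignored any DRHD unit that covers no PCI devices and, when
 * graphics mapping is disabled (intel_iommu=igfx_off), any unit whose
 * devices are all graphics devices; such devices get the
 * DUMMY_DEVICE_DOMAIN_INFO marker, which the map/unmap paths treat as
 * untranslated.
 */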
2249 static void __init init_no_remapping_devices(void)
2251 struct dmar_drhd_unit *drhd;
2253 for_each_drhd_unit(drhd) {
2254 if (!drhd->include_all) {
2256 for (i = 0; i < drhd->devices_cnt; i++)
2257 if (drhd->devices[i] != NULL)
2259 /* ignore DMAR unit if no pci devices exist */
2260 if (i == drhd->devices_cnt)
2268 for_each_drhd_unit(drhd) {
2270 if (drhd->ignored || drhd->include_all)
2273 for (i = 0; i < drhd->devices_cnt; i++)
2274 if (drhd->devices[i] &&
2275 !IS_GFX_DEVICE(drhd->devices[i]))
2278 if (i < drhd->devices_cnt)
2281 /* bypass IOMMU if it is just for gfx devices */
2283 for (i = 0; i < drhd->devices_cnt; i++) {
2284 if (!drhd->devices[i])
2286 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2291 int __init intel_iommu_init(void)
2295 if (dmar_table_init())
2298 if (dmar_dev_scope_init())
2302 * Check the need for DMA-remapping initialization now.
2303 * Above initialization will also be used by Interrupt-remapping.
2305 if (no_iommu || swiotlb || dmar_disabled)
2308 iommu_init_mempool();
2309 dmar_init_reserved_ranges();
2311 init_no_remapping_devices();
2315 printk(KERN_ERR "IOMMU: dmar init failed\n");
2316 put_iova_domain(&reserved_iova_list);
2317 iommu_exit_mempool();
2321 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2323 init_timer(&unmap_timer);
2325 dma_ops = &intel_dma_ops;
2329 void intel_iommu_domain_exit(struct dmar_domain *domain)
2333 /* Domain 0 is reserved, so don't process it */
2337 end = DOMAIN_MAX_ADDR(domain->gaw);
2338 end = end & (~VTD_PAGE_MASK);
2341 dma_pte_clear_range(domain, 0, end);
2343 /* free page tables */
2344 dma_pte_free_pagetable(domain, 0, end);
2346 iommu_free_domain(domain);
2347 free_domain_mem(domain);
2349 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2351 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2353 struct dmar_drhd_unit *drhd;
2354 struct dmar_domain *domain;
2355 struct intel_iommu *iommu;
2357 drhd = dmar_find_matched_drhd_unit(pdev);
2359 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2363 iommu = drhd->iommu;
2366 "intel_iommu_domain_alloc: iommu == NULL\n");
2369 domain = iommu_alloc_domain(iommu);
2372 "intel_iommu_domain_alloc: domain == NULL\n");
2375 if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2377 "intel_iommu_domain_alloc: domain_init() failed\n");
2378 intel_iommu_domain_exit(domain);
2383 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2385 int intel_iommu_context_mapping(
2386 struct dmar_domain *domain, struct pci_dev *pdev)
2389 rc = domain_context_mapping(domain, pdev);
2392 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2394 int intel_iommu_page_mapping(
2395 struct dmar_domain *domain, dma_addr_t iova,
2396 u64 hpa, size_t size, int prot)
2399 rc = domain_page_mapping(domain, iova, hpa, size, prot);
2402 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2404 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2406 detach_domain_for_dev(domain, bus, devfn);
2408 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2410 struct dmar_domain *
2411 intel_iommu_find_domain(struct pci_dev *pdev)
2413 return find_domain(pdev);
2415 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2417 int intel_iommu_found(void)
2419 return g_num_of_iommus;
2421 EXPORT_SYMBOL_GPL(intel_iommu_found);
2423 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2425 struct dma_pte *pte;
2429 pte = addr_to_dma_pte(domain, iova);
2432 pfn = dma_pte_addr(*pte);
2434 return pfn >> VTD_PAGE_SHIFT;
2436 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);