2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu is declared in this header on x86-64 */
39 #include <asm/cacheflush.h>
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
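/*
 * Example (for illustration): a VGA controller has pdev->class == 0x030000,
 * so (pdev->class >> 16) == 0x03 == PCI_BASE_CLASS_DISPLAY and
 * IS_GFX_DEVICE() evaluates true for it.
 */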
46 #define IOAPIC_RANGE_START (0xfee00000)
47 #define IOAPIC_RANGE_END (0xfeefffff)
48 #define IOVA_START_ADDR (0x1000)
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
52 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
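/*
 * Example: with the default 48-bit domain address width,
 * DOMAIN_MAX_ADDR(48) == ((u64)1 << 48) - 1 == 0x0000ffffffffffff.
 */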
57 static void flush_unmaps_timeout(unsigned long data);
59 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
61 #define HIGH_WATER_MARK 250
62 struct deferred_flush_tables {
64 struct iova *iova[HIGH_WATER_MARK];
65 struct dmar_domain *domain[HIGH_WATER_MARK];
68 static struct deferred_flush_tables *deferred_flush;
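/*
 * Deferred-unmap batching: rather than flushing the IOTLB on every unmap,
 * freed IOVAs are queued per IOMMU in deferred_flush[] and released in a
 * single global IOTLB flush, either once HIGH_WATER_MARK entries have
 * accumulated or when unmap_timer fires (a ~10ms timeout).
 * See add_unmap() and flush_unmaps() below.
 */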
70 /* number of intel_iommus; used to size and index the deferred flush tables */
71 static int g_num_of_iommus;
73 static DEFINE_SPINLOCK(async_umap_flush_lock);
74 static LIST_HEAD(unmaps_to_do);
77 static long list_size;
79 static void domain_remove_dev_info(struct dmar_domain *domain);
81 static int dmar_disabled;
82 static int __initdata dmar_map_gfx = 1;
83 static int dmar_forcedac;
84 static int intel_iommu_strict;
86 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
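/*
 * Devices that should bypass DMA remapping (e.g. those whose DMAR unit is
 * bypassed in init_no_remapping_devices()) get this sentinel in
 * dev.archdata.iommu; the DMA ops below check for it and fall back to
 * untranslated (1:1) handling.
 */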
87 static DEFINE_SPINLOCK(device_domain_lock);
88 static LIST_HEAD(device_domain_list);
90 static int __init intel_iommu_setup(char *str)
95 if (!strncmp(str, "off", 3)) {
97 printk(KERN_INFO"Intel-IOMMU: disabled\n");
98 } else if (!strncmp(str, "igfx_off", 8)) {
101 "Intel-IOMMU: disable GFX device mapping\n");
102 } else if (!strncmp(str, "forcedac", 8)) {
104 "Intel-IOMMU: Forcing DAC for PCI devices\n");
106 } else if (!strncmp(str, "strict", 6)) {
108 "Intel-IOMMU: disable batched IOTLB flush\n");
109 intel_iommu_strict = 1;
112 str += strcspn(str, ",");
118 __setup("intel_iommu=", intel_iommu_setup);
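/*
 * Example kernel command lines (illustration):
 *	intel_iommu=off			disable DMA remapping entirely
 *	intel_iommu=igfx_off		leave graphics devices unmapped
 *	intel_iommu=forcedac,strict	force DAC addressing and flush the
 *					IOTLB synchronously on every unmap
 * Options are comma separated, as parsed above.
 */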
120 static struct kmem_cache *iommu_domain_cache;
121 static struct kmem_cache *iommu_devinfo_cache;
122 static struct kmem_cache *iommu_iova_cache;
124 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
129 /* trying to avoid low memory issues */
130 flags = current->flags & PF_MEMALLOC;
131 current->flags |= PF_MEMALLOC;
132 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
133 current->flags &= (~PF_MEMALLOC | flags);
138 static inline void *alloc_pgtable_page(void)
143 /* trying to avoid low memory issues */
144 flags = current->flags & PF_MEMALLOC;
145 current->flags |= PF_MEMALLOC;
146 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
147 current->flags &= (~PF_MEMALLOC | flags);
151 static inline void free_pgtable_page(void *vaddr)
153 free_page((unsigned long)vaddr);
156 static inline void *alloc_domain_mem(void)
158 return iommu_kmem_cache_alloc(iommu_domain_cache);
161 static inline void free_domain_mem(void *vaddr)
163 kmem_cache_free(iommu_domain_cache, vaddr);
166 static inline void * alloc_devinfo_mem(void)
168 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
171 static inline void free_devinfo_mem(void *vaddr)
173 kmem_cache_free(iommu_devinfo_cache, vaddr);
176 struct iova *alloc_iova_mem(void)
178 return iommu_kmem_cache_alloc(iommu_iova_cache);
181 void free_iova_mem(struct iova *iova)
183 kmem_cache_free(iommu_iova_cache, iova);
186 static inline void __iommu_flush_cache(
187 struct intel_iommu *iommu, void *addr, int size)
189 if (!ecap_coherent(iommu->ecap))
190 clflush_cache_range(addr, size);
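/*
 * Note: when the IOMMU is not snoop-coherent (ecap coherency bit clear), CPU
 * writes to root/context entries and page tables must be flushed out of the
 * cache with clflush before the hardware walks them; that is why
 * __iommu_flush_cache() is called after every such update below.
 */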
193 /* Gets context entry for a given bus and devfn */
194 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
197 struct root_entry *root;
198 struct context_entry *context;
199 unsigned long phy_addr;
202 spin_lock_irqsave(&iommu->lock, flags);
203 root = &iommu->root_entry[bus];
204 context = get_context_addr_from_root(root);
206 context = (struct context_entry *)alloc_pgtable_page();
208 spin_unlock_irqrestore(&iommu->lock, flags);
211 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
212 phy_addr = virt_to_phys((void *)context);
213 set_root_value(root, phy_addr);
214 set_root_present(root);
215 __iommu_flush_cache(iommu, root, sizeof(*root));
217 spin_unlock_irqrestore(&iommu->lock, flags);
218 return &context[devfn];
221 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
223 struct root_entry *root;
224 struct context_entry *context;
228 spin_lock_irqsave(&iommu->lock, flags);
229 root = &iommu->root_entry[bus];
230 context = get_context_addr_from_root(root);
235 ret = context_present(context[devfn]);
237 spin_unlock_irqrestore(&iommu->lock, flags);
241 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
243 struct root_entry *root;
244 struct context_entry *context;
247 spin_lock_irqsave(&iommu->lock, flags);
248 root = &iommu->root_entry[bus];
249 context = get_context_addr_from_root(root);
251 context_clear_entry(context[devfn]);
252 __iommu_flush_cache(iommu, &context[devfn], \
255 spin_unlock_irqrestore(&iommu->lock, flags);
258 static void free_context_table(struct intel_iommu *iommu)
260 struct root_entry *root;
263 struct context_entry *context;
265 spin_lock_irqsave(&iommu->lock, flags);
266 if (!iommu->root_entry) {
269 for (i = 0; i < ROOT_ENTRY_NR; i++) {
270 root = &iommu->root_entry[i];
271 context = get_context_addr_from_root(root);
273 free_pgtable_page(context);
275 free_pgtable_page(iommu->root_entry);
276 iommu->root_entry = NULL;
278 spin_unlock_irqrestore(&iommu->lock, flags);
281 /* page table handling */
282 #define LEVEL_STRIDE (9)
283 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
285 static inline int agaw_to_level(int agaw)
290 static inline int agaw_to_width(int agaw)
292 return 30 + agaw * LEVEL_STRIDE;
296 static inline int width_to_agaw(int width)
298 return (width - 30) / LEVEL_STRIDE;
301 static inline unsigned int level_to_offset_bits(int level)
303 return (12 + (level - 1) * LEVEL_STRIDE);
306 static inline int address_level_offset(u64 addr, int level)
308 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
311 static inline u64 level_mask(int level)
313 return ((u64)-1 << level_to_offset_bits(level));
316 static inline u64 level_size(int level)
318 return ((u64)1 << level_to_offset_bits(level));
321 static inline u64 align_to_level(u64 addr, int level)
323 return ((addr + level_size(level) - 1) & level_mask(level));
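/*
 * Worked example: with 4K pages and a 9-bit stride, level n entries index
 * address bits [12 + 9*(n-1), 12 + 9*n - 1]: level 1 maps 4KB pages
 * (bits 12-20), level 2 maps 2MB (bits 21-29), level 3 maps 1GB (bits 30-38)
 * and level 4 maps 512GB (bits 39-47).  A 48-bit adjusted guest address
 * width therefore needs a 4-level table, and width_to_agaw(48) == 2.
 */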
326 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
328 int addr_width = agaw_to_width(domain->agaw);
329 struct dma_pte *parent, *pte = NULL;
330 int level = agaw_to_level(domain->agaw);
334 BUG_ON(!domain->pgd);
336 addr &= (((u64)1) << addr_width) - 1;
337 parent = domain->pgd;
339 spin_lock_irqsave(&domain->mapping_lock, flags);
343 offset = address_level_offset(addr, level);
344 pte = &parent[offset];
348 if (!dma_pte_present(*pte)) {
349 tmp_page = alloc_pgtable_page();
352 spin_unlock_irqrestore(&domain->mapping_lock,
356 __iommu_flush_cache(domain->iommu, tmp_page,
358 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
360 * higher level tables always set r/w; the last level page
361 * table controls read/write
363 dma_set_pte_readable(*pte);
364 dma_set_pte_writable(*pte);
365 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
367 parent = phys_to_virt(dma_pte_addr(*pte));
371 spin_unlock_irqrestore(&domain->mapping_lock, flags);
375 /* return address's pte at specific level */
376 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
379 struct dma_pte *parent, *pte = NULL;
380 int total = agaw_to_level(domain->agaw);
383 parent = domain->pgd;
384 while (level <= total) {
385 offset = address_level_offset(addr, total);
386 pte = &parent[offset];
390 if (!dma_pte_present(*pte))
392 parent = phys_to_virt(dma_pte_addr(*pte));
398 /* clear one page's page table */
399 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
401 struct dma_pte *pte = NULL;
403 /* get last level pte */
404 pte = dma_addr_level_pte(domain, addr, 1);
408 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
412 /* clear the last level pte; a TLB flush should follow */
413 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
415 int addr_width = agaw_to_width(domain->agaw);
417 start &= (((u64)1) << addr_width) - 1;
418 end &= (((u64)1) << addr_width) - 1;
419 /* in case it's a partial page */
420 start = PAGE_ALIGN_4K(start);
423 /* we don't need a lock here, nobody else touches the iova range */
424 while (start < end) {
425 dma_pte_clear_one(domain, start);
426 start += PAGE_SIZE_4K;
430 /* free page table pages. last level pte should already be cleared */
431 static void dma_pte_free_pagetable(struct dmar_domain *domain,
434 int addr_width = agaw_to_width(domain->agaw);
436 int total = agaw_to_level(domain->agaw);
440 start &= (((u64)1) << addr_width) - 1;
441 end &= (((u64)1) << addr_width) - 1;
443 /* we don't need a lock here, nobody else touches the iova range */
445 while (level <= total) {
446 tmp = align_to_level(start, level);
447 if (tmp >= end || (tmp + level_size(level) > end))
451 pte = dma_addr_level_pte(domain, tmp, level);
454 phys_to_virt(dma_pte_addr(*pte)));
456 __iommu_flush_cache(domain->iommu,
459 tmp += level_size(level);
464 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
465 free_pgtable_page(domain->pgd);
471 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
473 struct root_entry *root;
476 root = (struct root_entry *)alloc_pgtable_page();
480 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
482 spin_lock_irqsave(&iommu->lock, flags);
483 iommu->root_entry = root;
484 spin_unlock_irqrestore(&iommu->lock, flags);
489 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
491 cycles_t start_time = get_cycles();\
493 sts = op (iommu->reg + offset);\
496 if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
497 panic("DMAR hardware is malfunctioning\n");\
502 static void iommu_set_root_entry(struct intel_iommu *iommu)
508 addr = iommu->root_entry;
510 spin_lock_irqsave(&iommu->register_lock, flag);
511 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
513 cmd = iommu->gcmd | DMA_GCMD_SRTP;
514 writel(cmd, iommu->reg + DMAR_GCMD_REG);
516 /* Make sure hardware completes it */
517 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
518 readl, (sts & DMA_GSTS_RTPS), sts);
520 spin_unlock_irqrestore(&iommu->register_lock, flag);
523 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
528 if (!cap_rwbf(iommu->cap))
530 val = iommu->gcmd | DMA_GCMD_WBF;
532 spin_lock_irqsave(&iommu->register_lock, flag);
533 writel(val, iommu->reg + DMAR_GCMD_REG);
535 /* Make sure hardware completes it */
536 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
537 readl, (!(val & DMA_GSTS_WBFS)), val);
539 spin_unlock_irqrestore(&iommu->register_lock, flag);
542 /* return value determines whether we need a write buffer flush */
543 static int __iommu_flush_context(struct intel_iommu *iommu,
544 u16 did, u16 source_id, u8 function_mask, u64 type,
545 int non_present_entry_flush)
551 * In the non-present entry flush case, if hardware doesn't cache
552 * non-present entries we do nothing; if hardware does cache non-present
553 * entries, we flush the entries of domain 0 (domain id 0 is used to tag
554 * any non-present entries)
556 if (non_present_entry_flush) {
557 if (!cap_caching_mode(iommu->cap))
564 case DMA_CCMD_GLOBAL_INVL:
565 val = DMA_CCMD_GLOBAL_INVL;
567 case DMA_CCMD_DOMAIN_INVL:
568 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
570 case DMA_CCMD_DEVICE_INVL:
571 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
572 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
579 spin_lock_irqsave(&iommu->register_lock, flag);
580 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
582 /* Make sure hardware complete it */
583 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
584 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
586 spin_unlock_irqrestore(&iommu->register_lock, flag);
588 /* flushing the context entry will implicitly flush the write buffer */
592 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
593 int non_present_entry_flush)
595 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
596 non_present_entry_flush);
599 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
600 int non_present_entry_flush)
602 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
603 non_present_entry_flush);
606 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
607 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
609 return __iommu_flush_context(iommu, did, source_id, function_mask,
610 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
613 /* return value determines whether we need a write buffer flush */
614 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
615 u64 addr, unsigned int size_order, u64 type,
616 int non_present_entry_flush)
618 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
619 u64 val = 0, val_iva = 0;
623 * In the non-present entry flush case, if hardware doesn't cache
624 * non-present entries we do nothing; if hardware does cache non-present
625 * entries, we flush the entries of domain 0 (domain id 0 is used to tag
626 * any non-present entries)
628 if (non_present_entry_flush) {
629 if (!cap_caching_mode(iommu->cap))
636 case DMA_TLB_GLOBAL_FLUSH:
637 /* global flush doesn't need to set IVA_REG */
638 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
640 case DMA_TLB_DSI_FLUSH:
641 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
643 case DMA_TLB_PSI_FLUSH:
644 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
645 /* Note: always flush non-leaf currently */
646 val_iva = size_order | addr;
651 /* Note: set drain read/write */
654 * This is probably just being extra cautious; it looks like we could
655 * ignore it without any impact.
657 if (cap_read_drain(iommu->cap))
658 val |= DMA_TLB_READ_DRAIN;
660 if (cap_write_drain(iommu->cap))
661 val |= DMA_TLB_WRITE_DRAIN;
663 spin_lock_irqsave(&iommu->register_lock, flag);
664 /* Note: Only uses first TLB reg currently */
666 dmar_writeq(iommu->reg + tlb_offset, val_iva);
667 dmar_writeq(iommu->reg + tlb_offset + 8, val);
669 /* Make sure hardware completes it */
670 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
671 dmar_readq, (!(val & DMA_TLB_IVT)), val);
673 spin_unlock_irqrestore(&iommu->register_lock, flag);
675 /* check IOTLB invalidation granularity */
676 if (DMA_TLB_IAIG(val) == 0)
677 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
678 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
679 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
680 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
681 /* flushing the context entry will implicitly flush the write buffer */
685 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
686 int non_present_entry_flush)
688 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
689 non_present_entry_flush);
692 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
693 int non_present_entry_flush)
695 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
696 non_present_entry_flush);
699 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
700 u64 addr, unsigned int pages, int non_present_entry_flush)
704 BUG_ON(addr & (~PAGE_MASK_4K));
707 /* Fallback to domain selective flush if no PSI support */
708 if (!cap_pgsel_inv(iommu->cap))
709 return iommu_flush_iotlb_dsi(iommu, did,
710 non_present_entry_flush);
713 * PSI requires the number of pages to be a power of two (2^x), with the
714 * base address naturally aligned to that size
716 mask = ilog2(__roundup_pow_of_two(pages));
717 /* Fallback to domain selective flush if size is too big */
718 if (mask > cap_max_amask_val(iommu->cap))
719 return iommu_flush_iotlb_dsi(iommu, did,
720 non_present_entry_flush);
722 return __iommu_flush_iotlb(iommu, did, addr, mask,
723 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
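/*
 * Example: a request to flush 5 pages is rounded up to 8, so mask ==
 * ilog2(8) == 3 and the hardware invalidates a naturally aligned 8-page
 * (32KB with 4K pages) region containing addr.
 */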
726 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
731 spin_lock_irqsave(&iommu->register_lock, flags);
732 pmen = readl(iommu->reg + DMAR_PMEN_REG);
733 pmen &= ~DMA_PMEN_EPM;
734 writel(pmen, iommu->reg + DMAR_PMEN_REG);
736 /* wait for the protected region status bit to clear */
737 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
738 readl, !(pmen & DMA_PMEN_PRS), pmen);
740 spin_unlock_irqrestore(&iommu->register_lock, flags);
743 static int iommu_enable_translation(struct intel_iommu *iommu)
748 spin_lock_irqsave(&iommu->register_lock, flags);
749 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
751 /* Make sure hardware completes it */
752 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
753 readl, (sts & DMA_GSTS_TES), sts);
755 iommu->gcmd |= DMA_GCMD_TE;
756 spin_unlock_irqrestore(&iommu->register_lock, flags);
760 static int iommu_disable_translation(struct intel_iommu *iommu)
765 spin_lock_irqsave(&iommu->register_lock, flag);
766 iommu->gcmd &= ~DMA_GCMD_TE;
767 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
769 /* Make sure hardware completes it */
770 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
771 readl, (!(sts & DMA_GSTS_TES)), sts);
773 spin_unlock_irqrestore(&iommu->register_lock, flag);
777 /* iommu interrupt handling. Most of it is MSI-like. */
779 static const char *fault_reason_strings[] =
782 "Present bit in root entry is clear",
783 "Present bit in context entry is clear",
784 "Invalid context entry",
785 "Access beyond MGAW",
786 "PTE Write access is not set",
787 "PTE Read access is not set",
788 "Next page table ptr is invalid",
789 "Root table address invalid",
790 "Context table ptr is invalid",
791 "non-zero reserved fields in RTP",
792 "non-zero reserved fields in CTP",
793 "non-zero reserved fields in PTE",
795 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
797 const char *dmar_get_fault_reason(u8 fault_reason)
799 if (fault_reason > MAX_FAULT_REASON_IDX)
802 return fault_reason_strings[fault_reason];
805 void dmar_msi_unmask(unsigned int irq)
807 struct intel_iommu *iommu = get_irq_data(irq);
811 spin_lock_irqsave(&iommu->register_lock, flag);
812 writel(0, iommu->reg + DMAR_FECTL_REG);
813 /* Read a reg to force-flush the posted write */
814 readl(iommu->reg + DMAR_FECTL_REG);
815 spin_unlock_irqrestore(&iommu->register_lock, flag);
818 void dmar_msi_mask(unsigned int irq)
821 struct intel_iommu *iommu = get_irq_data(irq);
824 spin_lock_irqsave(&iommu->register_lock, flag);
825 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
826 /* Read a reg to force-flush the posted write */
827 readl(iommu->reg + DMAR_FECTL_REG);
828 spin_unlock_irqrestore(&iommu->register_lock, flag);
831 void dmar_msi_write(int irq, struct msi_msg *msg)
833 struct intel_iommu *iommu = get_irq_data(irq);
836 spin_lock_irqsave(&iommu->register_lock, flag);
837 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
838 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
839 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
840 spin_unlock_irqrestore(&iommu->register_lock, flag);
843 void dmar_msi_read(int irq, struct msi_msg *msg)
845 struct intel_iommu *iommu = get_irq_data(irq);
848 spin_lock_irqsave(&iommu->register_lock, flag);
849 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
850 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
851 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
852 spin_unlock_irqrestore(&iommu->register_lock, flag);
855 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
856 u8 fault_reason, u16 source_id, u64 addr)
860 reason = dmar_get_fault_reason(fault_reason);
863 "DMAR:[%s] Request device [%02x:%02x.%d] "
865 "DMAR:[fault reason %02d] %s\n",
866 (type ? "DMA Read" : "DMA Write"),
867 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
868 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
872 #define PRIMARY_FAULT_REG_LEN (16)
873 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
875 struct intel_iommu *iommu = dev_id;
876 int reg, fault_index;
880 spin_lock_irqsave(&iommu->register_lock, flag);
881 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
883 /* TBD: ignore advanced fault log currently */
884 if (!(fault_status & DMA_FSTS_PPF))
887 fault_index = dma_fsts_fault_record_index(fault_status);
888 reg = cap_fault_reg_offset(iommu->cap);
896 /* highest 32 bits */
897 data = readl(iommu->reg + reg +
898 fault_index * PRIMARY_FAULT_REG_LEN + 12);
899 if (!(data & DMA_FRCD_F))
902 fault_reason = dma_frcd_fault_reason(data);
903 type = dma_frcd_type(data);
905 data = readl(iommu->reg + reg +
906 fault_index * PRIMARY_FAULT_REG_LEN + 8);
907 source_id = dma_frcd_source_id(data);
909 guest_addr = dmar_readq(iommu->reg + reg +
910 fault_index * PRIMARY_FAULT_REG_LEN);
911 guest_addr = dma_frcd_page_addr(guest_addr);
912 /* clear the fault */
913 writel(DMA_FRCD_F, iommu->reg + reg +
914 fault_index * PRIMARY_FAULT_REG_LEN + 12);
916 spin_unlock_irqrestore(&iommu->register_lock, flag);
918 iommu_page_fault_do_one(iommu, type, fault_reason,
919 source_id, guest_addr);
922 if (fault_index > cap_num_fault_regs(iommu->cap))
924 spin_lock_irqsave(&iommu->register_lock, flag);
927 /* clear primary fault overflow */
928 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
929 if (fault_status & DMA_FSTS_PFO)
930 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
932 spin_unlock_irqrestore(&iommu->register_lock, flag);
936 int dmar_set_interrupt(struct intel_iommu *iommu)
942 printk(KERN_ERR "IOMMU: no free vectors\n");
946 set_irq_data(irq, iommu);
949 ret = arch_setup_dmar_msi(irq);
951 set_irq_data(irq, NULL);
957 /* Make sure any pending faults are processed and cleared */
958 iommu_page_fault(irq, iommu);
960 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
962 printk(KERN_ERR "IOMMU: can't request irq\n");
966 static int iommu_init_domains(struct intel_iommu *iommu)
968 unsigned long ndomains;
969 unsigned long nlongs;
971 ndomains = cap_ndoms(iommu->cap);
972 pr_debug("Number of Domains supported <%ld>\n", ndomains);
973 nlongs = BITS_TO_LONGS(ndomains);
975 /* TBD: there might be 64K domains,
976 * consider other allocation schemes for future chips
978 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
979 if (!iommu->domain_ids) {
980 printk(KERN_ERR "Allocating domain id array failed\n");
983 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
985 if (!iommu->domains) {
986 printk(KERN_ERR "Allocating domain array failed\n");
987 kfree(iommu->domain_ids);
991 spin_lock_init(&iommu->lock);
994 * If caching mode is set, invalid translations are tagged
995 * with domain id 0. Hence we need to pre-allocate it.
997 if (cap_caching_mode(iommu->cap))
998 set_bit(0, iommu->domain_ids);
1003 static void domain_exit(struct dmar_domain *domain);
1005 void free_dmar_iommu(struct intel_iommu *iommu)
1007 struct dmar_domain *domain;
1010 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1011 for (; i < cap_ndoms(iommu->cap); ) {
1012 domain = iommu->domains[i];
1013 clear_bit(i, iommu->domain_ids);
1014 domain_exit(domain);
1015 i = find_next_bit(iommu->domain_ids,
1016 cap_ndoms(iommu->cap), i+1);
1019 if (iommu->gcmd & DMA_GCMD_TE)
1020 iommu_disable_translation(iommu);
1023 set_irq_data(iommu->irq, NULL);
1024 /* This will mask the irq */
1025 free_irq(iommu->irq, iommu);
1026 destroy_irq(iommu->irq);
1029 kfree(iommu->domains);
1030 kfree(iommu->domain_ids);
1032 /* free context mapping */
1033 free_context_table(iommu);
1036 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1039 unsigned long ndomains;
1040 struct dmar_domain *domain;
1041 unsigned long flags;
1043 domain = alloc_domain_mem();
1047 ndomains = cap_ndoms(iommu->cap);
1049 spin_lock_irqsave(&iommu->lock, flags);
1050 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1051 if (num >= ndomains) {
1052 spin_unlock_irqrestore(&iommu->lock, flags);
1053 free_domain_mem(domain);
1054 printk(KERN_ERR "IOMMU: no free domain ids\n");
1058 set_bit(num, iommu->domain_ids);
1060 domain->iommu = iommu;
1061 iommu->domains[num] = domain;
1062 spin_unlock_irqrestore(&iommu->lock, flags);
1067 static void iommu_free_domain(struct dmar_domain *domain)
1069 unsigned long flags;
1071 spin_lock_irqsave(&domain->iommu->lock, flags);
1072 clear_bit(domain->id, domain->iommu->domain_ids);
1073 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1076 static struct iova_domain reserved_iova_list;
1077 static struct lock_class_key reserved_alloc_key;
1078 static struct lock_class_key reserved_rbtree_key;
1080 static void dmar_init_reserved_ranges(void)
1082 struct pci_dev *pdev = NULL;
1087 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1089 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1090 &reserved_alloc_key);
1091 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1092 &reserved_rbtree_key);
1094 /* IOAPIC ranges shouldn't be accessed by DMA */
1095 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1096 IOVA_PFN(IOAPIC_RANGE_END));
1098 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1100 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1101 for_each_pci_dev(pdev) {
1104 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1105 r = &pdev->resource[i];
1106 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1109 addr &= PAGE_MASK_4K;
1110 size = r->end - addr;
1111 size = PAGE_ALIGN_4K(size);
1112 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1113 IOVA_PFN(size + addr) - 1);
1115 printk(KERN_ERR "Reserve iova failed\n");
1121 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1123 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1126 static inline int guestwidth_to_adjustwidth(int gaw)
1129 int r = (gaw - 12) % 9;
1140 static int domain_init(struct dmar_domain *domain, int guest_width)
1142 struct intel_iommu *iommu;
1143 int adjust_width, agaw;
1144 unsigned long sagaw;
1146 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1147 spin_lock_init(&domain->mapping_lock);
1149 domain_reserve_special_ranges(domain);
1151 /* calculate AGAW */
1152 iommu = domain->iommu;
1153 if (guest_width > cap_mgaw(iommu->cap))
1154 guest_width = cap_mgaw(iommu->cap);
1155 domain->gaw = guest_width;
1156 adjust_width = guestwidth_to_adjustwidth(guest_width);
1157 agaw = width_to_agaw(adjust_width);
1158 sagaw = cap_sagaw(iommu->cap);
1159 if (!test_bit(agaw, &sagaw)) {
1160 /* hardware doesn't support it, choose a bigger one */
1161 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1162 agaw = find_next_bit(&sagaw, 5, agaw);
1166 domain->agaw = agaw;
1167 INIT_LIST_HEAD(&domain->devices);
1169 /* always allocate the top pgd */
1170 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1173 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1177 static void domain_exit(struct dmar_domain *domain)
1181 /* Domain 0 is reserved, so don't process it */
1185 domain_remove_dev_info(domain);
1187 put_iova_domain(&domain->iovad);
1188 end = DOMAIN_MAX_ADDR(domain->gaw);
1189 end = end & PAGE_MASK_4K; /* page-align the end of the range */
1192 dma_pte_clear_range(domain, 0, end);
1194 /* free page tables */
1195 dma_pte_free_pagetable(domain, 0, end);
1197 iommu_free_domain(domain);
1198 free_domain_mem(domain);
1201 static int domain_context_mapping_one(struct dmar_domain *domain,
1204 struct context_entry *context;
1205 struct intel_iommu *iommu = domain->iommu;
1206 unsigned long flags;
1208 pr_debug("Set context mapping for %02x:%02x.%d\n",
1209 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1210 BUG_ON(!domain->pgd);
1211 context = device_to_context_entry(iommu, bus, devfn);
1214 spin_lock_irqsave(&iommu->lock, flags);
1215 if (context_present(*context)) {
1216 spin_unlock_irqrestore(&iommu->lock, flags);
1220 context_set_domain_id(*context, domain->id);
1221 context_set_address_width(*context, domain->agaw);
1222 context_set_address_root(*context, virt_to_phys(domain->pgd));
1223 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1224 context_set_fault_enable(*context);
1225 context_set_present(*context);
1226 __iommu_flush_cache(iommu, context, sizeof(*context));
1228 /* it's a non-present to present mapping */
1229 if (iommu_flush_context_device(iommu, domain->id,
1230 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1231 iommu_flush_write_buffer(iommu);
1233 iommu_flush_iotlb_dsi(iommu, 0, 0);
1234 spin_unlock_irqrestore(&iommu->lock, flags);
1239 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1242 struct pci_dev *tmp, *parent;
1244 ret = domain_context_mapping_one(domain, pdev->bus->number,
1249 /* dependent device mapping */
1250 tmp = pci_find_upstream_pcie_bridge(pdev);
1253 /* Secondary interface's bus number and devfn 0 */
1254 parent = pdev->bus->self;
1255 while (parent != tmp) {
1256 ret = domain_context_mapping_one(domain, parent->bus->number,
1260 parent = parent->bus->self;
1262 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1263 return domain_context_mapping_one(domain,
1264 tmp->subordinate->number, 0);
1265 else /* this is a legacy PCI bridge */
1266 return domain_context_mapping_one(domain,
1267 tmp->bus->number, tmp->devfn);
1270 static int domain_context_mapped(struct dmar_domain *domain,
1271 struct pci_dev *pdev)
1274 struct pci_dev *tmp, *parent;
1276 ret = device_context_mapped(domain->iommu,
1277 pdev->bus->number, pdev->devfn);
1280 /* dependent device mapping */
1281 tmp = pci_find_upstream_pcie_bridge(pdev);
1284 /* Secondary interface's bus number and devfn 0 */
1285 parent = pdev->bus->self;
1286 while (parent != tmp) {
1287 ret = device_context_mapped(domain->iommu, parent->bus->number,
1291 parent = parent->bus->self;
1294 return device_context_mapped(domain->iommu,
1295 tmp->subordinate->number, 0);
1297 return device_context_mapped(domain->iommu,
1298 tmp->bus->number, tmp->devfn);
1302 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1303 u64 hpa, size_t size, int prot)
1305 u64 start_pfn, end_pfn;
1306 struct dma_pte *pte;
1309 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1311 iova &= PAGE_MASK_4K;
1312 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1313 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1315 while (start_pfn < end_pfn) {
1316 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1319 /* We don't need a lock here; nobody else
1320 * touches the iova range
1322 BUG_ON(dma_pte_addr(*pte));
1323 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1324 dma_set_pte_prot(*pte, prot);
1325 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1332 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1334 clear_context_table(domain->iommu, bus, devfn);
1335 iommu_flush_context_global(domain->iommu, 0);
1336 iommu_flush_iotlb_global(domain->iommu, 0);
1339 static void domain_remove_dev_info(struct dmar_domain *domain)
1341 struct device_domain_info *info;
1342 unsigned long flags;
1344 spin_lock_irqsave(&device_domain_lock, flags);
1345 while (!list_empty(&domain->devices)) {
1346 info = list_entry(domain->devices.next,
1347 struct device_domain_info, link);
1348 list_del(&info->link);
1349 list_del(&info->global);
1351 info->dev->dev.archdata.iommu = NULL;
1352 spin_unlock_irqrestore(&device_domain_lock, flags);
1354 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1355 free_devinfo_mem(info);
1357 spin_lock_irqsave(&device_domain_lock, flags);
1359 spin_unlock_irqrestore(&device_domain_lock, flags);
1364 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1366 struct dmar_domain *
1367 find_domain(struct pci_dev *pdev)
1369 struct device_domain_info *info;
1371 /* No lock here; we assume no domain exits in the normal case */
1372 info = pdev->dev.archdata.iommu;
1374 return info->domain;
1378 /* domain is initialized */
1379 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1381 struct dmar_domain *domain, *found = NULL;
1382 struct intel_iommu *iommu;
1383 struct dmar_drhd_unit *drhd;
1384 struct device_domain_info *info, *tmp;
1385 struct pci_dev *dev_tmp;
1386 unsigned long flags;
1387 int bus = 0, devfn = 0;
1389 domain = find_domain(pdev);
1393 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1395 if (dev_tmp->is_pcie) {
1396 bus = dev_tmp->subordinate->number;
1399 bus = dev_tmp->bus->number;
1400 devfn = dev_tmp->devfn;
1402 spin_lock_irqsave(&device_domain_lock, flags);
1403 list_for_each_entry(info, &device_domain_list, global) {
1404 if (info->bus == bus && info->devfn == devfn) {
1405 found = info->domain;
1409 spin_unlock_irqrestore(&device_domain_lock, flags);
1410 /* pcie-pci bridge already has a domain, use it */
1417 /* Allocate new domain for the device */
1418 drhd = dmar_find_matched_drhd_unit(pdev);
1420 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1424 iommu = drhd->iommu;
1426 domain = iommu_alloc_domain(iommu);
1430 if (domain_init(domain, gaw)) {
1431 domain_exit(domain);
1435 /* register pcie-to-pci device */
1437 info = alloc_devinfo_mem();
1439 domain_exit(domain);
1443 info->devfn = devfn;
1445 info->domain = domain;
1446 /* This domain is shared by devices under the p2p bridge */
1447 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1449 /* pcie-to-pci bridge already has a domain, use it */
1451 spin_lock_irqsave(&device_domain_lock, flags);
1452 list_for_each_entry(tmp, &device_domain_list, global) {
1453 if (tmp->bus == bus && tmp->devfn == devfn) {
1454 found = tmp->domain;
1459 free_devinfo_mem(info);
1460 domain_exit(domain);
1463 list_add(&info->link, &domain->devices);
1464 list_add(&info->global, &device_domain_list);
1466 spin_unlock_irqrestore(&device_domain_lock, flags);
1470 info = alloc_devinfo_mem();
1473 info->bus = pdev->bus->number;
1474 info->devfn = pdev->devfn;
1476 info->domain = domain;
1477 spin_lock_irqsave(&device_domain_lock, flags);
1478 /* somebody else was faster and has already set up the domain */
1479 found = find_domain(pdev);
1480 if (found != NULL) {
1481 spin_unlock_irqrestore(&device_domain_lock, flags);
1482 if (found != domain) {
1483 domain_exit(domain);
1486 free_devinfo_mem(info);
1489 list_add(&info->link, &domain->devices);
1490 list_add(&info->global, &device_domain_list);
1491 pdev->dev.archdata.iommu = info;
1492 spin_unlock_irqrestore(&device_domain_lock, flags);
1495 /* recheck here; somebody else may have set it in the meantime */
1496 return find_domain(pdev);
1499 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1501 struct dmar_domain *domain;
1507 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1508 pci_name(pdev), start, end);
1509 /* page table init */
1510 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1514 /* The address might not be aligned */
1515 base = start & PAGE_MASK_4K;
1517 size = PAGE_ALIGN_4K(size);
1518 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1519 IOVA_PFN(base + size) - 1)) {
1520 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1525 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1526 size, base, pci_name(pdev));
1528 * RMRR ranges might overlap with physical memory, so clear any existing mapping first
1531 dma_pte_clear_range(domain, base, base + size);
1533 ret = domain_page_mapping(domain, base, base, size,
1534 DMA_PTE_READ|DMA_PTE_WRITE);
1538 /* context entry init */
1539 ret = domain_context_mapping(domain, pdev);
1543 domain_exit(domain);
1548 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1549 struct pci_dev *pdev)
1551 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1553 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1554 rmrr->end_address + 1);
1557 #ifdef CONFIG_DMAR_GFX_WA
1558 struct iommu_prepare_data {
1559 struct pci_dev *pdev;
1563 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1564 unsigned long end_pfn, void *datax)
1566 struct iommu_prepare_data *data;
1568 data = (struct iommu_prepare_data *)datax;
1570 data->ret = iommu_prepare_identity_map(data->pdev,
1571 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1576 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1579 struct iommu_prepare_data data;
1584 for_each_online_node(nid) {
1585 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1592 static void __init iommu_prepare_gfx_mapping(void)
1594 struct pci_dev *pdev = NULL;
1597 for_each_pci_dev(pdev) {
1598 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1599 !IS_GFX_DEVICE(pdev))
1601 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1603 ret = iommu_prepare_with_active_regions(pdev);
1605 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1610 #ifdef CONFIG_DMAR_FLOPPY_WA
1611 static inline void iommu_prepare_isa(void)
1613 struct pci_dev *pdev;
1616 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1620 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1621 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1624 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1625 "floppy might not work\n");
1629 static inline void iommu_prepare_isa(void)
1633 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1635 int __init init_dmars(void)
1637 struct dmar_drhd_unit *drhd;
1638 struct dmar_rmrr_unit *rmrr;
1639 struct pci_dev *pdev;
1640 struct intel_iommu *iommu;
1641 int i, ret, unit = 0;
1646 * initialize and program root entry to not present
1649 for_each_drhd_unit(drhd) {
1652 * lock not needed as this is only incremented in the single-
1653 * threaded kernel __init code path; all other accesses are reads
1658 deferred_flush = kzalloc(g_num_of_iommus *
1659 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1660 if (!deferred_flush) {
1665 for_each_drhd_unit(drhd) {
1668 iommu = alloc_iommu(drhd);
1674 ret = iommu_init_domains(iommu);
1680 * we could share the same root & context tables
1681 * among all IOMMUs. Needs to be split out later.
1683 ret = iommu_alloc_root_entry(iommu);
1685 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1692 * for each dev attached to rmrr
1694 * locate drhd for dev, alloc domain for dev
1695 * allocate free domain
1696 * allocate page table entries for rmrr
1697 * if context not allocated for bus
1698 * allocate and init context
1699 * set present in root table for this bus
1700 * init context with domain, translation etc
1704 for_each_rmrr_units(rmrr) {
1705 for (i = 0; i < rmrr->devices_cnt; i++) {
1706 pdev = rmrr->devices[i];
1707 /* some BIOSes list non-existent devices in the DMAR table */
1710 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1713 "IOMMU: mapping reserved region failed\n");
1717 iommu_prepare_gfx_mapping();
1719 iommu_prepare_isa();
1724 * global invalidate context cache
1725 * global invalidate iotlb
1726 * enable translation
1728 for_each_drhd_unit(drhd) {
1731 iommu = drhd->iommu;
1732 sprintf (iommu->name, "dmar%d", unit++);
1734 iommu_flush_write_buffer(iommu);
1736 ret = dmar_set_interrupt(iommu);
1740 iommu_set_root_entry(iommu);
1742 iommu_flush_context_global(iommu, 0);
1743 iommu_flush_iotlb_global(iommu, 0);
1745 iommu_disable_protect_mem_regions(iommu);
1747 ret = iommu_enable_translation(iommu);
1754 for_each_drhd_unit(drhd) {
1757 iommu = drhd->iommu;
1763 static inline u64 aligned_size(u64 host_addr, size_t size)
1766 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1767 return PAGE_ALIGN_4K(addr);
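/*
 * Example (assuming the usual 4K PAGE_MASK_4K): aligned_size(0x1ff0, 0x20)
 * == PAGE_ALIGN_4K(0xff0 + 0x20) == 0x2000, i.e. a 32-byte buffer that
 * straddles a page boundary needs two 4K pages of IOVA space.
 */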
1771 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1775 /* Make sure it's in range */
1776 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1777 if (!size || (IOVA_START_ADDR + size > end))
1780 piova = alloc_iova(&domain->iovad,
1781 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1785 static struct iova *
1786 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1789 struct pci_dev *pdev = to_pci_dev(dev);
1790 struct iova *iova = NULL;
1792 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1793 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1796 * First try to allocate an I/O virtual address below DMA_32BIT_MASK;
1797 * if that fails, fall back to the device's full dma_mask
1800 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1802 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1806 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1813 static struct dmar_domain *
1814 get_valid_domain_for_dev(struct pci_dev *pdev)
1816 struct dmar_domain *domain;
1819 domain = get_domain_for_dev(pdev,
1820 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1823 "Allocating domain for %s failed", pci_name(pdev));
1827 /* make sure context mapping is ok */
1828 if (unlikely(!domain_context_mapped(domain, pdev))) {
1829 ret = domain_context_mapping(domain, pdev);
1832 "Domain context map for %s failed",
1842 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1844 struct pci_dev *pdev = to_pci_dev(hwdev);
1845 struct dmar_domain *domain;
1846 unsigned long start_paddr;
1851 BUG_ON(dir == DMA_NONE);
1852 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1855 domain = get_valid_domain_for_dev(pdev);
1859 size = aligned_size((u64)paddr, size);
1861 iova = __intel_alloc_iova(hwdev, domain, size);
1865 start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1868 * Check if DMAR supports zero-length reads on write-only mappings
1871 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1872 !cap_zlr(domain->iommu->cap))
1873 prot |= DMA_PTE_READ;
1874 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1875 prot |= DMA_PTE_WRITE;
1877 * paddr .. (paddr + size) might span partial pages, so we map whole
1878 * pages. Note: if two parts of one page are mapped separately, we may
1879 * end up with two guest addresses mapping to the same host paddr, but
1880 * this is not a big problem
1882 ret = domain_page_mapping(domain, start_paddr,
1883 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1887 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1888 pci_name(pdev), size, (u64)paddr,
1889 size, (u64)start_paddr, dir);
1891 /* it's a non-present to present mapping */
1892 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1893 start_paddr, size >> PAGE_SHIFT_4K, 1);
1895 iommu_flush_write_buffer(domain->iommu);
1897 return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1901 __free_iova(&domain->iovad, iova);
1902 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1903 pci_name(pdev), size, (u64)paddr, dir);
1907 static void flush_unmaps(void)
1913 /* just flush them all */
1914 for (i = 0; i < g_num_of_iommus; i++) {
1915 if (deferred_flush[i].next) {
1916 struct intel_iommu *iommu =
1917 deferred_flush[i].domain[0]->iommu;
1919 iommu_flush_iotlb_global(iommu, 0);
1920 for (j = 0; j < deferred_flush[i].next; j++) {
1921 __free_iova(&deferred_flush[i].domain[j]->iovad,
1922 deferred_flush[i].iova[j]);
1924 deferred_flush[i].next = 0;
1931 static void flush_unmaps_timeout(unsigned long data)
1933 unsigned long flags;
1935 spin_lock_irqsave(&async_umap_flush_lock, flags);
1937 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1940 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1942 unsigned long flags;
1945 spin_lock_irqsave(&async_umap_flush_lock, flags);
1946 if (list_size == HIGH_WATER_MARK)
1949 iommu_id = dom->iommu->seq_id;
1951 next = deferred_flush[iommu_id].next;
1952 deferred_flush[iommu_id].domain[next] = dom;
1953 deferred_flush[iommu_id].iova[next] = iova;
1954 deferred_flush[iommu_id].next++;
1957 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1961 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1964 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1965 size_t size, int dir)
1967 struct pci_dev *pdev = to_pci_dev(dev);
1968 struct dmar_domain *domain;
1969 unsigned long start_addr;
1972 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1974 domain = find_domain(pdev);
1977 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1981 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1982 size = aligned_size((u64)dev_addr, size);
1984 pr_debug("Device %s unmapping: %lx@%llx\n",
1985 pci_name(pdev), size, (u64)start_addr);
1987 /* clear the whole page */
1988 dma_pte_clear_range(domain, start_addr, start_addr + size);
1989 /* free page tables */
1990 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1991 if (intel_iommu_strict) {
1992 if (iommu_flush_iotlb_psi(domain->iommu,
1993 domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
1994 iommu_flush_write_buffer(domain->iommu);
1996 __free_iova(&domain->iovad, iova);
1998 add_unmap(domain, iova);
2000 * queue up the release of the unmap to save the roughly 1/6th of the
2001 * CPU time otherwise used up by the iotlb flush operation...
2006 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
2007 dma_addr_t *dma_handle, gfp_t flags)
2012 size = PAGE_ALIGN_4K(size);
2013 order = get_order(size);
2014 flags &= ~(GFP_DMA | GFP_DMA32);
2016 vaddr = (void *)__get_free_pages(flags, order);
2019 memset(vaddr, 0, size);
2021 *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
2024 free_pages((unsigned long)vaddr, order);
2028 static void intel_free_coherent(struct device *hwdev, size_t size,
2029 void *vaddr, dma_addr_t dma_handle)
2033 size = PAGE_ALIGN_4K(size);
2034 order = get_order(size);
2036 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2037 free_pages((unsigned long)vaddr, order);
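/*
 * Illustration (not part of this driver): a PCI driver reaches these hooks
 * through the generic DMA API once intel_dma_ops is installed, e.g.
 *
 *	dma_addr_t handle;
 *	void *buf = dma_alloc_coherent(&pdev->dev, 4096, &handle, GFP_KERNEL);
 *	...
 *	dma_free_coherent(&pdev->dev, 4096, buf, handle);
 *
 * where pdev is the caller's own struct pci_dev.
 */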
2040 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2041 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2042 int nelems, int dir)
2045 struct pci_dev *pdev = to_pci_dev(hwdev);
2046 struct dmar_domain *domain;
2047 unsigned long start_addr;
2051 struct scatterlist *sg;
2053 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2056 domain = find_domain(pdev);
2058 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2061 for_each_sg(sglist, sg, nelems, i) {
2062 addr = SG_ENT_VIRT_ADDRESS(sg);
2063 size += aligned_size((u64)addr, sg->length);
2066 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2068 /* clear the whole page */
2069 dma_pte_clear_range(domain, start_addr, start_addr + size);
2070 /* free page tables */
2071 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2073 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2074 size >> PAGE_SHIFT_4K, 0))
2075 iommu_flush_write_buffer(domain->iommu);
2078 __free_iova(&domain->iovad, iova);
2081 static int intel_nontranslate_map_sg(struct device *hddev,
2082 struct scatterlist *sglist, int nelems, int dir)
2085 struct scatterlist *sg;
2087 for_each_sg(sglist, sg, nelems, i) {
2088 BUG_ON(!sg_page(sg));
2089 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2090 sg->dma_length = sg->length;
2095 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2096 int nelems, int dir)
2100 struct pci_dev *pdev = to_pci_dev(hwdev);
2101 struct dmar_domain *domain;
2105 struct iova *iova = NULL;
2107 struct scatterlist *sg;
2108 unsigned long start_addr;
2110 BUG_ON(dir == DMA_NONE);
2111 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2112 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2114 domain = get_valid_domain_for_dev(pdev);
2118 for_each_sg(sglist, sg, nelems, i) {
2119 addr = SG_ENT_VIRT_ADDRESS(sg);
2120 addr = (void *)virt_to_phys(addr);
2121 size += aligned_size((u64)addr, sg->length);
2124 iova = __intel_alloc_iova(hwdev, domain, size);
2126 sglist->dma_length = 0;
2131 * Check if DMAR supports zero-length reads on write-only mappings
2134 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2135 !cap_zlr(domain->iommu->cap))
2136 prot |= DMA_PTE_READ;
2137 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2138 prot |= DMA_PTE_WRITE;
2140 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2142 for_each_sg(sglist, sg, nelems, i) {
2143 addr = SG_ENT_VIRT_ADDRESS(sg);
2144 addr = (void *)virt_to_phys(addr);
2145 size = aligned_size((u64)addr, sg->length);
2146 ret = domain_page_mapping(domain, start_addr + offset,
2147 ((u64)addr) & PAGE_MASK_4K,
2150 /* clear the page */
2151 dma_pte_clear_range(domain, start_addr,
2152 start_addr + offset);
2153 /* free page tables */
2154 dma_pte_free_pagetable(domain, start_addr,
2155 start_addr + offset);
2157 __free_iova(&domain->iovad, iova);
2160 sg->dma_address = start_addr + offset +
2161 ((u64)addr & (~PAGE_MASK_4K));
2162 sg->dma_length = sg->length;
2166 /* it's a non-present to present mapping */
2167 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2168 start_addr, offset >> PAGE_SHIFT_4K, 1))
2169 iommu_flush_write_buffer(domain->iommu);
2173 static struct dma_mapping_ops intel_dma_ops = {
2174 .alloc_coherent = intel_alloc_coherent,
2175 .free_coherent = intel_free_coherent,
2176 .map_single = intel_map_single,
2177 .unmap_single = intel_unmap_single,
2178 .map_sg = intel_map_sg,
2179 .unmap_sg = intel_unmap_sg,
2182 static inline int iommu_domain_cache_init(void)
2186 iommu_domain_cache = kmem_cache_create("iommu_domain",
2187 sizeof(struct dmar_domain),
2192 if (!iommu_domain_cache) {
2193 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2200 static inline int iommu_devinfo_cache_init(void)
2204 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2205 sizeof(struct device_domain_info),
2210 if (!iommu_devinfo_cache) {
2211 printk(KERN_ERR "Couldn't create devinfo cache\n");
2218 static inline int iommu_iova_cache_init(void)
2222 iommu_iova_cache = kmem_cache_create("iommu_iova",
2223 sizeof(struct iova),
2228 if (!iommu_iova_cache) {
2229 printk(KERN_ERR "Couldn't create iova cache\n");
2236 static int __init iommu_init_mempool(void)
2239 ret = iommu_iova_cache_init();
2243 ret = iommu_domain_cache_init();
2247 ret = iommu_devinfo_cache_init();
2251 kmem_cache_destroy(iommu_domain_cache);
2253 kmem_cache_destroy(iommu_iova_cache);
2258 static void __init iommu_exit_mempool(void)
2260 kmem_cache_destroy(iommu_devinfo_cache);
2261 kmem_cache_destroy(iommu_domain_cache);
2262 kmem_cache_destroy(iommu_iova_cache);
2266 void __init detect_intel_iommu(void)
2268 if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2270 if (early_dmar_detect()) {
2275 static void __init init_no_remapping_devices(void)
2277 struct dmar_drhd_unit *drhd;
2279 for_each_drhd_unit(drhd) {
2280 if (!drhd->include_all) {
2282 for (i = 0; i < drhd->devices_cnt; i++)
2283 if (drhd->devices[i] != NULL)
2285 /* ignore DMAR unit if no pci devices exist */
2286 if (i == drhd->devices_cnt)
2294 for_each_drhd_unit(drhd) {
2296 if (drhd->ignored || drhd->include_all)
2299 for (i = 0; i < drhd->devices_cnt; i++)
2300 if (drhd->devices[i] &&
2301 !IS_GFX_DEVICE(drhd->devices[i]))
2304 if (i < drhd->devices_cnt)
2307 /* bypass IOMMU if it is just for gfx devices */
2309 for (i = 0; i < drhd->devices_cnt; i++) {
2310 if (!drhd->devices[i])
2312 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2317 int __init intel_iommu_init(void)
2321 if (no_iommu || swiotlb || dmar_disabled)
2324 if (dmar_table_init())
2327 iommu_init_mempool();
2328 dmar_init_reserved_ranges();
2330 init_no_remapping_devices();
2334 printk(KERN_ERR "IOMMU: dmar init failed\n");
2335 put_iova_domain(&reserved_iova_list);
2336 iommu_exit_mempool();
2340 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2342 init_timer(&unmap_timer);
2344 dma_ops = &intel_dma_ops;