2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
39 #include <asm/cacheflush.h>
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
46 #define IOAPIC_RANGE_START (0xfee00000)
47 #define IOAPIC_RANGE_END (0xfeefffff)
48 #define IOVA_START_ADDR (0x1000)
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
52 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
57 static void flush_unmaps_timeout(unsigned long data);
59 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
61 #define HIGH_WATER_MARK 250
62 struct deferred_flush_tables {
64 struct iova *iova[HIGH_WATER_MARK];
65 struct dmar_domain *domain[HIGH_WATER_MARK];
68 static struct deferred_flush_tables *deferred_flush;
70 /* bitmap for indexing intel_iommus */
71 static int g_num_of_iommus;
73 static DEFINE_SPINLOCK(async_umap_flush_lock);
74 static LIST_HEAD(unmaps_to_do);
77 static long list_size;
79 static void domain_remove_dev_info(struct dmar_domain *domain);
81 static int dmar_disabled;
82 static int __initdata dmar_map_gfx = 1;
83 static int dmar_forcedac;
84 static int intel_iommu_strict;
86 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
87 static DEFINE_SPINLOCK(device_domain_lock);
88 static LIST_HEAD(device_domain_list);
90 static int __init intel_iommu_setup(char *str)
95 if (!strncmp(str, "off", 3)) {
97 printk(KERN_INFO"Intel-IOMMU: disabled\n");
98 } else if (!strncmp(str, "igfx_off", 8)) {
101 "Intel-IOMMU: disable GFX device mapping\n");
102 } else if (!strncmp(str, "forcedac", 8)) {
104 "Intel-IOMMU: Forcing DAC for PCI devices\n");
106 } else if (!strncmp(str, "strict", 6)) {
108 "Intel-IOMMU: disable batched IOTLB flush\n");
109 intel_iommu_strict = 1;
112 str += strcspn(str, ",");
118 __setup("intel_iommu=", intel_iommu_setup);
120 static struct kmem_cache *iommu_domain_cache;
121 static struct kmem_cache *iommu_devinfo_cache;
122 static struct kmem_cache *iommu_iova_cache;
124 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
129 /* trying to avoid low memory issues */
130 flags = current->flags & PF_MEMALLOC;
131 current->flags |= PF_MEMALLOC;
132 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
133 current->flags &= (~PF_MEMALLOC | flags);
138 static inline void *alloc_pgtable_page(void)
143 /* trying to avoid low memory issues */
144 flags = current->flags & PF_MEMALLOC;
145 current->flags |= PF_MEMALLOC;
146 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
147 current->flags &= (~PF_MEMALLOC | flags);
/* Free a page previously obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
156 static inline void *alloc_domain_mem(void)
158 return iommu_kmem_cache_alloc(iommu_domain_cache);
161 static inline void free_domain_mem(void *vaddr)
163 kmem_cache_free(iommu_domain_cache, vaddr);
166 static inline void * alloc_devinfo_mem(void)
168 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
171 static inline void free_devinfo_mem(void *vaddr)
173 kmem_cache_free(iommu_devinfo_cache, vaddr);
176 struct iova *alloc_iova_mem(void)
178 return iommu_kmem_cache_alloc(iommu_iova_cache);
181 void free_iova_mem(struct iova *iova)
183 kmem_cache_free(iommu_iova_cache, iova);
186 static inline void __iommu_flush_cache(
187 struct intel_iommu *iommu, void *addr, int size)
189 if (!ecap_coherent(iommu->ecap))
190 clflush_cache_range(addr, size);
193 /* Gets context entry for a given bus and devfn */
194 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
197 struct root_entry *root;
198 struct context_entry *context;
199 unsigned long phy_addr;
202 spin_lock_irqsave(&iommu->lock, flags);
203 root = &iommu->root_entry[bus];
204 context = get_context_addr_from_root(root);
206 context = (struct context_entry *)alloc_pgtable_page();
208 spin_unlock_irqrestore(&iommu->lock, flags);
211 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
212 phy_addr = virt_to_phys((void *)context);
213 set_root_value(root, phy_addr);
214 set_root_present(root);
215 __iommu_flush_cache(iommu, root, sizeof(*root));
217 spin_unlock_irqrestore(&iommu->lock, flags);
218 return &context[devfn];
221 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
223 struct root_entry *root;
224 struct context_entry *context;
228 spin_lock_irqsave(&iommu->lock, flags);
229 root = &iommu->root_entry[bus];
230 context = get_context_addr_from_root(root);
235 ret = context_present(context[devfn]);
237 spin_unlock_irqrestore(&iommu->lock, flags);
241 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
243 struct root_entry *root;
244 struct context_entry *context;
247 spin_lock_irqsave(&iommu->lock, flags);
248 root = &iommu->root_entry[bus];
249 context = get_context_addr_from_root(root);
251 context_clear_entry(context[devfn]);
252 __iommu_flush_cache(iommu, &context[devfn], \
255 spin_unlock_irqrestore(&iommu->lock, flags);
258 static void free_context_table(struct intel_iommu *iommu)
260 struct root_entry *root;
263 struct context_entry *context;
265 spin_lock_irqsave(&iommu->lock, flags);
266 if (!iommu->root_entry) {
269 for (i = 0; i < ROOT_ENTRY_NR; i++) {
270 root = &iommu->root_entry[i];
271 context = get_context_addr_from_root(root);
273 free_pgtable_page(context);
275 free_pgtable_page(iommu->root_entry);
276 iommu->root_entry = NULL;
278 spin_unlock_irqrestore(&iommu->lock, flags);
281 /* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * Adjusted guest address width (AGAW) to page-table depth:
 * AGAW 1 => 3-level (39-bit), AGAW 2 => 4-level (48-bit), ...
 */
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

/*
 * Address width in bits covered by a table of the given AGAW:
 * 30 bits baseline plus one 9-bit stride per additional level.
 */
static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

/*
 * Inverse of agaw_to_width(); integer division truncates, so the
 * caller must check the result against the hardware SAGAW bitmap.
 */
static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

/*
 * Bit position where a level's index field starts in an address;
 * level 1 is the leaf (bits 12..20), each level above adds 9 bits.
 */
static inline unsigned int level_to_offset_bits(int level)
{
	return (12 + (level - 1) * LEVEL_STRIDE);
}
306 static inline int address_level_offset(u64 addr, int level)
308 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
311 static inline u64 level_mask(int level)
313 return ((u64)-1 << level_to_offset_bits(level));
316 static inline u64 level_size(int level)
318 return ((u64)1 << level_to_offset_bits(level));
321 static inline u64 align_to_level(u64 addr, int level)
323 return ((addr + level_size(level) - 1) & level_mask(level));
326 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
328 int addr_width = agaw_to_width(domain->agaw);
329 struct dma_pte *parent, *pte = NULL;
330 int level = agaw_to_level(domain->agaw);
334 BUG_ON(!domain->pgd);
336 addr &= (((u64)1) << addr_width) - 1;
337 parent = domain->pgd;
339 spin_lock_irqsave(&domain->mapping_lock, flags);
343 offset = address_level_offset(addr, level);
344 pte = &parent[offset];
348 if (!dma_pte_present(*pte)) {
349 tmp_page = alloc_pgtable_page();
352 spin_unlock_irqrestore(&domain->mapping_lock,
356 __iommu_flush_cache(domain->iommu, tmp_page,
358 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
360 * high level table always sets r/w, last level page
361 * table control read/write
363 dma_set_pte_readable(*pte);
364 dma_set_pte_writable(*pte);
365 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
367 parent = phys_to_virt(dma_pte_addr(*pte));
371 spin_unlock_irqrestore(&domain->mapping_lock, flags);
375 /* return address's pte at specific level */
376 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
379 struct dma_pte *parent, *pte = NULL;
380 int total = agaw_to_level(domain->agaw);
383 parent = domain->pgd;
384 while (level <= total) {
385 offset = address_level_offset(addr, total);
386 pte = &parent[offset];
390 if (!dma_pte_present(*pte))
392 parent = phys_to_virt(dma_pte_addr(*pte));
398 /* clear one page's page table */
399 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
401 struct dma_pte *pte = NULL;
403 /* get last level pte */
404 pte = dma_addr_level_pte(domain, addr, 1);
408 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
412 /* clear last level pte, a tlb flush should be followed */
413 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
415 int addr_width = agaw_to_width(domain->agaw);
417 start &= (((u64)1) << addr_width) - 1;
418 end &= (((u64)1) << addr_width) - 1;
419 /* in case it's partial page */
420 start = PAGE_ALIGN_4K(start);
423 /* we don't need lock here, nobody else touches the iova range */
424 while (start < end) {
425 dma_pte_clear_one(domain, start);
426 start += PAGE_SIZE_4K;
430 /* free page table pages. last level pte should already be cleared */
431 static void dma_pte_free_pagetable(struct dmar_domain *domain,
434 int addr_width = agaw_to_width(domain->agaw);
436 int total = agaw_to_level(domain->agaw);
440 start &= (((u64)1) << addr_width) - 1;
441 end &= (((u64)1) << addr_width) - 1;
443 /* we don't need lock here, nobody else touches the iova range */
445 while (level <= total) {
446 tmp = align_to_level(start, level);
447 if (tmp >= end || (tmp + level_size(level) > end))
451 pte = dma_addr_level_pte(domain, tmp, level);
454 phys_to_virt(dma_pte_addr(*pte)));
456 __iommu_flush_cache(domain->iommu,
459 tmp += level_size(level);
464 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
465 free_pgtable_page(domain->pgd);
471 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
473 struct root_entry *root;
476 root = (struct root_entry *)alloc_pgtable_page();
480 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
482 spin_lock_irqsave(&iommu->lock, flags);
483 iommu->root_entry = root;
484 spin_unlock_irqrestore(&iommu->lock, flags);
489 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
491 cycles_t start_time = get_cycles();\
493 sts = op (iommu->reg + offset);\
496 if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
497 panic("DMAR hardware is malfunctioning\n");\
502 static void iommu_set_root_entry(struct intel_iommu *iommu)
508 addr = iommu->root_entry;
510 spin_lock_irqsave(&iommu->register_lock, flag);
511 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
513 cmd = iommu->gcmd | DMA_GCMD_SRTP;
514 writel(cmd, iommu->reg + DMAR_GCMD_REG);
516 /* Make sure hardware complete it */
517 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
518 readl, (sts & DMA_GSTS_RTPS), sts);
520 spin_unlock_irqrestore(&iommu->register_lock, flag);
523 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
528 if (!cap_rwbf(iommu->cap))
530 val = iommu->gcmd | DMA_GCMD_WBF;
532 spin_lock_irqsave(&iommu->register_lock, flag);
533 writel(val, iommu->reg + DMAR_GCMD_REG);
535 /* Make sure hardware complete it */
536 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
537 readl, (!(val & DMA_GSTS_WBFS)), val);
539 spin_unlock_irqrestore(&iommu->register_lock, flag);
542 /* return value determine if we need a write buffer flush */
543 static int __iommu_flush_context(struct intel_iommu *iommu,
544 u16 did, u16 source_id, u8 function_mask, u64 type,
545 int non_present_entry_flush)
551 * In the non-present entry flush case, if hardware doesn't cache
552 * non-present entry we do nothing and if hardware cache non-present
553 * entry, we flush entries of domain 0 (the domain id is used to cache
554 * any non-present entries)
556 if (non_present_entry_flush) {
557 if (!cap_caching_mode(iommu->cap))
564 case DMA_CCMD_GLOBAL_INVL:
565 val = DMA_CCMD_GLOBAL_INVL;
567 case DMA_CCMD_DOMAIN_INVL:
568 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
570 case DMA_CCMD_DEVICE_INVL:
571 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
572 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
579 spin_lock_irqsave(&iommu->register_lock, flag);
580 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
582 /* Make sure hardware complete it */
583 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
584 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
586 spin_unlock_irqrestore(&iommu->register_lock, flag);
588 /* flush context entry will implictly flush write buffer */
592 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
593 int non_present_entry_flush)
595 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
596 non_present_entry_flush);
599 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
600 int non_present_entry_flush)
602 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
603 non_present_entry_flush);
606 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
607 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
609 return __iommu_flush_context(iommu, did, source_id, function_mask,
610 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
613 /* return value determine if we need a write buffer flush */
614 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
615 u64 addr, unsigned int size_order, u64 type,
616 int non_present_entry_flush)
618 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
619 u64 val = 0, val_iva = 0;
623 * In the non-present entry flush case, if hardware doesn't cache
624 * non-present entry we do nothing and if hardware cache non-present
625 * entry, we flush entries of domain 0 (the domain id is used to cache
626 * any non-present entries)
628 if (non_present_entry_flush) {
629 if (!cap_caching_mode(iommu->cap))
636 case DMA_TLB_GLOBAL_FLUSH:
637 /* global flush doesn't need set IVA_REG */
638 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
640 case DMA_TLB_DSI_FLUSH:
641 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
643 case DMA_TLB_PSI_FLUSH:
644 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
645 /* Note: always flush non-leaf currently */
646 val_iva = size_order | addr;
651 /* Note: set drain read/write */
654 * This is probably to be super secure.. Looks like we can
655 * ignore it without any impact.
657 if (cap_read_drain(iommu->cap))
658 val |= DMA_TLB_READ_DRAIN;
660 if (cap_write_drain(iommu->cap))
661 val |= DMA_TLB_WRITE_DRAIN;
663 spin_lock_irqsave(&iommu->register_lock, flag);
664 /* Note: Only uses first TLB reg currently */
666 dmar_writeq(iommu->reg + tlb_offset, val_iva);
667 dmar_writeq(iommu->reg + tlb_offset + 8, val);
669 /* Make sure hardware complete it */
670 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
671 dmar_readq, (!(val & DMA_TLB_IVT)), val);
673 spin_unlock_irqrestore(&iommu->register_lock, flag);
675 /* check IOTLB invalidation granularity */
676 if (DMA_TLB_IAIG(val) == 0)
677 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
678 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
679 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
680 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
681 /* flush context entry will implictly flush write buffer */
685 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
686 int non_present_entry_flush)
688 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
689 non_present_entry_flush);
692 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
693 int non_present_entry_flush)
695 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
696 non_present_entry_flush);
699 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
700 u64 addr, unsigned int pages, int non_present_entry_flush)
704 BUG_ON(addr & (~PAGE_MASK_4K));
707 /* Fallback to domain selective flush if no PSI support */
708 if (!cap_pgsel_inv(iommu->cap))
709 return iommu_flush_iotlb_dsi(iommu, did,
710 non_present_entry_flush);
713 * PSI requires page size to be 2 ^ x, and the base address is naturally
714 * aligned to the size
716 mask = ilog2(__roundup_pow_of_two(pages));
717 /* Fallback to domain selective flush if size is too big */
718 if (mask > cap_max_amask_val(iommu->cap))
719 return iommu_flush_iotlb_dsi(iommu, did,
720 non_present_entry_flush);
722 return __iommu_flush_iotlb(iommu, did, addr, mask,
723 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
726 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
731 spin_lock_irqsave(&iommu->register_lock, flags);
732 pmen = readl(iommu->reg + DMAR_PMEN_REG);
733 pmen &= ~DMA_PMEN_EPM;
734 writel(pmen, iommu->reg + DMAR_PMEN_REG);
736 /* wait for the protected region status bit to clear */
737 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
738 readl, !(pmen & DMA_PMEN_PRS), pmen);
740 spin_unlock_irqrestore(&iommu->register_lock, flags);
743 static int iommu_enable_translation(struct intel_iommu *iommu)
748 spin_lock_irqsave(&iommu->register_lock, flags);
749 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
751 /* Make sure hardware complete it */
752 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
753 readl, (sts & DMA_GSTS_TES), sts);
755 iommu->gcmd |= DMA_GCMD_TE;
756 spin_unlock_irqrestore(&iommu->register_lock, flags);
760 static int iommu_disable_translation(struct intel_iommu *iommu)
765 spin_lock_irqsave(&iommu->register_lock, flag);
766 iommu->gcmd &= ~DMA_GCMD_TE;
767 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
769 /* Make sure hardware complete it */
770 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
771 readl, (!(sts & DMA_GSTS_TES)), sts);
773 spin_unlock_irqrestore(&iommu->register_lock, flag);
777 /* iommu interrupt handling. Most stuff are MSI-like. */
779 static const char *fault_reason_strings[] =
782 "Present bit in root entry is clear",
783 "Present bit in context entry is clear",
784 "Invalid context entry",
785 "Access beyond MGAW",
786 "PTE Write access is not set",
787 "PTE Read access is not set",
788 "Next page table ptr is invalid",
789 "Root table address invalid",
790 "Context table ptr is invalid",
791 "non-zero reserved fields in RTP",
792 "non-zero reserved fields in CTP",
793 "non-zero reserved fields in PTE",
795 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
797 const char *dmar_get_fault_reason(u8 fault_reason)
799 if (fault_reason > MAX_FAULT_REASON_IDX)
802 return fault_reason_strings[fault_reason];
805 void dmar_msi_unmask(unsigned int irq)
807 struct intel_iommu *iommu = get_irq_data(irq);
811 spin_lock_irqsave(&iommu->register_lock, flag);
812 writel(0, iommu->reg + DMAR_FECTL_REG);
813 /* Read a reg to force flush the post write */
814 readl(iommu->reg + DMAR_FECTL_REG);
815 spin_unlock_irqrestore(&iommu->register_lock, flag);
818 void dmar_msi_mask(unsigned int irq)
821 struct intel_iommu *iommu = get_irq_data(irq);
824 spin_lock_irqsave(&iommu->register_lock, flag);
825 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
826 /* Read a reg to force flush the post write */
827 readl(iommu->reg + DMAR_FECTL_REG);
828 spin_unlock_irqrestore(&iommu->register_lock, flag);
831 void dmar_msi_write(int irq, struct msi_msg *msg)
833 struct intel_iommu *iommu = get_irq_data(irq);
836 spin_lock_irqsave(&iommu->register_lock, flag);
837 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
838 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
839 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
840 spin_unlock_irqrestore(&iommu->register_lock, flag);
843 void dmar_msi_read(int irq, struct msi_msg *msg)
845 struct intel_iommu *iommu = get_irq_data(irq);
848 spin_lock_irqsave(&iommu->register_lock, flag);
849 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
850 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
851 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
852 spin_unlock_irqrestore(&iommu->register_lock, flag);
855 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
856 u8 fault_reason, u16 source_id, u64 addr)
860 reason = dmar_get_fault_reason(fault_reason);
863 "DMAR:[%s] Request device [%02x:%02x.%d] "
865 "DMAR:[fault reason %02d] %s\n",
866 (type ? "DMA Read" : "DMA Write"),
867 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
868 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
872 #define PRIMARY_FAULT_REG_LEN (16)
873 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
875 struct intel_iommu *iommu = dev_id;
876 int reg, fault_index;
880 spin_lock_irqsave(&iommu->register_lock, flag);
881 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
883 /* TBD: ignore advanced fault log currently */
884 if (!(fault_status & DMA_FSTS_PPF))
887 fault_index = dma_fsts_fault_record_index(fault_status);
888 reg = cap_fault_reg_offset(iommu->cap);
896 /* highest 32 bits */
897 data = readl(iommu->reg + reg +
898 fault_index * PRIMARY_FAULT_REG_LEN + 12);
899 if (!(data & DMA_FRCD_F))
902 fault_reason = dma_frcd_fault_reason(data);
903 type = dma_frcd_type(data);
905 data = readl(iommu->reg + reg +
906 fault_index * PRIMARY_FAULT_REG_LEN + 8);
907 source_id = dma_frcd_source_id(data);
909 guest_addr = dmar_readq(iommu->reg + reg +
910 fault_index * PRIMARY_FAULT_REG_LEN);
911 guest_addr = dma_frcd_page_addr(guest_addr);
912 /* clear the fault */
913 writel(DMA_FRCD_F, iommu->reg + reg +
914 fault_index * PRIMARY_FAULT_REG_LEN + 12);
916 spin_unlock_irqrestore(&iommu->register_lock, flag);
918 iommu_page_fault_do_one(iommu, type, fault_reason,
919 source_id, guest_addr);
922 if (fault_index > cap_num_fault_regs(iommu->cap))
924 spin_lock_irqsave(&iommu->register_lock, flag);
927 /* clear primary fault overflow */
928 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
929 if (fault_status & DMA_FSTS_PFO)
930 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
932 spin_unlock_irqrestore(&iommu->register_lock, flag);
936 int dmar_set_interrupt(struct intel_iommu *iommu)
942 printk(KERN_ERR "IOMMU: no free vectors\n");
946 set_irq_data(irq, iommu);
949 ret = arch_setup_dmar_msi(irq);
951 set_irq_data(irq, NULL);
957 /* Force fault register is cleared */
958 iommu_page_fault(irq, iommu);
960 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
962 printk(KERN_ERR "IOMMU: can't request irq\n");
966 static int iommu_init_domains(struct intel_iommu *iommu)
968 unsigned long ndomains;
969 unsigned long nlongs;
971 ndomains = cap_ndoms(iommu->cap);
972 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
973 nlongs = BITS_TO_LONGS(ndomains);
975 /* TBD: there might be 64K domains,
976 * consider other allocation for future chip
978 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
979 if (!iommu->domain_ids) {
980 printk(KERN_ERR "Allocating domain id array failed\n");
983 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
985 if (!iommu->domains) {
986 printk(KERN_ERR "Allocating domain array failed\n");
987 kfree(iommu->domain_ids);
991 spin_lock_init(&iommu->lock);
994 * if Caching mode is set, then invalid translations are tagged
995 * with domainid 0. Hence we need to pre-allocate it.
997 if (cap_caching_mode(iommu->cap))
998 set_bit(0, iommu->domain_ids);
1003 static void domain_exit(struct dmar_domain *domain);
1005 void free_dmar_iommu(struct intel_iommu *iommu)
1007 struct dmar_domain *domain;
1010 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1011 for (; i < cap_ndoms(iommu->cap); ) {
1012 domain = iommu->domains[i];
1013 clear_bit(i, iommu->domain_ids);
1014 domain_exit(domain);
1015 i = find_next_bit(iommu->domain_ids,
1016 cap_ndoms(iommu->cap), i+1);
1019 if (iommu->gcmd & DMA_GCMD_TE)
1020 iommu_disable_translation(iommu);
1023 set_irq_data(iommu->irq, NULL);
1024 /* This will mask the irq */
1025 free_irq(iommu->irq, iommu);
1026 destroy_irq(iommu->irq);
1029 kfree(iommu->domains);
1030 kfree(iommu->domain_ids);
1032 /* free context mapping */
1033 free_context_table(iommu);
1036 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1039 unsigned long ndomains;
1040 struct dmar_domain *domain;
1041 unsigned long flags;
1043 domain = alloc_domain_mem();
1047 ndomains = cap_ndoms(iommu->cap);
1049 spin_lock_irqsave(&iommu->lock, flags);
1050 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1051 if (num >= ndomains) {
1052 spin_unlock_irqrestore(&iommu->lock, flags);
1053 free_domain_mem(domain);
1054 printk(KERN_ERR "IOMMU: no free domain ids\n");
1058 set_bit(num, iommu->domain_ids);
1060 domain->iommu = iommu;
1061 iommu->domains[num] = domain;
1062 spin_unlock_irqrestore(&iommu->lock, flags);
1067 static void iommu_free_domain(struct dmar_domain *domain)
1069 unsigned long flags;
1071 spin_lock_irqsave(&domain->iommu->lock, flags);
1072 clear_bit(domain->id, domain->iommu->domain_ids);
1073 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1076 static struct iova_domain reserved_iova_list;
1077 static struct lock_class_key reserved_alloc_key;
1078 static struct lock_class_key reserved_rbtree_key;
1080 static void dmar_init_reserved_ranges(void)
1082 struct pci_dev *pdev = NULL;
1087 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1089 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1090 &reserved_alloc_key);
1091 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1092 &reserved_rbtree_key);
1094 /* IOAPIC ranges shouldn't be accessed by DMA */
1095 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1096 IOVA_PFN(IOAPIC_RANGE_END));
1098 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1100 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1101 for_each_pci_dev(pdev) {
1104 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1105 r = &pdev->resource[i];
1106 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1109 addr &= PAGE_MASK_4K;
1110 size = r->end - addr;
1111 size = PAGE_ALIGN_4K(size);
1112 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1113 IOVA_PFN(size + addr) - 1);
1115 printk(KERN_ERR "Reserve iova failed\n");
1121 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1123 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Round a guest address width up to the nearest width the
 * page-table hierarchy can actually represent: widths grow in
 * 9-bit (one-level) steps from a 12-bit page offset, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
1140 static int domain_init(struct dmar_domain *domain, int guest_width)
1142 struct intel_iommu *iommu;
1143 int adjust_width, agaw;
1144 unsigned long sagaw;
1146 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1147 spin_lock_init(&domain->mapping_lock);
1149 domain_reserve_special_ranges(domain);
1151 /* calculate AGAW */
1152 iommu = domain->iommu;
1153 if (guest_width > cap_mgaw(iommu->cap))
1154 guest_width = cap_mgaw(iommu->cap);
1155 domain->gaw = guest_width;
1156 adjust_width = guestwidth_to_adjustwidth(guest_width);
1157 agaw = width_to_agaw(adjust_width);
1158 sagaw = cap_sagaw(iommu->cap);
1159 if (!test_bit(agaw, &sagaw)) {
1160 /* hardware doesn't support it, choose a bigger one */
1161 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1162 agaw = find_next_bit(&sagaw, 5, agaw);
1166 domain->agaw = agaw;
1167 INIT_LIST_HEAD(&domain->devices);
1169 /* always allocate the top pgd */
1170 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1173 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1177 static void domain_exit(struct dmar_domain *domain)
1181 /* Domain 0 is reserved, so dont process it */
1185 domain_remove_dev_info(domain);
1187 put_iova_domain(&domain->iovad);
1188 end = DOMAIN_MAX_ADDR(domain->gaw);
1189 end = end & (~PAGE_MASK_4K);
1192 dma_pte_clear_range(domain, 0, end);
1194 /* free page tables */
1195 dma_pte_free_pagetable(domain, 0, end);
1197 iommu_free_domain(domain);
1198 free_domain_mem(domain);
1201 static int domain_context_mapping_one(struct dmar_domain *domain,
1204 struct context_entry *context;
1205 struct intel_iommu *iommu = domain->iommu;
1206 unsigned long flags;
1208 pr_debug("Set context mapping for %02x:%02x.%d\n",
1209 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1210 BUG_ON(!domain->pgd);
1211 context = device_to_context_entry(iommu, bus, devfn);
1214 spin_lock_irqsave(&iommu->lock, flags);
1215 if (context_present(*context)) {
1216 spin_unlock_irqrestore(&iommu->lock, flags);
1220 context_set_domain_id(*context, domain->id);
1221 context_set_address_width(*context, domain->agaw);
1222 context_set_address_root(*context, virt_to_phys(domain->pgd));
1223 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1224 context_set_fault_enable(*context);
1225 context_set_present(*context);
1226 __iommu_flush_cache(iommu, context, sizeof(*context));
1228 /* it's a non-present to present mapping */
1229 if (iommu_flush_context_device(iommu, domain->id,
1230 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1231 iommu_flush_write_buffer(iommu);
1233 iommu_flush_iotlb_dsi(iommu, 0, 0);
1234 spin_unlock_irqrestore(&iommu->lock, flags);
1239 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1242 struct pci_dev *tmp, *parent;
1244 ret = domain_context_mapping_one(domain, pdev->bus->number,
1249 /* dependent device mapping */
1250 tmp = pci_find_upstream_pcie_bridge(pdev);
1253 /* Secondary interface's bus number and devfn 0 */
1254 parent = pdev->bus->self;
1255 while (parent != tmp) {
1256 ret = domain_context_mapping_one(domain, parent->bus->number,
1260 parent = parent->bus->self;
1262 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1263 return domain_context_mapping_one(domain,
1264 tmp->subordinate->number, 0);
1265 else /* this is a legacy PCI bridge */
1266 return domain_context_mapping_one(domain,
1267 tmp->bus->number, tmp->devfn);
1270 static int domain_context_mapped(struct dmar_domain *domain,
1271 struct pci_dev *pdev)
1274 struct pci_dev *tmp, *parent;
1276 ret = device_context_mapped(domain->iommu,
1277 pdev->bus->number, pdev->devfn);
1280 /* dependent device mapping */
1281 tmp = pci_find_upstream_pcie_bridge(pdev);
1284 /* Secondary interface's bus number and devfn 0 */
1285 parent = pdev->bus->self;
1286 while (parent != tmp) {
1287 ret = device_context_mapped(domain->iommu, parent->bus->number,
1291 parent = parent->bus->self;
1294 return device_context_mapped(domain->iommu,
1295 tmp->subordinate->number, 0);
1297 return device_context_mapped(domain->iommu,
1298 tmp->bus->number, tmp->devfn);
1302 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1303 u64 hpa, size_t size, int prot)
1305 u64 start_pfn, end_pfn;
1306 struct dma_pte *pte;
1309 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1311 iova &= PAGE_MASK_4K;
1312 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1313 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1315 while (start_pfn < end_pfn) {
1316 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1319 /* We don't need lock here, nobody else
1320 * touches the iova range
1322 BUG_ON(dma_pte_addr(*pte));
1323 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1324 dma_set_pte_prot(*pte, prot);
1325 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1332 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1334 clear_context_table(domain->iommu, bus, devfn);
1335 iommu_flush_context_global(domain->iommu, 0);
1336 iommu_flush_iotlb_global(domain->iommu, 0);
/*
 * Detach every device currently attached to @domain and free its
 * per-device bookkeeping.  device_domain_lock is dropped around the
 * hardware detach/flush and re-taken for the next list element.
 */
1339 static void domain_remove_dev_info(struct dmar_domain *domain)
1341 struct device_domain_info *info;
1342 unsigned long flags;
1344 spin_lock_irqsave(&device_domain_lock, flags);
1345 while (!list_empty(&domain->devices)) {
1346 info = list_entry(domain->devices.next,
1347 struct device_domain_info, link);
/* Unlink from both the per-domain and the global device lists. */
1348 list_del(&info->link);
1349 list_del(&info->global);
1351 info->dev->dev.archdata.iommu = NULL;
/* Hardware flushes must not run under the spinlock. */
1352 spin_unlock_irqrestore(&device_domain_lock, flags);
1354 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1355 free_devinfo_mem(info);
1357 spin_lock_irqsave(&device_domain_lock, flags);
1359 spin_unlock_irqrestore(&device_domain_lock, flags);
1364 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
/*
 * Return the dmar_domain previously attached to @pdev, or NULL when the
 * device has no device_domain_info yet.
 */
1366 struct dmar_domain *
1367 find_domain(struct pci_dev *pdev)
1369 struct device_domain_info *info;
1371 /* No lock here, assumes no domain exit in normal case */
1372 info = pdev->dev.archdata.iommu;
1374 return info->domain;
1378 /* domain is initialized */
/*
 * Find or create the dmar_domain for @pdev with guest address width
 * @gaw.  Devices behind the same PCIe-to-PCI bridge must share one
 * domain, so the upstream bridge's (bus, devfn) is used as the lookup
 * key for such devices.  Races with concurrent callers are resolved by
 * re-checking under device_domain_lock before committing.
 */
1379 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1381 struct dmar_domain *domain, *found = NULL;
1382 struct intel_iommu *iommu;
1383 struct dmar_drhd_unit *drhd;
1384 struct device_domain_info *info, *tmp;
1385 struct pci_dev *dev_tmp;
1386 unsigned long flags;
1387 int bus = 0, devfn = 0;
/* Fast path: domain already attached to this device. */
1389 domain = find_domain(pdev);
1393 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
/* Key on the bridge: secondary bus for PCIe, own bus/devfn otherwise. */
1395 if (dev_tmp->is_pcie) {
1396 bus = dev_tmp->subordinate->number;
1399 bus = dev_tmp->bus->number;
1400 devfn = dev_tmp->devfn;
1402 spin_lock_irqsave(&device_domain_lock, flags);
1403 list_for_each_entry(info, &device_domain_list, global) {
1404 if (info->bus == bus && info->devfn == devfn) {
1405 found = info->domain;
1409 spin_unlock_irqrestore(&device_domain_lock, flags);
1410 /* pcie-pci bridge already has a domain, uses it */
1417 /* Allocate new domain for the device */
1418 drhd = dmar_find_matched_drhd_unit(pdev);
1420 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1424 iommu = drhd->iommu;
1426 domain = iommu_alloc_domain(iommu);
1430 if (domain_init(domain, gaw)) {
1431 domain_exit(domain);
1435 /* register pcie-to-pci device */
1437 info = alloc_devinfo_mem();
1439 domain_exit(domain);
1443 info->devfn = devfn;
1445 info->domain = domain;
1446 /* This domain is shared by devices under p2p bridge */
1447 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1449 /* pcie-to-pci bridge already has a domain, uses it */
/* Re-check under the lock in case another CPU registered the bridge. */
1451 spin_lock_irqsave(&device_domain_lock, flags);
1452 list_for_each_entry(tmp, &device_domain_list, global) {
1453 if (tmp->bus == bus && tmp->devfn == devfn) {
1454 found = tmp->domain;
/* Lost the race: drop our allocations, use the existing domain. */
1459 free_devinfo_mem(info);
1460 domain_exit(domain);
1463 list_add(&info->link, &domain->devices);
1464 list_add(&info->global, &device_domain_list);
1466 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Now attach the device itself to the chosen domain. */
1470 info = alloc_devinfo_mem();
1473 info->bus = pdev->bus->number;
1474 info->devfn = pdev->devfn;
1476 info->domain = domain;
1477 spin_lock_irqsave(&device_domain_lock, flags);
1478 /* somebody is fast */
1479 found = find_domain(pdev);
1480 if (found != NULL) {
1481 spin_unlock_irqrestore(&device_domain_lock, flags);
1482 if (found != domain) {
1483 domain_exit(domain);
1486 free_devinfo_mem(info);
1489 list_add(&info->link, &domain->devices);
1490 list_add(&info->global, &device_domain_list);
1491 pdev->dev.archdata.iommu = info;
1492 spin_unlock_irqrestore(&device_domain_lock, flags);
1495 /* recheck it here, maybe others set it */
1496 return find_domain(pdev);
/*
 * Create a 1:1 (identity) mapping of [start, end) for @pdev: reserve
 * the iova range so the allocator never hands it out, then map each
 * page so IO virtual address == host physical address.  Used for RMRR
 * regions and the gfx/ISA workarounds.  Returns 0 on success.
 */
1499 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1501 struct dmar_domain *domain;
1507 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1508 pci_name(pdev), start, end);
1509 /* page table init */
1510 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1514 /* The address might not be aligned */
1515 base = start & PAGE_MASK_4K;
1517 size = PAGE_ALIGN_4K(size);
/* Keep the identity range out of the dynamic iova allocator. */
1518 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1519 IOVA_PFN(base + size) - 1)) {
1520 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1525 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1526 size, base, pci_name(pdev));
1528 * RMRR range might have overlap with physical memory range,
/* Clear any overlapping PTEs before installing the identity map. */
1531 dma_pte_clear_range(domain, base, base + size);
1533 ret = domain_page_mapping(domain, base, base, size,
1534 DMA_PTE_READ|DMA_PTE_WRITE);
1538 /* context entry init */
1539 ret = domain_context_mapping(domain, pdev);
/* error path: tear the domain down again */
1543 domain_exit(domain);
/*
 * Identity-map an RMRR (Reserved Memory Region Reporting) range for
 * @pdev.  Devices marked with the dummy domain info bypass translation
 * entirely and need no mapping.  rmrr->end_address is inclusive, hence
 * the "+ 1" to form the exclusive end.
 */
1548 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1549 struct pci_dev *pdev)
1551 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1553 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1554 rmrr->end_address + 1);
1557 #ifdef CONFIG_DMAR_GFX_WA
/* Cookie passed through work_with_active_regions() to the callback. */
1558 struct iommu_prepare_data {
1559 struct pci_dev *pdev;
/*
 * Per-memory-region callback: identity-map the active RAM range
 * [start_pfn, end_pfn) for data->pdev, recording the result in
 * data->ret.
 */
1563 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1564 unsigned long end_pfn, void *datax)
1566 struct iommu_prepare_data *data;
1568 data = (struct iommu_prepare_data *)datax;
1570 data->ret = iommu_prepare_identity_map(data->pdev,
1571 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
/*
 * Identity-map every active RAM region on every online node for @pdev
 * (graphics workaround: the device may DMA anywhere in RAM).
 */
1576 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1579 struct iommu_prepare_data data;
1584 for_each_online_node(nid) {
1585 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
/*
 * Graphics workaround: give every display-class PCI device a full 1:1
 * mapping of system RAM, since some gfx devices/drivers are not
 * IOMMU-aware.  Devices already marked with the dummy domain info are
 * skipped.
 */
1592 static void __init iommu_prepare_gfx_mapping(void)
1594 struct pci_dev *pdev = NULL;
1597 for_each_pci_dev(pdev) {
1598 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1599 !IS_GFX_DEVICE(pdev))
1601 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1603 ret = iommu_prepare_with_active_regions(pdev);
1605 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1610 #ifdef CONFIG_DMAR_FLOPPY_WA
1611 static inline void iommu_prepare_isa(void)
1613 struct pci_dev *pdev;
1616 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1620 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1621 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1624 printk("IOMMU: Failed to create 0-64M identity map, "
1625 "floppy might not work\n");
/* Stub: no floppy workaround needed without CONFIG_DMAR_FLOPPY_WA. */
1629 static inline void iommu_prepare_isa(void)
1633 #endif /* !CONFIG_DMAR_FLOPPY_WA */
/*
 * Bring up all DMAR hardware units: allocate per-IOMMU domain tables
 * and root entries, install RMRR identity maps and the gfx/ISA
 * workarounds, then program the root entries, flush caches and enable
 * translation on each unit.  Returns 0 on success.
 */
1635 int __init init_dmars(void)
1637 struct dmar_drhd_unit *drhd;
1638 struct dmar_rmrr_unit *rmrr;
1639 struct pci_dev *pdev;
1640 struct intel_iommu *iommu;
1641 int i, ret, unit = 0;
1646 * initialize and program root entry to not present
/* First pass: count units (g_num_of_iommus). */
1649 for_each_drhd_unit(drhd) {
1652 * lock not needed as this is only incremented in the single
1653 * threaded kernel __init code path all other access are read
/* One deferred-unmap table per IOMMU, used by the flush timer. */
1658 deferred_flush = kzalloc(g_num_of_iommus *
1659 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1660 if (!deferred_flush) {
1665 for_each_drhd_unit(drhd) {
1669 iommu = drhd->iommu;
1671 ret = iommu_init_domains(iommu);
1677 * we could share the same root & context tables
1678 * among all IOMMU's. Need to Split it later.
1680 ret = iommu_alloc_root_entry(iommu);
1682 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1689 * for each dev attached to rmrr
1691 * locate drhd for dev, alloc domain for dev
1692 * allocate free domain
1693 * allocate page table entries for rmrr
1694 * if context not allocated for bus
1695 * allocate and init context
1696 * set present in root table for this bus
1697 * init context with domain, translation etc
1701 for_each_rmrr_units(rmrr) {
1702 for (i = 0; i < rmrr->devices_cnt; i++) {
1703 pdev = rmrr->devices[i];
1704 /* some BIOS lists non-exist devices in DMAR table */
1707 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1710 "IOMMU: mapping reserved region failed\n");
1714 iommu_prepare_gfx_mapping();
1716 iommu_prepare_isa();
1721 * global invalidate context cache
1722 * global invalidate iotlb
1723 * enable translation
1725 for_each_drhd_unit(drhd) {
1728 iommu = drhd->iommu;
1729 sprintf (iommu->name, "dmar%d", unit++);
1731 iommu_flush_write_buffer(iommu);
/* Hook up the DMAR fault interrupt before enabling translation. */
1733 ret = dmar_set_interrupt(iommu);
1737 iommu_set_root_entry(iommu);
1739 iommu_flush_context_global(iommu, 0);
1740 iommu_flush_iotlb_global(iommu, 0);
1742 iommu_disable_protect_mem_regions(iommu);
1744 ret = iommu_enable_translation(iommu);
/* error path: unwind each unit */
1751 for_each_drhd_unit(drhd) {
1754 iommu = drhd->iommu;
1760 static inline u64 aligned_size(u64 host_addr, size_t size)
1763 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1764 return PAGE_ALIGN_4K(addr);
/*
 * Allocate @size bytes of IO virtual address space from @domain's iova
 * allocator, constrained to addresses below @end (clamped to the
 * domain's guest address width).  Returns the iova node or NULL.
 */
1768 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1772 /* Make sure it's in range */
1773 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1774 if (!size || (IOVA_START_ADDR + size > end))
/* size_aligned allocation (last arg 1): helps IOTLB invalidation. */
1777 piova = alloc_iova(&domain->iovad,
1778 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1782 static struct iova *
/*
 * Allocate an iova for @dev honoring its DMA mask.  For 64-bit-capable
 * devices (unless dmar_forcedac), prefer the 32-bit space first and
 * fall back to the full mask only when that fails.
 */
1783 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1786 struct pci_dev *pdev = to_pci_dev(dev);
1787 struct iova *iova = NULL;
1789 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1790 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1793 * First try to allocate an io virtual address in
1794 * DMA_32BIT_MASK and if that fails then try allocating
1797 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1799 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1803 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1810 static struct dmar_domain *
/*
 * Get @pdev's domain (creating it on first use) and make sure its
 * context mapping is programmed, so the device is ready for DMA map
 * operations.  Returns NULL on failure.
 */
1811 get_valid_domain_for_dev(struct pci_dev *pdev)
1813 struct dmar_domain *domain;
1816 domain = get_domain_for_dev(pdev,
1817 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1820 "Allocating domain for %s failed", pci_name(pdev));
1824 /* make sure context mapping is ok */
1825 if (unlikely(!domain_context_mapped(domain, pdev))) {
1826 ret = domain_context_mapping(domain, pdev);
1829 "Domain context map for %s failed",
/*
 * dma_map_single() implementation: allocate an iova sized to whole 4K
 * pages covering [paddr, paddr + size), map it in the device's domain
 * with permissions derived from @dir, and return the bus address
 * (iova base plus paddr's in-page offset), or 0 on failure.
 */
1839 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1841 struct pci_dev *pdev = to_pci_dev(hwdev);
1842 struct dmar_domain *domain;
1843 unsigned long start_paddr;
1848 BUG_ON(dir == DMA_NONE);
/* Devices bypassing translation return the physical address as-is. */
1849 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1852 domain = get_valid_domain_for_dev(pdev);
1856 size = aligned_size((u64)paddr, size);
1858 iova = __intel_alloc_iova(hwdev, domain, size);
1862 start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1865 * Check if DMAR supports zero-length reads on write only
1868 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1869 !cap_zlr(domain->iommu->cap))
1870 prot |= DMA_PTE_READ;
1871 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1872 prot |= DMA_PTE_WRITE;
1874 * paddr - (paddr + size) might be partial page, we should map the whole
1875 * page. Note: if two part of one page are separately mapped, we
1876 * might have two guest_addr mapping to the same host paddr, but this
1877 * is not a big problem
1879 ret = domain_page_mapping(domain, start_paddr,
1880 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1884 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1885 pci_name(pdev), size, (u64)paddr,
1886 size, (u64)start_paddr, dir);
1888 /* it's a non-present to present mapping */
1889 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1890 start_paddr, size >> PAGE_SHIFT_4K, 1);
1892 iommu_flush_write_buffer(domain->iommu);
/* Return bus address: iova base + offset of paddr within its page. */
1894 return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
/* error path */
1898 __free_iova(&domain->iovad, iova);
1899 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1900 pci_name(pdev), size, (u64)paddr, dir);
/*
 * Drain every per-IOMMU deferred-unmap table: one global IOTLB flush
 * per IOMMU, then free all iovas queued since the last flush.  Caller
 * must hold async_umap_flush_lock.
 */
1904 static void flush_unmaps(void)
1910 /* just flush them all */
1911 for (i = 0; i < g_num_of_iommus; i++) {
1912 if (deferred_flush[i].next) {
1913 struct intel_iommu *iommu =
1914 deferred_flush[i].domain[0]->iommu;
/* One global flush covers every queued entry on this IOMMU. */
1916 iommu_flush_iotlb_global(iommu, 0);
1917 for (j = 0; j < deferred_flush[i].next; j++) {
1918 __free_iova(&deferred_flush[i].domain[j]->iovad,
1919 deferred_flush[i].iova[j]);
1921 deferred_flush[i].next = 0;
/* Timer callback (unmap_timer): periodically drain deferred unmaps. */
1928 static void flush_unmaps_timeout(unsigned long data)
1930 unsigned long flags;
1932 spin_lock_irqsave(&async_umap_flush_lock, flags);
1934 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * Queue (dom, iova) for deferred release on dom's IOMMU.  When the
 * queue hits HIGH_WATER_MARK it is drained immediately; otherwise the
 * unmap_timer drains it within ~10ms.  This batches expensive IOTLB
 * flushes across many unmaps.
 */
1937 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1939 unsigned long flags;
1942 spin_lock_irqsave(&async_umap_flush_lock, flags);
1943 if (list_size == HIGH_WATER_MARK)
1946 iommu_id = dom->iommu->seq_id;
1948 next = deferred_flush[iommu_id].next;
1949 deferred_flush[iommu_id].domain[next] = dom;
1950 deferred_flush[iommu_id].iova[next] = iova;
1951 deferred_flush[iommu_id].next++;
/* Arm the drain timer for the first entry queued. */
1954 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1958 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * dma_unmap_single() implementation: clear the PTEs for the mapped
 * range, free the page-table pages, then either flush the IOTLB and
 * free the iova immediately (intel_iommu_strict) or queue the iova for
 * a batched deferred flush.
 */
1961 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1962 size_t size, int dir)
1964 struct pci_dev *pdev = to_pci_dev(dev);
1965 struct dmar_domain *domain;
1966 unsigned long start_addr;
/* Nothing to do for devices that bypass translation. */
1969 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1971 domain = find_domain(pdev);
1974 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1978 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1979 size = aligned_size((u64)dev_addr, size);
1981 pr_debug("Device %s unmapping: %lx@%llx\n",
1982 pci_name(pdev), size, (u64)start_addr);
1984 /* clear the whole page */
1985 dma_pte_clear_range(domain, start_addr, start_addr + size);
1986 /* free page tables */
1987 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1988 if (intel_iommu_strict) {
1989 if (iommu_flush_iotlb_psi(domain->iommu,
1990 domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
1991 iommu_flush_write_buffer(domain->iommu);
/* strict mode: iova reusable only after the flush above */
1993 __free_iova(&domain->iovad, iova);
1995 add_unmap(domain, iova);
1997 * queue up the release of the unmap to save the 1/6th of the
1998 * cpu used up by the iotlb flush operation...
/*
 * dma_alloc_coherent() implementation: grab zeroed pages and map them
 * bidirectionally through the IOMMU, returning the kernel virtual
 * address and filling *dma_handle with the bus address.
 */
2003 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
2004 dma_addr_t *dma_handle, gfp_t flags)
2009 size = PAGE_ALIGN_4K(size);
2010 order = get_order(size);
/* The IOMMU removes the need for low-memory zones. */
2011 flags &= ~(GFP_DMA | GFP_DMA32);
2013 vaddr = (void *)__get_free_pages(flags, order);
2016 memset(vaddr, 0, size);
2018 *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
/* mapping failed: give the pages back */
2021 free_pages((unsigned long)vaddr, order);
/*
 * dma_free_coherent() implementation: unmap the IOMMU mapping made by
 * intel_alloc_coherent(), then release the pages.
 */
2025 static void intel_free_coherent(struct device *hwdev, size_t size,
2026 void *vaddr, dma_addr_t dma_handle)
2030 size = PAGE_ALIGN_4K(size);
2031 order = get_order(size);
2033 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2034 free_pages((unsigned long)vaddr, order);
2037 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
/*
 * dma_unmap_sg() implementation: the whole scatterlist was mapped into
 * one contiguous iova range (see intel_map_sg), so recompute its total
 * aligned size, clear the PTEs, free the page tables, flush the IOTLB
 * and release the iova.
 */
2038 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2039 int nelems, int dir)
2042 struct pci_dev *pdev = to_pci_dev(hwdev);
2043 struct dmar_domain *domain;
2044 unsigned long start_addr;
2048 struct scatterlist *sg;
2050 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2053 domain = find_domain(pdev);
/* The first entry's dma_address locates the shared iova node. */
2055 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
/* Recompute total mapped size exactly as intel_map_sg did. */
2058 for_each_sg(sglist, sg, nelems, i) {
2059 addr = SG_ENT_VIRT_ADDRESS(sg);
2060 size += aligned_size((u64)addr, sg->length);
2063 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2065 /* clear the whole page */
2066 dma_pte_clear_range(domain, start_addr, start_addr + size);
2067 /* free page tables */
2068 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2070 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2071 size >> PAGE_SHIFT_4K, 0))
2072 iommu_flush_write_buffer(domain->iommu);
2075 __free_iova(&domain->iovad, iova);
/*
 * map_sg for devices that bypass the IOMMU: each entry's bus address is
 * simply the physical address of its data, no translation installed.
 */
2078 static int intel_nontranslate_map_sg(struct device *hddev,
2079 struct scatterlist *sglist, int nelems, int dir)
2082 struct scatterlist *sg;
2084 for_each_sg(sglist, sg, nelems, i) {
2085 BUG_ON(!sg_page(sg));
2086 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2087 sg->dma_length = sg->length;
/*
 * dma_map_sg() implementation: allocate ONE iova range big enough for
 * the whole scatterlist, then map each entry back-to-back inside it
 * (first pass sums the aligned sizes, second pass installs the PTEs).
 * On a per-entry mapping failure everything mapped so far is torn down.
 */
2092 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2093 int nelems, int dir)
2097 struct pci_dev *pdev = to_pci_dev(hwdev);
2098 struct dmar_domain *domain;
2102 struct iova *iova = NULL;
2104 struct scatterlist *sg;
2105 unsigned long start_addr;
2107 BUG_ON(dir == DMA_NONE);
2108 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2109 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2111 domain = get_valid_domain_for_dev(pdev);
/* Pass 1: total page-aligned size of all entries. */
2115 for_each_sg(sglist, sg, nelems, i) {
2116 addr = SG_ENT_VIRT_ADDRESS(sg);
2117 addr = (void *)virt_to_phys(addr);
2118 size += aligned_size((u64)addr, sg->length);
2121 iova = __intel_alloc_iova(hwdev, domain, size);
/* allocation failed: signal error via zero dma_length */
2123 sglist->dma_length = 0;
2128 * Check if DMAR supports zero-length reads on write only
2131 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2132 !cap_zlr(domain->iommu->cap))
2133 prot |= DMA_PTE_READ;
2134 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2135 prot |= DMA_PTE_WRITE;
2137 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
/* Pass 2: map each entry at the running offset within the iova. */
2139 for_each_sg(sglist, sg, nelems, i) {
2140 addr = SG_ENT_VIRT_ADDRESS(sg);
2141 addr = (void *)virt_to_phys(addr);
2142 size = aligned_size((u64)addr, sg->length);
2143 ret = domain_page_mapping(domain, start_addr + offset,
2144 ((u64)addr) & PAGE_MASK_4K,
/* failure: undo all mappings made so far */
2147 /* clear the page */
2148 dma_pte_clear_range(domain, start_addr,
2149 start_addr + offset);
2150 /* free page tables */
2151 dma_pte_free_pagetable(domain, start_addr,
2152 start_addr + offset);
2154 __free_iova(&domain->iovad, iova);
2157 sg->dma_address = start_addr + offset +
2158 ((u64)addr & (~PAGE_MASK_4K));
2159 sg->dma_length = sg->length;
2163 /* it's a non-present to present mapping */
2164 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2165 start_addr, offset >> PAGE_SHIFT_4K, 1))
2166 iommu_flush_write_buffer(domain->iommu);
/* DMA-API operations table installed as dma_ops by intel_iommu_init(). */
2170 static struct dma_mapping_ops intel_dma_ops = {
2171 .alloc_coherent = intel_alloc_coherent,
2172 .free_coherent = intel_free_coherent,
2173 .map_single = intel_map_single,
2174 .unmap_single = intel_unmap_single,
2175 .map_sg = intel_map_sg,
2176 .unmap_sg = intel_unmap_sg,
/* Create the slab cache for struct dmar_domain allocations. */
2179 static inline int iommu_domain_cache_init(void)
2183 iommu_domain_cache = kmem_cache_create("iommu_domain",
2184 sizeof(struct dmar_domain),
2189 if (!iommu_domain_cache) {
2190 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/* Create the slab cache for struct device_domain_info allocations. */
2197 static inline int iommu_devinfo_cache_init(void)
2201 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2202 sizeof(struct device_domain_info),
2207 if (!iommu_devinfo_cache) {
2208 printk(KERN_ERR "Couldn't create devinfo cache\n");
/* Create the slab cache for struct iova allocations. */
2215 static inline int iommu_iova_cache_init(void)
2219 iommu_iova_cache = kmem_cache_create("iommu_iova",
2220 sizeof(struct iova),
2225 if (!iommu_iova_cache) {
2226 printk(KERN_ERR "Couldn't create iova cache\n");
/*
 * Create all three slab caches (iova, domain, devinfo), unwinding the
 * earlier ones if a later creation fails.
 */
2233 static int __init iommu_init_mempool(void)
2236 ret = iommu_iova_cache_init();
2240 ret = iommu_domain_cache_init();
2244 ret = iommu_devinfo_cache_init();
/* error unwind, reverse order of creation */
2248 kmem_cache_destroy(iommu_domain_cache);
2250 kmem_cache_destroy(iommu_iova_cache);
/* Destroy all three slab caches created by iommu_init_mempool(). */
2255 static void __init iommu_exit_mempool(void)
2257 kmem_cache_destroy(iommu_devinfo_cache);
2258 kmem_cache_destroy(iommu_domain_cache);
2259 kmem_cache_destroy(iommu_iova_cache);
/*
 * Early boot probe for DMAR hardware; skipped when another IOMMU/
 * swiotlb is active or DMAR was disabled on the command line.
 */
2263 void __init detect_intel_iommu(void)
2265 if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2267 if (early_dmar_detect()) {
/*
 * Mark DMAR units that should be bypassed: units with no existing PCI
 * devices are ignored, and units covering only graphics devices have
 * those devices tagged with DUMMY_DEVICE_DOMAIN_INFO so DMA bypasses
 * translation for them.
 */
2272 static void __init init_no_remapping_devices(void)
2274 struct dmar_drhd_unit *drhd;
2276 for_each_drhd_unit(drhd) {
2277 if (!drhd->include_all) {
2279 for (i = 0; i < drhd->devices_cnt; i++)
2280 if (drhd->devices[i] != NULL)
2282 /* ignore DMAR unit if no pci devices exist */
2283 if (i == drhd->devices_cnt)
/* Second pass: detect gfx-only units. */
2291 for_each_drhd_unit(drhd) {
2293 if (drhd->ignored || drhd->include_all)
2296 for (i = 0; i < drhd->devices_cnt; i++)
2297 if (drhd->devices[i] &&
2298 !IS_GFX_DEVICE(drhd->devices[i]))
/* a non-gfx device exists under this unit: keep translating */
2301 if (i < drhd->devices_cnt)
2304 /* bypass IOMMU if it is just for gfx devices */
2306 for (i = 0; i < drhd->devices_cnt; i++) {
2307 if (!drhd->devices[i])
2309 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2314 int __init intel_iommu_init(void)
2318 if (no_iommu || swiotlb || dmar_disabled)
2321 if (dmar_table_init())
2324 if (dmar_dev_scope_init())
2327 iommu_init_mempool();
2328 dmar_init_reserved_ranges();
2330 init_no_remapping_devices();
2334 printk(KERN_ERR "IOMMU: dmar init failed\n");
2335 put_iova_domain(&reserved_iova_list);
2336 iommu_exit_mempool();
2340 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2342 init_timer(&unmap_timer);
2344 dma_ops = &intel_dma_ops;