/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 */
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/sysdev.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/intel-iommu.h>
#include <asm/proto.h> /* force_iommu in this header in x86-64 */
#include <asm/cacheflush.h>
#include <asm/iommu.h>
#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
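/* e.g. DOMAIN_MAX_ADDR(48) == 0x0000ffffffffffff, the highest DMA address a 48-bit domain can map */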
static void flush_unmaps_timeout(unsigned long data);

DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;
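/*
 * Instead of flushing the IOTLB on every unmap, completed unmaps are
 * queued in the per-IOMMU deferred_flush_tables above and released in
 * one global flush from flush_unmaps(), either when the 10ms
 * unmap_timer fires or when HIGH_WATER_MARK entries are pending
 * (see add_unmap() below).
 */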
static void domain_remove_dev_info(struct dmar_domain *domain);

int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
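/*
 * Devices whose archdata.iommu is set to this sentinel bypass DMA
 * remapping entirely; see init_no_remapping_devices() and the identity
 * checks at the top of the dma_ops entry points below.
 */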
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;
static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}
static inline void *alloc_pgtable_page(void)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
static inline void *alloc_domain_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_domain_cache);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_devinfo_cache);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_iova_cache);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
/* Gets context entry for a given bus and devfn */
static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)alloc_pgtable_page();
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	ret = 0;
	if (!context)
		goto out;
	ret = context_present(context[devfn]);
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}
static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context) {
		context_clear_entry(context[devfn]);
		__iommu_flush_cache(iommu, &context[devfn],
				    sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static void free_context_table(struct intel_iommu *iommu)
{
	struct root_entry *root;
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		root = &iommu->root_entry[i];
		context = get_context_addr_from_root(root);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (12 + (level - 1) * LEVEL_STRIDE);
}

static inline int address_level_offset(u64 addr, int level)
{
	return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
}

static inline u64 level_mask(int level)
{
	return ((u64)-1 << level_to_offset_bits(level));
}

static inline u64 level_size(int level)
{
	return ((u64)1 << level_to_offset_bits(level));
}

static inline u64 align_to_level(u64 addr, int level)
{
	return ((addr + level_size(level) - 1) & level_mask(level));
}
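/*
 * Worked example of the AGAW arithmetic above: a 48-bit address width
 * gives width_to_agaw(48) = (48 - 30) / 9 = 2, and agaw_to_level(2) = 4,
 * i.e. a 4-level page table.  Each level decodes LEVEL_STRIDE = 9
 * address bits (512 entries per 4K table), with the last level
 * starting at bit 12.
 */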
static struct dma_pte *addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;
	unsigned long flags;

	BUG_ON(!domain->pgd);

	addr &= (((u64)1) << addr_width) - 1;
	parent = domain->pgd;

	spin_lock_irqsave(&domain->mapping_lock, flags);
	while (level > 0) {
		void *tmp_page;

		offset = address_level_offset(addr, level);
		pte = &parent[offset];
		if (level == 1)
			break;

		if (!dma_pte_present(*pte)) {
			tmp_page = alloc_pgtable_page();
			if (!tmp_page) {
				spin_unlock_irqrestore(&domain->mapping_lock,
					flags);
				return NULL;
			}
			__iommu_flush_cache(domain->iommu, tmp_page,
					PAGE_SIZE_4K);
			dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
			/*
			 * high level table always sets r/w, last level page
			 * table controls read/write
			 */
			dma_set_pte_readable(*pte);
			dma_set_pte_writable(*pte);
			__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		}
		parent = phys_to_virt(dma_pte_addr(*pte));
		level--;
	}

	spin_unlock_irqrestore(&domain->mapping_lock, flags);
	return pte;
}
/* return address's pte at specific level */
static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
		int level)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = address_level_offset(addr, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(*pte))
			break;
		parent = phys_to_virt(dma_pte_addr(*pte));
		total--;
	}
	return NULL;
}
/* clear one page's page table */
static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
{
	struct dma_pte *pte = NULL;

	/* get last level pte */
	pte = dma_addr_level_pte(domain, addr, 1);

	if (pte) {
		dma_clear_pte(*pte);
		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
	}
}
/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;
	/* in case it's partial page */
	start = PAGE_ALIGN_4K(start);
	end &= PAGE_MASK_4K;

	/* we don't need lock here, nobody else touches the iova range */
	while (start < end) {
		dma_pte_clear_one(domain, start);
		start += PAGE_SIZE_4K;
	}
}
/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
	u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	u64 tmp;

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;

	/* we don't need lock here, nobody else touches the iova range */
	level = 2;
	while (level <= total) {
		tmp = align_to_level(start, level);
		if (tmp >= end || (tmp + level_size(level) > end))
			return;

		while (tmp < end) {
			pte = dma_addr_level_pte(domain, tmp, level);
			if (pte) {
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(*pte)));
				dma_clear_pte(*pte);
				__iommu_flush_cache(domain->iommu,
						pte, sizeof(*pte));
			}
			tmp += level_size(level);
		}
		level++;
	}
	/* free pgd */
	if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page();
	if (!root)
		return -ENOMEM;

	__iommu_flush_cache(iommu, root, PAGE_SIZE_4K);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 cmd, sts;
	unsigned long flag;

	addr = iommu->root_entry;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	cmd = iommu->gcmd | DMA_GCMD_SRTP;
	writel(cmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_RTPS), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!cap_rwbf(iommu->cap))
		return;
	val = iommu->gcmd | DMA_GCMD_WBF;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(val, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
			readl, (!(val & DMA_GSTS_WBFS)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static int __iommu_flush_context(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, u64 type,
	int non_present_entry_flush)
{
	u64 val = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entries we do nothing and if hardware caches non-present
	 * entries, we flush entries of domain 0 (the domain id is used to cache
	 * any non-present entries)
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* flushing a context entry will implicitly flush the write buffer */
	return 0;
}
/* return value determines if we need a write buffer flush */
static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int size_order, u64 type,
	int non_present_entry_flush)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entries we do nothing and if hardware caches non-present
	 * entries, we flush entries of domain 0 (the domain id is used to cache
	 * any non-present entries)
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
	/* flushing an IOTLB entry will implicitly flush the write buffer */
	return 0;
}
static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int pages, int non_present_entry_flush)
{
	unsigned int mask;

	BUG_ON(addr & (~PAGE_MASK_4K));
	BUG_ON(pages == 0);

	/* Fallback to domain selective flush if no PSI support */
	if (!cap_pgsel_inv(iommu->cap))
		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH,
						non_present_entry_flush);

	/*
	 * PSI requires page size to be 2 ^ x, and the base address is
	 * naturally aligned to the size
	 */
	mask = ilog2(__roundup_pow_of_two(pages));
	/* Fallback to domain selective flush if size is too big */
	if (mask > cap_max_amask_val(iommu->cap))
		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
			DMA_TLB_DSI_FLUSH, non_present_entry_flush);

	return iommu->flush.flush_iotlb(iommu, did, addr, mask,
					DMA_TLB_PSI_FLUSH,
					non_present_entry_flush);
}
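/*
 * Example of the PSI mask computation above: flushing 5 pages gives
 * __roundup_pow_of_two(5) = 8 and mask = ilog2(8) = 3, i.e. the
 * hardware invalidates a naturally aligned 8-page (32K) region that
 * covers the requested range.
 */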
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	writel(iommu->gcmd | DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_TES), sts);

	iommu->gcmd |= DMA_GCMD_TE;
	spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}

static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(sts & DMA_GSTS_TES)), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
/* iommu interrupt handling. Most of it is MSI-like. */

static const char *fault_reason_strings[] =
{
	"Software",
	"Present bit in root entry is clear",
	"Present bit in context entry is clear",
	"Invalid context entry",
	"Access beyond MGAW",
	"PTE Write access is not set",
	"PTE Read access is not set",
	"Next page table ptr is invalid",
	"Root table address invalid",
	"Context table ptr is invalid",
	"non-zero reserved fields in RTP",
	"non-zero reserved fields in CTP",
	"non-zero reserved fields in PTE",
};
#define MAX_FAULT_REASON_IDX	(ARRAY_SIZE(fault_reason_strings) - 1)

const char *dmar_get_fault_reason(u8 fault_reason)
{
	if (fault_reason > MAX_FAULT_REASON_IDX)
		return "Unknown";
	else
		return fault_reason_strings[fault_reason];
}
void dmar_msi_unmask(unsigned int irq)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	/* unmask it */
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(0, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the post write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_mask(unsigned int irq)
{
	unsigned long flag;
	struct intel_iommu *iommu = get_irq_data(irq);

	/* mask it */
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the post write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_write(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_read(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
		u8 fault_reason, u16 source_id, u64 addr)
{
	const char *reason;

	reason = dmar_get_fault_reason(fault_reason);

	printk(KERN_ERR
		"DMAR:[%s] Request device [%02x:%02x.%d] "
		"fault addr %llx \n"
		"DMAR:[fault reason %02d] %s\n",
		(type ? "DMA Read" : "DMA Write"),
		(source_id >> 8), PCI_SLOT(source_id & 0xFF),
		PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
	return 0;
}
#define PRIMARY_FAULT_REG_LEN (16)
static irqreturn_t iommu_page_fault(int irq, void *dev_id)
{
	struct intel_iommu *iommu = dev_id;
	int reg, fault_index;
	u32 fault_status;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);

	/* TBD: ignore advanced fault log currently */
	if (!(fault_status & DMA_FSTS_PPF))
		goto clear_overflow;

	fault_index = dma_fsts_fault_record_index(fault_status);
	reg = cap_fault_reg_offset(iommu->cap);
	while (1) {
		u8 fault_reason;
		u16 source_id;
		u64 guest_addr;
		int type;
		u32 data;

		/* highest 32 bits */
		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 12);
		if (!(data & DMA_FRCD_F))
			break;

		fault_reason = dma_frcd_fault_reason(data);
		type = dma_frcd_type(data);

		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 8);
		source_id = dma_frcd_source_id(data);

		guest_addr = dmar_readq(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN);
		guest_addr = dma_frcd_page_addr(guest_addr);
		/* clear the fault */
		writel(DMA_FRCD_F, iommu->reg + reg +
			fault_index * PRIMARY_FAULT_REG_LEN + 12);

		spin_unlock_irqrestore(&iommu->register_lock, flag);

		iommu_page_fault_do_one(iommu, type, fault_reason,
				source_id, guest_addr);

		fault_index++;
		if (fault_index > cap_num_fault_regs(iommu->cap))
			fault_index = 0;
		spin_lock_irqsave(&iommu->register_lock, flag);
	}
clear_overflow:
	/* clear primary fault overflow */
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
	if (fault_status & DMA_FSTS_PFO)
		writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return IRQ_HANDLED;
}
int dmar_set_interrupt(struct intel_iommu *iommu)
{
	int irq, ret;

	irq = create_irq();
	if (!irq) {
		printk(KERN_ERR "IOMMU: no free vectors\n");
		return -EINVAL;
	}

	set_irq_data(irq, iommu);
	iommu->irq = irq;

	ret = arch_setup_dmar_msi(irq);
	if (ret) {
		set_irq_data(irq, NULL);
		iommu->irq = 0;
		destroy_irq(irq);
		return 0;
	}

	/* Force fault register is cleared */
	iommu_page_fault(irq, iommu);

	ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
	if (ret)
		printk(KERN_ERR "IOMMU: can't request irq\n");
	return ret;
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	unsigned long ndomains;
	unsigned long nlongs;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("Number of Domains supported <%ld>\n", ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	/* TBD: there might be 64K domains,
	 * consider other allocation for future chip
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		printk(KERN_ERR "Allocating domain id array failed\n");
		return -ENOMEM;
	}
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
			GFP_KERNEL);
	if (!iommu->domains) {
		printk(KERN_ERR "Allocating domain array failed\n");
		kfree(iommu->domain_ids);
		return -ENOMEM;
	}

	spin_lock_init(&iommu->lock);

	/*
	 * if Caching mode is set, then invalid translations are tagged
	 * with domain id 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);
	return 0;
}
static void domain_exit(struct dmar_domain *domain);

void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;

	i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
	for (; i < cap_ndoms(iommu->cap); ) {
		domain = iommu->domains[i];
		clear_bit(i, iommu->domain_ids);
		domain_exit(domain);
		i = find_next_bit(iommu->domain_ids,
			cap_ndoms(iommu->cap), i+1);
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		set_irq_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	/* free context mapping */
	free_context_table(iommu);
}
static struct dmar_domain *iommu_alloc_domain(struct intel_iommu *iommu)
{
	unsigned long num;
	unsigned long ndomains;
	struct dmar_domain *domain;
	unsigned long flags;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		free_domain_mem(domain);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return NULL;
	}

	set_bit(num, iommu->domain_ids);
	domain->id = num;
	domain->iommu = iommu;
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return domain;
}
static void iommu_free_domain(struct dmar_domain *domain)
{
	unsigned long flags;

	spin_lock_irqsave(&domain->iommu->lock, flags);
	clear_bit(domain->id, domain->iommu->domain_ids);
	spin_unlock_irqrestore(&domain->iommu->lock, flags);
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_alloc_key;
static struct lock_class_key reserved_rbtree_key;

static void dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;
	u64 addr, size;

	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);

	lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
		&reserved_alloc_key);
	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova)
		printk(KERN_ERR "Reserve IOAPIC range failed\n");

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			addr = r->start;
			addr &= PAGE_MASK_4K;
			size = r->end - addr;
			size = PAGE_ALIGN_4K(size);
			iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
				IOVA_PFN(size + addr) - 1);
			if (!iova)
				printk(KERN_ERR "Reserve iova failed\n");
		}
	}
}
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
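/*
 * Example: a guest width of 48 bits gives (48 - 12) % 9 == 0, so the
 * adjusted width stays 48; a width of 40 gives r = 1 and is rounded up
 * to 48, the next width a 9-bit-stride page table can actually decode.
 */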
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->mapping_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain->iommu;
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
	return 0;
}
static void domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);
	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~PAGE_MASK_4K);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_domain(domain);
	free_domain_mem(domain);
}
static int domain_context_mapping_one(struct dmar_domain *domain,
		u8 bus, u8 devfn)
{
	struct context_entry *context;
	struct intel_iommu *iommu = domain->iommu;
	unsigned long flags;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
	BUG_ON(!domain->pgd);
	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(*context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	context_set_domain_id(*context, domain->id);
	context_set_address_width(*context, domain->agaw);
	context_set_address_root(*context, virt_to_phys(domain->pgd));
	context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(*context);
	context_set_present(*context);
	__iommu_flush_cache(iommu, context, sizeof(*context));

	/* it's a non-present to present mapping */
	if (iommu->flush.flush_context(iommu, domain->id,
		(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
		DMA_CCMD_DEVICE_INVL, 1))
		iommu_flush_write_buffer(iommu);
	else
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);

	spin_unlock_irqrestore(&iommu->lock, flags);
	return 0;
}
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = domain_context_mapping_one(domain, pdev->bus->number,
		pdev->devfn);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain, parent->bus->number,
			parent->devfn);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->subordinate->number, 0);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->bus->number, tmp->devfn);
}
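/*
 * Behind a PCIe-to-PCI bridge, transactions from secondary-bus devices
 * can carry the bridge's (secondary bus, devfn 0) requester ID rather
 * than the device's own, which is why domain_context_mapping() above
 * also programs context entries for every bridge on the upstream path,
 * not just for the device itself.
 */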
static int domain_context_mapped(struct dmar_domain *domain,
	struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = device_context_mapped(domain->iommu,
		pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(domain->iommu, parent->bus->number,
			parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (tmp->is_pcie)
		return device_context_mapped(domain->iommu,
			tmp->subordinate->number, 0);
	else
		return device_context_mapped(domain->iommu,
			tmp->bus->number, tmp->devfn);
}
static int
domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
			u64 hpa, size_t size, int prot)
{
	u64 start_pfn, end_pfn;
	struct dma_pte *pte;
	int index;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;
	iova &= PAGE_MASK_4K;
	start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
	end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
	index = 0;
	while (start_pfn < end_pfn) {
		pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
		if (!pte)
			return -ENOMEM;
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		BUG_ON(dma_pte_addr(*pte));
		dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
		dma_set_pte_prot(*pte, prot);
		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		start_pfn++;
		index++;
	}
	return 0;
}
static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
{
	clear_context_table(domain->iommu, bus, devfn);
	domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL, 0);
	domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH, 0);
}
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		detach_domain_for_dev(info->domain, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
/*
 * find_domain
 * Note: struct pci_dev->dev.archdata.iommu stores the domain info
 */
static struct dmar_domain *
find_domain(struct pci_dev *pdev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = pdev->dev.archdata.iommu;
	if (info)
		return info->domain;
	return NULL;
}
/* domain is initialized */
static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
{
	struct dmar_domain *domain, *found = NULL;
	struct intel_iommu *iommu;
	struct dmar_drhd_unit *drhd;
	struct device_domain_info *info, *tmp;
	struct pci_dev *dev_tmp;
	unsigned long flags;
	int bus = 0, devfn = 0;

	domain = find_domain(pdev);
	if (domain)
		return domain;

	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
	if (dev_tmp) {
		if (dev_tmp->is_pcie) {
			bus = dev_tmp->subordinate->number;
			devfn = 0;
		} else {
			bus = dev_tmp->bus->number;
			devfn = dev_tmp->devfn;
		}
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(info, &device_domain_list, global) {
			if (info->bus == bus && info->devfn == devfn) {
				found = info->domain;
				break;
			}
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		/* pcie-pci bridge already has a domain, use it */
		if (found) {
			domain = found;
			goto found_domain;
		}
	}

	/* Allocate new domain for the device */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
			pci_name(pdev));
		return NULL;
	}
	iommu = drhd->iommu;

	domain = iommu_alloc_domain(iommu);
	if (!domain)
		goto error;

	if (domain_init(domain, gaw)) {
		domain_exit(domain);
		goto error;
	}

	/* register pcie-to-pci device */
	if (dev_tmp) {
		info = alloc_devinfo_mem();
		if (!info) {
			domain_exit(domain);
			goto error;
		}
		info->bus = bus;
		info->devfn = devfn;
		info->dev = NULL;
		info->domain = domain;
		/* This domain is shared by devices under p2p bridge */
		domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;

		/* pcie-to-pci bridge already has a domain, use it */
		found = NULL;
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(tmp, &device_domain_list, global) {
			if (tmp->bus == bus && tmp->devfn == devfn) {
				found = tmp->domain;
				break;
			}
		}
		if (found) {
			free_devinfo_mem(info);
			domain_exit(domain);
			domain = found;
		} else {
			list_add(&info->link, &domain->devices);
			list_add(&info->global, &device_domain_list);
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
	}

found_domain:
	info = alloc_devinfo_mem();
	if (!info)
		goto error;
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;
	spin_lock_irqsave(&device_domain_lock, flags);
	/* somebody is fast */
	found = find_domain(pdev);
	if (found != NULL) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		if (found != domain) {
			domain_exit(domain);
			domain = found;
		}
		free_devinfo_mem(info);
		return domain;
	}
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);
	return domain;
error:
	/* recheck it here, maybe others set it */
	return find_domain(pdev);
}
static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
{
	struct dmar_domain *domain;
	unsigned long size;
	u64 base;
	int ret;

	printk(KERN_INFO
		"IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		pci_name(pdev), start, end);
	/* page table init */
	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* The address might not be aligned */
	base = start & PAGE_MASK_4K;
	size = end - base;
	size = PAGE_ALIGN_4K(size);
	if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
			IOVA_PFN(base + size) - 1)) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		ret = -ENOMEM;
		goto error;
	}

	pr_debug("Mapping reserved region %lx@%llx for %s\n",
		size, base, pci_name(pdev));
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, base, base + size);

	ret = domain_page_mapping(domain, base, base, size,
		DMA_PTE_READ|DMA_PTE_WRITE);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev);
	if (!ret)
		return 0;
error:
	domain_exit(domain);
	return ret;
}
static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
	struct pci_dev *pdev)
{
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(pdev, rmrr->base_address,
		rmrr->end_address + 1);
}
#ifdef CONFIG_DMAR_GFX_WA
struct iommu_prepare_data {
	struct pci_dev *pdev;
	int ret;
};

static int __init iommu_prepare_work_fn(unsigned long start_pfn,
					unsigned long end_pfn, void *datax)
{
	struct iommu_prepare_data *data;

	data = (struct iommu_prepare_data *)datax;

	data->ret = iommu_prepare_identity_map(data->pdev,
			start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
	return data->ret;
}

static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
{
	int nid;
	struct iommu_prepare_data data;

	data.pdev = pdev;
	data.ret = 0;

	for_each_online_node(nid) {
		work_with_active_regions(nid, iommu_prepare_work_fn, &data);
		if (data.ret)
			return data.ret;
	}
	return data.ret;
}

static void __init iommu_prepare_gfx_mapping(void)
{
	struct pci_dev *pdev = NULL;
	int ret;

	for_each_pci_dev(pdev) {
		if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
				!IS_GFX_DEVICE(pdev))
			continue;
		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
			pci_name(pdev));
		ret = iommu_prepare_with_active_regions(pdev);
		if (ret)
			printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
	}
}
#else /* !CONFIG_DMAR_GFX_WA */
static inline void iommu_prepare_gfx_mapping(void)
{
	return;
}
#endif
#ifdef CONFIG_DMAR_FLOPPY_WA
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");
}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *pdev;
	struct intel_iommu *iommu;
	int i, ret, unit = 0;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		g_num_of_iommus++;
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path; all other accesses are
		 * read only
		 */
	}

	deferred_flush = kzalloc(g_num_of_iommus *
		sizeof(struct deferred_flush_tables), GFP_KERNEL);
	if (!deferred_flush) {
		ret = -ENOMEM;
		goto error;
	}

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;

		ret = iommu_init_domains(iommu);
		if (ret)
			goto error;

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs. Need to split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret) {
			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
			goto error;
		}
	}

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		if (dmar_enable_qi(iommu)) {
			/*
			 * Queued Invalidate not enabled, use Register Based
			 * Invalidate
			 */
			iommu->flush.flush_context = __iommu_flush_context;
			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
			printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
			       "invalidation\n", drhd->reg_base_addr);
		} else {
			iommu->flush.flush_context = qi_flush_context;
			iommu->flush.flush_iotlb = qi_flush_iotlb;
			printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
			       "invalidation\n", drhd->reg_base_addr);
		}
	}

	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *    endfor
	 * endfor
	 */
	for_each_rmrr_units(rmrr) {
		for (i = 0; i < rmrr->devices_cnt; i++) {
			pdev = rmrr->devices[i];
			/* some BIOSes list non-existent devices in DMAR table */
			if (!pdev)
				continue;
			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
			if (ret)
				printk(KERN_ERR
				 "IOMMU: mapping reserved region failed\n");
		}
	}

	iommu_prepare_gfx_mapping();

	iommu_prepare_isa();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		sprintf(iommu->name, "dmar%d", unit++);

		iommu_flush_write_buffer(iommu);

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto error;

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
					   0);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
					 0);
		iommu_disable_protect_mem_regions(iommu);

		ret = iommu_enable_translation(iommu);
		if (ret)
			goto error;
	}

	return 0;
error:
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		free_iommu(iommu);
	}
	return ret;
}
static inline u64 aligned_size(u64 host_addr, size_t size)
{
	u64 addr;
	addr = (host_addr & (~PAGE_MASK_4K)) + size;
	return PAGE_ALIGN_4K(addr);
}
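/*
 * Example: a 0x1000-byte buffer starting at host address 0x1234 spans
 * two 4K pages, and aligned_size(0x1234, 0x1000) = PAGE_ALIGN_4K(0x234
 * + 0x1000) = 0x2000 accounts for both.
 */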
static struct iova *
iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
{
	struct iova *piova;

	/* Make sure it's in range */
	end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
	if (!size || (IOVA_START_ADDR + size > end))
		return NULL;
	piova = alloc_iova(&domain->iovad,
			size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
	return piova;
}
static struct iova *
__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
		   size_t size)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iova *iova = NULL;

	if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
		iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
	} else {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_32BIT_MASK and if that fails then try allocating
		 * from higher range
		 */
		iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
		if (!iova)
			iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
	}

	if (!iova) {
		printk(KERN_ERR "Allocating iova for %s failed", pci_name(pdev));
		return NULL;
	}

	return iova;
}
static struct dmar_domain *
get_valid_domain_for_dev(struct pci_dev *pdev)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev,
			DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain) {
		printk(KERN_ERR
			"Allocating domain for %s failed", pci_name(pdev));
		return NULL;
	}

	/* make sure context mapping is ok */
	if (unlikely(!domain_context_mapped(domain, pdev))) {
		ret = domain_context_mapping(domain, pdev);
		if (ret) {
			printk(KERN_ERR
				"Domain context map for %s failed",
				pci_name(pdev));
			return NULL;
		}
	}

	return domain;
}
static dma_addr_t
intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;

	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	size = aligned_size((u64)paddr, size);

	iova = __intel_alloc_iova(hwdev, domain, size);
	if (!iova)
		goto error;

	start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be a partial page, we should map the
	 * whole page.  Note: if two parts of one page are separately mapped,
	 * we might have two guest_addrs mapping to the same host paddr, but
	 * this is not a big problem
	 */
	ret = domain_page_mapping(domain, start_paddr,
		((u64)paddr) & PAGE_MASK_4K, size, prot);
	if (ret)
		goto error;

	pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
		pci_name(pdev), size, (u64)paddr,
		size, (u64)start_paddr, dir);

	/* it's a non-present to present mapping */
	ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
			start_paddr, size >> PAGE_SHIFT_4K, 1);
	if (ret)
		iommu_flush_write_buffer(domain->iommu);

	return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (u64)paddr, dir);
	return 0;
}
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (deferred_flush[i].next) {
			struct intel_iommu *iommu =
				deferred_flush[i].domain[0]->iommu;

			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
						 DMA_TLB_GLOBAL_FLUSH, 0);
			for (j = 0; j < deferred_flush[i].next; j++) {
				__free_iova(&deferred_flush[i].domain[j]->iovad,
						deferred_flush[i].iova[j]);
			}
			deferred_flush[i].next = 0;
		}
	}

	list_size = 0;
}

static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu_id = dom->iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
	size_t size, int dir)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;

	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return;
	domain = find_domain(pdev);
	BUG_ON(!domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (!iova)
		return;

	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	size = aligned_size((u64)dev_addr, size);

	pr_debug("Device %s unmapping: %lx@%llx\n",
		pci_name(pdev), size, (u64)start_addr);

	/* clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
	if (intel_iommu_strict) {
		if (iommu_flush_iotlb_psi(domain->iommu,
			domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
			iommu_flush_write_buffer(domain->iommu);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
		       dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN_4K(size);
	order = get_order(size);
	flags &= ~(GFP_DMA | GFP_DMA32);

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size,
				       DMA_BIDIRECTIONAL);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}

static void intel_free_coherent(struct device *hwdev, size_t size,
	void *vaddr, dma_addr_t dma_handle)
{
	int order;

	size = PAGE_ALIGN_4K(size);
	order = get_order(size);

	intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
	free_pages((unsigned long)vaddr, order);
}
#define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
	int nelems, int dir)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;
	size_t size = 0;
	void *addr;
	struct scatterlist *sg;

	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return;

	domain = find_domain(pdev);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (!iova)
		return;
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		size += aligned_size((u64)addr, sg->length);
	}

	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;

	/* clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);

	if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
			size >> PAGE_SHIFT_4K, 0))
		iommu_flush_write_buffer(domain->iommu);

	/* free iova */
	__free_iova(&domain->iovad, iova);
}
static int intel_nontranslate_map_sg(struct device *hwdev,
	struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
		sg->dma_length = sg->length;
	}
	return nelems;
}
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
				int nelems, int dir)
{
	void *addr;
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	size_t offset = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_addr;

	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size += aligned_size((u64)addr, sg->length);
	}

	iova = __intel_alloc_iova(hwdev, domain, size);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(domain->iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
	offset = 0;
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size = aligned_size((u64)addr, sg->length);
		ret = domain_page_mapping(domain, start_addr + offset,
			((u64)addr) & PAGE_MASK_4K,
			size, prot);
		if (ret) {
			/* clear the page */
			dma_pte_clear_range(domain, start_addr,
				  start_addr + offset);
			/* free page tables */
			dma_pte_free_pagetable(domain, start_addr,
				  start_addr + offset);
			/* free iova */
			__free_iova(&domain->iovad, iova);
			return 0;
		}
		sg->dma_address = start_addr + offset +
				((u64)addr & (~PAGE_MASK_4K));
		sg->dma_length = sg->length;
		offset += size;
	}

	/* it's a non-present to present mapping */
	if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
			start_addr, offset >> PAGE_SHIFT_4K, 1))
		iommu_flush_write_buffer(domain->iommu);
	return nelems;
}
static struct dma_mapping_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_single = intel_map_single,
	.unmap_single = intel_unmap_single,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
};
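/*
 * These entry points are installed as the architecture's global dma_ops
 * in intel_iommu_init() below, so every driver's DMA-API call is routed
 * through the IOMMU page tables once translation is enabled.
 */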
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					sizeof(struct dmar_domain), 0,
					SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
					sizeof(struct device_domain_info), 0,
					SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_iova_cache_init(void)
{
	int ret = 0;

	iommu_iova_cache = kmem_cache_create("iommu_iova",
					sizeof(struct iova), 0,
					SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		ret = -ENOMEM;
	}

	return ret;
}
static int __init iommu_init_mempool(void)
{
	int ret;
	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
int __init intel_iommu_init(void)
{
	int ret = 0;

	if (dmar_table_init())
		return -ENODEV;

	if (dmar_dev_scope_init())
		return -ENODEV;

	/*
	 * Check the need for DMA-remapping initialization now.
	 * Above initialization will also be used by Interrupt-remapping.
	 */
	if (no_iommu || swiotlb || dmar_disabled)
		return -ENODEV;

	iommu_init_mempool();
	dmar_init_reserved_ranges();

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
	force_iommu = 1;
	dma_ops = &intel_dma_ops;
	return 0;
}
void intel_iommu_domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~PAGE_MASK_4K);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_domain(domain);
	free_domain_mem(domain);
}
EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_domain *domain;
	struct intel_iommu *iommu;

	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
		return NULL;
	}

	iommu = drhd->iommu;
	if (!iommu) {
		printk(KERN_ERR
			"intel_iommu_domain_alloc: iommu == NULL\n");
		return NULL;
	}
	domain = iommu_alloc_domain(iommu);
	if (!domain) {
		printk(KERN_ERR
			"intel_iommu_domain_alloc: domain == NULL\n");
		return NULL;
	}
	if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
			"intel_iommu_domain_alloc: domain_init() failed\n");
		intel_iommu_domain_exit(domain);
		return NULL;
	}
	return domain;
}
EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
int intel_iommu_context_mapping(
	struct dmar_domain *domain, struct pci_dev *pdev)
{
	int rc;
	rc = domain_context_mapping(domain, pdev);
	return rc;
}
EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);

int intel_iommu_page_mapping(
	struct dmar_domain *domain, dma_addr_t iova,
	u64 hpa, size_t size, int prot)
{
	int rc;
	rc = domain_page_mapping(domain, iova, hpa, size, prot);
	return rc;
}
EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
{
	detach_domain_for_dev(domain, bus, devfn);
}
EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);

struct dmar_domain *
intel_iommu_find_domain(struct pci_dev *pdev)
{
	return find_domain(pdev);
}
EXPORT_SYMBOL_GPL(intel_iommu_find_domain);

int intel_iommu_found(void)
{
	return g_num_of_iommus;
}
EXPORT_SYMBOL_GPL(intel_iommu_found);
u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
{
	struct dma_pte *pte;
	u64 pfn;

	pfn = 0;
	pte = addr_to_dma_pte(domain, iova);

	if (pte)
		pfn = dma_pte_addr(*pte);

	return pfn >> PAGE_SHIFT_4K;
}
EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);