linux-2.6-omap-h63xx.git: drivers/pci/intel-iommu.c
dmar: Use queued invalidation interface for IOTLB and context invalidation
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
45
46 #define IOAPIC_RANGE_START      (0xfee00000)
47 #define IOAPIC_RANGE_END        (0xfeefffff)
48 #define IOVA_START_ADDR         (0x1000)
49
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
51
52 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
53
54
55 static void flush_unmaps_timeout(unsigned long data);
56
57 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
58
59 #define HIGH_WATER_MARK 250
60 struct deferred_flush_tables {
61         int next;
62         struct iova *iova[HIGH_WATER_MARK];
63         struct dmar_domain *domain[HIGH_WATER_MARK];
64 };
65
66 static struct deferred_flush_tables *deferred_flush;
67
68 /* bitmap for indexing intel_iommus */
69 static int g_num_of_iommus;
70
71 static DEFINE_SPINLOCK(async_umap_flush_lock);
72 static LIST_HEAD(unmaps_to_do);
73
74 static int timer_on;
75 static long list_size;
76
77 static void domain_remove_dev_info(struct dmar_domain *domain);
78
79 int dmar_disabled;
80 static int __initdata dmar_map_gfx = 1;
81 static int dmar_forcedac;
82 static int intel_iommu_strict;
83
84 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
85 static DEFINE_SPINLOCK(device_domain_lock);
86 static LIST_HEAD(device_domain_list);
87
88 static int __init intel_iommu_setup(char *str)
89 {
90         if (!str)
91                 return -EINVAL;
92         while (*str) {
93                 if (!strncmp(str, "off", 3)) {
94                         dmar_disabled = 1;
95                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
96                 } else if (!strncmp(str, "igfx_off", 8)) {
97                         dmar_map_gfx = 0;
98                         printk(KERN_INFO
99                                 "Intel-IOMMU: disable GFX device mapping\n");
100                 } else if (!strncmp(str, "forcedac", 8)) {
101                         printk(KERN_INFO
102                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
103                         dmar_forcedac = 1;
104                 } else if (!strncmp(str, "strict", 6)) {
105                         printk(KERN_INFO
106                                 "Intel-IOMMU: disable batched IOTLB flush\n");
107                         intel_iommu_strict = 1;
108                 }
109
110                 str += strcspn(str, ",");
111                 while (*str == ',')
112                         str++;
113         }
114         return 0;
115 }
116 __setup("intel_iommu=", intel_iommu_setup);
117
118 static struct kmem_cache *iommu_domain_cache;
119 static struct kmem_cache *iommu_devinfo_cache;
120 static struct kmem_cache *iommu_iova_cache;
121
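/*
 * The allocation helpers below temporarily set PF_MEMALLOC on the current
 * task so that the GFP_ATOMIC allocations can dip into memory reserves;
 * the caller's original PF_MEMALLOC state is restored afterwards.
 */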
122 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
123 {
124         unsigned int flags;
125         void *vaddr;
126
127         /* trying to avoid low memory issues */
128         flags = current->flags & PF_MEMALLOC;
129         current->flags |= PF_MEMALLOC;
130         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
131         current->flags &= (~PF_MEMALLOC | flags);
132         return vaddr;
133 }
134
135
136 static inline void *alloc_pgtable_page(void)
137 {
138         unsigned int flags;
139         void *vaddr;
140
141         /* trying to avoid low memory issues */
142         flags = current->flags & PF_MEMALLOC;
143         current->flags |= PF_MEMALLOC;
144         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
145         current->flags &= (~PF_MEMALLOC | flags);
146         return vaddr;
147 }
148
149 static inline void free_pgtable_page(void *vaddr)
150 {
151         free_page((unsigned long)vaddr);
152 }
153
154 static inline void *alloc_domain_mem(void)
155 {
156         return iommu_kmem_cache_alloc(iommu_domain_cache);
157 }
158
159 static void free_domain_mem(void *vaddr)
160 {
161         kmem_cache_free(iommu_domain_cache, vaddr);
162 }
163
164 static inline void * alloc_devinfo_mem(void)
165 {
166         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
167 }
168
169 static inline void free_devinfo_mem(void *vaddr)
170 {
171         kmem_cache_free(iommu_devinfo_cache, vaddr);
172 }
173
174 struct iova *alloc_iova_mem(void)
175 {
176         return iommu_kmem_cache_alloc(iommu_iova_cache);
177 }
178
179 void free_iova_mem(struct iova *iova)
180 {
181         kmem_cache_free(iommu_iova_cache, iova);
182 }
183
184 /* Gets context entry for a given bus and devfn */
185 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
186                 u8 bus, u8 devfn)
187 {
188         struct root_entry *root;
189         struct context_entry *context;
190         unsigned long phy_addr;
191         unsigned long flags;
192
193         spin_lock_irqsave(&iommu->lock, flags);
194         root = &iommu->root_entry[bus];
195         context = get_context_addr_from_root(root);
196         if (!context) {
197                 context = (struct context_entry *)alloc_pgtable_page();
198                 if (!context) {
199                         spin_unlock_irqrestore(&iommu->lock, flags);
200                         return NULL;
201                 }
202                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
203                 phy_addr = virt_to_phys((void *)context);
204                 set_root_value(root, phy_addr);
205                 set_root_present(root);
206                 __iommu_flush_cache(iommu, root, sizeof(*root));
207         }
208         spin_unlock_irqrestore(&iommu->lock, flags);
209         return &context[devfn];
210 }
211
212 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
213 {
214         struct root_entry *root;
215         struct context_entry *context;
216         int ret;
217         unsigned long flags;
218
219         spin_lock_irqsave(&iommu->lock, flags);
220         root = &iommu->root_entry[bus];
221         context = get_context_addr_from_root(root);
222         if (!context) {
223                 ret = 0;
224                 goto out;
225         }
226         ret = context_present(context[devfn]);
227 out:
228         spin_unlock_irqrestore(&iommu->lock, flags);
229         return ret;
230 }
231
232 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234         struct root_entry *root;
235         struct context_entry *context;
236         unsigned long flags;
237
238         spin_lock_irqsave(&iommu->lock, flags);
239         root = &iommu->root_entry[bus];
240         context = get_context_addr_from_root(root);
241         if (context) {
242                 context_clear_entry(context[devfn]);
243                 __iommu_flush_cache(iommu, &context[devfn], \
244                         sizeof(*context));
245         }
246         spin_unlock_irqrestore(&iommu->lock, flags);
247 }
248
249 static void free_context_table(struct intel_iommu *iommu)
250 {
251         struct root_entry *root;
252         int i;
253         unsigned long flags;
254         struct context_entry *context;
255
256         spin_lock_irqsave(&iommu->lock, flags);
257         if (!iommu->root_entry) {
258                 goto out;
259         }
260         for (i = 0; i < ROOT_ENTRY_NR; i++) {
261                 root = &iommu->root_entry[i];
262                 context = get_context_addr_from_root(root);
263                 if (context)
264                         free_pgtable_page(context);
265         }
266         free_pgtable_page(iommu->root_entry);
267         iommu->root_entry = NULL;
268 out:
269         spin_unlock_irqrestore(&iommu->lock, flags);
270 }
271
272 /* page table handling */
273 #define LEVEL_STRIDE            (9)
274 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
275
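/*
 * AGAW (adjusted guest address width) encodes the page-table geometry:
 * each level resolves LEVEL_STRIDE (9) bits on top of the 12-bit page
 * offset, so agaw 0 means 2 levels / 30 address bits, agaw 1 means
 * 3 levels / 39 bits, agaw 2 means 4 levels / 48 bits, and so on.
 */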
276 static inline int agaw_to_level(int agaw)
277 {
278         return agaw + 2;
279 }
280
281 static inline int agaw_to_width(int agaw)
282 {
283         return 30 + agaw * LEVEL_STRIDE;
284
285 }
286
287 static inline int width_to_agaw(int width)
288 {
289         return (width - 30) / LEVEL_STRIDE;
290 }
291
292 static inline unsigned int level_to_offset_bits(int level)
293 {
294         return (12 + (level - 1) * LEVEL_STRIDE);
295 }
296
297 static inline int address_level_offset(u64 addr, int level)
298 {
299         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
300 }
301
302 static inline u64 level_mask(int level)
303 {
304         return ((u64)-1 << level_to_offset_bits(level));
305 }
306
307 static inline u64 level_size(int level)
308 {
309         return ((u64)1 << level_to_offset_bits(level));
310 }
311
312 static inline u64 align_to_level(u64 addr, int level)
313 {
314         return ((addr + level_size(level) - 1) & level_mask(level));
315 }
316
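/*
 * Walk the domain's page table from the top level down to level 1 for the
 * given address, allocating and cache-flushing intermediate page-table
 * pages as needed, and return a pointer to the last-level PTE (or NULL if
 * a page-table page could not be allocated).
 */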
317 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
318 {
319         int addr_width = agaw_to_width(domain->agaw);
320         struct dma_pte *parent, *pte = NULL;
321         int level = agaw_to_level(domain->agaw);
322         int offset;
323         unsigned long flags;
324
325         BUG_ON(!domain->pgd);
326
327         addr &= (((u64)1) << addr_width) - 1;
328         parent = domain->pgd;
329
330         spin_lock_irqsave(&domain->mapping_lock, flags);
331         while (level > 0) {
332                 void *tmp_page;
333
334                 offset = address_level_offset(addr, level);
335                 pte = &parent[offset];
336                 if (level == 1)
337                         break;
338
339                 if (!dma_pte_present(*pte)) {
340                         tmp_page = alloc_pgtable_page();
341
342                         if (!tmp_page) {
343                                 spin_unlock_irqrestore(&domain->mapping_lock,
344                                         flags);
345                                 return NULL;
346                         }
347                         __iommu_flush_cache(domain->iommu, tmp_page,
348                                         PAGE_SIZE_4K);
349                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
350                         /*
351                          * high level table always sets r/w, last level page
352                          * table control read/write
353                          */
354                         dma_set_pte_readable(*pte);
355                         dma_set_pte_writable(*pte);
356                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
357                 }
358                 parent = phys_to_virt(dma_pte_addr(*pte));
359                 level--;
360         }
361
362         spin_unlock_irqrestore(&domain->mapping_lock, flags);
363         return pte;
364 }
365
366 /* return address's pte at specific level */
367 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
368                 int level)
369 {
370         struct dma_pte *parent, *pte = NULL;
371         int total = agaw_to_level(domain->agaw);
372         int offset;
373
374         parent = domain->pgd;
375         while (level <= total) {
376                 offset = address_level_offset(addr, total);
377                 pte = &parent[offset];
378                 if (level == total)
379                         return pte;
380
381                 if (!dma_pte_present(*pte))
382                         break;
383                 parent = phys_to_virt(dma_pte_addr(*pte));
384                 total--;
385         }
386         return NULL;
387 }
388
389 /* clear one page's page table */
390 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
391 {
392         struct dma_pte *pte = NULL;
393
394         /* get last level pte */
395         pte = dma_addr_level_pte(domain, addr, 1);
396
397         if (pte) {
398                 dma_clear_pte(*pte);
399                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
400         }
401 }
402
403 /* clear last level pte, a tlb flush should be followed */
404 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
405 {
406         int addr_width = agaw_to_width(domain->agaw);
407
408         start &= (((u64)1) << addr_width) - 1;
409         end &= (((u64)1) << addr_width) - 1;
410         /* in case it's partial page */
411         start = PAGE_ALIGN_4K(start);
412         end &= PAGE_MASK_4K;
413
414         /* we don't need lock here, nobody else touches the iova range */
415         while (start < end) {
416                 dma_pte_clear_one(domain, start);
417                 start += PAGE_SIZE_4K;
418         }
419 }
420
421 /* free page table pages. last level pte should already be cleared */
422 static void dma_pte_free_pagetable(struct dmar_domain *domain,
423         u64 start, u64 end)
424 {
425         int addr_width = agaw_to_width(domain->agaw);
426         struct dma_pte *pte;
427         int total = agaw_to_level(domain->agaw);
428         int level;
429         u64 tmp;
430
431         start &= (((u64)1) << addr_width) - 1;
432         end &= (((u64)1) << addr_width) - 1;
433
434         /* we don't need lock here, nobody else touches the iova range */
435         level = 2;
436         while (level <= total) {
437                 tmp = align_to_level(start, level);
438                 if (tmp >= end || (tmp + level_size(level) > end))
439                         return;
440
441                 while (tmp < end) {
442                         pte = dma_addr_level_pte(domain, tmp, level);
443                         if (pte) {
444                                 free_pgtable_page(
445                                         phys_to_virt(dma_pte_addr(*pte)));
446                                 dma_clear_pte(*pte);
447                                 __iommu_flush_cache(domain->iommu,
448                                                 pte, sizeof(*pte));
449                         }
450                         tmp += level_size(level);
451                 }
452                 level++;
453         }
454         /* free pgd */
455         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
456                 free_pgtable_page(domain->pgd);
457                 domain->pgd = NULL;
458         }
459 }
460
461 /* iommu handling */
462 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
463 {
464         struct root_entry *root;
465         unsigned long flags;
466
467         root = (struct root_entry *)alloc_pgtable_page();
468         if (!root)
469                 return -ENOMEM;
470
471         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
472
473         spin_lock_irqsave(&iommu->lock, flags);
474         iommu->root_entry = root;
475         spin_unlock_irqrestore(&iommu->lock, flags);
476
477         return 0;
478 }
479
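/*
 * Program the physical address of the root table into DMAR_RTADDR_REG and
 * issue a "set root table pointer" (SRTP) command through the global
 * command register, then poll the global status register until the
 * hardware reports RTPS.
 */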
480 static void iommu_set_root_entry(struct intel_iommu *iommu)
481 {
482         void *addr;
483         u32 cmd, sts;
484         unsigned long flag;
485
486         addr = iommu->root_entry;
487
488         spin_lock_irqsave(&iommu->register_lock, flag);
489         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
490
491         cmd = iommu->gcmd | DMA_GCMD_SRTP;
492         writel(cmd, iommu->reg + DMAR_GCMD_REG);
493
494         /* Make sure hardware completes it */
495         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
496                 readl, (sts & DMA_GSTS_RTPS), sts);
497
498         spin_unlock_irqrestore(&iommu->register_lock, flag);
499 }
500
501 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
502 {
503         u32 val;
504         unsigned long flag;
505
506         if (!cap_rwbf(iommu->cap))
507                 return;
508         val = iommu->gcmd | DMA_GCMD_WBF;
509
510         spin_lock_irqsave(&iommu->register_lock, flag);
511         writel(val, iommu->reg + DMAR_GCMD_REG);
512
513         /* Make sure hardware completes it */
514         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
515                         readl, (!(val & DMA_GSTS_WBFS)), val);
516
517         spin_unlock_irqrestore(&iommu->register_lock, flag);
518 }
519
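/*
 * Register-based context-cache invalidation: build the command (global,
 * domain- or device-selective) in DMAR_CCMD_REG with the ICC bit set,
 * then spin until the hardware clears ICC to signal completion.
 */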
520 /* return value determines whether we need a write buffer flush */
521 static int __iommu_flush_context(struct intel_iommu *iommu,
522         u16 did, u16 source_id, u8 function_mask, u64 type,
523         int non_present_entry_flush)
524 {
525         u64 val = 0;
526         unsigned long flag;
527
528         /*
529          * In the non-present entry flush case: if the hardware doesn't cache
530          * non-present entries we do nothing, and if it does cache them we
531          * flush the entries of domain 0 (the domain id used to cache any
532          * non-present entries).
533          */
534         if (non_present_entry_flush) {
535                 if (!cap_caching_mode(iommu->cap))
536                         return 1;
537                 else
538                         did = 0;
539         }
540
541         switch (type) {
542         case DMA_CCMD_GLOBAL_INVL:
543                 val = DMA_CCMD_GLOBAL_INVL;
544                 break;
545         case DMA_CCMD_DOMAIN_INVL:
546                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
547                 break;
548         case DMA_CCMD_DEVICE_INVL:
549                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
550                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
551                 break;
552         default:
553                 BUG();
554         }
555         val |= DMA_CCMD_ICC;
556
557         spin_lock_irqsave(&iommu->register_lock, flag);
558         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
559
560         /* Make sure hardware completes it */
561         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
562                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
563
564         spin_unlock_irqrestore(&iommu->register_lock, flag);
565
566         /* flushing the context entry will implicitly flush the write buffer */
567         return 0;
568 }
569
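/*
 * Register-based IOTLB invalidation: the IOTLB registers live at an
 * ECAP-specified offset; for page-selective flushes the address and size
 * order go into the IVA register first, then the command with the IVT bit
 * is written and polled until the hardware clears IVT.
 */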
570 /* return value determines whether we need a write buffer flush */
571 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
572         u64 addr, unsigned int size_order, u64 type,
573         int non_present_entry_flush)
574 {
575         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
576         u64 val = 0, val_iva = 0;
577         unsigned long flag;
578
579         /*
580          * In the non-present entry flush case: if the hardware doesn't cache
581          * non-present entries we do nothing, and if it does cache them we
582          * flush the entries of domain 0 (the domain id used to cache any
583          * non-present entries).
584          */
585         if (non_present_entry_flush) {
586                 if (!cap_caching_mode(iommu->cap))
587                         return 1;
588                 else
589                         did = 0;
590         }
591
592         switch (type) {
593         case DMA_TLB_GLOBAL_FLUSH:
594                 /* global flush doesn't need set IVA_REG */
595                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
596                 break;
597         case DMA_TLB_DSI_FLUSH:
598                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
599                 break;
600         case DMA_TLB_PSI_FLUSH:
601                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
602                 /* Note: always flush non-leaf currently */
603                 val_iva = size_order | addr;
604                 break;
605         default:
606                 BUG();
607         }
608         /* Note: set drain read/write */
609 #if 0
610         /*
611          * This is probably just being extra cautious; it looks like we can
612          * ignore it without any impact.
613          */
614         if (cap_read_drain(iommu->cap))
615                 val |= DMA_TLB_READ_DRAIN;
616 #endif
617         if (cap_write_drain(iommu->cap))
618                 val |= DMA_TLB_WRITE_DRAIN;
619
620         spin_lock_irqsave(&iommu->register_lock, flag);
621         /* Note: Only uses first TLB reg currently */
622         if (val_iva)
623                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
624         dmar_writeq(iommu->reg + tlb_offset + 8, val);
625
626         /* Make sure hardware completes it */
627         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
628                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
629
630         spin_unlock_irqrestore(&iommu->register_lock, flag);
631
632         /* check IOTLB invalidation granularity */
633         if (DMA_TLB_IAIG(val) == 0)
634                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
635         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
636                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
637                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
638         /* flushing the IOTLB will implicitly flush the write buffer */
639         return 0;
640 }
641
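/*
 * Page-selective IOTLB flush. The hardware requires the page count to be a
 * power of two with a naturally aligned base address, so the request is
 * rounded up; if PSI is unsupported or the resulting mask is too large,
 * fall back to a domain-selective flush.
 */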
642 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
643         u64 addr, unsigned int pages, int non_present_entry_flush)
644 {
645         unsigned int mask;
646
647         BUG_ON(addr & (~PAGE_MASK_4K));
648         BUG_ON(pages == 0);
649
650         /* Fallback to domain selective flush if no PSI support */
651         if (!cap_pgsel_inv(iommu->cap))
652                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
653                                                 DMA_TLB_DSI_FLUSH,
654                                                 non_present_entry_flush);
655
656         /*
657          * PSI requires page size to be 2 ^ x, and the base address is naturally
658          * aligned to the size
659          */
660         mask = ilog2(__roundup_pow_of_two(pages));
661         /* Fallback to domain selective flush if size is too big */
662         if (mask > cap_max_amask_val(iommu->cap))
663                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
664                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
665
666         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
667                                         DMA_TLB_PSI_FLUSH,
668                                         non_present_entry_flush);
669 }
670
671 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
672 {
673         u32 pmen;
674         unsigned long flags;
675
676         spin_lock_irqsave(&iommu->register_lock, flags);
677         pmen = readl(iommu->reg + DMAR_PMEN_REG);
678         pmen &= ~DMA_PMEN_EPM;
679         writel(pmen, iommu->reg + DMAR_PMEN_REG);
680
681         /* wait for the protected region status bit to clear */
682         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
683                 readl, !(pmen & DMA_PMEN_PRS), pmen);
684
685         spin_unlock_irqrestore(&iommu->register_lock, flags);
686 }
687
688 static int iommu_enable_translation(struct intel_iommu *iommu)
689 {
690         u32 sts;
691         unsigned long flags;
692
693         spin_lock_irqsave(&iommu->register_lock, flags);
694         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
695
696         /* Make sure hardware completes it */
697         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
698                 readl, (sts & DMA_GSTS_TES), sts);
699
700         iommu->gcmd |= DMA_GCMD_TE;
701         spin_unlock_irqrestore(&iommu->register_lock, flags);
702         return 0;
703 }
704
705 static int iommu_disable_translation(struct intel_iommu *iommu)
706 {
707         u32 sts;
708         unsigned long flag;
709
710         spin_lock_irqsave(&iommu->register_lock, flag);
711         iommu->gcmd &= ~DMA_GCMD_TE;
712         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
713
714         /* Make sure hardware completes it */
715         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
716                 readl, (!(sts & DMA_GSTS_TES)), sts);
717
718         spin_unlock_irqrestore(&iommu->register_lock, flag);
719         return 0;
720 }
721
722 /* iommu interrupt handling. Most of it is MSI-like. */
723
724 static const char *fault_reason_strings[] =
725 {
726         "Software",
727         "Present bit in root entry is clear",
728         "Present bit in context entry is clear",
729         "Invalid context entry",
730         "Access beyond MGAW",
731         "PTE Write access is not set",
732         "PTE Read access is not set",
733         "Next page table ptr is invalid",
734         "Root table address invalid",
735         "Context table ptr is invalid",
736         "non-zero reserved fields in RTP",
737         "non-zero reserved fields in CTP",
738         "non-zero reserved fields in PTE",
739 };
740 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
741
742 const char *dmar_get_fault_reason(u8 fault_reason)
743 {
744         if (fault_reason > MAX_FAULT_REASON_IDX)
745                 return "Unknown";
746         else
747                 return fault_reason_strings[fault_reason];
748 }
749
750 void dmar_msi_unmask(unsigned int irq)
751 {
752         struct intel_iommu *iommu = get_irq_data(irq);
753         unsigned long flag;
754
755         /* unmask it */
756         spin_lock_irqsave(&iommu->register_lock, flag);
757         writel(0, iommu->reg + DMAR_FECTL_REG);
758         /* Read a reg to force flush the post write */
759         readl(iommu->reg + DMAR_FECTL_REG);
760         spin_unlock_irqrestore(&iommu->register_lock, flag);
761 }
762
763 void dmar_msi_mask(unsigned int irq)
764 {
765         unsigned long flag;
766         struct intel_iommu *iommu = get_irq_data(irq);
767
768         /* mask it */
769         spin_lock_irqsave(&iommu->register_lock, flag);
770         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
771         /* Read a reg to force flush the post write */
772         readl(iommu->reg + DMAR_FECTL_REG);
773         spin_unlock_irqrestore(&iommu->register_lock, flag);
774 }
775
776 void dmar_msi_write(int irq, struct msi_msg *msg)
777 {
778         struct intel_iommu *iommu = get_irq_data(irq);
779         unsigned long flag;
780
781         spin_lock_irqsave(&iommu->register_lock, flag);
782         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
783         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
784         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
785         spin_unlock_irqrestore(&iommu->register_lock, flag);
786 }
787
788 void dmar_msi_read(int irq, struct msi_msg *msg)
789 {
790         struct intel_iommu *iommu = get_irq_data(irq);
791         unsigned long flag;
792
793         spin_lock_irqsave(&iommu->register_lock, flag);
794         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
795         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
796         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
797         spin_unlock_irqrestore(&iommu->register_lock, flag);
798 }
799
800 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
801                 u8 fault_reason, u16 source_id, u64 addr)
802 {
803         const char *reason;
804
805         reason = dmar_get_fault_reason(fault_reason);
806
807         printk(KERN_ERR
808                 "DMAR:[%s] Request device [%02x:%02x.%d] "
809                 "fault addr %llx \n"
810                 "DMAR:[fault reason %02d] %s\n",
811                 (type ? "DMA Read" : "DMA Write"),
812                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
813                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
814         return 0;
815 }
816
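/*
 * Fault interrupt handler: read the fault status register, then walk the
 * primary fault records (16 bytes each), decode the reason, source-id and
 * faulting address from each record, clear its F bit, report it, and
 * finally clear any primary fault overflow condition.
 */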
817 #define PRIMARY_FAULT_REG_LEN (16)
818 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
819 {
820         struct intel_iommu *iommu = dev_id;
821         int reg, fault_index;
822         u32 fault_status;
823         unsigned long flag;
824
825         spin_lock_irqsave(&iommu->register_lock, flag);
826         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
827
828         /* TBD: ignore advanced fault log currently */
829         if (!(fault_status & DMA_FSTS_PPF))
830                 goto clear_overflow;
831
832         fault_index = dma_fsts_fault_record_index(fault_status);
833         reg = cap_fault_reg_offset(iommu->cap);
834         while (1) {
835                 u8 fault_reason;
836                 u16 source_id;
837                 u64 guest_addr;
838                 int type;
839                 u32 data;
840
841                 /* highest 32 bits */
842                 data = readl(iommu->reg + reg +
843                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
844                 if (!(data & DMA_FRCD_F))
845                         break;
846
847                 fault_reason = dma_frcd_fault_reason(data);
848                 type = dma_frcd_type(data);
849
850                 data = readl(iommu->reg + reg +
851                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
852                 source_id = dma_frcd_source_id(data);
853
854                 guest_addr = dmar_readq(iommu->reg + reg +
855                                 fault_index * PRIMARY_FAULT_REG_LEN);
856                 guest_addr = dma_frcd_page_addr(guest_addr);
857                 /* clear the fault */
858                 writel(DMA_FRCD_F, iommu->reg + reg +
859                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
860
861                 spin_unlock_irqrestore(&iommu->register_lock, flag);
862
863                 iommu_page_fault_do_one(iommu, type, fault_reason,
864                                 source_id, guest_addr);
865
866                 fault_index++;
867                 if (fault_index > cap_num_fault_regs(iommu->cap))
868                         fault_index = 0;
869                 spin_lock_irqsave(&iommu->register_lock, flag);
870         }
871 clear_overflow:
872         /* clear primary fault overflow */
873         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
874         if (fault_status & DMA_FSTS_PFO)
875                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
876
877         spin_unlock_irqrestore(&iommu->register_lock, flag);
878         return IRQ_HANDLED;
879 }
880
881 int dmar_set_interrupt(struct intel_iommu *iommu)
882 {
883         int irq, ret;
884
885         irq = create_irq();
886         if (!irq) {
887                 printk(KERN_ERR "IOMMU: no free vectors\n");
888                 return -EINVAL;
889         }
890
891         set_irq_data(irq, iommu);
892         iommu->irq = irq;
893
894         ret = arch_setup_dmar_msi(irq);
895         if (ret) {
896                 set_irq_data(irq, NULL);
897                 iommu->irq = 0;
898                 destroy_irq(irq);
899                 return 0;
900         }
901
902         /* Clear any pending faults before installing the handler */
903         iommu_page_fault(irq, iommu);
904
905         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
906         if (ret)
907                 printk(KERN_ERR "IOMMU: can't request irq\n");
908         return ret;
909 }
910
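/*
 * Allocate the per-IOMMU domain-id bitmap and domain pointer array, sized
 * from cap_ndoms(). When caching mode is set, domain id 0 is reserved up
 * front because invalid (non-present) translations are tagged with it.
 */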
911 static int iommu_init_domains(struct intel_iommu *iommu)
912 {
913         unsigned long ndomains;
914         unsigned long nlongs;
915
916         ndomains = cap_ndoms(iommu->cap);
917         pr_debug("Number of Domains supported <%ld>\n", ndomains);
918         nlongs = BITS_TO_LONGS(ndomains);
919
920         /* TBD: there might be 64K domains,
921          * consider a different allocation scheme for future chips
922          */
923         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
924         if (!iommu->domain_ids) {
925                 printk(KERN_ERR "Allocating domain id array failed\n");
926                 return -ENOMEM;
927         }
928         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
929                         GFP_KERNEL);
930         if (!iommu->domains) {
931                 printk(KERN_ERR "Allocating domain array failed\n");
932                 kfree(iommu->domain_ids);
933                 return -ENOMEM;
934         }
935
936         spin_lock_init(&iommu->lock);
937
938         /*
939          * if Caching mode is set, then invalid translations are tagged
940          * with domainid 0. Hence we need to pre-allocate it.
941          */
942         if (cap_caching_mode(iommu->cap))
943                 set_bit(0, iommu->domain_ids);
944         return 0;
945 }
946
947
948 static void domain_exit(struct dmar_domain *domain);
949
950 void free_dmar_iommu(struct intel_iommu *iommu)
951 {
952         struct dmar_domain *domain;
953         int i;
954
955         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
956         for (; i < cap_ndoms(iommu->cap); ) {
957                 domain = iommu->domains[i];
958                 clear_bit(i, iommu->domain_ids);
959                 domain_exit(domain);
960                 i = find_next_bit(iommu->domain_ids,
961                         cap_ndoms(iommu->cap), i+1);
962         }
963
964         if (iommu->gcmd & DMA_GCMD_TE)
965                 iommu_disable_translation(iommu);
966
967         if (iommu->irq) {
968                 set_irq_data(iommu->irq, NULL);
969                 /* This will mask the irq */
970                 free_irq(iommu->irq, iommu);
971                 destroy_irq(iommu->irq);
972         }
973
974         kfree(iommu->domains);
975         kfree(iommu->domain_ids);
976
977         /* free context mapping */
978         free_context_table(iommu);
979 }
980
981 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
982 {
983         unsigned long num;
984         unsigned long ndomains;
985         struct dmar_domain *domain;
986         unsigned long flags;
987
988         domain = alloc_domain_mem();
989         if (!domain)
990                 return NULL;
991
992         ndomains = cap_ndoms(iommu->cap);
993
994         spin_lock_irqsave(&iommu->lock, flags);
995         num = find_first_zero_bit(iommu->domain_ids, ndomains);
996         if (num >= ndomains) {
997                 spin_unlock_irqrestore(&iommu->lock, flags);
998                 free_domain_mem(domain);
999                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1000                 return NULL;
1001         }
1002
1003         set_bit(num, iommu->domain_ids);
1004         domain->id = num;
1005         domain->iommu = iommu;
1006         iommu->domains[num] = domain;
1007         spin_unlock_irqrestore(&iommu->lock, flags);
1008
1009         return domain;
1010 }
1011
1012 static void iommu_free_domain(struct dmar_domain *domain)
1013 {
1014         unsigned long flags;
1015
1016         spin_lock_irqsave(&domain->iommu->lock, flags);
1017         clear_bit(domain->id, domain->iommu->domain_ids);
1018         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1019 }
1020
1021 static struct iova_domain reserved_iova_list;
1022 static struct lock_class_key reserved_alloc_key;
1023 static struct lock_class_key reserved_rbtree_key;
1024
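/*
 * Build the global list of IOVA ranges that must never be handed out to
 * devices: the IOAPIC MMIO window and every PCI memory BAR (to keep DMA
 * from reaching peer device MMIO). Each new domain copies these reserved
 * ranges into its own IOVA allocator.
 */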
1025 static void dmar_init_reserved_ranges(void)
1026 {
1027         struct pci_dev *pdev = NULL;
1028         struct iova *iova;
1029         int i;
1030         u64 addr, size;
1031
1032         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1033
1034         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1035                 &reserved_alloc_key);
1036         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1037                 &reserved_rbtree_key);
1038
1039         /* IOAPIC ranges shouldn't be accessed by DMA */
1040         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1041                 IOVA_PFN(IOAPIC_RANGE_END));
1042         if (!iova)
1043                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1044
1045         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1046         for_each_pci_dev(pdev) {
1047                 struct resource *r;
1048
1049                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1050                         r = &pdev->resource[i];
1051                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1052                                 continue;
1053                         addr = r->start;
1054                         addr &= PAGE_MASK_4K;
1055                         size = r->end - addr;
1056                         size = PAGE_ALIGN_4K(size);
1057                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1058                                 IOVA_PFN(size + addr) - 1);
1059                         if (!iova)
1060                                 printk(KERN_ERR "Reserve iova failed\n");
1061                 }
1062         }
1063
1064 }
1065
1066 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1067 {
1068         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1069 }
1070
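/*
 * Round a guest address width up to the nearest width the page tables can
 * express (12 bits of page offset plus a multiple of 9), capped at 64.
 * For example, a 40-bit guest width is adjusted to 48 bits.
 */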
1071 static inline int guestwidth_to_adjustwidth(int gaw)
1072 {
1073         int agaw;
1074         int r = (gaw - 12) % 9;
1075
1076         if (r == 0)
1077                 agaw = gaw;
1078         else
1079                 agaw = gaw + 9 - r;
1080         if (agaw > 64)
1081                 agaw = 64;
1082         return agaw;
1083 }
1084
1085 static int domain_init(struct dmar_domain *domain, int guest_width)
1086 {
1087         struct intel_iommu *iommu;
1088         int adjust_width, agaw;
1089         unsigned long sagaw;
1090
1091         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1092         spin_lock_init(&domain->mapping_lock);
1093
1094         domain_reserve_special_ranges(domain);
1095
1096         /* calculate AGAW */
1097         iommu = domain->iommu;
1098         if (guest_width > cap_mgaw(iommu->cap))
1099                 guest_width = cap_mgaw(iommu->cap);
1100         domain->gaw = guest_width;
1101         adjust_width = guestwidth_to_adjustwidth(guest_width);
1102         agaw = width_to_agaw(adjust_width);
1103         sagaw = cap_sagaw(iommu->cap);
1104         if (!test_bit(agaw, &sagaw)) {
1105                 /* hardware doesn't support it, choose a bigger one */
1106                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1107                 agaw = find_next_bit(&sagaw, 5, agaw);
1108                 if (agaw >= 5)
1109                         return -ENODEV;
1110         }
1111         domain->agaw = agaw;
1112         INIT_LIST_HEAD(&domain->devices);
1113
1114         /* always allocate the top pgd */
1115         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1116         if (!domain->pgd)
1117                 return -ENOMEM;
1118         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1119         return 0;
1120 }
1121
1122 static void domain_exit(struct dmar_domain *domain)
1123 {
1124         u64 end;
1125
1126         /* Domain 0 is reserved, so don't process it */
1127         if (!domain)
1128                 return;
1129
1130         domain_remove_dev_info(domain);
1131         /* destroy iovas */
1132         put_iova_domain(&domain->iovad);
1133         end = DOMAIN_MAX_ADDR(domain->gaw);
1134         end = end & (~PAGE_MASK_4K);
1135
1136         /* clear ptes */
1137         dma_pte_clear_range(domain, 0, end);
1138
1139         /* free page tables */
1140         dma_pte_free_pagetable(domain, 0, end);
1141
1142         iommu_free_domain(domain);
1143         free_domain_mem(domain);
1144 }
1145
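/*
 * Install the context entry for (bus, devfn): domain id, address width and
 * the physical address of the domain's page-table root. Because this is a
 * non-present to present transition, the context cache is invalidated with
 * the non-present hint; if the hardware does not cache non-present entries
 * only the write buffer is flushed, otherwise a domain-selective IOTLB
 * flush follows.
 */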
1146 static int domain_context_mapping_one(struct dmar_domain *domain,
1147                 u8 bus, u8 devfn)
1148 {
1149         struct context_entry *context;
1150         struct intel_iommu *iommu = domain->iommu;
1151         unsigned long flags;
1152
1153         pr_debug("Set context mapping for %02x:%02x.%d\n",
1154                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1155         BUG_ON(!domain->pgd);
1156         context = device_to_context_entry(iommu, bus, devfn);
1157         if (!context)
1158                 return -ENOMEM;
1159         spin_lock_irqsave(&iommu->lock, flags);
1160         if (context_present(*context)) {
1161                 spin_unlock_irqrestore(&iommu->lock, flags);
1162                 return 0;
1163         }
1164
1165         context_set_domain_id(*context, domain->id);
1166         context_set_address_width(*context, domain->agaw);
1167         context_set_address_root(*context, virt_to_phys(domain->pgd));
1168         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1169         context_set_fault_enable(*context);
1170         context_set_present(*context);
1171         __iommu_flush_cache(iommu, context, sizeof(*context));
1172
1173         /* it's a non-present to present mapping */
1174         if (iommu->flush.flush_context(iommu, domain->id,
1175                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1176                 DMA_CCMD_DEVICE_INVL, 1))
1177                 iommu_flush_write_buffer(iommu);
1178         else
1179                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1180
1181         spin_unlock_irqrestore(&iommu->lock, flags);
1182         return 0;
1183 }
1184
1185 static int
1186 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1187 {
1188         int ret;
1189         struct pci_dev *tmp, *parent;
1190
1191         ret = domain_context_mapping_one(domain, pdev->bus->number,
1192                 pdev->devfn);
1193         if (ret)
1194                 return ret;
1195
1196         /* dependent device mapping */
1197         tmp = pci_find_upstream_pcie_bridge(pdev);
1198         if (!tmp)
1199                 return 0;
1200         /* Secondary interface's bus number and devfn 0 */
1201         parent = pdev->bus->self;
1202         while (parent != tmp) {
1203                 ret = domain_context_mapping_one(domain, parent->bus->number,
1204                         parent->devfn);
1205                 if (ret)
1206                         return ret;
1207                 parent = parent->bus->self;
1208         }
1209         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1210                 return domain_context_mapping_one(domain,
1211                         tmp->subordinate->number, 0);
1212         else /* this is a legacy PCI bridge */
1213                 return domain_context_mapping_one(domain,
1214                         tmp->bus->number, tmp->devfn);
1215 }
1216
1217 static int domain_context_mapped(struct dmar_domain *domain,
1218         struct pci_dev *pdev)
1219 {
1220         int ret;
1221         struct pci_dev *tmp, *parent;
1222
1223         ret = device_context_mapped(domain->iommu,
1224                 pdev->bus->number, pdev->devfn);
1225         if (!ret)
1226                 return ret;
1227         /* dependent device mapping */
1228         tmp = pci_find_upstream_pcie_bridge(pdev);
1229         if (!tmp)
1230                 return ret;
1231         /* Secondary interface's bus number and devfn 0 */
1232         parent = pdev->bus->self;
1233         while (parent != tmp) {
1234                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1235                         parent->devfn);
1236                 if (!ret)
1237                         return ret;
1238                 parent = parent->bus->self;
1239         }
1240         if (tmp->is_pcie)
1241                 return device_context_mapped(domain->iommu,
1242                         tmp->subordinate->number, 0);
1243         else
1244                 return device_context_mapped(domain->iommu,
1245                         tmp->bus->number, tmp->devfn);
1246 }
1247
1248 static int
1249 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1250                         u64 hpa, size_t size, int prot)
1251 {
1252         u64 start_pfn, end_pfn;
1253         struct dma_pte *pte;
1254         int index;
1255
1256         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1257                 return -EINVAL;
1258         iova &= PAGE_MASK_4K;
1259         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1260         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1261         index = 0;
1262         while (start_pfn < end_pfn) {
1263                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1264                 if (!pte)
1265                         return -ENOMEM;
1266                 /* We don't need lock here, nobody else
1267                  * touches the iova range
1268                  */
1269                 BUG_ON(dma_pte_addr(*pte));
1270                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1271                 dma_set_pte_prot(*pte, prot);
1272                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1273                 start_pfn++;
1274                 index++;
1275         }
1276         return 0;
1277 }
1278
1279 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1280 {
1281         clear_context_table(domain->iommu, bus, devfn);
1282         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1283                                            DMA_CCMD_GLOBAL_INVL, 0);
1284         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1285                                          DMA_TLB_GLOBAL_FLUSH, 0);
1286 }
1287
1288 static void domain_remove_dev_info(struct dmar_domain *domain)
1289 {
1290         struct device_domain_info *info;
1291         unsigned long flags;
1292
1293         spin_lock_irqsave(&device_domain_lock, flags);
1294         while (!list_empty(&domain->devices)) {
1295                 info = list_entry(domain->devices.next,
1296                         struct device_domain_info, link);
1297                 list_del(&info->link);
1298                 list_del(&info->global);
1299                 if (info->dev)
1300                         info->dev->dev.archdata.iommu = NULL;
1301                 spin_unlock_irqrestore(&device_domain_lock, flags);
1302
1303                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1304                 free_devinfo_mem(info);
1305
1306                 spin_lock_irqsave(&device_domain_lock, flags);
1307         }
1308         spin_unlock_irqrestore(&device_domain_lock, flags);
1309 }
1310
1311 /*
1312  * find_domain
1313  * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1314  */
1315 static struct dmar_domain *
1316 find_domain(struct pci_dev *pdev)
1317 {
1318         struct device_domain_info *info;
1319
1320         /* No lock here, assumes no domain exit in normal case */
1321         info = pdev->dev.archdata.iommu;
1322         if (info)
1323                 return info->domain;
1324         return NULL;
1325 }
1326
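/*
 * Find or create the dmar_domain for a device. Devices behind a
 * PCIe-to-PCI bridge share one domain, keyed by the bridge's secondary bus
 * (or the bridge itself for a legacy PCI bridge); the device_domain_list
 * is searched under device_domain_lock and re-checked after allocation in
 * case another thread installed a domain first.
 */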
1327 /* domain is initialized */
1328 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1329 {
1330         struct dmar_domain *domain, *found = NULL;
1331         struct intel_iommu *iommu;
1332         struct dmar_drhd_unit *drhd;
1333         struct device_domain_info *info, *tmp;
1334         struct pci_dev *dev_tmp;
1335         unsigned long flags;
1336         int bus = 0, devfn = 0;
1337
1338         domain = find_domain(pdev);
1339         if (domain)
1340                 return domain;
1341
1342         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1343         if (dev_tmp) {
1344                 if (dev_tmp->is_pcie) {
1345                         bus = dev_tmp->subordinate->number;
1346                         devfn = 0;
1347                 } else {
1348                         bus = dev_tmp->bus->number;
1349                         devfn = dev_tmp->devfn;
1350                 }
1351                 spin_lock_irqsave(&device_domain_lock, flags);
1352                 list_for_each_entry(info, &device_domain_list, global) {
1353                         if (info->bus == bus && info->devfn == devfn) {
1354                                 found = info->domain;
1355                                 break;
1356                         }
1357                 }
1358                 spin_unlock_irqrestore(&device_domain_lock, flags);
1359                 /* pcie-pci bridge already has a domain, use it */
1360                 if (found) {
1361                         domain = found;
1362                         goto found_domain;
1363                 }
1364         }
1365
1366         /* Allocate new domain for the device */
1367         drhd = dmar_find_matched_drhd_unit(pdev);
1368         if (!drhd) {
1369                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1370                         pci_name(pdev));
1371                 return NULL;
1372         }
1373         iommu = drhd->iommu;
1374
1375         domain = iommu_alloc_domain(iommu);
1376         if (!domain)
1377                 goto error;
1378
1379         if (domain_init(domain, gaw)) {
1380                 domain_exit(domain);
1381                 goto error;
1382         }
1383
1384         /* register pcie-to-pci device */
1385         if (dev_tmp) {
1386                 info = alloc_devinfo_mem();
1387                 if (!info) {
1388                         domain_exit(domain);
1389                         goto error;
1390                 }
1391                 info->bus = bus;
1392                 info->devfn = devfn;
1393                 info->dev = NULL;
1394                 info->domain = domain;
1395                 /* This domain is shared by devices under p2p bridge */
1396                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1397
1398                 /* pcie-to-pci bridge already has a domain, use it */
1399                 found = NULL;
1400                 spin_lock_irqsave(&device_domain_lock, flags);
1401                 list_for_each_entry(tmp, &device_domain_list, global) {
1402                         if (tmp->bus == bus && tmp->devfn == devfn) {
1403                                 found = tmp->domain;
1404                                 break;
1405                         }
1406                 }
1407                 if (found) {
1408                         free_devinfo_mem(info);
1409                         domain_exit(domain);
1410                         domain = found;
1411                 } else {
1412                         list_add(&info->link, &domain->devices);
1413                         list_add(&info->global, &device_domain_list);
1414                 }
1415                 spin_unlock_irqrestore(&device_domain_lock, flags);
1416         }
1417
1418 found_domain:
1419         info = alloc_devinfo_mem();
1420         if (!info)
1421                 goto error;
1422         info->bus = pdev->bus->number;
1423         info->devfn = pdev->devfn;
1424         info->dev = pdev;
1425         info->domain = domain;
1426         spin_lock_irqsave(&device_domain_lock, flags);
1427         /* somebody else got there first */
1428         found = find_domain(pdev);
1429         if (found != NULL) {
1430                 spin_unlock_irqrestore(&device_domain_lock, flags);
1431                 if (found != domain) {
1432                         domain_exit(domain);
1433                         domain = found;
1434                 }
1435                 free_devinfo_mem(info);
1436                 return domain;
1437         }
1438         list_add(&info->link, &domain->devices);
1439         list_add(&info->global, &device_domain_list);
1440         pdev->dev.archdata.iommu = info;
1441         spin_unlock_irqrestore(&device_domain_lock, flags);
1442         return domain;
1443 error:
1444         /* recheck it here, maybe others set it */
1445         return find_domain(pdev);
1446 }
1447
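/*
 * Set up a 1:1 (identity) mapping of [start, end) for a device, as used
 * for RMRR regions and the graphics and ISA work-arounds: reserve the IOVA
 * range, clear any stale PTEs, map the range read/write and install the
 * context entry.
 */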
1448 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1449 {
1450         struct dmar_domain *domain;
1451         unsigned long size;
1452         u64 base;
1453         int ret;
1454
1455         printk(KERN_INFO
1456                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1457                 pci_name(pdev), start, end);
1458         /* page table init */
1459         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1460         if (!domain)
1461                 return -ENOMEM;
1462
1463         /* The address might not be aligned */
1464         base = start & PAGE_MASK_4K;
1465         size = end - base;
1466         size = PAGE_ALIGN_4K(size);
1467         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1468                         IOVA_PFN(base + size) - 1)) {
1469                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1470                 ret = -ENOMEM;
1471                 goto error;
1472         }
1473
1474         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1475                 size, base, pci_name(pdev));
1476         /*
1477          * The RMRR range might overlap with the physical memory range,
1478          * clear it first
1479          */
1480         dma_pte_clear_range(domain, base, base + size);
1481
1482         ret = domain_page_mapping(domain, base, base, size,
1483                 DMA_PTE_READ|DMA_PTE_WRITE);
1484         if (ret)
1485                 goto error;
1486
1487         /* context entry init */
1488         ret = domain_context_mapping(domain, pdev);
1489         if (!ret)
1490                 return 0;
1491 error:
1492         domain_exit(domain);
1493         return ret;
1494
1495 }
1496
1497 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1498         struct pci_dev *pdev)
1499 {
1500         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1501                 return 0;
1502         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1503                 rmrr->end_address + 1);
1504 }
1505
1506 #ifdef CONFIG_DMAR_GFX_WA
1507 struct iommu_prepare_data {
1508         struct pci_dev *pdev;
1509         int ret;
1510 };
1511
1512 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1513                                          unsigned long end_pfn, void *datax)
1514 {
1515         struct iommu_prepare_data *data;
1516
1517         data = (struct iommu_prepare_data *)datax;
1518
1519         data->ret = iommu_prepare_identity_map(data->pdev,
1520                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1521         return data->ret;
1522
1523 }
1524
1525 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1526 {
1527         int nid;
1528         struct iommu_prepare_data data;
1529
1530         data.pdev = pdev;
1531         data.ret = 0;
1532
1533         for_each_online_node(nid) {
1534                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1535                 if (data.ret)
1536                         return data.ret;
1537         }
1538         return data.ret;
1539 }
1540
1541 static void __init iommu_prepare_gfx_mapping(void)
1542 {
1543         struct pci_dev *pdev = NULL;
1544         int ret;
1545
1546         for_each_pci_dev(pdev) {
1547                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1548                                 !IS_GFX_DEVICE(pdev))
1549                         continue;
1550                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1551                         pci_name(pdev));
1552                 ret = iommu_prepare_with_active_regions(pdev);
1553                 if (ret)
1554                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1555         }
1556 }
1557 #endif
1558
1559 #ifdef CONFIG_DMAR_FLOPPY_WA
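/*
 * Floppy work-around: identity-map the first 16MB for the ISA/LPC
 * bridge so legacy floppy DMA still works behind the IOMMU.
 */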
1560 static inline void iommu_prepare_isa(void)
1561 {
1562         struct pci_dev *pdev;
1563         int ret;
1564
1565         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1566         if (!pdev)
1567                 return;
1568
1569         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1570         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1571
1572         if (ret)
1573                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1574                         "floppy might not work\n");
1575
1576 }
1577 #else
1578 static inline void iommu_prepare_isa(void)
1579 {
1580         return;
1581 }
1582 #endif /* CONFIG_DMAR_FLOPPY_WA */
1583
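/*
 * Bring up every DMA-remapping hardware unit: allocate root and context
 * tables, pick an invalidation interface, establish the RMRR and
 * work-around mappings, then enable translation.
 */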
1584 int __init init_dmars(void)
1585 {
1586         struct dmar_drhd_unit *drhd;
1587         struct dmar_rmrr_unit *rmrr;
1588         struct pci_dev *pdev;
1589         struct intel_iommu *iommu;
1590         int i, ret, unit = 0;
1591
1592         /*
1593          * for each drhd
1594          *    allocate root
1595          *    initialize and program root entry to not present
1596          * endfor
1597          */
1598         for_each_drhd_unit(drhd) {
1599                 g_num_of_iommus++;
1600                 /*
1601                  * No lock needed: this is only incremented in the single-
1602                  * threaded kernel __init code path; all other accesses are
1603                  * read-only.
1604                  */
1605         }
1606
1607         deferred_flush = kzalloc(g_num_of_iommus *
1608                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1609         if (!deferred_flush) {
1610                 ret = -ENOMEM;
1611                 goto error;
1612         }
1613
1614         for_each_drhd_unit(drhd) {
1615                 if (drhd->ignored)
1616                         continue;
1617
1618                 iommu = drhd->iommu;
1619
1620                 ret = iommu_init_domains(iommu);
1621                 if (ret)
1622                         goto error;
1623
1624                 /*
1625                  * TBD:
1626                  * we could share the same root & context tables
1627                  * among all IOMMUs.  Need to split it out later.
1628                  */
1629                 ret = iommu_alloc_root_entry(iommu);
1630                 if (ret) {
1631                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1632                         goto error;
1633                 }
1634         }
1635
1636         for_each_drhd_unit(drhd) {
1637                 if (drhd->ignored)
1638                         continue;
1639
1640                 iommu = drhd->iommu;
1641                 if (dmar_enable_qi(iommu)) {
1642                         /*
1643                          * Queued Invalidate not enabled, use Register Based
1644                          * Invalidate
1645                          */
1646                         iommu->flush.flush_context = __iommu_flush_context;
1647                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1648                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1649                                "invalidation\n", drhd->reg_base_addr);
1650                 } else {
1651                         iommu->flush.flush_context = qi_flush_context;
1652                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1653                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1654                                "invalidation\n", drhd->reg_base_addr);
1655                 }
1656         }
1657
1658         /*
1659          * For each rmrr
1660          *   for each dev attached to rmrr
1661          *   do
1662          *     locate drhd for dev, alloc domain for dev
1663          *     allocate free domain
1664          *     allocate page table entries for rmrr
1665          *     if context not allocated for bus
1666          *           allocate and init context
1667          *           set present in root table for this bus
1668          *     init context with domain, translation etc
1669          *    endfor
1670          * endfor
1671          */
1672         for_each_rmrr_units(rmrr) {
1673                 for (i = 0; i < rmrr->devices_cnt; i++) {
1674                         pdev = rmrr->devices[i];
1675                         /* some BIOSes list non-existent devices in the DMAR table */
1676                         if (!pdev)
1677                                 continue;
1678                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1679                         if (ret)
1680                                 printk(KERN_ERR
1681                                  "IOMMU: mapping reserved region failed\n");
1682                 }
1683         }
1684
1685         iommu_prepare_gfx_mapping();
1686
1687         iommu_prepare_isa();
1688
1689         /*
1690          * for each drhd
1691          *   enable fault log
1692          *   global invalidate context cache
1693          *   global invalidate iotlb
1694          *   enable translation
1695          */
1696         for_each_drhd_unit(drhd) {
1697                 if (drhd->ignored)
1698                         continue;
1699                 iommu = drhd->iommu;
1700                 sprintf(iommu->name, "dmar%d", unit++);
1701
1702                 iommu_flush_write_buffer(iommu);
1703
1704                 ret = dmar_set_interrupt(iommu);
1705                 if (ret)
1706                         goto error;
1707
1708                 iommu_set_root_entry(iommu);
1709
1710                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1711                                            0);
1712                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1713                                          0);
1714                 iommu_disable_protect_mem_regions(iommu);
1715
1716                 ret = iommu_enable_translation(iommu);
1717                 if (ret)
1718                         goto error;
1719         }
1720
1721         return 0;
1722 error:
1723         for_each_drhd_unit(drhd) {
1724                 if (drhd->ignored)
1725                         continue;
1726                 iommu = drhd->iommu;
1727                 free_iommu(iommu);
1728         }
1729         return ret;
1730 }
1731
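/* Length of the mapping once host_addr..host_addr+size is rounded out to whole 4K pages. */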
1732 static inline u64 aligned_size(u64 host_addr, size_t size)
1733 {
1734         u64 addr;
1735         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1736         return PAGE_ALIGN_4K(addr);
1737 }
1738
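/*
 * Allocate an IO virtual address range of @size bytes from @domain,
 * below @end (clamped to the domain's addressable range).
 */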
1739 struct iova *
1740 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1741 {
1742         struct iova *piova;
1743
1744         /* Make sure it's in range */
1745         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1746         if (!size || (IOVA_START_ADDR + size > end))
1747                 return NULL;
1748
1749         piova = alloc_iova(&domain->iovad,
1750                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1751         return piova;
1752 }
1753
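/*
 * Allocate an IOVA for @dev.  Devices limited to 32-bit DMA, or all
 * devices when forcedac is set, allocate directly from the device's DMA
 * mask; otherwise the 32-bit range is tried first, with the full mask
 * as the fallback.
 */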
1754 static struct iova *
1755 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1756                 size_t size)
1757 {
1758         struct pci_dev *pdev = to_pci_dev(dev);
1759         struct iova *iova = NULL;
1760
1761         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1762                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1763         } else  {
1764                 /*
1765                  * First try to allocate an io virtual address in
1766                  * DMA_32BIT_MASK and if that fails then try allocating
1767                  * from higher range
1768                  */
1769                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1770                 if (!iova)
1771                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1772         }
1773
1774         if (!iova) {
1775                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1776                 return NULL;
1777         }
1778
1779         return iova;
1780 }
1781
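/*
 * Find (or create) the domain for @pdev and make sure its context entry
 * is programmed before any DMA mapping is attempted.
 */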
1782 static struct dmar_domain *
1783 get_valid_domain_for_dev(struct pci_dev *pdev)
1784 {
1785         struct dmar_domain *domain;
1786         int ret;
1787
1788         domain = get_domain_for_dev(pdev,
1789                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1790         if (!domain) {
1791                 printk(KERN_ERR
1792                         "Allocating domain for %s failed\n", pci_name(pdev));
1793                 return NULL;
1794         }
1795
1796         /* make sure context mapping is ok */
1797         if (unlikely(!domain_context_mapped(domain, pdev))) {
1798                 ret = domain_context_mapping(domain, pdev);
1799                 if (ret) {
1800                         printk(KERN_ERR
1801                                 "Domain context map for %s failed\n",
1802                                 pci_name(pdev));
1803                         return NULL;
1804                 }
1805         }
1806
1807         return domain;
1808 }
1809
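/*
 * map_single implementation: allocate an IOVA, map it onto the physical
 * buffer with permissions derived from @dir, and flush the IOTLB for
 * the non-present to present transition.
 */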
1810 static dma_addr_t
1811 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1812 {
1813         struct pci_dev *pdev = to_pci_dev(hwdev);
1814         struct dmar_domain *domain;
1815         unsigned long start_paddr;
1816         struct iova *iova;
1817         int prot = 0;
1818         int ret;
1819
1820         BUG_ON(dir == DMA_NONE);
1821         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1822                 return paddr;
1823
1824         domain = get_valid_domain_for_dev(pdev);
1825         if (!domain)
1826                 return 0;
1827
1828         size = aligned_size((u64)paddr, size);
1829
1830         iova = __intel_alloc_iova(hwdev, domain, size);
1831         if (!iova)
1832                 goto error;
1833
1834         start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1835
1836         /*
1837          * Check if DMAR supports zero-length reads on write-only
1838          * mappings.
1839          */
1840         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1841                         !cap_zlr(domain->iommu->cap))
1842                 prot |= DMA_PTE_READ;
1843         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1844                 prot |= DMA_PTE_WRITE;
1845         /*
1846          * paddr to (paddr + size) might span a partial page, so map the
1847          * whole page.  Note: if two parts of one page are mapped separately,
1848          * there may be two guest addresses mapping to the same host paddr,
1849          * but that is not a big problem.
1850          */
1851         ret = domain_page_mapping(domain, start_paddr,
1852                 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1853         if (ret)
1854                 goto error;
1855
1856         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1857                 pci_name(pdev), size, (u64)paddr,
1858                 size, (u64)start_paddr, dir);
1859
1860         /* it's a non-present to present mapping */
1861         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1862                         start_paddr, size >> PAGE_SHIFT_4K, 1);
1863         if (ret)
1864                 iommu_flush_write_buffer(domain->iommu);
1865
1866         return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1867
1868 error:
1869         if (iova)
1870                 __free_iova(&domain->iovad, iova);
1871         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1872                 pci_name(pdev), size, (u64)paddr, dir);
1873         return 0;
1874 }
1875
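/*
 * Flush all deferred unmaps: one global IOTLB flush per IOMMU, then
 * free the queued IOVAs.  Caller must hold async_umap_flush_lock.
 */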
1876 static void flush_unmaps(void)
1877 {
1878         int i, j;
1879
1880         timer_on = 0;
1881
1882         /* just flush them all */
1883         for (i = 0; i < g_num_of_iommus; i++) {
1884                 if (deferred_flush[i].next) {
1885                         struct intel_iommu *iommu =
1886                                 deferred_flush[i].domain[0]->iommu;
1887
1888                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1889                                                  DMA_TLB_GLOBAL_FLUSH, 0);
1890                         for (j = 0; j < deferred_flush[i].next; j++) {
1891                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
1892                                                 deferred_flush[i].iova[j]);
1893                         }
1894                         deferred_flush[i].next = 0;
1895                 }
1896         }
1897
1898         list_size = 0;
1899 }
1900
1901 static void flush_unmaps_timeout(unsigned long data)
1902 {
1903         unsigned long flags;
1904
1905         spin_lock_irqsave(&async_umap_flush_lock, flags);
1906         flush_unmaps();
1907         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1908 }
1909
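/*
 * Queue an IOVA for deferred freeing.  The batch is flushed either from
 * unmap_timer or as soon as HIGH_WATER_MARK entries have accumulated.
 */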
1910 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1911 {
1912         unsigned long flags;
1913         int next, iommu_id;
1914
1915         spin_lock_irqsave(&async_umap_flush_lock, flags);
1916         if (list_size == HIGH_WATER_MARK)
1917                 flush_unmaps();
1918
1919         iommu_id = dom->iommu->seq_id;
1920
1921         next = deferred_flush[iommu_id].next;
1922         deferred_flush[iommu_id].domain[next] = dom;
1923         deferred_flush[iommu_id].iova[next] = iova;
1924         deferred_flush[iommu_id].next++;
1925
1926         if (!timer_on) {
1927                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1928                 timer_on = 1;
1929         }
1930         list_size++;
1931         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1932 }
1933
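/*
 * unmap_single implementation: clear the PTEs and page tables for the
 * range, then either flush the IOTLB right away (strict mode) or defer
 * the IOVA release through add_unmap().
 */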
1934 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1935         size_t size, int dir)
1936 {
1937         struct pci_dev *pdev = to_pci_dev(dev);
1938         struct dmar_domain *domain;
1939         unsigned long start_addr;
1940         struct iova *iova;
1941
1942         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1943                 return;
1944         domain = find_domain(pdev);
1945         BUG_ON(!domain);
1946
1947         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1948         if (!iova)
1949                 return;
1950
1951         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1952         size = aligned_size((u64)dev_addr, size);
1953
1954         pr_debug("Device %s unmapping: %lx@%llx\n",
1955                 pci_name(pdev), size, (u64)start_addr);
1956
1957         /*  clear the whole page */
1958         dma_pte_clear_range(domain, start_addr, start_addr + size);
1959         /* free page tables */
1960         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1961         if (intel_iommu_strict) {
1962                 if (iommu_flush_iotlb_psi(domain->iommu,
1963                         domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
1964                         iommu_flush_write_buffer(domain->iommu);
1965                 /* free iova */
1966                 __free_iova(&domain->iovad, iova);
1967         } else {
1968                 add_unmap(domain, iova);
1969                 /*
1970                  * queue up the release of the unmap to save the roughly 1/6th
1971                  * of the CPU time otherwise used up by the IOTLB flush operation...
1972                  */
1973         }
1974 }
1975
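/*
 * alloc_coherent implementation: grab zeroed pages and map them
 * bidirectionally through the IOMMU.
 */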
1976 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1977                        dma_addr_t *dma_handle, gfp_t flags)
1978 {
1979         void *vaddr;
1980         int order;
1981
1982         size = PAGE_ALIGN_4K(size);
1983         order = get_order(size);
1984         flags &= ~(GFP_DMA | GFP_DMA32);
1985
1986         vaddr = (void *)__get_free_pages(flags, order);
1987         if (!vaddr)
1988                 return NULL;
1989         memset(vaddr, 0, size);
1990
1991         *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
1992         if (*dma_handle)
1993                 return vaddr;
1994         free_pages((unsigned long)vaddr, order);
1995         return NULL;
1996 }
1997
1998 static void intel_free_coherent(struct device *hwdev, size_t size,
1999         void *vaddr, dma_addr_t dma_handle)
2000 {
2001         int order;
2002
2003         size = PAGE_ALIGN_4K(size);
2004         order = get_order(size);
2005
2006         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2007         free_pages((unsigned long)vaddr, order);
2008 }
2009
2010 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2011 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2012         int nelems, int dir)
2013 {
2014         int i;
2015         struct pci_dev *pdev = to_pci_dev(hwdev);
2016         struct dmar_domain *domain;
2017         unsigned long start_addr;
2018         struct iova *iova;
2019         size_t size = 0;
2020         void *addr;
2021         struct scatterlist *sg;
2022
2023         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2024                 return;
2025
2026         domain = find_domain(pdev);
2027
2028         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2029         if (!iova)
2030                 return;
2031         for_each_sg(sglist, sg, nelems, i) {
2032                 addr = SG_ENT_VIRT_ADDRESS(sg);
2033                 size += aligned_size((u64)addr, sg->length);
2034         }
2035
2036         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2037
2038         /*  clear the whole page */
2039         dma_pte_clear_range(domain, start_addr, start_addr + size);
2040         /* free page tables */
2041         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2042
2043         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2044                         size >> PAGE_SHIFT_4K, 0))
2045                 iommu_flush_write_buffer(domain->iommu);
2046
2047         /* free iova */
2048         __free_iova(&domain->iovad, iova);
2049 }
2050
2051 static int intel_nontranslate_map_sg(struct device *hwdev,
2052         struct scatterlist *sglist, int nelems, int dir)
2053 {
2054         int i;
2055         struct scatterlist *sg;
2056
2057         for_each_sg(sglist, sg, nelems, i) {
2058                 BUG_ON(!sg_page(sg));
2059                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2060                 sg->dma_length = sg->length;
2061         }
2062         return nelems;
2063 }
2064
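/*
 * map_sg implementation: allocate one IOVA range large enough for every
 * element, map each scatterlist entry into it contiguously, and flush
 * the IOTLB once at the end.
 */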
2065 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2066                                 int nelems, int dir)
2067 {
2068         void *addr;
2069         int i;
2070         struct pci_dev *pdev = to_pci_dev(hwdev);
2071         struct dmar_domain *domain;
2072         size_t size = 0;
2073         int prot = 0;
2074         size_t offset = 0;
2075         struct iova *iova = NULL;
2076         int ret;
2077         struct scatterlist *sg;
2078         unsigned long start_addr;
2079
2080         BUG_ON(dir == DMA_NONE);
2081         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2082                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2083
2084         domain = get_valid_domain_for_dev(pdev);
2085         if (!domain)
2086                 return 0;
2087
2088         for_each_sg(sglist, sg, nelems, i) {
2089                 addr = SG_ENT_VIRT_ADDRESS(sg);
2090                 addr = (void *)virt_to_phys(addr);
2091                 size += aligned_size((u64)addr, sg->length);
2092         }
2093
2094         iova = __intel_alloc_iova(hwdev, domain, size);
2095         if (!iova) {
2096                 sglist->dma_length = 0;
2097                 return 0;
2098         }
2099
2100         /*
2101          * Check if DMAR supports zero-length reads on write-only
2102          * mappings.
2103          */
2104         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2105                         !cap_zlr(domain->iommu->cap))
2106                 prot |= DMA_PTE_READ;
2107         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2108                 prot |= DMA_PTE_WRITE;
2109
2110         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2111         offset = 0;
2112         for_each_sg(sglist, sg, nelems, i) {
2113                 addr = SG_ENT_VIRT_ADDRESS(sg);
2114                 addr = (void *)virt_to_phys(addr);
2115                 size = aligned_size((u64)addr, sg->length);
2116                 ret = domain_page_mapping(domain, start_addr + offset,
2117                         ((u64)addr) & PAGE_MASK_4K,
2118                         size, prot);
2119                 if (ret) {
2120                         /*  clear the page */
2121                         dma_pte_clear_range(domain, start_addr,
2122                                   start_addr + offset);
2123                         /* free page tables */
2124                         dma_pte_free_pagetable(domain, start_addr,
2125                                   start_addr + offset);
2126                         /* free iova */
2127                         __free_iova(&domain->iovad, iova);
2128                         return 0;
2129                 }
2130                 sg->dma_address = start_addr + offset +
2131                                 ((u64)addr & (~PAGE_MASK_4K));
2132                 sg->dma_length = sg->length;
2133                 offset += size;
2134         }
2135
2136         /* it's a non-present to present mapping */
2137         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2138                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2139                 iommu_flush_write_buffer(domain->iommu);
2140         return nelems;
2141 }
2142
2143 static struct dma_mapping_ops intel_dma_ops = {
2144         .alloc_coherent = intel_alloc_coherent,
2145         .free_coherent = intel_free_coherent,
2146         .map_single = intel_map_single,
2147         .unmap_single = intel_unmap_single,
2148         .map_sg = intel_map_sg,
2149         .unmap_sg = intel_unmap_sg,
2150 };
2151
2152 static inline int iommu_domain_cache_init(void)
2153 {
2154         int ret = 0;
2155
2156         iommu_domain_cache = kmem_cache_create("iommu_domain",
2157                                          sizeof(struct dmar_domain),
2158                                          0,
2159                                          SLAB_HWCACHE_ALIGN,
2161                                          NULL);
2162         if (!iommu_domain_cache) {
2163                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2164                 ret = -ENOMEM;
2165         }
2166
2167         return ret;
2168 }
2169
2170 static inline int iommu_devinfo_cache_init(void)
2171 {
2172         int ret = 0;
2173
2174         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2175                                          sizeof(struct device_domain_info),
2176                                          0,
2177                                          SLAB_HWCACHE_ALIGN,
2179                                          NULL);
2180         if (!iommu_devinfo_cache) {
2181                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2182                 ret = -ENOMEM;
2183         }
2184
2185         return ret;
2186 }
2187
2188 static inline int iommu_iova_cache_init(void)
2189 {
2190         int ret = 0;
2191
2192         iommu_iova_cache = kmem_cache_create("iommu_iova",
2193                                          sizeof(struct iova),
2194                                          0,
2195                                          SLAB_HWCACHE_ALIGN,
2197                                          NULL);
2198         if (!iommu_iova_cache) {
2199                 printk(KERN_ERR "Couldn't create iova cache\n");
2200                 ret = -ENOMEM;
2201         }
2202
2203         return ret;
2204 }
2205
2206 static int __init iommu_init_mempool(void)
2207 {
2208         int ret;
2209         ret = iommu_iova_cache_init();
2210         if (ret)
2211                 return ret;
2212
2213         ret = iommu_domain_cache_init();
2214         if (ret)
2215                 goto domain_error;
2216
2217         ret = iommu_devinfo_cache_init();
2218         if (!ret)
2219                 return ret;
2220
2221         kmem_cache_destroy(iommu_domain_cache);
2222 domain_error:
2223         kmem_cache_destroy(iommu_iova_cache);
2224
2225         return -ENOMEM;
2226 }
2227
2228 static void __init iommu_exit_mempool(void)
2229 {
2230         kmem_cache_destroy(iommu_devinfo_cache);
2231         kmem_cache_destroy(iommu_domain_cache);
2232         kmem_cache_destroy(iommu_iova_cache);
2233
2234 }
2235
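/*
 * Mark DRHD units that cover no PCI devices (or, with gfx mapping
 * disabled, only graphics devices) as ignored so they are never
 * enabled.
 */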
2236 static void __init init_no_remapping_devices(void)
2237 {
2238         struct dmar_drhd_unit *drhd;
2239
2240         for_each_drhd_unit(drhd) {
2241                 if (!drhd->include_all) {
2242                         int i;
2243                         for (i = 0; i < drhd->devices_cnt; i++)
2244                                 if (drhd->devices[i] != NULL)
2245                                         break;
2246                         /* ignore DMAR unit if no pci devices exist */
2247                         if (i == drhd->devices_cnt)
2248                                 drhd->ignored = 1;
2249                 }
2250         }
2251
2252         if (dmar_map_gfx)
2253                 return;
2254
2255         for_each_drhd_unit(drhd) {
2256                 int i;
2257                 if (drhd->ignored || drhd->include_all)
2258                         continue;
2259
2260                 for (i = 0; i < drhd->devices_cnt; i++)
2261                         if (drhd->devices[i] &&
2262                                 !IS_GFX_DEVICE(drhd->devices[i]))
2263                                 break;
2264
2265                 if (i < drhd->devices_cnt)
2266                         continue;
2267
2268                 /* bypass IOMMU if it is just for gfx devices */
2269                 drhd->ignored = 1;
2270                 for (i = 0; i < drhd->devices_cnt; i++) {
2271                         if (!drhd->devices[i])
2272                                 continue;
2273                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2274                 }
2275         }
2276 }
2277
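/*
 * Top-level initialization: parse the DMAR and device-scope tables,
 * set up all remapping units and switch dma_ops over to the
 * IOMMU-backed operations.
 */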
2278 int __init intel_iommu_init(void)
2279 {
2280         int ret = 0;
2281
2282         if (dmar_table_init())
2283                 return  -ENODEV;
2284
2285         if (dmar_dev_scope_init())
2286                 return  -ENODEV;
2287
2288         /*
2289          * Check the need for DMA-remapping initialization now.
2290          * Above initialization will also be used by Interrupt-remapping.
2291          */
2292         if (no_iommu || swiotlb || dmar_disabled)
2293                 return -ENODEV;
2294
2295         iommu_init_mempool();
2296         dmar_init_reserved_ranges();
2297
2298         init_no_remapping_devices();
2299
2300         ret = init_dmars();
2301         if (ret) {
2302                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2303                 put_iova_domain(&reserved_iova_list);
2304                 iommu_exit_mempool();
2305                 return ret;
2306         }
2307         printk(KERN_INFO
2308         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2309
2310         init_timer(&unmap_timer);
2311         force_iommu = 1;
2312         dma_ops = &intel_dma_ops;
2313         return 0;
2314 }
2315
2316 void intel_iommu_domain_exit(struct dmar_domain *domain)
2317 {
2318         u64 end;
2319
2320         /* Domain 0 is reserved, so don't process it */
2321         if (!domain)
2322                 return;
2323
2324         end = DOMAIN_MAX_ADDR(domain->gaw);
2325         end = end & PAGE_MASK_4K;       /* align the end address down to a 4K boundary */
2326
2327         /* clear ptes */
2328         dma_pte_clear_range(domain, 0, end);
2329
2330         /* free page tables */
2331         dma_pte_free_pagetable(domain, 0, end);
2332
2333         iommu_free_domain(domain);
2334         free_domain_mem(domain);
2335 }
2336 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2337
2338 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2339 {
2340         struct dmar_drhd_unit *drhd;
2341         struct dmar_domain *domain;
2342         struct intel_iommu *iommu;
2343
2344         drhd = dmar_find_matched_drhd_unit(pdev);
2345         if (!drhd) {
2346                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2347                 return NULL;
2348         }
2349
2350         iommu = drhd->iommu;
2351         if (!iommu) {
2352                 printk(KERN_ERR
2353                         "intel_iommu_domain_alloc: iommu == NULL\n");
2354                 return NULL;
2355         }
2356         domain = iommu_alloc_domain(iommu);
2357         if (!domain) {
2358                 printk(KERN_ERR
2359                         "intel_iommu_domain_alloc: domain == NULL\n");
2360                 return NULL;
2361         }
2362         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2363                 printk(KERN_ERR
2364                         "intel_iommu_domain_alloc: domain_init() failed\n");
2365                 intel_iommu_domain_exit(domain);
2366                 return NULL;
2367         }
2368         return domain;
2369 }
2370 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2371
2372 int intel_iommu_context_mapping(
2373         struct dmar_domain *domain, struct pci_dev *pdev)
2374 {
2375         int rc;
2376         rc = domain_context_mapping(domain, pdev);
2377         return rc;
2378 }
2379 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2380
2381 int intel_iommu_page_mapping(
2382         struct dmar_domain *domain, dma_addr_t iova,
2383         u64 hpa, size_t size, int prot)
2384 {
2385         int rc;
2386         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2387         return rc;
2388 }
2389 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2390
2391 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2392 {
2393         detach_domain_for_dev(domain, bus, devfn);
2394 }
2395 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2396
2397 struct dmar_domain *
2398 intel_iommu_find_domain(struct pci_dev *pdev)
2399 {
2400         return find_domain(pdev);
2401 }
2402 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2403
2404 int intel_iommu_found(void)
2405 {
2406         return g_num_of_iommus;
2407 }
2408 EXPORT_SYMBOL_GPL(intel_iommu_found);
2409
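/* Walk @domain's page tables and return the page frame backing @iova (0 if unmapped). */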
2410 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2411 {
2412         struct dma_pte *pte;
2413         u64 pfn;
2414
2415         pfn = 0;
2416         pte = addr_to_dma_pte(domain, iova);
2417
2418         if (pte)
2419                 pfn = dma_pte_addr(*pte);
2420
2421         return pfn >> PAGE_SHIFT_4K;
2422 }
2423 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);