[linux-2.6-omap-h63xx.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
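
/*
 * Worked example of the address macros above (illustrative, assuming 4KiB
 * pages, i.e. PAGE_SHIFT == 12): with the default 48-bit domain width,
 * DOMAIN_MAX_ADDR(48) == 0xffffffffffff, and DMA_32BIT_PFN ==
 * IOVA_PFN(0xffffffff) == 0xfffff, the last page frame below 4GiB.
 */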
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 /*
65  * 0: Present
66  * 1-11: Reserved
67  * 12-63: Context Ptr (12 - (haw-1))
68  * 64-127: Reserved
69  */
70 struct root_entry {
71         u64     val;
72         u64     rsvd1;
73 };
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 static inline bool root_present(struct root_entry *root)
76 {
77         return (root->val & 1);
78 }
79 static inline void set_root_present(struct root_entry *root)
80 {
81         root->val |= 1;
82 }
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 {
85         root->val |= value & VTD_PAGE_MASK;
86 }
87
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
90 {
91         return (struct context_entry *)
92                 (root_present(root)?phys_to_virt(
93                 root->val & VTD_PAGE_MASK) :
94                 NULL);
95 }
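
/*
 * Note: with 4KiB tables and 16-byte entries, ROOT_ENTRY_NR is 256, so the
 * root table is indexed directly by PCI bus number and each root entry
 * points to a context table of 256 entries indexed by devfn -- see
 * device_to_context_entry() below.
 */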
96
97 /*
98  * low 64 bits:
99  * 0: present
100  * 1: fault processing disable
101  * 2-3: translation type
102  * 12-63: address space root
103  * high 64 bits:
104  * 0-2: address width
105  * 3-6: avail
106  * 8-23: domain id
107  */
108 struct context_entry {
109         u64 lo;
110         u64 hi;
111 };
112
113 static inline bool context_present(struct context_entry *context)
114 {
115         return (context->lo & 1);
116 }
117 static inline void context_set_present(struct context_entry *context)
118 {
119         context->lo |= 1;
120 }
121
122 static inline void context_set_fault_enable(struct context_entry *context)
123 {
124         context->lo &= (((u64)-1) << 2) | 1;
125 }
126
127 #define CONTEXT_TT_MULTI_LEVEL 0
128
129 static inline void context_set_translation_type(struct context_entry *context,
130                                                 unsigned long value)
131 {
132         context->lo &= (((u64)-1) << 4) | 3;
133         context->lo |= (value & 3) << 2;
134 }
135
136 static inline void context_set_address_root(struct context_entry *context,
137                                             unsigned long value)
138 {
139         context->lo |= value & VTD_PAGE_MASK;
140 }
141
142 static inline void context_set_address_width(struct context_entry *context,
143                                              unsigned long value)
144 {
145         context->hi |= value & 7;
146 }
147
148 static inline void context_set_domain_id(struct context_entry *context,
149                                          unsigned long value)
150 {
151         context->hi |= (value & ((1 << 16) - 1)) << 8;
152 }
153
154 static inline void context_clear_entry(struct context_entry *context)
155 {
156         context->lo = 0;
157         context->hi = 0;
158 }
159
160 /*
161  * 0: readable
162  * 1: writable
163  * 2-6: reserved
164  * 7: super page
165  * 8-11: available
166  * 12-63: Host physical address
167  */
168 struct dma_pte {
169         u64 val;
170 };
171
172 static inline void dma_clear_pte(struct dma_pte *pte)
173 {
174         pte->val = 0;
175 }
176
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 {
179         pte->val |= DMA_PTE_READ;
180 }
181
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 {
184         pte->val |= DMA_PTE_WRITE;
185 }
186
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 {
189         pte->val = (pte->val & ~3) | (prot & 3);
190 }
191
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 {
194         return (pte->val & VTD_PAGE_MASK);
195 }
196
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 {
199         pte->val |= (addr & VTD_PAGE_MASK);
200 }
201
202 static inline bool dma_pte_present(struct dma_pte *pte)
203 {
204         return (pte->val & 3) != 0;
205 }
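
/*
 * Example, per the layout comment above (bit 0 = read, bit 1 = write,
 * bits 12-63 = host physical address): mapping a hypothetical page at
 * 0x12345000 read/write via dma_set_pte_addr()/dma_set_pte_prot() yields
 * pte->val == 0x12345003.
 */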
206
207 /* devices under the same p2p bridge are owned in one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
209
210 /* domain represents a virtual machine; more than one device
211  * across iommus may be owned in one domain, e.g. kvm guest.
212  */
213 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
214
215 struct dmar_domain {
216         int     id;                     /* domain id */
217         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
218
219         struct list_head devices;       /* all devices' list */
220         struct iova_domain iovad;       /* iova's that belong to this domain */
221
222         struct dma_pte  *pgd;           /* virtual address */
223         spinlock_t      mapping_lock;   /* page table lock */
224         int             gaw;            /* max guest address width */
225
226         /* adjusted guest address width, 0 is level 2 30-bit */
227         int             agaw;
228
229         int             flags;          /* flags to find out type of domain */
230
231         int             iommu_coherency;/* indicate coherency of iommu access */
232         int             iommu_count;    /* reference count of iommu */
233         spinlock_t      iommu_lock;     /* protect iommu set in domain */
234         u64             max_addr;       /* maximum mapped address */
235 };
236
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239         struct list_head link;  /* link to domain siblings */
240         struct list_head global; /* link to global list */
241         u8 bus;                 /* PCI bus number */
242         u8 devfn;               /* PCI devfn number */
243         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244         struct dmar_domain *domain; /* pointer to domain */
245 };
246
247 static void flush_unmaps_timeout(unsigned long data);
248
249 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
250
251 #define HIGH_WATER_MARK 250
252 struct deferred_flush_tables {
253         int next;
254         struct iova *iova[HIGH_WATER_MARK];
255         struct dmar_domain *domain[HIGH_WATER_MARK];
256 };
257
258 static struct deferred_flush_tables *deferred_flush;
259
260 /* number of iommus, used to size g_iommus and the per-domain iommu bitmaps */
261 static int g_num_of_iommus;
262
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
265
266 static int timer_on;
267 static long list_size;
268
269 static void domain_remove_dev_info(struct dmar_domain *domain);
270
271 #ifdef CONFIG_DMAR_DEFAULT_ON
272 int dmar_disabled = 0;
273 #else
274 int dmar_disabled = 1;
275 #endif /*CONFIG_DMAR_DEFAULT_ON*/
276
277 static int __initdata dmar_map_gfx = 1;
278 static int dmar_forcedac;
279 static int intel_iommu_strict;
280
281 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
282 static DEFINE_SPINLOCK(device_domain_lock);
283 static LIST_HEAD(device_domain_list);
284
285 static struct iommu_ops intel_iommu_ops;
286
287 static int __init intel_iommu_setup(char *str)
288 {
289         if (!str)
290                 return -EINVAL;
291         while (*str) {
292                 if (!strncmp(str, "on", 2)) {
293                         dmar_disabled = 0;
294                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
295                 } else if (!strncmp(str, "off", 3)) {
296                         dmar_disabled = 1;
297                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
298                 } else if (!strncmp(str, "igfx_off", 8)) {
299                         dmar_map_gfx = 0;
300                         printk(KERN_INFO
301                                 "Intel-IOMMU: disable GFX device mapping\n");
302                 } else if (!strncmp(str, "forcedac", 8)) {
303                         printk(KERN_INFO
304                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
305                         dmar_forcedac = 1;
306                 } else if (!strncmp(str, "strict", 6)) {
307                         printk(KERN_INFO
308                                 "Intel-IOMMU: disable batched IOTLB flush\n");
309                         intel_iommu_strict = 1;
310                 }
311
312                 str += strcspn(str, ",");
313                 while (*str == ',')
314                         str++;
315         }
316         return 0;
317 }
318 __setup("intel_iommu=", intel_iommu_setup);
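
/*
 * Boot-time usage example: options may be combined with commas, e.g.
 * "intel_iommu=on,igfx_off,strict" enables the IOMMU, disables GFX device
 * mapping and disables batched (deferred) IOTLB flushing.
 */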
319
320 static struct kmem_cache *iommu_domain_cache;
321 static struct kmem_cache *iommu_devinfo_cache;
322 static struct kmem_cache *iommu_iova_cache;
323
324 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
325 {
326         unsigned int flags;
327         void *vaddr;
328
329         /* trying to avoid low memory issues */
330         flags = current->flags & PF_MEMALLOC;
331         current->flags |= PF_MEMALLOC;
332         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
333         current->flags &= (~PF_MEMALLOC | flags);
334         return vaddr;
335 }
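
/*
 * Note on the PF_MEMALLOC handling above: "flags" holds the caller's
 * original PF_MEMALLOC bit, so the mask (~PF_MEMALLOC | flags) clears the
 * bit only if it was not already set before the allocation.  The same
 * pattern is used in alloc_pgtable_page() below.
 */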
336
337
338 static inline void *alloc_pgtable_page(void)
339 {
340         unsigned int flags;
341         void *vaddr;
342
343         /* trying to avoid low memory issues */
344         flags = current->flags & PF_MEMALLOC;
345         current->flags |= PF_MEMALLOC;
346         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
347         current->flags &= (~PF_MEMALLOC | flags);
348         return vaddr;
349 }
350
351 static inline void free_pgtable_page(void *vaddr)
352 {
353         free_page((unsigned long)vaddr);
354 }
355
356 static inline void *alloc_domain_mem(void)
357 {
358         return iommu_kmem_cache_alloc(iommu_domain_cache);
359 }
360
361 static void free_domain_mem(void *vaddr)
362 {
363         kmem_cache_free(iommu_domain_cache, vaddr);
364 }
365
366 static inline void * alloc_devinfo_mem(void)
367 {
368         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
369 }
370
371 static inline void free_devinfo_mem(void *vaddr)
372 {
373         kmem_cache_free(iommu_devinfo_cache, vaddr);
374 }
375
376 struct iova *alloc_iova_mem(void)
377 {
378         return iommu_kmem_cache_alloc(iommu_iova_cache);
379 }
380
381 void free_iova_mem(struct iova *iova)
382 {
383         kmem_cache_free(iommu_iova_cache, iova);
384 }
385
386
387 static inline int width_to_agaw(int width);
388
389 /* calculate agaw for each iommu.
390  * "SAGAW" may be different across iommus: start from the default agaw and
391  * fall back to a smaller supported agaw for iommus that don't support the default.
392  */
393 int iommu_calculate_agaw(struct intel_iommu *iommu)
394 {
395         unsigned long sagaw;
396         int agaw = -1;
397
398         sagaw = cap_sagaw(iommu->cap);
399         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
400              agaw >= 0; agaw--) {
401                 if (test_bit(agaw, &sagaw))
402                         break;
403         }
404
405         return agaw;
406 }
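
/*
 * Worked example: DEFAULT_DOMAIN_ADDRESS_WIDTH (48) corresponds to agaw 2;
 * if bit 2 of the iommu's SAGAW capability is set that value is used,
 * otherwise the loop above falls back to the next smaller supported agaw
 * (e.g. agaw 1, a 39-bit width).
 */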
407
408 /* in the native (non-VM) case, each domain is attached to exactly one iommu */
409 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
410 {
411         int iommu_id;
412
413         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
414
415         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
416         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
417                 return NULL;
418
419         return g_iommus[iommu_id];
420 }
421
422 /* "Coherency" capability may be different across iommus */
423 static void domain_update_iommu_coherency(struct dmar_domain *domain)
424 {
425         int i;
426
427         domain->iommu_coherency = 1;
428
429         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
430         for (; i < g_num_of_iommus; ) {
431                 if (!ecap_coherent(g_iommus[i]->ecap)) {
432                         domain->iommu_coherency = 0;
433                         break;
434                 }
435                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
436         }
437 }
438
439 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
440 {
441         struct dmar_drhd_unit *drhd = NULL;
442         int i;
443
444         for_each_drhd_unit(drhd) {
445                 if (drhd->ignored)
446                         continue;
447
448                 for (i = 0; i < drhd->devices_cnt; i++)
449                         if (drhd->devices[i] &&
450                             drhd->devices[i]->bus->number == bus &&
451                             drhd->devices[i]->devfn == devfn)
452                                 return drhd->iommu;
453
454                 if (drhd->include_all)
455                         return drhd->iommu;
456         }
457
458         return NULL;
459 }
460
461 static void domain_flush_cache(struct dmar_domain *domain,
462                                void *addr, int size)
463 {
464         if (!domain->iommu_coherency)
465                 clflush_cache_range(addr, size);
466 }
467
468 /* Gets context entry for a given bus and devfn */
469 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
470                 u8 bus, u8 devfn)
471 {
472         struct root_entry *root;
473         struct context_entry *context;
474         unsigned long phy_addr;
475         unsigned long flags;
476
477         spin_lock_irqsave(&iommu->lock, flags);
478         root = &iommu->root_entry[bus];
479         context = get_context_addr_from_root(root);
480         if (!context) {
481                 context = (struct context_entry *)alloc_pgtable_page();
482                 if (!context) {
483                         spin_unlock_irqrestore(&iommu->lock, flags);
484                         return NULL;
485                 }
486                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
487                 phy_addr = virt_to_phys((void *)context);
488                 set_root_value(root, phy_addr);
489                 set_root_present(root);
490                 __iommu_flush_cache(iommu, root, sizeof(*root));
491         }
492         spin_unlock_irqrestore(&iommu->lock, flags);
493         return &context[devfn];
494 }
495
496 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
497 {
498         struct root_entry *root;
499         struct context_entry *context;
500         int ret;
501         unsigned long flags;
502
503         spin_lock_irqsave(&iommu->lock, flags);
504         root = &iommu->root_entry[bus];
505         context = get_context_addr_from_root(root);
506         if (!context) {
507                 ret = 0;
508                 goto out;
509         }
510         ret = context_present(&context[devfn]);
511 out:
512         spin_unlock_irqrestore(&iommu->lock, flags);
513         return ret;
514 }
515
516 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
517 {
518         struct root_entry *root;
519         struct context_entry *context;
520         unsigned long flags;
521
522         spin_lock_irqsave(&iommu->lock, flags);
523         root = &iommu->root_entry[bus];
524         context = get_context_addr_from_root(root);
525         if (context) {
526                 context_clear_entry(&context[devfn]);
527                 __iommu_flush_cache(iommu, &context[devfn],
528                         sizeof(*context));
529         }
530         spin_unlock_irqrestore(&iommu->lock, flags);
531 }
532
533 static void free_context_table(struct intel_iommu *iommu)
534 {
535         struct root_entry *root;
536         int i;
537         unsigned long flags;
538         struct context_entry *context;
539
540         spin_lock_irqsave(&iommu->lock, flags);
541         if (!iommu->root_entry) {
542                 goto out;
543         }
544         for (i = 0; i < ROOT_ENTRY_NR; i++) {
545                 root = &iommu->root_entry[i];
546                 context = get_context_addr_from_root(root);
547                 if (context)
548                         free_pgtable_page(context);
549         }
550         free_pgtable_page(iommu->root_entry);
551         iommu->root_entry = NULL;
552 out:
553         spin_unlock_irqrestore(&iommu->lock, flags);
554 }
555
556 /* page table handling */
557 #define LEVEL_STRIDE            (9)
558 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
559
560 static inline int agaw_to_level(int agaw)
561 {
562         return agaw + 2;
563 }
564
565 static inline int agaw_to_width(int agaw)
566 {
567         return 30 + agaw * LEVEL_STRIDE;
568
569 }
570
571 static inline int width_to_agaw(int width)
572 {
573         return (width - 30) / LEVEL_STRIDE;
574 }
575
576 static inline unsigned int level_to_offset_bits(int level)
577 {
578         return (12 + (level - 1) * LEVEL_STRIDE);
579 }
580
581 static inline int address_level_offset(u64 addr, int level)
582 {
583         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
584 }
585
586 static inline u64 level_mask(int level)
587 {
588         return ((u64)-1 << level_to_offset_bits(level));
589 }
590
591 static inline u64 level_size(int level)
592 {
593         return ((u64)1 << level_to_offset_bits(level));
594 }
595
596 static inline u64 align_to_level(u64 addr, int level)
597 {
598         return ((addr + level_size(level) - 1) & level_mask(level));
599 }
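
/*
 * Worked example of the level arithmetic above for the default 48-bit
 * width (agaw 2, so agaw_to_level() == 4, i.e. a 4-level page table):
 *
 *   level_to_offset_bits(4) == 39   address bits 47..39 index level 4
 *   level_to_offset_bits(3) == 30   address bits 38..30 index level 3
 *   level_to_offset_bits(2) == 21   address bits 29..21 index level 2
 *   level_to_offset_bits(1) == 12   address bits 20..12 index level 1
 *
 * Each level consumes LEVEL_STRIDE (9) bits, i.e. 512 entries per table.
 */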
600
601 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
602 {
603         int addr_width = agaw_to_width(domain->agaw);
604         struct dma_pte *parent, *pte = NULL;
605         int level = agaw_to_level(domain->agaw);
606         int offset;
607         unsigned long flags;
608
609         BUG_ON(!domain->pgd);
610
611         addr &= (((u64)1) << addr_width) - 1;
612         parent = domain->pgd;
613
614         spin_lock_irqsave(&domain->mapping_lock, flags);
615         while (level > 0) {
616                 void *tmp_page;
617
618                 offset = address_level_offset(addr, level);
619                 pte = &parent[offset];
620                 if (level == 1)
621                         break;
622
623                 if (!dma_pte_present(pte)) {
624                         tmp_page = alloc_pgtable_page();
625
626                         if (!tmp_page) {
627                                 spin_unlock_irqrestore(&domain->mapping_lock,
628                                         flags);
629                                 return NULL;
630                         }
631                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
632                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
633                         /*
634                          * higher level tables always set r/w; the last
635                          * level page table controls read/write
636                          */
637                         dma_set_pte_readable(pte);
638                         dma_set_pte_writable(pte);
639                         domain_flush_cache(domain, pte, sizeof(*pte));
640                 }
641                 parent = phys_to_virt(dma_pte_addr(pte));
642                 level--;
643         }
644
645         spin_unlock_irqrestore(&domain->mapping_lock, flags);
646         return pte;
647 }
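
/*
 * In short: addr_to_dma_pte() walks one table per level down to level 1,
 * allocating missing intermediate tables on the way, and returns the
 * last-level pte for "addr".  Intermediate entries are installed with both
 * read and write set, so permissions are controlled by the leaf pte only.
 */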
648
649 /* return address's pte at specific level */
650 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
651                 int level)
652 {
653         struct dma_pte *parent, *pte = NULL;
654         int total = agaw_to_level(domain->agaw);
655         int offset;
656
657         parent = domain->pgd;
658         while (level <= total) {
659                 offset = address_level_offset(addr, total);
660                 pte = &parent[offset];
661                 if (level == total)
662                         return pte;
663
664                 if (!dma_pte_present(pte))
665                         break;
666                 parent = phys_to_virt(dma_pte_addr(pte));
667                 total--;
668         }
669         return NULL;
670 }
671
672 /* clear one page's page table */
673 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
674 {
675         struct dma_pte *pte = NULL;
676
677         /* get last level pte */
678         pte = dma_addr_level_pte(domain, addr, 1);
679
680         if (pte) {
681                 dma_clear_pte(pte);
682                 domain_flush_cache(domain, pte, sizeof(*pte));
683         }
684 }
685
686 /* clear last level pte, a tlb flush should follow */
687 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
688 {
689         int addr_width = agaw_to_width(domain->agaw);
690
691         start &= (((u64)1) << addr_width) - 1;
692         end &= (((u64)1) << addr_width) - 1;
693         /* in case it's partial page */
694         start = PAGE_ALIGN(start);
695         end &= PAGE_MASK;
696
697         /* we don't need lock here, nobody else touches the iova range */
698         while (start < end) {
699                 dma_pte_clear_one(domain, start);
700                 start += VTD_PAGE_SIZE;
701         }
702 }
703
704 /* free page table pages. last level pte should already be cleared */
705 static void dma_pte_free_pagetable(struct dmar_domain *domain,
706         u64 start, u64 end)
707 {
708         int addr_width = agaw_to_width(domain->agaw);
709         struct dma_pte *pte;
710         int total = agaw_to_level(domain->agaw);
711         int level;
712         u64 tmp;
713
714         start &= (((u64)1) << addr_width) - 1;
715         end &= (((u64)1) << addr_width) - 1;
716
717         /* we don't need lock here, nobody else touches the iova range */
718         level = 2;
719         while (level <= total) {
720                 tmp = align_to_level(start, level);
721                 if (tmp >= end || (tmp + level_size(level) > end))
722                         return;
723
724                 while (tmp < end) {
725                         pte = dma_addr_level_pte(domain, tmp, level);
726                         if (pte) {
727                                 free_pgtable_page(
728                                         phys_to_virt(dma_pte_addr(pte)));
729                                 dma_clear_pte(pte);
730                                 domain_flush_cache(domain, pte, sizeof(*pte));
731                         }
732                         tmp += level_size(level);
733                 }
734                 level++;
735         }
736         /* free pgd */
737         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
738                 free_pgtable_page(domain->pgd);
739                 domain->pgd = NULL;
740         }
741 }
742
743 /* iommu handling */
744 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
745 {
746         struct root_entry *root;
747         unsigned long flags;
748
749         root = (struct root_entry *)alloc_pgtable_page();
750         if (!root)
751                 return -ENOMEM;
752
753         __iommu_flush_cache(iommu, root, ROOT_SIZE);
754
755         spin_lock_irqsave(&iommu->lock, flags);
756         iommu->root_entry = root;
757         spin_unlock_irqrestore(&iommu->lock, flags);
758
759         return 0;
760 }
761
762 static void iommu_set_root_entry(struct intel_iommu *iommu)
763 {
764         void *addr;
765         u32 cmd, sts;
766         unsigned long flag;
767
768         addr = iommu->root_entry;
769
770         spin_lock_irqsave(&iommu->register_lock, flag);
771         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
772
773         cmd = iommu->gcmd | DMA_GCMD_SRTP;
774         writel(cmd, iommu->reg + DMAR_GCMD_REG);
775
776         /* Make sure hardware completes it */
777         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
778                 readl, (sts & DMA_GSTS_RTPS), sts);
779
780         spin_unlock_irqrestore(&iommu->register_lock, flag);
781 }
782
783 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
784 {
785         u32 val;
786         unsigned long flag;
787
788         if (!cap_rwbf(iommu->cap))
789                 return;
790         val = iommu->gcmd | DMA_GCMD_WBF;
791
792         spin_lock_irqsave(&iommu->register_lock, flag);
793         writel(val, iommu->reg + DMAR_GCMD_REG);
794
795         /* Make sure hardware completes it */
796         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
797                         readl, (!(val & DMA_GSTS_WBFS)), val);
798
799         spin_unlock_irqrestore(&iommu->register_lock, flag);
800 }
801
802 /* return value determines whether we need a write buffer flush */
803 static int __iommu_flush_context(struct intel_iommu *iommu,
804         u16 did, u16 source_id, u8 function_mask, u64 type,
805         int non_present_entry_flush)
806 {
807         u64 val = 0;
808         unsigned long flag;
809
810         /*
811          * In the non-present entry flush case, if hardware doesn't cache
812          * non-present entries we do nothing, and if hardware does cache
813          * non-present entries, we flush the entries of domain 0 (that
814          * domain id is used to cache any non-present entries)
815          */
816         if (non_present_entry_flush) {
817                 if (!cap_caching_mode(iommu->cap))
818                         return 1;
819                 else
820                         did = 0;
821         }
822
823         switch (type) {
824         case DMA_CCMD_GLOBAL_INVL:
825                 val = DMA_CCMD_GLOBAL_INVL;
826                 break;
827         case DMA_CCMD_DOMAIN_INVL:
828                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
829                 break;
830         case DMA_CCMD_DEVICE_INVL:
831                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
832                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
833                 break;
834         default:
835                 BUG();
836         }
837         val |= DMA_CCMD_ICC;
838
839         spin_lock_irqsave(&iommu->register_lock, flag);
840         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
841
842         /* Make sure hardware completes it */
843         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
844                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
845
846         spin_unlock_irqrestore(&iommu->register_lock, flag);
847
848         /* flush context entry will implicitly flush write buffer */
849         return 0;
850 }
851
852 /* return value determines whether we need a write buffer flush */
853 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
854         u64 addr, unsigned int size_order, u64 type,
855         int non_present_entry_flush)
856 {
857         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
858         u64 val = 0, val_iva = 0;
859         unsigned long flag;
860
861         /*
862          * In the non-present entry flush case, if hardware doesn't cache
863          * non-present entries we do nothing, and if hardware does cache
864          * non-present entries, we flush the entries of domain 0 (that
865          * domain id is used to cache any non-present entries)
866          */
867         if (non_present_entry_flush) {
868                 if (!cap_caching_mode(iommu->cap))
869                         return 1;
870                 else
871                         did = 0;
872         }
873
874         switch (type) {
875         case DMA_TLB_GLOBAL_FLUSH:
876                 /* global flush doesn't need to set IVA_REG */
877                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
878                 break;
879         case DMA_TLB_DSI_FLUSH:
880                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
881                 break;
882         case DMA_TLB_PSI_FLUSH:
883                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
884                 /* Note: always flush non-leaf currently */
885                 val_iva = size_order | addr;
886                 break;
887         default:
888                 BUG();
889         }
890         /* Note: set drain read/write */
891 #if 0
892         /*
893          * This is probably just to be extra safe.  Looks like we can
894          * ignore it without any impact.
895          */
896         if (cap_read_drain(iommu->cap))
897                 val |= DMA_TLB_READ_DRAIN;
898 #endif
899         if (cap_write_drain(iommu->cap))
900                 val |= DMA_TLB_WRITE_DRAIN;
901
902         spin_lock_irqsave(&iommu->register_lock, flag);
903         /* Note: Only uses first TLB reg currently */
904         if (val_iva)
905                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
906         dmar_writeq(iommu->reg + tlb_offset + 8, val);
907
908         /* Make sure hardware completes it */
909         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
910                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
911
912         spin_unlock_irqrestore(&iommu->register_lock, flag);
913
914         /* check IOTLB invalidation granularity */
915         if (DMA_TLB_IAIG(val) == 0)
916                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
917         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
918                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
919                         (unsigned long long)DMA_TLB_IIRG(type),
920                         (unsigned long long)DMA_TLB_IAIG(val));
921         /* flush iotlb entry will implicitly flush write buffer */
922         return 0;
923 }
924
925 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
926         u64 addr, unsigned int pages, int non_present_entry_flush)
927 {
928         unsigned int mask;
929
930         BUG_ON(addr & (~VTD_PAGE_MASK));
931         BUG_ON(pages == 0);
932
933         /* Fallback to domain selective flush if no PSI support */
934         if (!cap_pgsel_inv(iommu->cap))
935                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
936                                                 DMA_TLB_DSI_FLUSH,
937                                                 non_present_entry_flush);
938
939         /*
940          * PSI requires page size to be 2 ^ x, and the base address is naturally
941          * aligned to the size
942          */
943         mask = ilog2(__roundup_pow_of_two(pages));
944         /* Fallback to domain selective flush if size is too big */
945         if (mask > cap_max_amask_val(iommu->cap))
946                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
947                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
948
949         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
950                                         DMA_TLB_PSI_FLUSH,
951                                         non_present_entry_flush);
952 }
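
/*
 * Mask example: invalidating 5 pages gives
 * ilog2(__roundup_pow_of_two(5)) == 3, so an aligned 8-page region is
 * flushed; if the computed order exceeds the hardware's maximum address
 * mask (cap_max_amask_val()), the code above falls back to a
 * domain-selective flush instead.
 */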
953
954 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
955 {
956         u32 pmen;
957         unsigned long flags;
958
959         spin_lock_irqsave(&iommu->register_lock, flags);
960         pmen = readl(iommu->reg + DMAR_PMEN_REG);
961         pmen &= ~DMA_PMEN_EPM;
962         writel(pmen, iommu->reg + DMAR_PMEN_REG);
963
964         /* wait for the protected region status bit to clear */
965         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
966                 readl, !(pmen & DMA_PMEN_PRS), pmen);
967
968         spin_unlock_irqrestore(&iommu->register_lock, flags);
969 }
970
971 static int iommu_enable_translation(struct intel_iommu *iommu)
972 {
973         u32 sts;
974         unsigned long flags;
975
976         spin_lock_irqsave(&iommu->register_lock, flags);
977         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
978
979         /* Make sure hardware completes it */
980         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
981                 readl, (sts & DMA_GSTS_TES), sts);
982
983         iommu->gcmd |= DMA_GCMD_TE;
984         spin_unlock_irqrestore(&iommu->register_lock, flags);
985         return 0;
986 }
987
988 static int iommu_disable_translation(struct intel_iommu *iommu)
989 {
990         u32 sts;
991         unsigned long flag;
992
993         spin_lock_irqsave(&iommu->register_lock, flag);
994         iommu->gcmd &= ~DMA_GCMD_TE;
995         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
996
997         /* Make sure hardware completes it */
998         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
999                 readl, (!(sts & DMA_GSTS_TES)), sts);
1000
1001         spin_unlock_irqrestore(&iommu->register_lock, flag);
1002         return 0;
1003 }
1004
1005 /* iommu interrupt handling. Most of it is MSI-like. */
1006
1007 static const char *fault_reason_strings[] =
1008 {
1009         "Software",
1010         "Present bit in root entry is clear",
1011         "Present bit in context entry is clear",
1012         "Invalid context entry",
1013         "Access beyond MGAW",
1014         "PTE Write access is not set",
1015         "PTE Read access is not set",
1016         "Next page table ptr is invalid",
1017         "Root table address invalid",
1018         "Context table ptr is invalid",
1019         "non-zero reserved fields in RTP",
1020         "non-zero reserved fields in CTP",
1021         "non-zero reserved fields in PTE",
1022 };
1023 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1024
1025 const char *dmar_get_fault_reason(u8 fault_reason)
1026 {
1027         if (fault_reason > MAX_FAULT_REASON_IDX)
1028                 return "Unknown";
1029         else
1030                 return fault_reason_strings[fault_reason];
1031 }
1032
1033 void dmar_msi_unmask(unsigned int irq)
1034 {
1035         struct intel_iommu *iommu = get_irq_data(irq);
1036         unsigned long flag;
1037
1038         /* unmask it */
1039         spin_lock_irqsave(&iommu->register_lock, flag);
1040         writel(0, iommu->reg + DMAR_FECTL_REG);
1041         /* Read a reg to force-flush the posted write */
1042         readl(iommu->reg + DMAR_FECTL_REG);
1043         spin_unlock_irqrestore(&iommu->register_lock, flag);
1044 }
1045
1046 void dmar_msi_mask(unsigned int irq)
1047 {
1048         unsigned long flag;
1049         struct intel_iommu *iommu = get_irq_data(irq);
1050
1051         /* mask it */
1052         spin_lock_irqsave(&iommu->register_lock, flag);
1053         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1054         /* Read a reg to force-flush the posted write */
1055         readl(iommu->reg + DMAR_FECTL_REG);
1056         spin_unlock_irqrestore(&iommu->register_lock, flag);
1057 }
1058
1059 void dmar_msi_write(int irq, struct msi_msg *msg)
1060 {
1061         struct intel_iommu *iommu = get_irq_data(irq);
1062         unsigned long flag;
1063
1064         spin_lock_irqsave(&iommu->register_lock, flag);
1065         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1066         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1067         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1068         spin_unlock_irqrestore(&iommu->register_lock, flag);
1069 }
1070
1071 void dmar_msi_read(int irq, struct msi_msg *msg)
1072 {
1073         struct intel_iommu *iommu = get_irq_data(irq);
1074         unsigned long flag;
1075
1076         spin_lock_irqsave(&iommu->register_lock, flag);
1077         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1078         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1079         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1080         spin_unlock_irqrestore(&iommu->register_lock, flag);
1081 }
1082
1083 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1084                 u8 fault_reason, u16 source_id, unsigned long long addr)
1085 {
1086         const char *reason;
1087
1088         reason = dmar_get_fault_reason(fault_reason);
1089
1090         printk(KERN_ERR
1091                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1092                 "fault addr %llx \n"
1093                 "DMAR:[fault reason %02d] %s\n",
1094                 (type ? "DMA Read" : "DMA Write"),
1095                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1096                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1097         return 0;
1098 }
1099
1100 #define PRIMARY_FAULT_REG_LEN (16)
1101 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1102 {
1103         struct intel_iommu *iommu = dev_id;
1104         int reg, fault_index;
1105         u32 fault_status;
1106         unsigned long flag;
1107
1108         spin_lock_irqsave(&iommu->register_lock, flag);
1109         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1110
1111         /* TBD: ignore advanced fault log currently */
1112         if (!(fault_status & DMA_FSTS_PPF))
1113                 goto clear_overflow;
1114
1115         fault_index = dma_fsts_fault_record_index(fault_status);
1116         reg = cap_fault_reg_offset(iommu->cap);
1117         while (1) {
1118                 u8 fault_reason;
1119                 u16 source_id;
1120                 u64 guest_addr;
1121                 int type;
1122                 u32 data;
1123
1124                 /* highest 32 bits */
1125                 data = readl(iommu->reg + reg +
1126                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1127                 if (!(data & DMA_FRCD_F))
1128                         break;
1129
1130                 fault_reason = dma_frcd_fault_reason(data);
1131                 type = dma_frcd_type(data);
1132
1133                 data = readl(iommu->reg + reg +
1134                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1135                 source_id = dma_frcd_source_id(data);
1136
1137                 guest_addr = dmar_readq(iommu->reg + reg +
1138                                 fault_index * PRIMARY_FAULT_REG_LEN);
1139                 guest_addr = dma_frcd_page_addr(guest_addr);
1140                 /* clear the fault */
1141                 writel(DMA_FRCD_F, iommu->reg + reg +
1142                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1143
1144                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1145
1146                 iommu_page_fault_do_one(iommu, type, fault_reason,
1147                                 source_id, guest_addr);
1148
1149                 fault_index++;
1150                 if (fault_index > cap_num_fault_regs(iommu->cap))
1151                         fault_index = 0;
1152                 spin_lock_irqsave(&iommu->register_lock, flag);
1153         }
1154 clear_overflow:
1155         /* clear primary fault overflow */
1156         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1157         if (fault_status & DMA_FSTS_PFO)
1158                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1159
1160         spin_unlock_irqrestore(&iommu->register_lock, flag);
1161         return IRQ_HANDLED;
1162 }
1163
1164 int dmar_set_interrupt(struct intel_iommu *iommu)
1165 {
1166         int irq, ret;
1167
1168         irq = create_irq();
1169         if (!irq) {
1170                 printk(KERN_ERR "IOMMU: no free vectors\n");
1171                 return -EINVAL;
1172         }
1173
1174         set_irq_data(irq, iommu);
1175         iommu->irq = irq;
1176
1177         ret = arch_setup_dmar_msi(irq);
1178         if (ret) {
1179                 set_irq_data(irq, NULL);
1180                 iommu->irq = 0;
1181                 destroy_irq(irq);
1182                 return 0;
1183         }
1184
1185         /* Make sure any pending faults are cleared */
1186         iommu_page_fault(irq, iommu);
1187
1188         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1189         if (ret)
1190                 printk(KERN_ERR "IOMMU: can't request irq\n");
1191         return ret;
1192 }
1193
1194 static int iommu_init_domains(struct intel_iommu *iommu)
1195 {
1196         unsigned long ndomains;
1197         unsigned long nlongs;
1198
1199         ndomains = cap_ndoms(iommu->cap);
1200         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1201         nlongs = BITS_TO_LONGS(ndomains);
1202
1203         /* TBD: there might be 64K domains,
1204          * consider other allocation for future chip
1205          */
1206         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1207         if (!iommu->domain_ids) {
1208                 printk(KERN_ERR "Allocating domain id array failed\n");
1209                 return -ENOMEM;
1210         }
1211         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1212                         GFP_KERNEL);
1213         if (!iommu->domains) {
1214                 printk(KERN_ERR "Allocating domain array failed\n");
1215                 kfree(iommu->domain_ids);
1216                 return -ENOMEM;
1217         }
1218
1219         spin_lock_init(&iommu->lock);
1220
1221         /*
1222          * if Caching mode is set, then invalid translations are tagged
1223          * with domain id 0. Hence we need to pre-allocate it.
1224          */
1225         if (cap_caching_mode(iommu->cap))
1226                 set_bit(0, iommu->domain_ids);
1227         return 0;
1228 }
1229
1230
1231 static void domain_exit(struct dmar_domain *domain);
1232 static void vm_domain_exit(struct dmar_domain *domain);
1233
1234 void free_dmar_iommu(struct intel_iommu *iommu)
1235 {
1236         struct dmar_domain *domain;
1237         int i;
1238         unsigned long flags;
1239
1240         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1241         for (; i < cap_ndoms(iommu->cap); ) {
1242                 domain = iommu->domains[i];
1243                 clear_bit(i, iommu->domain_ids);
1244
1245                 spin_lock_irqsave(&domain->iommu_lock, flags);
1246                 if (--domain->iommu_count == 0) {
1247                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1248                                 vm_domain_exit(domain);
1249                         else
1250                                 domain_exit(domain);
1251                 }
1252                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1253
1254                 i = find_next_bit(iommu->domain_ids,
1255                         cap_ndoms(iommu->cap), i+1);
1256         }
1257
1258         if (iommu->gcmd & DMA_GCMD_TE)
1259                 iommu_disable_translation(iommu);
1260
1261         if (iommu->irq) {
1262                 set_irq_data(iommu->irq, NULL);
1263                 /* This will mask the irq */
1264                 free_irq(iommu->irq, iommu);
1265                 destroy_irq(iommu->irq);
1266         }
1267
1268         kfree(iommu->domains);
1269         kfree(iommu->domain_ids);
1270
1271         g_iommus[iommu->seq_id] = NULL;
1272
1273         /* if all iommus are freed, free g_iommus */
1274         for (i = 0; i < g_num_of_iommus; i++) {
1275                 if (g_iommus[i])
1276                         break;
1277         }
1278
1279         if (i == g_num_of_iommus)
1280                 kfree(g_iommus);
1281
1282         /* free context mapping */
1283         free_context_table(iommu);
1284 }
1285
1286 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1287 {
1288         unsigned long num;
1289         unsigned long ndomains;
1290         struct dmar_domain *domain;
1291         unsigned long flags;
1292
1293         domain = alloc_domain_mem();
1294         if (!domain)
1295                 return NULL;
1296
1297         ndomains = cap_ndoms(iommu->cap);
1298
1299         spin_lock_irqsave(&iommu->lock, flags);
1300         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1301         if (num >= ndomains) {
1302                 spin_unlock_irqrestore(&iommu->lock, flags);
1303                 free_domain_mem(domain);
1304                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1305                 return NULL;
1306         }
1307
1308         set_bit(num, iommu->domain_ids);
1309         domain->id = num;
1310         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1311         set_bit(iommu->seq_id, &domain->iommu_bmp);
1312         domain->flags = 0;
1313         iommu->domains[num] = domain;
1314         spin_unlock_irqrestore(&iommu->lock, flags);
1315
1316         return domain;
1317 }
1318
1319 static void iommu_free_domain(struct dmar_domain *domain)
1320 {
1321         unsigned long flags;
1322         struct intel_iommu *iommu;
1323
1324         iommu = domain_get_iommu(domain);
1325
1326         spin_lock_irqsave(&iommu->lock, flags);
1327         clear_bit(domain->id, iommu->domain_ids);
1328         spin_unlock_irqrestore(&iommu->lock, flags);
1329 }
1330
1331 static struct iova_domain reserved_iova_list;
1332 static struct lock_class_key reserved_alloc_key;
1333 static struct lock_class_key reserved_rbtree_key;
1334
1335 static void dmar_init_reserved_ranges(void)
1336 {
1337         struct pci_dev *pdev = NULL;
1338         struct iova *iova;
1339         int i;
1340         u64 addr, size;
1341
1342         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1343
1344         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1345                 &reserved_alloc_key);
1346         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1347                 &reserved_rbtree_key);
1348
1349         /* IOAPIC ranges shouldn't be accessed by DMA */
1350         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1351                 IOVA_PFN(IOAPIC_RANGE_END));
1352         if (!iova)
1353                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1354
1355         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1356         for_each_pci_dev(pdev) {
1357                 struct resource *r;
1358
1359                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1360                         r = &pdev->resource[i];
1361                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1362                                 continue;
1363                         addr = r->start;
1364                         addr &= PAGE_MASK;
1365                         size = r->end - addr;
1366                         size = PAGE_ALIGN(size);
1367                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1368                                 IOVA_PFN(size + addr) - 1);
1369                         if (!iova)
1370                                 printk(KERN_ERR "Reserve iova failed\n");
1371                 }
1372         }
1373
1374 }
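
/*
 * The ranges reserved here are copied into every new domain by
 * domain_reserve_special_ranges() below, so the IOAPIC range at
 * 0xfee00000-0xfeefffff and all PCI MMIO BARs can never be handed out as
 * IOVA space for DMA mappings.
 */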
1375
1376 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1377 {
1378         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1379 }
1380
1381 static inline int guestwidth_to_adjustwidth(int gaw)
1382 {
1383         int agaw;
1384         int r = (gaw - 12) % 9;
1385
1386         if (r == 0)
1387                 agaw = gaw;
1388         else
1389                 agaw = gaw + 9 - r;
1390         if (agaw > 64)
1391                 agaw = 64;
1392         return agaw;
1393 }
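
/*
 * Examples: a guest width of 39 or 48 is already of the form 12 + n*9 and
 * is returned unchanged, while e.g. 32 is rounded up to 39 so that the
 * width can be covered by a whole number of 9-bit page-table levels.
 */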
1394
1395 static int domain_init(struct dmar_domain *domain, int guest_width)
1396 {
1397         struct intel_iommu *iommu;
1398         int adjust_width, agaw;
1399         unsigned long sagaw;
1400
1401         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1402         spin_lock_init(&domain->mapping_lock);
1403         spin_lock_init(&domain->iommu_lock);
1404
1405         domain_reserve_special_ranges(domain);
1406
1407         /* calculate AGAW */
1408         iommu = domain_get_iommu(domain);
1409         if (guest_width > cap_mgaw(iommu->cap))
1410                 guest_width = cap_mgaw(iommu->cap);
1411         domain->gaw = guest_width;
1412         adjust_width = guestwidth_to_adjustwidth(guest_width);
1413         agaw = width_to_agaw(adjust_width);
1414         sagaw = cap_sagaw(iommu->cap);
1415         if (!test_bit(agaw, &sagaw)) {
1416                 /* hardware doesn't support it, choose a bigger one */
1417                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1418                 agaw = find_next_bit(&sagaw, 5, agaw);
1419                 if (agaw >= 5)
1420                         return -ENODEV;
1421         }
1422         domain->agaw = agaw;
1423         INIT_LIST_HEAD(&domain->devices);
1424
1425         if (ecap_coherent(iommu->ecap))
1426                 domain->iommu_coherency = 1;
1427         else
1428                 domain->iommu_coherency = 0;
1429
1430         domain->iommu_count = 1;
1431
1432         /* always allocate the top pgd */
1433         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1434         if (!domain->pgd)
1435                 return -ENOMEM;
1436         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1437         return 0;
1438 }
1439
1440 static void domain_exit(struct dmar_domain *domain)
1441 {
1442         u64 end;
1443
1444         /* Domain 0 is reserved, so don't process it */
1445         if (!domain)
1446                 return;
1447
1448         domain_remove_dev_info(domain);
1449         /* destroy iovas */
1450         put_iova_domain(&domain->iovad);
1451         end = DOMAIN_MAX_ADDR(domain->gaw);
1452         end = end & (~PAGE_MASK);
1453
1454         /* clear ptes */
1455         dma_pte_clear_range(domain, 0, end);
1456
1457         /* free page tables */
1458         dma_pte_free_pagetable(domain, 0, end);
1459
1460         iommu_free_domain(domain);
1461         free_domain_mem(domain);
1462 }
1463
1464 static int domain_context_mapping_one(struct dmar_domain *domain,
1465                 u8 bus, u8 devfn)
1466 {
1467         struct context_entry *context;
1468         unsigned long flags;
1469         struct intel_iommu *iommu;
1470         struct dma_pte *pgd;
1471         unsigned long num;
1472         unsigned long ndomains;
1473         int id;
1474         int agaw;
1475
1476         pr_debug("Set context mapping for %02x:%02x.%d\n",
1477                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1478         BUG_ON(!domain->pgd);
1479
1480         iommu = device_to_iommu(bus, devfn);
1481         if (!iommu)
1482                 return -ENODEV;
1483
1484         context = device_to_context_entry(iommu, bus, devfn);
1485         if (!context)
1486                 return -ENOMEM;
1487         spin_lock_irqsave(&iommu->lock, flags);
1488         if (context_present(context)) {
1489                 spin_unlock_irqrestore(&iommu->lock, flags);
1490                 return 0;
1491         }
1492
1493         id = domain->id;
1494         pgd = domain->pgd;
1495
1496         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1497                 int found = 0;
1498
1499                 /* find an available domain id for this device in iommu */
1500                 ndomains = cap_ndoms(iommu->cap);
1501                 num = find_first_bit(iommu->domain_ids, ndomains);
1502                 for (; num < ndomains; ) {
1503                         if (iommu->domains[num] == domain) {
1504                                 id = num;
1505                                 found = 1;
1506                                 break;
1507                         }
1508                         num = find_next_bit(iommu->domain_ids,
1509                                             cap_ndoms(iommu->cap), num+1);
1510                 }
1511
1512                 if (found == 0) {
1513                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1514                         if (num >= ndomains) {
1515                                 spin_unlock_irqrestore(&iommu->lock, flags);
1516                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1517                                 return -EFAULT;
1518                         }
1519
1520                         set_bit(num, iommu->domain_ids);
1521                         iommu->domains[num] = domain;
1522                         id = num;
1523                 }
1524
1525                 /* Skip top levels of page tables for
1526                  * an iommu whose agaw is smaller than the domain's.
1527                  */
1528                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1529                         pgd = phys_to_virt(dma_pte_addr(pgd));
1530                         if (!dma_pte_present(pgd)) {
1531                                 spin_unlock_irqrestore(&iommu->lock, flags);
1532                                 return -ENOMEM;
1533                         }
1534                 }
1535         }
1536
1537         context_set_domain_id(context, id);
1538         context_set_address_width(context, iommu->agaw);
1539         context_set_address_root(context, virt_to_phys(pgd));
1540         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1541         context_set_fault_enable(context);
1542         context_set_present(context);
1543         domain_flush_cache(domain, context, sizeof(*context));
1544
1545         /* it's a non-present to present mapping */
1546         if (iommu->flush.flush_context(iommu, domain->id,
1547                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1548                 DMA_CCMD_DEVICE_INVL, 1))
1549                 iommu_flush_write_buffer(iommu);
1550         else
1551                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1552
1553         spin_unlock_irqrestore(&iommu->lock, flags);
1554
1555         spin_lock_irqsave(&domain->iommu_lock, flags);
1556         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1557                 domain->iommu_count++;
1558                 domain_update_iommu_coherency(domain);
1559         }
1560         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1561         return 0;
1562 }
1563
1564 static int
1565 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1566 {
1567         int ret;
1568         struct pci_dev *tmp, *parent;
1569
1570         ret = domain_context_mapping_one(domain, pdev->bus->number,
1571                 pdev->devfn);
1572         if (ret)
1573                 return ret;
1574
1575         /* dependent device mapping */
1576         tmp = pci_find_upstream_pcie_bridge(pdev);
1577         if (!tmp)
1578                 return 0;
1579         /* Secondary interface's bus number and devfn 0 */
1580         parent = pdev->bus->self;
1581         while (parent != tmp) {
1582                 ret = domain_context_mapping_one(domain, parent->bus->number,
1583                         parent->devfn);
1584                 if (ret)
1585                         return ret;
1586                 parent = parent->bus->self;
1587         }
1588         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1589                 return domain_context_mapping_one(domain,
1590                         tmp->subordinate->number, 0);
1591         else /* this is a legacy PCI bridge */
1592                 return domain_context_mapping_one(domain,
1593                         tmp->bus->number, tmp->devfn);
1594 }
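
/*
 * Illustrative sketch, not part of this driver: why domain_context_mapping()
 * above walks up from the device to its upstream PCIe-to-PCI bridge.  DMA
 * from a device behind a conventional PCI bridge reaches the IOMMU with the
 * bridge's requester ID, so every hop up to (and including) that bridge needs
 * a context entry too.  ex_show_context_chain() is a hypothetical debug
 * helper mirroring that walk.
 */
static void ex_show_context_chain(struct pci_dev *pdev, struct pci_dev *bridge)
{
        struct pci_dev *p;

        dev_info(&pdev->dev, "context for %02x:%02x.%d\n",
                 pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));

        /* every intermediate bridge between the device and 'bridge' */
        for (p = pdev->bus->self; p && p != bridge; p = p->bus->self)
                dev_info(&p->dev, "context for parent %02x:%02x.%d\n",
                         p->bus->number, PCI_SLOT(p->devfn), PCI_FUNC(p->devfn));
        /* the bridge itself is mapped last, as in the function above */
}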
1595
1596 static int domain_context_mapped(struct pci_dev *pdev)
1597 {
1598         int ret;
1599         struct pci_dev *tmp, *parent;
1600         struct intel_iommu *iommu;
1601
1602         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1603         if (!iommu)
1604                 return -ENODEV;
1605
1606         ret = device_context_mapped(iommu,
1607                 pdev->bus->number, pdev->devfn);
1608         if (!ret)
1609                 return ret;
1610         /* dependent device mapping */
1611         tmp = pci_find_upstream_pcie_bridge(pdev);
1612         if (!tmp)
1613                 return ret;
1614         /* Secondary interface's bus number and devfn 0 */
1615         parent = pdev->bus->self;
1616         while (parent != tmp) {
1617                 ret = device_context_mapped(iommu, parent->bus->number,
1618                         parent->devfn);
1619                 if (!ret)
1620                         return ret;
1621                 parent = parent->bus->self;
1622         }
1623         if (tmp->is_pcie)
1624                 return device_context_mapped(iommu,
1625                         tmp->subordinate->number, 0);
1626         else
1627                 return device_context_mapped(iommu,
1628                         tmp->bus->number, tmp->devfn);
1629 }
1630
1631 static int
1632 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1633                         u64 hpa, size_t size, int prot)
1634 {
1635         u64 start_pfn, end_pfn;
1636         struct dma_pte *pte;
1637         int index;
1638         int addr_width = agaw_to_width(domain->agaw);
1639
1640         hpa &= (((u64)1) << addr_width) - 1;
1641
1642         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1643                 return -EINVAL;
1644         iova &= PAGE_MASK;
1645         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1646         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1647         index = 0;
1648         while (start_pfn < end_pfn) {
1649                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1650                 if (!pte)
1651                         return -ENOMEM;
1652                 /* We don't need lock here, nobody else
1653                  * touches the iova range
1654                  */
1655                 BUG_ON(dma_pte_addr(pte));
1656                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1657                 dma_set_pte_prot(pte, prot);
1658                 domain_flush_cache(domain, pte, sizeof(*pte));
1659                 start_pfn++;
1660                 index++;
1661         }
1662         return 0;
1663 }
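
/*
 * Illustrative sketch, not part of this driver: the pfn arithmetic used by
 * domain_page_mapping() above, as a standalone userspace demo with a
 * hypothetical 4KiB page size.  An unaligned (hpa, size) pair is widened to
 * whole pages: start_pfn truncates hpa down, end_pfn rounds hpa + size up.
 */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SHIFT 12
#define EX_PAGE_SIZE  (1ULL << EX_PAGE_SHIFT)
#define EX_PAGE_ALIGN(a) (((a) + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1))

int main(void)
{
        uint64_t hpa = 0x12345;        /* unaligned physical address */
        uint64_t size = 0x2000;        /* two pages' worth of data   */
        uint64_t start_pfn = hpa >> EX_PAGE_SHIFT;
        uint64_t end_pfn = EX_PAGE_ALIGN(hpa + size) >> EX_PAGE_SHIFT;

        /* prints pfns 0x12..0x14: three page table entries cover the range */
        printf("pfns 0x%llx..0x%llx (%llu PTEs)\n",
               (unsigned long long)start_pfn,
               (unsigned long long)(end_pfn - 1),
               (unsigned long long)(end_pfn - start_pfn));
        return 0;
}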
1664
1665 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1666 {
1667         if (!iommu)
1668                 return;
1669
1670         clear_context_table(iommu, bus, devfn);
1671         iommu->flush.flush_context(iommu, 0, 0, 0,
1672                                            DMA_CCMD_GLOBAL_INVL, 0);
1673         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1674                                          DMA_TLB_GLOBAL_FLUSH, 0);
1675 }
1676
1677 static void domain_remove_dev_info(struct dmar_domain *domain)
1678 {
1679         struct device_domain_info *info;
1680         unsigned long flags;
1681         struct intel_iommu *iommu;
1682
1683         spin_lock_irqsave(&device_domain_lock, flags);
1684         while (!list_empty(&domain->devices)) {
1685                 info = list_entry(domain->devices.next,
1686                         struct device_domain_info, link);
1687                 list_del(&info->link);
1688                 list_del(&info->global);
1689                 if (info->dev)
1690                         info->dev->dev.archdata.iommu = NULL;
1691                 spin_unlock_irqrestore(&device_domain_lock, flags);
1692
1693                 iommu = device_to_iommu(info->bus, info->devfn);
1694                 iommu_detach_dev(iommu, info->bus, info->devfn);
1695                 free_devinfo_mem(info);
1696
1697                 spin_lock_irqsave(&device_domain_lock, flags);
1698         }
1699         spin_unlock_irqrestore(&device_domain_lock, flags);
1700 }
1701
1702 /*
1703  * find_domain
1704  * Note: the device's domain info is stored in struct pci_dev->dev.archdata.iommu
1705  */
1706 static struct dmar_domain *
1707 find_domain(struct pci_dev *pdev)
1708 {
1709         struct device_domain_info *info;
1710
1711         /* No lock here, assumes no domain exit in normal case */
1712         info = pdev->dev.archdata.iommu;
1713         if (info)
1714                 return info->domain;
1715         return NULL;
1716 }
1717
1718 /* domain is initialized */
1719 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1720 {
1721         struct dmar_domain *domain, *found = NULL;
1722         struct intel_iommu *iommu;
1723         struct dmar_drhd_unit *drhd;
1724         struct device_domain_info *info, *tmp;
1725         struct pci_dev *dev_tmp;
1726         unsigned long flags;
1727         int bus = 0, devfn = 0;
1728
1729         domain = find_domain(pdev);
1730         if (domain)
1731                 return domain;
1732
1733         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1734         if (dev_tmp) {
1735                 if (dev_tmp->is_pcie) {
1736                         bus = dev_tmp->subordinate->number;
1737                         devfn = 0;
1738                 } else {
1739                         bus = dev_tmp->bus->number;
1740                         devfn = dev_tmp->devfn;
1741                 }
1742                 spin_lock_irqsave(&device_domain_lock, flags);
1743                 list_for_each_entry(info, &device_domain_list, global) {
1744                         if (info->bus == bus && info->devfn == devfn) {
1745                                 found = info->domain;
1746                                 break;
1747                         }
1748                 }
1749                 spin_unlock_irqrestore(&device_domain_lock, flags);
1750                 /* pcie-pci bridge already has a domain, use it */
1751                 if (found) {
1752                         domain = found;
1753                         goto found_domain;
1754                 }
1755         }
1756
1757         /* Allocate new domain for the device */
1758         drhd = dmar_find_matched_drhd_unit(pdev);
1759         if (!drhd) {
1760                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1761                         pci_name(pdev));
1762                 return NULL;
1763         }
1764         iommu = drhd->iommu;
1765
1766         domain = iommu_alloc_domain(iommu);
1767         if (!domain)
1768                 goto error;
1769
1770         if (domain_init(domain, gaw)) {
1771                 domain_exit(domain);
1772                 goto error;
1773         }
1774
1775         /* register pcie-to-pci device */
1776         if (dev_tmp) {
1777                 info = alloc_devinfo_mem();
1778                 if (!info) {
1779                         domain_exit(domain);
1780                         goto error;
1781                 }
1782                 info->bus = bus;
1783                 info->devfn = devfn;
1784                 info->dev = NULL;
1785                 info->domain = domain;
1786                 /* This domain is shared by devices under p2p bridge */
1787                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1788
1789                 /* pcie-to-pci bridge already has a domain, use it */
1790                 found = NULL;
1791                 spin_lock_irqsave(&device_domain_lock, flags);
1792                 list_for_each_entry(tmp, &device_domain_list, global) {
1793                         if (tmp->bus == bus && tmp->devfn == devfn) {
1794                                 found = tmp->domain;
1795                                 break;
1796                         }
1797                 }
1798                 if (found) {
1799                         free_devinfo_mem(info);
1800                         domain_exit(domain);
1801                         domain = found;
1802                 } else {
1803                         list_add(&info->link, &domain->devices);
1804                         list_add(&info->global, &device_domain_list);
1805                 }
1806                 spin_unlock_irqrestore(&device_domain_lock, flags);
1807         }
1808
1809 found_domain:
1810         info = alloc_devinfo_mem();
1811         if (!info)
1812                 goto error;
1813         info->bus = pdev->bus->number;
1814         info->devfn = pdev->devfn;
1815         info->dev = pdev;
1816         info->domain = domain;
1817         spin_lock_irqsave(&device_domain_lock, flags);
1818         /* somebody may have beaten us to it; recheck under the lock */
1819         found = find_domain(pdev);
1820         if (found != NULL) {
1821                 spin_unlock_irqrestore(&device_domain_lock, flags);
1822                 if (found != domain) {
1823                         domain_exit(domain);
1824                         domain = found;
1825                 }
1826                 free_devinfo_mem(info);
1827                 return domain;
1828         }
1829         list_add(&info->link, &domain->devices);
1830         list_add(&info->global, &device_domain_list);
1831         pdev->dev.archdata.iommu = info;
1832         spin_unlock_irqrestore(&device_domain_lock, flags);
1833         return domain;
1834 error:
1835         /* recheck it here, maybe others set it */
1836         return find_domain(pdev);
1837 }
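
/*
 * Illustrative sketch, not part of this driver: the "allocate outside the
 * lock, then recheck under the lock" pattern that get_domain_for_dev() above
 * relies on when two CPUs race to create a domain for the same device.
 * Shown with pthreads and a hypothetical cached pointer standing in for the
 * real device_domain_lock and archdata.iommu field.
 */
#include <pthread.h>
#include <stdlib.h>

struct ex_domain { int id; };

static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ex_domain *ex_cached;    /* plays the role of archdata.iommu */

static struct ex_domain *ex_get_domain(void)
{
        struct ex_domain *new, *found;

        if (ex_cached)                  /* fast path, no lock */
                return ex_cached;

        new = calloc(1, sizeof(*new));  /* expensive setup outside the lock */
        if (!new)
                return NULL;

        pthread_mutex_lock(&ex_lock);
        found = ex_cached;              /* somebody may have been faster */
        if (found) {
                pthread_mutex_unlock(&ex_lock);
                free(new);              /* drop ours, reuse theirs */
                return found;
        }
        ex_cached = new;
        pthread_mutex_unlock(&ex_lock);
        return new;
}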
1838
1839 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1840                                       unsigned long long start,
1841                                       unsigned long long end)
1842 {
1843         struct dmar_domain *domain;
1844         unsigned long size;
1845         unsigned long long base;
1846         int ret;
1847
1848         printk(KERN_INFO
1849                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1850                 pci_name(pdev), start, end);
1851         /* page table init */
1852         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1853         if (!domain)
1854                 return -ENOMEM;
1855
1856         /* The address might not be aligned */
1857         base = start & PAGE_MASK;
1858         size = end - base;
1859         size = PAGE_ALIGN(size);
1860         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1861                         IOVA_PFN(base + size) - 1)) {
1862                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1863                 ret = -ENOMEM;
1864                 goto error;
1865         }
1866
1867         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1868                 size, base, pci_name(pdev));
1869         /*
1870          * RMRR range might have overlap with physical memory range,
1871          * clear it first
1872          */
1873         dma_pte_clear_range(domain, base, base + size);
1874
1875         ret = domain_page_mapping(domain, base, base, size,
1876                 DMA_PTE_READ|DMA_PTE_WRITE);
1877         if (ret)
1878                 goto error;
1879
1880         /* context entry init */
1881         ret = domain_context_mapping(domain, pdev);
1882         if (!ret)
1883                 return 0;
1884 error:
1885         domain_exit(domain);
1886         return ret;
1887
1888 }
1889
1890 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1891         struct pci_dev *pdev)
1892 {
1893         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1894                 return 0;
1895         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1896                 rmrr->end_address + 1);
1897 }
1898
1899 #ifdef CONFIG_DMAR_GFX_WA
1900 struct iommu_prepare_data {
1901         struct pci_dev *pdev;
1902         int ret;
1903 };
1904
1905 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1906                                          unsigned long end_pfn, void *datax)
1907 {
1908         struct iommu_prepare_data *data;
1909
1910         data = (struct iommu_prepare_data *)datax;
1911
1912         data->ret = iommu_prepare_identity_map(data->pdev,
1913                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1914         return data->ret;
1915
1916 }
1917
1918 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1919 {
1920         int nid;
1921         struct iommu_prepare_data data;
1922
1923         data.pdev = pdev;
1924         data.ret = 0;
1925
1926         for_each_online_node(nid) {
1927                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1928                 if (data.ret)
1929                         return data.ret;
1930         }
1931         return data.ret;
1932 }
1933
1934 static void __init iommu_prepare_gfx_mapping(void)
1935 {
1936         struct pci_dev *pdev = NULL;
1937         int ret;
1938
1939         for_each_pci_dev(pdev) {
1940                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1941                                 !IS_GFX_DEVICE(pdev))
1942                         continue;
1943                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1944                         pci_name(pdev));
1945                 ret = iommu_prepare_with_active_regions(pdev);
1946                 if (ret)
1947                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1948         }
1949 }
1950 #else /* !CONFIG_DMAR_GFX_WA */
1951 static inline void iommu_prepare_gfx_mapping(void)
1952 {
1953         return;
1954 }
1955 #endif
1956
1957 #ifdef CONFIG_DMAR_FLOPPY_WA
1958 static inline void iommu_prepare_isa(void)
1959 {
1960         struct pci_dev *pdev;
1961         int ret;
1962
1963         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1964         if (!pdev)
1965                 return;
1966
1967         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1968         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1969
1970         if (ret)
1971                 printk("IOMMU: Failed to create 0-64M identity map, "
1972                         "floppy might not work\n");
1973
1974 }
1975 #else
1976 static inline void iommu_prepare_isa(void)
1977 {
1978         return;
1979 }
1980 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1981
1982 static int __init init_dmars(void)
1983 {
1984         struct dmar_drhd_unit *drhd;
1985         struct dmar_rmrr_unit *rmrr;
1986         struct pci_dev *pdev;
1987         struct intel_iommu *iommu;
1988         int i, ret, unit = 0;
1989
1990         /*
1991          * for each drhd
1992          *    allocate root
1993          *    initialize and program root entry to not present
1994          * endfor
1995          */
1996         for_each_drhd_unit(drhd) {
1997                 g_num_of_iommus++;
1998                 /*
1999                  * no lock needed: this is only incremented in the
2000                  * single-threaded kernel __init code path; all other
2001                  * accesses are read-only
2002                  */
2003         }
2004
2005         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2006                         GFP_KERNEL);
2007         if (!g_iommus) {
2008                 printk(KERN_ERR "Allocating global iommu array failed\n");
2009                 ret = -ENOMEM;
2010                 goto error;
2011         }
2012
2013         deferred_flush = kzalloc(g_num_of_iommus *
2014                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2015         if (!deferred_flush) {
2016                 kfree(g_iommus);
2017                 ret = -ENOMEM;
2018                 goto error;
2019         }
2020
2021         for_each_drhd_unit(drhd) {
2022                 if (drhd->ignored)
2023                         continue;
2024
2025                 iommu = drhd->iommu;
2026                 g_iommus[iommu->seq_id] = iommu;
2027
2028                 ret = iommu_init_domains(iommu);
2029                 if (ret)
2030                         goto error;
2031
2032                 /*
2033                  * TBD:
2034                  * we could share the same root & context tables
2035                  * among all IOMMUs; needs to be split out later.
2036                  */
2037                 ret = iommu_alloc_root_entry(iommu);
2038                 if (ret) {
2039                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2040                         goto error;
2041                 }
2042         }
2043
2044         for_each_drhd_unit(drhd) {
2045                 if (drhd->ignored)
2046                         continue;
2047
2048                 iommu = drhd->iommu;
2049                 if (dmar_enable_qi(iommu)) {
2050                         /*
2051                          * Queued Invalidation is not enabled; fall back to
2052                          * Register Based Invalidation
2053                          */
2054                         iommu->flush.flush_context = __iommu_flush_context;
2055                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2056                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2057                                "invalidation\n",
2058                                (unsigned long long)drhd->reg_base_addr);
2059                 } else {
2060                         iommu->flush.flush_context = qi_flush_context;
2061                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2062                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2063                                "invalidation\n",
2064                                (unsigned long long)drhd->reg_base_addr);
2065                 }
2066         }
2067
2068         /*
2069          * For each rmrr
2070          *   for each dev attached to rmrr
2071          *   do
2072          *     locate drhd for dev, alloc domain for dev
2073          *     allocate free domain
2074          *     allocate page table entries for rmrr
2075          *     if context not allocated for bus
2076          *           allocate and init context
2077          *           set present in root table for this bus
2078          *     init context with domain, translation etc
2079          *    endfor
2080          * endfor
2081          */
2082         for_each_rmrr_units(rmrr) {
2083                 for (i = 0; i < rmrr->devices_cnt; i++) {
2084                         pdev = rmrr->devices[i];
2085                         /* some BIOSes list non-existent devices in the DMAR table */
2086                         if (!pdev)
2087                                 continue;
2088                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2089                         if (ret)
2090                                 printk(KERN_ERR
2091                                  "IOMMU: mapping reserved region failed\n");
2092                 }
2093         }
2094
2095         iommu_prepare_gfx_mapping();
2096
2097         iommu_prepare_isa();
2098
2099         /*
2100          * for each drhd
2101          *   enable fault log
2102          *   global invalidate context cache
2103          *   global invalidate iotlb
2104          *   enable translation
2105          */
2106         for_each_drhd_unit(drhd) {
2107                 if (drhd->ignored)
2108                         continue;
2109                 iommu = drhd->iommu;
2110                 sprintf(iommu->name, "dmar%d", unit++);
2111
2112                 iommu_flush_write_buffer(iommu);
2113
2114                 ret = dmar_set_interrupt(iommu);
2115                 if (ret)
2116                         goto error;
2117
2118                 iommu_set_root_entry(iommu);
2119
2120                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2121                                            0);
2122                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2123                                          0);
2124                 iommu_disable_protect_mem_regions(iommu);
2125
2126                 ret = iommu_enable_translation(iommu);
2127                 if (ret)
2128                         goto error;
2129         }
2130
2131         return 0;
2132 error:
2133         for_each_drhd_unit(drhd) {
2134                 if (drhd->ignored)
2135                         continue;
2136                 iommu = drhd->iommu;
2137                 free_iommu(iommu);
2138         }
2139         kfree(g_iommus);
2140         return ret;
2141 }
2142
2143 static inline u64 aligned_size(u64 host_addr, size_t size)
2144 {
2145         u64 addr;
2146         addr = (host_addr & (~PAGE_MASK)) + size;
2147         return PAGE_ALIGN(addr);
2148 }
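
/*
 * Worked example for aligned_size() above, with hypothetical numbers and
 * 4KiB pages.  Only the offset of host_addr within its page matters; the
 * result is the number of bytes of page-granular IOVA space the mapping
 * will occupy:
 *
 *   host_addr = 0x12345ff0, size = 0x20
 *   offset    = host_addr & ~PAGE_MASK     = 0xff0
 *   result    = PAGE_ALIGN(0xff0 + 0x20)   = 0x2000  (two pages)
 *
 * So a 32-byte buffer that straddles a page boundary consumes two IOVA pages.
 */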
2149
2150 struct iova *
2151 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2152 {
2153         struct iova *piova;
2154
2155         /* Make sure it's in range */
2156         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2157         if (!size || (IOVA_START_ADDR + size > end))
2158                 return NULL;
2159
2160         piova = alloc_iova(&domain->iovad,
2161                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2162         return piova;
2163 }
2164
2165 static struct iova *
2166 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2167                    size_t size, u64 dma_mask)
2168 {
2169         struct pci_dev *pdev = to_pci_dev(dev);
2170         struct iova *iova = NULL;
2171
2172         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2173                 iova = iommu_alloc_iova(domain, size, dma_mask);
2174         else {
2175                 /*
2176                  * First try to allocate an io virtual address below
2177                  * DMA_32BIT_MASK; if that fails, fall back to allocating
2178                  * from the device's full dma_mask range
2179                  */
2180                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2181                 if (!iova)
2182                         iova = iommu_alloc_iova(domain, size, dma_mask);
2183         }
2184
2185         if (!iova) {
2186                 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
2187                 return NULL;
2188         }
2189
2190         return iova;
2191 }
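
/*
 * Illustrative sketch, not part of this driver: the allocation policy of
 * __intel_alloc_iova() above, expressed against a hypothetical allocator
 * ex_alloc_range(limit).  Addresses above 4GiB require dual-address-cycle
 * transactions on legacy PCI, which some devices and bridges handle poorly,
 * so 64-bit-capable devices first try to stay below 4GiB; "forcedac" skips
 * that optimization.
 */
#include <stdint.h>

#define EX_DMA_32BIT_MASK 0xffffffffULL

/* hypothetical allocator: returns an address, or 0 on failure */
extern uint64_t ex_alloc_range(uint64_t limit);

static uint64_t ex_alloc_iova(uint64_t dma_mask, int forcedac)
{
        uint64_t iova;

        if (dma_mask <= EX_DMA_32BIT_MASK || forcedac)
                return ex_alloc_range(dma_mask);

        iova = ex_alloc_range(EX_DMA_32BIT_MASK); /* prefer the low 4GiB */
        if (!iova)
                iova = ex_alloc_range(dma_mask);  /* spill into the high range */
        return iova;
}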
2192
2193 static struct dmar_domain *
2194 get_valid_domain_for_dev(struct pci_dev *pdev)
2195 {
2196         struct dmar_domain *domain;
2197         int ret;
2198
2199         domain = get_domain_for_dev(pdev,
2200                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2201         if (!domain) {
2202                 printk(KERN_ERR
2203                         "Allocating domain for %s failed", pci_name(pdev));
2204                 return NULL;
2205         }
2206
2207         /* make sure context mapping is ok */
2208         if (unlikely(!domain_context_mapped(pdev))) {
2209                 ret = domain_context_mapping(domain, pdev);
2210                 if (ret) {
2211                         printk(KERN_ERR
2212                                 "Domain context map for %s failed",
2213                                 pci_name(pdev));
2214                         return NULL;
2215                 }
2216         }
2217
2218         return domain;
2219 }
2220
2221 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2222                                      size_t size, int dir, u64 dma_mask)
2223 {
2224         struct pci_dev *pdev = to_pci_dev(hwdev);
2225         struct dmar_domain *domain;
2226         phys_addr_t start_paddr;
2227         struct iova *iova;
2228         int prot = 0;
2229         int ret;
2230         struct intel_iommu *iommu;
2231
2232         BUG_ON(dir == DMA_NONE);
2233         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2234                 return paddr;
2235
2236         domain = get_valid_domain_for_dev(pdev);
2237         if (!domain)
2238                 return 0;
2239
2240         iommu = domain_get_iommu(domain);
2241         size = aligned_size((u64)paddr, size);
2242
2243         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2244         if (!iova)
2245                 goto error;
2246
2247         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2248
2249         /*
2250          * Check if DMAR supports zero-length reads on write-only
2251          * mappings.
2252          */
2253         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2254                         !cap_zlr(iommu->cap))
2255                 prot |= DMA_PTE_READ;
2256         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2257                 prot |= DMA_PTE_WRITE;
2258         /*
2259          * paddr .. (paddr + size) might span a partial page, so map the whole
2260          * page.  Note: if two parts of one page are mapped separately, we
2261          * might end up with two guest addresses mapping to the same host paddr;
2262          * this is not a big problem
2263          */
2264         ret = domain_page_mapping(domain, start_paddr,
2265                 ((u64)paddr) & PAGE_MASK, size, prot);
2266         if (ret)
2267                 goto error;
2268
2269         /* it's a non-present to present mapping */
2270         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2271                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2272         if (ret)
2273                 iommu_flush_write_buffer(iommu);
2274
2275         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2276
2277 error:
2278         if (iova)
2279                 __free_iova(&domain->iovad, iova);
2280         printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2281                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2282         return 0;
2283 }
2284
2285 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2286                             size_t size, int dir)
2287 {
2288         return __intel_map_single(hwdev, paddr, size, dir,
2289                                   to_pci_dev(hwdev)->dma_mask);
2290 }
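
/*
 * Illustrative sketch, not part of this driver: a typical PCI driver ends up
 * in intel_map_single()/intel_unmap_single() above simply by using the
 * generic streaming DMA API once dma_ops points at intel_dma_ops.
 * Hypothetical device, buffer and function names.
 */
static int ex_send_buffer(struct pci_dev *pdev, void *buf, size_t len)
{
        dma_addr_t handle;

        handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
        if (dma_mapping_error(&pdev->dev, handle))
                return -ENOMEM;

        /* ... program 'handle' into the device's DMA descriptor ... */

        dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
        return 0;
}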
2291
2292 static void flush_unmaps(void)
2293 {
2294         int i, j;
2295
2296         timer_on = 0;
2297
2298         /* just flush them all */
2299         for (i = 0; i < g_num_of_iommus; i++) {
2300                 struct intel_iommu *iommu = g_iommus[i];
2301                 if (!iommu)
2302                         continue;
2303
2304                 if (deferred_flush[i].next) {
2305                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2306                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2307                         for (j = 0; j < deferred_flush[i].next; j++) {
2308                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2309                                                 deferred_flush[i].iova[j]);
2310                         }
2311                         deferred_flush[i].next = 0;
2312                 }
2313         }
2314
2315         list_size = 0;
2316 }
2317
2318 static void flush_unmaps_timeout(unsigned long data)
2319 {
2320         unsigned long flags;
2321
2322         spin_lock_irqsave(&async_umap_flush_lock, flags);
2323         flush_unmaps();
2324         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2325 }
2326
2327 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2328 {
2329         unsigned long flags;
2330         int next, iommu_id;
2331         struct intel_iommu *iommu;
2332
2333         spin_lock_irqsave(&async_umap_flush_lock, flags);
2334         if (list_size == HIGH_WATER_MARK)
2335                 flush_unmaps();
2336
2337         iommu = domain_get_iommu(dom);
2338         iommu_id = iommu->seq_id;
2339
2340         next = deferred_flush[iommu_id].next;
2341         deferred_flush[iommu_id].domain[next] = dom;
2342         deferred_flush[iommu_id].iova[next] = iova;
2343         deferred_flush[iommu_id].next++;
2344
2345         if (!timer_on) {
2346                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2347                 timer_on = 1;
2348         }
2349         list_size++;
2350         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2351 }
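
/*
 * Illustrative sketch, not part of this driver: the batching idea behind
 * add_unmap()/flush_unmaps() above, reduced to its core.  Instead of paying
 * for an IOTLB flush on every unmap, freed ranges are parked in a small
 * table and released together, either when the table hits a high-water mark
 * or (in the real code) when a 10ms timer fires.  Types and hooks below are
 * hypothetical stand-ins for the IOTLB flush and IOVA free.
 */
#define EX_HIGH_WATER_MARK 250

struct ex_pending { void *cookie; };

struct ex_deferred {
        struct ex_pending entries[EX_HIGH_WATER_MARK];
        int next;
};

extern void ex_flush_hw(void);         /* one hardware-wide flush */
extern void ex_release(void *cookie);  /* free one deferred IOVA  */

static void ex_flush_all(struct ex_deferred *d)
{
        int i;

        ex_flush_hw();                  /* one global flush ...              */
        for (i = 0; i < d->next; i++)   /* ... then free everything it covered */
                ex_release(d->entries[i].cookie);
        d->next = 0;
}

static void ex_add_unmap(struct ex_deferred *d, void *cookie)
{
        if (d->next == EX_HIGH_WATER_MARK)
                ex_flush_all(d);
        d->entries[d->next++].cookie = cookie;
}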
2352
2353 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2354                         int dir)
2355 {
2356         struct pci_dev *pdev = to_pci_dev(dev);
2357         struct dmar_domain *domain;
2358         unsigned long start_addr;
2359         struct iova *iova;
2360         struct intel_iommu *iommu;
2361
2362         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2363                 return;
2364         domain = find_domain(pdev);
2365         BUG_ON(!domain);
2366
2367         iommu = domain_get_iommu(domain);
2368
2369         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2370         if (!iova)
2371                 return;
2372
2373         start_addr = iova->pfn_lo << PAGE_SHIFT;
2374         size = aligned_size((u64)dev_addr, size);
2375
2376         pr_debug("Device %s unmapping: %lx@%llx\n",
2377                 pci_name(pdev), size, (unsigned long long)start_addr);
2378
2379         /*  clear the whole page */
2380         dma_pte_clear_range(domain, start_addr, start_addr + size);
2381         /* free page tables */
2382         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2383         if (intel_iommu_strict) {
2384                 if (iommu_flush_iotlb_psi(iommu,
2385                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2386                         iommu_flush_write_buffer(iommu);
2387                 /* free iova */
2388                 __free_iova(&domain->iovad, iova);
2389         } else {
2390                 add_unmap(domain, iova);
2391                 /*
2392                  * queue up the release of the unmap to save the roughly 1/6th
2393                  * of the cpu time an immediate iotlb flush operation would use...
2394                  */
2395         }
2396 }
2397
2398 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2399                            dma_addr_t *dma_handle, gfp_t flags)
2400 {
2401         void *vaddr;
2402         int order;
2403
2404         size = PAGE_ALIGN(size);
2405         order = get_order(size);
2406         flags &= ~(GFP_DMA | GFP_DMA32);
2407
2408         vaddr = (void *)__get_free_pages(flags, order);
2409         if (!vaddr)
2410                 return NULL;
2411         memset(vaddr, 0, size);
2412
2413         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2414                                          DMA_BIDIRECTIONAL,
2415                                          hwdev->coherent_dma_mask);
2416         if (*dma_handle)
2417                 return vaddr;
2418         free_pages((unsigned long)vaddr, order);
2419         return NULL;
2420 }
2421
2422 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2423                          dma_addr_t dma_handle)
2424 {
2425         int order;
2426
2427         size = PAGE_ALIGN(size);
2428         order = get_order(size);
2429
2430         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2431         free_pages((unsigned long)vaddr, order);
2432 }
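
/*
 * Illustrative sketch, not part of this driver: a typical driver obtains a
 * coherent buffer through the generic API, which lands in
 * intel_alloc_coherent()/intel_free_coherent() above once intel_dma_ops is
 * installed.  Hypothetical ring-buffer size and names.
 */
static void *ex_alloc_ring(struct pci_dev *pdev, dma_addr_t *ring_dma)
{
        /* '*ring_dma' is what the device is programmed with; the returned
         * virtual address is what the CPU uses. */
        return dma_alloc_coherent(&pdev->dev, 4096, ring_dma, GFP_KERNEL);
}

static void ex_free_ring(struct pci_dev *pdev, void *ring, dma_addr_t ring_dma)
{
        dma_free_coherent(&pdev->dev, 4096, ring, ring_dma);
}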
2433
2434 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2435
2436 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2437                     int nelems, int dir)
2438 {
2439         int i;
2440         struct pci_dev *pdev = to_pci_dev(hwdev);
2441         struct dmar_domain *domain;
2442         unsigned long start_addr;
2443         struct iova *iova;
2444         size_t size = 0;
2445         void *addr;
2446         struct scatterlist *sg;
2447         struct intel_iommu *iommu;
2448
2449         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2450                 return;
2451
2452         domain = find_domain(pdev);
2453         BUG_ON(!domain);
2454
2455         iommu = domain_get_iommu(domain);
2456
2457         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2458         if (!iova)
2459                 return;
2460         for_each_sg(sglist, sg, nelems, i) {
2461                 addr = SG_ENT_VIRT_ADDRESS(sg);
2462                 size += aligned_size((u64)addr, sg->length);
2463         }
2464
2465         start_addr = iova->pfn_lo << PAGE_SHIFT;
2466
2467         /*  clear the whole page */
2468         dma_pte_clear_range(domain, start_addr, start_addr + size);
2469         /* free page tables */
2470         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2471
2472         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2473                         size >> VTD_PAGE_SHIFT, 0))
2474                 iommu_flush_write_buffer(iommu);
2475
2476         /* free iova */
2477         __free_iova(&domain->iovad, iova);
2478 }
2479
2480 static int intel_nontranslate_map_sg(struct device *hddev,
2481         struct scatterlist *sglist, int nelems, int dir)
2482 {
2483         int i;
2484         struct scatterlist *sg;
2485
2486         for_each_sg(sglist, sg, nelems, i) {
2487                 BUG_ON(!sg_page(sg));
2488                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2489                 sg->dma_length = sg->length;
2490         }
2491         return nelems;
2492 }
2493
2494 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2495                  int dir)
2496 {
2497         void *addr;
2498         int i;
2499         struct pci_dev *pdev = to_pci_dev(hwdev);
2500         struct dmar_domain *domain;
2501         size_t size = 0;
2502         int prot = 0;
2503         size_t offset = 0;
2504         struct iova *iova = NULL;
2505         int ret;
2506         struct scatterlist *sg;
2507         unsigned long start_addr;
2508         struct intel_iommu *iommu;
2509
2510         BUG_ON(dir == DMA_NONE);
2511         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2512                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2513
2514         domain = get_valid_domain_for_dev(pdev);
2515         if (!domain)
2516                 return 0;
2517
2518         iommu = domain_get_iommu(domain);
2519
2520         for_each_sg(sglist, sg, nelems, i) {
2521                 addr = SG_ENT_VIRT_ADDRESS(sg);
2522                 addr = (void *)virt_to_phys(addr);
2523                 size += aligned_size((u64)addr, sg->length);
2524         }
2525
2526         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2527         if (!iova) {
2528                 sglist->dma_length = 0;
2529                 return 0;
2530         }
2531
2532         /*
2533          * Check if DMAR supports zero-length reads on write-only
2534          * mappings.
2535          */
2536         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2537                         !cap_zlr(iommu->cap))
2538                 prot |= DMA_PTE_READ;
2539         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2540                 prot |= DMA_PTE_WRITE;
2541
2542         start_addr = iova->pfn_lo << PAGE_SHIFT;
2543         offset = 0;
2544         for_each_sg(sglist, sg, nelems, i) {
2545                 addr = SG_ENT_VIRT_ADDRESS(sg);
2546                 addr = (void *)virt_to_phys(addr);
2547                 size = aligned_size((u64)addr, sg->length);
2548                 ret = domain_page_mapping(domain, start_addr + offset,
2549                         ((u64)addr) & PAGE_MASK,
2550                         size, prot);
2551                 if (ret) {
2552                         /*  clear the page */
2553                         dma_pte_clear_range(domain, start_addr,
2554                                   start_addr + offset);
2555                         /* free page tables */
2556                         dma_pte_free_pagetable(domain, start_addr,
2557                                   start_addr + offset);
2558                         /* free iova */
2559                         __free_iova(&domain->iovad, iova);
2560                         return 0;
2561                 }
2562                 sg->dma_address = start_addr + offset +
2563                                 ((u64)addr & (~PAGE_MASK));
2564                 sg->dma_length = sg->length;
2565                 offset += size;
2566         }
2567
2568         /* it's a non-present to present mapping */
2569         if (iommu_flush_iotlb_psi(iommu, domain->id,
2570                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2571                 iommu_flush_write_buffer(iommu);
2572         return nelems;
2573 }
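
/*
 * Illustrative sketch, not part of this driver: scatter-gather mapping as a
 * driver would issue it, ending up in intel_map_sg()/intel_unmap_sg() above.
 * Hypothetical two-buffer table; error handling kept minimal.
 */
static int ex_map_two_buffers(struct pci_dev *pdev, void *a, void *b,
                              size_t len)
{
        struct scatterlist sgl[2], *sg;
        int i, mapped;

        sg_init_table(sgl, 2);
        sg_set_buf(&sgl[0], a, len);
        sg_set_buf(&sgl[1], b, len);

        mapped = dma_map_sg(&pdev->dev, sgl, 2, DMA_TO_DEVICE);
        if (!mapped)
                return -ENOMEM;

        for_each_sg(sgl, sg, mapped, i) {
                /* program sg_dma_address(sg) / sg_dma_len(sg) into the device */
        }

        dma_unmap_sg(&pdev->dev, sgl, 2, DMA_TO_DEVICE);
        return 0;
}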
2574
2575 static struct dma_mapping_ops intel_dma_ops = {
2576         .alloc_coherent = intel_alloc_coherent,
2577         .free_coherent = intel_free_coherent,
2578         .map_single = intel_map_single,
2579         .unmap_single = intel_unmap_single,
2580         .map_sg = intel_map_sg,
2581         .unmap_sg = intel_unmap_sg,
2582 };
2583
2584 static inline int iommu_domain_cache_init(void)
2585 {
2586         int ret = 0;
2587
2588         iommu_domain_cache = kmem_cache_create("iommu_domain",
2589                                          sizeof(struct dmar_domain),
2590                                          0,
2591                                          SLAB_HWCACHE_ALIGN,
2592
2593                                          NULL);
2594         if (!iommu_domain_cache) {
2595                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2596                 ret = -ENOMEM;
2597         }
2598
2599         return ret;
2600 }
2601
2602 static inline int iommu_devinfo_cache_init(void)
2603 {
2604         int ret = 0;
2605
2606         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2607                                          sizeof(struct device_domain_info),
2608                                          0,
2609                                          SLAB_HWCACHE_ALIGN,
2610                                          NULL);
2611         if (!iommu_devinfo_cache) {
2612                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2613                 ret = -ENOMEM;
2614         }
2615
2616         return ret;
2617 }
2618
2619 static inline int iommu_iova_cache_init(void)
2620 {
2621         int ret = 0;
2622
2623         iommu_iova_cache = kmem_cache_create("iommu_iova",
2624                                          sizeof(struct iova),
2625                                          0,
2626                                          SLAB_HWCACHE_ALIGN,
2627                                          NULL);
2628         if (!iommu_iova_cache) {
2629                 printk(KERN_ERR "Couldn't create iova cache\n");
2630                 ret = -ENOMEM;
2631         }
2632
2633         return ret;
2634 }
2635
2636 static int __init iommu_init_mempool(void)
2637 {
2638         int ret;
2639         ret = iommu_iova_cache_init();
2640         if (ret)
2641                 return ret;
2642
2643         ret = iommu_domain_cache_init();
2644         if (ret)
2645                 goto domain_error;
2646
2647         ret = iommu_devinfo_cache_init();
2648         if (!ret)
2649                 return ret;
2650
2651         kmem_cache_destroy(iommu_domain_cache);
2652 domain_error:
2653         kmem_cache_destroy(iommu_iova_cache);
2654
2655         return -ENOMEM;
2656 }
2657
2658 static void __init iommu_exit_mempool(void)
2659 {
2660         kmem_cache_destroy(iommu_devinfo_cache);
2661         kmem_cache_destroy(iommu_domain_cache);
2662         kmem_cache_destroy(iommu_iova_cache);
2663
2664 }
2665
2666 static void __init init_no_remapping_devices(void)
2667 {
2668         struct dmar_drhd_unit *drhd;
2669
2670         for_each_drhd_unit(drhd) {
2671                 if (!drhd->include_all) {
2672                         int i;
2673                         for (i = 0; i < drhd->devices_cnt; i++)
2674                                 if (drhd->devices[i] != NULL)
2675                                         break;
2676                         /* ignore DMAR unit if no pci devices exist */
2677                         if (i == drhd->devices_cnt)
2678                                 drhd->ignored = 1;
2679                 }
2680         }
2681
2682         if (dmar_map_gfx)
2683                 return;
2684
2685         for_each_drhd_unit(drhd) {
2686                 int i;
2687                 if (drhd->ignored || drhd->include_all)
2688                         continue;
2689
2690                 for (i = 0; i < drhd->devices_cnt; i++)
2691                         if (drhd->devices[i] &&
2692                                 !IS_GFX_DEVICE(drhd->devices[i]))
2693                                 break;
2694
2695                 if (i < drhd->devices_cnt)
2696                         continue;
2697
2698                 /* bypass IOMMU if it is just for gfx devices */
2699                 drhd->ignored = 1;
2700                 for (i = 0; i < drhd->devices_cnt; i++) {
2701                         if (!drhd->devices[i])
2702                                 continue;
2703                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2704                 }
2705         }
2706 }
2707
2708 int __init intel_iommu_init(void)
2709 {
2710         int ret = 0;
2711
2712         if (dmar_table_init())
2713                 return  -ENODEV;
2714
2715         if (dmar_dev_scope_init())
2716                 return  -ENODEV;
2717
2718         /*
2719          * Check the need for DMA-remapping initialization now.
2720          * The initialization above will also be used by Interrupt-remapping.
2721          */
2722         if (no_iommu || swiotlb || dmar_disabled)
2723                 return -ENODEV;
2724
2725         iommu_init_mempool();
2726         dmar_init_reserved_ranges();
2727
2728         init_no_remapping_devices();
2729
2730         ret = init_dmars();
2731         if (ret) {
2732                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2733                 put_iova_domain(&reserved_iova_list);
2734                 iommu_exit_mempool();
2735                 return ret;
2736         }
2737         printk(KERN_INFO
2738         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2739
2740         init_timer(&unmap_timer);
2741         force_iommu = 1;
2742         dma_ops = &intel_dma_ops;
2743
2744         register_iommu(&intel_iommu_ops);
2745
2746         return 0;
2747 }
2748
2749 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2750                                   struct pci_dev *pdev)
2751 {
2752         struct device_domain_info *info;
2753         unsigned long flags;
2754
2755         info = alloc_devinfo_mem();
2756         if (!info)
2757                 return -ENOMEM;
2758
2759         info->bus = pdev->bus->number;
2760         info->devfn = pdev->devfn;
2761         info->dev = pdev;
2762         info->domain = domain;
2763
2764         spin_lock_irqsave(&device_domain_lock, flags);
2765         list_add(&info->link, &domain->devices);
2766         list_add(&info->global, &device_domain_list);
2767         pdev->dev.archdata.iommu = info;
2768         spin_unlock_irqrestore(&device_domain_lock, flags);
2769
2770         return 0;
2771 }
2772
2773 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2774                                           struct pci_dev *pdev)
2775 {
2776         struct device_domain_info *info;
2777         struct intel_iommu *iommu;
2778         unsigned long flags;
2779         int found = 0;
2780         struct list_head *entry, *tmp;
2781
2782         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2783         if (!iommu)
2784                 return;
2785
2786         spin_lock_irqsave(&device_domain_lock, flags);
2787         list_for_each_safe(entry, tmp, &domain->devices) {
2788                 info = list_entry(entry, struct device_domain_info, link);
2789                 if (info->bus == pdev->bus->number &&
2790                     info->devfn == pdev->devfn) {
2791                         list_del(&info->link);
2792                         list_del(&info->global);
2793                         if (info->dev)
2794                                 info->dev->dev.archdata.iommu = NULL;
2795                         spin_unlock_irqrestore(&device_domain_lock, flags);
2796
2797                         iommu_detach_dev(iommu, info->bus, info->devfn);
2798                         free_devinfo_mem(info);
2799
2800                         spin_lock_irqsave(&device_domain_lock, flags);
2801
2802                         if (found)
2803                                 break;
2804                         else
2805                                 continue;
2806                 }
2807
2808                 /* if there are no other devices under the same iommu
2809                  * owned by this domain, clear this iommu in iommu_bmp
2810                  * and update the iommu count and coherency
2811                  */
2812                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2813                         found = 1;
2814         }
2815
2816         if (found == 0) {
2817                 unsigned long tmp_flags;
2818                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2819                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2820                 domain->iommu_count--;
2821                 domain_update_iommu_coherency(domain);
2822                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2823         }
2824
2825         spin_unlock_irqrestore(&device_domain_lock, flags);
2826 }
2827
2828 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2829 {
2830         struct device_domain_info *info;
2831         struct intel_iommu *iommu;
2832         unsigned long flags1, flags2;
2833
2834         spin_lock_irqsave(&device_domain_lock, flags1);
2835         while (!list_empty(&domain->devices)) {
2836                 info = list_entry(domain->devices.next,
2837                         struct device_domain_info, link);
2838                 list_del(&info->link);
2839                 list_del(&info->global);
2840                 if (info->dev)
2841                         info->dev->dev.archdata.iommu = NULL;
2842
2843                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2844
2845                 iommu = device_to_iommu(info->bus, info->devfn);
2846                 iommu_detach_dev(iommu, info->bus, info->devfn);
2847
2848                 /* clear this iommu in iommu_bmp, update iommu count
2849                  * and coherency
2850                  */
2851                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2852                 if (test_and_clear_bit(iommu->seq_id,
2853                                        &domain->iommu_bmp)) {
2854                         domain->iommu_count--;
2855                         domain_update_iommu_coherency(domain);
2856                 }
2857                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2858
2859                 free_devinfo_mem(info);
2860                 spin_lock_irqsave(&device_domain_lock, flags1);
2861         }
2862         spin_unlock_irqrestore(&device_domain_lock, flags1);
2863 }
2864
2865 /* domain id for a virtual machine; it won't be set in the context entry */
2866 static unsigned long vm_domid;
2867
2868 static int vm_domain_min_agaw(struct dmar_domain *domain)
2869 {
2870         int i;
2871         int min_agaw = domain->agaw;
2872
2873         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2874         for (; i < g_num_of_iommus; ) {
2875                 if (min_agaw > g_iommus[i]->agaw)
2876                         min_agaw = g_iommus[i]->agaw;
2877
2878                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2879         }
2880
2881         return min_agaw;
2882 }
2883
2884 static struct dmar_domain *iommu_alloc_vm_domain(void)
2885 {
2886         struct dmar_domain *domain;
2887
2888         domain = alloc_domain_mem();
2889         if (!domain)
2890                 return NULL;
2891
2892         domain->id = vm_domid++;
2893         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2894         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2895
2896         return domain;
2897 }
2898
2899 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2900 {
2901         int adjust_width;
2902
2903         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2904         spin_lock_init(&domain->mapping_lock);
2905         spin_lock_init(&domain->iommu_lock);
2906
2907         domain_reserve_special_ranges(domain);
2908
2909         /* calculate AGAW */
2910         domain->gaw = guest_width;
2911         adjust_width = guestwidth_to_adjustwidth(guest_width);
2912         domain->agaw = width_to_agaw(adjust_width);
2913
2914         INIT_LIST_HEAD(&domain->devices);
2915
2916         domain->iommu_count = 0;
2917         domain->iommu_coherency = 0;
2918         domain->max_addr = 0;
2919
2920         /* always allocate the top pgd */
2921         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2922         if (!domain->pgd)
2923                 return -ENOMEM;
2924         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2925         return 0;
2926 }
2927
2928 static void iommu_free_vm_domain(struct dmar_domain *domain)
2929 {
2930         unsigned long flags;
2931         struct dmar_drhd_unit *drhd;
2932         struct intel_iommu *iommu;
2933         unsigned long i;
2934         unsigned long ndomains;
2935
2936         for_each_drhd_unit(drhd) {
2937                 if (drhd->ignored)
2938                         continue;
2939                 iommu = drhd->iommu;
2940
2941                 ndomains = cap_ndoms(iommu->cap);
2942                 i = find_first_bit(iommu->domain_ids, ndomains);
2943                 for (; i < ndomains; ) {
2944                         if (iommu->domains[i] == domain) {
2945                                 spin_lock_irqsave(&iommu->lock, flags);
2946                                 clear_bit(i, iommu->domain_ids);
2947                                 iommu->domains[i] = NULL;
2948                                 spin_unlock_irqrestore(&iommu->lock, flags);
2949                                 break;
2950                         }
2951                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2952                 }
2953         }
2954 }
2955
2956 static void vm_domain_exit(struct dmar_domain *domain)
2957 {
2958         u64 end;
2959
2960         /* Domain 0 is reserved, so don't process it */
2961         if (!domain)
2962                 return;
2963
2964         vm_domain_remove_all_dev_info(domain);
2965         /* destroy iovas */
2966         put_iova_domain(&domain->iovad);
2967         end = DOMAIN_MAX_ADDR(domain->gaw);
2968         end = end & (~VTD_PAGE_MASK);
2969
2970         /* clear ptes */
2971         dma_pte_clear_range(domain, 0, end);
2972
2973         /* free page tables */
2974         dma_pte_free_pagetable(domain, 0, end);
2975
2976         iommu_free_vm_domain(domain);
2977         free_domain_mem(domain);
2978 }
2979
2980 static int intel_iommu_domain_init(struct iommu_domain *domain)
2981 {
2982         struct dmar_domain *dmar_domain;
2983
2984         dmar_domain = iommu_alloc_vm_domain();
2985         if (!dmar_domain) {
2986                 printk(KERN_ERR
2987                         "intel_iommu_domain_init: dmar_domain == NULL\n");
2988                 return -ENOMEM;
2989         }
2990         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2991                 printk(KERN_ERR
2992                         "intel_iommu_domain_init() failed\n");
2993                 vm_domain_exit(dmar_domain);
2994                 return -ENOMEM;
2995         }
2996         domain->priv = dmar_domain;
2997
2998         return 0;
2999 }
3000
3001 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3002 {
3003         struct dmar_domain *dmar_domain = domain->priv;
3004
3005         domain->priv = NULL;
3006         vm_domain_exit(dmar_domain);
3007 }
3008
3009 static int intel_iommu_attach_device(struct iommu_domain *domain,
3010                                      struct device *dev)
3011 {
3012         struct dmar_domain *dmar_domain = domain->priv;
3013         struct pci_dev *pdev = to_pci_dev(dev);
3014         struct intel_iommu *iommu;
3015         int addr_width;
3016         u64 end;
3017         int ret;
3018
3019         /* normally pdev is not mapped */
3020         if (unlikely(domain_context_mapped(pdev))) {
3021                 struct dmar_domain *old_domain;
3022
3023                 old_domain = find_domain(pdev);
3024                 if (old_domain) {
3025                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3026                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3027                         else
3028                                 domain_remove_dev_info(old_domain);
3029                 }
3030         }
3031
3032         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3033         if (!iommu)
3034                 return -ENODEV;
3035
3036         /* check if this iommu agaw is sufficient for max mapped address */
3037         addr_width = agaw_to_width(iommu->agaw);
3038         end = DOMAIN_MAX_ADDR(addr_width);
3039         end = end & VTD_PAGE_MASK;
3040         if (end < dmar_domain->max_addr) {
3041                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3042                        "sufficient for the mapped address (%llx)\n",
3043                        __func__, iommu->agaw, dmar_domain->max_addr);
3044                 return -EFAULT;
3045         }
3046
3047         ret = domain_context_mapping(dmar_domain, pdev);
3048         if (ret)
3049                 return ret;
3050
3051         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3052         return ret;
3053 }
3054
3055 static void intel_iommu_detach_device(struct iommu_domain *domain,
3056                                       struct device *dev)
3057 {
3058         struct dmar_domain *dmar_domain = domain->priv;
3059         struct pci_dev *pdev = to_pci_dev(dev);
3060
3061         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3062 }
3063
3064 static int intel_iommu_map_range(struct iommu_domain *domain,
3065                                  unsigned long iova, phys_addr_t hpa,
3066                                  size_t size, int iommu_prot)
3067 {
3068         struct dmar_domain *dmar_domain = domain->priv;
3069         u64 max_addr;
3070         int addr_width;
3071         int prot = 0;
3072         int ret;
3073
3074         if (iommu_prot & IOMMU_READ)
3075                 prot |= DMA_PTE_READ;
3076         if (iommu_prot & IOMMU_WRITE)
3077                 prot |= DMA_PTE_WRITE;
3078
3079         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3080         if (dmar_domain->max_addr < max_addr) {
3081                 int min_agaw;
3082                 u64 end;
3083
3084                 /* check if minimum agaw is sufficient for mapped address */
3085                 min_agaw = vm_domain_min_agaw(dmar_domain);
3086                 addr_width = agaw_to_width(min_agaw);
3087                 end = DOMAIN_MAX_ADDR(addr_width);
3088                 end = end & VTD_PAGE_MASK;
3089                 if (end < max_addr) {
3090                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3091                                "sufficient for the mapped address (%llx)\n",
3092                                __func__, min_agaw, max_addr);
3093                         return -EFAULT;
3094                 }
3095                 dmar_domain->max_addr = max_addr;
3096         }
3097
3098         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3099         return ret;
3100 }
3101
3102 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3103                                     unsigned long iova, size_t size)
3104 {
3105         struct dmar_domain *dmar_domain = domain->priv;
3106         dma_addr_t base;
3107
3108         /* The address might not be aligned */
3109         base = iova & VTD_PAGE_MASK;
3110         size = VTD_PAGE_ALIGN(size);
3111         dma_pte_clear_range(dmar_domain, base, base + size);
3112
3113         if (dmar_domain->max_addr == base + size)
3114                 dmar_domain->max_addr = base;
3115 }
3116
3117 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3118                                             unsigned long iova)
3119 {
3120         struct dmar_domain *dmar_domain = domain->priv;
3121         struct dma_pte *pte;
3122         u64 phys = 0;
3123
3124         pte = addr_to_dma_pte(dmar_domain, iova);
3125         if (pte)
3126                 phys = dma_pte_addr(pte);
3127
3128         return phys;
3129 }
3130
3131 static struct iommu_ops intel_iommu_ops = {
3132         .domain_init    = intel_iommu_domain_init,
3133         .domain_destroy = intel_iommu_domain_destroy,
3134         .attach_dev     = intel_iommu_attach_device,
3135         .detach_dev     = intel_iommu_detach_device,
3136         .map            = intel_iommu_map_range,
3137         .unmap          = intel_iommu_unmap_range,
3138         .iova_to_phys   = intel_iommu_iova_to_phys,
3139 };
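
/*
 * Illustrative sketch, not part of this driver: how a consumer such as a
 * virtualization layer drives the ops table above through the generic IOMMU
 * API (register_iommu() in intel_iommu_init() makes these calls land here).
 * A sketch assuming the generic iommu_* entry points as they exist in this
 * tree; the device pointer, iova and physical address are hypothetical.
 */
static int ex_assign_device(struct device *dev)
{
        struct iommu_domain *dom;
        int ret;

        dom = iommu_domain_alloc();             /* -> intel_iommu_domain_init */
        if (!dom)
                return -ENOMEM;

        ret = iommu_attach_device(dom, dev);    /* -> intel_iommu_attach_device */
        if (ret)
                goto out_free;

        /* identity-map one hypothetical guest page at iova 0x100000 */
        ret = iommu_map_range(dom, 0x100000, 0x100000, PAGE_SIZE,
                              IOMMU_READ | IOMMU_WRITE);
        if (ret)
                goto out_detach;

        /* ... guest runs, the device DMAs through 'dom' ... */

        iommu_unmap_range(dom, 0x100000, PAGE_SIZE);
out_detach:
        iommu_detach_device(dom, dev);          /* -> intel_iommu_detach_device */
out_free:
        iommu_domain_free(dom);                 /* -> intel_iommu_domain_destroy */
        return ret;
}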