iommu: Add domain_has_cap iommu_ops
linux-2.6-omap-h63xx.git: drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
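/*
 * Note (illustrative): with 4KiB pages (PAGE_SHIFT == 12), IOVA_PFN() turns a
 * byte address into a page frame number, e.g. IOVA_PFN(DMA_32BIT_MASK) == 0xfffff,
 * and DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1 for the default 48-bit domain width.
 */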
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 static int rwbf_quirk;
65
66 /*
67  * 0: Present
68  * 1-11: Reserved
69  * 12-63: Context Ptr (12 - (haw-1))
70  * 64-127: Reserved
71  */
72 struct root_entry {
73         u64     val;
74         u64     rsvd1;
75 };
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
77 static inline bool root_present(struct root_entry *root)
78 {
79         return (root->val & 1);
80 }
81 static inline void set_root_present(struct root_entry *root)
82 {
83         root->val |= 1;
84 }
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
86 {
87         root->val |= value & VTD_PAGE_MASK;
88 }
89
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
92 {
93         return (struct context_entry *)
94                 (root_present(root)?phys_to_virt(
95                 root->val & VTD_PAGE_MASK) :
96                 NULL);
97 }
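/*
 * Note: the root table is one 4KiB page of 256 root entries, indexed by PCI
 * bus number; each present root entry points to a 4KiB context table of 256
 * context entries, indexed by devfn.  device_to_context_entry() below walks
 * exactly this two-level structure.
 */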
98
99 /*
100  * low 64 bits:
101  * 0: present
102  * 1: fault processing disable
103  * 2-3: translation type
104  * 12-63: address space root
105  * high 64 bits:
106  * 0-2: address width
107  * 3-6: avail
108  * 8-23: domain id
109  */
110 struct context_entry {
111         u64 lo;
112         u64 hi;
113 };
114
115 static inline bool context_present(struct context_entry *context)
116 {
117         return (context->lo & 1);
118 }
119 static inline void context_set_present(struct context_entry *context)
120 {
121         context->lo |= 1;
122 }
123
124 static inline void context_set_fault_enable(struct context_entry *context)
125 {
126         context->lo &= (((u64)-1) << 2) | 1;
127 }
128
129 #define CONTEXT_TT_MULTI_LEVEL 0
130
131 static inline void context_set_translation_type(struct context_entry *context,
132                                                 unsigned long value)
133 {
134         context->lo &= (((u64)-1) << 4) | 3;
135         context->lo |= (value & 3) << 2;
136 }
137
138 static inline void context_set_address_root(struct context_entry *context,
139                                             unsigned long value)
140 {
141         context->lo |= value & VTD_PAGE_MASK;
142 }
143
144 static inline void context_set_address_width(struct context_entry *context,
145                                              unsigned long value)
146 {
147         context->hi |= value & 7;
148 }
149
150 static inline void context_set_domain_id(struct context_entry *context,
151                                          unsigned long value)
152 {
153         context->hi |= (value & ((1 << 16) - 1)) << 8;
154 }
155
156 static inline void context_clear_entry(struct context_entry *context)
157 {
158         context->lo = 0;
159         context->hi = 0;
160 }
161
162 /*
163  * 0: readable
164  * 1: writable
165  * 2-6: reserved
166  * 7: super page
167  * 8-11: available
168  * 12-63: Host physical address
169  */
170 struct dma_pte {
171         u64 val;
172 };
173
174 static inline void dma_clear_pte(struct dma_pte *pte)
175 {
176         pte->val = 0;
177 }
178
179 static inline void dma_set_pte_readable(struct dma_pte *pte)
180 {
181         pte->val |= DMA_PTE_READ;
182 }
183
184 static inline void dma_set_pte_writable(struct dma_pte *pte)
185 {
186         pte->val |= DMA_PTE_WRITE;
187 }
188
189 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
190 {
191         pte->val = (pte->val & ~3) | (prot & 3);
192 }
193
194 static inline u64 dma_pte_addr(struct dma_pte *pte)
195 {
196         return (pte->val & VTD_PAGE_MASK);
197 }
198
199 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
200 {
201         pte->val |= (addr & VTD_PAGE_MASK);
202 }
203
204 static inline bool dma_pte_present(struct dma_pte *pte)
205 {
206         return (pte->val & 3) != 0;
207 }
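/*
 * Note: a dma_pte is considered present when either the read or the write
 * bit is set (val & 3); dma_pte_addr()/dma_set_pte_addr() use the same
 * VTD_PAGE_MASK for the host physical address as the context entry above.
 */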
208
209 /* devices under the same p2p bridge are owned by one domain */
210 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
211
212 /* domain represents a virtual machine; more than one device
213  * across iommus may be owned by one domain, e.g. a kvm guest.
214  */
215 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
216
217 struct dmar_domain {
218         int     id;                     /* domain id */
219         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
220
221         struct list_head devices;       /* all devices' list */
222         struct iova_domain iovad;       /* iova's that belong to this domain */
223
224         struct dma_pte  *pgd;           /* virtual address */
225         spinlock_t      mapping_lock;   /* page table lock */
226         int             gaw;            /* max guest address width */
227
228         /* adjusted guest address width, 0 is level 2 30-bit */
229         int             agaw;
230
231         int             flags;          /* flags to find out type of domain */
232
233         int             iommu_coherency;/* indicate coherency of iommu access */
234         int             iommu_snooping; /* indicate snooping control feature*/
235         int             iommu_count;    /* reference count of iommu */
236         spinlock_t      iommu_lock;     /* protect iommu set in domain */
237         u64             max_addr;       /* maximum mapped address */
238 };
239
240 /* PCI domain-device relationship */
241 struct device_domain_info {
242         struct list_head link;  /* link to domain siblings */
243         struct list_head global; /* link to global list */
244         u8 bus;                 /* PCI bus number */
245         u8 devfn;               /* PCI devfn number */
246         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
247         struct dmar_domain *domain; /* pointer to domain */
248 };
249
250 static void flush_unmaps_timeout(unsigned long data);
251
252 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
253
254 #define HIGH_WATER_MARK 250
255 struct deferred_flush_tables {
256         int next;
257         struct iova *iova[HIGH_WATER_MARK];
258         struct dmar_domain *domain[HIGH_WATER_MARK];
259 };
260
261 static struct deferred_flush_tables *deferred_flush;
262
263 /* number of iommus, bounds the per-domain iommu bitmaps and the g_iommus array */
264 static int g_num_of_iommus;
265
266 static DEFINE_SPINLOCK(async_umap_flush_lock);
267 static LIST_HEAD(unmaps_to_do);
268
269 static int timer_on;
270 static long list_size;
271
272 static void domain_remove_dev_info(struct dmar_domain *domain);
273
274 #ifdef CONFIG_DMAR_DEFAULT_ON
275 int dmar_disabled = 0;
276 #else
277 int dmar_disabled = 1;
278 #endif /*CONFIG_DMAR_DEFAULT_ON*/
279
280 static int __initdata dmar_map_gfx = 1;
281 static int dmar_forcedac;
282 static int intel_iommu_strict;
283
284 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
285 static DEFINE_SPINLOCK(device_domain_lock);
286 static LIST_HEAD(device_domain_list);
287
288 static struct iommu_ops intel_iommu_ops;
289
290 static int __init intel_iommu_setup(char *str)
291 {
292         if (!str)
293                 return -EINVAL;
294         while (*str) {
295                 if (!strncmp(str, "on", 2)) {
296                         dmar_disabled = 0;
297                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
298                 } else if (!strncmp(str, "off", 3)) {
299                         dmar_disabled = 1;
300                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
301                 } else if (!strncmp(str, "igfx_off", 8)) {
302                         dmar_map_gfx = 0;
303                         printk(KERN_INFO
304                                 "Intel-IOMMU: disable GFX device mapping\n");
305                 } else if (!strncmp(str, "forcedac", 8)) {
306                         printk(KERN_INFO
307                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
308                         dmar_forcedac = 1;
309                 } else if (!strncmp(str, "strict", 6)) {
310                         printk(KERN_INFO
311                                 "Intel-IOMMU: disable batched IOTLB flush\n");
312                         intel_iommu_strict = 1;
313                 }
314
315                 str += strcspn(str, ",");
316                 while (*str == ',')
317                         str++;
318         }
319         return 0;
320 }
321 __setup("intel_iommu=", intel_iommu_setup);
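/*
 * Example: booting with "intel_iommu=on,igfx_off,strict" enables the IOMMU
 * ("on"), disables GFX device mapping ("igfx_off") and disables the batched
 * IOTLB flush ("strict"); keywords are comma-separated as parsed above.
 */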
322
323 static struct kmem_cache *iommu_domain_cache;
324 static struct kmem_cache *iommu_devinfo_cache;
325 static struct kmem_cache *iommu_iova_cache;
326
327 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
328 {
329         unsigned int flags;
330         void *vaddr;
331
332         /* trying to avoid low memory issues */
333         flags = current->flags & PF_MEMALLOC;
334         current->flags |= PF_MEMALLOC;
335         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
336         current->flags &= (~PF_MEMALLOC | flags);
337         return vaddr;
338 }
339
340
341 static inline void *alloc_pgtable_page(void)
342 {
343         unsigned int flags;
344         void *vaddr;
345
346         /* trying to avoid low memory issues */
347         flags = current->flags & PF_MEMALLOC;
348         current->flags |= PF_MEMALLOC;
349         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
350         current->flags &= (~PF_MEMALLOC | flags);
351         return vaddr;
352 }
353
354 static inline void free_pgtable_page(void *vaddr)
355 {
356         free_page((unsigned long)vaddr);
357 }
358
359 static inline void *alloc_domain_mem(void)
360 {
361         return iommu_kmem_cache_alloc(iommu_domain_cache);
362 }
363
364 static void free_domain_mem(void *vaddr)
365 {
366         kmem_cache_free(iommu_domain_cache, vaddr);
367 }
368
369 static inline void * alloc_devinfo_mem(void)
370 {
371         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
372 }
373
374 static inline void free_devinfo_mem(void *vaddr)
375 {
376         kmem_cache_free(iommu_devinfo_cache, vaddr);
377 }
378
379 struct iova *alloc_iova_mem(void)
380 {
381         return iommu_kmem_cache_alloc(iommu_iova_cache);
382 }
383
384 void free_iova_mem(struct iova *iova)
385 {
386         kmem_cache_free(iommu_iova_cache, iova);
387 }
388
389
390 static inline int width_to_agaw(int width);
391
392 /* calculate agaw for each iommu.
393  * "SAGAW" may be different across iommus: use a default agaw, and
394  * fall back to a smaller supported agaw for iommus that don't support the default.
395  */
396 int iommu_calculate_agaw(struct intel_iommu *iommu)
397 {
398         unsigned long sagaw;
399         int agaw = -1;
400
401         sagaw = cap_sagaw(iommu->cap);
402         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
403              agaw >= 0; agaw--) {
404                 if (test_bit(agaw, &sagaw))
405                         break;
406         }
407
408         return agaw;
409 }
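/*
 * Note: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48, width_to_agaw(48) == 2, so
 * the loop above first tests SAGAW bit 2 (a 4-level, 48-bit page table) and
 * then falls back to smaller widths; -1 is returned if none is supported.
 */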
410
411 /* in native case, each domain is related to only one iommu */
412 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
413 {
414         int iommu_id;
415
416         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
417
418         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
419         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
420                 return NULL;
421
422         return g_iommus[iommu_id];
423 }
424
425 static void domain_update_iommu_coherency(struct dmar_domain *domain)
426 {
427         int i;
428
429         domain->iommu_coherency = 1;
430
431         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
432         for (; i < g_num_of_iommus; ) {
433                 if (!ecap_coherent(g_iommus[i]->ecap)) {
434                         domain->iommu_coherency = 0;
435                         break;
436                 }
437                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
438         }
439 }
440
441 static void domain_update_iommu_snooping(struct dmar_domain *domain)
442 {
443         int i;
444
445         domain->iommu_snooping = 1;
446
447         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
448         for (; i < g_num_of_iommus; ) {
449                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
450                         domain->iommu_snooping = 0;
451                         break;
452                 }
453                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
454         }
455 }
456
457 /* Some capabilities may be different across iommus */
458 static void domain_update_iommu_cap(struct dmar_domain *domain)
459 {
460         domain_update_iommu_coherency(domain);
461         domain_update_iommu_snooping(domain);
462 }
463
464 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
465 {
466         struct dmar_drhd_unit *drhd = NULL;
467         int i;
468
469         for_each_drhd_unit(drhd) {
470                 if (drhd->ignored)
471                         continue;
472
473                 for (i = 0; i < drhd->devices_cnt; i++)
474                         if (drhd->devices[i] &&
475                             drhd->devices[i]->bus->number == bus &&
476                             drhd->devices[i]->devfn == devfn)
477                                 return drhd->iommu;
478
479                 if (drhd->include_all)
480                         return drhd->iommu;
481         }
482
483         return NULL;
484 }
485
486 static void domain_flush_cache(struct dmar_domain *domain,
487                                void *addr, int size)
488 {
489         if (!domain->iommu_coherency)
490                 clflush_cache_range(addr, size);
491 }
492
493 /* Gets context entry for a given bus and devfn */
494 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
495                 u8 bus, u8 devfn)
496 {
497         struct root_entry *root;
498         struct context_entry *context;
499         unsigned long phy_addr;
500         unsigned long flags;
501
502         spin_lock_irqsave(&iommu->lock, flags);
503         root = &iommu->root_entry[bus];
504         context = get_context_addr_from_root(root);
505         if (!context) {
506                 context = (struct context_entry *)alloc_pgtable_page();
507                 if (!context) {
508                         spin_unlock_irqrestore(&iommu->lock, flags);
509                         return NULL;
510                 }
511                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
512                 phy_addr = virt_to_phys((void *)context);
513                 set_root_value(root, phy_addr);
514                 set_root_present(root);
515                 __iommu_flush_cache(iommu, root, sizeof(*root));
516         }
517         spin_unlock_irqrestore(&iommu->lock, flags);
518         return &context[devfn];
519 }
520
521 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
522 {
523         struct root_entry *root;
524         struct context_entry *context;
525         int ret;
526         unsigned long flags;
527
528         spin_lock_irqsave(&iommu->lock, flags);
529         root = &iommu->root_entry[bus];
530         context = get_context_addr_from_root(root);
531         if (!context) {
532                 ret = 0;
533                 goto out;
534         }
535         ret = context_present(&context[devfn]);
536 out:
537         spin_unlock_irqrestore(&iommu->lock, flags);
538         return ret;
539 }
540
541 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
542 {
543         struct root_entry *root;
544         struct context_entry *context;
545         unsigned long flags;
546
547         spin_lock_irqsave(&iommu->lock, flags);
548         root = &iommu->root_entry[bus];
549         context = get_context_addr_from_root(root);
550         if (context) {
551                 context_clear_entry(&context[devfn]);
552                 __iommu_flush_cache(iommu, &context[devfn], \
553                         sizeof(*context));
554         }
555         spin_unlock_irqrestore(&iommu->lock, flags);
556 }
557
558 static void free_context_table(struct intel_iommu *iommu)
559 {
560         struct root_entry *root;
561         int i;
562         unsigned long flags;
563         struct context_entry *context;
564
565         spin_lock_irqsave(&iommu->lock, flags);
566         if (!iommu->root_entry) {
567                 goto out;
568         }
569         for (i = 0; i < ROOT_ENTRY_NR; i++) {
570                 root = &iommu->root_entry[i];
571                 context = get_context_addr_from_root(root);
572                 if (context)
573                         free_pgtable_page(context);
574         }
575         free_pgtable_page(iommu->root_entry);
576         iommu->root_entry = NULL;
577 out:
578         spin_unlock_irqrestore(&iommu->lock, flags);
579 }
580
581 /* page table handling */
582 #define LEVEL_STRIDE            (9)
583 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
584
585 static inline int agaw_to_level(int agaw)
586 {
587         return agaw + 2;
588 }
589
590 static inline int agaw_to_width(int agaw)
591 {
592         return 30 + agaw * LEVEL_STRIDE;
593
594 }
595
596 static inline int width_to_agaw(int width)
597 {
598         return (width - 30) / LEVEL_STRIDE;
599 }
600
601 static inline unsigned int level_to_offset_bits(int level)
602 {
603         return (12 + (level - 1) * LEVEL_STRIDE);
604 }
605
606 static inline int address_level_offset(u64 addr, int level)
607 {
608         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
609 }
610
611 static inline u64 level_mask(int level)
612 {
613         return ((u64)-1 << level_to_offset_bits(level));
614 }
615
616 static inline u64 level_size(int level)
617 {
618         return ((u64)1 << level_to_offset_bits(level));
619 }
620
621 static inline u64 align_to_level(u64 addr, int level)
622 {
623         return ((addr + level_size(level) - 1) & level_mask(level));
624 }
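/*
 * Note: each page-table level decodes LEVEL_STRIDE (9) bits of the address.
 * For a 4-level table (agaw 2, 48-bit width), address_level_offset() picks
 * bits 39-47 at level 4, 30-38 at level 3, 21-29 at level 2 and 12-20 at
 * level 1, with bits 0-11 being the offset within the 4KiB page.
 */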
625
626 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
627 {
628         int addr_width = agaw_to_width(domain->agaw);
629         struct dma_pte *parent, *pte = NULL;
630         int level = agaw_to_level(domain->agaw);
631         int offset;
632         unsigned long flags;
633
634         BUG_ON(!domain->pgd);
635
636         addr &= (((u64)1) << addr_width) - 1;
637         parent = domain->pgd;
638
639         spin_lock_irqsave(&domain->mapping_lock, flags);
640         while (level > 0) {
641                 void *tmp_page;
642
643                 offset = address_level_offset(addr, level);
644                 pte = &parent[offset];
645                 if (level == 1)
646                         break;
647
648                 if (!dma_pte_present(pte)) {
649                         tmp_page = alloc_pgtable_page();
650
651                         if (!tmp_page) {
652                                 spin_unlock_irqrestore(&domain->mapping_lock,
653                                         flags);
654                                 return NULL;
655                         }
656                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
657                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
658                         /*
659                          * high level table always sets r/w, last level page
660                          * table control read/write
661                          */
662                         dma_set_pte_readable(pte);
663                         dma_set_pte_writable(pte);
664                         domain_flush_cache(domain, pte, sizeof(*pte));
665                 }
666                 parent = phys_to_virt(dma_pte_addr(pte));
667                 level--;
668         }
669
670         spin_unlock_irqrestore(&domain->mapping_lock, flags);
671         return pte;
672 }
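/*
 * Note: addr_to_dma_pte() walks from the top level down to level 1 and
 * allocates any missing intermediate table pages on the way; intermediate
 * entries are set both readable and writable, so access permissions are
 * effectively controlled only by the last-level pte it returns.
 */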
673
674 /* return address's pte at specific level */
675 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
676                 int level)
677 {
678         struct dma_pte *parent, *pte = NULL;
679         int total = agaw_to_level(domain->agaw);
680         int offset;
681
682         parent = domain->pgd;
683         while (level <= total) {
684                 offset = address_level_offset(addr, total);
685                 pte = &parent[offset];
686                 if (level == total)
687                         return pte;
688
689                 if (!dma_pte_present(pte))
690                         break;
691                 parent = phys_to_virt(dma_pte_addr(pte));
692                 total--;
693         }
694         return NULL;
695 }
696
697 /* clear one page's page table */
698 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
699 {
700         struct dma_pte *pte = NULL;
701
702         /* get last level pte */
703         pte = dma_addr_level_pte(domain, addr, 1);
704
705         if (pte) {
706                 dma_clear_pte(pte);
707                 domain_flush_cache(domain, pte, sizeof(*pte));
708         }
709 }
710
711 /* clear last level pte; a tlb flush should follow */
712 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
713 {
714         int addr_width = agaw_to_width(domain->agaw);
715
716         start &= (((u64)1) << addr_width) - 1;
717         end &= (((u64)1) << addr_width) - 1;
718         /* in case it's partial page */
719         start = PAGE_ALIGN(start);
720         end &= PAGE_MASK;
721
722         /* we don't need lock here, nobody else touches the iova range */
723         while (start < end) {
724                 dma_pte_clear_one(domain, start);
725                 start += VTD_PAGE_SIZE;
726         }
727 }
728
729 /* free page table pages. last level pte should already be cleared */
730 static void dma_pte_free_pagetable(struct dmar_domain *domain,
731         u64 start, u64 end)
732 {
733         int addr_width = agaw_to_width(domain->agaw);
734         struct dma_pte *pte;
735         int total = agaw_to_level(domain->agaw);
736         int level;
737         u64 tmp;
738
739         start &= (((u64)1) << addr_width) - 1;
740         end &= (((u64)1) << addr_width) - 1;
741
742         /* we don't need lock here, nobody else touches the iova range */
743         level = 2;
744         while (level <= total) {
745                 tmp = align_to_level(start, level);
746                 if (tmp >= end || (tmp + level_size(level) > end))
747                         return;
748
749                 while (tmp < end) {
750                         pte = dma_addr_level_pte(domain, tmp, level);
751                         if (pte) {
752                                 free_pgtable_page(
753                                         phys_to_virt(dma_pte_addr(pte)));
754                                 dma_clear_pte(pte);
755                                 domain_flush_cache(domain, pte, sizeof(*pte));
756                         }
757                         tmp += level_size(level);
758                 }
759                 level++;
760         }
761         /* free pgd */
762         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
763                 free_pgtable_page(domain->pgd);
764                 domain->pgd = NULL;
765         }
766 }
767
768 /* iommu handling */
769 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
770 {
771         struct root_entry *root;
772         unsigned long flags;
773
774         root = (struct root_entry *)alloc_pgtable_page();
775         if (!root)
776                 return -ENOMEM;
777
778         __iommu_flush_cache(iommu, root, ROOT_SIZE);
779
780         spin_lock_irqsave(&iommu->lock, flags);
781         iommu->root_entry = root;
782         spin_unlock_irqrestore(&iommu->lock, flags);
783
784         return 0;
785 }
786
787 static void iommu_set_root_entry(struct intel_iommu *iommu)
788 {
789         void *addr;
790         u32 cmd, sts;
791         unsigned long flag;
792
793         addr = iommu->root_entry;
794
795         spin_lock_irqsave(&iommu->register_lock, flag);
796         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
797
798         cmd = iommu->gcmd | DMA_GCMD_SRTP;
799         writel(cmd, iommu->reg + DMAR_GCMD_REG);
800
801         /* Make sure hardware completes it */
802         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
803                 readl, (sts & DMA_GSTS_RTPS), sts);
804
805         spin_unlock_irqrestore(&iommu->register_lock, flag);
806 }
807
808 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
809 {
810         u32 val;
811         unsigned long flag;
812
813         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
814                 return;
815         val = iommu->gcmd | DMA_GCMD_WBF;
816
817         spin_lock_irqsave(&iommu->register_lock, flag);
818         writel(val, iommu->reg + DMAR_GCMD_REG);
819
820         /* Make sure hardware completes it */
821         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
822                         readl, (!(val & DMA_GSTS_WBFS)), val);
823
824         spin_unlock_irqrestore(&iommu->register_lock, flag);
825 }
826
827 /* return value determines whether we need a write buffer flush */
828 static int __iommu_flush_context(struct intel_iommu *iommu,
829         u16 did, u16 source_id, u8 function_mask, u64 type,
830         int non_present_entry_flush)
831 {
832         u64 val = 0;
833         unsigned long flag;
834
835         /*
836          * In the non-present entry flush case, if hardware doesn't cache
837          * non-present entries we do nothing; if hardware does cache non-present
838          * entries, we flush entries of domain 0 (the domain id used to cache
839          * any non-present entries).
840          */
841         if (non_present_entry_flush) {
842                 if (!cap_caching_mode(iommu->cap))
843                         return 1;
844                 else
845                         did = 0;
846         }
847
848         switch (type) {
849         case DMA_CCMD_GLOBAL_INVL:
850                 val = DMA_CCMD_GLOBAL_INVL;
851                 break;
852         case DMA_CCMD_DOMAIN_INVL:
853                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
854                 break;
855         case DMA_CCMD_DEVICE_INVL:
856                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
857                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
858                 break;
859         default:
860                 BUG();
861         }
862         val |= DMA_CCMD_ICC;
863
864         spin_lock_irqsave(&iommu->register_lock, flag);
865         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
866
867         /* Make sure hardware completes it */
868         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
869                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
870
871         spin_unlock_irqrestore(&iommu->register_lock, flag);
872
873         /* flush context entry will implicitly flush write buffer */
874         return 0;
875 }
876
877 /* return value determines whether we need a write buffer flush */
878 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
879         u64 addr, unsigned int size_order, u64 type,
880         int non_present_entry_flush)
881 {
882         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
883         u64 val = 0, val_iva = 0;
884         unsigned long flag;
885
886         /*
887          * In the non-present entry flush case, if hardware doesn't cache
888          * non-present entries we do nothing; if hardware does cache non-present
889          * entries, we flush entries of domain 0 (the domain id used to cache
890          * any non-present entries).
891          */
892         if (non_present_entry_flush) {
893                 if (!cap_caching_mode(iommu->cap))
894                         return 1;
895                 else
896                         did = 0;
897         }
898
899         switch (type) {
900         case DMA_TLB_GLOBAL_FLUSH:
901                 /* global flush doesn't need to set IVA_REG */
902                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
903                 break;
904         case DMA_TLB_DSI_FLUSH:
905                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
906                 break;
907         case DMA_TLB_PSI_FLUSH:
908                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
909                 /* Note: always flush non-leaf currently */
910                 val_iva = size_order | addr;
911                 break;
912         default:
913                 BUG();
914         }
915         /* Note: set drain read/write */
916 #if 0
917         /*
918          * This is probably only here to be extra safe; it looks like we can
919          * ignore it without any impact.
920          */
921         if (cap_read_drain(iommu->cap))
922                 val |= DMA_TLB_READ_DRAIN;
923 #endif
924         if (cap_write_drain(iommu->cap))
925                 val |= DMA_TLB_WRITE_DRAIN;
926
927         spin_lock_irqsave(&iommu->register_lock, flag);
928         /* Note: Only uses first TLB reg currently */
929         if (val_iva)
930                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
931         dmar_writeq(iommu->reg + tlb_offset + 8, val);
932
933         /* Make sure hardware completes it */
934         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
935                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
936
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938
939         /* check IOTLB invalidation granularity */
940         if (DMA_TLB_IAIG(val) == 0)
941                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
942         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
943                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
944                         (unsigned long long)DMA_TLB_IIRG(type),
945                         (unsigned long long)DMA_TLB_IAIG(val));
946         /* flush iotlb entry will implicitly flush write buffer */
947         return 0;
948 }
949
950 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
951         u64 addr, unsigned int pages, int non_present_entry_flush)
952 {
953         unsigned int mask;
954
955         BUG_ON(addr & (~VTD_PAGE_MASK));
956         BUG_ON(pages == 0);
957
958         /* Fallback to domain selective flush if no PSI support */
959         if (!cap_pgsel_inv(iommu->cap))
960                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
961                                                 DMA_TLB_DSI_FLUSH,
962                                                 non_present_entry_flush);
963
964         /*
965          * PSI requires page size to be 2 ^ x, and the base address is naturally
966          * aligned to the size
967          */
968         mask = ilog2(__roundup_pow_of_two(pages));
969         /* Fallback to domain selective flush if size is too big */
970         if (mask > cap_max_amask_val(iommu->cap))
971                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
972                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
973
974         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
975                                         DMA_TLB_PSI_FLUSH,
976                                         non_present_entry_flush);
977 }
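/*
 * Note: the PSI mask is the log2 of the (rounded up) page count, e.g.
 * flushing 9 pages rounds up to 16 and yields mask == 4, i.e. a 16-page
 * (64KiB) invalidation; requests larger than the hardware's maximum address
 * mask fall back to a domain-selective flush.
 */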
978
979 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
980 {
981         u32 pmen;
982         unsigned long flags;
983
984         spin_lock_irqsave(&iommu->register_lock, flags);
985         pmen = readl(iommu->reg + DMAR_PMEN_REG);
986         pmen &= ~DMA_PMEN_EPM;
987         writel(pmen, iommu->reg + DMAR_PMEN_REG);
988
989         /* wait for the protected region status bit to clear */
990         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
991                 readl, !(pmen & DMA_PMEN_PRS), pmen);
992
993         spin_unlock_irqrestore(&iommu->register_lock, flags);
994 }
995
996 static int iommu_enable_translation(struct intel_iommu *iommu)
997 {
998         u32 sts;
999         unsigned long flags;
1000
1001         spin_lock_irqsave(&iommu->register_lock, flags);
1002         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1003
1004         /* Make sure hardware completes it */
1005         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1006                 readl, (sts & DMA_GSTS_TES), sts);
1007
1008         iommu->gcmd |= DMA_GCMD_TE;
1009         spin_unlock_irqrestore(&iommu->register_lock, flags);
1010         return 0;
1011 }
1012
1013 static int iommu_disable_translation(struct intel_iommu *iommu)
1014 {
1015         u32 sts;
1016         unsigned long flag;
1017
1018         spin_lock_irqsave(&iommu->register_lock, flag);
1019         iommu->gcmd &= ~DMA_GCMD_TE;
1020         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1021
1022         /* Make sure hardware completes it */
1023         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1024                 readl, (!(sts & DMA_GSTS_TES)), sts);
1025
1026         spin_unlock_irqrestore(&iommu->register_lock, flag);
1027         return 0;
1028 }
1029
1030 /* iommu interrupt handling. Most of it is MSI-like. */
1031
1032 static const char *fault_reason_strings[] =
1033 {
1034         "Software",
1035         "Present bit in root entry is clear",
1036         "Present bit in context entry is clear",
1037         "Invalid context entry",
1038         "Access beyond MGAW",
1039         "PTE Write access is not set",
1040         "PTE Read access is not set",
1041         "Next page table ptr is invalid",
1042         "Root table address invalid",
1043         "Context table ptr is invalid",
1044         "non-zero reserved fields in RTP",
1045         "non-zero reserved fields in CTP",
1046         "non-zero reserved fields in PTE",
1047 };
1048 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1049
1050 const char *dmar_get_fault_reason(u8 fault_reason)
1051 {
1052         if (fault_reason > MAX_FAULT_REASON_IDX)
1053                 return "Unknown";
1054         else
1055                 return fault_reason_strings[fault_reason];
1056 }
1057
1058 void dmar_msi_unmask(unsigned int irq)
1059 {
1060         struct intel_iommu *iommu = get_irq_data(irq);
1061         unsigned long flag;
1062
1063         /* unmask it */
1064         spin_lock_irqsave(&iommu->register_lock, flag);
1065         writel(0, iommu->reg + DMAR_FECTL_REG);
1066         /* Read a reg to force flush the post write */
1067         readl(iommu->reg + DMAR_FECTL_REG);
1068         spin_unlock_irqrestore(&iommu->register_lock, flag);
1069 }
1070
1071 void dmar_msi_mask(unsigned int irq)
1072 {
1073         unsigned long flag;
1074         struct intel_iommu *iommu = get_irq_data(irq);
1075
1076         /* mask it */
1077         spin_lock_irqsave(&iommu->register_lock, flag);
1078         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1079         /* Read a reg to force flush the post write */
1080         readl(iommu->reg + DMAR_FECTL_REG);
1081         spin_unlock_irqrestore(&iommu->register_lock, flag);
1082 }
1083
1084 void dmar_msi_write(int irq, struct msi_msg *msg)
1085 {
1086         struct intel_iommu *iommu = get_irq_data(irq);
1087         unsigned long flag;
1088
1089         spin_lock_irqsave(&iommu->register_lock, flag);
1090         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1091         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1092         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1093         spin_unlock_irqrestore(&iommu->register_lock, flag);
1094 }
1095
1096 void dmar_msi_read(int irq, struct msi_msg *msg)
1097 {
1098         struct intel_iommu *iommu = get_irq_data(irq);
1099         unsigned long flag;
1100
1101         spin_lock_irqsave(&iommu->register_lock, flag);
1102         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1103         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1104         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1105         spin_unlock_irqrestore(&iommu->register_lock, flag);
1106 }
1107
1108 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1109                 u8 fault_reason, u16 source_id, unsigned long long addr)
1110 {
1111         const char *reason;
1112
1113         reason = dmar_get_fault_reason(fault_reason);
1114
1115         printk(KERN_ERR
1116                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1117                 "fault addr %llx \n"
1118                 "DMAR:[fault reason %02d] %s\n",
1119                 (type ? "DMA Read" : "DMA Write"),
1120                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1121                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1122         return 0;
1123 }
1124
1125 #define PRIMARY_FAULT_REG_LEN (16)
1126 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1127 {
1128         struct intel_iommu *iommu = dev_id;
1129         int reg, fault_index;
1130         u32 fault_status;
1131         unsigned long flag;
1132
1133         spin_lock_irqsave(&iommu->register_lock, flag);
1134         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1135
1136         /* TBD: ignore advanced fault log currently */
1137         if (!(fault_status & DMA_FSTS_PPF))
1138                 goto clear_overflow;
1139
1140         fault_index = dma_fsts_fault_record_index(fault_status);
1141         reg = cap_fault_reg_offset(iommu->cap);
1142         while (1) {
1143                 u8 fault_reason;
1144                 u16 source_id;
1145                 u64 guest_addr;
1146                 int type;
1147                 u32 data;
1148
1149                 /* highest 32 bits */
1150                 data = readl(iommu->reg + reg +
1151                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1152                 if (!(data & DMA_FRCD_F))
1153                         break;
1154
1155                 fault_reason = dma_frcd_fault_reason(data);
1156                 type = dma_frcd_type(data);
1157
1158                 data = readl(iommu->reg + reg +
1159                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1160                 source_id = dma_frcd_source_id(data);
1161
1162                 guest_addr = dmar_readq(iommu->reg + reg +
1163                                 fault_index * PRIMARY_FAULT_REG_LEN);
1164                 guest_addr = dma_frcd_page_addr(guest_addr);
1165                 /* clear the fault */
1166                 writel(DMA_FRCD_F, iommu->reg + reg +
1167                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1168
1169                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1170
1171                 iommu_page_fault_do_one(iommu, type, fault_reason,
1172                                 source_id, guest_addr);
1173
1174                 fault_index++;
1175                 if (fault_index > cap_num_fault_regs(iommu->cap))
1176                         fault_index = 0;
1177                 spin_lock_irqsave(&iommu->register_lock, flag);
1178         }
1179 clear_overflow:
1180         /* clear primary fault overflow */
1181         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1182         if (fault_status & DMA_FSTS_PFO)
1183                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1184
1185         spin_unlock_irqrestore(&iommu->register_lock, flag);
1186         return IRQ_HANDLED;
1187 }
1188
1189 int dmar_set_interrupt(struct intel_iommu *iommu)
1190 {
1191         int irq, ret;
1192
1193         irq = create_irq();
1194         if (!irq) {
1195                 printk(KERN_ERR "IOMMU: no free vectors\n");
1196                 return -EINVAL;
1197         }
1198
1199         set_irq_data(irq, iommu);
1200         iommu->irq = irq;
1201
1202         ret = arch_setup_dmar_msi(irq);
1203         if (ret) {
1204                 set_irq_data(irq, NULL);
1205                 iommu->irq = 0;
1206                 destroy_irq(irq);
1207                 return 0;
1208         }
1209
1210         /* Make sure the fault registers are cleared before requesting the irq */
1211         iommu_page_fault(irq, iommu);
1212
1213         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1214         if (ret)
1215                 printk(KERN_ERR "IOMMU: can't request irq\n");
1216         return ret;
1217 }
1218
1219 static int iommu_init_domains(struct intel_iommu *iommu)
1220 {
1221         unsigned long ndomains;
1222         unsigned long nlongs;
1223
1224         ndomains = cap_ndoms(iommu->cap);
1225         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1226         nlongs = BITS_TO_LONGS(ndomains);
1227
1228         /* TBD: there might be 64K domains,
1229          * consider other allocation for future chip
1230          */
1231         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1232         if (!iommu->domain_ids) {
1233                 printk(KERN_ERR "Allocating domain id array failed\n");
1234                 return -ENOMEM;
1235         }
1236         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1237                         GFP_KERNEL);
1238         if (!iommu->domains) {
1239                 printk(KERN_ERR "Allocating domain array failed\n");
1240                 kfree(iommu->domain_ids);
1241                 return -ENOMEM;
1242         }
1243
1244         spin_lock_init(&iommu->lock);
1245
1246         /*
1247          * if Caching mode is set, then invalid translations are tagged
1248          * with domainid 0. Hence we need to pre-allocate it.
1249          */
1250         if (cap_caching_mode(iommu->cap))
1251                 set_bit(0, iommu->domain_ids);
1252         return 0;
1253 }
1254
1255
1256 static void domain_exit(struct dmar_domain *domain);
1257 static void vm_domain_exit(struct dmar_domain *domain);
1258
1259 void free_dmar_iommu(struct intel_iommu *iommu)
1260 {
1261         struct dmar_domain *domain;
1262         int i;
1263         unsigned long flags;
1264
1265         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1266         for (; i < cap_ndoms(iommu->cap); ) {
1267                 domain = iommu->domains[i];
1268                 clear_bit(i, iommu->domain_ids);
1269
1270                 spin_lock_irqsave(&domain->iommu_lock, flags);
1271                 if (--domain->iommu_count == 0) {
1272                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1273                                 vm_domain_exit(domain);
1274                         else
1275                                 domain_exit(domain);
1276                 }
1277                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1278
1279                 i = find_next_bit(iommu->domain_ids,
1280                         cap_ndoms(iommu->cap), i+1);
1281         }
1282
1283         if (iommu->gcmd & DMA_GCMD_TE)
1284                 iommu_disable_translation(iommu);
1285
1286         if (iommu->irq) {
1287                 set_irq_data(iommu->irq, NULL);
1288                 /* This will mask the irq */
1289                 free_irq(iommu->irq, iommu);
1290                 destroy_irq(iommu->irq);
1291         }
1292
1293         kfree(iommu->domains);
1294         kfree(iommu->domain_ids);
1295
1296         g_iommus[iommu->seq_id] = NULL;
1297
1298         /* if all iommus are freed, free g_iommus */
1299         for (i = 0; i < g_num_of_iommus; i++) {
1300                 if (g_iommus[i])
1301                         break;
1302         }
1303
1304         if (i == g_num_of_iommus)
1305                 kfree(g_iommus);
1306
1307         /* free context mapping */
1308         free_context_table(iommu);
1309 }
1310
1311 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1312 {
1313         unsigned long num;
1314         unsigned long ndomains;
1315         struct dmar_domain *domain;
1316         unsigned long flags;
1317
1318         domain = alloc_domain_mem();
1319         if (!domain)
1320                 return NULL;
1321
1322         ndomains = cap_ndoms(iommu->cap);
1323
1324         spin_lock_irqsave(&iommu->lock, flags);
1325         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1326         if (num >= ndomains) {
1327                 spin_unlock_irqrestore(&iommu->lock, flags);
1328                 free_domain_mem(domain);
1329                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1330                 return NULL;
1331         }
1332
1333         set_bit(num, iommu->domain_ids);
1334         domain->id = num;
1335         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1336         set_bit(iommu->seq_id, &domain->iommu_bmp);
1337         domain->flags = 0;
1338         iommu->domains[num] = domain;
1339         spin_unlock_irqrestore(&iommu->lock, flags);
1340
1341         return domain;
1342 }
1343
1344 static void iommu_free_domain(struct dmar_domain *domain)
1345 {
1346         unsigned long flags;
1347         struct intel_iommu *iommu;
1348
1349         iommu = domain_get_iommu(domain);
1350
1351         spin_lock_irqsave(&iommu->lock, flags);
1352         clear_bit(domain->id, iommu->domain_ids);
1353         spin_unlock_irqrestore(&iommu->lock, flags);
1354 }
1355
1356 static struct iova_domain reserved_iova_list;
1357 static struct lock_class_key reserved_alloc_key;
1358 static struct lock_class_key reserved_rbtree_key;
1359
1360 static void dmar_init_reserved_ranges(void)
1361 {
1362         struct pci_dev *pdev = NULL;
1363         struct iova *iova;
1364         int i;
1365         u64 addr, size;
1366
1367         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1368
1369         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1370                 &reserved_alloc_key);
1371         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1372                 &reserved_rbtree_key);
1373
1374         /* IOAPIC ranges shouldn't be accessed by DMA */
1375         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1376                 IOVA_PFN(IOAPIC_RANGE_END));
1377         if (!iova)
1378                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1379
1380         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1381         for_each_pci_dev(pdev) {
1382                 struct resource *r;
1383
1384                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1385                         r = &pdev->resource[i];
1386                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1387                                 continue;
1388                         addr = r->start;
1389                         addr &= PAGE_MASK;
1390                         size = r->end - addr;
1391                         size = PAGE_ALIGN(size);
1392                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1393                                 IOVA_PFN(size + addr) - 1);
1394                         if (!iova)
1395                                 printk(KERN_ERR "Reserve iova failed\n");
1396                 }
1397         }
1398
1399 }
1400
1401 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1402 {
1403         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1404 }
1405
1406 static inline int guestwidth_to_adjustwidth(int gaw)
1407 {
1408         int agaw;
1409         int r = (gaw - 12) % 9;
1410
1411         if (r == 0)
1412                 agaw = gaw;
1413         else
1414                 agaw = gaw + 9 - r;
1415         if (agaw > 64)
1416                 agaw = 64;
1417         return agaw;
1418 }
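/*
 * Note: this rounds the guest address width up so that (gaw - 12) is a
 * multiple of the 9-bit level stride, capped at 64; e.g. a gaw of 36 becomes
 * 39 (r = 24 % 9 = 6, 36 + 9 - 6 = 39), while 39 and 48 are unchanged.
 */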
1419
1420 static int domain_init(struct dmar_domain *domain, int guest_width)
1421 {
1422         struct intel_iommu *iommu;
1423         int adjust_width, agaw;
1424         unsigned long sagaw;
1425
1426         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1427         spin_lock_init(&domain->mapping_lock);
1428         spin_lock_init(&domain->iommu_lock);
1429
1430         domain_reserve_special_ranges(domain);
1431
1432         /* calculate AGAW */
1433         iommu = domain_get_iommu(domain);
1434         if (guest_width > cap_mgaw(iommu->cap))
1435                 guest_width = cap_mgaw(iommu->cap);
1436         domain->gaw = guest_width;
1437         adjust_width = guestwidth_to_adjustwidth(guest_width);
1438         agaw = width_to_agaw(adjust_width);
1439         sagaw = cap_sagaw(iommu->cap);
1440         if (!test_bit(agaw, &sagaw)) {
1441                 /* hardware doesn't support it, choose a bigger one */
1442                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1443                 agaw = find_next_bit(&sagaw, 5, agaw);
1444                 if (agaw >= 5)
1445                         return -ENODEV;
1446         }
1447         domain->agaw = agaw;
1448         INIT_LIST_HEAD(&domain->devices);
1449
1450         if (ecap_coherent(iommu->ecap))
1451                 domain->iommu_coherency = 1;
1452         else
1453                 domain->iommu_coherency = 0;
1454
1455         if (ecap_sc_support(iommu->ecap))
1456                 domain->iommu_snooping = 1;
1457         else
1458                 domain->iommu_snooping = 0;
1459
1460         domain->iommu_count = 1;
1461
1462         /* always allocate the top pgd */
1463         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1464         if (!domain->pgd)
1465                 return -ENOMEM;
1466         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1467         return 0;
1468 }
1469
1470 static void domain_exit(struct dmar_domain *domain)
1471 {
1472         u64 end;
1473
1474         /* Domain 0 is reserved, so don't process it */
1475         if (!domain)
1476                 return;
1477
1478         domain_remove_dev_info(domain);
1479         /* destroy iovas */
1480         put_iova_domain(&domain->iovad);
1481         end = DOMAIN_MAX_ADDR(domain->gaw);
1482         end = end & (~PAGE_MASK);
1483
1484         /* clear ptes */
1485         dma_pte_clear_range(domain, 0, end);
1486
1487         /* free page tables */
1488         dma_pte_free_pagetable(domain, 0, end);
1489
1490         iommu_free_domain(domain);
1491         free_domain_mem(domain);
1492 }
1493
1494 static int domain_context_mapping_one(struct dmar_domain *domain,
1495                 u8 bus, u8 devfn)
1496 {
1497         struct context_entry *context;
1498         unsigned long flags;
1499         struct intel_iommu *iommu;
1500         struct dma_pte *pgd;
1501         unsigned long num;
1502         unsigned long ndomains;
1503         int id;
1504         int agaw;
1505
1506         pr_debug("Set context mapping for %02x:%02x.%d\n",
1507                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1508         BUG_ON(!domain->pgd);
1509
1510         iommu = device_to_iommu(bus, devfn);
1511         if (!iommu)
1512                 return -ENODEV;
1513
1514         context = device_to_context_entry(iommu, bus, devfn);
1515         if (!context)
1516                 return -ENOMEM;
1517         spin_lock_irqsave(&iommu->lock, flags);
1518         if (context_present(context)) {
1519                 spin_unlock_irqrestore(&iommu->lock, flags);
1520                 return 0;
1521         }
1522
1523         id = domain->id;
1524         pgd = domain->pgd;
1525
1526         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1527                 int found = 0;
1528
1529                 /* find an available domain id for this device in iommu */
1530                 ndomains = cap_ndoms(iommu->cap);
1531                 num = find_first_bit(iommu->domain_ids, ndomains);
1532                 for (; num < ndomains; ) {
1533                         if (iommu->domains[num] == domain) {
1534                                 id = num;
1535                                 found = 1;
1536                                 break;
1537                         }
1538                         num = find_next_bit(iommu->domain_ids,
1539                                             cap_ndoms(iommu->cap), num+1);
1540                 }
1541
1542                 if (found == 0) {
1543                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1544                         if (num >= ndomains) {
1545                                 spin_unlock_irqrestore(&iommu->lock, flags);
1546                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1547                                 return -EFAULT;
1548                         }
1549
1550                         set_bit(num, iommu->domain_ids);
1551                         iommu->domains[num] = domain;
1552                         id = num;
1553                 }
1554
1555                 /* Skip top levels of page tables for
1556                  * iommus which have a smaller agaw than the default.
1557                  */
1558                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1559                         pgd = phys_to_virt(dma_pte_addr(pgd));
1560                         if (!dma_pte_present(pgd)) {
1561                                 spin_unlock_irqrestore(&iommu->lock, flags);
1562                                 return -ENOMEM;
1563                         }
1564                 }
1565         }
1566
1567         context_set_domain_id(context, id);
1568         context_set_address_width(context, iommu->agaw);
1569         context_set_address_root(context, virt_to_phys(pgd));
1570         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1571         context_set_fault_enable(context);
1572         context_set_present(context);
1573         domain_flush_cache(domain, context, sizeof(*context));
1574
1575         /* it's a non-present to present mapping */
1576         if (iommu->flush.flush_context(iommu, domain->id,
1577                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1578                 DMA_CCMD_DEVICE_INVL, 1))
1579                 iommu_flush_write_buffer(iommu);
1580         else
1581                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1582
1583         spin_unlock_irqrestore(&iommu->lock, flags);
1584
1585         spin_lock_irqsave(&domain->iommu_lock, flags);
1586         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1587                 domain->iommu_count++;
1588                 domain_update_iommu_cap(domain);
1589         }
1590         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1591         return 0;
1592 }
1593
1594 static int
1595 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1596 {
1597         int ret;
1598         struct pci_dev *tmp, *parent;
1599
1600         ret = domain_context_mapping_one(domain, pdev->bus->number,
1601                 pdev->devfn);
1602         if (ret)
1603                 return ret;
1604
1605         /* dependent device mapping */
1606         tmp = pci_find_upstream_pcie_bridge(pdev);
1607         if (!tmp)
1608                 return 0;
1609         /* Secondary interface's bus number and devfn 0 */
1610         parent = pdev->bus->self;
1611         while (parent != tmp) {
1612                 ret = domain_context_mapping_one(domain, parent->bus->number,
1613                         parent->devfn);
1614                 if (ret)
1615                         return ret;
1616                 parent = parent->bus->self;
1617         }
1618         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1619                 return domain_context_mapping_one(domain,
1620                         tmp->subordinate->number, 0);
1621         else /* this is a legacy PCI bridge */
1622                 return domain_context_mapping_one(domain,
1623                         tmp->bus->number, tmp->devfn);
1624 }
1625
1626 static int domain_context_mapped(struct pci_dev *pdev)
1627 {
1628         int ret;
1629         struct pci_dev *tmp, *parent;
1630         struct intel_iommu *iommu;
1631
1632         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1633         if (!iommu)
1634                 return -ENODEV;
1635
1636         ret = device_context_mapped(iommu,
1637                 pdev->bus->number, pdev->devfn);
1638         if (!ret)
1639                 return ret;
1640         /* dependent device mapping */
1641         tmp = pci_find_upstream_pcie_bridge(pdev);
1642         if (!tmp)
1643                 return ret;
1644         /* Secondary interface's bus number and devfn 0 */
1645         parent = pdev->bus->self;
1646         while (parent != tmp) {
1647                 ret = device_context_mapped(iommu, parent->bus->number,
1648                         parent->devfn);
1649                 if (!ret)
1650                         return ret;
1651                 parent = parent->bus->self;
1652         }
1653         if (tmp->is_pcie)
1654                 return device_context_mapped(iommu,
1655                         tmp->subordinate->number, 0);
1656         else
1657                 return device_context_mapped(iommu,
1658                         tmp->bus->number, tmp->devfn);
1659 }
1660
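/*
 * Map the range [iova, iova + size) to host physical [hpa, hpa + size) in
 * 4KiB VT-d pages.  Illustrative call (not taken from this file): mapping a
 * single page read/write would look like
 *
 *      domain_page_mapping(domain, iova, hpa, VTD_PAGE_SIZE,
 *                          DMA_PTE_READ | DMA_PTE_WRITE);
 */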
1661 static int
1662 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1663                         u64 hpa, size_t size, int prot)
1664 {
1665         u64 start_pfn, end_pfn;
1666         struct dma_pte *pte;
1667         int index;
1668         int addr_width = agaw_to_width(domain->agaw);
1669
1670         hpa &= (((u64)1) << addr_width) - 1;
1671
1672         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1673                 return -EINVAL;
1674         iova &= PAGE_MASK;
1675         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1676         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1677         index = 0;
1678         while (start_pfn < end_pfn) {
1679                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1680                 if (!pte)
1681                         return -ENOMEM;
1682                 /* We don't need a lock here; nobody else
1683                  * touches this iova range.
1684                  */
1685                 BUG_ON(dma_pte_addr(pte));
1686                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1687                 dma_set_pte_prot(pte, prot);
1688                 domain_flush_cache(domain, pte, sizeof(*pte));
1689                 start_pfn++;
1690                 index++;
1691         }
1692         return 0;
1693 }
1694
1695 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1696 {
1697         if (!iommu)
1698                 return;
1699
1700         clear_context_table(iommu, bus, devfn);
1701         iommu->flush.flush_context(iommu, 0, 0, 0,
1702                                            DMA_CCMD_GLOBAL_INVL, 0);
1703         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1704                                          DMA_TLB_GLOBAL_FLUSH, 0);
1705 }
1706
1707 static void domain_remove_dev_info(struct dmar_domain *domain)
1708 {
1709         struct device_domain_info *info;
1710         unsigned long flags;
1711         struct intel_iommu *iommu;
1712
1713         spin_lock_irqsave(&device_domain_lock, flags);
1714         while (!list_empty(&domain->devices)) {
1715                 info = list_entry(domain->devices.next,
1716                         struct device_domain_info, link);
1717                 list_del(&info->link);
1718                 list_del(&info->global);
1719                 if (info->dev)
1720                         info->dev->dev.archdata.iommu = NULL;
1721                 spin_unlock_irqrestore(&device_domain_lock, flags);
1722
1723                 iommu = device_to_iommu(info->bus, info->devfn);
1724                 iommu_detach_dev(iommu, info->bus, info->devfn);
1725                 free_devinfo_mem(info);
1726
1727                 spin_lock_irqsave(&device_domain_lock, flags);
1728         }
1729         spin_unlock_irqrestore(&device_domain_lock, flags);
1730 }
1731
1732 /*
1733  * find_domain
1734  * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info.
1735  */
1736 static struct dmar_domain *
1737 find_domain(struct pci_dev *pdev)
1738 {
1739         struct device_domain_info *info;
1740
1741         /* No lock here; assumes no domain exits in the normal case */
1742         info = pdev->dev.archdata.iommu;
1743         if (info)
1744                 return info->domain;
1745         return NULL;
1746 }
1747
1748 /* Find or create the domain for a device; the returned domain is initialized */
1749 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1750 {
1751         struct dmar_domain *domain, *found = NULL;
1752         struct intel_iommu *iommu;
1753         struct dmar_drhd_unit *drhd;
1754         struct device_domain_info *info, *tmp;
1755         struct pci_dev *dev_tmp;
1756         unsigned long flags;
1757         int bus = 0, devfn = 0;
1758
1759         domain = find_domain(pdev);
1760         if (domain)
1761                 return domain;
1762
1763         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1764         if (dev_tmp) {
1765                 if (dev_tmp->is_pcie) {
1766                         bus = dev_tmp->subordinate->number;
1767                         devfn = 0;
1768                 } else {
1769                         bus = dev_tmp->bus->number;
1770                         devfn = dev_tmp->devfn;
1771                 }
1772                 spin_lock_irqsave(&device_domain_lock, flags);
1773                 list_for_each_entry(info, &device_domain_list, global) {
1774                         if (info->bus == bus && info->devfn == devfn) {
1775                                 found = info->domain;
1776                                 break;
1777                         }
1778                 }
1779                 spin_unlock_irqrestore(&device_domain_lock, flags);
1780                 /* pcie-pci bridge already has a domain, use it */
1781                 if (found) {
1782                         domain = found;
1783                         goto found_domain;
1784                 }
1785         }
1786
1787         /* Allocate new domain for the device */
1788         drhd = dmar_find_matched_drhd_unit(pdev);
1789         if (!drhd) {
1790                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1791                         pci_name(pdev));
1792                 return NULL;
1793         }
1794         iommu = drhd->iommu;
1795
1796         domain = iommu_alloc_domain(iommu);
1797         if (!domain)
1798                 goto error;
1799
1800         if (domain_init(domain, gaw)) {
1801                 domain_exit(domain);
1802                 goto error;
1803         }
1804
1805         /* register pcie-to-pci device */
1806         if (dev_tmp) {
1807                 info = alloc_devinfo_mem();
1808                 if (!info) {
1809                         domain_exit(domain);
1810                         goto error;
1811                 }
1812                 info->bus = bus;
1813                 info->devfn = devfn;
1814                 info->dev = NULL;
1815                 info->domain = domain;
1816                 /* This domain is shared by devices under the p2p bridge */
1817                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1818
1819                 /* pcie-to-pci bridge already has a domain, use it */
1820                 found = NULL;
1821                 spin_lock_irqsave(&device_domain_lock, flags);
1822                 list_for_each_entry(tmp, &device_domain_list, global) {
1823                         if (tmp->bus == bus && tmp->devfn == devfn) {
1824                                 found = tmp->domain;
1825                                 break;
1826                         }
1827                 }
1828                 if (found) {
1829                         free_devinfo_mem(info);
1830                         domain_exit(domain);
1831                         domain = found;
1832                 } else {
1833                         list_add(&info->link, &domain->devices);
1834                         list_add(&info->global, &device_domain_list);
1835                 }
1836                 spin_unlock_irqrestore(&device_domain_lock, flags);
1837         }
1838
1839 found_domain:
1840         info = alloc_devinfo_mem();
1841         if (!info)
1842                 goto error;
1843         info->bus = pdev->bus->number;
1844         info->devfn = pdev->devfn;
1845         info->dev = pdev;
1846         info->domain = domain;
1847         spin_lock_irqsave(&device_domain_lock, flags);
1848         /* somebody else was faster and already set up the domain */
1849         found = find_domain(pdev);
1850         if (found != NULL) {
1851                 spin_unlock_irqrestore(&device_domain_lock, flags);
1852                 if (found != domain) {
1853                         domain_exit(domain);
1854                         domain = found;
1855                 }
1856                 free_devinfo_mem(info);
1857                 return domain;
1858         }
1859         list_add(&info->link, &domain->devices);
1860         list_add(&info->global, &device_domain_list);
1861         pdev->dev.archdata.iommu = info;
1862         spin_unlock_irqrestore(&device_domain_lock, flags);
1863         return domain;
1864 error:
1865         /* recheck it here, maybe somebody else set it meanwhile */
1866         return find_domain(pdev);
1867 }
1868
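/*
 * Create a 1:1 (identity) mapping covering [start, end) for @pdev.  Used for
 * RMRR regions and for the graphics and ISA/floppy workarounds below.
 */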
1869 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1870                                       unsigned long long start,
1871                                       unsigned long long end)
1872 {
1873         struct dmar_domain *domain;
1874         unsigned long size;
1875         unsigned long long base;
1876         int ret;
1877
1878         printk(KERN_INFO
1879                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1880                 pci_name(pdev), start, end);
1881         /* page table init */
1882         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1883         if (!domain)
1884                 return -ENOMEM;
1885
1886         /* The address might not be aligned */
1887         base = start & PAGE_MASK;
1888         size = end - base;
1889         size = PAGE_ALIGN(size);
1890         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1891                         IOVA_PFN(base + size) - 1)) {
1892                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1893                 ret = -ENOMEM;
1894                 goto error;
1895         }
1896
1897         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1898                 size, base, pci_name(pdev));
1899         /*
1900          * The RMRR range might overlap with the physical memory range;
1901          * clear it first.
1902          */
1903         dma_pte_clear_range(domain, base, base + size);
1904
1905         ret = domain_page_mapping(domain, base, base, size,
1906                 DMA_PTE_READ|DMA_PTE_WRITE);
1907         if (ret)
1908                 goto error;
1909
1910         /* context entry init */
1911         ret = domain_context_mapping(domain, pdev);
1912         if (!ret)
1913                 return 0;
1914 error:
1915         domain_exit(domain);
1916         return ret;
1917
1918 }
1919
1920 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1921         struct pci_dev *pdev)
1922 {
1923         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1924                 return 0;
1925         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1926                 rmrr->end_address + 1);
1927 }
1928
1929 #ifdef CONFIG_DMAR_GFX_WA
1930 struct iommu_prepare_data {
1931         struct pci_dev *pdev;
1932         int ret;
1933 };
1934
1935 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1936                                          unsigned long end_pfn, void *datax)
1937 {
1938         struct iommu_prepare_data *data;
1939
1940         data = (struct iommu_prepare_data *)datax;
1941
1942         data->ret = iommu_prepare_identity_map(data->pdev,
1943                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1944         return data->ret;
1945
1946 }
1947
1948 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1949 {
1950         int nid;
1951         struct iommu_prepare_data data;
1952
1953         data.pdev = pdev;
1954         data.ret = 0;
1955
1956         for_each_online_node(nid) {
1957                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1958                 if (data.ret)
1959                         return data.ret;
1960         }
1961         return data.ret;
1962 }
1963
1964 static void __init iommu_prepare_gfx_mapping(void)
1965 {
1966         struct pci_dev *pdev = NULL;
1967         int ret;
1968
1969         for_each_pci_dev(pdev) {
1970                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1971                                 !IS_GFX_DEVICE(pdev))
1972                         continue;
1973                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1974                         pci_name(pdev));
1975                 ret = iommu_prepare_with_active_regions(pdev);
1976                 if (ret)
1977                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1978         }
1979 }
1980 #else /* !CONFIG_DMAR_GFX_WA */
1981 static inline void iommu_prepare_gfx_mapping(void)
1982 {
1983         return;
1984 }
1985 #endif
1986
1987 #ifdef CONFIG_DMAR_FLOPPY_WA
1988 static inline void iommu_prepare_isa(void)
1989 {
1990         struct pci_dev *pdev;
1991         int ret;
1992
1993         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1994         if (!pdev)
1995                 return;
1996
1997         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1998         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1999
2000         if (ret)
2001                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map; "
2002                         "floppy might not work\n");
2003
2004 }
2005 #else
2006 static inline void iommu_prepare_isa(void)
2007 {
2008         return;
2009 }
2010 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2011
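/*
 * One-time initialization of all DMAR units: allocate the global iommu and
 * deferred-flush arrays, set up root/context tables and domains per iommu,
 * choose queued vs. register-based invalidation, map RMRRs and the
 * graphics/ISA workaround regions, then enable translation.
 */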
2012 static int __init init_dmars(void)
2013 {
2014         struct dmar_drhd_unit *drhd;
2015         struct dmar_rmrr_unit *rmrr;
2016         struct pci_dev *pdev;
2017         struct intel_iommu *iommu;
2018         int i, ret, unit = 0;
2019
2020         /*
2021          * for each drhd
2022          *    allocate root
2023          *    initialize and program root entry to not present
2024          * endfor
2025          */
2026         for_each_drhd_unit(drhd) {
2027                 g_num_of_iommus++;
2028                 /*
2029                  * No lock needed: this is only incremented in the
2030                  * single-threaded kernel __init code path; all other
2031                  * accesses are read-only.
2032                  */
2033         }
2034
2035         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2036                         GFP_KERNEL);
2037         if (!g_iommus) {
2038                 printk(KERN_ERR "Allocating global iommu array failed\n");
2039                 ret = -ENOMEM;
2040                 goto error;
2041         }
2042
2043         deferred_flush = kzalloc(g_num_of_iommus *
2044                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2045         if (!deferred_flush) {
2046                 kfree(g_iommus);
2047                 ret = -ENOMEM;
2048                 goto error;
2049         }
2050
2051         for_each_drhd_unit(drhd) {
2052                 if (drhd->ignored)
2053                         continue;
2054
2055                 iommu = drhd->iommu;
2056                 g_iommus[iommu->seq_id] = iommu;
2057
2058                 ret = iommu_init_domains(iommu);
2059                 if (ret)
2060                         goto error;
2061
2062                 /*
2063                  * TBD:
2064                  * we could share the same root & context tables
2065                  * among all IOMMUs; needs to be split out later.
2066                  */
2067                 ret = iommu_alloc_root_entry(iommu);
2068                 if (ret) {
2069                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2070                         goto error;
2071                 }
2072         }
2073
2074         for_each_drhd_unit(drhd) {
2075                 if (drhd->ignored)
2076                         continue;
2077
2078                 iommu = drhd->iommu;
2079                 if (dmar_enable_qi(iommu)) {
2080                         /*
2081                          * Queued Invalidate not enabled, use Register Based
2082                          * Invalidate
2083                          */
2084                         iommu->flush.flush_context = __iommu_flush_context;
2085                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2086                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2087                                "invalidation\n",
2088                                (unsigned long long)drhd->reg_base_addr);
2089                 } else {
2090                         iommu->flush.flush_context = qi_flush_context;
2091                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2092                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2093                                "invalidation\n",
2094                                (unsigned long long)drhd->reg_base_addr);
2095                 }
2096         }
2097
2098         /*
2099          * For each rmrr
2100          *   for each dev attached to rmrr
2101          *   do
2102          *     locate drhd for dev, alloc domain for dev
2103          *     allocate free domain
2104          *     allocate page table entries for rmrr
2105          *     if context not allocated for bus
2106          *           allocate and init context
2107          *           set present in root table for this bus
2108          *     init context with domain, translation etc
2109          *   endfor
2110          * endfor
2111          */
2112         for_each_rmrr_units(rmrr) {
2113                 for (i = 0; i < rmrr->devices_cnt; i++) {
2114                         pdev = rmrr->devices[i];
2115                         /* some BIOSes list non-existent devices in the DMAR table */
2116                         if (!pdev)
2117                                 continue;
2118                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2119                         if (ret)
2120                                 printk(KERN_ERR
2121                                  "IOMMU: mapping reserved region failed\n");
2122                 }
2123         }
2124
2125         iommu_prepare_gfx_mapping();
2126
2127         iommu_prepare_isa();
2128
2129         /*
2130          * for each drhd
2131          *   enable fault log
2132          *   global invalidate context cache
2133          *   global invalidate iotlb
2134          *   enable translation
2135          */
2136         for_each_drhd_unit(drhd) {
2137                 if (drhd->ignored)
2138                         continue;
2139                 iommu = drhd->iommu;
2140                 sprintf(iommu->name, "dmar%d", unit++);
2141
2142                 iommu_flush_write_buffer(iommu);
2143
2144                 ret = dmar_set_interrupt(iommu);
2145                 if (ret)
2146                         goto error;
2147
2148                 iommu_set_root_entry(iommu);
2149
2150                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2151                                            0);
2152                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2153                                          0);
2154                 iommu_disable_protect_mem_regions(iommu);
2155
2156                 ret = iommu_enable_translation(iommu);
2157                 if (ret)
2158                         goto error;
2159         }
2160
2161         return 0;
2162 error:
2163         for_each_drhd_unit(drhd) {
2164                 if (drhd->ignored)
2165                         continue;
2166                 iommu = drhd->iommu;
2167                 free_iommu(iommu);
2168         }
2169         kfree(g_iommus);
2170         return ret;
2171 }
2172
2173 static inline u64 aligned_size(u64 host_addr, size_t size)
2174 {
2175         u64 addr;
2176         addr = (host_addr & (~PAGE_MASK)) + size;
2177         return PAGE_ALIGN(addr);
2178 }
2179
2180 struct iova *
2181 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2182 {
2183         struct iova *piova;
2184
2185         /* Make sure it's in range */
2186         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2187         if (!size || (IOVA_START_ADDR + size > end))
2188                 return NULL;
2189
2190         piova = alloc_iova(&domain->iovad,
2191                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2192         return piova;
2193 }
2194
2195 static struct iova *
2196 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2197                    size_t size, u64 dma_mask)
2198 {
2199         struct pci_dev *pdev = to_pci_dev(dev);
2200         struct iova *iova = NULL;
2201
2202         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2203                 iova = iommu_alloc_iova(domain, size, dma_mask);
2204         else {
2205                 /*
2206                  * First try to allocate an I/O virtual address below
2207                  * DMA_32BIT_MASK; if that fails, fall back to the
2208                  * higher range.
2209                  */
2210                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2211                 if (!iova)
2212                         iova = iommu_alloc_iova(domain, size, dma_mask);
2213         }
2214
2215         if (!iova) {
2216                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2217                 return NULL;
2218         }
2219
2220         return iova;
2221 }
2222
2223 static struct dmar_domain *
2224 get_valid_domain_for_dev(struct pci_dev *pdev)
2225 {
2226         struct dmar_domain *domain;
2227         int ret;
2228
2229         domain = get_domain_for_dev(pdev,
2230                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2231         if (!domain) {
2232                 printk(KERN_ERR
2233                         "Allocating domain for %s failed\n", pci_name(pdev));
2234                 return NULL;
2235         }
2236
2237         /* make sure context mapping is ok */
2238         if (unlikely(!domain_context_mapped(pdev))) {
2239                 ret = domain_context_mapping(domain, pdev);
2240                 if (ret) {
2241                         printk(KERN_ERR
2242                                 "Domain context map for %s failed\n",
2243                                 pci_name(pdev));
2244                         return NULL;
2245                 }
2246         }
2247
2248         return domain;
2249 }
2250
2251 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2252                                      size_t size, int dir, u64 dma_mask)
2253 {
2254         struct pci_dev *pdev = to_pci_dev(hwdev);
2255         struct dmar_domain *domain;
2256         phys_addr_t start_paddr;
2257         struct iova *iova;
2258         int prot = 0;
2259         int ret;
2260         struct intel_iommu *iommu;
2261
2262         BUG_ON(dir == DMA_NONE);
2263         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2264                 return paddr;
2265
2266         domain = get_valid_domain_for_dev(pdev);
2267         if (!domain)
2268                 return 0;
2269
2270         iommu = domain_get_iommu(domain);
2271         size = aligned_size((u64)paddr, size);
2272
2273         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2274         if (!iova)
2275                 goto error;
2276
2277         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2278
2279         /*
2280          * Check if DMAR supports zero-length reads on write-only
2281          * mappings.
2282          */
2283         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2284                         !cap_zlr(iommu->cap))
2285                 prot |= DMA_PTE_READ;
2286         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2287                 prot |= DMA_PTE_WRITE;
2288         /*
2289          * paddr .. paddr + size might cover only part of a page, but we must
2290          * map the whole page.  Note: if two parts of one page are mapped
2291          * separately, we may end up with two guest addresses mapping to the
2292          * same host paddr; this is not a big problem.
2293          */
2294         ret = domain_page_mapping(domain, start_paddr,
2295                 ((u64)paddr) & PAGE_MASK, size, prot);
2296         if (ret)
2297                 goto error;
2298
2299         /* it's a non-present to present mapping */
2300         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2301                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2302         if (ret)
2303                 iommu_flush_write_buffer(iommu);
2304
2305         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2306
2307 error:
2308         if (iova)
2309                 __free_iova(&domain->iovad, iova);
2310         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2311                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2312         return 0;
2313 }
2314
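/*
 * intel_map_single() is the map_single hook installed via intel_dma_ops
 * below.  Illustrative example (assumption: the usual dma_map_single()
 * wrapper dispatches here when these ops are active):
 *
 *      dma_addr_t dma = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 */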
2315 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2316                             size_t size, int dir)
2317 {
2318         return __intel_map_single(hwdev, paddr, size, dir,
2319                                   to_pci_dev(hwdev)->dma_mask);
2320 }
2321
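/*
 * Deferred-unmap batching: instead of flushing the IOTLB for every unmap,
 * add_unmap() queues released iovas per iommu and flush_unmaps() later does
 * one global IOTLB flush per iommu (from the timer, or once HIGH_WATER_MARK
 * entries are queued) before freeing the queued iovas.
 */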
2322 static void flush_unmaps(void)
2323 {
2324         int i, j;
2325
2326         timer_on = 0;
2327
2328         /* just flush them all */
2329         for (i = 0; i < g_num_of_iommus; i++) {
2330                 struct intel_iommu *iommu = g_iommus[i];
2331                 if (!iommu)
2332                         continue;
2333
2334                 if (deferred_flush[i].next) {
2335                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2336                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2337                         for (j = 0; j < deferred_flush[i].next; j++) {
2338                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2339                                                 deferred_flush[i].iova[j]);
2340                         }
2341                         deferred_flush[i].next = 0;
2342                 }
2343         }
2344
2345         list_size = 0;
2346 }
2347
2348 static void flush_unmaps_timeout(unsigned long data)
2349 {
2350         unsigned long flags;
2351
2352         spin_lock_irqsave(&async_umap_flush_lock, flags);
2353         flush_unmaps();
2354         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2355 }
2356
2357 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2358 {
2359         unsigned long flags;
2360         int next, iommu_id;
2361         struct intel_iommu *iommu;
2362
2363         spin_lock_irqsave(&async_umap_flush_lock, flags);
2364         if (list_size == HIGH_WATER_MARK)
2365                 flush_unmaps();
2366
2367         iommu = domain_get_iommu(dom);
2368         iommu_id = iommu->seq_id;
2369
2370         next = deferred_flush[iommu_id].next;
2371         deferred_flush[iommu_id].domain[next] = dom;
2372         deferred_flush[iommu_id].iova[next] = iova;
2373         deferred_flush[iommu_id].next++;
2374
2375         if (!timer_on) {
2376                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2377                 timer_on = 1;
2378         }
2379         list_size++;
2380         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2381 }
2382
2383 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2384                         int dir)
2385 {
2386         struct pci_dev *pdev = to_pci_dev(dev);
2387         struct dmar_domain *domain;
2388         unsigned long start_addr;
2389         struct iova *iova;
2390         struct intel_iommu *iommu;
2391
2392         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2393                 return;
2394         domain = find_domain(pdev);
2395         BUG_ON(!domain);
2396
2397         iommu = domain_get_iommu(domain);
2398
2399         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2400         if (!iova)
2401                 return;
2402
2403         start_addr = iova->pfn_lo << PAGE_SHIFT;
2404         size = aligned_size((u64)dev_addr, size);
2405
2406         pr_debug("Device %s unmapping: %lx@%llx\n",
2407                 pci_name(pdev), size, (unsigned long long)start_addr);
2408
2409         /*  clear the whole page */
2410         dma_pte_clear_range(domain, start_addr, start_addr + size);
2411         /* free page tables */
2412         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2413         if (intel_iommu_strict) {
2414                 if (iommu_flush_iotlb_psi(iommu,
2415                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2416                         iommu_flush_write_buffer(iommu);
2417                 /* free iova */
2418                 __free_iova(&domain->iovad, iova);
2419         } else {
2420                 add_unmap(domain, iova);
2421                 /*
2422                  * Queue up the release of the unmap to save the roughly 1/6 of
2423                  * the CPU time otherwise spent on a synchronous iotlb flush.
2424                  */
2425         }
2426 }
2427
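/*
 * Coherent allocations simply grab zeroed pages and map them bidirectionally
 * through __intel_map_single() using the device's coherent DMA mask.
 */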
2428 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2429                            dma_addr_t *dma_handle, gfp_t flags)
2430 {
2431         void *vaddr;
2432         int order;
2433
2434         size = PAGE_ALIGN(size);
2435         order = get_order(size);
2436         flags &= ~(GFP_DMA | GFP_DMA32);
2437
2438         vaddr = (void *)__get_free_pages(flags, order);
2439         if (!vaddr)
2440                 return NULL;
2441         memset(vaddr, 0, size);
2442
2443         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2444                                          DMA_BIDIRECTIONAL,
2445                                          hwdev->coherent_dma_mask);
2446         if (*dma_handle)
2447                 return vaddr;
2448         free_pages((unsigned long)vaddr, order);
2449         return NULL;
2450 }
2451
2452 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2453                          dma_addr_t dma_handle)
2454 {
2455         int order;
2456
2457         size = PAGE_ALIGN(size);
2458         order = get_order(size);
2459
2460         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2461         free_pages((unsigned long)vaddr, order);
2462 }
2463
2464 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2465
2466 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2467                     int nelems, int dir)
2468 {
2469         int i;
2470         struct pci_dev *pdev = to_pci_dev(hwdev);
2471         struct dmar_domain *domain;
2472         unsigned long start_addr;
2473         struct iova *iova;
2474         size_t size = 0;
2475         void *addr;
2476         struct scatterlist *sg;
2477         struct intel_iommu *iommu;
2478
2479         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2480                 return;
2481
2482         domain = find_domain(pdev);
2483         BUG_ON(!domain);
2484
2485         iommu = domain_get_iommu(domain);
2486
2487         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2488         if (!iova)
2489                 return;
2490         for_each_sg(sglist, sg, nelems, i) {
2491                 addr = SG_ENT_VIRT_ADDRESS(sg);
2492                 size += aligned_size((u64)addr, sg->length);
2493         }
2494
2495         start_addr = iova->pfn_lo << PAGE_SHIFT;
2496
2497         /*  clear the whole page */
2498         dma_pte_clear_range(domain, start_addr, start_addr + size);
2499         /* free page tables */
2500         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2501
2502         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2503                         size >> VTD_PAGE_SHIFT, 0))
2504                 iommu_flush_write_buffer(iommu);
2505
2506         /* free iova */
2507         __free_iova(&domain->iovad, iova);
2508 }
2509
2510 static int intel_nontranslate_map_sg(struct device *hddev,
2511         struct scatterlist *sglist, int nelems, int dir)
2512 {
2513         int i;
2514         struct scatterlist *sg;
2515
2516         for_each_sg(sglist, sg, nelems, i) {
2517                 BUG_ON(!sg_page(sg));
2518                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2519                 sg->dma_length = sg->length;
2520         }
2521         return nelems;
2522 }
2523
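/*
 * Map a scatterlist: the total (page-aligned) length is computed first, one
 * contiguous iova range is allocated for it, and each segment is then mapped
 * back to back inside that range, so the device sees a single linear region.
 */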
2524 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2525                  int dir)
2526 {
2527         void *addr;
2528         int i;
2529         struct pci_dev *pdev = to_pci_dev(hwdev);
2530         struct dmar_domain *domain;
2531         size_t size = 0;
2532         int prot = 0;
2533         size_t offset = 0;
2534         struct iova *iova = NULL;
2535         int ret;
2536         struct scatterlist *sg;
2537         unsigned long start_addr;
2538         struct intel_iommu *iommu;
2539
2540         BUG_ON(dir == DMA_NONE);
2541         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2542                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2543
2544         domain = get_valid_domain_for_dev(pdev);
2545         if (!domain)
2546                 return 0;
2547
2548         iommu = domain_get_iommu(domain);
2549
2550         for_each_sg(sglist, sg, nelems, i) {
2551                 addr = SG_ENT_VIRT_ADDRESS(sg);
2552                 addr = (void *)virt_to_phys(addr);
2553                 size += aligned_size((u64)addr, sg->length);
2554         }
2555
2556         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2557         if (!iova) {
2558                 sglist->dma_length = 0;
2559                 return 0;
2560         }
2561
2562         /*
2563          * Check if DMAR supports zero-length reads on write-only
2564          * mappings.
2565          */
2566         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2567                         !cap_zlr(iommu->cap))
2568                 prot |= DMA_PTE_READ;
2569         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2570                 prot |= DMA_PTE_WRITE;
2571
2572         start_addr = iova->pfn_lo << PAGE_SHIFT;
2573         offset = 0;
2574         for_each_sg(sglist, sg, nelems, i) {
2575                 addr = SG_ENT_VIRT_ADDRESS(sg);
2576                 addr = (void *)virt_to_phys(addr);
2577                 size = aligned_size((u64)addr, sg->length);
2578                 ret = domain_page_mapping(domain, start_addr + offset,
2579                         ((u64)addr) & PAGE_MASK,
2580                         size, prot);
2581                 if (ret) {
2582                         /*  clear the page */
2583                         dma_pte_clear_range(domain, start_addr,
2584                                   start_addr + offset);
2585                         /* free page tables */
2586                         dma_pte_free_pagetable(domain, start_addr,
2587                                   start_addr + offset);
2588                         /* free iova */
2589                         __free_iova(&domain->iovad, iova);
2590                         return 0;
2591                 }
2592                 sg->dma_address = start_addr + offset +
2593                                 ((u64)addr & (~PAGE_MASK));
2594                 sg->dma_length = sg->length;
2595                 offset += size;
2596         }
2597
2598         /* it's a non-present to present mapping */
2599         if (iommu_flush_iotlb_psi(iommu, domain->id,
2600                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2601                 iommu_flush_write_buffer(iommu);
2602         return nelems;
2603 }
2604
2605 static struct dma_mapping_ops intel_dma_ops = {
2606         .alloc_coherent = intel_alloc_coherent,
2607         .free_coherent = intel_free_coherent,
2608         .map_single = intel_map_single,
2609         .unmap_single = intel_unmap_single,
2610         .map_sg = intel_map_sg,
2611         .unmap_sg = intel_unmap_sg,
2612 };
2613
2614 static inline int iommu_domain_cache_init(void)
2615 {
2616         int ret = 0;
2617
2618         iommu_domain_cache = kmem_cache_create("iommu_domain",
2619                                          sizeof(struct dmar_domain),
2620                                          0,
2621                                          SLAB_HWCACHE_ALIGN,
2622                                          NULL);
2623
2624         if (!iommu_domain_cache) {
2625                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2626                 ret = -ENOMEM;
2627         }
2628
2629         return ret;
2630 }
2631
2632 static inline int iommu_devinfo_cache_init(void)
2633 {
2634         int ret = 0;
2635
2636         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2637                                          sizeof(struct device_domain_info),
2638                                          0,
2639                                          SLAB_HWCACHE_ALIGN,
2640                                          NULL);
2641         if (!iommu_devinfo_cache) {
2642                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2643                 ret = -ENOMEM;
2644         }
2645
2646         return ret;
2647 }
2648
2649 static inline int iommu_iova_cache_init(void)
2650 {
2651         int ret = 0;
2652
2653         iommu_iova_cache = kmem_cache_create("iommu_iova",
2654                                          sizeof(struct iova),
2655                                          0,
2656                                          SLAB_HWCACHE_ALIGN,
2657                                          NULL);
2658         if (!iommu_iova_cache) {
2659                 printk(KERN_ERR "Couldn't create iova cache\n");
2660                 ret = -ENOMEM;
2661         }
2662
2663         return ret;
2664 }
2665
2666 static int __init iommu_init_mempool(void)
2667 {
2668         int ret;
2669         ret = iommu_iova_cache_init();
2670         if (ret)
2671                 return ret;
2672
2673         ret = iommu_domain_cache_init();
2674         if (ret)
2675                 goto domain_error;
2676
2677         ret = iommu_devinfo_cache_init();
2678         if (!ret)
2679                 return ret;
2680
2681         kmem_cache_destroy(iommu_domain_cache);
2682 domain_error:
2683         kmem_cache_destroy(iommu_iova_cache);
2684
2685         return -ENOMEM;
2686 }
2687
2688 static void __init iommu_exit_mempool(void)
2689 {
2690         kmem_cache_destroy(iommu_devinfo_cache);
2691         kmem_cache_destroy(iommu_domain_cache);
2692         kmem_cache_destroy(iommu_iova_cache);
2693
2694 }
2695
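/*
 * Mark DRHD units that cover no PCI devices as ignored and, unless
 * dmar_map_gfx is set, bypass units that serve only graphics devices by
 * tagging their devices with DUMMY_DEVICE_DOMAIN_INFO.
 */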
2696 static void __init init_no_remapping_devices(void)
2697 {
2698         struct dmar_drhd_unit *drhd;
2699
2700         for_each_drhd_unit(drhd) {
2701                 if (!drhd->include_all) {
2702                         int i;
2703                         for (i = 0; i < drhd->devices_cnt; i++)
2704                                 if (drhd->devices[i] != NULL)
2705                                         break;
2706                         /* ignore DMAR unit if no pci devices exist */
2707                         if (i == drhd->devices_cnt)
2708                                 drhd->ignored = 1;
2709                 }
2710         }
2711
2712         if (dmar_map_gfx)
2713                 return;
2714
2715         for_each_drhd_unit(drhd) {
2716                 int i;
2717                 if (drhd->ignored || drhd->include_all)
2718                         continue;
2719
2720                 for (i = 0; i < drhd->devices_cnt; i++)
2721                         if (drhd->devices[i] &&
2722                                 !IS_GFX_DEVICE(drhd->devices[i]))
2723                                 break;
2724
2725                 if (i < drhd->devices_cnt)
2726                         continue;
2727
2728                 /* bypass IOMMU if it is just for gfx devices */
2729                 drhd->ignored = 1;
2730                 for (i = 0; i < drhd->devices_cnt; i++) {
2731                         if (!drhd->devices[i])
2732                                 continue;
2733                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2734                 }
2735         }
2736 }
2737
2738 int __init intel_iommu_init(void)
2739 {
2740         int ret = 0;
2741
2742         if (dmar_table_init())
2743                 return  -ENODEV;
2744
2745         if (dmar_dev_scope_init())
2746                 return  -ENODEV;
2747
2748         /*
2749          * Check the need for DMA-remapping initialization now.
2750          * The initialization above will also be used by interrupt remapping.
2751          */
2752         if (no_iommu || swiotlb || dmar_disabled)
2753                 return -ENODEV;
2754
2755         iommu_init_mempool();
2756         dmar_init_reserved_ranges();
2757
2758         init_no_remapping_devices();
2759
2760         ret = init_dmars();
2761         if (ret) {
2762                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2763                 put_iova_domain(&reserved_iova_list);
2764                 iommu_exit_mempool();
2765                 return ret;
2766         }
2767         printk(KERN_INFO
2768         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2769
2770         init_timer(&unmap_timer);
2771         force_iommu = 1;
2772         dma_ops = &intel_dma_ops;
2773
2774         register_iommu(&intel_iommu_ops);
2775
2776         return 0;
2777 }
2778
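/*
 * The functions below handle "virtual machine" domains (flag
 * DOMAIN_FLAG_VIRTUAL_MACHINE): domains created through the generic iommu
 * API rather than by the native DMA-mapping path; they may span several
 * IOMMUs.
 */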
2779 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2780                                   struct pci_dev *pdev)
2781 {
2782         struct device_domain_info *info;
2783         unsigned long flags;
2784
2785         info = alloc_devinfo_mem();
2786         if (!info)
2787                 return -ENOMEM;
2788
2789         info->bus = pdev->bus->number;
2790         info->devfn = pdev->devfn;
2791         info->dev = pdev;
2792         info->domain = domain;
2793
2794         spin_lock_irqsave(&device_domain_lock, flags);
2795         list_add(&info->link, &domain->devices);
2796         list_add(&info->global, &device_domain_list);
2797         pdev->dev.archdata.iommu = info;
2798         spin_unlock_irqrestore(&device_domain_lock, flags);
2799
2800         return 0;
2801 }
2802
2803 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2804                                           struct pci_dev *pdev)
2805 {
2806         struct device_domain_info *info;
2807         struct intel_iommu *iommu;
2808         unsigned long flags;
2809         int found = 0;
2810         struct list_head *entry, *tmp;
2811
2812         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2813         if (!iommu)
2814                 return;
2815
2816         spin_lock_irqsave(&device_domain_lock, flags);
2817         list_for_each_safe(entry, tmp, &domain->devices) {
2818                 info = list_entry(entry, struct device_domain_info, link);
2819                 if (info->bus == pdev->bus->number &&
2820                     info->devfn == pdev->devfn) {
2821                         list_del(&info->link);
2822                         list_del(&info->global);
2823                         if (info->dev)
2824                                 info->dev->dev.archdata.iommu = NULL;
2825                         spin_unlock_irqrestore(&device_domain_lock, flags);
2826
2827                         iommu_detach_dev(iommu, info->bus, info->devfn);
2828                         free_devinfo_mem(info);
2829
2830                         spin_lock_irqsave(&device_domain_lock, flags);
2831
2832                         if (found)
2833                                 break;
2834                         else
2835                                 continue;
2836                 }
2837
2838                 /* if there are no other devices under the same iommu
2839                  * owned by this domain, clear this iommu in iommu_bmp and
2840                  * update the iommu count and coherency
2841                  */
2842                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2843                         found = 1;
2844         }
2845
2846         if (found == 0) {
2847                 unsigned long tmp_flags;
2848                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2849                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2850                 domain->iommu_count--;
2851                 domain_update_iommu_cap(domain);
2852                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2853         }
2854
2855         spin_unlock_irqrestore(&device_domain_lock, flags);
2856 }
2857
2858 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2859 {
2860         struct device_domain_info *info;
2861         struct intel_iommu *iommu;
2862         unsigned long flags1, flags2;
2863
2864         spin_lock_irqsave(&device_domain_lock, flags1);
2865         while (!list_empty(&domain->devices)) {
2866                 info = list_entry(domain->devices.next,
2867                         struct device_domain_info, link);
2868                 list_del(&info->link);
2869                 list_del(&info->global);
2870                 if (info->dev)
2871                         info->dev->dev.archdata.iommu = NULL;
2872
2873                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2874
2875                 iommu = device_to_iommu(info->bus, info->devfn);
2876                 iommu_detach_dev(iommu, info->bus, info->devfn);
2877
2878                 /* clear this iommu in iommu_bmp, update iommu count
2879                  * and capabilities
2880                  */
2881                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2882                 if (test_and_clear_bit(iommu->seq_id,
2883                                        &domain->iommu_bmp)) {
2884                         domain->iommu_count--;
2885                         domain_update_iommu_cap(domain);
2886                 }
2887                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2888
2889                 free_devinfo_mem(info);
2890                 spin_lock_irqsave(&device_domain_lock, flags1);
2891         }
2892         spin_unlock_irqrestore(&device_domain_lock, flags1);
2893 }
2894
2895 /* domain id for virtual machine domains; it is never set in a context entry */
2896 static unsigned long vm_domid;
2897
2898 static int vm_domain_min_agaw(struct dmar_domain *domain)
2899 {
2900         int i;
2901         int min_agaw = domain->agaw;
2902
2903         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2904         for (; i < g_num_of_iommus; ) {
2905                 if (min_agaw > g_iommus[i]->agaw)
2906                         min_agaw = g_iommus[i]->agaw;
2907
2908                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2909         }
2910
2911         return min_agaw;
2912 }
2913
2914 static struct dmar_domain *iommu_alloc_vm_domain(void)
2915 {
2916         struct dmar_domain *domain;
2917
2918         domain = alloc_domain_mem();
2919         if (!domain)
2920                 return NULL;
2921
2922         domain->id = vm_domid++;
2923         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2924         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2925
2926         return domain;
2927 }
2928
2929 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2930 {
2931         int adjust_width;
2932
2933         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2934         spin_lock_init(&domain->mapping_lock);
2935         spin_lock_init(&domain->iommu_lock);
2936
2937         domain_reserve_special_ranges(domain);
2938
2939         /* calculate AGAW */
2940         domain->gaw = guest_width;
2941         adjust_width = guestwidth_to_adjustwidth(guest_width);
2942         domain->agaw = width_to_agaw(adjust_width);
2943
2944         INIT_LIST_HEAD(&domain->devices);
2945
2946         domain->iommu_count = 0;
2947         domain->iommu_coherency = 0;
2948         domain->max_addr = 0;
2949
2950         /* always allocate the top pgd */
2951         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2952         if (!domain->pgd)
2953                 return -ENOMEM;
2954         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2955         return 0;
2956 }
2957
2958 static void iommu_free_vm_domain(struct dmar_domain *domain)
2959 {
2960         unsigned long flags;
2961         struct dmar_drhd_unit *drhd;
2962         struct intel_iommu *iommu;
2963         unsigned long i;
2964         unsigned long ndomains;
2965
2966         for_each_drhd_unit(drhd) {
2967                 if (drhd->ignored)
2968                         continue;
2969                 iommu = drhd->iommu;
2970
2971                 ndomains = cap_ndoms(iommu->cap);
2972                 i = find_first_bit(iommu->domain_ids, ndomains);
2973                 for (; i < ndomains; ) {
2974                         if (iommu->domains[i] == domain) {
2975                                 spin_lock_irqsave(&iommu->lock, flags);
2976                                 clear_bit(i, iommu->domain_ids);
2977                                 iommu->domains[i] = NULL;
2978                                 spin_unlock_irqrestore(&iommu->lock, flags);
2979                                 break;
2980                         }
2981                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2982                 }
2983         }
2984 }
2985
2986 static void vm_domain_exit(struct dmar_domain *domain)
2987 {
2988         u64 end;
2989
2990         /* Domain 0 is reserved, so don't process it */
2991         if (!domain)
2992                 return;
2993
2994         vm_domain_remove_all_dev_info(domain);
2995         /* destroy iovas */
2996         put_iova_domain(&domain->iovad);
2997         end = DOMAIN_MAX_ADDR(domain->gaw);
2998         end = end & (~VTD_PAGE_MASK);
2999
3000         /* clear ptes */
3001         dma_pte_clear_range(domain, 0, end);
3002
3003         /* free page tables */
3004         dma_pte_free_pagetable(domain, 0, end);
3005
3006         iommu_free_vm_domain(domain);
3007         free_domain_mem(domain);
3008 }
3009
3010 static int intel_iommu_domain_init(struct iommu_domain *domain)
3011 {
3012         struct dmar_domain *dmar_domain;
3013
3014         dmar_domain = iommu_alloc_vm_domain();
3015         if (!dmar_domain) {
3016                 printk(KERN_ERR
3017                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3018                 return -ENOMEM;
3019         }
3020         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3021                 printk(KERN_ERR
3022                         "intel_iommu_domain_init() failed\n");
3023                 vm_domain_exit(dmar_domain);
3024                 return -ENOMEM;
3025         }
3026         domain->priv = dmar_domain;
3027
3028         return 0;
3029 }
3030
3031 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3032 {
3033         struct dmar_domain *dmar_domain = domain->priv;
3034
3035         domain->priv = NULL;
3036         vm_domain_exit(dmar_domain);
3037 }
3038
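/*
 * Attach a PCI device to an iommu-API domain.  Any existing mapping of the
 * device (e.g. from the native DMA path) is torn down first, the iommu's
 * address width is checked against the domain's current max_addr, and the
 * device is then context-mapped into the VM domain.
 */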
3039 static int intel_iommu_attach_device(struct iommu_domain *domain,
3040                                      struct device *dev)
3041 {
3042         struct dmar_domain *dmar_domain = domain->priv;
3043         struct pci_dev *pdev = to_pci_dev(dev);
3044         struct intel_iommu *iommu;
3045         int addr_width;
3046         u64 end;
3047         int ret;
3048
3049         /* normally pdev is not mapped */
3050         if (unlikely(domain_context_mapped(pdev))) {
3051                 struct dmar_domain *old_domain;
3052
3053                 old_domain = find_domain(pdev);
3054                 if (old_domain) {
3055                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3056                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3057                         else
3058                                 domain_remove_dev_info(old_domain);
3059                 }
3060         }
3061
3062         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3063         if (!iommu)
3064                 return -ENODEV;
3065
3066         /* check if this iommu agaw is sufficient for max mapped address */
3067         addr_width = agaw_to_width(iommu->agaw);
3068         end = DOMAIN_MAX_ADDR(addr_width);
3069         end = end & VTD_PAGE_MASK;
3070         if (end < dmar_domain->max_addr) {
3071                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3072                        "sufficient for the mapped address (%llx)\n",
3073                        __func__, iommu->agaw, dmar_domain->max_addr);
3074                 return -EFAULT;
3075         }
3076
3077         ret = domain_context_mapping(dmar_domain, pdev);
3078         if (ret)
3079                 return ret;
3080
3081         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3082         return ret;
3083 }
3084
3085 static void intel_iommu_detach_device(struct iommu_domain *domain,
3086                                       struct device *dev)
3087 {
3088         struct dmar_domain *dmar_domain = domain->priv;
3089         struct pci_dev *pdev = to_pci_dev(dev);
3090
3091         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3092 }
3093
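/*
 * iommu-API map callback.  Illustrative use through the generic layer
 * (assumption: the wrapper in this tree is iommu_map_range()):
 *
 *      iommu_map_range(domain, iova, hpa, PAGE_SIZE,
 *                      IOMMU_READ | IOMMU_WRITE);
 */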
3094 static int intel_iommu_map_range(struct iommu_domain *domain,
3095                                  unsigned long iova, phys_addr_t hpa,
3096                                  size_t size, int iommu_prot)
3097 {
3098         struct dmar_domain *dmar_domain = domain->priv;
3099         u64 max_addr;
3100         int addr_width;
3101         int prot = 0;
3102         int ret;
3103
3104         if (iommu_prot & IOMMU_READ)
3105                 prot |= DMA_PTE_READ;
3106         if (iommu_prot & IOMMU_WRITE)
3107                 prot |= DMA_PTE_WRITE;
3108
3109         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3110         if (dmar_domain->max_addr < max_addr) {
3111                 int min_agaw;
3112                 u64 end;
3113
3114                 /* check if minimum agaw is sufficient for mapped address */
3115                 min_agaw = vm_domain_min_agaw(dmar_domain);
3116                 addr_width = agaw_to_width(min_agaw);
3117                 end = DOMAIN_MAX_ADDR(addr_width);
3118                 end = end & VTD_PAGE_MASK;
3119                 if (end < max_addr) {
3120                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3121                                "sufficient for the mapped address (%llx)\n",
3122                                __func__, min_agaw, max_addr);
3123                         return -EFAULT;
3124                 }
3125                 dmar_domain->max_addr = max_addr;
3126         }
3127
3128         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3129         return ret;
3130 }
3131
3132 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3133                                     unsigned long iova, size_t size)
3134 {
3135         struct dmar_domain *dmar_domain = domain->priv;
3136         dma_addr_t base;
3137
3138         /* The address might not be aligned */
3139         base = iova & VTD_PAGE_MASK;
3140         size = VTD_PAGE_ALIGN(size);
3141         dma_pte_clear_range(dmar_domain, base, base + size);
3142
3143         if (dmar_domain->max_addr == base + size)
3144                 dmar_domain->max_addr = base;
3145 }
3146
3147 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3148                                             unsigned long iova)
3149 {
3150         struct dmar_domain *dmar_domain = domain->priv;
3151         struct dma_pte *pte;
3152         u64 phys = 0;
3153
3154         pte = addr_to_dma_pte(dmar_domain, iova);
3155         if (pte)
3156                 phys = dma_pte_addr(pte);
3157
3158         return phys;
3159 }
3160
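/*
 * Report whether the domain's IOMMUs can enforce cache coherency (snooping)
 * for DMA.  Illustrative caller-side check (assumption: the generic wrapper
 * is iommu_domain_has_cap()):
 *
 *      if (iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY))
 *              ...     no explicit cache flushing needed for device DMA
 */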
3161 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3162                                       unsigned long cap)
3163 {
3164         struct dmar_domain *dmar_domain = domain->priv;
3165
3166         if (cap == IOMMU_CAP_CACHE_COHERENCY)
3167                 return dmar_domain->iommu_snooping;
3168
3169         return 0;
3170 }
3171
3172 static struct iommu_ops intel_iommu_ops = {
3173         .domain_init    = intel_iommu_domain_init,
3174         .domain_destroy = intel_iommu_domain_destroy,
3175         .attach_dev     = intel_iommu_attach_device,
3176         .detach_dev     = intel_iommu_detach_device,
3177         .map            = intel_iommu_map_range,
3178         .unmap          = intel_iommu_unmap_range,
3179         .iova_to_phys   = intel_iommu_iova_to_phys,
3180         .domain_has_cap = intel_iommu_domain_has_cap,
3181 };
3182
3183 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3184 {
3185         /*
3186          * Mobile 4 Series Chipset neglects to set RWBF capability,
3187          * but needs it:
3188          */
3189         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3190         rwbf_quirk = 1;
3191 }
3192
3193 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);