[linux-2.6-omap-h63xx.git] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include "iova.h"
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu in this header on x86-64 */
39 #include <asm/cacheflush.h>
40 #include <asm/gart.h>
41 #include "pci.h"
42
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
45
46 #define IOAPIC_RANGE_START      (0xfee00000)
47 #define IOAPIC_RANGE_END        (0xfeefffff)
48 #define IOVA_START_ADDR         (0x1000)
49
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
51
52 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56
57 static void flush_unmaps_timeout(unsigned long data);
58
59 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
60
61 #define HIGH_WATER_MARK 250
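/*
 * Bookkeeping for the batched ("lazy") IOTLB flush: instead of flushing
 * on every unmap, unmapped IOVAs and their domains are queued here (one
 * table per IOMMU) and released in batches by flush_unmaps_timeout(),
 * or earlier once HIGH_WATER_MARK entries have accumulated.  Booting
 * with intel_iommu=strict disables this batching.
 */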
62 struct deferred_flush_tables {
63         int next;
64         struct iova *iova[HIGH_WATER_MARK];
65         struct dmar_domain *domain[HIGH_WATER_MARK];
66 };
67
68 static struct deferred_flush_tables *deferred_flush;
69
70 /* number of IOMMUs (DRHD units); used to size the deferred flush tables */
71 static int g_num_of_iommus;
72
73 static DEFINE_SPINLOCK(async_umap_flush_lock);
74 static LIST_HEAD(unmaps_to_do);
75
76 static int timer_on;
77 static long list_size;
78
79 static void domain_remove_dev_info(struct dmar_domain *domain);
80
81 static int dmar_disabled;
82 static int __initdata dmar_map_gfx = 1;
83 static int dmar_forcedac;
84 static int intel_iommu_strict;
85
86 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
87 static DEFINE_SPINLOCK(device_domain_lock);
88 static LIST_HEAD(device_domain_list);
89
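/*
 * Parse the "intel_iommu=" kernel command line option.  Recognized
 * (comma separated) keywords: "off" disables DMA remapping entirely,
 * "igfx_off" skips mapping of graphics devices, "forcedac" forces DAC
 * addressing for PCI devices, and "strict" disables the batched
 * (deferred) IOTLB flush.
 */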
90 static int __init intel_iommu_setup(char *str)
91 {
92         if (!str)
93                 return -EINVAL;
94         while (*str) {
95                 if (!strncmp(str, "off", 3)) {
96                         dmar_disabled = 1;
97                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
98                 } else if (!strncmp(str, "igfx_off", 8)) {
99                         dmar_map_gfx = 0;
100                         printk(KERN_INFO
101                                 "Intel-IOMMU: disable GFX device mapping\n");
102                 } else if (!strncmp(str, "forcedac", 8)) {
103                         printk(KERN_INFO
104                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
105                         dmar_forcedac = 1;
106                 } else if (!strncmp(str, "strict", 6)) {
107                         printk(KERN_INFO
108                                 "Intel-IOMMU: disable batched IOTLB flush\n");
109                         intel_iommu_strict = 1;
110                 }
111
112                 str += strcspn(str, ",");
113                 while (*str == ',')
114                         str++;
115         }
116         return 0;
117 }
118 __setup("intel_iommu=", intel_iommu_setup);
119
120 static struct kmem_cache *iommu_domain_cache;
121 static struct kmem_cache *iommu_devinfo_cache;
122 static struct kmem_cache *iommu_iova_cache;
123
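/*
 * Allocate from a kmem cache with PF_MEMALLOC temporarily set, so the
 * GFP_ATOMIC allocation may dip into the emergency reserves; the task's
 * original PF_MEMALLOC state is restored before returning.
 */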
124 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
125 {
126         unsigned int flags;
127         void *vaddr;
128
129         /* trying to avoid low memory issues */
130         flags = current->flags & PF_MEMALLOC;
131         current->flags |= PF_MEMALLOC;
132         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
133         current->flags &= (~PF_MEMALLOC | flags);
134         return vaddr;
135 }
136
137
138 static inline void *alloc_pgtable_page(void)
139 {
140         unsigned int flags;
141         void *vaddr;
142
143         /* trying to avoid low memory issues */
144         flags = current->flags & PF_MEMALLOC;
145         current->flags |= PF_MEMALLOC;
146         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
147         current->flags &= (~PF_MEMALLOC | flags);
148         return vaddr;
149 }
150
151 static inline void free_pgtable_page(void *vaddr)
152 {
153         free_page((unsigned long)vaddr);
154 }
155
156 static inline void *alloc_domain_mem(void)
157 {
158         return iommu_kmem_cache_alloc(iommu_domain_cache);
159 }
160
161 static inline void free_domain_mem(void *vaddr)
162 {
163         kmem_cache_free(iommu_domain_cache, vaddr);
164 }
165
166 static inline void * alloc_devinfo_mem(void)
167 {
168         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
169 }
170
171 static inline void free_devinfo_mem(void *vaddr)
172 {
173         kmem_cache_free(iommu_devinfo_cache, vaddr);
174 }
175
176 struct iova *alloc_iova_mem(void)
177 {
178         return iommu_kmem_cache_alloc(iommu_iova_cache);
179 }
180
181 void free_iova_mem(struct iova *iova)
182 {
183         kmem_cache_free(iommu_iova_cache, iova);
184 }
185
186 static inline void __iommu_flush_cache(
187         struct intel_iommu *iommu, void *addr, int size)
188 {
189         if (!ecap_coherent(iommu->ecap))
190                 clflush_cache_range(addr, size);
191 }
192
193 /* Gets context entry for a given bus and devfn */
194 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
195                 u8 bus, u8 devfn)
196 {
197         struct root_entry *root;
198         struct context_entry *context;
199         unsigned long phy_addr;
200         unsigned long flags;
201
202         spin_lock_irqsave(&iommu->lock, flags);
203         root = &iommu->root_entry[bus];
204         context = get_context_addr_from_root(root);
205         if (!context) {
206                 context = (struct context_entry *)alloc_pgtable_page();
207                 if (!context) {
208                         spin_unlock_irqrestore(&iommu->lock, flags);
209                         return NULL;
210                 }
211                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
212                 phy_addr = virt_to_phys((void *)context);
213                 set_root_value(root, phy_addr);
214                 set_root_present(root);
215                 __iommu_flush_cache(iommu, root, sizeof(*root));
216         }
217         spin_unlock_irqrestore(&iommu->lock, flags);
218         return &context[devfn];
219 }
220
221 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
222 {
223         struct root_entry *root;
224         struct context_entry *context;
225         int ret;
226         unsigned long flags;
227
228         spin_lock_irqsave(&iommu->lock, flags);
229         root = &iommu->root_entry[bus];
230         context = get_context_addr_from_root(root);
231         if (!context) {
232                 ret = 0;
233                 goto out;
234         }
235         ret = context_present(context[devfn]);
236 out:
237         spin_unlock_irqrestore(&iommu->lock, flags);
238         return ret;
239 }
240
241 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
242 {
243         struct root_entry *root;
244         struct context_entry *context;
245         unsigned long flags;
246
247         spin_lock_irqsave(&iommu->lock, flags);
248         root = &iommu->root_entry[bus];
249         context = get_context_addr_from_root(root);
250         if (context) {
251                 context_clear_entry(context[devfn]);
252                 __iommu_flush_cache(iommu, &context[devfn],
253                         sizeof(*context));
254         }
255         spin_unlock_irqrestore(&iommu->lock, flags);
256 }
257
258 static void free_context_table(struct intel_iommu *iommu)
259 {
260         struct root_entry *root;
261         int i;
262         unsigned long flags;
263         struct context_entry *context;
264
265         spin_lock_irqsave(&iommu->lock, flags);
266         if (!iommu->root_entry) {
267                 goto out;
268         }
269         for (i = 0; i < ROOT_ENTRY_NR; i++) {
270                 root = &iommu->root_entry[i];
271                 context = get_context_addr_from_root(root);
272                 if (context)
273                         free_pgtable_page(context);
274         }
275         free_pgtable_page(iommu->root_entry);
276         iommu->root_entry = NULL;
277 out:
278         spin_unlock_irqrestore(&iommu->lock, flags);
279 }
280
281 /* page table handling */
282 #define LEVEL_STRIDE            (9)
283 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
284
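/*
 * Each page-table level indexes 9 bits of the address on top of 4KB
 * pages.  An AGAW (adjusted guest address width) of 0/1/2 therefore
 * corresponds to a 30/39/48 bit address width and a 2/3/4 level page
 * table, which is what the helpers below encode.
 */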
285 static inline int agaw_to_level(int agaw)
286 {
287         return agaw + 2;
288 }
289
290 static inline int agaw_to_width(int agaw)
291 {
292         return 30 + agaw * LEVEL_STRIDE;
293
294 }
295
296 static inline int width_to_agaw(int width)
297 {
298         return (width - 30) / LEVEL_STRIDE;
299 }
300
301 static inline unsigned int level_to_offset_bits(int level)
302 {
303         return (12 + (level - 1) * LEVEL_STRIDE);
304 }
305
306 static inline int address_level_offset(u64 addr, int level)
307 {
308         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
309 }
310
311 static inline u64 level_mask(int level)
312 {
313         return ((u64)-1 << level_to_offset_bits(level));
314 }
315
316 static inline u64 level_size(int level)
317 {
318         return ((u64)1 << level_to_offset_bits(level));
319 }
320
321 static inline u64 align_to_level(u64 addr, int level)
322 {
323         return ((addr + level_size(level) - 1) & level_mask(level));
324 }
325
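/*
 * Walk the domain's page table for 'addr', allocating any missing
 * intermediate page-table pages on the way down, and return a pointer
 * to the last-level (4KB) PTE for that address.
 */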
326 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
327 {
328         int addr_width = agaw_to_width(domain->agaw);
329         struct dma_pte *parent, *pte = NULL;
330         int level = agaw_to_level(domain->agaw);
331         int offset;
332         unsigned long flags;
333
334         BUG_ON(!domain->pgd);
335
336         addr &= (((u64)1) << addr_width) - 1;
337         parent = domain->pgd;
338
339         spin_lock_irqsave(&domain->mapping_lock, flags);
340         while (level > 0) {
341                 void *tmp_page;
342
343                 offset = address_level_offset(addr, level);
344                 pte = &parent[offset];
345                 if (level == 1)
346                         break;
347
348                 if (!dma_pte_present(*pte)) {
349                         tmp_page = alloc_pgtable_page();
350
351                         if (!tmp_page) {
352                                 spin_unlock_irqrestore(&domain->mapping_lock,
353                                         flags);
354                                 return NULL;
355                         }
356                         __iommu_flush_cache(domain->iommu, tmp_page,
357                                         PAGE_SIZE_4K);
358                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
359                         /*
360                          * higher level tables always set r/w; the last level
361                          * page table controls read/write
362                          */
363                         dma_set_pte_readable(*pte);
364                         dma_set_pte_writable(*pte);
365                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
366                 }
367                 parent = phys_to_virt(dma_pte_addr(*pte));
368                 level--;
369         }
370
371         spin_unlock_irqrestore(&domain->mapping_lock, flags);
372         return pte;
373 }
374
375 /* return address's pte at specific level */
376 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
377                 int level)
378 {
379         struct dma_pte *parent, *pte = NULL;
380         int total = agaw_to_level(domain->agaw);
381         int offset;
382
383         parent = domain->pgd;
384         while (level <= total) {
385                 offset = address_level_offset(addr, total);
386                 pte = &parent[offset];
387                 if (level == total)
388                         return pte;
389
390                 if (!dma_pte_present(*pte))
391                         break;
392                 parent = phys_to_virt(dma_pte_addr(*pte));
393                 total--;
394         }
395         return NULL;
396 }
397
398 /* clear one page's page table */
399 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
400 {
401         struct dma_pte *pte = NULL;
402
403         /* get last level pte */
404         pte = dma_addr_level_pte(domain, addr, 1);
405
406         if (pte) {
407                 dma_clear_pte(*pte);
408                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
409         }
410 }
411
412 /* clear the last level ptes; a tlb flush should follow */
413 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
414 {
415         int addr_width = agaw_to_width(domain->agaw);
416
417         start &= (((u64)1) << addr_width) - 1;
418         end &= (((u64)1) << addr_width) - 1;
419         /* in case it's a partial page */
420         start = PAGE_ALIGN_4K(start);
421         end &= PAGE_MASK_4K;
422
423         /* we don't need lock here, nobody else touches the iova range */
424         while (start < end) {
425                 dma_pte_clear_one(domain, start);
426                 start += PAGE_SIZE_4K;
427         }
428 }
429
430 /* free page table pages. last level pte should already be cleared */
431 static void dma_pte_free_pagetable(struct dmar_domain *domain,
432         u64 start, u64 end)
433 {
434         int addr_width = agaw_to_width(domain->agaw);
435         struct dma_pte *pte;
436         int total = agaw_to_level(domain->agaw);
437         int level;
438         u64 tmp;
439
440         start &= (((u64)1) << addr_width) - 1;
441         end &= (((u64)1) << addr_width) - 1;
442
443         /* we don't need lock here, nobody else touches the iova range */
444         level = 2;
445         while (level <= total) {
446                 tmp = align_to_level(start, level);
447                 if (tmp >= end || (tmp + level_size(level) > end))
448                         return;
449
450                 while (tmp < end) {
451                         pte = dma_addr_level_pte(domain, tmp, level);
452                         if (pte) {
453                                 free_pgtable_page(
454                                         phys_to_virt(dma_pte_addr(*pte)));
455                                 dma_clear_pte(*pte);
456                                 __iommu_flush_cache(domain->iommu,
457                                                 pte, sizeof(*pte));
458                         }
459                         tmp += level_size(level);
460                 }
461                 level++;
462         }
463         /* free pgd */
464         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
465                 free_pgtable_page(domain->pgd);
466                 domain->pgd = NULL;
467         }
468 }
469
470 /* iommu handling */
471 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
472 {
473         struct root_entry *root;
474         unsigned long flags;
475
476         root = (struct root_entry *)alloc_pgtable_page();
477         if (!root)
478                 return -ENOMEM;
479
480         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
481
482         spin_lock_irqsave(&iommu->lock, flags);
483         iommu->root_entry = root;
484         spin_unlock_irqrestore(&iommu->lock, flags);
485
486         return 0;
487 }
488
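/*
 * Poll an IOMMU register with 'op' until 'cond' becomes true, or panic
 * if the hardware does not respond within DMAR_OPERATION_TIMEOUT.
 */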
489 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
490 {\
491         cycles_t start_time = get_cycles();\
492         while (1) {\
493                 sts = op (iommu->reg + offset);\
494                 if (cond)\
495                         break;\
496                 if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
497                         panic("DMAR hardware is malfunctioning\n");\
498                 cpu_relax();\
499         }\
500 }
501
502 static void iommu_set_root_entry(struct intel_iommu *iommu)
503 {
504         void *addr;
505         u32 cmd, sts;
506         unsigned long flag;
507
508         addr = iommu->root_entry;
509
510         spin_lock_irqsave(&iommu->register_lock, flag);
511         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
512
513         cmd = iommu->gcmd | DMA_GCMD_SRTP;
514         writel(cmd, iommu->reg + DMAR_GCMD_REG);
515
516         /* Make sure hardware completes it */
517         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
518                 readl, (sts & DMA_GSTS_RTPS), sts);
519
520         spin_unlock_irqrestore(&iommu->register_lock, flag);
521 }
522
523 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
524 {
525         u32 val;
526         unsigned long flag;
527
528         if (!cap_rwbf(iommu->cap))
529                 return;
530         val = iommu->gcmd | DMA_GCMD_WBF;
531
532         spin_lock_irqsave(&iommu->register_lock, flag);
533         writel(val, iommu->reg + DMAR_GCMD_REG);
534
535         /* Make sure hardware completes it */
536         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
537                         readl, (!(val & DMA_GSTS_WBFS)), val);
538
539         spin_unlock_irqrestore(&iommu->register_lock, flag);
540 }
541
542 /* return value determines if we need a write buffer flush */
543 static int __iommu_flush_context(struct intel_iommu *iommu,
544         u16 did, u16 source_id, u8 function_mask, u64 type,
545         int non_present_entry_flush)
546 {
547         u64 val = 0;
548         unsigned long flag;
549
550         /*
551          * In the non-present entry flush case: if the hardware doesn't cache
552          * non-present entries we do nothing; if it does cache them, we flush
553          * the entries of domain 0 (the domain id used to cache any
554          * non-present entries)
555          */
556         if (non_present_entry_flush) {
557                 if (!cap_caching_mode(iommu->cap))
558                         return 1;
559                 else
560                         did = 0;
561         }
562
563         switch (type) {
564         case DMA_CCMD_GLOBAL_INVL:
565                 val = DMA_CCMD_GLOBAL_INVL;
566                 break;
567         case DMA_CCMD_DOMAIN_INVL:
568                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
569                 break;
570         case DMA_CCMD_DEVICE_INVL:
571                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
572                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
573                 break;
574         default:
575                 BUG();
576         }
577         val |= DMA_CCMD_ICC;
578
579         spin_lock_irqsave(&iommu->register_lock, flag);
580         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
581
582         /* Make sure hardware completes it */
583         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
584                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
585
586         spin_unlock_irqrestore(&iommu->register_lock, flag);
587
588         /* flushing the context entry implicitly flushes the write buffer */
589         return 0;
590 }
591
592 static inline int iommu_flush_context_global(struct intel_iommu *iommu,
593         int non_present_entry_flush)
594 {
595         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
596                 non_present_entry_flush);
597 }
598
599 static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
600         int non_present_entry_flush)
601 {
602         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
603                 non_present_entry_flush);
604 }
605
606 static inline int iommu_flush_context_device(struct intel_iommu *iommu,
607         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
608 {
609         return __iommu_flush_context(iommu, did, source_id, function_mask,
610                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
611 }
612
613 /* return value determines if we need a write buffer flush */
614 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
615         u64 addr, unsigned int size_order, u64 type,
616         int non_present_entry_flush)
617 {
618         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
619         u64 val = 0, val_iva = 0;
620         unsigned long flag;
621
622         /*
623          * In the non-present entry flush case: if the hardware doesn't cache
624          * non-present entries we do nothing; if it does cache them, we flush
625          * the entries of domain 0 (the domain id used to cache any
626          * non-present entries)
627          */
628         if (non_present_entry_flush) {
629                 if (!cap_caching_mode(iommu->cap))
630                         return 1;
631                 else
632                         did = 0;
633         }
634
635         switch (type) {
636         case DMA_TLB_GLOBAL_FLUSH:
637                 /* global flush doesn't need to set IVA_REG */
638                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
639                 break;
640         case DMA_TLB_DSI_FLUSH:
641                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
642                 break;
643         case DMA_TLB_PSI_FLUSH:
644                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
645                 /* Note: always flush non-leaf currently */
646                 val_iva = size_order | addr;
647                 break;
648         default:
649                 BUG();
650         }
651         /* Note: set drain read/write */
652 #if 0
653         /*
654          * This is probably only here to be extra safe.  Looks like we can
655          * ignore it without any impact.
656          */
657         if (cap_read_drain(iommu->cap))
658                 val |= DMA_TLB_READ_DRAIN;
659 #endif
660         if (cap_write_drain(iommu->cap))
661                 val |= DMA_TLB_WRITE_DRAIN;
662
663         spin_lock_irqsave(&iommu->register_lock, flag);
664         /* Note: Only uses first TLB reg currently */
665         if (val_iva)
666                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
667         dmar_writeq(iommu->reg + tlb_offset + 8, val);
668
669         /* Make sure hardware completes it */
670         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
671                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
672
673         spin_unlock_irqrestore(&iommu->register_lock, flag);
674
675         /* check IOTLB invalidation granularity */
676         if (DMA_TLB_IAIG(val) == 0)
677                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
678         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
679                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
680                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
681         /* flushing the IOTLB implicitly flushes the write buffer */
682         return 0;
683 }
684
685 static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
686         int non_present_entry_flush)
687 {
688         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
689                 non_present_entry_flush);
690 }
691
692 static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
693         int non_present_entry_flush)
694 {
695         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
696                 non_present_entry_flush);
697 }
698
699 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
700         u64 addr, unsigned int pages, int non_present_entry_flush)
701 {
702         unsigned int mask;
703
704         BUG_ON(addr & (~PAGE_MASK_4K));
705         BUG_ON(pages == 0);
706
707         /* Fallback to domain selective flush if no PSI support */
708         if (!cap_pgsel_inv(iommu->cap))
709                 return iommu_flush_iotlb_dsi(iommu, did,
710                         non_present_entry_flush);
711
712         /*
713          * PSI requires page size to be 2 ^ x, and the base address is naturally
714          * aligned to the size
715          */
716         mask = ilog2(__roundup_pow_of_two(pages));
717         /* Fallback to domain selective flush if size is too big */
718         if (mask > cap_max_amask_val(iommu->cap))
719                 return iommu_flush_iotlb_dsi(iommu, did,
720                         non_present_entry_flush);
721
722         return __iommu_flush_iotlb(iommu, did, addr, mask,
723                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
724 }
725
726 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
727 {
728         u32 pmen;
729         unsigned long flags;
730
731         spin_lock_irqsave(&iommu->register_lock, flags);
732         pmen = readl(iommu->reg + DMAR_PMEN_REG);
733         pmen &= ~DMA_PMEN_EPM;
734         writel(pmen, iommu->reg + DMAR_PMEN_REG);
735
736         /* wait for the protected region status bit to clear */
737         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
738                 readl, !(pmen & DMA_PMEN_PRS), pmen);
739
740         spin_unlock_irqrestore(&iommu->register_lock, flags);
741 }
742
743 static int iommu_enable_translation(struct intel_iommu *iommu)
744 {
745         u32 sts;
746         unsigned long flags;
747
748         spin_lock_irqsave(&iommu->register_lock, flags);
749         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
750
751         /* Make sure hardware completes it */
752         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
753                 readl, (sts & DMA_GSTS_TES), sts);
754
755         iommu->gcmd |= DMA_GCMD_TE;
756         spin_unlock_irqrestore(&iommu->register_lock, flags);
757         return 0;
758 }
759
760 static int iommu_disable_translation(struct intel_iommu *iommu)
761 {
762         u32 sts;
763         unsigned long flag;
764
765         spin_lock_irqsave(&iommu->register_lock, flag);
766         iommu->gcmd &= ~DMA_GCMD_TE;
767         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
768
769         /* Make sure hardware completes it */
770         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
771                 readl, (!(sts & DMA_GSTS_TES)), sts);
772
773         spin_unlock_irqrestore(&iommu->register_lock, flag);
774         return 0;
775 }
776
777 /* iommu interrupt handling. Most of this is MSI-like. */
778
779 static const char *fault_reason_strings[] =
780 {
781         "Software",
782         "Present bit in root entry is clear",
783         "Present bit in context entry is clear",
784         "Invalid context entry",
785         "Access beyond MGAW",
786         "PTE Write access is not set",
787         "PTE Read access is not set",
788         "Next page table ptr is invalid",
789         "Root table address invalid",
790         "Context table ptr is invalid",
791         "non-zero reserved fields in RTP",
792         "non-zero reserved fields in CTP",
793         "non-zero reserved fields in PTE",
794 };
795 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
796
797 const char *dmar_get_fault_reason(u8 fault_reason)
798 {
799         if (fault_reason > MAX_FAULT_REASON_IDX)
800                 return "Unknown";
801         else
802                 return fault_reason_strings[fault_reason];
803 }
804
805 void dmar_msi_unmask(unsigned int irq)
806 {
807         struct intel_iommu *iommu = get_irq_data(irq);
808         unsigned long flag;
809
810         /* unmask it */
811         spin_lock_irqsave(&iommu->register_lock, flag);
812         writel(0, iommu->reg + DMAR_FECTL_REG);
813         /* Read a reg to force-flush the posted write */
814         readl(iommu->reg + DMAR_FECTL_REG);
815         spin_unlock_irqrestore(&iommu->register_lock, flag);
816 }
817
818 void dmar_msi_mask(unsigned int irq)
819 {
820         unsigned long flag;
821         struct intel_iommu *iommu = get_irq_data(irq);
822
823         /* mask it */
824         spin_lock_irqsave(&iommu->register_lock, flag);
825         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
826         /* Read a reg to force-flush the posted write */
827         readl(iommu->reg + DMAR_FECTL_REG);
828         spin_unlock_irqrestore(&iommu->register_lock, flag);
829 }
830
831 void dmar_msi_write(int irq, struct msi_msg *msg)
832 {
833         struct intel_iommu *iommu = get_irq_data(irq);
834         unsigned long flag;
835
836         spin_lock_irqsave(&iommu->register_lock, flag);
837         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
838         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
839         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
840         spin_unlock_irqrestore(&iommu->register_lock, flag);
841 }
842
843 void dmar_msi_read(int irq, struct msi_msg *msg)
844 {
845         struct intel_iommu *iommu = get_irq_data(irq);
846         unsigned long flag;
847
848         spin_lock_irqsave(&iommu->register_lock, flag);
849         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
850         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
851         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
852         spin_unlock_irqrestore(&iommu->register_lock, flag);
853 }
854
855 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
856                 u8 fault_reason, u16 source_id, u64 addr)
857 {
858         const char *reason;
859
860         reason = dmar_get_fault_reason(fault_reason);
861
862         printk(KERN_ERR
863                 "DMAR:[%s] Request device [%02x:%02x.%d] "
864                 "fault addr %llx \n"
865                 "DMAR:[fault reason %02d] %s\n",
866                 (type ? "DMA Read" : "DMA Write"),
867                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
868                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
869         return 0;
870 }
871
872 #define PRIMARY_FAULT_REG_LEN (16)
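/*
 * Fault interrupt handler: walk the primary fault recording registers,
 * log and clear each recorded fault, then clear the primary fault
 * overflow bit.
 */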
873 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
874 {
875         struct intel_iommu *iommu = dev_id;
876         int reg, fault_index;
877         u32 fault_status;
878         unsigned long flag;
879
880         spin_lock_irqsave(&iommu->register_lock, flag);
881         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
882
883         /* TBD: ignore advanced fault log currently */
884         if (!(fault_status & DMA_FSTS_PPF))
885                 goto clear_overflow;
886
887         fault_index = dma_fsts_fault_record_index(fault_status);
888         reg = cap_fault_reg_offset(iommu->cap);
889         while (1) {
890                 u8 fault_reason;
891                 u16 source_id;
892                 u64 guest_addr;
893                 int type;
894                 u32 data;
895
896                 /* highest 32 bits */
897                 data = readl(iommu->reg + reg +
898                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
899                 if (!(data & DMA_FRCD_F))
900                         break;
901
902                 fault_reason = dma_frcd_fault_reason(data);
903                 type = dma_frcd_type(data);
904
905                 data = readl(iommu->reg + reg +
906                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
907                 source_id = dma_frcd_source_id(data);
908
909                 guest_addr = dmar_readq(iommu->reg + reg +
910                                 fault_index * PRIMARY_FAULT_REG_LEN);
911                 guest_addr = dma_frcd_page_addr(guest_addr);
912                 /* clear the fault */
913                 writel(DMA_FRCD_F, iommu->reg + reg +
914                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
915
916                 spin_unlock_irqrestore(&iommu->register_lock, flag);
917
918                 iommu_page_fault_do_one(iommu, type, fault_reason,
919                                 source_id, guest_addr);
920
921                 fault_index++;
922                 if (fault_index > cap_num_fault_regs(iommu->cap))
923                         fault_index = 0;
924                 spin_lock_irqsave(&iommu->register_lock, flag);
925         }
926 clear_overflow:
927         /* clear primary fault overflow */
928         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
929         if (fault_status & DMA_FSTS_PFO)
930                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
931
932         spin_unlock_irqrestore(&iommu->register_lock, flag);
933         return IRQ_HANDLED;
934 }
935
936 int dmar_set_interrupt(struct intel_iommu *iommu)
937 {
938         int irq, ret;
939
940         irq = create_irq();
941         if (!irq) {
942                 printk(KERN_ERR "IOMMU: no free vectors\n");
943                 return -EINVAL;
944         }
945
946         set_irq_data(irq, iommu);
947         iommu->irq = irq;
948
949         ret = arch_setup_dmar_msi(irq);
950         if (ret) {
951                 set_irq_data(irq, NULL);
952                 iommu->irq = 0;
953                 destroy_irq(irq);
954                 return 0;
955         }
956
957         /* Clear any pending faults before requesting the irq */
958         iommu_page_fault(irq, iommu);
959
960         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
961         if (ret)
962                 printk(KERN_ERR "IOMMU: can't request irq\n");
963         return ret;
964 }
965
966 static int iommu_init_domains(struct intel_iommu *iommu)
967 {
968         unsigned long ndomains;
969         unsigned long nlongs;
970
971         ndomains = cap_ndoms(iommu->cap);
972         pr_debug("Number of Domains supported <%ld>\n", ndomains);
973         nlongs = BITS_TO_LONGS(ndomains);
974
975         /* TBD: there might be 64K domains,
976          * consider other allocation for future chip
977          */
978         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
979         if (!iommu->domain_ids) {
980                 printk(KERN_ERR "Allocating domain id array failed\n");
981                 return -ENOMEM;
982         }
983         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
984                         GFP_KERNEL);
985         if (!iommu->domains) {
986                 printk(KERN_ERR "Allocating domain array failed\n");
987                 kfree(iommu->domain_ids);
988                 return -ENOMEM;
989         }
990
991         spin_lock_init(&iommu->lock);
992
993         /*
994          * if Caching mode is set, then invalid translations are tagged
995          * with domainid 0. Hence we need to pre-allocate it.
996          */
997         if (cap_caching_mode(iommu->cap))
998                 set_bit(0, iommu->domain_ids);
999         return 0;
1000 }
1001
1002
1003 static void domain_exit(struct dmar_domain *domain);
1004
1005 void free_dmar_iommu(struct intel_iommu *iommu)
1006 {
1007         struct dmar_domain *domain;
1008         int i;
1009
1010         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1011         for (; i < cap_ndoms(iommu->cap); ) {
1012                 domain = iommu->domains[i];
1013                 clear_bit(i, iommu->domain_ids);
1014                 domain_exit(domain);
1015                 i = find_next_bit(iommu->domain_ids,
1016                         cap_ndoms(iommu->cap), i+1);
1017         }
1018
1019         if (iommu->gcmd & DMA_GCMD_TE)
1020                 iommu_disable_translation(iommu);
1021
1022         if (iommu->irq) {
1023                 set_irq_data(iommu->irq, NULL);
1024                 /* This will mask the irq */
1025                 free_irq(iommu->irq, iommu);
1026                 destroy_irq(iommu->irq);
1027         }
1028
1029         kfree(iommu->domains);
1030         kfree(iommu->domain_ids);
1031
1032         /* free context mapping */
1033         free_context_table(iommu);
1034 }
1035
1036 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1037 {
1038         unsigned long num;
1039         unsigned long ndomains;
1040         struct dmar_domain *domain;
1041         unsigned long flags;
1042
1043         domain = alloc_domain_mem();
1044         if (!domain)
1045                 return NULL;
1046
1047         ndomains = cap_ndoms(iommu->cap);
1048
1049         spin_lock_irqsave(&iommu->lock, flags);
1050         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1051         if (num >= ndomains) {
1052                 spin_unlock_irqrestore(&iommu->lock, flags);
1053                 free_domain_mem(domain);
1054                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1055                 return NULL;
1056         }
1057
1058         set_bit(num, iommu->domain_ids);
1059         domain->id = num;
1060         domain->iommu = iommu;
1061         iommu->domains[num] = domain;
1062         spin_unlock_irqrestore(&iommu->lock, flags);
1063
1064         return domain;
1065 }
1066
1067 static void iommu_free_domain(struct dmar_domain *domain)
1068 {
1069         unsigned long flags;
1070
1071         spin_lock_irqsave(&domain->iommu->lock, flags);
1072         clear_bit(domain->id, domain->iommu->domain_ids);
1073         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1074 }
1075
1076 static struct iova_domain reserved_iova_list;
1077 static struct lock_class_key reserved_alloc_key;
1078 static struct lock_class_key reserved_rbtree_key;
1079
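/*
 * Build the global list of IOVA ranges that must never be handed out
 * for DMA: the IOAPIC MMIO window and every PCI device's MMIO
 * resources (to prevent peer-to-peer aliasing).  Every new domain
 * copies these reservations via domain_reserve_special_ranges().
 */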
1080 static void dmar_init_reserved_ranges(void)
1081 {
1082         struct pci_dev *pdev = NULL;
1083         struct iova *iova;
1084         int i;
1085         u64 addr, size;
1086
1087         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1088
1089         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1090                 &reserved_alloc_key);
1091         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1092                 &reserved_rbtree_key);
1093
1094         /* IOAPIC ranges shouldn't be accessed by DMA */
1095         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1096                 IOVA_PFN(IOAPIC_RANGE_END));
1097         if (!iova)
1098                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1099
1100         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1101         for_each_pci_dev(pdev) {
1102                 struct resource *r;
1103
1104                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1105                         r = &pdev->resource[i];
1106                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1107                                 continue;
1108                         addr = r->start;
1109                         addr &= PAGE_MASK_4K;
1110                         size = r->end - addr;
1111                         size = PAGE_ALIGN_4K(size);
1112                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1113                                 IOVA_PFN(size + addr) - 1);
1114                         if (!iova)
1115                                 printk(KERN_ERR "Reserve iova failed\n");
1116                 }
1117         }
1118
1119 }
1120
1121 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1122 {
1123         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1124 }
1125
1126 static inline int guestwidth_to_adjustwidth(int gaw)
1127 {
1128         int agaw;
1129         int r = (gaw - 12) % 9;
1130
1131         if (r == 0)
1132                 agaw = gaw;
1133         else
1134                 agaw = gaw + 9 - r;
1135         if (agaw > 64)
1136                 agaw = 64;
1137         return agaw;
1138 }
1139
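/*
 * Initialize a freshly allocated domain: set up its IOVA allocator and
 * reserved ranges, clamp the requested guest address width to what the
 * hardware supports, pick a matching AGAW, and allocate the top-level
 * page directory.
 */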
1140 static int domain_init(struct dmar_domain *domain, int guest_width)
1141 {
1142         struct intel_iommu *iommu;
1143         int adjust_width, agaw;
1144         unsigned long sagaw;
1145
1146         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1147         spin_lock_init(&domain->mapping_lock);
1148
1149         domain_reserve_special_ranges(domain);
1150
1151         /* calculate AGAW */
1152         iommu = domain->iommu;
1153         if (guest_width > cap_mgaw(iommu->cap))
1154                 guest_width = cap_mgaw(iommu->cap);
1155         domain->gaw = guest_width;
1156         adjust_width = guestwidth_to_adjustwidth(guest_width);
1157         agaw = width_to_agaw(adjust_width);
1158         sagaw = cap_sagaw(iommu->cap);
1159         if (!test_bit(agaw, &sagaw)) {
1160                 /* hardware doesn't support it, choose a bigger one */
1161                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1162                 agaw = find_next_bit(&sagaw, 5, agaw);
1163                 if (agaw >= 5)
1164                         return -ENODEV;
1165         }
1166         domain->agaw = agaw;
1167         INIT_LIST_HEAD(&domain->devices);
1168
1169         /* always allocate the top pgd */
1170         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1171         if (!domain->pgd)
1172                 return -ENOMEM;
1173         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1174         return 0;
1175 }
1176
1177 static void domain_exit(struct dmar_domain *domain)
1178 {
1179         u64 end;
1180
1181         /* Domain 0 is reserved, so don't process it */
1182         if (!domain)
1183                 return;
1184
1185         domain_remove_dev_info(domain);
1186         /* destroy iovas */
1187         put_iova_domain(&domain->iovad);
1188         end = DOMAIN_MAX_ADDR(domain->gaw);
1189         end = end & (~PAGE_MASK_4K);
1190
1191         /* clear ptes */
1192         dma_pte_clear_range(domain, 0, end);
1193
1194         /* free page tables */
1195         dma_pte_free_pagetable(domain, 0, end);
1196
1197         iommu_free_domain(domain);
1198         free_domain_mem(domain);
1199 }
1200
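/*
 * Install the context entry for (bus, devfn) so the device's DMA is
 * translated through this domain's page tables, then invalidate the
 * context cache and IOTLB for it (or just flush the write buffer when
 * the hardware doesn't cache non-present entries).
 */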
1201 static int domain_context_mapping_one(struct dmar_domain *domain,
1202                 u8 bus, u8 devfn)
1203 {
1204         struct context_entry *context;
1205         struct intel_iommu *iommu = domain->iommu;
1206         unsigned long flags;
1207
1208         pr_debug("Set context mapping for %02x:%02x.%d\n",
1209                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1210         BUG_ON(!domain->pgd);
1211         context = device_to_context_entry(iommu, bus, devfn);
1212         if (!context)
1213                 return -ENOMEM;
1214         spin_lock_irqsave(&iommu->lock, flags);
1215         if (context_present(*context)) {
1216                 spin_unlock_irqrestore(&iommu->lock, flags);
1217                 return 0;
1218         }
1219
1220         context_set_domain_id(*context, domain->id);
1221         context_set_address_width(*context, domain->agaw);
1222         context_set_address_root(*context, virt_to_phys(domain->pgd));
1223         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1224         context_set_fault_enable(*context);
1225         context_set_present(*context);
1226         __iommu_flush_cache(iommu, context, sizeof(*context));
1227
1228         /* it's a non-present to present mapping */
1229         if (iommu_flush_context_device(iommu, domain->id,
1230                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1231                 iommu_flush_write_buffer(iommu);
1232         else
1233                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1234         spin_unlock_irqrestore(&iommu->lock, flags);
1235         return 0;
1236 }
1237
1238 static int
1239 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1240 {
1241         int ret;
1242         struct pci_dev *tmp, *parent;
1243
1244         ret = domain_context_mapping_one(domain, pdev->bus->number,
1245                 pdev->devfn);
1246         if (ret)
1247                 return ret;
1248
1249         /* dependent device mapping */
1250         tmp = pci_find_upstream_pcie_bridge(pdev);
1251         if (!tmp)
1252                 return 0;
1253         /* Secondary interface's bus number and devfn 0 */
1254         parent = pdev->bus->self;
1255         while (parent != tmp) {
1256                 ret = domain_context_mapping_one(domain, parent->bus->number,
1257                         parent->devfn);
1258                 if (ret)
1259                         return ret;
1260                 parent = parent->bus->self;
1261         }
1262         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1263                 return domain_context_mapping_one(domain,
1264                         tmp->subordinate->number, 0);
1265         else /* this is a legacy PCI bridge */
1266                 return domain_context_mapping_one(domain,
1267                         tmp->bus->number, tmp->devfn);
1268 }
1269
1270 static int domain_context_mapped(struct dmar_domain *domain,
1271         struct pci_dev *pdev)
1272 {
1273         int ret;
1274         struct pci_dev *tmp, *parent;
1275
1276         ret = device_context_mapped(domain->iommu,
1277                 pdev->bus->number, pdev->devfn);
1278         if (!ret)
1279                 return ret;
1280         /* dependent device mapping */
1281         tmp = pci_find_upstream_pcie_bridge(pdev);
1282         if (!tmp)
1283                 return ret;
1284         /* Secondary interface's bus number and devfn 0 */
1285         parent = pdev->bus->self;
1286         while (parent != tmp) {
1287                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1288                         parent->devfn);
1289                 if (!ret)
1290                         return ret;
1291                 parent = parent->bus->self;
1292         }
1293         if (tmp->is_pcie)
1294                 return device_context_mapped(domain->iommu,
1295                         tmp->subordinate->number, 0);
1296         else
1297                 return device_context_mapped(domain->iommu,
1298                         tmp->bus->number, tmp->devfn);
1299 }
1300
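/*
 * Map the physical range [hpa, hpa + size) at IO virtual address
 * 'iova', one 4KB page at a time, with the requested read/write
 * protection.
 */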
1301 static int
1302 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1303                         u64 hpa, size_t size, int prot)
1304 {
1305         u64 start_pfn, end_pfn;
1306         struct dma_pte *pte;
1307         int index;
1308
1309         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1310                 return -EINVAL;
1311         iova &= PAGE_MASK_4K;
1312         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1313         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1314         index = 0;
1315         while (start_pfn < end_pfn) {
1316                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1317                 if (!pte)
1318                         return -ENOMEM;
1319                 /* We don't need lock here, nobody else
1320                  * touches the iova range
1321                  */
1322                 BUG_ON(dma_pte_addr(*pte));
1323                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1324                 dma_set_pte_prot(*pte, prot);
1325                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1326                 start_pfn++;
1327                 index++;
1328         }
1329         return 0;
1330 }
1331
1332 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1333 {
1334         clear_context_table(domain->iommu, bus, devfn);
1335         iommu_flush_context_global(domain->iommu, 0);
1336         iommu_flush_iotlb_global(domain->iommu, 0);
1337 }
1338
1339 static void domain_remove_dev_info(struct dmar_domain *domain)
1340 {
1341         struct device_domain_info *info;
1342         unsigned long flags;
1343
1344         spin_lock_irqsave(&device_domain_lock, flags);
1345         while (!list_empty(&domain->devices)) {
1346                 info = list_entry(domain->devices.next,
1347                         struct device_domain_info, link);
1348                 list_del(&info->link);
1349                 list_del(&info->global);
1350                 if (info->dev)
1351                         info->dev->dev.archdata.iommu = NULL;
1352                 spin_unlock_irqrestore(&device_domain_lock, flags);
1353
1354                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1355                 free_devinfo_mem(info);
1356
1357                 spin_lock_irqsave(&device_domain_lock, flags);
1358         }
1359         spin_unlock_irqrestore(&device_domain_lock, flags);
1360 }
1361
1362 /*
1363  * find_domain
1364  * Note: the device's domain info is stored in struct pci_dev->dev.archdata.iommu
1365  */
1366 struct dmar_domain *
1367 find_domain(struct pci_dev *pdev)
1368 {
1369         struct device_domain_info *info;
1370
1371         /* No lock here, assumes no domain exit in normal case */
1372         info = pdev->dev.archdata.iommu;
1373         if (info)
1374                 return info->domain;
1375         return NULL;
1376 }
1377
1378 /* domain is initialized */
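/*
 * Find or create the domain for a PCI device.  Devices behind a
 * PCIe-to-PCI bridge all share one domain, keyed by the bridge's
 * secondary bus, so an existing bridge domain is reused when present;
 * otherwise a new domain is allocated on the matching DRHD unit's
 * IOMMU, initialized, and recorded in pdev->dev.archdata.iommu.
 */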
1379 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1380 {
1381         struct dmar_domain *domain, *found = NULL;
1382         struct intel_iommu *iommu;
1383         struct dmar_drhd_unit *drhd;
1384         struct device_domain_info *info, *tmp;
1385         struct pci_dev *dev_tmp;
1386         unsigned long flags;
1387         int bus = 0, devfn = 0;
1388
1389         domain = find_domain(pdev);
1390         if (domain)
1391                 return domain;
1392
1393         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1394         if (dev_tmp) {
1395                 if (dev_tmp->is_pcie) {
1396                         bus = dev_tmp->subordinate->number;
1397                         devfn = 0;
1398                 } else {
1399                         bus = dev_tmp->bus->number;
1400                         devfn = dev_tmp->devfn;
1401                 }
1402                 spin_lock_irqsave(&device_domain_lock, flags);
1403                 list_for_each_entry(info, &device_domain_list, global) {
1404                         if (info->bus == bus && info->devfn == devfn) {
1405                                 found = info->domain;
1406                                 break;
1407                         }
1408                 }
1409                 spin_unlock_irqrestore(&device_domain_lock, flags);
1410                 /* pcie-pci bridge already has a domain, use it */
1411                 if (found) {
1412                         domain = found;
1413                         goto found_domain;
1414                 }
1415         }
1416
1417         /* Allocate new domain for the device */
1418         drhd = dmar_find_matched_drhd_unit(pdev);
1419         if (!drhd) {
1420                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1421                         pci_name(pdev));
1422                 return NULL;
1423         }
1424         iommu = drhd->iommu;
1425
1426         domain = iommu_alloc_domain(iommu);
1427         if (!domain)
1428                 goto error;
1429
1430         if (domain_init(domain, gaw)) {
1431                 domain_exit(domain);
1432                 goto error;
1433         }
1434
1435         /* register pcie-to-pci device */
1436         if (dev_tmp) {
1437                 info = alloc_devinfo_mem();
1438                 if (!info) {
1439                         domain_exit(domain);
1440                         goto error;
1441                 }
1442                 info->bus = bus;
1443                 info->devfn = devfn;
1444                 info->dev = NULL;
1445                 info->domain = domain;
1446                 /* This domain is shared by devices under p2p bridge */
1447                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1448
1449                 /* pcie-to-pci bridge already has a domain, use it */
1450                 found = NULL;
1451                 spin_lock_irqsave(&device_domain_lock, flags);
1452                 list_for_each_entry(tmp, &device_domain_list, global) {
1453                         if (tmp->bus == bus && tmp->devfn == devfn) {
1454                                 found = tmp->domain;
1455                                 break;
1456                         }
1457                 }
1458                 if (found) {
1459                         free_devinfo_mem(info);
1460                         domain_exit(domain);
1461                         domain = found;
1462                 } else {
1463                         list_add(&info->link, &domain->devices);
1464                         list_add(&info->global, &device_domain_list);
1465                 }
1466                 spin_unlock_irqrestore(&device_domain_lock, flags);
1467         }
1468
1469 found_domain:
1470         info = alloc_devinfo_mem();
1471         if (!info)
1472                 goto error;
1473         info->bus = pdev->bus->number;
1474         info->devfn = pdev->devfn;
1475         info->dev = pdev;
1476         info->domain = domain;
1477         spin_lock_irqsave(&device_domain_lock, flags);
1478         /* somebody is fast */
1479         found = find_domain(pdev);
1480         if (found != NULL) {
1481                 spin_unlock_irqrestore(&device_domain_lock, flags);
1482                 if (found != domain) {
1483                         domain_exit(domain);
1484                         domain = found;
1485                 }
1486                 free_devinfo_mem(info);
1487                 return domain;
1488         }
1489         list_add(&info->link, &domain->devices);
1490         list_add(&info->global, &device_domain_list);
1491         pdev->dev.archdata.iommu = info;
1492         spin_unlock_irqrestore(&device_domain_lock, flags);
1493         return domain;
1494 error:
1495         /* recheck it here, maybe others set it */
1496         return find_domain(pdev);
1497 }
1498
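/*
 * Set up a 1:1 (identity) mapping of [start, end) in the device's
 * domain: reserve the IOVA range, map it read/write, and install the
 * device's context entry.  Used for RMRR regions and for the graphics
 * and ISA/floppy workarounds below.
 */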
1499 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1500 {
1501         struct dmar_domain *domain;
1502         unsigned long size;
1503         u64 base;
1504         int ret;
1505
1506         printk(KERN_INFO
1507                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1508                 pci_name(pdev), start, end);
1509         /* page table init */
1510         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1511         if (!domain)
1512                 return -ENOMEM;
1513
1514         /* The address might not be aligned */
1515         base = start & PAGE_MASK_4K;
1516         size = end - base;
1517         size = PAGE_ALIGN_4K(size);
1518         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1519                         IOVA_PFN(base + size) - 1)) {
1520                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1521                 ret = -ENOMEM;
1522                 goto error;
1523         }
1524
1525         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1526                 size, base, pci_name(pdev));
1527         /*
1528          * The RMRR range might overlap with the physical memory range;
1529          * clear it first
1530          */
1531         dma_pte_clear_range(domain, base, base + size);
1532
1533         ret = domain_page_mapping(domain, base, base, size,
1534                 DMA_PTE_READ|DMA_PTE_WRITE);
1535         if (ret)
1536                 goto error;
1537
1538         /* context entry init */
1539         ret = domain_context_mapping(domain, pdev);
1540         if (!ret)
1541                 return 0;
1542 error:
1543         domain_exit(domain);
1544         return ret;
1545
1546 }
1547
1548 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1549         struct pci_dev *pdev)
1550 {
1551         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1552                 return 0;
1553         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1554                 rmrr->end_address + 1);
1555 }
1556
1557 #ifdef CONFIG_DMAR_GFX_WA
1558 struct iommu_prepare_data {
1559         struct pci_dev *pdev;
1560         int ret;
1561 };
1562
1563 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1564                                          unsigned long end_pfn, void *datax)
1565 {
1566         struct iommu_prepare_data *data;
1567
1568         data = (struct iommu_prepare_data *)datax;
1569
1570         data->ret = iommu_prepare_identity_map(data->pdev,
1571                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1572         return data->ret;
1574 }
1575
1576 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1577 {
1578         int nid;
1579         struct iommu_prepare_data data;
1580
1581         data.pdev = pdev;
1582         data.ret = 0;
1583
1584         for_each_online_node(nid) {
1585                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1586                 if (data.ret)
1587                         return data.ret;
1588         }
1589         return data.ret;
1590 }
1591
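/*
 * Graphics workaround (CONFIG_DMAR_GFX_WA): give every graphics device that
 * still goes through the IOMMU a 1:1 mapping of all active memory regions,
 * node by node, so gfx drivers that DMA to arbitrary pages keep working.
 */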
1592 static void __init iommu_prepare_gfx_mapping(void)
1593 {
1594         struct pci_dev *pdev = NULL;
1595         int ret;
1596
1597         for_each_pci_dev(pdev) {
1598                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1599                                 !IS_GFX_DEVICE(pdev))
1600                         continue;
1601                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1602                         pci_name(pdev));
1603                 ret = iommu_prepare_with_active_regions(pdev);
1604                 if (ret)
1605                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1606         }
1607 }
1608 #endif
1609
1610 #ifdef CONFIG_DMAR_FLOPPY_WA
1611 static inline void iommu_prepare_isa(void)
1612 {
1613         struct pci_dev *pdev;
1614         int ret;
1615
1616         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1617         if (!pdev)
1618                 return;
1619
1620         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1621         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1622
1623         if (ret)
1624                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1625                         "floppy might not work\n");
1627 }
1628 #else
1629 static inline void iommu_prepare_isa(void)
1630 {
1631         return;
1632 }
1633 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1634
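/*
 * init_dmars(): allocate per-IOMMU state and root/context tables, program
 * the RMRR, graphics and ISA workaround mappings, then enable fault
 * reporting and DMA translation on every DRHD unit that is not ignored.
 */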
1635 int __init init_dmars(void)
1636 {
1637         struct dmar_drhd_unit *drhd;
1638         struct dmar_rmrr_unit *rmrr;
1639         struct pci_dev *pdev;
1640         struct intel_iommu *iommu;
1641         int i, ret, unit = 0;
1642
1643         /*
1644          * for each drhd
1645          *    allocate root
1646          *    initialize and program root entry to not present
1647          * endfor
1648          */
1649         for_each_drhd_unit(drhd) {
1650                 g_num_of_iommus++;
1651                 /*
1652                  * No lock needed: the count is only incremented in this
1653                  * single-threaded kernel __init code path; all other
1654                  * accesses are read-only.
1655                  */
1656         }
1657
1658         deferred_flush = kzalloc(g_num_of_iommus *
1659                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1660         if (!deferred_flush) {
1661                 ret = -ENOMEM;
1662                 goto error;
1663         }
1664
1665         for_each_drhd_unit(drhd) {
1666                 if (drhd->ignored)
1667                         continue;
1668                 iommu = alloc_iommu(drhd);
1669                 if (!iommu) {
1670                         ret = -ENOMEM;
1671                         goto error;
1672                 }
1673
1674                 ret = iommu_init_domains(iommu);
1675                 if (ret)
1676                         goto error;
1677
1678                 /*
1679                  * TBD:
1680                  * we could share the same root & context tables
1681                  * among all IOMMUs.  Needs to be split out later.
1682                  */
1683                 ret = iommu_alloc_root_entry(iommu);
1684                 if (ret) {
1685                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1686                         goto error;
1687                 }
1688         }
1689
1690         /*
1691          * For each rmrr
1692          *   for each dev attached to rmrr
1693          *   do
1694          *     locate drhd for dev, alloc domain for dev
1695          *     allocate free domain
1696          *     allocate page table entries for rmrr
1697          *     if context not allocated for bus
1698          *           allocate and init context
1699          *           set present in root table for this bus
1700          *     init context with domain, translation etc
1701          *    endfor
1702          * endfor
1703          */
1704         for_each_rmrr_units(rmrr) {
1705                 for (i = 0; i < rmrr->devices_cnt; i++) {
1706                         pdev = rmrr->devices[i];
1707                         /* some BIOSes list non-existent devices in the DMAR table */
1708                         if (!pdev)
1709                                 continue;
1710                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1711                         if (ret)
1712                                 printk(KERN_ERR
1713                                  "IOMMU: mapping reserved region failed\n");
1714                 }
1715         }
1716
1717         iommu_prepare_gfx_mapping();
1718
1719         iommu_prepare_isa();
1720
1721         /*
1722          * for each drhd
1723          *   enable fault log
1724          *   global invalidate context cache
1725          *   global invalidate iotlb
1726          *   enable translation
1727          */
1728         for_each_drhd_unit(drhd) {
1729                 if (drhd->ignored)
1730                         continue;
1731                 iommu = drhd->iommu;
1732                 sprintf(iommu->name, "dmar%d", unit++);
1733
1734                 iommu_flush_write_buffer(iommu);
1735
1736                 ret = dmar_set_interrupt(iommu);
1737                 if (ret)
1738                         goto error;
1739
1740                 iommu_set_root_entry(iommu);
1741
1742                 iommu_flush_context_global(iommu, 0);
1743                 iommu_flush_iotlb_global(iommu, 0);
1744
1745                 iommu_disable_protect_mem_regions(iommu);
1746
1747                 ret = iommu_enable_translation(iommu);
1748                 if (ret)
1749                         goto error;
1750         }
1751
1752         return 0;
1753 error:
1754         for_each_drhd_unit(drhd) {
1755                 if (drhd->ignored)
1756                         continue;
1757                 iommu = drhd->iommu;
1758                 free_iommu(iommu);
1759         }
1760         return ret;
1761 }
1762
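/*
 * Round a request up to whole 4K pages, taking into account the offset of
 * @host_addr within its page, so the returned size covers the full buffer.
 */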
1763 static inline u64 aligned_size(u64 host_addr, size_t size)
1764 {
1765         u64 addr;
1766         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1767         return PAGE_ALIGN_4K(addr);
1768 }
1769
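/*
 * Allocate an IOVA range of @size bytes that ends below @end (clamped to the
 * domain's guest address width).  Returns NULL if the request cannot fit.
 */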
1770 struct iova *
1771 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1772 {
1773         struct iova *piova;
1774
1775         /* Make sure it's in range */
1776         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1777         if (!size || (IOVA_START_ADDR + size > end))
1778                 return NULL;
1779
1780         piova = alloc_iova(&domain->iovad,
1781                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1782         return piova;
1783 }
1784
1785 static struct iova *
1786 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1787                 size_t size)
1788 {
1789         struct pci_dev *pdev = to_pci_dev(dev);
1790         struct iova *iova = NULL;
1791
1792         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1793                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1794         } else  {
1795                 /*
1796                  * First try to allocate an io virtual address in
1797                  * DMA_32BIT_MASK and if that fails then try allocating
1798                  * from higher range
1799                  */
1800                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1801                 if (!iova)
1802                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1803         }
1804
1805         if (!iova) {
1806                 printk(KERN_ERR"Allocating iova for %s failed\n", pci_name(pdev));
1807                 return NULL;
1808         }
1809
1810         return iova;
1811 }
1812
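/*
 * Make sure @pdev has a domain and that its context entry points at it;
 * used on the DMA map paths.  Returns NULL (and logs an error) on failure.
 */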
1813 static struct dmar_domain *
1814 get_valid_domain_for_dev(struct pci_dev *pdev)
1815 {
1816         struct dmar_domain *domain;
1817         int ret;
1818
1819         domain = get_domain_for_dev(pdev,
1820                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1821         if (!domain) {
1822                 printk(KERN_ERR
1823                         "Allocating domain for %s failed\n", pci_name(pdev));
1824                 return NULL;
1825         }
1826
1827         /* make sure context mapping is ok */
1828         if (unlikely(!domain_context_mapped(domain, pdev))) {
1829                 ret = domain_context_mapping(domain, pdev);
1830                 if (ret) {
1831                         printk(KERN_ERR
1832                                 "Domain context map for %s failed\n",
1833                                 pci_name(pdev));
1834                         return NULL;
1835                 }
1836         }
1837
1838         return domain;
1839 }
1840
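/*
 * Map a single physically contiguous buffer for DMA.  This is the
 * .map_single hook in intel_dma_ops below, so it receives the buffer's
 * physical address (intel_alloc_coherent() also calls it directly).
 * Devices tagged DUMMY_DEVICE_DOMAIN_INFO bypass translation and simply
 * get @paddr back; otherwise an IOVA is allocated, the pages are mapped
 * with the protection implied by @dir, and the IOTLB is flushed for the
 * new entries.  Returns 0 on failure.
 */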
1841 static dma_addr_t
1842 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1843 {
1844         struct pci_dev *pdev = to_pci_dev(hwdev);
1845         struct dmar_domain *domain;
1846         unsigned long start_paddr;
1847         struct iova *iova;
1848         int prot = 0;
1849         int ret;
1850
1851         BUG_ON(dir == DMA_NONE);
1852         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1853                 return paddr;
1854
1855         domain = get_valid_domain_for_dev(pdev);
1856         if (!domain)
1857                 return 0;
1858
1859         size = aligned_size((u64)paddr, size);
1860
1861         iova = __intel_alloc_iova(hwdev, domain, size);
1862         if (!iova)
1863                 goto error;
1864
1865         start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1866
1867         /*
1868          * Check if DMAR supports zero-length reads on write-only
1869          * mappings.
1870          */
1871         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1872                         !cap_zlr(domain->iommu->cap))
1873                 prot |= DMA_PTE_READ;
1874         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1875                 prot |= DMA_PTE_WRITE;
1876         /*
1877          * The range paddr .. paddr + size may cover only part of a page; we
1878          * map the whole page.  Note: if two parts of one page are mapped
1879          * separately, we might end up with two guest addresses for the same
1880          * host paddr, but this is not a big problem.
1881          */
1882         ret = domain_page_mapping(domain, start_paddr,
1883                 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1884         if (ret)
1885                 goto error;
1886
1887         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1888                 pci_name(pdev), size, (u64)paddr,
1889                 size, (u64)start_paddr, dir);
1890
1891         /* it's a non-present to present mapping */
1892         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1893                         start_paddr, size >> PAGE_SHIFT_4K, 1);
1894         if (ret)
1895                 iommu_flush_write_buffer(domain->iommu);
1896
1897         return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1898
1899 error:
1900         if (iova)
1901                 __free_iova(&domain->iovad, iova);
1902         printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1903                 pci_name(pdev), size, (u64)paddr, dir);
1904         return 0;
1905 }
1906
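/*
 * Drain the deferred-unmap queues: one global IOTLB flush per IOMMU that has
 * pending entries, then free the queued IOVAs.  Called with
 * async_umap_flush_lock held (from the timer or from add_unmap()).
 */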
1907 static void flush_unmaps(void)
1908 {
1909         int i, j;
1910
1911         timer_on = 0;
1912
1913         /* just flush them all */
1914         for (i = 0; i < g_num_of_iommus; i++) {
1915                 if (deferred_flush[i].next) {
1916                         struct intel_iommu *iommu =
1917                                 deferred_flush[i].domain[0]->iommu;
1918
1919                         iommu_flush_iotlb_global(iommu, 0);
1920                         for (j = 0; j < deferred_flush[i].next; j++) {
1921                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
1922                                                 deferred_flush[i].iova[j]);
1923                         }
1924                         deferred_flush[i].next = 0;
1925                 }
1926         }
1927
1928         list_size = 0;
1929 }
1930
1931 static void flush_unmaps_timeout(unsigned long data)
1932 {
1933         unsigned long flags;
1934
1935         spin_lock_irqsave(&async_umap_flush_lock, flags);
1936         flush_unmaps();
1937         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1938 }
1939
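/*
 * Queue an IOVA for deferred freeing on its IOMMU's per-unit list.  The
 * queue is drained immediately once HIGH_WATER_MARK entries have built up;
 * otherwise unmap_timer flushes it about 10ms later.
 */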
1940 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1941 {
1942         unsigned long flags;
1943         int next, iommu_id;
1944
1945         spin_lock_irqsave(&async_umap_flush_lock, flags);
1946         if (list_size == HIGH_WATER_MARK)
1947                 flush_unmaps();
1948
1949         iommu_id = dom->iommu->seq_id;
1950
1951         next = deferred_flush[iommu_id].next;
1952         deferred_flush[iommu_id].domain[next] = dom;
1953         deferred_flush[iommu_id].iova[next] = iova;
1954         deferred_flush[iommu_id].next++;
1955
1956         if (!timer_on) {
1957                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1958                 timer_on = 1;
1959         }
1960         list_size++;
1961         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1962 }
1963
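/*
 * Undo intel_map_single(): clear the PTEs and free the page tables backing
 * the mapping.  With intel_iommu=strict the IOTLB is flushed and the IOVA
 * freed right away; otherwise the IOVA is handed to add_unmap() so the
 * flush can be batched.
 */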
1964 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1965         size_t size, int dir)
1966 {
1967         struct pci_dev *pdev = to_pci_dev(dev);
1968         struct dmar_domain *domain;
1969         unsigned long start_addr;
1970         struct iova *iova;
1971
1972         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1973                 return;
1974         domain = find_domain(pdev);
1975         BUG_ON(!domain);
1976
1977         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1978         if (!iova)
1979                 return;
1980
1981         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1982         size = aligned_size((u64)dev_addr, size);
1983
1984         pr_debug("Device %s unmapping: %lx@%llx\n",
1985                 pci_name(pdev), size, (u64)start_addr);
1986
1987         /*  clear the whole page */
1988         dma_pte_clear_range(domain, start_addr, start_addr + size);
1989         /* free page tables */
1990         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1991         if (intel_iommu_strict) {
1992                 if (iommu_flush_iotlb_psi(domain->iommu,
1993                         domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
1994                         iommu_flush_write_buffer(domain->iommu);
1995                 /* free iova */
1996                 __free_iova(&domain->iovad, iova);
1997         } else {
1998                 add_unmap(domain, iova);
1999                 /*
2000                  * Queue up the release of the unmap to avoid the per-unmap
2001                  * iotlb flush, which was consuming about 1/6th of the CPU.
2002                  */
2003         }
2004 }
2005
2006 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2007                        dma_addr_t *dma_handle, gfp_t flags)
2008 {
2009         void *vaddr;
2010         int order;
2011
2012         size = PAGE_ALIGN_4K(size);
2013         order = get_order(size);
2014         flags &= ~(GFP_DMA | GFP_DMA32);
2015
2016         vaddr = (void *)__get_free_pages(flags, order);
2017         if (!vaddr)
2018                 return NULL;
2019         memset(vaddr, 0, size);
2020
2021         *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
2022         if (*dma_handle)
2023                 return vaddr;
2024         free_pages((unsigned long)vaddr, order);
2025         return NULL;
2026 }
2027
2028 static void intel_free_coherent(struct device *hwdev, size_t size,
2029         void *vaddr, dma_addr_t dma_handle)
2030 {
2031         int order;
2032
2033         size = PAGE_ALIGN_4K(size);
2034         order = get_order(size);
2035
2036         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2037         free_pages((unsigned long)vaddr, order);
2038 }
2039
2040 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
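/*
 * Undo intel_map_sg(): recompute the total mapped size from the scatterlist,
 * clear the PTEs and page tables for the whole range, flush the IOTLB, and
 * release the single IOVA that backed it.
 */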
2041 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2042         int nelems, int dir)
2043 {
2044         int i;
2045         struct pci_dev *pdev = to_pci_dev(hwdev);
2046         struct dmar_domain *domain;
2047         unsigned long start_addr;
2048         struct iova *iova;
2049         size_t size = 0;
2050         void *addr;
2051         struct scatterlist *sg;
2052
2053         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2054                 return;
2055
2056         domain = find_domain(pdev);
2057
2058         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2059         if (!iova)
2060                 return;
2061         for_each_sg(sglist, sg, nelems, i) {
2062                 addr = SG_ENT_VIRT_ADDRESS(sg);
2063                 size += aligned_size((u64)addr, sg->length);
2064         }
2065
2066         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2067
2068         /*  clear the whole page */
2069         dma_pte_clear_range(domain, start_addr, start_addr + size);
2070         /* free page tables */
2071         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2072
2073         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2074                         size >> PAGE_SHIFT_4K, 0))
2075                 iommu_flush_write_buffer(domain->iommu);
2076
2077         /* free iova */
2078         __free_iova(&domain->iovad, iova);
2079 }
2080
2081 static int intel_nontranslate_map_sg(struct device *hddev,
2082         struct scatterlist *sglist, int nelems, int dir)
2083 {
2084         int i;
2085         struct scatterlist *sg;
2086
2087         for_each_sg(sglist, sg, nelems, i) {
2088                 BUG_ON(!sg_page(sg));
2089                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2090                 sg->dma_length = sg->length;
2091         }
2092         return nelems;
2093 }
2094
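/*
 * Map a scatterlist: one IOVA allocation covers the total (page-aligned)
 * length, and each segment is mapped contiguously into it, filling in
 * dma_address and dma_length per segment.  If any segment fails to map,
 * the partial mapping is torn down and 0 is returned.
 */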
2095 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2096                                 int nelems, int dir)
2097 {
2098         void *addr;
2099         int i;
2100         struct pci_dev *pdev = to_pci_dev(hwdev);
2101         struct dmar_domain *domain;
2102         size_t size = 0;
2103         int prot = 0;
2104         size_t offset = 0;
2105         struct iova *iova = NULL;
2106         int ret;
2107         struct scatterlist *sg;
2108         unsigned long start_addr;
2109
2110         BUG_ON(dir == DMA_NONE);
2111         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2112                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2113
2114         domain = get_valid_domain_for_dev(pdev);
2115         if (!domain)
2116                 return 0;
2117
2118         for_each_sg(sglist, sg, nelems, i) {
2119                 addr = SG_ENT_VIRT_ADDRESS(sg);
2120                 addr = (void *)virt_to_phys(addr);
2121                 size += aligned_size((u64)addr, sg->length);
2122         }
2123
2124         iova = __intel_alloc_iova(hwdev, domain, size);
2125         if (!iova) {
2126                 sglist->dma_length = 0;
2127                 return 0;
2128         }
2129
2130         /*
2131          * Check if DMAR supports zero-length reads on write-only
2132          * mappings.
2133          */
2134         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2135                         !cap_zlr(domain->iommu->cap))
2136                 prot |= DMA_PTE_READ;
2137         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2138                 prot |= DMA_PTE_WRITE;
2139
2140         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2141         offset = 0;
2142         for_each_sg(sglist, sg, nelems, i) {
2143                 addr = SG_ENT_VIRT_ADDRESS(sg);
2144                 addr = (void *)virt_to_phys(addr);
2145                 size = aligned_size((u64)addr, sg->length);
2146                 ret = domain_page_mapping(domain, start_addr + offset,
2147                         ((u64)addr) & PAGE_MASK_4K,
2148                         size, prot);
2149                 if (ret) {
2150                         /*  clear the page */
2151                         dma_pte_clear_range(domain, start_addr,
2152                                   start_addr + offset);
2153                         /* free page tables */
2154                         dma_pte_free_pagetable(domain, start_addr,
2155                                   start_addr + offset);
2156                         /* free iova */
2157                         __free_iova(&domain->iovad, iova);
2158                         return 0;
2159                 }
2160                 sg->dma_address = start_addr + offset +
2161                                 ((u64)addr & (~PAGE_MASK_4K));
2162                 sg->dma_length = sg->length;
2163                 offset += size;
2164         }
2165
2166         /* it's a non-present to present mapping */
2167         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2168                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2169                 iommu_flush_write_buffer(domain->iommu);
2170         return nelems;
2171 }
2172
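/*
 * intel_iommu_init() installs this table as the global dma_ops, so all
 * streaming and coherent DMA API traffic is routed through the handlers
 * above.
 */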
2173 static struct dma_mapping_ops intel_dma_ops = {
2174         .alloc_coherent = intel_alloc_coherent,
2175         .free_coherent = intel_free_coherent,
2176         .map_single = intel_map_single,
2177         .unmap_single = intel_unmap_single,
2178         .map_sg = intel_map_sg,
2179         .unmap_sg = intel_unmap_sg,
2180 };
2181
2182 static inline int iommu_domain_cache_init(void)
2183 {
2184         int ret = 0;
2185
2186         iommu_domain_cache = kmem_cache_create("iommu_domain",
2187                                          sizeof(struct dmar_domain),
2188                                          0,
2189                                          SLAB_HWCACHE_ALIGN,
2191                                          NULL);
2192         if (!iommu_domain_cache) {
2193                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2194                 ret = -ENOMEM;
2195         }
2196
2197         return ret;
2198 }
2199
2200 static inline int iommu_devinfo_cache_init(void)
2201 {
2202         int ret = 0;
2203
2204         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2205                                          sizeof(struct device_domain_info),
2206                                          0,
2207                                          SLAB_HWCACHE_ALIGN,
2209                                          NULL);
2210         if (!iommu_devinfo_cache) {
2211                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2212                 ret = -ENOMEM;
2213         }
2214
2215         return ret;
2216 }
2217
2218 static inline int iommu_iova_cache_init(void)
2219 {
2220         int ret = 0;
2221
2222         iommu_iova_cache = kmem_cache_create("iommu_iova",
2223                                          sizeof(struct iova),
2224                                          0,
2225                                          SLAB_HWCACHE_ALIGN,
2227                                          NULL);
2228         if (!iommu_iova_cache) {
2229                 printk(KERN_ERR "Couldn't create iova cache\n");
2230                 ret = -ENOMEM;
2231         }
2232
2233         return ret;
2234 }
2235
2236 static int __init iommu_init_mempool(void)
2237 {
2238         int ret;
2239         ret = iommu_iova_cache_init();
2240         if (ret)
2241                 return ret;
2242
2243         ret = iommu_domain_cache_init();
2244         if (ret)
2245                 goto domain_error;
2246
2247         ret = iommu_devinfo_cache_init();
2248         if (!ret)
2249                 return ret;
2250
2251         kmem_cache_destroy(iommu_domain_cache);
2252 domain_error:
2253         kmem_cache_destroy(iommu_iova_cache);
2254
2255         return -ENOMEM;
2256 }
2257
2258 static void __init iommu_exit_mempool(void)
2259 {
2260         kmem_cache_destroy(iommu_devinfo_cache);
2261         kmem_cache_destroy(iommu_domain_cache);
2262         kmem_cache_destroy(iommu_iova_cache);
2264 }
2265
2266 void __init detect_intel_iommu(void)
2267 {
2268         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2269                 return;
2270         if (early_dmar_detect()) {
2271                 iommu_detected = 1;
2272         }
2273 }
2274
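/*
 * Mark DRHD units that cover no live PCI devices as ignored.  When graphics
 * mapping is disabled (intel_iommu=igfx_off), units that serve only graphics
 * devices are ignored as well, and their devices are tagged
 * DUMMY_DEVICE_DOMAIN_INFO so the DMA paths bypass translation.
 */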
2275 static void __init init_no_remapping_devices(void)
2276 {
2277         struct dmar_drhd_unit *drhd;
2278
2279         for_each_drhd_unit(drhd) {
2280                 if (!drhd->include_all) {
2281                         int i;
2282                         for (i = 0; i < drhd->devices_cnt; i++)
2283                                 if (drhd->devices[i] != NULL)
2284                                         break;
2285                         /* ignore DMAR unit if no pci devices exist */
2286                         if (i == drhd->devices_cnt)
2287                                 drhd->ignored = 1;
2288                 }
2289         }
2290
2291         if (dmar_map_gfx)
2292                 return;
2293
2294         for_each_drhd_unit(drhd) {
2295                 int i;
2296                 if (drhd->ignored || drhd->include_all)
2297                         continue;
2298
2299                 for (i = 0; i < drhd->devices_cnt; i++)
2300                         if (drhd->devices[i] &&
2301                                 !IS_GFX_DEVICE(drhd->devices[i]))
2302                                 break;
2303
2304                 if (i < drhd->devices_cnt)
2305                         continue;
2306
2307                 /* bypass IOMMU if it is just for gfx devices */
2308                 drhd->ignored = 1;
2309                 for (i = 0; i < drhd->devices_cnt; i++) {
2310                         if (!drhd->devices[i])
2311                                 continue;
2312                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2313                 }
2314         }
2315 }
2316
2317 int __init intel_iommu_init(void)
2318 {
2319         int ret = 0;
2320
2321         if (no_iommu || swiotlb || dmar_disabled)
2322                 return -ENODEV;
2323
2324         if (dmar_table_init())
2325                 return  -ENODEV;
2326
2327         iommu_init_mempool();
2328         dmar_init_reserved_ranges();
2329
2330         init_no_remapping_devices();
2331
2332         ret = init_dmars();
2333         if (ret) {
2334                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2335                 put_iova_domain(&reserved_iova_list);
2336                 iommu_exit_mempool();
2337                 return ret;
2338         }
2339         printk(KERN_INFO
2340         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2341
2342         init_timer(&unmap_timer);
2343         force_iommu = 1;
2344         dma_ops = &intel_dma_ops;
2345         return 0;
2346 }
2347