drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include "iova.h"
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu is declared in this header on x86-64 */
39 #include <asm/cacheflush.h>
40 #include <asm/gart.h>
41 #include "pci.h"
42
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
45
46 #define IOAPIC_RANGE_START      (0xfee00000)
47 #define IOAPIC_RANGE_END        (0xfeefffff)
48 #define IOVA_START_ADDR         (0x1000)
49
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
51
52 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) /* 10sec */
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56
57 static void flush_unmaps_timeout(unsigned long data);
58
59 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
60
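/*
 * Deferred ("lazy") IOTLB flushing: unmapped IOVAs are queued in the
 * per-IOMMU tables below and released in batches, either when the
 * unmap_timer fires or when a table fills up to HIGH_WATER_MARK.
 * Booting with intel_iommu=strict disables this batching (see
 * intel_iommu_setup() below); the flush path itself lives later in
 * this file.
 */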
61 #define HIGH_WATER_MARK 250
62 struct deferred_flush_tables {
63         int next;
64         struct iova *iova[HIGH_WATER_MARK];
65         struct dmar_domain *domain[HIGH_WATER_MARK];
66 };
67
68 static struct deferred_flush_tables *deferred_flush;
69
70 /* number of IOMMUs in the system; used to size per-IOMMU data */
71 static int g_num_of_iommus;
72
73 static DEFINE_SPINLOCK(async_umap_flush_lock);
74 static LIST_HEAD(unmaps_to_do);
75
76 static int timer_on;
77 static long list_size;
78
79 static void domain_remove_dev_info(struct dmar_domain *domain);
80
81 static int dmar_disabled;
82 static int __initdata dmar_map_gfx = 1;
83 static int dmar_forcedac;
84 static int intel_iommu_strict;
85
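/*
 * Sentinel stored in pci_dev->dev.archdata.iommu for devices that the
 * IOMMU code should leave alone (see the checks in
 * iommu_prepare_rmrr_dev() and iommu_prepare_gfx_mapping() below).
 */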
86 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
87 static DEFINE_SPINLOCK(device_domain_lock);
88 static LIST_HEAD(device_domain_list);
89
90 static int __init intel_iommu_setup(char *str)
91 {
92         if (!str)
93                 return -EINVAL;
94         while (*str) {
95                 if (!strncmp(str, "off", 3)) {
96                         dmar_disabled = 1;
97                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
98                 } else if (!strncmp(str, "igfx_off", 8)) {
99                         dmar_map_gfx = 0;
100                         printk(KERN_INFO
101                                 "Intel-IOMMU: disable GFX device mapping\n");
102                 } else if (!strncmp(str, "forcedac", 8)) {
103                         printk(KERN_INFO
104                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
105                         dmar_forcedac = 1;
106                 } else if (!strncmp(str, "strict", 6)) {
107                         printk(KERN_INFO
108                                 "Intel-IOMMU: disable batched IOTLB flush\n");
109                         intel_iommu_strict = 1;
110                 }
111
112                 str += strcspn(str, ",");
113                 while (*str == ',')
114                         str++;
115         }
116         return 0;
117 }
118 __setup("intel_iommu=", intel_iommu_setup);
119
120 static struct kmem_cache *iommu_domain_cache;
121 static struct kmem_cache *iommu_devinfo_cache;
122 static struct kmem_cache *iommu_iova_cache;
123
124 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
125 {
126         unsigned int flags;
127         void *vaddr;
128
129         /* trying to avoid low memory issues */
130         flags = current->flags & PF_MEMALLOC;
131         current->flags |= PF_MEMALLOC;
132         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
133         current->flags &= (~PF_MEMALLOC | flags);
134         return vaddr;
135 }
136
137
138 static inline void *alloc_pgtable_page(void)
139 {
140         unsigned int flags;
141         void *vaddr;
142
143         /* trying to avoid low memory issues */
144         flags = current->flags & PF_MEMALLOC;
145         current->flags |= PF_MEMALLOC;
146         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
147         current->flags &= (~PF_MEMALLOC | flags);
148         return vaddr;
149 }
150
151 static inline void free_pgtable_page(void *vaddr)
152 {
153         free_page((unsigned long)vaddr);
154 }
155
156 static inline void *alloc_domain_mem(void)
157 {
158         return iommu_kmem_cache_alloc(iommu_domain_cache);
159 }
160
161 static inline void free_domain_mem(void *vaddr)
162 {
163         kmem_cache_free(iommu_domain_cache, vaddr);
164 }
165
166 static inline void * alloc_devinfo_mem(void)
167 {
168         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
169 }
170
171 static inline void free_devinfo_mem(void *vaddr)
172 {
173         kmem_cache_free(iommu_devinfo_cache, vaddr);
174 }
175
176 struct iova *alloc_iova_mem(void)
177 {
178         return iommu_kmem_cache_alloc(iommu_iova_cache);
179 }
180
181 void free_iova_mem(struct iova *iova)
182 {
183         kmem_cache_free(iommu_iova_cache, iova);
184 }
185
186 static inline void __iommu_flush_cache(
187         struct intel_iommu *iommu, void *addr, int size)
188 {
189         if (!ecap_coherent(iommu->ecap))
190                 clflush_cache_range(addr, size);
191 }
192
193 /* Gets context entry for a given bus and devfn */
194 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
195                 u8 bus, u8 devfn)
196 {
197         struct root_entry *root;
198         struct context_entry *context;
199         unsigned long phy_addr;
200         unsigned long flags;
201
202         spin_lock_irqsave(&iommu->lock, flags);
203         root = &iommu->root_entry[bus];
204         context = get_context_addr_from_root(root);
205         if (!context) {
206                 context = (struct context_entry *)alloc_pgtable_page();
207                 if (!context) {
208                         spin_unlock_irqrestore(&iommu->lock, flags);
209                         return NULL;
210                 }
211                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
212                 phy_addr = virt_to_phys((void *)context);
213                 set_root_value(root, phy_addr);
214                 set_root_present(root);
215                 __iommu_flush_cache(iommu, root, sizeof(*root));
216         }
217         spin_unlock_irqrestore(&iommu->lock, flags);
218         return &context[devfn];
219 }
220
221 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
222 {
223         struct root_entry *root;
224         struct context_entry *context;
225         int ret;
226         unsigned long flags;
227
228         spin_lock_irqsave(&iommu->lock, flags);
229         root = &iommu->root_entry[bus];
230         context = get_context_addr_from_root(root);
231         if (!context) {
232                 ret = 0;
233                 goto out;
234         }
235         ret = context_present(context[devfn]);
236 out:
237         spin_unlock_irqrestore(&iommu->lock, flags);
238         return ret;
239 }
240
241 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
242 {
243         struct root_entry *root;
244         struct context_entry *context;
245         unsigned long flags;
246
247         spin_lock_irqsave(&iommu->lock, flags);
248         root = &iommu->root_entry[bus];
249         context = get_context_addr_from_root(root);
250         if (context) {
251                 context_clear_entry(context[devfn]);
252                 __iommu_flush_cache(iommu, &context[devfn],
253                         sizeof(*context));
254         }
255         spin_unlock_irqrestore(&iommu->lock, flags);
256 }
257
258 static void free_context_table(struct intel_iommu *iommu)
259 {
260         struct root_entry *root;
261         int i;
262         unsigned long flags;
263         struct context_entry *context;
264
265         spin_lock_irqsave(&iommu->lock, flags);
266         if (!iommu->root_entry) {
267                 goto out;
268         }
269         for (i = 0; i < ROOT_ENTRY_NR; i++) {
270                 root = &iommu->root_entry[i];
271                 context = get_context_addr_from_root(root);
272                 if (context)
273                         free_pgtable_page(context);
274         }
275         free_pgtable_page(iommu->root_entry);
276         iommu->root_entry = NULL;
277 out:
278         spin_unlock_irqrestore(&iommu->lock, flags);
279 }
280
281 /* page table handling */
282 #define LEVEL_STRIDE            (9)
283 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
284
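/*
 * Each page-table level decodes LEVEL_STRIDE (9) bits of the address on
 * top of the 12-bit page offset.  AGAW (adjusted guest address width)
 * encodes the number of levels: agaw_to_level(agaw) == agaw + 2 and
 * agaw_to_width(agaw) == 30 + 9 * agaw, so e.g. agaw 2 means a 4-level
 * table covering a 48-bit address space.
 */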
285 static inline int agaw_to_level(int agaw)
286 {
287         return agaw + 2;
288 }
289
290 static inline int agaw_to_width(int agaw)
291 {
292         return 30 + agaw * LEVEL_STRIDE;
293
294 }
295
296 static inline int width_to_agaw(int width)
297 {
298         return (width - 30) / LEVEL_STRIDE;
299 }
300
301 static inline unsigned int level_to_offset_bits(int level)
302 {
303         return (12 + (level - 1) * LEVEL_STRIDE);
304 }
305
306 static inline int address_level_offset(u64 addr, int level)
307 {
308         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
309 }
310
311 static inline u64 level_mask(int level)
312 {
313         return ((u64)-1 << level_to_offset_bits(level));
314 }
315
316 static inline u64 level_size(int level)
317 {
318         return ((u64)1 << level_to_offset_bits(level));
319 }
320
321 static inline u64 align_to_level(u64 addr, int level)
322 {
323         return ((addr + level_size(level) - 1) & level_mask(level));
324 }
325
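/*
 * Walk the page table from the top level down to the 4K leaf entry for
 * @addr, allocating (and cache-flushing) intermediate page-table pages
 * as needed.  Returns the leaf PTE, or NULL if an allocation fails.
 */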
326 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
327 {
328         int addr_width = agaw_to_width(domain->agaw);
329         struct dma_pte *parent, *pte = NULL;
330         int level = agaw_to_level(domain->agaw);
331         int offset;
332         unsigned long flags;
333
334         BUG_ON(!domain->pgd);
335
336         addr &= (((u64)1) << addr_width) - 1;
337         parent = domain->pgd;
338
339         spin_lock_irqsave(&domain->mapping_lock, flags);
340         while (level > 0) {
341                 void *tmp_page;
342
343                 offset = address_level_offset(addr, level);
344                 pte = &parent[offset];
345                 if (level == 1)
346                         break;
347
348                 if (!dma_pte_present(*pte)) {
349                         tmp_page = alloc_pgtable_page();
350
351                         if (!tmp_page) {
352                                 spin_unlock_irqrestore(&domain->mapping_lock,
353                                         flags);
354                                 return NULL;
355                         }
356                         __iommu_flush_cache(domain->iommu, tmp_page,
357                                         PAGE_SIZE_4K);
358                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
359                         /*
360                          * high level table always sets r/w, last level page
361                          * table control read/write
362                          */
363                         dma_set_pte_readable(*pte);
364                         dma_set_pte_writable(*pte);
365                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
366                 }
367                 parent = phys_to_virt(dma_pte_addr(*pte));
368                 level--;
369         }
370
371         spin_unlock_irqrestore(&domain->mapping_lock, flags);
372         return pte;
373 }
374
375 /* return address's pte at specific level */
376 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
377                 int level)
378 {
379         struct dma_pte *parent, *pte = NULL;
380         int total = agaw_to_level(domain->agaw);
381         int offset;
382
383         parent = domain->pgd;
384         while (level <= total) {
385                 offset = address_level_offset(addr, total);
386                 pte = &parent[offset];
387                 if (level == total)
388                         return pte;
389
390                 if (!dma_pte_present(*pte))
391                         break;
392                 parent = phys_to_virt(dma_pte_addr(*pte));
393                 total--;
394         }
395         return NULL;
396 }
397
398 /* clear one page's page table */
399 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
400 {
401         struct dma_pte *pte = NULL;
402
403         /* get last level pte */
404         pte = dma_addr_level_pte(domain, addr, 1);
405
406         if (pte) {
407                 dma_clear_pte(*pte);
408                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
409         }
410 }
411
412 /* clear last level pte, a tlb flush should be followed */
413 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
414 {
415         int addr_width = agaw_to_width(domain->agaw);
416
417         start &= (((u64)1) << addr_width) - 1;
418         end &= (((u64)1) << addr_width) - 1;
419         /* in case it's partial page */
420         start = PAGE_ALIGN_4K(start);
421         end &= PAGE_MASK_4K;
422
423         /* we don't need a lock here; nobody else touches this iova range */
424         while (start < end) {
425                 dma_pte_clear_one(domain, start);
426                 start += PAGE_SIZE_4K;
427         }
428 }
429
430 /* free page table pages. last level pte should already be cleared */
431 static void dma_pte_free_pagetable(struct dmar_domain *domain,
432         u64 start, u64 end)
433 {
434         int addr_width = agaw_to_width(domain->agaw);
435         struct dma_pte *pte;
436         int total = agaw_to_level(domain->agaw);
437         int level;
438         u64 tmp;
439
440         start &= (((u64)1) << addr_width) - 1;
441         end &= (((u64)1) << addr_width) - 1;
442
443         /* we don't need a lock here; nobody else touches this iova range */
444         level = 2;
445         while (level <= total) {
446                 tmp = align_to_level(start, level);
447                 if (tmp >= end || (tmp + level_size(level) > end))
448                         return;
449
450                 while (tmp < end) {
451                         pte = dma_addr_level_pte(domain, tmp, level);
452                         if (pte) {
453                                 free_pgtable_page(
454                                         phys_to_virt(dma_pte_addr(*pte)));
455                                 dma_clear_pte(*pte);
456                                 __iommu_flush_cache(domain->iommu,
457                                                 pte, sizeof(*pte));
458                         }
459                         tmp += level_size(level);
460                 }
461                 level++;
462         }
463         /* free pgd */
464         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
465                 free_pgtable_page(domain->pgd);
466                 domain->pgd = NULL;
467         }
468 }
469
470 /* iommu handling */
471 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
472 {
473         struct root_entry *root;
474         unsigned long flags;
475
476         root = (struct root_entry *)alloc_pgtable_page();
477         if (!root)
478                 return -ENOMEM;
479
480         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
481
482         spin_lock_irqsave(&iommu->lock, flags);
483         iommu->root_entry = root;
484         spin_unlock_irqrestore(&iommu->lock, flags);
485
486         return 0;
487 }
488
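/*
 * Poll an IOMMU register with `op' until `cond' becomes true, and panic
 * if the hardware does not respond within DMAR_OPERATION_TIMEOUT.
 */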
489 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
490 {\
491         cycles_t start_time = get_cycles();\
492         while (1) {\
493                 sts = op (iommu->reg + offset);\
494                 if (cond)\
495                         break;\
496                 if (DMAR_OPERATION_TIMEOUT < (get_cycles() - start_time))\
497                         panic("DMAR hardware is malfunctioning\n");\
498                 cpu_relax();\
499         }\
500 }
501
502 static void iommu_set_root_entry(struct intel_iommu *iommu)
503 {
504         void *addr;
505         u32 cmd, sts;
506         unsigned long flag;
507
508         addr = iommu->root_entry;
509
510         spin_lock_irqsave(&iommu->register_lock, flag);
511         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
512
513         cmd = iommu->gcmd | DMA_GCMD_SRTP;
514         writel(cmd, iommu->reg + DMAR_GCMD_REG);
515
516         /* Make sure the hardware completes it */
517         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
518                 readl, (sts & DMA_GSTS_RTPS), sts);
519
520         spin_unlock_irqrestore(&iommu->register_lock, flag);
521 }
522
523 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
524 {
525         u32 val;
526         unsigned long flag;
527
528         if (!cap_rwbf(iommu->cap))
529                 return;
530         val = iommu->gcmd | DMA_GCMD_WBF;
531
532         spin_lock_irqsave(&iommu->register_lock, flag);
533         writel(val, iommu->reg + DMAR_GCMD_REG);
534
535         /* Make sure the hardware completes it */
536         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
537                         readl, (!(val & DMA_GSTS_WBFS)), val);
538
539         spin_unlock_irqrestore(&iommu->register_lock, flag);
540 }
541
542 /* the return value determines whether we need a write buffer flush */
543 static int __iommu_flush_context(struct intel_iommu *iommu,
544         u16 did, u16 source_id, u8 function_mask, u64 type,
545         int non_present_entry_flush)
546 {
547         u64 val = 0;
548         unsigned long flag;
549
550         /*
551          * In the non-present entry flush case: if the hardware doesn't
552          * cache non-present entries we do nothing; if it does, we flush
553          * the entries of domain 0 (the domain id used to cache any
554          * non-present entries)
555          */
556         if (non_present_entry_flush) {
557                 if (!cap_caching_mode(iommu->cap))
558                         return 1;
559                 else
560                         did = 0;
561         }
562
563         switch (type) {
564         case DMA_CCMD_GLOBAL_INVL:
565                 val = DMA_CCMD_GLOBAL_INVL;
566                 break;
567         case DMA_CCMD_DOMAIN_INVL:
568                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
569                 break;
570         case DMA_CCMD_DEVICE_INVL:
571                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
572                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
573                 break;
574         default:
575                 BUG();
576         }
577         val |= DMA_CCMD_ICC;
578
579         spin_lock_irqsave(&iommu->register_lock, flag);
580         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
581
582         /* Make sure the hardware completes it */
583         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
584                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
585
586         spin_unlock_irqrestore(&iommu->register_lock, flag);
587
588         /* flushing the context entry will implicitly flush the write buffer */
589         return 0;
590 }
591
592 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
593         int non_present_entry_flush)
594 {
595         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
596                 non_present_entry_flush);
597 }
598
599 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
600         int non_present_entry_flush)
601 {
602         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
603                 non_present_entry_flush);
604 }
605
606 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
607         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
608 {
609         return __iommu_flush_context(iommu, did, source_id, function_mask,
610                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
611 }
612
613 /* the return value determines whether we need a write buffer flush */
614 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
615         u64 addr, unsigned int size_order, u64 type,
616         int non_present_entry_flush)
617 {
618         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
619         u64 val = 0, val_iva = 0;
620         unsigned long flag;
621
622         /*
623          * In the non-present entry flush case: if the hardware doesn't
624          * cache non-present entries we do nothing; if it does, we flush
625          * the entries of domain 0 (the domain id used to cache any
626          * non-present entries)
627          */
628         if (non_present_entry_flush) {
629                 if (!cap_caching_mode(iommu->cap))
630                         return 1;
631                 else
632                         did = 0;
633         }
634
635         switch (type) {
636         case DMA_TLB_GLOBAL_FLUSH:
637                 /* a global flush doesn't need to set IVA_REG */
638                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
639                 break;
640         case DMA_TLB_DSI_FLUSH:
641                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
642                 break;
643         case DMA_TLB_PSI_FLUSH:
644                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
645                 /* Note: always flush non-leaf currently */
646                 val_iva = size_order | addr;
647                 break;
648         default:
649                 BUG();
650         }
651         /* Note: set drain read/write */
652 #if 0
653         /*
654          * This is probably only needed to be extra safe; it looks like
655          * we can ignore it without any impact.
656          */
657         if (cap_read_drain(iommu->cap))
658                 val |= DMA_TLB_READ_DRAIN;
659 #endif
660         if (cap_write_drain(iommu->cap))
661                 val |= DMA_TLB_WRITE_DRAIN;
662
663         spin_lock_irqsave(&iommu->register_lock, flag);
664         /* Note: Only uses first TLB reg currently */
665         if (val_iva)
666                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
667         dmar_writeq(iommu->reg + tlb_offset + 8, val);
668
669         /* Make sure the hardware completes it */
670         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
671                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
672
673         spin_unlock_irqrestore(&iommu->register_lock, flag);
674
675         /* check IOTLB invalidation granularity */
676         if (DMA_TLB_IAIG(val) == 0)
677                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
678         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
679                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
680                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
681         /* flushing the IOTLB entry will implicitly flush the write buffer */
682         return 0;
683 }
684
685 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
686         int non_present_entry_flush)
687 {
688         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
689                 non_present_entry_flush);
690 }
691
692 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
693         int non_present_entry_flush)
694 {
695         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
696                 non_present_entry_flush);
697 }
698
699 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
700         u64 addr, unsigned int pages, int non_present_entry_flush)
701 {
702         unsigned int mask;
703
704         BUG_ON(addr & (~PAGE_MASK_4K));
705         BUG_ON(pages == 0);
706
707         /* Fallback to domain selective flush if no PSI support */
708         if (!cap_pgsel_inv(iommu->cap))
709                 return iommu_flush_iotlb_dsi(iommu, did,
710                         non_present_entry_flush);
711
712         /*
713          * PSI requires page size to be 2 ^ x, and the base address is naturally
714          * aligned to the size
715          */
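        /* e.g. pages == 5 rounds up to 8, so mask == 3 (an aligned 8-page flush) */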
716         mask = ilog2(__roundup_pow_of_two(pages));
717         /* Fallback to domain selective flush if size is too big */
718         if (mask > cap_max_amask_val(iommu->cap))
719                 return iommu_flush_iotlb_dsi(iommu, did,
720                         non_present_entry_flush);
721
722         return __iommu_flush_iotlb(iommu, did, addr, mask,
723                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
724 }
725
726 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
727 {
728         u32 pmen;
729         unsigned long flags;
730
731         spin_lock_irqsave(&iommu->register_lock, flags);
732         pmen = readl(iommu->reg + DMAR_PMEN_REG);
733         pmen &= ~DMA_PMEN_EPM;
734         writel(pmen, iommu->reg + DMAR_PMEN_REG);
735
736         /* wait for the protected region status bit to clear */
737         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
738                 readl, !(pmen & DMA_PMEN_PRS), pmen);
739
740         spin_unlock_irqrestore(&iommu->register_lock, flags);
741 }
742
743 static int iommu_enable_translation(struct intel_iommu *iommu)
744 {
745         u32 sts;
746         unsigned long flags;
747
748         spin_lock_irqsave(&iommu->register_lock, flags);
749         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
750
751         /* Make sure the hardware completes it */
752         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
753                 readl, (sts & DMA_GSTS_TES), sts);
754
755         iommu->gcmd |= DMA_GCMD_TE;
756         spin_unlock_irqrestore(&iommu->register_lock, flags);
757         return 0;
758 }
759
760 static int iommu_disable_translation(struct intel_iommu *iommu)
761 {
762         u32 sts;
763         unsigned long flag;
764
765         spin_lock_irqsave(&iommu->register_lock, flag);
766         iommu->gcmd &= ~DMA_GCMD_TE;
767         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
768
769         /* Make sure the hardware completes it */
770         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
771                 readl, (!(sts & DMA_GSTS_TES)), sts);
772
773         spin_unlock_irqrestore(&iommu->register_lock, flag);
774         return 0;
775 }
776
777 /* iommu interrupt handling. Most of it is MSI-like. */
778
779 static const char *fault_reason_strings[] =
780 {
781         "Software",
782         "Present bit in root entry is clear",
783         "Present bit in context entry is clear",
784         "Invalid context entry",
785         "Access beyond MGAW",
786         "PTE Write access is not set",
787         "PTE Read access is not set",
788         "Next page table ptr is invalid",
789         "Root table address invalid",
790         "Context table ptr is invalid",
791         "non-zero reserved fields in RTP",
792         "non-zero reserved fields in CTP",
793         "non-zero reserved fields in PTE",
794 };
795 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
796
797 const char *dmar_get_fault_reason(u8 fault_reason)
798 {
799         if (fault_reason > MAX_FAULT_REASON_IDX)
800                 return "Unknown";
801         else
802                 return fault_reason_strings[fault_reason];
803 }
804
805 void dmar_msi_unmask(unsigned int irq)
806 {
807         struct intel_iommu *iommu = get_irq_data(irq);
808         unsigned long flag;
809
810         /* unmask it */
811         spin_lock_irqsave(&iommu->register_lock, flag);
812         writel(0, iommu->reg + DMAR_FECTL_REG);
813         /* Read back a register to force-flush the posted write */
814         readl(iommu->reg + DMAR_FECTL_REG);
815         spin_unlock_irqrestore(&iommu->register_lock, flag);
816 }
817
818 void dmar_msi_mask(unsigned int irq)
819 {
820         unsigned long flag;
821         struct intel_iommu *iommu = get_irq_data(irq);
822
823         /* mask it */
824         spin_lock_irqsave(&iommu->register_lock, flag);
825         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
826         /* Read back a register to force-flush the posted write */
827         readl(iommu->reg + DMAR_FECTL_REG);
828         spin_unlock_irqrestore(&iommu->register_lock, flag);
829 }
830
831 void dmar_msi_write(int irq, struct msi_msg *msg)
832 {
833         struct intel_iommu *iommu = get_irq_data(irq);
834         unsigned long flag;
835
836         spin_lock_irqsave(&iommu->register_lock, flag);
837         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
838         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
839         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
840         spin_unlock_irqrestore(&iommu->register_lock, flag);
841 }
842
843 void dmar_msi_read(int irq, struct msi_msg *msg)
844 {
845         struct intel_iommu *iommu = get_irq_data(irq);
846         unsigned long flag;
847
848         spin_lock_irqsave(&iommu->register_lock, flag);
849         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
850         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
851         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
852         spin_unlock_irqrestore(&iommu->register_lock, flag);
853 }
854
855 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
856                 u8 fault_reason, u16 source_id, u64 addr)
857 {
858         const char *reason;
859
860         reason = dmar_get_fault_reason(fault_reason);
861
862         printk(KERN_ERR
863                 "DMAR:[%s] Request device [%02x:%02x.%d] "
864                 "fault addr %llx \n"
865                 "DMAR:[fault reason %02d] %s\n",
866                 (type ? "DMA Read" : "DMA Write"),
867                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
868                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
869         return 0;
870 }
871
872 #define PRIMARY_FAULT_REG_LEN (16)
873 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
874 {
875         struct intel_iommu *iommu = dev_id;
876         int reg, fault_index;
877         u32 fault_status;
878         unsigned long flag;
879
880         spin_lock_irqsave(&iommu->register_lock, flag);
881         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
882
883         /* TBD: ignore advanced fault log currently */
884         if (!(fault_status & DMA_FSTS_PPF))
885                 goto clear_overflow;
886
887         fault_index = dma_fsts_fault_record_index(fault_status);
888         reg = cap_fault_reg_offset(iommu->cap);
889         while (1) {
890                 u8 fault_reason;
891                 u16 source_id;
892                 u64 guest_addr;
893                 int type;
894                 u32 data;
895
896                 /* highest 32 bits */
897                 data = readl(iommu->reg + reg +
898                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
899                 if (!(data & DMA_FRCD_F))
900                         break;
901
902                 fault_reason = dma_frcd_fault_reason(data);
903                 type = dma_frcd_type(data);
904
905                 data = readl(iommu->reg + reg +
906                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
907                 source_id = dma_frcd_source_id(data);
908
909                 guest_addr = dmar_readq(iommu->reg + reg +
910                                 fault_index * PRIMARY_FAULT_REG_LEN);
911                 guest_addr = dma_frcd_page_addr(guest_addr);
912                 /* clear the fault */
913                 writel(DMA_FRCD_F, iommu->reg + reg +
914                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
915
916                 spin_unlock_irqrestore(&iommu->register_lock, flag);
917
918                 iommu_page_fault_do_one(iommu, type, fault_reason,
919                                 source_id, guest_addr);
920
921                 fault_index++;
922                 if (fault_index > cap_num_fault_regs(iommu->cap))
923                         fault_index = 0;
924                 spin_lock_irqsave(&iommu->register_lock, flag);
925         }
926 clear_overflow:
927         /* clear primary fault overflow */
928         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
929         if (fault_status & DMA_FSTS_PFO)
930                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
931
932         spin_unlock_irqrestore(&iommu->register_lock, flag);
933         return IRQ_HANDLED;
934 }
935
936 int dmar_set_interrupt(struct intel_iommu *iommu)
937 {
938         int irq, ret;
939
940         irq = create_irq();
941         if (!irq) {
942                 printk(KERN_ERR "IOMMU: no free vectors\n");
943                 return -EINVAL;
944         }
945
946         set_irq_data(irq, iommu);
947         iommu->irq = irq;
948
949         ret = arch_setup_dmar_msi(irq);
950         if (ret) {
951                 set_irq_data(irq, NULL);
952                 iommu->irq = 0;
953                 destroy_irq(irq);
954                 return 0;
955         }
956
957         /* Make sure any pending faults are processed and cleared */
958         iommu_page_fault(irq, iommu);
959
960         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
961         if (ret)
962                 printk(KERN_ERR "IOMMU: can't request irq\n");
963         return ret;
964 }
965
966 static int iommu_init_domains(struct intel_iommu *iommu)
967 {
968         unsigned long ndomains;
969         unsigned long nlongs;
970
971         ndomains = cap_ndoms(iommu->cap);
972         pr_debug("Number of Domains supported <%ld>\n", ndomains);
973         nlongs = BITS_TO_LONGS(ndomains);
974
975         /* TBD: there might be 64K domains,
976          * consider other allocation for future chip
977          */
978         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
979         if (!iommu->domain_ids) {
980                 printk(KERN_ERR "Allocating domain id array failed\n");
981                 return -ENOMEM;
982         }
983         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
984                         GFP_KERNEL);
985         if (!iommu->domains) {
986                 printk(KERN_ERR "Allocating domain array failed\n");
987                 kfree(iommu->domain_ids);
988                 return -ENOMEM;
989         }
990
991         spin_lock_init(&iommu->lock);
992
993         /*
994          * if Caching mode is set, then invalid translations are tagged
995          * with domain id 0. Hence we need to pre-allocate it.
996          */
997         if (cap_caching_mode(iommu->cap))
998                 set_bit(0, iommu->domain_ids);
999         return 0;
1000 }
1001
1002
1003 static void domain_exit(struct dmar_domain *domain);
1004
1005 void free_dmar_iommu(struct intel_iommu *iommu)
1006 {
1007         struct dmar_domain *domain;
1008         int i;
1009
1010         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1011         for (; i < cap_ndoms(iommu->cap); ) {
1012                 domain = iommu->domains[i];
1013                 clear_bit(i, iommu->domain_ids);
1014                 domain_exit(domain);
1015                 i = find_next_bit(iommu->domain_ids,
1016                         cap_ndoms(iommu->cap), i+1);
1017         }
1018
1019         if (iommu->gcmd & DMA_GCMD_TE)
1020                 iommu_disable_translation(iommu);
1021
1022         if (iommu->irq) {
1023                 set_irq_data(iommu->irq, NULL);
1024                 /* This will mask the irq */
1025                 free_irq(iommu->irq, iommu);
1026                 destroy_irq(iommu->irq);
1027         }
1028
1029         kfree(iommu->domains);
1030         kfree(iommu->domain_ids);
1031
1032         /* free context mapping */
1033         free_context_table(iommu);
1034 }
1035
1036 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1037 {
1038         unsigned long num;
1039         unsigned long ndomains;
1040         struct dmar_domain *domain;
1041         unsigned long flags;
1042
1043         domain = alloc_domain_mem();
1044         if (!domain)
1045                 return NULL;
1046
1047         ndomains = cap_ndoms(iommu->cap);
1048
1049         spin_lock_irqsave(&iommu->lock, flags);
1050         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1051         if (num >= ndomains) {
1052                 spin_unlock_irqrestore(&iommu->lock, flags);
1053                 free_domain_mem(domain);
1054                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1055                 return NULL;
1056         }
1057
1058         set_bit(num, iommu->domain_ids);
1059         domain->id = num;
1060         domain->iommu = iommu;
1061         iommu->domains[num] = domain;
1062         spin_unlock_irqrestore(&iommu->lock, flags);
1063
1064         return domain;
1065 }
1066
1067 static void iommu_free_domain(struct dmar_domain *domain)
1068 {
1069         unsigned long flags;
1070
1071         spin_lock_irqsave(&domain->iommu->lock, flags);
1072         clear_bit(domain->id, domain->iommu->domain_ids);
1073         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1074 }
1075
1076 static struct iova_domain reserved_iova_list;
1077 static struct lock_class_key reserved_alloc_key;
1078 static struct lock_class_key reserved_rbtree_key;
1079
1080 static void dmar_init_reserved_ranges(void)
1081 {
1082         struct pci_dev *pdev = NULL;
1083         struct iova *iova;
1084         int i;
1085         u64 addr, size;
1086
1087         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1088
1089         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1090                 &reserved_alloc_key);
1091         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1092                 &reserved_rbtree_key);
1093
1094         /* IOAPIC ranges shouldn't be accessed by DMA */
1095         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1096                 IOVA_PFN(IOAPIC_RANGE_END));
1097         if (!iova)
1098                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1099
1100         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1101         for_each_pci_dev(pdev) {
1102                 struct resource *r;
1103
1104                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1105                         r = &pdev->resource[i];
1106                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1107                                 continue;
1108                         addr = r->start;
1109                         addr &= PAGE_MASK_4K;
1110                         size = r->end - addr;
1111                         size = PAGE_ALIGN_4K(size);
1112                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1113                                 IOVA_PFN(size + addr) - 1);
1114                         if (!iova)
1115                                 printk(KERN_ERR "Reserve iova failed\n");
1116                 }
1117         }
1118
1119 }
1120
1121 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1122 {
1123         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1124 }
1125
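/*
 * Round a guest address width up to the next width the page-table format
 * can represent: 12 bits of page offset plus a multiple of 9 bits per
 * level, capped at 64.  For example, gaw 36 becomes 39 (12 + 3 * 9),
 * i.e. a 3-level table.
 */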
1126 static inline int guestwidth_to_adjustwidth(int gaw)
1127 {
1128         int agaw;
1129         int r = (gaw - 12) % 9;
1130
1131         if (r == 0)
1132                 agaw = gaw;
1133         else
1134                 agaw = gaw + 9 - r;
1135         if (agaw > 64)
1136                 agaw = 64;
1137         return agaw;
1138 }
1139
1140 static int domain_init(struct dmar_domain *domain, int guest_width)
1141 {
1142         struct intel_iommu *iommu;
1143         int adjust_width, agaw;
1144         unsigned long sagaw;
1145
1146         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1147         spin_lock_init(&domain->mapping_lock);
1148
1149         domain_reserve_special_ranges(domain);
1150
1151         /* calculate AGAW */
1152         iommu = domain->iommu;
1153         if (guest_width > cap_mgaw(iommu->cap))
1154                 guest_width = cap_mgaw(iommu->cap);
1155         domain->gaw = guest_width;
1156         adjust_width = guestwidth_to_adjustwidth(guest_width);
1157         agaw = width_to_agaw(adjust_width);
1158         sagaw = cap_sagaw(iommu->cap);
1159         if (!test_bit(agaw, &sagaw)) {
1160                 /* hardware doesn't support it, choose a bigger one */
1161                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1162                 agaw = find_next_bit(&sagaw, 5, agaw);
1163                 if (agaw >= 5)
1164                         return -ENODEV;
1165         }
1166         domain->agaw = agaw;
1167         INIT_LIST_HEAD(&domain->devices);
1168
1169         /* always allocate the top pgd */
1170         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1171         if (!domain->pgd)
1172                 return -ENOMEM;
1173         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1174         return 0;
1175 }
1176
1177 static void domain_exit(struct dmar_domain *domain)
1178 {
1179         u64 end;
1180
1181         /* Domain 0 is reserved, so don't process it */
1182         if (!domain)
1183                 return;
1184
1185         domain_remove_dev_info(domain);
1186         /* destroy iovas */
1187         put_iova_domain(&domain->iovad);
1188         end = DOMAIN_MAX_ADDR(domain->gaw);
1189         end = end & (~PAGE_MASK_4K);
1190
1191         /* clear ptes */
1192         dma_pte_clear_range(domain, 0, end);
1193
1194         /* free page tables */
1195         dma_pte_free_pagetable(domain, 0, end);
1196
1197         iommu_free_domain(domain);
1198         free_domain_mem(domain);
1199 }
1200
1201 static int domain_context_mapping_one(struct dmar_domain *domain,
1202                 u8 bus, u8 devfn)
1203 {
1204         struct context_entry *context;
1205         struct intel_iommu *iommu = domain->iommu;
1206         unsigned long flags;
1207
1208         pr_debug("Set context mapping for %02x:%02x.%d\n",
1209                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1210         BUG_ON(!domain->pgd);
1211         context = device_to_context_entry(iommu, bus, devfn);
1212         if (!context)
1213                 return -ENOMEM;
1214         spin_lock_irqsave(&iommu->lock, flags);
1215         if (context_present(*context)) {
1216                 spin_unlock_irqrestore(&iommu->lock, flags);
1217                 return 0;
1218         }
1219
1220         context_set_domain_id(*context, domain->id);
1221         context_set_address_width(*context, domain->agaw);
1222         context_set_address_root(*context, virt_to_phys(domain->pgd));
1223         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1224         context_set_fault_enable(*context);
1225         context_set_present(*context);
1226         __iommu_flush_cache(iommu, context, sizeof(*context));
1227
1228         /* it's a non-present to present mapping */
1229         if (iommu_flush_context_device(iommu, domain->id,
1230                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1231                 iommu_flush_write_buffer(iommu);
1232         else
1233                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1234         spin_unlock_irqrestore(&iommu->lock, flags);
1235         return 0;
1236 }
1237
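/*
 * Set up the context entry for the device itself and, if it sits behind
 * a PCIe-to-PCI (or legacy PCI) bridge, for the bridges on the path as
 * well, since requests from devices behind such bridges can be tagged
 * with the bridge's source-id rather than the device's own.
 */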
1238 static int
1239 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1240 {
1241         int ret;
1242         struct pci_dev *tmp, *parent;
1243
1244         ret = domain_context_mapping_one(domain, pdev->bus->number,
1245                 pdev->devfn);
1246         if (ret)
1247                 return ret;
1248
1249         /* dependent device mapping */
1250         tmp = pci_find_upstream_pcie_bridge(pdev);
1251         if (!tmp)
1252                 return 0;
1253         /* Secondary interface's bus number and devfn 0 */
1254         parent = pdev->bus->self;
1255         while (parent != tmp) {
1256                 ret = domain_context_mapping_one(domain, parent->bus->number,
1257                         parent->devfn);
1258                 if (ret)
1259                         return ret;
1260                 parent = parent->bus->self;
1261         }
1262         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1263                 return domain_context_mapping_one(domain,
1264                         tmp->subordinate->number, 0);
1265         else /* this is a legacy PCI bridge */
1266                 return domain_context_mapping_one(domain,
1267                         tmp->bus->number, tmp->devfn);
1268 }
1269
1270 static int domain_context_mapped(struct dmar_domain *domain,
1271         struct pci_dev *pdev)
1272 {
1273         int ret;
1274         struct pci_dev *tmp, *parent;
1275
1276         ret = device_context_mapped(domain->iommu,
1277                 pdev->bus->number, pdev->devfn);
1278         if (!ret)
1279                 return ret;
1280         /* dependent device mapping */
1281         tmp = pci_find_upstream_pcie_bridge(pdev);
1282         if (!tmp)
1283                 return ret;
1284         /* Secondary interface's bus number and devfn 0 */
1285         parent = pdev->bus->self;
1286         while (parent != tmp) {
1287                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1288                         parent->devfn);
1289                 if (!ret)
1290                         return ret;
1291                 parent = parent->bus->self;
1292         }
1293         if (tmp->is_pcie)
1294                 return device_context_mapped(domain->iommu,
1295                         tmp->subordinate->number, 0);
1296         else
1297                 return device_context_mapped(domain->iommu,
1298                         tmp->bus->number, tmp->devfn);
1299 }
1300
1301 static int
1302 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1303                         u64 hpa, size_t size, int prot)
1304 {
1305         u64 start_pfn, end_pfn;
1306         struct dma_pte *pte;
1307         int index;
1308
1309         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1310                 return -EINVAL;
1311         iova &= PAGE_MASK_4K;
1312         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1313         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1314         index = 0;
1315         while (start_pfn < end_pfn) {
1316                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1317                 if (!pte)
1318                         return -ENOMEM;
1319                 /* We don't need a lock here; nobody else
1320                  * touches this iova range
1321                  */
1322                 BUG_ON(dma_pte_addr(*pte));
1323                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1324                 dma_set_pte_prot(*pte, prot);
1325                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1326                 start_pfn++;
1327                 index++;
1328         }
1329         return 0;
1330 }
1331
1332 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1333 {
1334         clear_context_table(domain->iommu, bus, devfn);
1335         iommu_flush_context_global(domain->iommu, 0);
1336         iommu_flush_iotlb_global(domain->iommu, 0);
1337 }
1338
1339 static void domain_remove_dev_info(struct dmar_domain *domain)
1340 {
1341         struct device_domain_info *info;
1342         unsigned long flags;
1343
1344         spin_lock_irqsave(&device_domain_lock, flags);
1345         while (!list_empty(&domain->devices)) {
1346                 info = list_entry(domain->devices.next,
1347                         struct device_domain_info, link);
1348                 list_del(&info->link);
1349                 list_del(&info->global);
1350                 if (info->dev)
1351                         info->dev->dev.archdata.iommu = NULL;
1352                 spin_unlock_irqrestore(&device_domain_lock, flags);
1353
1354                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1355                 free_devinfo_mem(info);
1356
1357                 spin_lock_irqsave(&device_domain_lock, flags);
1358         }
1359         spin_unlock_irqrestore(&device_domain_lock, flags);
1360 }
1361
1362 /*
1363  * find_domain
1364  * Note: we use struct pci_dev->dev.archdata.iommu to store the device_domain_info
1365  */
1366 struct dmar_domain *
1367 find_domain(struct pci_dev *pdev)
1368 {
1369         struct device_domain_info *info;
1370
1371         /* No lock here, assumes no domain exit in normal case */
1372         info = pdev->dev.archdata.iommu;
1373         if (info)
1374                 return info->domain;
1375         return NULL;
1376 }
1377
1378 /* find or allocate the domain for a device; the returned domain is initialized */
1379 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1380 {
1381         struct dmar_domain *domain, *found = NULL;
1382         struct intel_iommu *iommu;
1383         struct dmar_drhd_unit *drhd;
1384         struct device_domain_info *info, *tmp;
1385         struct pci_dev *dev_tmp;
1386         unsigned long flags;
1387         int bus = 0, devfn = 0;
1388
1389         domain = find_domain(pdev);
1390         if (domain)
1391                 return domain;
1392
1393         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1394         if (dev_tmp) {
1395                 if (dev_tmp->is_pcie) {
1396                         bus = dev_tmp->subordinate->number;
1397                         devfn = 0;
1398                 } else {
1399                         bus = dev_tmp->bus->number;
1400                         devfn = dev_tmp->devfn;
1401                 }
1402                 spin_lock_irqsave(&device_domain_lock, flags);
1403                 list_for_each_entry(info, &device_domain_list, global) {
1404                         if (info->bus == bus && info->devfn == devfn) {
1405                                 found = info->domain;
1406                                 break;
1407                         }
1408                 }
1409                 spin_unlock_irqrestore(&device_domain_lock, flags);
1410                 /* pcie-pci bridge already has a domain, use it */
1411                 if (found) {
1412                         domain = found;
1413                         goto found_domain;
1414                 }
1415         }
1416
1417         /* Allocate new domain for the device */
1418         drhd = dmar_find_matched_drhd_unit(pdev);
1419         if (!drhd) {
1420                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1421                         pci_name(pdev));
1422                 return NULL;
1423         }
1424         iommu = drhd->iommu;
1425
1426         domain = iommu_alloc_domain(iommu);
1427         if (!domain)
1428                 goto error;
1429
1430         if (domain_init(domain, gaw)) {
1431                 domain_exit(domain);
1432                 goto error;
1433         }
1434
1435         /* register pcie-to-pci device */
1436         if (dev_tmp) {
1437                 info = alloc_devinfo_mem();
1438                 if (!info) {
1439                         domain_exit(domain);
1440                         goto error;
1441                 }
1442                 info->bus = bus;
1443                 info->devfn = devfn;
1444                 info->dev = NULL;
1445                 info->domain = domain;
1446                 /* This domain is shared by devices under p2p bridge */
1447                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1448
1449                 /* pcie-to-pci bridge already has a domain, use it */
1450                 found = NULL;
1451                 spin_lock_irqsave(&device_domain_lock, flags);
1452                 list_for_each_entry(tmp, &device_domain_list, global) {
1453                         if (tmp->bus == bus && tmp->devfn == devfn) {
1454                                 found = tmp->domain;
1455                                 break;
1456                         }
1457                 }
1458                 if (found) {
1459                         free_devinfo_mem(info);
1460                         domain_exit(domain);
1461                         domain = found;
1462                 } else {
1463                         list_add(&info->link, &domain->devices);
1464                         list_add(&info->global, &device_domain_list);
1465                 }
1466                 spin_unlock_irqrestore(&device_domain_lock, flags);
1467         }
1468
1469 found_domain:
1470         info = alloc_devinfo_mem();
1471         if (!info)
1472                 goto error;
1473         info->bus = pdev->bus->number;
1474         info->devfn = pdev->devfn;
1475         info->dev = pdev;
1476         info->domain = domain;
1477         spin_lock_irqsave(&device_domain_lock, flags);
1478         /* somebody else was faster and set it up already */
1479         found = find_domain(pdev);
1480         if (found != NULL) {
1481                 spin_unlock_irqrestore(&device_domain_lock, flags);
1482                 if (found != domain) {
1483                         domain_exit(domain);
1484                         domain = found;
1485                 }
1486                 free_devinfo_mem(info);
1487                 return domain;
1488         }
1489         list_add(&info->link, &domain->devices);
1490         list_add(&info->global, &device_domain_list);
1491         pdev->dev.archdata.iommu = info;
1492         spin_unlock_irqrestore(&device_domain_lock, flags);
1493         return domain;
1494 error:
1495         /* recheck it here, maybe others set it */
1496         return find_domain(pdev);
1497 }
1498
1499 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1500 {
1501         struct dmar_domain *domain;
1502         unsigned long size;
1503         u64 base;
1504         int ret;
1505
1506         printk(KERN_INFO
1507                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1508                 pci_name(pdev), start, end);
1509         /* page table init */
1510         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1511         if (!domain)
1512                 return -ENOMEM;
1513
1514         /* The address might not be aligned */
1515         base = start & PAGE_MASK_4K;
1516         size = end - base;
1517         size = PAGE_ALIGN_4K(size);
1518         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1519                         IOVA_PFN(base + size) - 1)) {
1520                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1521                 ret = -ENOMEM;
1522                 goto error;
1523         }
1524
1525         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1526                 size, base, pci_name(pdev));
1527         /*
1528          * The RMRR range might overlap a physical memory range,
1529          * so clear it first
1530          */
1531         dma_pte_clear_range(domain, base, base + size);
1532
1533         ret = domain_page_mapping(domain, base, base, size,
1534                 DMA_PTE_READ|DMA_PTE_WRITE);
1535         if (ret)
1536                 goto error;
1537
1538         /* context entry init */
1539         ret = domain_context_mapping(domain, pdev);
1540         if (!ret)
1541                 return 0;
1542 error:
1543         domain_exit(domain);
1544         return ret;
1545
1546 }
1547
1548 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1549         struct pci_dev *pdev)
1550 {
1551         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1552                 return 0;
1553         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1554                 rmrr->end_address + 1);
1555 }
1556
1557 #ifdef CONFIG_DMAR_GFX_WA
1558 struct iommu_prepare_data {
1559         struct pci_dev *pdev;
1560         int ret;
1561 };
1562
1563 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1564                                          unsigned long end_pfn, void *datax)
1565 {
1566         struct iommu_prepare_data *data;
1567
1568         data = (struct iommu_prepare_data *)datax;
1569
1570         data->ret = iommu_prepare_identity_map(data->pdev,
1571                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1572         return data->ret;
1573
1574 }
1575
1576 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1577 {
1578         int nid;
1579         struct iommu_prepare_data data;
1580
1581         data.pdev = pdev;
1582         data.ret = 0;
1583
1584         for_each_online_node(nid) {
1585                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1586                 if (data.ret)
1587                         return data.ret;
1588         }
1589         return data.ret;
1590 }
1591
1592 static void __init iommu_prepare_gfx_mapping(void)
1593 {
1594         struct pci_dev *pdev = NULL;
1595         int ret;
1596
1597         for_each_pci_dev(pdev) {
1598                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1599                                 !IS_GFX_DEVICE(pdev))
1600                         continue;
1601                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1602                         pci_name(pdev));
1603                 ret = iommu_prepare_with_active_regions(pdev);
1604                 if (ret)
1605                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1606         }
1607 }
1608 #endif
1609
1610 #ifdef CONFIG_DMAR_FLOPPY_WA
1611 static inline void iommu_prepare_isa(void)
1612 {
1613         struct pci_dev *pdev;
1614         int ret;
1615
1616         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1617         if (!pdev)
1618                 return;
1619
1620         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1621         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1622
1623         if (ret)
1624                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1625                         "floppy might not work\n");
1626
1627 }
1628 #else
1629 static inline void iommu_prepare_isa(void)
1630 {
1631         return;
1632 }
1633 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1634
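/*
 * Rough outline of init_dmars(): count the DRHD units and size the
 * deferred-flush tables accordingly, set up per-IOMMU domain bookkeeping
 * and root entries, pre-map the RMRR, graphics and ISA/floppy ranges,
 * then program the root entries, flush caches and enable translation on
 * each unit.  On failure, every IOMMU touched so far is freed again.
 */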
1635 int __init init_dmars(void)
1636 {
1637         struct dmar_drhd_unit *drhd;
1638         struct dmar_rmrr_unit *rmrr;
1639         struct pci_dev *pdev;
1640         struct intel_iommu *iommu;
1641         int i, ret, unit = 0;
1642
1643         /*
1644          * for each drhd
1645          *    allocate root
1646          *    initialize and program root entry to not present
1647          * endfor
1648          */
1649         for_each_drhd_unit(drhd) {
1650                 g_num_of_iommus++;
1651                 /*
1652                  * lock not needed: this is only incremented in the
1653                  * single-threaded kernel __init code path; all other
1654                  * accesses are read-only
1655                  */
1656         }
1657
1658         deferred_flush = kzalloc(g_num_of_iommus *
1659                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1660         if (!deferred_flush) {
1661                 ret = -ENOMEM;
1662                 goto error;
1663         }
1664
1665         for_each_drhd_unit(drhd) {
1666                 if (drhd->ignored)
1667                         continue;
1668
1669                 iommu = drhd->iommu;
1670
1671                 ret = iommu_init_domains(iommu);
1672                 if (ret)
1673                         goto error;
1674
1675                 /*
1676                  * TBD:
1677                  * we could share the same root & context tables
1678                  * among all IOMMUs. Need to split it later.
1679                  */
1680                 ret = iommu_alloc_root_entry(iommu);
1681                 if (ret) {
1682                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1683                         goto error;
1684                 }
1685         }
1686
1687         /*
1688          * For each rmrr
1689          *   for each dev attached to rmrr
1690          *   do
1691          *     locate drhd for dev, alloc domain for dev
1692          *     allocate free domain
1693          *     allocate page table entries for rmrr
1694          *     if context not allocated for bus
1695          *           allocate and init context
1696          *           set present in root table for this bus
1697          *     init context with domain, translation etc
1698          *    endfor
1699          * endfor
1700          */
1701         for_each_rmrr_units(rmrr) {
1702                 for (i = 0; i < rmrr->devices_cnt; i++) {
1703                         pdev = rmrr->devices[i];
1704                         /* some BIOSes list non-existent devices in the DMAR table */
1705                         if (!pdev)
1706                                 continue;
1707                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1708                         if (ret)
1709                                 printk(KERN_ERR
1710                                  "IOMMU: mapping reserved region failed\n");
1711                 }
1712         }
1713
1714         iommu_prepare_gfx_mapping();
1715
1716         iommu_prepare_isa();
1717
1718         /*
1719          * for each drhd
1720          *   enable fault log
1721          *   global invalidate context cache
1722          *   global invalidate iotlb
1723          *   enable translation
1724          */
1725         for_each_drhd_unit(drhd) {
1726                 if (drhd->ignored)
1727                         continue;
1728                 iommu = drhd->iommu;
1729                 sprintf(iommu->name, "dmar%d", unit++);
1730
1731                 iommu_flush_write_buffer(iommu);
1732
1733                 ret = dmar_set_interrupt(iommu);
1734                 if (ret)
1735                         goto error;
1736
1737                 iommu_set_root_entry(iommu);
1738
1739                 iommu_flush_context_global(iommu, 0);
1740                 iommu_flush_iotlb_global(iommu, 0);
1741
1742                 iommu_disable_protect_mem_regions(iommu);
1743
1744                 ret = iommu_enable_translation(iommu);
1745                 if (ret)
1746                         goto error;
1747         }
1748
1749         return 0;
1750 error:
1751         for_each_drhd_unit(drhd) {
1752                 if (drhd->ignored)
1753                         continue;
1754                 iommu = drhd->iommu;
1755                 free_iommu(iommu);
1756         }
1757         return ret;
1758 }
1759
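/*
 * aligned_size() gives the amount of IOVA space needed to cover "size"
 * bytes starting at host_addr, rounded out to whole 4K pages.  Worked
 * example (illustrative values only): host_addr = 0x1234, size = 0x2000
 * leaves an in-page offset of 0x234, so 0x2234 bytes are needed, which
 * rounds up to 0x3000 -- i.e. three 4K pages.
 */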
1760 static inline u64 aligned_size(u64 host_addr, size_t size)
1761 {
1762         u64 addr;
1763         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1764         return PAGE_ALIGN_4K(addr);
1765 }
1766
1767 struct iova *
1768 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1769 {
1770         struct iova *piova;
1771
1772         /* Make sure it's in range */
1773         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1774         if (!size || (IOVA_START_ADDR + size > end))
1775                 return NULL;
1776
1777         piova = alloc_iova(&domain->iovad,
1778                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1779         return piova;
1780 }
1781
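/*
 * IOVA allocation policy: devices capped at 32-bit DMA (or when the
 * "forcedac" option is set) allocate straight against their DMA mask;
 * everyone else first tries to land below 4GB -- presumably to avoid
 * forcing dual address cycles on the PCI side -- and only falls back to
 * the device's full mask when the low range is exhausted.
 */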
1782 static struct iova *
1783 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1784                 size_t size)
1785 {
1786         struct pci_dev *pdev = to_pci_dev(dev);
1787         struct iova *iova = NULL;
1788
1789         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1790                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1791         } else  {
1792                 /*
1793                  * First try to allocate an io virtual address in
1794                  * DMA_32BIT_MASK and if that fails then try allocating
1795                  * from higher range
1796                  */
1797                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1798                 if (!iova)
1799                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1800         }
1801
1802         if (!iova) {
1803                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1804                 return NULL;
1805         }
1806
1807         return iova;
1808 }
1809
1810 static struct dmar_domain *
1811 get_valid_domain_for_dev(struct pci_dev *pdev)
1812 {
1813         struct dmar_domain *domain;
1814         int ret;
1815
1816         domain = get_domain_for_dev(pdev,
1817                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1818         if (!domain) {
1819                 printk(KERN_ERR
1820                         "Allocating domain for %s failed\n", pci_name(pdev));
1821                 return NULL;
1822         }
1823
1824         /* make sure context mapping is ok */
1825         if (unlikely(!domain_context_mapped(domain, pdev))) {
1826                 ret = domain_context_mapping(domain, pdev);
1827                 if (ret) {
1828                         printk(KERN_ERR
1829                                 "Domain context map for %s failed\n",
1830                                 pci_name(pdev));
1831                         return NULL;
1832                 }
1833         }
1834
1835         return domain;
1836 }
1837
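/*
 * map_single backend: look up (or create) the device's domain, round the
 * buffer out to whole pages, allocate an IOVA range, derive read/write
 * protection bits from the DMA direction (read is also granted when the
 * hardware cannot do zero-length reads), install the page-table entries
 * and flush the IOTLB for the newly present range.  The returned handle
 * preserves the original sub-page offset.
 */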
1838 static dma_addr_t
1839 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1840 {
1841         struct pci_dev *pdev = to_pci_dev(hwdev);
1842         struct dmar_domain *domain;
1843         unsigned long start_paddr;
1844         struct iova *iova;
1845         int prot = 0;
1846         int ret;
1847
1848         BUG_ON(dir == DMA_NONE);
1849         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1850                 return paddr;
1851
1852         domain = get_valid_domain_for_dev(pdev);
1853         if (!domain)
1854                 return 0;
1855
1856         size = aligned_size((u64)paddr, size);
1857
1858         iova = __intel_alloc_iova(hwdev, domain, size);
1859         if (!iova)
1860                 goto error;
1861
1862         start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1863
1864         /*
1865          * Check if the DMAR hardware supports zero-length reads on
1866          * write-only mappings.
1867          */
1868         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1869                         !cap_zlr(domain->iommu->cap))
1870                 prot |= DMA_PTE_READ;
1871         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1872                 prot |= DMA_PTE_WRITE;
1873         /*
1874          * paddr .. paddr + size may cover only part of a page, but we must
1875          * map the whole page.  Note: if two parts of one page are mapped
1876          * separately, we may end up with two guest addresses mapping to the
1877          * same host paddr; this is not a big problem.
1878          */
1879         ret = domain_page_mapping(domain, start_paddr,
1880                 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1881         if (ret)
1882                 goto error;
1883
1884         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1885                 pci_name(pdev), size, (u64)paddr,
1886                 size, (u64)start_paddr, dir);
1887
1888         /* it's a non-present to present mapping */
1889         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1890                         start_paddr, size >> PAGE_SHIFT_4K, 1);
1891         if (ret)
1892                 iommu_flush_write_buffer(domain->iommu);
1893
1894         return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1895
1896 error:
1897         if (iova)
1898                 __free_iova(&domain->iovad, iova);
1899         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1900                 pci_name(pdev), size, (u64)paddr, dir);
1901         return 0;
1902 }
1903
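/*
 * Deferred ("lazy") unmapping: rather than flushing the IOTLB on every
 * unmap, freed IOVAs are queued per IOMMU in deferred_flush[] and
 * released in batches -- when the 10ms unmap_timer fires or once
 * HIGH_WATER_MARK entries have piled up.  A single global IOTLB flush
 * per IOMMU then covers everything queued, after which the IOVAs are
 * freed.
 */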
1904 static void flush_unmaps(void)
1905 {
1906         int i, j;
1907
1908         timer_on = 0;
1909
1910         /* just flush them all */
1911         for (i = 0; i < g_num_of_iommus; i++) {
1912                 if (deferred_flush[i].next) {
1913                         struct intel_iommu *iommu =
1914                                 deferred_flush[i].domain[0]->iommu;
1915
1916                         iommu_flush_iotlb_global(iommu, 0);
1917                         for (j = 0; j < deferred_flush[i].next; j++) {
1918                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
1919                                                 deferred_flush[i].iova[j]);
1920                         }
1921                         deferred_flush[i].next = 0;
1922                 }
1923         }
1924
1925         list_size = 0;
1926 }
1927
1928 static void flush_unmaps_timeout(unsigned long data)
1929 {
1930         unsigned long flags;
1931
1932         spin_lock_irqsave(&async_umap_flush_lock, flags);
1933         flush_unmaps();
1934         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1935 }
1936
1937 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1938 {
1939         unsigned long flags;
1940         int next, iommu_id;
1941
1942         spin_lock_irqsave(&async_umap_flush_lock, flags);
1943         if (list_size == HIGH_WATER_MARK)
1944                 flush_unmaps();
1945
1946         iommu_id = dom->iommu->seq_id;
1947
1948         next = deferred_flush[iommu_id].next;
1949         deferred_flush[iommu_id].domain[next] = dom;
1950         deferred_flush[iommu_id].iova[next] = iova;
1951         deferred_flush[iommu_id].next++;
1952
1953         if (!timer_on) {
1954                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1955                 timer_on = 1;
1956         }
1957         list_size++;
1958         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1959 }
1960
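/*
 * Unmap path: clear the PTEs and free the page-table pages for the whole
 * aligned range.  In strict mode the IOTLB is flushed synchronously and
 * the IOVA freed immediately; otherwise the IOVA is handed to add_unmap()
 * and released lazily as described above.
 */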
1961 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1962         size_t size, int dir)
1963 {
1964         struct pci_dev *pdev = to_pci_dev(dev);
1965         struct dmar_domain *domain;
1966         unsigned long start_addr;
1967         struct iova *iova;
1968
1969         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1970                 return;
1971         domain = find_domain(pdev);
1972         BUG_ON(!domain);
1973
1974         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1975         if (!iova)
1976                 return;
1977
1978         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1979         size = aligned_size((u64)dev_addr, size);
1980
1981         pr_debug("Device %s unmapping: %lx@%llx\n",
1982                 pci_name(pdev), size, (u64)start_addr);
1983
1984         /*  clear the whole page */
1985         dma_pte_clear_range(domain, start_addr, start_addr + size);
1986         /* free page tables */
1987         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1988         if (intel_iommu_strict) {
1989                 if (iommu_flush_iotlb_psi(domain->iommu,
1990                         domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
1991                         iommu_flush_write_buffer(domain->iommu);
1992                 /* free iova */
1993                 __free_iova(&domain->iovad, iova);
1994         } else {
1995                 add_unmap(domain, iova);
1996                 /*
1997                  * queue up the release of the unmap to avoid the roughly 1/6th
1998                  * of the cpu time otherwise spent on the iotlb flush operation
1999                  */
2000         }
2001 }
2002
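/*
 * Coherent allocations are ordinary pages run through intel_map_single()
 * with a bidirectional mapping; the GFP_DMA/GFP_DMA32 hints are dropped,
 * presumably because the IOMMU can place the IOVA wherever the device
 * can reach regardless of where the pages physically live.
 */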
2003 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2004                        dma_addr_t *dma_handle, gfp_t flags)
2005 {
2006         void *vaddr;
2007         int order;
2008
2009         size = PAGE_ALIGN_4K(size);
2010         order = get_order(size);
2011         flags &= ~(GFP_DMA | GFP_DMA32);
2012
2013         vaddr = (void *)__get_free_pages(flags, order);
2014         if (!vaddr)
2015                 return NULL;
2016         memset(vaddr, 0, size);
2017
2018         *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
2019         if (*dma_handle)
2020                 return vaddr;
2021         free_pages((unsigned long)vaddr, order);
2022         return NULL;
2023 }
2024
2025 static void intel_free_coherent(struct device *hwdev, size_t size,
2026         void *vaddr, dma_addr_t dma_handle)
2027 {
2028         int order;
2029
2030         size = PAGE_ALIGN_4K(size);
2031         order = get_order(size);
2032
2033         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2034         free_pages((unsigned long)vaddr, order);
2035 }
2036
2037 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2038 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2039         int nelems, int dir)
2040 {
2041         int i;
2042         struct pci_dev *pdev = to_pci_dev(hwdev);
2043         struct dmar_domain *domain;
2044         unsigned long start_addr;
2045         struct iova *iova;
2046         size_t size = 0;
2047         void *addr;
2048         struct scatterlist *sg;
2049
2050         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2051                 return;
2052
2053         domain = find_domain(pdev);
2054
2055         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2056         if (!iova)
2057                 return;
2058         for_each_sg(sglist, sg, nelems, i) {
2059                 addr = SG_ENT_VIRT_ADDRESS(sg);
2060                 size += aligned_size((u64)addr, sg->length);
2061         }
2062
2063         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2064
2065         /*  clear the whole page */
2066         dma_pte_clear_range(domain, start_addr, start_addr + size);
2067         /* free page tables */
2068         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2069
2070         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2071                         size >> PAGE_SHIFT_4K, 0))
2072                 iommu_flush_write_buffer(domain->iommu);
2073
2074         /* free iova */
2075         __free_iova(&domain->iovad, iova);
2076 }
2077
2078 static int intel_nontranslate_map_sg(struct device *hwdev,
2079         struct scatterlist *sglist, int nelems, int dir)
2080 {
2081         int i;
2082         struct scatterlist *sg;
2083
2084         for_each_sg(sglist, sg, nelems, i) {
2085                 BUG_ON(!sg_page(sg));
2086                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2087                 sg->dma_length = sg->length;
2088         }
2089         return nelems;
2090 }
2091
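/*
 * Scatter-gather mapping: the total page-aligned length of the list is
 * computed first so that one contiguous IOVA range can be allocated, and
 * each segment is then mapped back-to-back at increasing offsets within
 * it.  If mapping any segment fails, everything mapped so far is torn
 * down and 0 is returned.  Devices that bypass translation are handled
 * by intel_nontranslate_map_sg() instead.
 */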
2092 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2093                                 int nelems, int dir)
2094 {
2095         void *addr;
2096         int i;
2097         struct pci_dev *pdev = to_pci_dev(hwdev);
2098         struct dmar_domain *domain;
2099         size_t size = 0;
2100         int prot = 0;
2101         size_t offset = 0;
2102         struct iova *iova = NULL;
2103         int ret;
2104         struct scatterlist *sg;
2105         unsigned long start_addr;
2106
2107         BUG_ON(dir == DMA_NONE);
2108         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2109                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2110
2111         domain = get_valid_domain_for_dev(pdev);
2112         if (!domain)
2113                 return 0;
2114
2115         for_each_sg(sglist, sg, nelems, i) {
2116                 addr = SG_ENT_VIRT_ADDRESS(sg);
2117                 addr = (void *)virt_to_phys(addr);
2118                 size += aligned_size((u64)addr, sg->length);
2119         }
2120
2121         iova = __intel_alloc_iova(hwdev, domain, size);
2122         if (!iova) {
2123                 sglist->dma_length = 0;
2124                 return 0;
2125         }
2126
2127         /*
2128          * Check if the DMAR hardware supports zero-length reads on
2129          * write-only mappings.
2130          */
2131         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2132                         !cap_zlr(domain->iommu->cap))
2133                 prot |= DMA_PTE_READ;
2134         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2135                 prot |= DMA_PTE_WRITE;
2136
2137         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2138         offset = 0;
2139         for_each_sg(sglist, sg, nelems, i) {
2140                 addr = SG_ENT_VIRT_ADDRESS(sg);
2141                 addr = (void *)virt_to_phys(addr);
2142                 size = aligned_size((u64)addr, sg->length);
2143                 ret = domain_page_mapping(domain, start_addr + offset,
2144                         ((u64)addr) & PAGE_MASK_4K,
2145                         size, prot);
2146                 if (ret) {
2147                         /*  clear the page */
2148                         dma_pte_clear_range(domain, start_addr,
2149                                   start_addr + offset);
2150                         /* free page tables */
2151                         dma_pte_free_pagetable(domain, start_addr,
2152                                   start_addr + offset);
2153                         /* free iova */
2154                         __free_iova(&domain->iovad, iova);
2155                         return 0;
2156                 }
2157                 sg->dma_address = start_addr + offset +
2158                                 ((u64)addr & (~PAGE_MASK_4K));
2159                 sg->dma_length = sg->length;
2160                 offset += size;
2161         }
2162
2163         /* it's a non-present to present mapping */
2164         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2165                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2166                 iommu_flush_write_buffer(domain->iommu);
2167         return nelems;
2168 }
2169
2170 static struct dma_mapping_ops intel_dma_ops = {
2171         .alloc_coherent = intel_alloc_coherent,
2172         .free_coherent = intel_free_coherent,
2173         .map_single = intel_map_single,
2174         .unmap_single = intel_unmap_single,
2175         .map_sg = intel_map_sg,
2176         .unmap_sg = intel_unmap_sg,
2177 };
2178
2179 static inline int iommu_domain_cache_init(void)
2180 {
2181         int ret = 0;
2182
2183         iommu_domain_cache = kmem_cache_create("iommu_domain",
2184                                          sizeof(struct dmar_domain),
2185                                          0,
2186                                          SLAB_HWCACHE_ALIGN,
2187                                          NULL);
2188
2189         if (!iommu_domain_cache) {
2190                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2191                 ret = -ENOMEM;
2192         }
2193
2194         return ret;
2195 }
2196
2197 static inline int iommu_devinfo_cache_init(void)
2198 {
2199         int ret = 0;
2200
2201         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2202                                          sizeof(struct device_domain_info),
2203                                          0,
2204                                          SLAB_HWCACHE_ALIGN,
2205                                          NULL);
2206
2207         if (!iommu_devinfo_cache) {
2208                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2209                 ret = -ENOMEM;
2210         }
2211
2212         return ret;
2213 }
2214
2215 static inline int iommu_iova_cache_init(void)
2216 {
2217         int ret = 0;
2218
2219         iommu_iova_cache = kmem_cache_create("iommu_iova",
2220                                          sizeof(struct iova),
2221                                          0,
2222                                          SLAB_HWCACHE_ALIGN,
2223                                          NULL);
2224
2225         if (!iommu_iova_cache) {
2226                 printk(KERN_ERR "Couldn't create iova cache\n");
2227                 ret = -ENOMEM;
2228         }
2229
2230         return ret;
2231 }
2232
2233 static int __init iommu_init_mempool(void)
2234 {
2235         int ret;
2236         ret = iommu_iova_cache_init();
2237         if (ret)
2238                 return ret;
2239
2240         ret = iommu_domain_cache_init();
2241         if (ret)
2242                 goto domain_error;
2243
2244         ret = iommu_devinfo_cache_init();
2245         if (!ret)
2246                 return ret;
2247
2248         kmem_cache_destroy(iommu_domain_cache);
2249 domain_error:
2250         kmem_cache_destroy(iommu_iova_cache);
2251
2252         return -ENOMEM;
2253 }
2254
2255 static void __init iommu_exit_mempool(void)
2256 {
2257         kmem_cache_destroy(iommu_devinfo_cache);
2258         kmem_cache_destroy(iommu_domain_cache);
2259         kmem_cache_destroy(iommu_iova_cache);
2260
2261 }
2262
2263 void __init detect_intel_iommu(void)
2264 {
2265         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2266                 return;
2267         if (early_dmar_detect()) {
2268                 iommu_detected = 1;
2269         }
2270 }
2271
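/*
 * Mark DMAR units that have nothing to translate as ignored: units whose
 * device scope contains no present PCI devices and, when gfx mapping is
 * disabled, units covering only graphics devices.  In the latter case the
 * devices are tagged with DUMMY_DEVICE_DOMAIN_INFO so the DMA ops bypass
 * translation for them.
 */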
2272 static void __init init_no_remapping_devices(void)
2273 {
2274         struct dmar_drhd_unit *drhd;
2275
2276         for_each_drhd_unit(drhd) {
2277                 if (!drhd->include_all) {
2278                         int i;
2279                         for (i = 0; i < drhd->devices_cnt; i++)
2280                                 if (drhd->devices[i] != NULL)
2281                                         break;
2282                         /* ignore the DMAR unit if no PCI devices exist in its scope */
2283                         if (i == drhd->devices_cnt)
2284                                 drhd->ignored = 1;
2285                 }
2286         }
2287
2288         if (dmar_map_gfx)
2289                 return;
2290
2291         for_each_drhd_unit(drhd) {
2292                 int i;
2293                 if (drhd->ignored || drhd->include_all)
2294                         continue;
2295
2296                 for (i = 0; i < drhd->devices_cnt; i++)
2297                         if (drhd->devices[i] &&
2298                                 !IS_GFX_DEVICE(drhd->devices[i]))
2299                                 break;
2300
2301                 if (i < drhd->devices_cnt)
2302                         continue;
2303
2304                 /* bypass IOMMU if it is just for gfx devices */
2305                 drhd->ignored = 1;
2306                 for (i = 0; i < drhd->devices_cnt; i++) {
2307                         if (!drhd->devices[i])
2308                                 continue;
2309                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2310                 }
2311         }
2312 }
2313
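/*
 * Top-level initialisation: parse the DMAR table and device scopes, set
 * up the slab caches and reserved IOVA ranges, work out which remapping
 * units can be ignored, bring up the hardware via init_dmars() and
 * finally install intel_dma_ops as the system-wide dma_ops.
 */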
2314 int __init intel_iommu_init(void)
2315 {
2316         int ret = 0;
2317
2318         if (no_iommu || swiotlb || dmar_disabled)
2319                 return -ENODEV;
2320
2321         if (dmar_table_init())
2322                 return  -ENODEV;
2323
2324         if (dmar_dev_scope_init())
2325                 return  -ENODEV;
2326
2327         iommu_init_mempool();
2328         dmar_init_reserved_ranges();
2329
2330         init_no_remapping_devices();
2331
2332         ret = init_dmars();
2333         if (ret) {
2334                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2335                 put_iova_domain(&reserved_iova_list);
2336                 iommu_exit_mempool();
2337                 return ret;
2338         }
2339         printk(KERN_INFO
2340         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2341
2342         init_timer(&unmap_timer);
2343         force_iommu = 1;
2344         dma_ops = &intel_dma_ops;
2345         return 0;
2346 }
2347