Btrfs: add and improve comments
[linux-2.6-omap-h63xx.git] fs/btrfs/extent_io.c
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20
21 /* temporary define until extent_map moves out of btrfs */
22 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23                                        unsigned long extra_flags,
24                                        void (*ctor)(void *, struct kmem_cache *,
25                                                     unsigned long));
26
27 static struct kmem_cache *extent_state_cache;
28 static struct kmem_cache *extent_buffer_cache;
29
30 static LIST_HEAD(buffers);
31 static LIST_HEAD(states);
32
33 #ifdef LEAK_DEBUG
34 static DEFINE_SPINLOCK(leak_lock);
35 #endif
36
37 #define BUFFER_LRU_MAX 64
38
39 struct tree_entry {
40         u64 start;
41         u64 end;
42         struct rb_node rb_node;
43 };
44
45 struct extent_page_data {
46         struct bio *bio;
47         struct extent_io_tree *tree;
48         get_extent_t *get_extent;
49 };
50
51 int __init extent_io_init(void)
52 {
53         extent_state_cache = btrfs_cache_create("extent_state",
54                                             sizeof(struct extent_state), 0,
55                                             NULL);
56         if (!extent_state_cache)
57                 return -ENOMEM;
58
59         extent_buffer_cache = btrfs_cache_create("extent_buffers",
60                                             sizeof(struct extent_buffer), 0,
61                                             NULL);
62         if (!extent_buffer_cache)
63                 goto free_state_cache;
64         return 0;
65
66 free_state_cache:
67         kmem_cache_destroy(extent_state_cache);
68         return -ENOMEM;
69 }
70
71 void extent_io_exit(void)
72 {
73         struct extent_state *state;
74         struct extent_buffer *eb;
75
76         while (!list_empty(&states)) {
77                 state = list_entry(states.next, struct extent_state, leak_list);
78                 printk("state leak: start %Lu end %Lu state %lu "
                          "in tree %p refs %d\n", state->start,
                          state->end, state->state, state->tree,
                          atomic_read(&state->refs));
79                 list_del(&state->leak_list);
80                 kmem_cache_free(extent_state_cache, state);
81
82         }
83
84         while (!list_empty(&buffers)) {
85                 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
86                 printk("buffer leak start %Lu len %lu refs %d\n",
                          eb->start, eb->len, atomic_read(&eb->refs));
87                 list_del(&eb->leak_list);
88                 kmem_cache_free(extent_buffer_cache, eb);
89         }
90         if (extent_state_cache)
91                 kmem_cache_destroy(extent_state_cache);
92         if (extent_buffer_cache)
93                 kmem_cache_destroy(extent_buffer_cache);
94 }
95
96 void extent_io_tree_init(struct extent_io_tree *tree,
97                           struct address_space *mapping, gfp_t mask)
98 {
99         tree->state.rb_node = NULL;
100         tree->buffer.rb_node = NULL;
101         tree->ops = NULL;
102         tree->dirty_bytes = 0;
103         spin_lock_init(&tree->lock);
104         spin_lock_init(&tree->buffer_lock);
105         tree->mapping = mapping;
106 }
107 EXPORT_SYMBOL(extent_io_tree_init);
108
109 struct extent_state *alloc_extent_state(gfp_t mask)
110 {
111         struct extent_state *state;
112 #ifdef LEAK_DEBUG
113         unsigned long flags;
114 #endif
115
116         state = kmem_cache_alloc(extent_state_cache, mask);
117         if (!state)
118                 return state;
119         state->state = 0;
120         state->private = 0;
121         state->tree = NULL;
122 #ifdef LEAK_DEBUG
123         spin_lock_irqsave(&leak_lock, flags);
124         list_add(&state->leak_list, &states);
125         spin_unlock_irqrestore(&leak_lock, flags);
126 #endif
127         atomic_set(&state->refs, 1);
128         init_waitqueue_head(&state->wq);
129         return state;
130 }
131 EXPORT_SYMBOL(alloc_extent_state);
132
133 void free_extent_state(struct extent_state *state)
134 {
135         if (!state)
136                 return;
137         if (atomic_dec_and_test(&state->refs)) {
138 #ifdef LEAK_DEBUG
139                 unsigned long flags;
140 #endif
141                 WARN_ON(state->tree);
142 #ifdef LEAK_DEBUG
143                 spin_lock_irqsave(&leak_lock, flags);
144                 list_del(&state->leak_list);
145                 spin_unlock_irqrestore(&leak_lock, flags);
146 #endif
147                 kmem_cache_free(extent_state_cache, state);
148         }
149 }
150 EXPORT_SYMBOL(free_extent_state);
151
152 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
153                                    struct rb_node *node)
154 {
155         struct rb_node **p = &root->rb_node;
156         struct rb_node *parent = NULL;
157         struct tree_entry *entry;
158
159         while (*p) {
160                 parent = *p;
161                 entry = rb_entry(parent, struct tree_entry, rb_node);
162
163                 if (offset < entry->start)
164                         p = &(*p)->rb_left;
165                 else if (offset > entry->end)
166                         p = &(*p)->rb_right;
167                 else
168                         return parent;
169         }
170
171         entry = rb_entry(node, struct tree_entry, rb_node);
172         rb_link_node(node, parent, p);
173         rb_insert_color(node, root);
174         return NULL;
175 }
176
177 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
178                                      struct rb_node **prev_ret,
179                                      struct rb_node **next_ret)
180 {
181         struct rb_root *root = &tree->state;
182         struct rb_node *n = root->rb_node;
183         struct rb_node *prev = NULL;
184         struct rb_node *orig_prev = NULL;
185         struct tree_entry *entry;
186         struct tree_entry *prev_entry = NULL;
187
188         while (n) {
189                 entry = rb_entry(n, struct tree_entry, rb_node);
190                 prev = n;
191                 prev_entry = entry;
192
193                 if (offset < entry->start)
194                         n = n->rb_left;
195                 else if (offset > entry->end)
196                         n = n->rb_right;
197                 else {
198                         return n;
199                 }
200         }
201
202         if (prev_ret) {
203                 orig_prev = prev;
204                 while (prev && offset > prev_entry->end) {
205                         prev = rb_next(prev);
206                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
207                 }
208                 *prev_ret = prev;
209                 prev = orig_prev;
210         }
211
212         if (next_ret) {
213                 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
214                 while (prev && offset < prev_entry->start) {
215                         prev = rb_prev(prev);
216                         prev_entry = rb_entry(prev, struct tree_entry, rb_node);
217                 }
218                 *next_ret = prev;
219         }
220         return NULL;
221 }
222
223 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
224                                           u64 offset)
225 {
226         struct rb_node *prev = NULL;
227         struct rb_node *ret;
228
229         ret = __etree_search(tree, offset, &prev, NULL);
230         if (!ret) {
231                 return prev;
232         }
233         return ret;
234 }
235
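/*
 * Illustrative sketch (not part of the original file): tree_search()
 * returns the node whose [start, end] range contains 'offset', or a
 * nearby node when no extent contains it.  A caller that only wants an
 * exact containment hit can filter the result like this; the helper
 * name is hypothetical and the caller must hold tree->lock.
 */
static inline struct extent_state *
example_find_containing_state(struct extent_io_tree *tree, u64 offset)
{
        struct rb_node *node = tree_search(tree, offset);
        struct extent_state *state;

        if (!node)
                return NULL;
        state = rb_entry(node, struct extent_state, rb_node);
        /* tree_search may hand back a neighbour; reject it here */
        if (state->start > offset || state->end < offset)
                return NULL;
        return state;
}
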
236 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
237                                           u64 offset, struct rb_node *node)
238 {
239         struct rb_root *root = &tree->buffer;
240         struct rb_node **p = &root->rb_node;
241         struct rb_node *parent = NULL;
242         struct extent_buffer *eb;
243
244         while (*p) {
245                 parent = *p;
246                 eb = rb_entry(parent, struct extent_buffer, rb_node);
247
248                 if (offset < eb->start)
249                         p = &(*p)->rb_left;
250                 else if (offset > eb->start)
251                         p = &(*p)->rb_right;
252                 else
253                         return eb;
254         }
255
256         rb_link_node(node, parent, p);
257         rb_insert_color(node, root);
258         return NULL;
259 }
260
261 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
262                                            u64 offset)
263 {
264         struct rb_root *root = &tree->buffer;
265         struct rb_node *n = root->rb_node;
266         struct extent_buffer *eb;
267
268         while (n) {
269                 eb = rb_entry(n, struct extent_buffer, rb_node);
270                 if (offset < eb->start)
271                         n = n->rb_left;
272                 else if (offset > eb->start)
273                         n = n->rb_right;
274                 else
275                         return eb;
276         }
277         return NULL;
278 }
279
280 /*
281  * utility function to look for merge candidates inside a given range.
282  * Any extents with matching state are merged together into a single
283  * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY
284  * set are not merged, because the end_io handlers need to be able to do
285  * operations on them without sleeping (or doing allocations/splits).
286  *
287  * This should be called with the tree lock held.
288  */
289 static int merge_state(struct extent_io_tree *tree,
290                        struct extent_state *state)
291 {
292         struct extent_state *other;
293         struct rb_node *other_node;
294
295         if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
296                 return 0;
297
298         other_node = rb_prev(&state->rb_node);
299         if (other_node) {
300                 other = rb_entry(other_node, struct extent_state, rb_node);
301                 if (other->end == state->start - 1 &&
302                     other->state == state->state) {
303                         state->start = other->start;
304                         other->tree = NULL;
305                         rb_erase(&other->rb_node, &tree->state);
306                         free_extent_state(other);
307                 }
308         }
309         other_node = rb_next(&state->rb_node);
310         if (other_node) {
311                 other = rb_entry(other_node, struct extent_state, rb_node);
312                 if (other->start == state->end + 1 &&
313                     other->state == state->state) {
314                         other->start = state->start;
315                         state->tree = NULL;
316                         rb_erase(&state->rb_node, &tree->state);
317                         free_extent_state(state);
318                 }
319         }
320         return 0;
321 }
322
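/*
 * Worked example (illustrative, not part of the original file): with two
 * adjacent states [0, 4095] and [4096, 8191] that both carry only
 * EXTENT_DIRTY, merge_state() collapses them into a single [0, 8191]
 * state and frees the leftover struct.  States carrying EXTENT_IOBITS or
 * EXTENT_BOUNDARY are left alone, as the check at the top of
 * merge_state() shows, and neighbours only merge when their state bits
 * match exactly.
 */
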
323 static void set_state_cb(struct extent_io_tree *tree,
324                          struct extent_state *state,
325                          unsigned long bits)
326 {
327         if (tree->ops && tree->ops->set_bit_hook) {
328                 tree->ops->set_bit_hook(tree->mapping->host, state->start,
329                                         state->end, state->state, bits);
330         }
331 }
332
333 static void clear_state_cb(struct extent_io_tree *tree,
334                            struct extent_state *state,
335                            unsigned long bits)
336 {
337         if (tree->ops && tree->ops->clear_bit_hook) {
338                 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
339                                           state->end, state->state, bits);
340         }
341 }
342
343 /*
344  * insert an extent_state struct into the tree.  'bits' are set on the
345  * struct before it is inserted.
346  *
347  * This may return -EEXIST if the extent is already there, in which case the
348  * state struct is freed.
349  *
350  * The tree lock is not taken internally.  This is a utility function and
351  * probably isn't what you want to call (see set/clear_extent_bit).
352  */
353 static int insert_state(struct extent_io_tree *tree,
354                         struct extent_state *state, u64 start, u64 end,
355                         int bits)
356 {
357         struct rb_node *node;
358
359         if (end < start) {
360                 printk("end < start %Lu %Lu\n", end, start);
361                 WARN_ON(1);
362         }
363         if (bits & EXTENT_DIRTY)
364                 tree->dirty_bytes += end - start + 1;
365         set_state_cb(tree, state, bits);
366         state->state |= bits;
367         state->start = start;
368         state->end = end;
369         node = tree_insert(&tree->state, end, &state->rb_node);
370         if (node) {
371                 struct extent_state *found;
372                 found = rb_entry(node, struct extent_state, rb_node);
373                 printk("found node %Lu %Lu on insert of %Lu %Lu\n",
                           found->start, found->end, start, end);
374                 free_extent_state(state);
375                 return -EEXIST;
376         }
377         state->tree = tree;
378         merge_state(tree, state);
379         return 0;
380 }
381
382 /*
383  * split a given extent state struct in two, inserting the preallocated
384  * struct 'prealloc' as the newly created second half.  'split' indicates an
385  * offset inside 'orig' where it should be split.
386  *
387  * Before calling,
388  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
389  * are two extent state structs in the tree:
390  * prealloc: [orig->start, split - 1]
391  * orig: [ split, orig->end ]
392  *
393  * The tree locks are not taken by this function. They need to be held
394  * by the caller.
395  */
396 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
397                        struct extent_state *prealloc, u64 split)
398 {
399         struct rb_node *node;
400         prealloc->start = orig->start;
401         prealloc->end = split - 1;
402         prealloc->state = orig->state;
403         orig->start = split;
404
405         node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
406         if (node) {
407                 struct extent_state *found;
408                 found = rb_entry(node, struct extent_state, rb_node);
409                 printk("found node %Lu %Lu on insert of %Lu %Lu\n",
                           found->start, found->end,
                           prealloc->start, prealloc->end);
410                 free_extent_state(prealloc);
411                 return -EEXIST;
412         }
413         prealloc->tree = tree;
414         return 0;
415 }
416
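/*
 * Worked example (illustrative, not part of the original file): calling
 * split_state(tree, orig, prealloc, 4096) on an 'orig' spanning
 * [0, 8191] leaves two structs in the tree afterwards:
 *   prealloc: [0, 4095]    (inherits orig's state bits)
 *   orig:     [4096, 8191]
 * The caller must hold tree->lock and must have allocated 'prealloc' up
 * front, since this path cannot sleep.
 */
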
417 /*
418  * utility function to clear some bits in an extent state struct.
419  * it will optionally wake up any one waiting on this state (wake == 1), or
420  * forcibly remove the state from the tree (delete == 1).
421  *
422  * If no bits are set on the state struct after clearing things, the
423  * struct is freed and removed from the tree
424  */
425 static int clear_state_bit(struct extent_io_tree *tree,
426                             struct extent_state *state, int bits, int wake,
427                             int delete)
428 {
429         int ret = state->state & bits;
430
431         if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
432                 u64 range = state->end - state->start + 1;
433                 WARN_ON(range > tree->dirty_bytes);
434                 tree->dirty_bytes -= range;
435         }
436         clear_state_cb(tree, state, bits);
437         state->state &= ~bits;
438         if (wake)
439                 wake_up(&state->wq);
440         if (delete || state->state == 0) {
441                 if (state->tree) {
442                         clear_state_cb(tree, state, state->state);
443                         rb_erase(&state->rb_node, &tree->state);
444                         state->tree = NULL;
445                         free_extent_state(state);
446                 } else {
447                         WARN_ON(1);
448                 }
449         } else {
450                 merge_state(tree, state);
451         }
452         return ret;
453 }
454
455 /*
456  * clear some bits on a range in the tree.  This may require splitting
457  * or inserting elements in the tree, so the gfp mask is used to
458  * indicate which allocations or sleeping are allowed.
459  *
460  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
461  * the given range from the tree regardless of state (ie for truncate).
462  *
463  * the range [start, end] is inclusive.
464  *
465  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
466  * bits were already set, or zero if none of the bits were already set.
467  */
468 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
469                      int bits, int wake, int delete, gfp_t mask)
470 {
471         struct extent_state *state;
472         struct extent_state *prealloc = NULL;
473         struct rb_node *node;
474         unsigned long flags;
475         int err;
476         int set = 0;
477
478 again:
479         if (!prealloc && (mask & __GFP_WAIT)) {
480                 prealloc = alloc_extent_state(mask);
481                 if (!prealloc)
482                         return -ENOMEM;
483         }
484
485         spin_lock_irqsave(&tree->lock, flags);
486         /*
487          * this search will find the extents that end after
488          * our range starts
489          */
490         node = tree_search(tree, start);
491         if (!node)
492                 goto out;
493         state = rb_entry(node, struct extent_state, rb_node);
494         if (state->start > end)
495                 goto out;
496         WARN_ON(state->end < start);
497
498         /*
499          *     | ---- desired range ---- |
500          *  | state | or
501          *  | ------------- state -------------- |
502          *
503          * We need to split the extent we found, and may flip
504          * bits on second half.
505          *
506          * If the extent we found extends past our range, we
507          * just split and search again.  It'll get split again
508          * the next time though.
509          *
510          * If the extent we found is inside our range, we clear
511          * the desired bit on it.
512          */
513
514         if (state->start < start) {
515                 if (!prealloc)
516                         prealloc = alloc_extent_state(GFP_ATOMIC);
517                 err = split_state(tree, state, prealloc, start);
518                 BUG_ON(err == -EEXIST);
519                 prealloc = NULL;
520                 if (err)
521                         goto out;
522                 if (state->end <= end) {
523                         start = state->end + 1;
524                         set |= clear_state_bit(tree, state, bits,
525                                         wake, delete);
526                 } else {
527                         start = state->start;
528                 }
529                 goto search_again;
530         }
531         /*
532          * | ---- desired range ---- |
533          *                        | state |
534          * We need to split the extent, and clear the bit
535          * on the first half
536          */
537         if (state->start <= end && state->end > end) {
538                 if (!prealloc)
539                         prealloc = alloc_extent_state(GFP_ATOMIC);
540                 err = split_state(tree, state, prealloc, end + 1);
541                 BUG_ON(err == -EEXIST);
542
543                 if (wake)
544                         wake_up(&state->wq);
545                 set |= clear_state_bit(tree, prealloc, bits,
546                                        wake, delete);
547                 prealloc = NULL;
548                 goto out;
549         }
550
551         start = state->end + 1;
552         set |= clear_state_bit(tree, state, bits, wake, delete);
553         goto search_again;
554
555 out:
556         spin_unlock_irqrestore(&tree->lock, flags);
557         if (prealloc)
558                 free_extent_state(prealloc);
559
560         return set;
561
562 search_again:
563         if (start > end)
564                 goto out;
565         spin_unlock_irqrestore(&tree->lock, flags);
566         if (mask & __GFP_WAIT)
567                 cond_resched();
568         goto again;
569 }
570 EXPORT_SYMBOL(clear_extent_bit);
571
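/*
 * Illustrative sketch (not part of the original file): a truncate-style
 * caller that drops both the dirty and delalloc bits for a byte range
 * and removes the states outright.  The helper name, the bit mask and
 * the GFP_NOFS choice are assumptions made for the example.
 */
static inline int example_drop_range(struct extent_io_tree *tree,
                                     u64 start, u64 end)
{
        /*
         * wake == 1 kicks any sleepers, delete == 1 removes the states
         * regardless of which bits remain set
         */
        return clear_extent_bit(tree, start, end,
                                EXTENT_DIRTY | EXTENT_DELALLOC,
                                1, 1, GFP_NOFS);
}
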
572 static int wait_on_state(struct extent_io_tree *tree,
573                          struct extent_state *state)
574 {
575         DEFINE_WAIT(wait);
576         prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
577         spin_unlock_irq(&tree->lock);
578         schedule();
579         spin_lock_irq(&tree->lock);
580         finish_wait(&state->wq, &wait);
581         return 0;
582 }
583
584 /*
585  * waits for one or more bits to clear on a range in the state tree.
586  * The range [start, end] is inclusive.
587  * The tree lock is taken by this function
588  */
589 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
590 {
591         struct extent_state *state;
592         struct rb_node *node;
593
594         spin_lock_irq(&tree->lock);
595 again:
596         while (1) {
597                 /*
598                  * this search will find all the extents that end after
599                  * our range starts
600                  */
601                 node = tree_search(tree, start);
602                 if (!node)
603                         break;
604
605                 state = rb_entry(node, struct extent_state, rb_node);
606
607                 if (state->start > end)
608                         goto out;
609
610                 if (state->state & bits) {
611                         start = state->start;
612                         atomic_inc(&state->refs);
613                         wait_on_state(tree, state);
614                         free_extent_state(state);
615                         goto again;
616                 }
617                 start = state->end + 1;
618
619                 if (start > end)
620                         break;
621
622                 if (need_resched()) {
623                         spin_unlock_irq(&tree->lock);
624                         cond_resched();
625                         spin_lock_irq(&tree->lock);
626                 }
627         }
628 out:
629         spin_unlock_irq(&tree->lock);
630         return 0;
631 }
632 EXPORT_SYMBOL(wait_extent_bit);
633
634 static void set_state_bits(struct extent_io_tree *tree,
635                            struct extent_state *state,
636                            int bits)
637 {
638         if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
639                 u64 range = state->end - state->start + 1;
640                 tree->dirty_bytes += range;
641         }
642         set_state_cb(tree, state, bits);
643         state->state |= bits;
644 }
645
646 /*
647  * set some bits on a range in the tree.  This may require allocations
648  * or sleeping, so the gfp mask is used to indicate what is allowed.
649  *
650  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
651  * range already has the desired bits set.  The start of the existing
652  * range is returned in failed_start in this case.
653  *
654  * [start, end] is inclusive
655  * This takes the tree lock.
656  */
657 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
658                    int exclusive, u64 *failed_start, gfp_t mask)
659 {
660         struct extent_state *state;
661         struct extent_state *prealloc = NULL;
662         struct rb_node *node;
663         unsigned long flags;
664         int err = 0;
665         int set;
666         u64 last_start;
667         u64 last_end;
668 again:
669         if (!prealloc && (mask & __GFP_WAIT)) {
670                 prealloc = alloc_extent_state(mask);
671                 if (!prealloc)
672                         return -ENOMEM;
673         }
674
675         spin_lock_irqsave(&tree->lock, flags);
676         /*
677          * this search will find all the extents that end after
678          * our range starts.
679          */
680         node = tree_search(tree, start);
681         if (!node) {
682                 err = insert_state(tree, prealloc, start, end, bits);
683                 prealloc = NULL;
684                 BUG_ON(err == -EEXIST);
685                 goto out;
686         }
687
688         state = rb_entry(node, struct extent_state, rb_node);
689         last_start = state->start;
690         last_end = state->end;
691
692         /*
693          * | ---- desired range ---- |
694          * | state |
695          *
696          * Just lock what we found and keep going
697          */
698         if (state->start == start && state->end <= end) {
699                 set = state->state & bits;
700                 if (set && exclusive) {
701                         *failed_start = state->start;
702                         err = -EEXIST;
703                         goto out;
704                 }
705                 set_state_bits(tree, state, bits);
706                 start = state->end + 1;
707                 merge_state(tree, state);
708                 goto search_again;
709         }
710
711         /*
712          *     | ---- desired range ---- |
713          * | state |
714          *   or
715          * | ------------- state -------------- |
716          *
717          * We need to split the extent we found, and may flip bits on
718          * second half.
719          *
720          * If the extent we found extends past our
721          * range, we just split and search again.  It'll get split
722          * again the next time though.
723          *
724          * If the extent we found is inside our range, we set the
725          * desired bit on it.
726          */
727         if (state->start < start) {
728                 set = state->state & bits;
729                 if (exclusive && set) {
730                         *failed_start = start;
731                         err = -EEXIST;
732                         goto out;
733                 }
734                 err = split_state(tree, state, prealloc, start);
735                 BUG_ON(err == -EEXIST);
736                 prealloc = NULL;
737                 if (err)
738                         goto out;
739                 if (state->end <= end) {
740                         set_state_bits(tree, state, bits);
741                         start = state->end + 1;
742                         merge_state(tree, state);
743                 } else {
744                         start = state->start;
745                 }
746                 goto search_again;
747         }
748         /*
749          * | ---- desired range ---- |
750          *     | state | or               | state |
751          *
752          * There's a hole, we need to insert something in it and
753          * ignore the extent we found.
754          */
755         if (state->start > start) {
756                 u64 this_end;
757                 if (end < last_start)
758                         this_end = end;
759                 else
760                         this_end = last_start - 1;
761                 err = insert_state(tree, prealloc, start, this_end,
762                                    bits);
763                 prealloc = NULL;
764                 BUG_ON(err == -EEXIST);
765                 if (err)
766                         goto out;
767                 start = this_end + 1;
768                 goto search_again;
769         }
770         /*
771          * | ---- desired range ---- |
772          *                        | state |
773          * We need to split the extent, and set the bit
774          * on the first half
775          */
776         if (state->start <= end && state->end > end) {
777                 set = state->state & bits;
778                 if (exclusive && set) {
779                         *failed_start = start;
780                         err = -EEXIST;
781                         goto out;
782                 }
783                 err = split_state(tree, state, prealloc, end + 1);
784                 BUG_ON(err == -EEXIST);
785
786                 set_state_bits(tree, prealloc, bits);
787                 merge_state(tree, prealloc);
788                 prealloc = NULL;
789                 goto out;
790         }
791
792         goto search_again;
793
794 out:
795         spin_unlock_irqrestore(&tree->lock, flags);
796         if (prealloc)
797                 free_extent_state(prealloc);
798
799         return err;
800
801 search_again:
802         if (start > end)
803                 goto out;
804         spin_unlock_irqrestore(&tree->lock, flags);
805         if (mask & __GFP_WAIT)
806                 cond_resched();
807         goto again;
808 }
809 EXPORT_SYMBOL(set_extent_bit);
810
811 /* wrappers around set/clear extent bit */
812 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
813                      gfp_t mask)
814 {
815         return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
816                               mask);
817 }
818 EXPORT_SYMBOL(set_extent_dirty);
819
820 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
821                        gfp_t mask)
822 {
823         return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
824 }
825 EXPORT_SYMBOL(set_extent_ordered);
826
827 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
828                     int bits, gfp_t mask)
829 {
830         return set_extent_bit(tree, start, end, bits, 0, NULL,
831                               mask);
832 }
833 EXPORT_SYMBOL(set_extent_bits);
834
835 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
836                       int bits, gfp_t mask)
837 {
838         return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
839 }
840 EXPORT_SYMBOL(clear_extent_bits);
841
842 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
843                      gfp_t mask)
844 {
845         return set_extent_bit(tree, start, end,
846                               EXTENT_DELALLOC | EXTENT_DIRTY,
847                               0, NULL, mask);
848 }
849 EXPORT_SYMBOL(set_extent_delalloc);
850
851 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
852                        gfp_t mask)
853 {
854         return clear_extent_bit(tree, start, end,
855                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
856 }
857 EXPORT_SYMBOL(clear_extent_dirty);
858
859 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
860                          gfp_t mask)
861 {
862         return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
863 }
864 EXPORT_SYMBOL(clear_extent_ordered);
865
866 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
867                      gfp_t mask)
868 {
869         return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
870                               mask);
871 }
872 EXPORT_SYMBOL(set_extent_new);
873
874 int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
875                        gfp_t mask)
876 {
877         return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
878 }
879 EXPORT_SYMBOL(clear_extent_new);
880
881 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882                         gfp_t mask)
883 {
884         return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885                               mask);
886 }
887 EXPORT_SYMBOL(set_extent_uptodate);
888
889 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
890                           gfp_t mask)
891 {
892         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
893 }
894 EXPORT_SYMBOL(clear_extent_uptodate);
895
896 int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
897                          gfp_t mask)
898 {
899         return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
900                               0, NULL, mask);
901 }
902 EXPORT_SYMBOL(set_extent_writeback);
903
904 int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
905                            gfp_t mask)
906 {
907         return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
908 }
909 EXPORT_SYMBOL(clear_extent_writeback);
910
911 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
912 {
913         return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
914 }
915 EXPORT_SYMBOL(wait_on_extent_writeback);
916
917 /*
918  * either insert or lock the state struct between start and end.  Use mask
919  * to tell us if waiting is desired.
920  */
921 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
922 {
923         int err;
924         u64 failed_start;
925         while (1) {
926                 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
927                                      &failed_start, mask);
928                 if (err == -EEXIST && (mask & __GFP_WAIT)) {
929                         wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
930                         start = failed_start;
931                 } else {
932                         break;
933                 }
934                 WARN_ON(start > end);
935         }
936         return err;
937 }
938 EXPORT_SYMBOL(lock_extent);
939
940 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
941                   gfp_t mask)
942 {
943         return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
944 }
945 EXPORT_SYMBOL(unlock_extent);
946
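/*
 * Illustrative sketch (not part of the original file): the usual
 * lock/unlock pairing around a byte range.  lock_extent() blocks (when
 * the mask allows waiting) until no other EXTENT_LOCKED state overlaps
 * [start, end]; unlock_extent() clears the bit and wakes any waiters.
 * The helper name is hypothetical.
 */
static inline void example_locked_range(struct extent_io_tree *tree,
                                        u64 start, u64 end)
{
        lock_extent(tree, start, end, GFP_NOFS);

        /* ... read or modify the pages backing [start, end] here ... */

        unlock_extent(tree, start, end, GFP_NOFS);
}
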
947 /*
948  * helper function to set pages and extents in the tree dirty
949  */
950 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
951 {
952         unsigned long index = start >> PAGE_CACHE_SHIFT;
953         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
954         struct page *page;
955
956         while (index <= end_index) {
957                 page = find_get_page(tree->mapping, index);
958                 BUG_ON(!page);
959                 __set_page_dirty_nobuffers(page);
960                 page_cache_release(page);
961                 index++;
962         }
963         set_extent_dirty(tree, start, end, GFP_NOFS);
964         return 0;
965 }
966 EXPORT_SYMBOL(set_range_dirty);
967
968 /*
969  * helper function to set both pages and extents in the tree writeback
970  */
971 int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
972 {
973         unsigned long index = start >> PAGE_CACHE_SHIFT;
974         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
975         struct page *page;
976
977         while (index <= end_index) {
978                 page = find_get_page(tree->mapping, index);
979                 BUG_ON(!page);
980                 set_page_writeback(page);
981                 page_cache_release(page);
982                 index++;
983         }
984         set_extent_writeback(tree, start, end, GFP_NOFS);
985         return 0;
986 }
987 EXPORT_SYMBOL(set_range_writeback);
988
989 /*
990  * find the first offset in the io tree with 'bits' set. zero is
991  * returned if we find something, and *start_ret and *end_ret are
992  * set to reflect the state struct that was found.
993  *
994  * If nothing was found, 1 is returned, < 0 on error
995  */
996 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
997                           u64 *start_ret, u64 *end_ret, int bits)
998 {
999         struct rb_node *node;
1000         struct extent_state *state;
1001         int ret = 1;
1002
1003         spin_lock_irq(&tree->lock);
1004         /*
1005          * this search will find all the extents that end after
1006          * our range starts.
1007          */
1008         node = tree_search(tree, start);
1009         if (!node) {
1010                 goto out;
1011         }
1012
1013         while (1) {
1014                 state = rb_entry(node, struct extent_state, rb_node);
1015                 if (state->end >= start && (state->state & bits)) {
1016                         *start_ret = state->start;
1017                         *end_ret = state->end;
1018                         ret = 0;
1019                         break;
1020                 }
1021                 node = rb_next(node);
1022                 if (!node)
1023                         break;
1024         }
1025 out:
1026         spin_unlock_irq(&tree->lock);
1027         return ret;
1028 }
1029 EXPORT_SYMBOL(find_first_extent_bit);
1030
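/*
 * Illustrative sketch (not part of the original file): walking the tree
 * with find_first_extent_bit() to count the distinct dirty ranges.  The
 * function takes the tree lock itself, so no locking is needed here;
 * the helper name is hypothetical.
 */
static inline u64 example_count_dirty_ranges(struct extent_io_tree *tree)
{
        u64 start = 0;
        u64 end = 0;
        u64 nr = 0;

        /* returns 0 and fills start/end on a hit, 1 when nothing is left */
        while (!find_first_extent_bit(tree, start, &start, &end,
                                      EXTENT_DIRTY)) {
                nr++;
                if (end == (u64)-1)
                        break;
                start = end + 1;
        }
        return nr;
}
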
1031 /* find the first state struct with 'bits' set after 'start', and
1032  * return it.  tree->lock must be held.  NULL will be returned if
1033  * nothing was found after 'start'
1034  */
1035 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1036                                                  u64 start, int bits)
1037 {
1038         struct rb_node *node;
1039         struct extent_state *state;
1040
1041         /*
1042          * this search will find all the extents that end after
1043          * our range starts.
1044          */
1045         node = tree_search(tree, start);
1046         if (!node) {
1047                 goto out;
1048         }
1049
1050         while (1) {
1051                 state = rb_entry(node, struct extent_state, rb_node);
1052                 if (state->end >= start && (state->state & bits)) {
1053                         return state;
1054                 }
1055                 node = rb_next(node);
1056                 if (!node)
1057                         break;
1058         }
1059 out:
1060         return NULL;
1061 }
1062 EXPORT_SYMBOL(find_first_extent_bit_state);
1063
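/*
 * Illustrative sketch (not part of the original file):
 * find_first_extent_bit_state() expects the caller to hold tree->lock
 * (unlike find_first_extent_bit() above), so a caller looks roughly like
 * this.  The helper name and the choice of EXTENT_DELALLOC are
 * assumptions made for the example.
 */
static inline u64 example_first_delalloc_start(struct extent_io_tree *tree,
                                               u64 from)
{
        struct extent_state *state;
        u64 ret = (u64)-1;

        spin_lock_irq(&tree->lock);
        state = find_first_extent_bit_state(tree, from, EXTENT_DELALLOC);
        if (state)
                ret = state->start;
        spin_unlock_irq(&tree->lock);
        return ret;
}
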
1064 /*
1065  * find a contiguous range of bytes in the file marked as delalloc, not
1066  * more than 'max_bytes'.  'start' and 'end' are used to return the range.
1067  *
1068  * 1 is returned if we find something, 0 if nothing was in the tree
1069  */
1070 static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
1071                                              u64 *start, u64 *end, u64 max_bytes)
1072 {
1073         struct rb_node *node;
1074         struct extent_state *state;
1075         u64 cur_start = *start;
1076         u64 found = 0;
1077         u64 total_bytes = 0;
1078
1079         spin_lock_irq(&tree->lock);
1080         /*
1081          * this search will find all the extents that end after
1082          * our range starts.
1083          */
1084 search_again:
1085         node = tree_search(tree, cur_start);
1086         if (!node) {
1087                 if (!found)
1088                         *end = (u64)-1;
1089                 goto out;
1090         }
1091
1092         while (1) {
1093                 state = rb_entry(node, struct extent_state, rb_node);
1094                 if (found && (state->start != cur_start ||
1095                               (state->state & EXTENT_BOUNDARY))) {
1096                         goto out;
1097                 }
1098                 if (!(state->state & EXTENT_DELALLOC)) {
1099                         if (!found)
1100                                 *end = state->end;
1101                         goto out;
1102                 }
1103                 if (!found && !(state->state & EXTENT_BOUNDARY)) {
1104                         struct extent_state *prev_state;
1105                         struct rb_node *prev_node = node;
1106                         while (1) {
1107                                 prev_node = rb_prev(prev_node);
1108                                 if (!prev_node)
1109                                         break;
1110                                 prev_state = rb_entry(prev_node,
1111                                                       struct extent_state,
1112                                                       rb_node);
1113                                 if ((prev_state->end + 1 != state->start) ||
1114                                     !(prev_state->state & EXTENT_DELALLOC))
1115                                         break;
1116                                 if ((cur_start - prev_state->start) * 2 >
1117                                      max_bytes)
1118                                         break;
1119                                 state = prev_state;
1120                                 node = prev_node;
1121                         }
1122                 }
1123                 if (state->state & EXTENT_LOCKED) {
1124                         DEFINE_WAIT(wait);
1125                         atomic_inc(&state->refs);
1126                         prepare_to_wait(&state->wq, &wait,
1127                                         TASK_UNINTERRUPTIBLE);
1128                         spin_unlock_irq(&tree->lock);
1129                         schedule();
1130                         spin_lock_irq(&tree->lock);
1131                         finish_wait(&state->wq, &wait);
1132                         free_extent_state(state);
1133                         goto search_again;
1134                 }
1135                 set_state_cb(tree, state, EXTENT_LOCKED);
1136                 state->state |= EXTENT_LOCKED;
1137                 if (!found)
1138                         *start = state->start;
1139                 found++;
1140                 *end = state->end;
1141                 cur_start = state->end + 1;
1142                 node = rb_next(node);
1143                 if (!node)
1144                         break;
1145                 total_bytes += state->end - state->start + 1;
1146                 if (total_bytes >= max_bytes)
1147                         break;
1148         }
1149 out:
1150         spin_unlock_irq(&tree->lock);
1151         return found;
1152 }
1153
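/*
 * Note (not part of the original file): every delalloc state gathered
 * above is left with EXTENT_LOCKED set before the function returns, so
 * the caller owns the returned [*start, *end] range and is responsible
 * for dropping the lock bit again (for example via unlock_extent())
 * once the delalloc work has been handed off.
 */
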
1154 /*
1155  * count the number of bytes in the tree that have a given bit(s)
1156  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1157  * cached.  The total number found is returned.
1158  */
1159 u64 count_range_bits(struct extent_io_tree *tree,
1160                      u64 *start, u64 search_end, u64 max_bytes,
1161                      unsigned long bits)
1162 {
1163         struct rb_node *node;
1164         struct extent_state *state;
1165         u64 cur_start = *start;
1166         u64 total_bytes = 0;
1167         int found = 0;
1168
1169         if (search_end <= cur_start) {
1170                 printk("search_end %Lu start %Lu\n", search_end, cur_start);
1171                 WARN_ON(1);
1172                 return 0;
1173         }
1174
1175         spin_lock_irq(&tree->lock);
1176         if (cur_start == 0 && bits == EXTENT_DIRTY) {
1177                 total_bytes = tree->dirty_bytes;
1178                 goto out;
1179         }
1180         /*
1181          * this search will find all the extents that end after
1182          * our range starts.
1183          */
1184         node = tree_search(tree, cur_start);
1185         if (!node) {
1186                 goto out;
1187         }
1188
1189         while (1) {
1190                 state = rb_entry(node, struct extent_state, rb_node);
1191                 if (state->start > search_end)
1192                         break;
1193                 if (state->end >= cur_start && (state->state & bits)) {
1194                         total_bytes += min(search_end, state->end) + 1 -
1195                                        max(cur_start, state->start);
1196                         if (total_bytes >= max_bytes)
1197                                 break;
1198                         if (!found) {
1199                                 *start = state->start;
1200                                 found = 1;
1201                         }
1202                 }
1203                 node = rb_next(node);
1204                 if (!node)
1205                         break;
1206         }
1207 out:
1208         spin_unlock_irq(&tree->lock);
1209         return total_bytes;
1210 }
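
/*
 * Illustrative sketch (not part of the original file): counting how many
 * dirty bytes fall inside an arbitrary range.  Passing (u64)-1 as
 * max_bytes removes the early-out cap; the helper name is hypothetical.
 */
static inline u64 example_dirty_bytes_in_range(struct extent_io_tree *tree,
                                               u64 start, u64 end)
{
        /* count_range_bits() writes the first matching offset back here */
        u64 found_start = start;

        return count_range_bits(tree, &found_start, end, (u64)-1,
                                EXTENT_DIRTY);
}
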
1211 /*
1212  * helper function to lock both pages and extents in the tree.
1213  * pages must be locked first.
1214  */
1215 int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1216 {
1217         unsigned long index = start >> PAGE_CACHE_SHIFT;
1218         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1219         struct page *page;
1220         int err;
1221
1222         while (index <= end_index) {
1223                 page = grab_cache_page(tree->mapping, index);
1224                 if (!page) {
1225                         err = -ENOMEM;
1226                         goto failed;
1227                 }
1228                 if (IS_ERR(page)) {
1229                         err = PTR_ERR(page);
1230                         goto failed;
1231                 }
1232                 index++;
1233         }
1234         lock_extent(tree, start, end, GFP_NOFS);
1235         return 0;
1236
1237 failed:
1238         /*
1239          * we failed above in getting the page at 'index', so we undo here
1240          * up to but not including the page at 'index'
1241          */
1242         end_index = index;
1243         index = start >> PAGE_CACHE_SHIFT;
1244         while (index < end_index) {
1245                 page = find_get_page(tree->mapping, index);
1246                 unlock_page(page);
1247                 page_cache_release(page);
1248                 index++;
1249         }
1250         return err;
1251 }
1252 EXPORT_SYMBOL(lock_range);
1253
1254 /*
1255  * helper function to unlock both pages and extents in the tree.
1256  */
1257 int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1258 {
1259         unsigned long index = start >> PAGE_CACHE_SHIFT;
1260         unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1261         struct page *page;
1262
1263         while (index <= end_index) {
1264                 page = find_get_page(tree->mapping, index);
1265                 unlock_page(page);
1266                 page_cache_release(page);
1267                 index++;
1268         }
1269         unlock_extent(tree, start, end, GFP_NOFS);
1270         return 0;
1271 }
1272 EXPORT_SYMBOL(unlock_range);
1273
1274 /*
1275  * set the private field for a given byte offset in the tree.  If there is
1276  * no extent_state starting exactly at 'start', -ENOENT is returned.
1277  */
1278 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1279 {
1280         struct rb_node *node;
1281         struct extent_state *state;
1282         int ret = 0;
1283
1284         spin_lock_irq(&tree->lock);
1285         /*
1286          * this search will find all the extents that end after
1287          * our range starts.
1288          */
1289         node = tree_search(tree, start);
1290         if (!node) {
1291                 ret = -ENOENT;
1292                 goto out;
1293         }
1294         state = rb_entry(node, struct extent_state, rb_node);
1295         if (state->start != start) {
1296                 ret = -ENOENT;
1297                 goto out;
1298         }
1299         state->private = private;
1300 out:
1301         spin_unlock_irq(&tree->lock);
1302         return ret;
1303 }
1304
1305 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1306 {
1307         struct rb_node *node;
1308         struct extent_state *state;
1309         int ret = 0;
1310
1311         spin_lock_irq(&tree->lock);
1312         /*
1313          * this search will find all the extents that end after
1314          * our range starts.
1315          */
1316         node = tree_search(tree, start);
1317         if (!node) {
1318                 ret = -ENOENT;
1319                 goto out;
1320         }
1321         state = rb_entry(node, struct extent_state, rb_node);
1322         if (state->start != start) {
1323                 ret = -ENOENT;
1324                 goto out;
1325         }
1326         *private = state->private;
1327 out:
1328         spin_unlock_irq(&tree->lock);
1329         return ret;
1330 }
1331
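/*
 * Illustrative sketch (not part of the original file): stashing a
 * per-extent value with set_state_private() and reading it back.  Btrfs
 * uses this for things like checksums tied to an extent's start offset;
 * the helper name and the u32 'csum' parameter are assumptions.
 */
static inline int example_stash_private(struct extent_io_tree *tree,
                                        u64 start, u32 csum)
{
        u64 out = 0;
        int ret;

        ret = set_state_private(tree, start, (u64)csum);
        if (ret)
                return ret;

        ret = get_state_private(tree, start, &out);
        WARN_ON(!ret && out != (u64)csum);
        return ret;
}
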
1332 /*
1333  * searches a range in the state tree for a given mask.
1334  * If 'filled' == 1, this returns 1 only if every extent in the range
1335  * has the bits set.  Otherwise, 1 is returned if any bit in the
1336  * range is found set.
1337  */
1338 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1339                    int bits, int filled)
1340 {
1341         struct extent_state *state = NULL;
1342         struct rb_node *node;
1343         int bitset = 0;
1344         unsigned long flags;
1345
1346         spin_lock_irqsave(&tree->lock, flags);
1347         node = tree_search(tree, start);
1348         while (node && start <= end) {
1349                 state = rb_entry(node, struct extent_state, rb_node);
1350
1351                 if (filled && state->start > start) {
1352                         bitset = 0;
1353                         break;
1354                 }
1355
1356                 if (state->start > end)
1357                         break;
1358
1359                 if (state->state & bits) {
1360                         bitset = 1;
1361                         if (!filled)
1362                                 break;
1363                 } else if (filled) {
1364                         bitset = 0;
1365                         break;
1366                 }
1367                 start = state->end + 1;
1368                 if (start > end)
1369                         break;
1370                 node = rb_next(node);
1371                 if (!node) {
1372                         if (filled)
1373                                 bitset = 0;
1374                         break;
1375                 }
1376         }
1377         spin_unlock_irqrestore(&tree->lock, flags);
1378         return bitset;
1379 }
1380 EXPORT_SYMBOL(test_range_bit);
1381
1382 /*
1383  * helper function to set a given page up to date if all the
1384  * extents in the tree for that page are up to date
1385  */
1386 static int check_page_uptodate(struct extent_io_tree *tree,
1387                                struct page *page)
1388 {
1389         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1390         u64 end = start + PAGE_CACHE_SIZE - 1;
1391         if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1392                 SetPageUptodate(page);
1393         return 0;
1394 }
1395
1396 /*
1397  * helper function to unlock a page if all the extents in the tree
1398  * for that page are unlocked
1399  */
1400 static int check_page_locked(struct extent_io_tree *tree,
1401                              struct page *page)
1402 {
1403         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1404         u64 end = start + PAGE_CACHE_SIZE - 1;
1405         if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1406                 unlock_page(page);
1407         return 0;
1408 }
1409
1410 /*
1411  * helper function to end page writeback if all the extents
1412  * in the tree for that page are done with writeback
1413  */
1414 static int check_page_writeback(struct extent_io_tree *tree,
1415                              struct page *page)
1416 {
1417         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1418         u64 end = start + PAGE_CACHE_SIZE - 1;
1419         if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1420                 end_page_writeback(page);
1421         return 0;
1422 }
1423
1424 /* lots and lots of room for performance fixes in the end_bio funcs */
1425
1426 /*
1427  * after a writepage IO is done, we need to:
1428  * clear the uptodate bits on error
1429  * clear the writeback bits in the extent tree for this IO
1430  * end_page_writeback if the page has no more pending IO
1431  *
1432  * Scheduling is not allowed, so the extent state tree is expected
1433  * to have one and only one object corresponding to this IO.
1434  */
1435 static void end_bio_extent_writepage(struct bio *bio, int err)
1436 {
1437         int uptodate = err == 0;
1438         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1439         struct extent_io_tree *tree;
1440         u64 start;
1441         u64 end;
1442         int whole_page;
1443         int ret;
1444
1445         do {
1446                 struct page *page = bvec->bv_page;
1447                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1448
1449                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1450                          bvec->bv_offset;
1451                 end = start + bvec->bv_len - 1;
1452
1453                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1454                         whole_page = 1;
1455                 else
1456                         whole_page = 0;
1457
1458                 if (--bvec >= bio->bi_io_vec)
1459                         prefetchw(&bvec->bv_page->flags);
1460                 if (tree->ops && tree->ops->writepage_end_io_hook) {
1461                         ret = tree->ops->writepage_end_io_hook(page, start,
1462                                                        end, NULL, uptodate);
1463                         if (ret)
1464                                 uptodate = 0;
1465                 }
1466
1467                 if (!uptodate && tree->ops &&
1468                     tree->ops->writepage_io_failed_hook) {
1469                         ret = tree->ops->writepage_io_failed_hook(bio, page,
1470                                                          start, end, NULL);
1471                         if (ret == 0) {
1472                                 uptodate = (err == 0);
1473                                 continue;
1474                         }
1475                 }
1476
1477                 if (!uptodate) {
1478                         clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1479                         ClearPageUptodate(page);
1480                         SetPageError(page);
1481                 }
1482
1483                 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1484
1485                 if (whole_page)
1486                         end_page_writeback(page);
1487                 else
1488                         check_page_writeback(tree, page);
1489         } while (bvec >= bio->bi_io_vec);
1490
1491         bio_put(bio);
1492 }
1493
1494 /*
1495  * after a readpage IO is done, we need to:
1496  * clear the uptodate bits on error
1497  * set the uptodate bits if things worked
1498  * set the page up to date if all extents in the tree are uptodate
1499  * clear the lock bit in the extent tree
1500  * unlock the page if there are no other extents locked for it
1501  *
1502  * Scheduling is not allowed, so the extent state tree is expected
1503  * to have one and only one object corresponding to this IO.
1504  */
1505 static void end_bio_extent_readpage(struct bio *bio, int err)
1506 {
1507         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1508         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1509         struct extent_io_tree *tree;
1510         u64 start;
1511         u64 end;
1512         int whole_page;
1513         int ret;
1514
1515         do {
1516                 struct page *page = bvec->bv_page;
1517                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1518
1519                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1520                         bvec->bv_offset;
1521                 end = start + bvec->bv_len - 1;
1522
1523                 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1524                         whole_page = 1;
1525                 else
1526                         whole_page = 0;
1527
1528                 if (--bvec >= bio->bi_io_vec)
1529                         prefetchw(&bvec->bv_page->flags);
1530
1531                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1532                         ret = tree->ops->readpage_end_io_hook(page, start, end,
1533                                                               NULL);
1534                         if (ret)
1535                                 uptodate = 0;
1536                 }
1537                 if (!uptodate && tree->ops &&
1538                     tree->ops->readpage_io_failed_hook) {
1539                         ret = tree->ops->readpage_io_failed_hook(bio, page,
1540                                                          start, end, NULL);
1541                         if (ret == 0) {
1542                                 uptodate =
1543                                         test_bit(BIO_UPTODATE, &bio->bi_flags);
1544                                 continue;
1545                         }
1546                 }
1547
1548                 if (uptodate)
1549                         set_extent_uptodate(tree, start, end,
1550                                             GFP_ATOMIC);
1551                 unlock_extent(tree, start, end, GFP_ATOMIC);
1552
1553                 if (whole_page) {
1554                         if (uptodate) {
1555                                 SetPageUptodate(page);
1556                         } else {
1557                                 ClearPageUptodate(page);
1558                                 SetPageError(page);
1559                         }
1560                         unlock_page(page);
1561                 } else {
1562                         if (uptodate) {
1563                                 check_page_uptodate(tree, page);
1564                         } else {
1565                                 ClearPageUptodate(page);
1566                                 SetPageError(page);
1567                         }
1568                         check_page_locked(tree, page);
1569                 }
1570         } while (bvec >= bio->bi_io_vec);
1571
1572         bio_put(bio);
1573 }
1574
1575 /*
1576  * IO done from prepare_write is pretty simple, we just unlock
1577  * the structs in the extent tree when done, and set the uptodate bits
1578  * as appropriate.
1579  */
1580 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1581 {
1582         const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1583         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1584         struct extent_io_tree *tree;
1585         u64 start;
1586         u64 end;
1587
1588         do {
1589                 struct page *page = bvec->bv_page;
1590                 tree = &BTRFS_I(page->mapping->host)->io_tree;
1591
1592                 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1593                         bvec->bv_offset;
1594                 end = start + bvec->bv_len - 1;
1595
1596                 if (--bvec >= bio->bi_io_vec)
1597                         prefetchw(&bvec->bv_page->flags);
1598
1599                 if (uptodate) {
1600                         set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1601                 } else {
1602                         ClearPageUptodate(page);
1603                         SetPageError(page);
1604                 }
1605
1606                 unlock_extent(tree, start, end, GFP_ATOMIC);
1607
1608         } while (bvec >= bio->bi_io_vec);
1609
1610         bio_put(bio);
1611 }
1612
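/*
 * allocate a bio for IO against this tree.  If the allocation fails
 * and the caller is already deep in memory reclaim (PF_MEMALLOC),
 * retry with progressively fewer vecs before giving up.
 */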
1613 static struct bio *
1614 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1615                  gfp_t gfp_flags)
1616 {
1617         struct bio *bio;
1618
1619         bio = bio_alloc(gfp_flags, nr_vecs);
1620
1621         if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1622                 while (!bio && (nr_vecs /= 2))
1623                         bio = bio_alloc(gfp_flags, nr_vecs);
1624         }
1625
1626         if (bio) {
1627                 bio->bi_size = 0;
1628                 bio->bi_bdev = bdev;
1629                 bio->bi_sector = first_sector;
1630         }
1631         return bio;
1632 }
1633
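/*
 * send a bio to the device.  The extent state covering the last page
 * in the bio is sanity checked first, then the bio is handed to the
 * tree's submit_bio_hook when one is registered, or straight to
 * submit_bio() otherwise.
 */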
1634 static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
1635 {
1636         int ret = 0;
1637         struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1638         struct page *page = bvec->bv_page;
1639         struct extent_io_tree *tree = bio->bi_private;
1640         struct rb_node *node;
1641         struct extent_state *state;
1642         u64 start;
1643         u64 end;
1644
1645         start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1646         end = start + bvec->bv_len - 1;
1647
1648         spin_lock_irq(&tree->lock);
1649         node = __etree_search(tree, start, NULL, NULL);
1650         BUG_ON(!node);
1651         state = rb_entry(node, struct extent_state, rb_node);
1652         while (state->end < end) {
1653                 node = rb_next(node);
1654                 state = rb_entry(node, struct extent_state, rb_node);
1655         }
1656         BUG_ON(state->end != end);
1657         spin_unlock_irq(&tree->lock);
1658
1659         bio->bi_private = NULL;
1660
1661         bio_get(bio);
1662
1663         if (tree->ops && tree->ops->submit_bio_hook)
1664                 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1665                                            mirror_num);
1666         else
1667                 submit_bio(rw, bio);
1668         if (bio_flagged(bio, BIO_EOPNOTSUPP))
1669                 ret = -EOPNOTSUPP;
1670         bio_put(bio);
1671         return ret;
1672 }
1673
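/*
 * try to add a page to the bio cached in bio_ret, building up larger
 * bios as long as the pages are contiguous on disk and the
 * merge_bio_hook agrees.  When the page can't be merged, the cached
 * bio is submitted and a fresh one is started for this page.
 */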
1674 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1675                               struct page *page, sector_t sector,
1676                               size_t size, unsigned long offset,
1677                               struct block_device *bdev,
1678                               struct bio **bio_ret,
1679                               unsigned long max_pages,
1680                               bio_end_io_t end_io_func,
1681                               int mirror_num)
1682 {
1683         int ret = 0;
1684         struct bio *bio;
1685         int nr;
1686
1687         if (bio_ret && *bio_ret) {
1688                 bio = *bio_ret;
1689                 if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1690                     (tree->ops && tree->ops->merge_bio_hook &&
1691                      tree->ops->merge_bio_hook(page, offset, size, bio)) ||
1692                     bio_add_page(bio, page, size, offset) < size) {
1693                         ret = submit_one_bio(rw, bio, mirror_num);
1694                         bio = NULL;
1695                 } else {
1696                         return 0;
1697                 }
1698         }
1699         nr = bio_get_nr_vecs(bdev);
1700         bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1701         if (!bio) {
1702                 printk(KERN_ERR "failed to allocate bio nr %d\n", nr);
1703                 return -ENOMEM;
1704         }
1705
1706         bio_add_page(bio, page, size, offset);
1707         bio->bi_end_io = end_io_func;
1708         bio->bi_private = tree;
1709
1710         if (bio_ret) {
1711                 *bio_ret = bio;
1712         } else {
1713                 ret = submit_one_bio(rw, bio, mirror_num);
1714         }
1715
1716         return ret;
1717 }
1718
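/*
 * tag a page as handled by the extent io code: grab a page reference
 * and stash EXTENT_PAGE_PRIVATE in page->private so later callers can
 * tell the page has already been set up.
 */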
1719 void set_page_extent_mapped(struct page *page)
1720 {
1721         if (!PagePrivate(page)) {
1722                 SetPagePrivate(page);
1723                 page_cache_get(page);
1724                 set_page_private(page, EXTENT_PAGE_PRIVATE);
1725         }
1726 }
1727
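/*
 * the first page of an extent buffer also records the buffer length
 * in page->private, shifted left by two to leave room for the private
 * flag bits.
 */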
1728 void set_page_extent_head(struct page *page, unsigned long len)
1729 {
1730         set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1731 }
1732
1733 /*
1734  * basic readpage implementation.  Locked extent state structs are inserted
1735  * into the tree and are removed when the IO is done (by the end_io
1736  * handlers)
1737  */
1738 static int __extent_read_full_page(struct extent_io_tree *tree,
1739                                    struct page *page,
1740                                    get_extent_t *get_extent,
1741                                    struct bio **bio, int mirror_num)
1742 {
1743         struct inode *inode = page->mapping->host;
1744         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1745         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1746         u64 end;
1747         u64 cur = start;
1748         u64 extent_offset;
1749         u64 last_byte = i_size_read(inode);
1750         u64 block_start;
1751         u64 cur_end;
1752         sector_t sector;
1753         struct extent_map *em;
1754         struct block_device *bdev;
1755         int ret;
1756         int nr = 0;
1757         size_t page_offset = 0;
1758         size_t iosize;
1759         size_t blocksize = inode->i_sb->s_blocksize;
1760
1761         set_page_extent_mapped(page);
1762
1763         end = page_end;
1764         lock_extent(tree, start, end, GFP_NOFS);
1765
1766         while (cur <= end) {
1767                 if (cur >= last_byte) {
1768                         char *userpage;
1769                         iosize = PAGE_CACHE_SIZE - page_offset;
1770                         userpage = kmap_atomic(page, KM_USER0);
1771                         memset(userpage + page_offset, 0, iosize);
1772                         flush_dcache_page(page);
1773                         kunmap_atomic(userpage, KM_USER0);
1774                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1775                                             GFP_NOFS);
1776                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1777                         break;
1778                 }
1779                 em = get_extent(inode, page, page_offset, cur,
1780                                 end - cur + 1, 0);
1781                 if (IS_ERR(em) || !em) {
1782                         SetPageError(page);
1783                         unlock_extent(tree, cur, end, GFP_NOFS);
1784                         break;
1785                 }
1786                 extent_offset = cur - em->start;
1787                 if (extent_map_end(em) <= cur) {
1788                         printk(KERN_ERR "bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
1789                 }
1790                 BUG_ON(extent_map_end(em) <= cur);
1791                 if (end < cur) {
1792                         printk(KERN_ERR "2bad mapping end %Lu cur %Lu\n", end, cur);
1793                 }
1794                 BUG_ON(end < cur);
1795
1796                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1797                 cur_end = min(extent_map_end(em) - 1, end);
1798                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1799                 sector = (em->block_start + extent_offset) >> 9;
1800                 bdev = em->bdev;
1801                 block_start = em->block_start;
1802                 free_extent_map(em);
1803                 em = NULL;
1804
1805                 /* we've found a hole, just zero and go on */
1806                 if (block_start == EXTENT_MAP_HOLE) {
1807                         char *userpage;
1808                         userpage = kmap_atomic(page, KM_USER0);
1809                         memset(userpage + page_offset, 0, iosize);
1810                         flush_dcache_page(page);
1811                         kunmap_atomic(userpage, KM_USER0);
1812
1813                         set_extent_uptodate(tree, cur, cur + iosize - 1,
1814                                             GFP_NOFS);
1815                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1816                         cur = cur + iosize;
1817                         page_offset += iosize;
1818                         continue;
1819                 }
1820                 /* the get_extent function already copied into the page */
1821                 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1822                         check_page_uptodate(tree, page);
1823                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1824                         cur = cur + iosize;
1825                         page_offset += iosize;
1826                         continue;
1827                 }
1828                 /* we have an inline extent but it didn't get marked
1829                  * uptodate.  Error out.
1830                  */
1831                 if (block_start == EXTENT_MAP_INLINE) {
1832                         SetPageError(page);
1833                         unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1834                         cur = cur + iosize;
1835                         page_offset += iosize;
1836                         continue;
1837                 }
1838
1839                 ret = 0;
1840                 if (tree->ops && tree->ops->readpage_io_hook) {
1841                         ret = tree->ops->readpage_io_hook(page, cur,
1842                                                           cur + iosize - 1);
1843                 }
1844                 if (!ret) {
1845                         unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1846                         pnr -= page->index;
1847                         ret = submit_extent_page(READ, tree, page,
1848                                          sector, iosize, page_offset,
1849                                          bdev, bio, pnr,
1850                                          end_bio_extent_readpage, mirror_num);
1851                         nr++;
1852                 }
1853                 if (ret)
1854                         SetPageError(page);
1855                 cur = cur + iosize;
1856                 page_offset += iosize;
1857         }
1858         if (!nr) {
1859                 if (!PageError(page))
1860                         SetPageUptodate(page);
1861                 unlock_page(page);
1862         }
1863         return 0;
1864 }
1865
1866 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1867                             get_extent_t *get_extent)
1868 {
1869         struct bio *bio = NULL;
1870         int ret;
1871
1872         ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
1873         if (bio)
1874                 submit_one_bio(READ, bio, 0);
1875         return ret;
1876 }
1877 EXPORT_SYMBOL(extent_read_full_page);
1878
1879 /*
1880  * the writepage semantics are similar to regular writepage.  extent
1881  * records are inserted to lock ranges in the tree, and as dirty areas
1882  * are found, they are marked writeback.  Then the lock bits are removed
1883  * and the end_io handler clears the writeback ranges
1884  */
1885 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1886                               void *data)
1887 {
1888         struct inode *inode = page->mapping->host;
1889         struct extent_page_data *epd = data;
1890         struct extent_io_tree *tree = epd->tree;
1891         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1892         u64 delalloc_start;
1893         u64 page_end = start + PAGE_CACHE_SIZE - 1;
1894         u64 end;
1895         u64 cur = start;
1896         u64 extent_offset;
1897         u64 last_byte = i_size_read(inode);
1898         u64 block_start;
1899         u64 iosize;
1900         u64 unlock_start;
1901         sector_t sector;
1902         struct extent_map *em;
1903         struct block_device *bdev;
1904         int ret;
1905         int nr = 0;
1906         size_t pg_offset = 0;
1907         size_t blocksize;
1908         loff_t i_size = i_size_read(inode);
1909         unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1910         u64 nr_delalloc;
1911         u64 delalloc_end;
1912
1913         WARN_ON(!PageLocked(page));
1914         pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
1915         if (page->index > end_index ||
1916            (page->index == end_index && !pg_offset)) {
1917                 page->mapping->a_ops->invalidatepage(page, 0);
1918                 unlock_page(page);
1919                 return 0;
1920         }
1921
1922         if (page->index == end_index) {
1923                 char *userpage;
1924
1925                 userpage = kmap_atomic(page, KM_USER0);
1926                 memset(userpage + pg_offset, 0,
1927                        PAGE_CACHE_SIZE - pg_offset);
1928                 kunmap_atomic(userpage, KM_USER0);
1929                 flush_dcache_page(page);
1930         }
1931         pg_offset = 0;
1932
1933         set_page_extent_mapped(page);
1934
1935         delalloc_start = start;
1936         delalloc_end = 0;
1937         while (delalloc_end < page_end) {
1938                 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1939                                                        &delalloc_end,
1940                                                        128 * 1024 * 1024);
1941                 if (nr_delalloc == 0) {
1942                         delalloc_start = delalloc_end + 1;
1943                         continue;
1944                 }
1945                 tree->ops->fill_delalloc(inode, delalloc_start,
1946                                          delalloc_end);
1947                 clear_extent_bit(tree, delalloc_start,
1948                                  delalloc_end,
1949                                  EXTENT_LOCKED | EXTENT_DELALLOC,
1950                                  1, 0, GFP_NOFS);
1951                 delalloc_start = delalloc_end + 1;
1952         }
1953         lock_extent(tree, start, page_end, GFP_NOFS);
1954         unlock_start = start;
1955
1956         if (tree->ops && tree->ops->writepage_start_hook) {
1957                 ret = tree->ops->writepage_start_hook(page, start, page_end);
1958                 if (ret == -EAGAIN) {
1959                         unlock_extent(tree, start, page_end, GFP_NOFS);
1960                         redirty_page_for_writepage(wbc, page);
1961                         unlock_page(page);
1962                         return 0;
1963                 }
1964         }
1965
1966         end = page_end;
1967         if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1968                 printk(KERN_ERR "found delalloc bits after lock_extent\n");
1969         }
1970
1971         if (last_byte <= start) {
1972                 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1973                 unlock_extent(tree, start, page_end, GFP_NOFS);
1974                 if (tree->ops && tree->ops->writepage_end_io_hook)
1975                         tree->ops->writepage_end_io_hook(page, start,
1976                                                          page_end, NULL, 1);
1977                 unlock_start = page_end + 1;
1978                 goto done;
1979         }
1980
1981         set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1982         blocksize = inode->i_sb->s_blocksize;
1983
1984         while (cur <= end) {
1985                 if (cur >= last_byte) {
1986                         clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1987                         unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
1988                         if (tree->ops && tree->ops->writepage_end_io_hook)
1989                                 tree->ops->writepage_end_io_hook(page, cur,
1990                                                          page_end, NULL, 1);
1991                         unlock_start = page_end + 1;
1992                         break;
1993                 }
1994                 em = epd->get_extent(inode, page, pg_offset, cur,
1995                                      end - cur + 1, 1);
1996                 if (IS_ERR(em) || !em) {
1997                         SetPageError(page);
1998                         break;
1999                 }
2000
2001                 extent_offset = cur - em->start;
2002                 BUG_ON(extent_map_end(em) <= cur);
2003                 BUG_ON(end < cur);
2004                 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2005                 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2006                 sector = (em->block_start + extent_offset) >> 9;
2007                 bdev = em->bdev;
2008                 block_start = em->block_start;
2009                 free_extent_map(em);
2010                 em = NULL;
2011
2012                 if (block_start == EXTENT_MAP_HOLE ||
2013                     block_start == EXTENT_MAP_INLINE) {
2014                         clear_extent_dirty(tree, cur,
2015                                            cur + iosize - 1, GFP_NOFS);
2016
2017                         unlock_extent(tree, unlock_start, cur + iosize - 1,
2018                                       GFP_NOFS);
2019
2020                         if (tree->ops && tree->ops->writepage_end_io_hook)
2021                                 tree->ops->writepage_end_io_hook(page, cur,
2022                                                          cur + iosize - 1,
2023                                                          NULL, 1);
2024                         cur = cur + iosize;
2025                         pg_offset += iosize;
2026                         unlock_start = cur;
2027                         continue;
2028                 }
2029
2030                 /* leave this out until we have a page_mkwrite call */
2031                 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2032                                    EXTENT_DIRTY, 0)) {
2033                         cur = cur + iosize;
2034                         pg_offset += iosize;
2035                         continue;
2036                 }
2037                 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2038                 if (tree->ops && tree->ops->writepage_io_hook) {
2039                         ret = tree->ops->writepage_io_hook(page, cur,
2040                                                 cur + iosize - 1);
2041                 } else {
2042                         ret = 0;
2043                 }
2044                 if (ret) {
2045                         SetPageError(page);
2046                 } else {
2047                         unsigned long max_nr = end_index + 1;
2048
2049                         set_range_writeback(tree, cur, cur + iosize - 1);
2050                         if (!PageWriteback(page)) {
2051                                 printk(KERN_WARNING "warning page %lu not writeback, "
2052                                        "cur %llu end %llu\n", page->index,
2053                                        (unsigned long long)cur,
2054                                        (unsigned long long)end);
2055                         }
2056
2057                         ret = submit_extent_page(WRITE, tree, page, sector,
2058                                                  iosize, pg_offset, bdev,
2059                                                  &epd->bio, max_nr,
2060                                                  end_bio_extent_writepage, 0);
2061                         if (ret)
2062                                 SetPageError(page);
2063                 }
2064                 cur = cur + iosize;
2065                 pg_offset += iosize;
2066                 nr++;
2067         }
2068 done:
2069         if (nr == 0) {
2070                 /* make sure the mapping tag for page dirty gets cleared */
2071                 set_page_writeback(page);
2072                 end_page_writeback(page);
2073         }
2074         if (unlock_start <= page_end)
2075                 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2076         unlock_page(page);
2077         return 0;
2078 }
2079
2080 /**
2081  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2082  * @mapping: address space structure to write
2083  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2084  * @writepage: function called for each page
2085  * @data: data passed to writepage function
2086  *
2087  * If a page is already under I/O, extent_write_cache_pages() skips it, even
2088  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2089  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2090  * and msync() need to guarantee that all the data which was dirty at the time
2091  * the call was made get new I/O started against them.  If wbc->sync_mode is
2092  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2093  * existing IO to complete.
2094  */
2095 int extent_write_cache_pages(struct extent_io_tree *tree,
2096                              struct address_space *mapping,
2097                              struct writeback_control *wbc,
2098                              writepage_t writepage, void *data)
2099 {
2100         struct backing_dev_info *bdi = mapping->backing_dev_info;
2101         int ret = 0;
2102         int done = 0;
2103         struct pagevec pvec;
2104         int nr_pages;
2105         pgoff_t index;
2106         pgoff_t end;            /* Inclusive */
2107         int scanned = 0;
2108         int range_whole = 0;
2109
2110         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2111                 wbc->encountered_congestion = 1;
2112                 return 0;
2113         }
2114
2115         pagevec_init(&pvec, 0);
2116         if (wbc->range_cyclic) {
2117                 index = mapping->writeback_index; /* Start from prev offset */
2118                 end = -1;
2119         } else {
2120                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2121                 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2122                 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2123                         range_whole = 1;
2124                 scanned = 1;
2125         }
2126 retry:
2127         while (!done && (index <= end) &&
2128                (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2129                                               PAGECACHE_TAG_DIRTY,
2130                                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2131                 unsigned i;
2132
2133                 scanned = 1;
2134                 for (i = 0; i < nr_pages; i++) {
2135                         struct page *page = pvec.pages[i];
2136
2137                         /*
2138                          * At this point we hold neither mapping->tree_lock nor
2139                          * lock on the page itself: the page may be truncated or
2140                          * invalidated (changing page->mapping to NULL), or even
2141                          * swizzled back from swapper_space to tmpfs file
2142                          * mapping
2143                          */
2144                         if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2145                                 tree->ops->write_cache_pages_lock_hook(page);
2146                         else
2147                                 lock_page(page);
2148
2149                         if (unlikely(page->mapping != mapping)) {
2150                                 unlock_page(page);
2151                                 continue;
2152                         }
2153
2154                         if (!wbc->range_cyclic && page->index > end) {
2155                                 done = 1;
2156                                 unlock_page(page);
2157                                 continue;
2158                         }
2159
2160                         if (wbc->sync_mode != WB_SYNC_NONE)
2161                                 wait_on_page_writeback(page);
2162
2163                         if (PageWriteback(page) ||
2164                             !clear_page_dirty_for_io(page)) {
2165                                 unlock_page(page);
2166                                 continue;
2167                         }
2168
2169                         ret = (*writepage)(page, wbc, data);
2170
2171                         if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2172                                 unlock_page(page);
2173                                 ret = 0;
2174                         }
2175                         if (ret || (--(wbc->nr_to_write) <= 0))
2176                                 done = 1;
2177                         if (wbc->nonblocking && bdi_write_congested(bdi)) {
2178                                 wbc->encountered_congestion = 1;
2179                                 done = 1;
2180                         }
2181                 }
2182                 pagevec_release(&pvec);
2183                 cond_resched();
2184         }
2185         if (!scanned && !done) {
2186                 /*
2187                  * We hit the last page and there is more work to be done: wrap
2188                  * back to the start of the file
2189                  */
2190                 scanned = 1;
2191                 index = 0;
2192                 goto retry;
2193         }
2194         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2195                 mapping->writeback_index = index;
2196
2197         if (wbc->range_cont)
2198                 wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT;
2199         return ret;
2200 }
2201 EXPORT_SYMBOL(extent_write_cache_pages);
2202
2203 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2204                           get_extent_t *get_extent,
2205                           struct writeback_control *wbc)
2206 {
2207         int ret;
2208         struct address_space *mapping = page->mapping;
2209         struct extent_page_data epd = {
2210                 .bio = NULL,
2211                 .tree = tree,
2212                 .get_extent = get_extent,
2213         };
2214         struct writeback_control wbc_writepages = {
2215                 .bdi            = wbc->bdi,
2216                 .sync_mode      = WB_SYNC_NONE,
2217                 .older_than_this = NULL,
2218                 .nr_to_write    = 64,
2219                 .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
2220                 .range_end      = (loff_t)-1,
2221         };
2222
2223
2224         ret = __extent_writepage(page, wbc, &epd);
2225
2226         extent_write_cache_pages(tree, mapping, &wbc_writepages,
2227                                  __extent_writepage, &epd);
2228         if (epd.bio) {
2229                 submit_one_bio(WRITE, epd.bio, 0);
2230         }
2231         return ret;
2232 }
2233 EXPORT_SYMBOL(extent_write_full_page);
2234
2235
2236 int extent_writepages(struct extent_io_tree *tree,
2237                       struct address_space *mapping,
2238                       get_extent_t *get_extent,
2239                       struct writeback_control *wbc)
2240 {
2241         int ret = 0;
2242         struct extent_page_data epd = {
2243                 .bio = NULL,
2244                 .tree = tree,
2245                 .get_extent = get_extent,
2246         };
2247
2248         ret = extent_write_cache_pages(tree, mapping, wbc,
2249                                        __extent_writepage, &epd);
2250         if (epd.bio) {
2251                 submit_one_bio(WRITE, epd.bio, 0);
2252         }
2253         return ret;
2254 }
2255 EXPORT_SYMBOL(extent_writepages);
2256
2257 int extent_readpages(struct extent_io_tree *tree,
2258                      struct address_space *mapping,
2259                      struct list_head *pages, unsigned nr_pages,
2260                      get_extent_t get_extent)
2261 {
2262         struct bio *bio = NULL;
2263         unsigned page_idx;
2264         struct pagevec pvec;
2265
2266         pagevec_init(&pvec, 0);
2267         for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2268                 struct page *page = list_entry(pages->prev, struct page, lru);
2269
2270                 prefetchw(&page->flags);
2271                 list_del(&page->lru);
2272                 /*
2273                  * what we want to do here is call add_to_page_cache_lru,
2274                  * but that isn't exported, so we reproduce it here
2275                  */
2276                 if (!add_to_page_cache(page, mapping,
2277                                         page->index, GFP_KERNEL)) {
2278
2279                         /* open coding of lru_cache_add, also not exported */
2280                         page_cache_get(page);
2281                         if (!pagevec_add(&pvec, page))
2282                                 __pagevec_lru_add(&pvec);
2283                         __extent_read_full_page(tree, page, get_extent,
2284                                                 &bio, 0);
2285                 }
2286                 page_cache_release(page);
2287         }
2288         if (pagevec_count(&pvec))
2289                 __pagevec_lru_add(&pvec);
2290         BUG_ON(!list_empty(pages));
2291         if (bio)
2292                 submit_one_bio(READ, bio, 0);
2293         return 0;
2294 }
2295 EXPORT_SYMBOL(extent_readpages);
2296
2297 /*
2298  * basic invalidatepage code, this waits on any locked or writeback
2299  * ranges corresponding to the page, and then deletes any extent state
2300  * records from the tree
2301  */
2302 int extent_invalidatepage(struct extent_io_tree *tree,
2303                           struct page *page, unsigned long offset)
2304 {
2305         u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2306         u64 end = start + PAGE_CACHE_SIZE - 1;
2307         size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2308
2309         start += (offset + blocksize - 1) & ~(blocksize - 1);
2310         if (start > end)
2311                 return 0;
2312
2313         lock_extent(tree, start, end, GFP_NOFS);
2314         wait_on_extent_writeback(tree, start, end);
2315         clear_extent_bit(tree, start, end,
2316                          EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2317                          1, 1, GFP_NOFS);
2318         return 0;
2319 }
2320 EXPORT_SYMBOL(extent_invalidatepage);
2321
2322 /*
2323  * simple commit_write call, set_range_dirty is used to mark both
2324  * the pages and the extent records as dirty
2325  */
2326 int extent_commit_write(struct extent_io_tree *tree,
2327                         struct inode *inode, struct page *page,
2328                         unsigned from, unsigned to)
2329 {
2330         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2331
2332         set_page_extent_mapped(page);
2333         set_page_dirty(page);
2334
2335         if (pos > inode->i_size) {
2336                 i_size_write(inode, pos);
2337                 mark_inode_dirty(inode);
2338         }
2339         return 0;
2340 }
2341 EXPORT_SYMBOL(extent_commit_write);
2342
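/*
 * prepare the page and extent state for a write into [from, to).
 * Blocks that are not already uptodate and would only be partially
 * overwritten are either zeroed (when newly allocated) or read from
 * disk, so the whole page is valid before the copy from userspace.
 */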
2343 int extent_prepare_write(struct extent_io_tree *tree,
2344                          struct inode *inode, struct page *page,
2345                          unsigned from, unsigned to, get_extent_t *get_extent)
2346 {
2347         u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2348         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2349         u64 block_start;
2350         u64 orig_block_start;
2351         u64 block_end;
2352         u64 cur_end;
2353         struct extent_map *em;
2354         unsigned blocksize = 1 << inode->i_blkbits;
2355         size_t page_offset = 0;
2356         size_t block_off_start;
2357         size_t block_off_end;
2358         int err = 0;
2359         int iocount = 0;
2360         int ret = 0;
2361         int isnew;
2362
2363         set_page_extent_mapped(page);
2364
2365         block_start = (page_start + from) & ~((u64)blocksize - 1);
2366         block_end = (page_start + to - 1) | (blocksize - 1);
2367         orig_block_start = block_start;
2368
2369         lock_extent(tree, page_start, page_end, GFP_NOFS);
2370         while (block_start <= block_end) {
2371                 em = get_extent(inode, page, page_offset, block_start,
2372                                 block_end - block_start + 1, 1);
2373                 if (IS_ERR(em) || !em) {
2374                         goto err;
2375                 }
2376                 cur_end = min(block_end, extent_map_end(em) - 1);
2377                 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2378                 block_off_end = block_off_start + blocksize;
2379                 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2380
2381                 if (!PageUptodate(page) && isnew &&
2382                     (block_off_end > to || block_off_start < from)) {
2383                         void *kaddr;
2384
2385                         kaddr = kmap_atomic(page, KM_USER0);
2386                         if (block_off_end > to)
2387                                 memset(kaddr + to, 0, block_off_end - to);
2388                         if (block_off_start < from)
2389                                 memset(kaddr + block_off_start, 0,
2390                                        from - block_off_start);
2391                         flush_dcache_page(page);
2392                         kunmap_atomic(kaddr, KM_USER0);
2393                 }
2394                 if ((em->block_start != EXTENT_MAP_HOLE &&
2395                      em->block_start != EXTENT_MAP_INLINE) &&
2396                     !isnew && !PageUptodate(page) &&
2397                     (block_off_end > to || block_off_start < from) &&
2398                     !test_range_bit(tree, block_start, cur_end,
2399                                     EXTENT_UPTODATE, 1)) {
2400                         u64 sector;
2401                         u64 extent_offset = block_start - em->start;
2402                         size_t iosize;
2403                         sector = (em->block_start + extent_offset) >> 9;
2404                         iosize = (cur_end - block_start + blocksize) &
2405                                 ~((u64)blocksize - 1);
2406                         /*
2407                          * we've already got the extent locked, but we
2408                          * need to split the state such that our end_bio
2409                          * handler can clear the lock.
2410                          */
2411                         set_extent_bit(tree, block_start,
2412                                        block_start + iosize - 1,
2413                                        EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2414                         ret = submit_extent_page(READ, tree, page,
2415                                          sector, iosize, page_offset, em->bdev,
2416                                          NULL, 1,
2417                                          end_bio_extent_preparewrite, 0);
2418                         iocount++;
2419                         block_start = block_start + iosize;
2420                 } else {
2421                         set_extent_uptodate(tree, block_start, cur_end,
2422                                             GFP_NOFS);
2423                         unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2424                         block_start = cur_end + 1;
2425                 }
2426                 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2427                 free_extent_map(em);
2428         }
2429         if (iocount) {
2430                 wait_extent_bit(tree, orig_block_start,
2431                                 block_end, EXTENT_LOCKED);
2432         }
2433         check_page_uptodate(tree, page);
2434 err:
2435         /* FIXME, zero out newly allocated blocks on error */
2436         return err;
2437 }
2438 EXPORT_SYMBOL(extent_prepare_write);
2439
2440 /*
2441  * a helper for releasepage, this tests for areas of the page that
2442  * are locked or under IO and drops the related state bits if it is safe
2443  * to drop the page.
2444  */
2445 int try_release_extent_state(struct extent_map_tree *map,
2446                              struct extent_io_tree *tree, struct page *page,
2447                              gfp_t mask)
2448 {
2449         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2450         u64 end = start + PAGE_CACHE_SIZE - 1;
2451         int ret = 1;
2452
2453         if (test_range_bit(tree, start, end,
2454                            EXTENT_IOBITS | EXTENT_ORDERED, 0))
2455                 ret = 0;
2456         else {
2457                 if ((mask & GFP_NOFS) == GFP_NOFS)
2458                         mask = GFP_NOFS;
2459                 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2460                                  1, 1, mask);
2461         }
2462         return ret;
2463 }
2464 EXPORT_SYMBOL(try_release_extent_state);
2465
2466 /*
2467  * a helper for releasepage.  As long as there are no locked extents
2468  * in the range corresponding to the page, both state records and extent
2469  * map records are removed
2470  */
2471 int try_release_extent_mapping(struct extent_map_tree *map,
2472                                struct extent_io_tree *tree, struct page *page,
2473                                gfp_t mask)
2474 {
2475         struct extent_map *em;
2476         u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2477         u64 end = start + PAGE_CACHE_SIZE - 1;
2478
2479         if ((mask & __GFP_WAIT) &&
2480             page->mapping->host->i_size > 16 * 1024 * 1024) {
2481                 u64 len;
2482                 while (start <= end) {
2483                         len = end - start + 1;
2484                         spin_lock(&map->lock);
2485                         em = lookup_extent_mapping(map, start, len);
2486                         if (!em || IS_ERR(em)) {
2487                                 spin_unlock(&map->lock);
2488                                 break;
2489                         }
2490                         if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2491                             em->start != start) {
2492                                 spin_unlock(&map->lock);
2493                                 free_extent_map(em);
2494                                 break;
2495                         }
2496                         if (!test_range_bit(tree, em->start,
2497                                             extent_map_end(em) - 1,
2498                                             EXTENT_LOCKED, 0)) {
2499                                 remove_extent_mapping(map, em);
2500                                 /* once for the rb tree */
2501                                 free_extent_map(em);
2502                         }
2503                         start = extent_map_end(em);
2504                         spin_unlock(&map->lock);
2505
2506                         /* once for us */
2507                         free_extent_map(em);
2508                 }
2509         }
2510         return try_release_extent_state(map, tree, page, mask);
2511 }
2512 EXPORT_SYMBOL(try_release_extent_mapping);
2513
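/*
 * bmap/FIBMAP-style helper: map a file block number to an on-disk
 * sector through get_extent.  Holes and inline extents have no single
 * block address, so they report sector 0.
 */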
2514 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2515                 get_extent_t *get_extent)
2516 {
2517         struct inode *inode = mapping->host;
2518         u64 start = iblock << inode->i_blkbits;
2519         sector_t sector = 0;
2520         struct extent_map *em;
2521
2522         em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2523         if (!em || IS_ERR(em))
2524                 return 0;
2525
2526         if (em->block_start == EXTENT_MAP_INLINE ||
2527             em->block_start == EXTENT_MAP_HOLE)
2528                 goto out;
2529
2530         sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2531 out:
2532         free_extent_map(em);
2533         return sector;
2534 }
2535
2536 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2537                                               unsigned long i)
2538 {
2539         struct page *p;
2540         struct address_space *mapping;
2541
2542         if (i == 0)
2543                 return eb->first_page;
2544         i += eb->start >> PAGE_CACHE_SHIFT;
2545         mapping = eb->first_page->mapping;
2546         if (!mapping)
2547                 return NULL;
2548
2549         /*
2550          * extent_buffer_page is only called after pinning the page
2551          * by increasing the reference count.  So we know the page must
2552          * be in the radix tree.
2553          */
2554         rcu_read_lock();
2555         p = radix_tree_lookup(&mapping->page_tree, i);
2556         rcu_read_unlock();
2557
2558         return p;
2559 }
2560
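/*
 * number of pages an extent buffer covers, including partial pages at
 * either end.  For example, with 4K pages a buffer at start 12288
 * with len 16384 runs through byte 28671 and touches pages 3..6, so
 * this returns 4.
 */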
2561 static inline unsigned long num_extent_pages(u64 start, u64 len)
2562 {
2563         return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2564                 (start >> PAGE_CACHE_SHIFT);
2565 }
2566
2567 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2568                                                    u64 start,
2569                                                    unsigned long len,
2570                                                    gfp_t mask)
2571 {
2572         struct extent_buffer *eb = NULL;
2573 #ifdef LEAK_DEBUG
2574         unsigned long flags;
2575 #endif
2576
2577         eb = kmem_cache_zalloc(extent_buffer_cache, mask);
             if (!eb)
                     return NULL;
2578         eb->start = start;
2579         eb->len = len;
2580         mutex_init(&eb->mutex);
2581 #ifdef LEAK_DEBUG
2582         spin_lock_irqsave(&leak_lock, flags);
2583         list_add(&eb->leak_list, &buffers);
2584         spin_unlock_irqrestore(&leak_lock, flags);
2585 #endif
2586         atomic_set(&eb->refs, 1);
2587
2588         return eb;
2589 }
2590
2591 static void __free_extent_buffer(struct extent_buffer *eb)
2592 {
2593 #ifdef LEAK_DEBUG
2594         unsigned long flags;
2595         spin_lock_irqsave(&leak_lock, flags);
2596         list_del(&eb->leak_list);
2597         spin_unlock_irqrestore(&leak_lock, flags);
2598 #endif
2599         kmem_cache_free(extent_buffer_cache, eb);
2600 }
2601
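/*
 * find or create the extent buffer covering [start, start + len).
 * An existing buffer in the tree just gets an extra reference;
 * otherwise the pages are found or created in the page cache and the
 * new buffer is inserted, falling back to whatever buffer another
 * thread managed to insert first.
 */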
2602 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2603                                           u64 start, unsigned long len,
2604                                           struct page *page0,
2605                                           gfp_t mask)
2606 {
2607         unsigned long num_pages = num_extent_pages(start, len);
2608         unsigned long i;
2609         unsigned long index = start >> PAGE_CACHE_SHIFT;
2610         struct extent_buffer *eb;
2611         struct extent_buffer *exists = NULL;
2612         struct page *p;
2613         struct address_space *mapping = tree->mapping;
2614         int uptodate = 1;
2615
2616         spin_lock(&tree->buffer_lock);
2617         eb = buffer_search(tree, start);
2618         if (eb) {
2619                 atomic_inc(&eb->refs);
2620                 spin_unlock(&tree->buffer_lock);
2621                 mark_page_accessed(eb->first_page);
2622                 return eb;
2623         }
2624         spin_unlock(&tree->buffer_lock);
2625
2626         eb = __alloc_extent_buffer(tree, start, len, mask);
2627         if (!eb)
2628                 return NULL;
2629
2630         if (page0) {
2631                 eb->first_page = page0;
2632                 i = 1;
2633                 index++;
2634                 page_cache_get(page0);
2635                 mark_page_accessed(page0);
2636                 set_page_extent_mapped(page0);
2637                 set_page_extent_head(page0, len);
2638                 uptodate = PageUptodate(page0);
2639         } else {
2640                 i = 0;
2641         }
2642         for (; i < num_pages; i++, index++) {
2643                 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2644                 if (!p) {
2645                         WARN_ON(1);
2646                         goto free_eb;
2647                 }
2648                 set_page_extent_mapped(p);
2649                 mark_page_accessed(p);
2650                 if (i == 0) {
2651                         eb->first_page = p;
2652                         set_page_extent_head(p, len);
2653                 } else {
2654                         set_page_private(p, EXTENT_PAGE_PRIVATE);
2655                 }
2656                 if (!PageUptodate(p))
2657                         uptodate = 0;
2658                 unlock_page(p);
2659         }
2660         if (uptodate)
2661                 eb->flags |= EXTENT_UPTODATE;
2662         eb->flags |= EXTENT_BUFFER_FILLED;
2663
2664         spin_lock(&tree->buffer_lock);
2665         exists = buffer_tree_insert(tree, start, &eb->rb_node);
2666         if (exists) {
2667                 /* add one reference for the caller */
2668                 atomic_inc(&exists->refs);
2669                 spin_unlock(&tree->buffer_lock);
2670                 goto free_eb;
2671         }
2672         spin_unlock(&tree->buffer_lock);
2673
2674         /* add one reference for the tree */
2675         atomic_inc(&eb->refs);
2676         return eb;
2677
2678 free_eb:
2679         if (!atomic_dec_and_test(&eb->refs))
2680                 return exists;
2681         for (index = 1; index < i; index++)
2682                 page_cache_release(extent_buffer_page(eb, index));
2683         page_cache_release(extent_buffer_page(eb, 0));
2684         __free_extent_buffer(eb);
2685         return exists;
2686 }
2687 EXPORT_SYMBOL(alloc_extent_buffer);
2688
2689 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2690                                          u64 start, unsigned long len,
2691                                           gfp_t mask)
2692 {
2693         struct extent_buffer *eb;
2694
2695         spin_lock(&tree->buffer_lock);
2696         eb = buffer_search(tree, start);
2697         if (eb)
2698                 atomic_inc(&eb->refs);
2699         spin_unlock(&tree->buffer_lock);
2700
2701         if (eb)
2702                 mark_page_accessed(eb->first_page);
2703
2704         return eb;
2705 }
2706 EXPORT_SYMBOL(find_extent_buffer);
2707
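/*
 * drop a reference on an extent buffer.  The tree holds its own
 * reference (taken in alloc_extent_buffer), so the count is not
 * expected to reach zero here; if it does, the WARN_ON below fires.
 */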
2708 void free_extent_buffer(struct extent_buffer *eb)
2709 {
2710         if (!eb)
2711                 return;
2712
2713         if (!atomic_dec_and_test(&eb->refs))
2714                 return;
2715
2716         WARN_ON(1);
2717 }
2718 EXPORT_SYMBOL(free_extent_buffer);
2719
2720 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2721                               struct extent_buffer *eb)
2722 {
2723         int set;
2724         unsigned long i;
2725         unsigned long num_pages;
2726         struct page *page;
2727
2728         u64 start = eb->start;
2729         u64 end = start + eb->len - 1;
2730
2731         set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2732         num_pages = num_extent_pages(eb->start, eb->len);
2733
2734         for (i = 0; i < num_pages; i++) {
2735                 page = extent_buffer_page(eb, i);
2736                 lock_page(page);
2737                 if (i == 0)
2738                         set_page_extent_head(page, eb->len);
2739                 else
2740                         set_page_private(page, EXTENT_PAGE_PRIVATE);
2741
2742                 /*
2743                  * if we're on the last page or the first page and the
2744                  * block isn't aligned on a page boundary, do extra checks
2745                  * to make sure we don't clean a page that is partially dirty
2746                  */
2747                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2748                     ((i == num_pages - 1) &&
2749                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2750                         start = (u64)page->index << PAGE_CACHE_SHIFT;
2751                         end  = start + PAGE_CACHE_SIZE - 1;
2752                         if (test_range_bit(tree, start, end,
2753                                            EXTENT_DIRTY, 0)) {
2754                                 unlock_page(page);
2755                                 continue;
2756                         }
2757                 }
2758                 clear_page_dirty_for_io(page);
2759                 spin_lock_irq(&page->mapping->tree_lock);
2760                 if (!PageDirty(page)) {
2761                         radix_tree_tag_clear(&page->mapping->page_tree,
2762                                                 page_index(page),
2763                                                 PAGECACHE_TAG_DIRTY);
2764                 }
2765                 spin_unlock_irq(&page->mapping->tree_lock);
2766                 unlock_page(page);
2767         }
2768         return 0;
2769 }
2770 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2771
2772 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2773                                     struct extent_buffer *eb)
2774 {
2775         return wait_on_extent_writeback(tree, eb->start,
2776                                         eb->start + eb->len - 1);
2777 }
2778 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2779
2780 int set_extent_buffer_dirty(struct extent_io_tree *tree,
2781                              struct extent_buffer *eb)
2782 {
2783         unsigned long i;
2784         unsigned long num_pages;
2785
2786         num_pages = num_extent_pages(eb->start, eb->len);
2787         for (i = 0; i < num_pages; i++) {
2788                 struct page *page = extent_buffer_page(eb, i);
2789                 /* writepage may need to do something special for the
2790                  * first page, we have to make sure page->private is
2791                  * properly set.  releasepage may drop page->private
2792                  * on us if the page isn't already dirty.
2793                  */
2794                 lock_page(page);
2795                 if (i == 0) {
2796                         set_page_extent_head(page, eb->len);
2797                 } else if (PagePrivate(page) &&
2798                            page->private != EXTENT_PAGE_PRIVATE) {
2799                         set_page_extent_mapped(page);
2800                 }
2801                 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2802                 set_extent_dirty(tree, page_offset(page),
2803                                  page_offset(page) + PAGE_CACHE_SIZE -1,
2804                                  GFP_NOFS);
2805                 unlock_page(page);
2806         }
2807         return 0;
2808 }
2809 EXPORT_SYMBOL(set_extent_buffer_dirty);
2810
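/*
 * clear uptodate tracking for an extent buffer: drop the buffer flag,
 * clear the range in the extent tree and clear the per-page uptodate
 * bits.
 */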
2811 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
2812                                 struct extent_buffer *eb)
2813 {
2814         unsigned long i;
2815         struct page *page;
2816         unsigned long num_pages;
2817
2818         num_pages = num_extent_pages(eb->start, eb->len);
2819         eb->flags &= ~EXTENT_UPTODATE;
2820
2821         clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2822                               GFP_NOFS);
2823         for (i = 0; i < num_pages; i++) {
2824                 page = extent_buffer_page(eb, i);
2825                 if (page)
2826                         ClearPageUptodate(page);
2827         }
2828         return 0;
2829 }
2830
2831 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2832                                 struct extent_buffer *eb)
2833 {
2834         unsigned long i;
2835         struct page *page;
2836         unsigned long num_pages;
2837
2838         num_pages = num_extent_pages(eb->start, eb->len);
2839
2840         set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2841                             GFP_NOFS);
2842         for (i = 0; i < num_pages; i++) {
2843                 page = extent_buffer_page(eb, i);
2844                 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2845                     ((i == num_pages - 1) &&
2846                      ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2847                         check_page_uptodate(tree, page);
2848                         continue;
2849                 }
2850                 SetPageUptodate(page);
2851         }
2852         return 0;
2853 }
2854 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2855
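/*
 * return 1 if every byte in [start, end] is uptodate, either because
 * the extent tree says so or because each page in the range has its
 * page-level uptodate bit set.
 */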
2856 int extent_range_uptodate(struct extent_io_tree *tree,
2857                           u64 start, u64 end)
2858 {
2859         struct page *page;
2860         int ret;
2861         int pg_uptodate = 1;
2862         int uptodate;
2863         unsigned long index;
2864
2865         ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
2866         if (ret)
2867                 return 1;
2868         while (start <= end) {
2869                 index = start >> PAGE_CACHE_SHIFT;
2870                 page = find_get_page(tree->mapping, index);
                     if (!page) {
                             pg_uptodate = 0;
                             break;
                     }
2871                 uptodate = PageUptodate(page);
2872                 page_cache_release(page);
2873                 if (!uptodate) {
2874                         pg_uptodate = 0;
2875                         break;
2876                 }
2877                 start += PAGE_CACHE_SIZE;
2878         }
2879         return pg_uptodate;
2880 }
2881
2882 int extent_buffer_uptodate(struct extent_io_tree *tree,
2883                            struct extent_buffer *eb)
2884 {
2885         int ret = 0;
2886         unsigned long num_pages;
2887         unsigned long i;
2888         struct page *page;
2889         int pg_uptodate = 1;
2890
2891         if (eb->flags & EXTENT_UPTODATE)
2892                 return 1;
2893
2894         ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2895                            EXTENT_UPTODATE, 1);
2896         if (ret)
2897                 return ret;
2898
2899         num_pages = num_extent_pages(eb->start, eb->len);
2900         for (i = 0; i < num_pages; i++) {
2901                 page = extent_buffer_page(eb, i);
2902                 if (!PageUptodate(page)) {
2903                         pg_uptodate = 0;
2904                         break;
2905                 }
2906         }
2907         return pg_uptodate;
2908 }
2909 EXPORT_SYMBOL(extent_buffer_uptodate);
2910
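/*
 * Editor's note: read_extent_buffer_pages() reads the pages backing an
 * extent buffer from disk.  'start' (0 or a byte offset inside the buffer)
 * selects the first page to read, 'wait' chooses blocking vs. trylock
 * behaviour, and 'mirror_num' is handed down to the bio submission code so
 * retries can target a specific copy.
 */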
2911 int read_extent_buffer_pages(struct extent_io_tree *tree,
2912                              struct extent_buffer *eb,
2913                              u64 start, int wait,
2914                              get_extent_t *get_extent, int mirror_num)
2915 {
2916         unsigned long i;
2917         unsigned long start_i;
2918         struct page *page;
2919         int err;
2920         int ret = 0;
2921         int locked_pages = 0;
2922         int all_uptodate = 1;
2923         int inc_all_pages = 0;
2924         unsigned long num_pages;
2925         struct bio *bio = NULL;
2926
2927         if (eb->flags & EXTENT_UPTODATE)
2928                 return 0;
2929
2930         if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2931                            EXTENT_UPTODATE, 1)) {
2932                 return 0;
2933         }
2934
2935         if (start) {
2936                 WARN_ON(start < eb->start);
2937                 start_i = (start >> PAGE_CACHE_SHIFT) -
2938                         (eb->start >> PAGE_CACHE_SHIFT);
2939         } else {
2940                 start_i = 0;
2941         }
2942
2943         num_pages = num_extent_pages(eb->start, eb->len);
2944         for (i = start_i; i < num_pages; i++) {
2945                 page = extent_buffer_page(eb, i);
2946                 if (!wait) {
2947                         if (!trylock_page(page))
2948                                 goto unlock_exit;
2949                 } else {
2950                         lock_page(page);
2951                 }
2952                 locked_pages++;
2953                 if (!PageUptodate(page)) {
2954                         all_uptodate = 0;
2955                 }
2956         }
2957         if (all_uptodate) {
2958                 if (start_i == 0)
2959                         eb->flags |= EXTENT_UPTODATE;
2960                 if (ret) {
2961                         printk(KERN_ERR "all up to date but ret is %d\n", ret);
2962                 }
2963                 goto unlock_exit;
2964         }
2965
2966         for (i = start_i; i < num_pages; i++) {
2967                 page = extent_buffer_page(eb, i);
2968                 if (inc_all_pages)
2969                         page_cache_get(page);
2970                 if (!PageUptodate(page)) {
2971                         if (start_i == 0)
2972                                 inc_all_pages = 1;
2973                         ClearPageError(page);
2974                         err = __extent_read_full_page(tree, page,
2975                                                       get_extent, &bio,
2976                                                       mirror_num);
2977                         if (err) {
2978                                 ret = err;
2979                                 printk(KERN_ERR "err %d from __extent_read_full_page\n", ret);
2980                         }
2981                 } else {
2982                         unlock_page(page);
2983                 }
2984         }
2985
2986         if (bio)
2987                 submit_one_bio(READ, bio, mirror_num);
2988
2989         if (ret || !wait) {
2990                 if (ret)
2991                         printk(KERN_ERR "ret %d wait %d returning\n", ret, wait);
2992                 return ret;
2993         }
2994         for (i = start_i; i < num_pages; i++) {
2995                 page = extent_buffer_page(eb, i);
2996                 wait_on_page_locked(page);
2997                 if (!PageUptodate(page)) {
2998                         printk(KERN_ERR "page not uptodate after wait_on_page_locked\n");
2999                         ret = -EIO;
3000                 }
3001         }
3002         if (!ret)
3003                 eb->flags |= EXTENT_UPTODATE;
3004         return ret;
3005
3006 unlock_exit:
3007         i = start_i;
3008         while (locked_pages > 0) {
3009                 page = extent_buffer_page(eb, i);
3010                 i++;
3011                 unlock_page(page);
3012                 locked_pages--;
3013         }
3014         return ret;
3015 }
3016 EXPORT_SYMBOL(read_extent_buffer_pages);
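/*
 * Example (hypothetical helper, editor's illustration only): a typical
 * synchronous caller reads every page of the buffer, waits for the IO and
 * then double checks the uptodate state before using the contents.  The
 * get_extent callback is whatever the caller already uses for metadata.
 */
static inline int example_read_whole_buffer(struct extent_io_tree *tree,
                                            struct extent_buffer *eb,
                                            get_extent_t *get_extent)
{
        int ret;

        /* start == 0 reads from the first page, wait == 1 blocks for the IO */
        ret = read_extent_buffer_pages(tree, eb, 0, 1, get_extent, 0);
        if (ret)
                return ret;
        return extent_buffer_uptodate(tree, eb) ? 0 : -EIO;
}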
3017
3018 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3019                         unsigned long start,
3020                         unsigned long len)
3021 {
3022         size_t cur;
3023         size_t offset;
3024         struct page *page;
3025         char *kaddr;
3026         char *dst = (char *)dstv;
3027         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3028         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3029
3030         WARN_ON(start > eb->len);
3031         WARN_ON(start + len > eb->start + eb->len);
3032
3033         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3034
3035         while (len > 0) {
3036                 page = extent_buffer_page(eb, i);
3037
3038                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3039                 kaddr = kmap_atomic(page, KM_USER1);
3040                 memcpy(dst, kaddr + offset, cur);
3041                 kunmap_atomic(kaddr, KM_USER1);
3042
3043                 dst += cur;
3044                 len -= cur;
3045                 offset = 0;
3046                 i++;
3047         }
3048 }
3049 EXPORT_SYMBOL(read_extent_buffer);
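/*
 * Example (hypothetical helper, editor's illustration only):
 * read_extent_buffer() hides the page boundaries, so a caller can copy any
 * byte range out of the logical buffer without caring which page it lands
 * in.  'offset' is a byte offset inside the buffer.
 */
static inline u64 example_read_u64(struct extent_buffer *eb,
                                   unsigned long offset)
{
        u64 val;

        read_extent_buffer(eb, &val, offset, sizeof(val));
        return val;
}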
3050
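/*
 * Editor's note: map_private_extent_buffer() maps the single page covering
 * [start, start + min_len).  *map points at the first buffer byte stored in
 * that page, *map_start is that byte's offset inside the buffer, and
 * *map_len is how many bytes can be accessed from *map without crossing
 * into the next page.  Ranges that straddle a page boundary are rejected
 * with -EINVAL.
 */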
3051 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3052                                unsigned long min_len, char **token, char **map,
3053                                unsigned long *map_start,
3054                                unsigned long *map_len, int km)
3055 {
3056         size_t offset = start & (PAGE_CACHE_SIZE - 1);
3057         char *kaddr;
3058         struct page *p;
3059         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3060         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3061         unsigned long end_i = (start_offset + start + min_len - 1) >>
3062                 PAGE_CACHE_SHIFT;
3063
3064         if (i != end_i)
3065                 return -EINVAL;
3066
3067         if (i == 0) {
3068                 offset = start_offset;
3069                 *map_start = 0;
3070         } else {
3071                 offset = 0;
3072                 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3073         }
3074         if (start + min_len > eb->len) {
3075                 printk(KERN_ERR "bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
3076                 WARN_ON(1);
3077         }
3078
3079         p = extent_buffer_page(eb, i);
3080         kaddr = kmap_atomic(p, km);
3081         *token = kaddr;
3082         *map = kaddr + offset;
3083         *map_len = PAGE_CACHE_SIZE - offset;
3084         return 0;
3085 }
3086 EXPORT_SYMBOL(map_private_extent_buffer);
3087
3088 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3089                       unsigned long min_len,
3090                       char **token, char **map,
3091                       unsigned long *map_start,
3092                       unsigned long *map_len, int km)
3093 {
3094         int err;
3095         int save = 0;
3096         if (eb->map_token) {
3097                 unmap_extent_buffer(eb, eb->map_token, km);
3098                 eb->map_token = NULL;
3099                 save = 1;
3100         }
3101         err = map_private_extent_buffer(eb, start, min_len, token, map,
3102                                        map_start, map_len, km);
3103         if (!err && save) {
3104                 eb->map_token = *token;
3105                 eb->kaddr = *map;
3106                 eb->map_start = *map_start;
3107                 eb->map_len = *map_len;
3108         }
3109         return err;
3110 }
3111 EXPORT_SYMBOL(map_extent_buffer);
3112
3113 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3114 {
3115         kunmap_atomic(token, km);
3116 }
3117 EXPORT_SYMBOL(unmap_extent_buffer);
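/*
 * Example (hypothetical helper, editor's illustration only): the usual
 * map/access/unmap pattern.  When the requested range straddles a page
 * boundary the mapping helpers return -EINVAL and the caller has to fall
 * back to the copying helpers such as read_extent_buffer().
 */
static inline int example_peek_byte(struct extent_buffer *eb,
                                    unsigned long offset, char *out)
{
        unsigned long map_start;
        unsigned long map_len;
        char *token;
        char *kaddr;
        int err;

        err = map_private_extent_buffer(eb, offset, 1, &token, &kaddr,
                                        &map_start, &map_len, KM_USER0);
        if (err)
                return err;
        /* kaddr points at buffer offset map_start, index relative to it */
        *out = kaddr[offset - map_start];
        unmap_extent_buffer(eb, token, KM_USER0);
        return 0;
}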
3118
3119 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3120                           unsigned long start,
3121                           unsigned long len)
3122 {
3123         size_t cur;
3124         size_t offset;
3125         struct page *page;
3126         char *kaddr;
3127         char *ptr = (char *)ptrv;
3128         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3129         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3130         int ret = 0;
3131
3132         WARN_ON(start > eb->len);
3133         WARN_ON(start + len > eb->start + eb->len);
3134
3135         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3136
3137         while (len > 0) {
3138                 page = extent_buffer_page(eb, i);
3139
3140                 cur = min(len, (PAGE_CACHE_SIZE - offset));
3141
3142                 kaddr = kmap_atomic(page, KM_USER0);
3143                 ret = memcmp(ptr, kaddr + offset, cur);
3144                 kunmap_atomic(kaddr, KM_USER0);
3145                 if (ret)
3146                         break;
3147
3148                 ptr += cur;
3149                 len -= cur;
3150                 offset = 0;
3151                 i++;
3152         }
3153         return ret;
3154 }
3155 EXPORT_SYMBOL(memcmp_extent_buffer);
3156
3157 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3158                          unsigned long start, unsigned long len)
3159 {
3160         size_t cur;
3161         size_t offset;
3162         struct page *page;
3163         char *kaddr;
3164         char *src = (char *)srcv;
3165         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3166         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3167
3168         WARN_ON(start > eb->len);
3169         WARN_ON(start + len > eb->start + eb->len);
3170
3171         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3172
3173         while (len > 0) {
3174                 page = extent_buffer_page(eb, i);
3175                 WARN_ON(!PageUptodate(page));
3176
3177                 cur = min(len, PAGE_CACHE_SIZE - offset);
3178                 kaddr = kmap_atomic(page, KM_USER1);
3179                 memcpy(kaddr + offset, src, cur);
3180                 kunmap_atomic(kaddr, KM_USER1);
3181
3182                 src += cur;
3183                 len -= cur;
3184                 offset = 0;
3185                 i++;
3186         }
3187 }
3188 EXPORT_SYMBOL(write_extent_buffer);
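/*
 * Example (hypothetical helper, editor's illustration only): updating a
 * field in place.  The pages have to be uptodate already (the WARN_ON above
 * fires otherwise), and the caller is still responsible for marking the
 * buffer dirty afterwards, e.g. via set_extent_buffer_dirty().
 */
static inline void example_write_u64(struct extent_buffer *eb,
                                     unsigned long offset, u64 val)
{
        write_extent_buffer(eb, &val, offset, sizeof(val));
}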
3189
3190 void memset_extent_buffer(struct extent_buffer *eb, char c,
3191                           unsigned long start, unsigned long len)
3192 {
3193         size_t cur;
3194         size_t offset;
3195         struct page *page;
3196         char *kaddr;
3197         size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3198         unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3199
3200         WARN_ON(start > eb->len);
3201         WARN_ON(start + len > eb->start + eb->len);
3202
3203         offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3204
3205         while (len > 0) {
3206                 page = extent_buffer_page(eb, i);
3207                 WARN_ON(!PageUptodate(page));
3208
3209                 cur = min(len, PAGE_CACHE_SIZE - offset);
3210                 kaddr = kmap_atomic(page, KM_USER0);
3211                 memset(kaddr + offset, c, cur);
3212                 kunmap_atomic(kaddr, KM_USER0);
3213
3214                 len -= cur;
3215                 offset = 0;
3216                 i++;
3217         }
3218 }
3219 EXPORT_SYMBOL(memset_extent_buffer);
3220
3221 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3222                         unsigned long dst_offset, unsigned long src_offset,
3223                         unsigned long len)
3224 {
3225         u64 dst_len = dst->len;
3226         size_t cur;
3227         size_t offset;
3228         struct page *page;
3229         char *kaddr;
3230         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3231         unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3232
3233         WARN_ON(src->len != dst_len);
3234
3235         offset = (start_offset + dst_offset) &
3236                 ((unsigned long)PAGE_CACHE_SIZE - 1);
3237
3238         while (len > 0) {
3239                 page = extent_buffer_page(dst, i);
3240                 WARN_ON(!PageUptodate(page));
3241
3242                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3243
3244                 kaddr = kmap_atomic(page, KM_USER0);
3245                 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3246                 kunmap_atomic(kaddr, KM_USER0);
3247
3248                 src_offset += cur;
3249                 len -= cur;
3250                 offset = 0;
3251                 i++;
3252         }
3253 }
3254 EXPORT_SYMBOL(copy_extent_buffer);
3255
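/*
 * Editor's note on the two helpers below, used by memcpy_extent_buffer()
 * and memmove_extent_buffer(): copy_pages() does a plain forward memcpy
 * between two mapped pages, while move_pages() is safe for overlapping
 * ranges -- it uses memmove() when source and destination share a page and
 * a backwards byte copy otherwise.
 */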
3256 static void move_pages(struct page *dst_page, struct page *src_page,
3257                        unsigned long dst_off, unsigned long src_off,
3258                        unsigned long len)
3259 {
3260         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3261         if (dst_page == src_page) {
3262                 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3263         } else {
3264                 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3265                 char *p = dst_kaddr + dst_off + len;
3266                 char *s = src_kaddr + src_off + len;
3267
3268                 while (len--)
3269                         *--p = *--s;
3270
3271                 kunmap_atomic(src_kaddr, KM_USER1);
3272         }
3273         kunmap_atomic(dst_kaddr, KM_USER0);
3274 }
3275
3276 static void copy_pages(struct page *dst_page, struct page *src_page,
3277                        unsigned long dst_off, unsigned long src_off,
3278                        unsigned long len)
3279 {
3280         char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3281         char *src_kaddr;
3282
3283         if (dst_page != src_page)
3284                 src_kaddr = kmap_atomic(src_page, KM_USER1);
3285         else
3286                 src_kaddr = dst_kaddr;
3287
3288         memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3289         kunmap_atomic(dst_kaddr, KM_USER0);
3290         if (dst_page != src_page)
3291                 kunmap_atomic(src_kaddr, KM_USER1);
3292 }
3293
3294 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3295                            unsigned long src_offset, unsigned long len)
3296 {
3297         size_t cur;
3298         size_t dst_off_in_page;
3299         size_t src_off_in_page;
3300         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3301         unsigned long dst_i;
3302         unsigned long src_i;
3303
3304         if (src_offset + len > dst->len) {
3305                 printk(KERN_ERR "memcpy bogus src_offset %lu move len %lu "
3306                        "dst len %lu\n", src_offset, len, dst->len);
3307                 BUG_ON(1);
3308         }
3309         if (dst_offset + len > dst->len) {
3310                 printk(KERN_ERR "memcpy bogus dst_offset %lu move len %lu "
3311                        "dst len %lu\n", dst_offset, len, dst->len);
3312                 BUG_ON(1);
3313         }
3314
3315         while (len > 0) {
3316                 dst_off_in_page = (start_offset + dst_offset) &
3317                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3318                 src_off_in_page = (start_offset + src_offset) &
3319                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3320
3321                 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3322                 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3323
3324                 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3325                                                src_off_in_page));
3326                 cur = min_t(unsigned long, cur,
3327                         (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3328
3329                 copy_pages(extent_buffer_page(dst, dst_i),
3330                            extent_buffer_page(dst, src_i),
3331                            dst_off_in_page, src_off_in_page, cur);
3332
3333                 src_offset += cur;
3334                 dst_offset += cur;
3335                 len -= cur;
3336         }
3337 }
3338 EXPORT_SYMBOL(memcpy_extent_buffer);
3339
3340 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3341                            unsigned long src_offset, unsigned long len)
3342 {
3343         size_t cur;
3344         size_t dst_off_in_page;
3345         size_t src_off_in_page;
3346         unsigned long dst_end = dst_offset + len - 1;
3347         unsigned long src_end = src_offset + len - 1;
3348         size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3349         unsigned long dst_i;
3350         unsigned long src_i;
3351
3352         if (src_offset + len > dst->len) {
3353                 printk(KERN_ERR "memmove bogus src_offset %lu move len %lu "
3354                        "dst len %lu\n", src_offset, len, dst->len);
3355                 BUG_ON(1);
3356         }
3357         if (dst_offset + len > dst->len) {
3358                 printk(KERN_ERR "memmove bogus dst_offset %lu move len %lu "
3359                        "dst len %lu\n", dst_offset, len, dst->len);
3360                 BUG_ON(1);
3361         }
3362         if (dst_offset < src_offset) {
3363                 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3364                 return;
3365         }
3366         while (len > 0) {
3367                 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3368                 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3369
3370                 dst_off_in_page = (start_offset + dst_end) &
3371                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3372                 src_off_in_page = (start_offset + src_end) &
3373                         ((unsigned long)PAGE_CACHE_SIZE - 1);
3374
3375                 cur = min_t(unsigned long, len, src_off_in_page + 1);
3376                 cur = min(cur, dst_off_in_page + 1);
3377                 move_pages(extent_buffer_page(dst, dst_i),
3378                            extent_buffer_page(dst, src_i),
3379                            dst_off_in_page - cur + 1,
3380                            src_off_in_page - cur + 1, cur);
3381
3382                 dst_end -= cur;
3383                 src_end -= cur;
3384                 len -= cur;
3385         }
3386 }
3387 EXPORT_SYMBOL(memmove_extent_buffer);
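/*
 * Example (hypothetical helper, editor's illustration only): opening up a
 * gap of 'size' bytes at 'pos' inside a buffer that currently holds 'used'
 * valid bytes.  Source and destination overlap here, so
 * memmove_extent_buffer() has to be used rather than memcpy_extent_buffer().
 */
static inline void example_insert_gap(struct extent_buffer *eb,
                                      unsigned long pos, unsigned long size,
                                      unsigned long used)
{
        memmove_extent_buffer(eb, pos + size, pos, used - pos);
}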
3388
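/*
 * Editor's note: used on the releasepage path.  Drop the extent buffer for
 * this page if nobody else holds a reference.  Returns 1 when the buffer was
 * freed (or none was found) and 0 when it is still in use.
 */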
3389 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3390 {
3391         u64 start = page_offset(page);
3392         struct extent_buffer *eb;
3393         int ret = 1;
3394         unsigned long i;
3395         unsigned long num_pages;
3396
3397         spin_lock(&tree->buffer_lock);
3398         eb = buffer_search(tree, start);
3399         if (!eb)
3400                 goto out;
3401
3402         if (atomic_read(&eb->refs) > 1) {
3403                 ret = 0;
3404                 goto out;
3405         }
3406         /* at this point we can safely release the extent buffer */
3407         num_pages = num_extent_pages(eb->start, eb->len);
3408         for (i = 0; i < num_pages; i++)
3409                 page_cache_release(extent_buffer_page(eb, i));
3410         rb_erase(&eb->rb_node, &tree->buffer);
3411         __free_extent_buffer(eb);
3412 out:
3413         spin_unlock(&tree->buffer_lock);
3414         return ret;
3415 }
3416 EXPORT_SYMBOL(try_release_extent_buffer);