#include <linux/elevator.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
+#include <linux/blktrace_api.h>
/*
* tunables
#define RQ_CIC(rq) \
((struct cfq_io_context *) (rq)->elevator_private)
-#define RQ_CFQQ(rq) ((rq)->elevator_private2)
+#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
static DEFINE_PER_CPU(unsigned long, ioc_count);
static struct completion *ioc_gone;
+static DEFINE_SPINLOCK(ioc_gone_lock);
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
struct cfq_queue {
/* reference count */
atomic_t ref;
+ /* various state flags, see below */
+ unsigned int flags;
/* parent cfq_data */
struct cfq_data *cfqd;
/* service_tree member */
int queued[2];
/* currently allocated requests */
int allocated[2];
- /* pending metadata requests */
- int meta_pending;
/* fifo list of requests in sort_list */
struct list_head fifo;
unsigned long slice_end;
long slice_resid;
+ /* pending metadata requests */
+ int meta_pending;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class, org_ioprio_class;
- /* various state flags, see below */
- unsigned int flags;
+ pid_t pid;
};
enum cfqq_state_flags {
CFQ_CFQQ_FNS(sync);
#undef CFQ_CFQQ_FNS
+#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
+ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
+#define cfq_log(cfqd, fmt, args...) \
+ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
+
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, int,
struct io_context *, gfp_t);
*/
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
- if (cfqd->busy_queues)
+ if (cfqd->busy_queues) {
+ cfq_log(cfqd, "schedule dispatch");
kblockd_schedule_work(&cfqd->unplug_work);
+ }
}
static int cfq_queue_empty(struct request_queue *q)
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+ cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}
/*
*/
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
+ cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
BUG_ON(cfq_cfqq_on_rr(cfqq));
cfq_mark_cfqq_on_rr(cfqq);
cfqd->busy_queues++;
*/
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
+ cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
BUG_ON(!cfq_cfqq_on_rr(cfqq));
cfq_clear_cfqq_on_rr(cfqq);
struct cfq_data *cfqd = q->elevator->elevator_data;
cfqd->rq_in_driver++;
+ cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
+ cfqd->rq_in_driver);
/*
* If the depth is larger 1, it really could be queueing. But lets
WARN_ON(!cfqd->rq_in_driver);
cfqd->rq_in_driver--;
+ cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
+ cfqd->rq_in_driver);
}
static void cfq_remove_request(struct request *rq)
struct cfq_queue *cfqq)
{
if (cfqq) {
+ cfq_log_cfqq(cfqd, cfqq, "set_active");
cfqq->slice_end = 0;
cfq_clear_cfqq_must_alloc_slice(cfqq);
cfq_clear_cfqq_fifo_expire(cfqq);
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
int timed_out)
{
+ cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
+
if (cfq_cfqq_wait_request(cfqq))
del_timer(&cfqd->idle_slice_timer);
/*
* store what was left of this slice, if the queue idled/timed out
*/
- if (timed_out && !cfq_cfqq_slice_new(cfqq))
+ if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
cfqq->slice_resid = cfqq->slice_end - jiffies;
+ cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
+ }
cfq_resort_rr_list(cfqd, cfqq);
if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
return;
+ /*
+ * still requests with the driver, don't idle
+ */
+ if (cfqd->rq_in_driver)
+ return;
+
/*
* task has exited, don't wait
*/
sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+ cfq_log(cfqd, "arm_idle: %lu", sl);
}
/*
struct cfq_data *cfqd = q->elevator->elevator_data;
struct cfq_queue *cfqq = RQ_CFQQ(rq);
+ cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
+
cfq_remove_request(rq);
cfqq->dispatched++;
elv_dispatch_sort(q, rq);
rq = rq_entry_fifo(cfqq->fifo.next);
if (time_before(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo]))
- return NULL;
+ rq = NULL;
+ cfq_log_cfqq(cfqd, cfqq, "fifo=%p", rq);
return rq;
}
BUG_ON(cfqd->busy_queues);
+ cfq_log(cfqd, "forced_dispatch=%d\n", dispatched);
return dispatched;
}
dispatched += __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
}
+ cfq_log(cfqd, "dispatched=%d", dispatched);
return dispatched;
}
if (!atomic_dec_and_test(&cfqq->ref))
return;
+ cfq_log_cfqq(cfqd, cfqq, "put_queue");
BUG_ON(rb_first(&cfqq->sort_list));
BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
BUG_ON(cfq_cfqq_on_rr(cfqq));
}
/*
- * Call func for each cic attached to this ioc. Returns number of cic's seen.
+ * Must always be called with the rcu_read_lock() held
*/
-static unsigned int
-call_for_each_cic(struct io_context *ioc,
- void (*func)(struct io_context *, struct cfq_io_context *))
+static void
+__call_for_each_cic(struct io_context *ioc,
+ void (*func)(struct io_context *, struct cfq_io_context *))
{
struct cfq_io_context *cic;
struct hlist_node *n;
- int called = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) {
+ hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
func(ioc, cic);
- called++;
- }
+}
+
+/*
+ * Call func for each cic attached to this ioc.
+ */
+static void
+call_for_each_cic(struct io_context *ioc,
+ void (*func)(struct io_context *, struct cfq_io_context *))
+{
+ rcu_read_lock();
+ __call_for_each_cic(ioc, func);
rcu_read_unlock();
+}
+
+static void cfq_cic_free_rcu(struct rcu_head *head)
+{
+ struct cfq_io_context *cic;
+
+ cic = container_of(head, struct cfq_io_context, rcu_head);
+
+ kmem_cache_free(cfq_ioc_pool, cic);
+ elv_ioc_count_dec(ioc_count);
+
+ if (ioc_gone) {
+ /*
+ * CFQ scheduler is exiting, grab exit lock and check
+ * the pending io context count. If it hits zero,
+ * complete ioc_gone and set it back to NULL
+ */
+ spin_lock(&ioc_gone_lock);
+ if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
+ complete(ioc_gone);
+ ioc_gone = NULL;
+ }
+ spin_unlock(&ioc_gone_lock);
+ }
+}
- return called;
+static void cfq_cic_free(struct cfq_io_context *cic)
+{
+ call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
}
static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
hlist_del_rcu(&cic->cic_list);
spin_unlock_irqrestore(&ioc->lock, flags);
- kmem_cache_free(cfq_ioc_pool, cic);
+ cfq_cic_free(cic);
}
+/*
+ * Must be called with rcu_read_lock() held or preemption otherwise disabled.
+ * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
+ * and ->trim() which is called with the task lock held
+ */
static void cfq_free_io_context(struct io_context *ioc)
{
- int freed;
-
/*
- * ioc->refcount is zero here, so no more cic's are allowed to be
- * linked into this ioc. So it should be ok to iterate over the known
- * list, we will see all cic's since no new ones are added.
+ * ioc->refcount is zero here, or we are called from elv_unregister(),
+ * so no more cic's are allowed to be linked into this ioc. So it
+ * should be ok to iterate over the known list, we will see all cic's
+ * since no new ones are added.
*/
- freed = call_for_each_cic(ioc, cic_free_func);
-
- elv_ioc_count_mod(ioc_count, -freed);
-
- if (ioc_gone && !elv_ioc_count_read(ioc_count))
- complete(ioc_gone);
+ __call_for_each_cic(ioc, cic_free_func);
}
static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
struct cfq_io_context *cic)
{
+ struct io_context *ioc = cic->ioc;
+
list_del_init(&cic->queue_list);
/*
cic->dead_key = (unsigned long) cic->key;
cic->key = NULL;
+ if (ioc->ioc_data == cic)
+ rcu_assign_pointer(ioc->ioc_data, NULL);
+
if (cic->cfqq[ASYNC]) {
cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
cic->cfqq[ASYNC] = NULL;
*/
static void cfq_exit_io_context(struct io_context *ioc)
{
- rcu_assign_pointer(ioc->ioc_data, NULL);
call_for_each_cic(ioc, cfq_exit_single_io_context);
}
printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
case IOPRIO_CLASS_NONE:
/*
- * no prio set, place us in the middle of the BE classes
+ * no prio set, inherit CPU scheduling settings
*/
cfqq->ioprio = task_nice_ioprio(tsk);
- cfqq->ioprio_class = IOPRIO_CLASS_BE;
+ cfqq->ioprio_class = task_nice_ioclass(tsk);
break;
case IOPRIO_CLASS_RT:
cfqq->ioprio = task_ioprio(ioc);
cfq_mark_cfqq_idle_window(cfqq);
cfq_mark_cfqq_sync(cfqq);
}
+ cfqq->pid = current->pid;
+ cfq_log_cfqq(cfqd, cfqq, "alloced");
}
if (new_cfqq)
return cfqq;
}
-static void cfq_cic_free(struct cfq_io_context *cic)
-{
- kmem_cache_free(cfq_ioc_pool, cic);
- elv_ioc_count_dec(ioc_count);
-
- if (ioc_gone && !elv_ioc_count_read(ioc_count))
- complete(ioc_gone);
-}
-
/*
* We drop cfq io contexts lazily, so we may find a dead one.
*/
spin_lock_irqsave(&ioc->lock, flags);
- if (ioc->ioc_data == cic)
- rcu_assign_pointer(ioc->ioc_data, NULL);
+ BUG_ON(ioc->ioc_data == cic);
radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
hlist_del_rcu(&cic->cic_list);
cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
struct cfq_io_context *cic;
+ unsigned long flags;
void *k;
if (unlikely(!ioc))
return NULL;
+ rcu_read_lock();
+
/*
* we maintain a last-hit cache, to avoid browsing over the tree
*/
cic = rcu_dereference(ioc->ioc_data);
- if (cic && cic->key == cfqd)
+ if (cic && cic->key == cfqd) {
+ rcu_read_unlock();
return cic;
+ }
do {
- rcu_read_lock();
cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
rcu_read_unlock();
if (!cic)
k = cic->key;
if (unlikely(!k)) {
cfq_drop_dead_cic(cfqd, ioc, cic);
+ rcu_read_lock();
continue;
}
+ spin_lock_irqsave(&ioc->lock, flags);
rcu_assign_pointer(ioc->ioc_data, cic);
+ spin_unlock_irqrestore(&ioc->lock, flags);
break;
} while (1);
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct cfq_io_context *cic)
{
- int enable_idle;
+ int old_idle, enable_idle;
/*
* Don't idle for async or idle io prio class
if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
return;
- enable_idle = cfq_cfqq_idle_window(cfqq);
+ enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
(cfqd->hw_tag && CIC_SEEKY(cic)))
enable_idle = 1;
}
- if (enable_idle)
- cfq_mark_cfqq_idle_window(cfqq);
- else
- cfq_clear_cfqq_idle_window(cfqq);
+ if (old_idle != enable_idle) {
+ cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
+ if (enable_idle)
+ cfq_mark_cfqq_idle_window(cfqq);
+ else
+ cfq_clear_cfqq_idle_window(cfqq);
+ }
}
/*
*/
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
+ cfq_log_cfqq(cfqd, cfqq, "preempt");
cfq_slice_expired(cfqd, 1);
/*
struct cfq_data *cfqd = q->elevator->elevator_data;
struct cfq_queue *cfqq = RQ_CFQQ(rq);
+ cfq_log_cfqq(cfqd, cfqq, "insert_request");
cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
cfq_add_rq_rb(rq);
unsigned long now;
now = jiffies;
+ cfq_log_cfqq(cfqd, cfqq, "complete");
WARN_ON(!cfqd->rq_in_driver);
WARN_ON(!cfqq->dispatched);
cfq_schedule_dispatch(cfqd);
spin_unlock_irqrestore(q->queue_lock, flags);
+ cfq_log(cfqd, "set_request fail");
return 1;
}
unsigned long flags;
int timed_out = 1;
+ cfq_log(cfqd, "idle timer fired");
+
spin_lock_irqsave(cfqd->queue->queue_lock, flags);
cfqq = cfqd->active_queue;
static void cfq_slab_kill(void)
{
+ /*
+ * Caller already ensured that pending RCU callbacks are completed,
+ * so we should have no busy allocations at this point.
+ */
if (cfq_pool)
kmem_cache_destroy(cfq_pool);
if (cfq_ioc_pool)
if (!cfq_pool)
goto fail;
- cfq_ioc_pool = KMEM_CACHE(cfq_io_context, SLAB_DESTROY_BY_RCU);
+ cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
if (!cfq_ioc_pool)
goto fail;
ioc_gone = &all_gone;
/* ioc_gone's update must be visible before reading ioc_count */
smp_wmb();
+
+ /*
+ * this also protects us from entering cfq_slab_kill() with
+ * pending RCU callbacks
+ */
if (elv_ioc_count_read(ioc_count))
- wait_for_completion(ioc_gone);
- synchronize_rcu();
+ wait_for_completion(&all_gone);
cfq_slab_kill();
}