}
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
-static DEFINE_PER_CPU(cpumask_t, valid_cpu_mask);
 
 static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 {
-       int       cpu;
-       cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask);
        int       lowest_prio = -1;
+       int       lowest_cpu  = -1;
        int       count       = 0;
+       int       cpu;
 
-       cpus_clear(*lowest_mask);
-       cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed);
+       cpus_and(*lowest_mask, cpu_online_map, task->cpus_allowed);
 
        /*
         * Scan each rq for the lowest prio.
         */
-       for_each_cpu_mask(cpu, *valid_mask) {
+       for_each_cpu_mask(cpu, *lowest_mask) {
                struct rq *rq = cpu_rq(cpu);
 
                /* We look for lowest RT prio or non-rt CPU */
                if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-                       if (count)
+                       /*
+                        * If we already found a low-prio RT queue
+                        * and have now found this non-RT queue,
+                        * clear the mask and set only our bit.
+                        * Otherwise just return the mask as is,
+                        * and count == 1 will cause the algorithm
+                        * to use the first bit found.
+                        */
+                       if (lowest_cpu != -1) {
                                cpus_clear(*lowest_mask);
-                       cpu_set(rq->cpu, *lowest_mask);
+                               cpu_set(rq->cpu, *lowest_mask);
+                       }
                        return 1;
                }
 
                        if (rq->rt.highest_prio > lowest_prio) {
                                /* new low - clear old data */
                                lowest_prio = rq->rt.highest_prio;
-                               if (count) {
-                                       cpus_clear(*lowest_mask);
-                                       count = 0;
-                               }
+                               lowest_cpu = cpu;
+                               count = 0;
                        }
-                       cpu_set(rq->cpu, *lowest_mask);
                        count++;
+               } else
+                       cpu_clear(cpu, *lowest_mask);
+       }
+
+       /*
+        * Clear out all the set bits that represent
+        * runqueues that were of higher prio than
+        * the lowest_prio.
+        */
+       if (lowest_cpu > 0) {
+               /*
+                * Perhaps we could add another cpumask op to
+                * zero out bits below a given index, e.g.
+                * cpu_zero_bits(cpumask, nrbits); that could then
+                * be optimized to use memset and such.
+                */
+               for_each_cpu_mask(cpu, *lowest_mask) {
+                       if (cpu >= lowest_cpu)
+                               break;
+                       cpu_clear(cpu, *lowest_mask);
                }
        }