spin_unlock_irqrestore(&dst->lock, flags);
 
        /*
-        * Make the list addition visible before sending the ipi.
+        * The list addition should be visible before the IPI is sent;
+        * the handler locks the list to pull the entry off it, so the
+        * normal cache coherency rules implied by spinlocks provide
+        * that ordering and no explicit barrier is needed here.
+        *
+        * If IPIs can go out of order with respect to the cache
+        * coherency protocol on an architecture, sufficient
+        * synchronisation should be added to arch code to make IPIs
+        * appear to obey cache coherency WRT locking and barrier
+        * primitives. Generic code isn't really equipped to do the
+        * right thing...
         */
-       smp_mb();
 
        if (ipi)
                arch_send_call_function_single_ipi(cpu);
        struct call_function_data *data;
        int cpu = get_cpu();
 
+       /*
+        * Ensure the entry is visible on call_function_queue after we
+        * have entered the IPI handler. See the comment in
+        * smp_call_function_many.
+        * If we don't have this, then we may miss an entry on the list
+        * and never get another IPI to process it.
+        */
+       smp_mb();
+
        /*
         * It's ok to use list_for_each_rcu() here even though we may delete
         * 'pos', since list_del_rcu() doesn't clear ->next
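
(To make the failure mode in the new comment concrete: a user-space
sketch of the handler side in C11 atomics, where atomic_thread_fence
stands in for smp_mb() and all names are hypothetical. It pairs with
the sender-side sketch after smp_call_function_many below.)

#include <stdatomic.h>
#include <stddef.h>

struct entry { struct entry *next; };

extern struct entry *_Atomic queue_head;  /* hypothetical shared queue */
extern void process(struct entry *e);     /* hypothetical callback */

void ipi_handler(void)
{
        /*
         * smp_mb() analogue: order the queue read after IPI reception.
         * Without it, the load below could be satisfied by a stale,
         * pre-addition value; the entry would then sit on the queue
         * with no further IPI coming to process it.
         */
        atomic_thread_fence(memory_order_seq_cst);

        for (struct entry *e = atomic_load_explicit(&queue_head,
                                memory_order_relaxed); e; e = e->next)
                process(e);
}
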
 {
        struct call_single_queue *q = &__get_cpu_var(call_single_queue);
        LIST_HEAD(list);
+       unsigned int data_flags;
 
-       /*
-        * Need to see other stores to list head for checking whether
-        * list is empty without holding q->lock
-        */
-       smp_read_barrier_depends();
-       while (!list_empty(&q->list)) {
-               unsigned int data_flags;
-
-               spin_lock(&q->lock);
-               list_replace_init(&q->list, &list);
-               spin_unlock(&q->lock);
+       spin_lock(&q->lock);
+       list_replace_init(&q->list, &list);
+       spin_unlock(&q->lock);
 
-               while (!list_empty(&list)) {
-                       struct call_single_data *data;
+       while (!list_empty(&list)) {
+               struct call_single_data *data;
 
-                       data = list_entry(list.next, struct call_single_data,
-                                               list);
-                       list_del(&data->list);
+               data = list_entry(list.next, struct call_single_data,
+                                       list);
+               list_del(&data->list);
 
-                       /*
-                        * 'data' can be invalid after this call if
-                        * flags == 0 (when called through
-                        * generic_exec_single(), so save them away before
-                        * making the call.
-                        */
-                       data_flags = data->flags;
-
-                       data->func(data->info);
-
-                       if (data_flags & CSD_FLAG_WAIT) {
-                               smp_wmb();
-                               data->flags &= ~CSD_FLAG_WAIT;
-                       } else if (data_flags & CSD_FLAG_LOCK) {
-                               smp_wmb();
-                               data->flags &= ~CSD_FLAG_LOCK;
-                       } else if (data_flags & CSD_FLAG_ALLOC)
-                               kfree(data);
-               }
                /*
-                * See comment on outer loop
+                * 'data' can be invalid after this call if
+                * flags == 0 (when called through
+                * generic_exec_single()), so save the flags away
+                * before making the call.
                 */
-               smp_read_barrier_depends();
+               data_flags = data->flags;
+
+               data->func(data->info);
+
+               if (data_flags & CSD_FLAG_WAIT) {
+                       smp_wmb();
+                       data->flags &= ~CSD_FLAG_WAIT;
+               } else if (data_flags & CSD_FLAG_LOCK) {
+                       smp_wmb();
+                       data->flags &= ~CSD_FLAG_LOCK;
+               } else if (data_flags & CSD_FLAG_ALLOC)
+                       kfree(data);
        }
 }
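
(The simplified handler above is the splice-under-lock pattern: detach
the whole queue in O(1) while holding the lock, then run the callbacks
with the lock dropped. A user-space sketch, with a pthread mutex for
the spinlock and hypothetical names throughout; it also shows why the
flags must be sampled before func() runs, since func() may free the
element.)

#include <pthread.h>
#include <stdlib.h>

struct csd {                            /* stand-in for call_single_data */
        struct csd *next;
        void (*func)(void *info);
        void *info;
        unsigned int flags;
};
#define FLAG_FREE 0x01                  /* hypothetical CSD_FLAG_ALLOC analogue */

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static struct csd *q_head;              /* protected by q_lock */

static void run_queue(void)
{
        struct csd *list, *data;
        unsigned int flags;

        pthread_mutex_lock(&q_lock);
        list = q_head;                  /* the splice: analogue of */
        q_head = NULL;                  /* list_replace_init()     */
        pthread_mutex_unlock(&q_lock);

        while (list) {
                data = list;
                list = data->next;

                /* 'data' may be freed once func() runs; sample flags first */
                flags = data->flags;

                data->func(data->info);

                if (flags & FLAG_FREE)
                        free(data);
        }
}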
 
 
        /*
         * Make the list addition visible before sending the ipi.
+        * (IPIs must obey or appear to obey normal Linux cache coherency
+        * rules -- see comment in generic_exec_single).
         */
        smp_mb();
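
(And the matching sender side for the fence sketched at the top of
generic_smp_call_function_interrupt: publish the entry, issue the
smp_mb() analogue, then kick. A single-sender user-space sketch in
C11 atomics; queue_head and send_ipi are hypothetical.)

#include <stdatomic.h>
#include <stddef.h>

struct entry { struct entry *next; };

extern struct entry *_Atomic queue_head;  /* hypothetical shared queue */
extern void send_ipi(void);               /* hypothetical arch hook */

void queue_and_kick(struct entry *e)
{
        e->next = atomic_load_explicit(&queue_head, memory_order_relaxed);
        atomic_store_explicit(&queue_head, e, memory_order_relaxed);

        /*
         * smp_mb() analogue: the addition must be globally visible
         * before the IPI can be observed; pairs with the fence at
         * IPI-handler entry.
         */
        atomic_thread_fence(memory_order_seq_cst);
        send_ipi();
}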