diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f714f873bf9daea68d6e469a9fa3d8ef38c1f125..e99a5234d9ed86261cc56a882e42f957079cc87d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -400,6 +400,7 @@ struct rcu_data {
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	struct rcu_head oom_head;
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+	atomic_long_t exp_workdone0;	/* # done by workqueue. */
 	atomic_long_t exp_workdone1;	/* # done by others #1. */
 	atomic_long_t exp_workdone2;	/* # done by others #2. */
 	atomic_long_t exp_workdone3;	/* # done by others #3. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 1549f456fb7b73f79c1c0fdead126ca0f3fed92f..97f5ffe42b58aa6c1d5ac94dd181a88788bc6b32 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -500,7 +500,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 	 * next GP, to proceed.
 	 */
 	mutex_lock(&rsp->exp_wake_mutex);
-	mutex_unlock(&rsp->exp_mutex);
 
 	rcu_for_each_node_breadth_first(rsp, rnp) {
 		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
@@ -516,6 +515,29 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 	mutex_unlock(&rsp->exp_wake_mutex);
 }
 
+/* Let the workqueue handler know what it is supposed to do. */
+struct rcu_exp_work {
+	smp_call_func_t rew_func;	/* Passed to sync_rcu_exp_select_cpus(). */
+	struct rcu_state *rew_rsp;	/* rcu_state the handler operates on. */
+	unsigned long rew_s;		/* Sequence value for rcu_exp_wait_wake(). */
+	struct work_struct rew_work;	/* Work item; container_of() anchor. */
+};
+
+/*
+ * Workqueue callback: run the expedited grace period described by @wp.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+	struct rcu_exp_work *req = container_of(wp, struct rcu_exp_work,
+						rew_work);
+	struct rcu_state *rsp = req->rew_rsp;
+
+	/* Prepare the rcu_node tree for the upcoming wait. */
+	sync_rcu_exp_select_cpus(rsp, req->rew_func);
+	/* Wait and clean up, including waking everyone. */
+	rcu_exp_wait_wake(rsp, req->rew_s);
+}
+
 /*
  * Given an rcu_state pointer and a smp_call_function() handler, kick
  * off the specified flavor of expedited grace period.
@@ -523,6 +545,9 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 static void _synchronize_rcu_expedited(struct rcu_state *rsp,
 				       smp_call_func_t func)
 {
+	struct rcu_data *rdp;
+	struct rcu_exp_work rew;
+	struct rcu_node *rnp;
 	unsigned long s;
 
 	/* If expedited grace periods are prohibited, fall back to normal. */
@@ -536,11 +561,28 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
 	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
-	/* Initialize the rcu_node tree in preparation for the wait. */
-	sync_rcu_exp_select_cpus(rsp, func);
-
-	/* Wait and clean up, including waking everyone. */
-	rcu_exp_wait_wake(rsp, s);
+	/* Marshall arguments and schedule the expedited grace period. */
+	rew.rew_func = func;
+	rew.rew_rsp = rsp;
+	rew.rew_s = s;
+	INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+	schedule_work(&rew.rew_work);
+
+	/* Wait for expedited grace period to complete. */
+	rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+	rnp = rcu_get_root(rsp);
+	wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+		   sync_exp_work_done(rsp,
+				      &rdp->exp_workdone0, s));
+
+	/* Let the next expedited grace period start. */
+	mutex_unlock(&rsp->exp_mutex);
+
+	/*
+	 * Pair with INIT_WORK_ONSTACK() above: the work item lives in this
+	 * stack frame, so tear down its tracking state before returning.
+	 */
+	destroy_work_on_stack(&rew.rew_work);
 }
 
 /**
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 86782f9a460432614725f0e234785a6b49d9f0db..b1f28972872cb1fe3edd4a835ad3cbb5d692886a 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,16 +185,17 @@ static int show_rcuexp(struct seq_file *m, void *v)
 	int cpu;
 	struct rcu_state *rsp = (struct rcu_state *)m->private;
 	struct rcu_data *rdp;
-	unsigned long s1 = 0, s2 = 0, s3 = 0;
+	unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
 
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(rsp->rda, cpu);
+		s0 += atomic_long_read(&rdp->exp_workdone0);
 		s1 += atomic_long_read(&rdp->exp_workdone1);
 		s2 += atomic_long_read(&rdp->exp_workdone2);
 		s3 += atomic_long_read(&rdp->exp_workdone3);
 	}
-	seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
-		   rsp->expedited_sequence, s1, s2, s3,
+	seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+		   rsp->expedited_sequence, s0, s1, s2, s3,
 		   atomic_long_read(&rsp->expedited_normal),
 		   atomic_read(&rsp->expedited_need_qs),
 		   rsp->expedited_sequence / 2);