diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index dcd8a563ab522f2593fd71b8e5dc8bf3dcaea0ab..128a67a40065f25292035455f603075f8504fb8d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -292,6 +292,8 @@ struct io_ring_ctx {
 	struct {
 		spinlock_t		completion_lock;
 
+		bool			poll_multi_queue;
+
 		/*
 		 * ->iopoll_list is protected by the ctx->uring_lock for
 		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -300,7 +302,6 @@ struct io_ring_ctx {
 		 */
 		struct io_wq_work_list	iopoll_list;
 		struct io_hash_table	cancel_table;
-		bool			poll_multi_queue;
 
 		struct llist_head	work_llist;
 
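Note: these two hunks only relocate poll_multi_queue within the same completion block of struct io_ring_ctx, moving it up next to completion_lock; no member is added or removed overall. Reconstructed from the hunk context alone (members and comment lines that fall outside the hunks are elided), the block now reads:

struct {
	spinlock_t		completion_lock;

	bool			poll_multi_queue;

	/*
	 * ->iopoll_list is protected by the ctx->uring_lock for
	 * io_uring instances that don't use IORING_SETUP_SQPOLL.
	 * ... (remainder of the comment elided)
	 */
	struct io_wq_work_list	iopoll_list;
	struct io_hash_table	cancel_table;

	struct llist_head	work_llist;

	/* ... remaining members elided ... */
};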
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 6f1d0e5df23ad815479904639282bf7a8b4b3b33..992dcd9f8c4cf31a5579a651b8b044c1ea4bfac4 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1230,6 +1230,7 @@ static void io_wq_cancel_tw_create(struct io_wq *wq)
 
 		worker = container_of(cb, struct io_worker, create_work);
 		io_worker_cancel_cb(worker);
+		kfree(worker);
 	}
 }
 
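Note: the io-wq hunk plugs a leak on the task_work cancellation path. io_worker_cancel_cb() drops the refs and accounting taken when the create request was queued, but it does not free the struct io_worker itself; since the cancelled task_work never runs, nothing downstream will free it either, so the canceller has to kfree() it. A minimal self-contained sketch of that ownership rule, using illustrative names rather than the io-wq API:

/*
 * Illustrative names only, not io-wq code. The rule the hunk enforces:
 * whoever cancels a pending callback inherits responsibility for freeing
 * the object that carried it, because the callback will never get to run.
 */
#include <stdlib.h>

struct create_req {
	void (*fn)(struct create_req *);
	/* ... payload describing the worker to create ... */
};

/* Normal path: the queued callback runs and consumes the allocation. */
void create_fn(struct create_req *req)
{
	/* ... spawn the worker ... */
	free(req);
}

/* Cancel path: the callback will never run, so free the carrier here too. */
void cancel_create(struct create_req *req)
{
	/* ... undo accounting taken when the request was queued ... */
	free(req);	/* omitting this is exactly the leak being fixed */
}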
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 58ac13b69dc8dc8a9ae54497091c12be55ec1d48..2ac1cd8d23ea623cb371f7472a35a45cf9e358b6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -731,6 +731,8 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
 	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
 
+	lockdep_assert_held(&ctx->completion_lock);
+
 	if (is_cqe32)
 		ocq_size += sizeof(struct io_uring_cqe);
 
@@ -820,9 +822,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 {
 	struct io_uring_cqe *cqe;
 
-	if (!ctx->task_complete)
-		lockdep_assert_held(&ctx->completion_lock);
-
 	ctx->cq_extra++;
 
 	/*
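Note: the two hunks above are two halves of one lockdep change. Queueing an overflow CQE now asserts ->completion_lock unconditionally, while io_fill_cqe_aux() drops its weaker, task_complete-conditional assert: the CQE allocation path used by io_fill_cqe_aux() reaches io_get_cqe_overflow() via io_get_cqe(), and that helper (per the io_uring.h hunks below) now runs the more thorough io_lockdep_assert_cq_locked() check instead. A sketch of the resulting io_cqring_event_overflow() prologue; the parts outside the hunk are filled in from mainline io_uring and should be treated as approximate:

static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags, u64 extra1, u64 extra2)
{
	size_t ocq_size = sizeof(struct io_overflow_cqe);
	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);

	/* overflow CQEs are only ever queued with ->completion_lock held */
	lockdep_assert_held(&ctx->completion_lock);

	if (is_cqe32)
		ocq_size += sizeof(struct io_uring_cqe);

	/* ... allocate the io_overflow_cqe and queue it for later flushing ... */
}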
@@ -1236,13 +1235,18 @@ static void io_req_local_work_add(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+	percpu_ref_get(&ctx->refs);
+
+	if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
+		percpu_ref_put(&ctx->refs);
 		return;
+	}
 	/* need it for the following io_cqring_wake() */
 	smp_mb__after_atomic();
 
 	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
 		io_move_task_work_from_local(ctx);
+		percpu_ref_put(&ctx->refs);
 		return;
 	}
 
@@ -1252,6 +1256,7 @@ static void io_req_local_work_add(struct io_kiocb *req)
 	if (ctx->has_evfd)
 		io_eventfd_signal(ctx);
 	__io_cqring_wake(ctx);
+	percpu_ref_put(&ctx->refs);
 }
 
 void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
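Note: the two io_req_local_work_add() hunks pin the ring with percpu_ref_get(&ctx->refs) before the request is published on the lockless ctx->work_llist, and drop that reference on every exit path once the function is finished touching ctx. Stitched together (with the few lines between the hunks elided), the resulting flow is:

static void io_req_local_work_add(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	/* pin the ctx before the request becomes visible to consumers */
	percpu_ref_get(&ctx->refs);

	if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
		percpu_ref_put(&ctx->refs);	/* early return: drop the ref taken above */
		return;
	}
	/* need it for the following io_cqring_wake() */
	smp_mb__after_atomic();

	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
		io_move_task_work_from_local(ctx);
		percpu_ref_put(&ctx->refs);	/* early return: drop the ref taken above */
		return;
	}

	/* ... lines between the two hunks elided ... */

	if (ctx->has_evfd)
		io_eventfd_signal(ctx);
	__io_cqring_wake(ctx);
	percpu_ref_put(&ctx->refs);
}

Once a request is on work_llist it can be consumed, and the ring's references dropped, by another thread immediately, so the extra percpu reference keeps ctx valid for the eventfd signal and wakeup that follow the publish.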
@@ -2465,7 +2470,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
 /* when returns >0, the caller should retry */
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq,
-					  ktime_t timeout)
+					  ktime_t *timeout)
 {
 	int ret;
 	unsigned long check_cq;
@@ -2483,7 +2488,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
 			return -EBADR;
 	}
-	if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
+	if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
 		return -ETIME;
 
 	/*
@@ -2559,7 +2564,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		}
 		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
-		ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
+		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
 		if (__io_cqring_events_user(ctx) >= min_events)
 			break;
 		cond_resched();
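Note: across the three hunks above, io_cqring_wait() now keeps a single ktime_t and hands its address down, so io_cqring_wait_schedule() forwards the caller's timeout to schedule_hrtimeout() (which already takes a ktime_t *) instead of taking the address of a fresh by-value copy on every retry of the wait loop. A condensed view of the resulting relationship, reconstructed from the hunks with unrelated checks elided:

/* callee: the timeout now arrives by pointer and is passed through as-is */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
					  struct io_wait_queue *iowq,
					  ktime_t *timeout)
{
	/* ... task_work, signal and overflow checks elided ... */
	if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
		return -ETIME;
	/* ... */
}

/* caller, inside the retry loop of io_cqring_wait(): */
	ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);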
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e9f0d41ebb9960189930e2699669fc206b0b764f..ab4b2a1c3b7e80fba53d0c090962fd99a475aea4 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -79,6 +79,19 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
 
+#define io_lockdep_assert_cq_locked(ctx)				\
+	do {								\
+		if (ctx->flags & IORING_SETUP_IOPOLL) {			\
+			lockdep_assert_held(&ctx->uring_lock);		\
+		} else if (!ctx->task_complete) {			\
+			lockdep_assert_held(&ctx->completion_lock);	\
+		} else if (ctx->submitter_task->flags & PF_EXITING) {	\
+			lockdep_assert(current_work());			\
+		} else {						\
+			lockdep_assert(current == ctx->submitter_task);	\
+		}							\
+	} while (0)
+
 static inline void io_req_task_work_add(struct io_kiocb *req)
 {
 	__io_req_task_work_add(req, true);
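Note: the new io_lockdep_assert_cq_locked() macro records, in one place, which synchronization rule protects CQ posting for each ring configuration: IOPOLL rings post completions under uring_lock; rings that do not use task_complete (i.e. completions are not funneled through a single task) must hold completion_lock; otherwise only ctx->submitter_task may post, except once that task is exiting, when posting happens from fallback work on a workqueue, hence the current_work() check. All of the checks compile away when lockdep is disabled. A sketch of the intended use; the helper name below is made up for illustration, and the real user added by this patch is io_get_cqe_overflow() in the next hunk:

/* Hypothetical helper, only to show where the check is meant to sit. */
static inline void io_touch_cq(struct io_ring_ctx *ctx)
{
	io_lockdep_assert_cq_locked(ctx);
	/* ... safe to touch the CQ ring and cached CQ state here ... */
}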
@@ -92,6 +105,8 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx);
 static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
 						       bool overflow)
 {
+	io_lockdep_assert_cq_locked(ctx);
+
 	if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
 		struct io_uring_cqe *cqe = ctx->cqe_cached;