diff --git a/.clang-format b/.clang-format
index 2c61b4553374b994be3a795cd8600b930d5e9213..d988e9fa9b265324df3f8d637f7d21fbd6f08776 100644
--- a/.clang-format
+++ b/.clang-format
@@ -226,7 +226,6 @@ ForEachMacros:
   - 'for_each_console_srcu'
   - 'for_each_cpu'
   - 'for_each_cpu_and'
-  - 'for_each_cpu_not'
   - 'for_each_cpu_wrap'
   - 'for_each_dapm_widgets'
   - 'for_each_dedup_cand'
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 96d13cb7c19f085cc62395011a9499cdfa2515a6..15f6cfddcc080b47693f2b6212d4ca5fab6bf3e0 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -783,11 +783,9 @@ __init void prefill_possible_map(void)
 
 static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
 {
-	cpumask_t tmp_map;
 	int cpu;
 
-	cpumask_complement(&tmp_map, cpu_present_mask);
-	cpu = cpumask_first(&tmp_map);
+	cpu = cpumask_first_zero(cpu_present_mask);
 	if (cpu >= nr_cpu_ids)
 		return -EINVAL;
 
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 10c92bd9b8070a5e3c6e006502c4405b89c150af..8fbe76607965e9e3de8e948c8b923a086bbce763 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -50,8 +50,41 @@ static inline void set_nr_cpu_ids(unsigned int nr)
 #endif
 }
 
-/* Deprecated. Always use nr_cpu_ids. */
-#define nr_cpumask_bits	nr_cpu_ids
+/*
+ * We have several different "preferred sizes" for the cpumask
+ * operations, depending on operation.
+ *
+ * For example, the bitmap scanning and operating operations have
+ * optimized routines that work for the single-word case, but only when
+ * the size is constant. So if NR_CPUS fits in one single word, we are
+ * better off using that small constant, in order to trigger the
+ * optimized bit finding. That is 'small_cpumask_size'.
+ *
+ * The clearing and copying operations will similarly perform better
+ * with a constant size, but we limit that size arbitrarily to four
+ * words. We call this 'large_cpumask_size'.
+ *
+ * Finally, some operations just want the exact limit, either because
+ * they set bits or just don't have any faster fixed-sized versions. We
+ * call this just 'nr_cpumask_size'.
+ *
+ * Note that these optional constants are always guaranteed to be at
+ * least as big as 'nr_cpu_ids' itself is, and all our cpumask
+ * allocations are at least that size (see cpumask_size()). The
+ * optimization comes from being able to potentially use a compile-time
+ * constant instead of a run-time generated exact number of CPUs.
+ */
+#if NR_CPUS <= BITS_PER_LONG
+  #define small_cpumask_bits ((unsigned int)NR_CPUS)
+  #define large_cpumask_bits ((unsigned int)NR_CPUS)
+#elif NR_CPUS <= 4*BITS_PER_LONG
+  #define small_cpumask_bits nr_cpu_ids
+  #define large_cpumask_bits ((unsigned int)NR_CPUS)
+#else
+  #define small_cpumask_bits nr_cpu_ids
+  #define large_cpumask_bits nr_cpu_ids
+#endif
+#define nr_cpumask_bits nr_cpu_ids
 
 /*
  * The following particular system cpumasks and operations manage
@@ -126,7 +159,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
  */
 static inline unsigned int cpumask_first(const struct cpumask *srcp)
 {
-	return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
+	return find_first_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
@@ -137,7 +170,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
  */
 static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 {
-	return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits);
+	return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
@@ -150,7 +183,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 static inline
 unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
 {
-	return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
+	return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
 }
 
 /**
@@ -161,7 +194,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
  */
 static inline unsigned int cpumask_last(const struct cpumask *srcp)
 {
-	return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+	return find_last_bit(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
@@ -177,7 +210,7 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp)
 	/* -1 is a legal arg here. */
 	if (n != -1)
 		cpumask_check(n);
-	return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
+	return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1);
 }
 
 /**
@@ -192,7 +225,7 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 	/* -1 is a legal arg here. */
 	if (n != -1)
 		cpumask_check(n);
-	return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
+	return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1);
 }
 
 #if NR_CPUS == 1
@@ -235,7 +268,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
 	if (n != -1)
 		cpumask_check(n);
 	return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
-		nr_cpumask_bits, n + 1);
+		small_cpumask_bits, n + 1);
 }
 
 /**
@@ -246,17 +279,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
  * After the loop, cpu is >= nr_cpu_ids.
  */
 #define for_each_cpu(cpu, mask)				\
-	for_each_set_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
-
-/**
- * for_each_cpu_not - iterate over every cpu in a complemented mask
- * @cpu: the (optionally unsigned) integer iterator
- * @mask: the cpumask pointer
- *
- * After the loop, cpu is >= nr_cpu_ids.
- */
-#define for_each_cpu_not(cpu, mask)				\
-	for_each_clear_bit(cpu, cpumask_bits(mask), nr_cpumask_bits)
+	for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
 
 #if NR_CPUS == 1
 static inline
@@ -290,7 +313,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
  * After the loop, cpu is >= nr_cpu_ids.
  */
 #define for_each_cpu_wrap(cpu, mask, start)				\
-	for_each_set_bit_wrap(cpu, cpumask_bits(mask), nr_cpumask_bits, start)
+	for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start)
 
 /**
  * for_each_cpu_and - iterate over every cpu in both masks
@@ -307,7 +330,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
  * After the loop, cpu is >= nr_cpu_ids.
  */
 #define for_each_cpu_and(cpu, mask1, mask2)				\
-	for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
+	for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
 
 /**
  * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
@@ -325,7 +348,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
  * After the loop, cpu is >= nr_cpu_ids.
  */
 #define for_each_cpu_andnot(cpu, mask1, mask2)				\
-	for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), nr_cpumask_bits)
+	for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
 
 /**
  * cpumask_any_but - return a "random" in a cpumask, but not this one.
@@ -356,7 +379,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
  */
 static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
 {
-	return find_nth_bit(cpumask_bits(srcp), nr_cpumask_bits, cpumask_check(cpu));
+	return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu));
 }
 
 /**
@@ -372,7 +395,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
 							const struct cpumask *srcp2)
 {
 	return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
-				nr_cpumask_bits, cpumask_check(cpu));
+				small_cpumask_bits, cpumask_check(cpu));
 }
 
 /**
@@ -388,7 +411,7 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
 							const struct cpumask *srcp2)
 {
 	return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
-				nr_cpumask_bits, cpumask_check(cpu));
+				small_cpumask_bits, cpumask_check(cpu));
 }
 
 /**
@@ -408,7 +431,7 @@ unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp
 	return find_nth_and_andnot_bit(cpumask_bits(srcp1),
 					cpumask_bits(srcp2),
 					cpumask_bits(srcp3),
-					nr_cpumask_bits, cpumask_check(cpu));
+					small_cpumask_bits, cpumask_check(cpu));
 }
 
 #define CPU_BITS_NONE						\
@@ -495,10 +518,14 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *
 /**
  * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
  * @dstp: the cpumask pointer
+ *
+ * Note: since we set bits, we should use the tighter 'bitmap_set()' with
+ * the eact number of bits, not 'bitmap_fill()' that will fill past the
+ * end.
  */
 static inline void cpumask_setall(struct cpumask *dstp)
 {
-	bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
+	bitmap_set(cpumask_bits(dstp), 0, nr_cpumask_bits);
 }
 
 /**
@@ -507,7 +534,7 @@ static inline void cpumask_setall(struct cpumask *dstp)
  */
 static inline void cpumask_clear(struct cpumask *dstp)
 {
-	bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits);
+	bitmap_zero(cpumask_bits(dstp), large_cpumask_bits);
 }
 
 /**
@@ -523,7 +550,7 @@ static inline bool cpumask_and(struct cpumask *dstp,
 			       const struct cpumask *src2p)
 {
 	return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p),
-				       cpumask_bits(src2p), nr_cpumask_bits);
+				       cpumask_bits(src2p), small_cpumask_bits);
 }
 
 /**
@@ -536,7 +563,7 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
 			      const struct cpumask *src2p)
 {
 	bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p),
-				      cpumask_bits(src2p), nr_cpumask_bits);
+				      cpumask_bits(src2p), small_cpumask_bits);
 }
 
 /**
@@ -550,7 +577,7 @@ static inline void cpumask_xor(struct cpumask *dstp,
 			       const struct cpumask *src2p)
 {
 	bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p),
-				       cpumask_bits(src2p), nr_cpumask_bits);
+				       cpumask_bits(src2p), small_cpumask_bits);
 }
 
 /**
@@ -566,19 +593,7 @@ static inline bool cpumask_andnot(struct cpumask *dstp,
 				  const struct cpumask *src2p)
 {
 	return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p),
-					  cpumask_bits(src2p), nr_cpumask_bits);
-}
-
-/**
- * cpumask_complement - *dstp = ~*srcp
- * @dstp: the cpumask result
- * @srcp: the input to invert
- */
-static inline void cpumask_complement(struct cpumask *dstp,
-				      const struct cpumask *srcp)
-{
-	bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp),
-					      nr_cpumask_bits);
+					  cpumask_bits(src2p), small_cpumask_bits);
 }
 
 /**
@@ -590,7 +605,7 @@ static inline bool cpumask_equal(const struct cpumask *src1p,
 				const struct cpumask *src2p)
 {
 	return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p),
-						 nr_cpumask_bits);
+						 small_cpumask_bits);
 }
 
 /**
@@ -604,7 +619,7 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p,
 				    const struct cpumask *src3p)
 {
 	return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p),
-			       cpumask_bits(src3p), nr_cpumask_bits);
+			       cpumask_bits(src3p), small_cpumask_bits);
 }
 
 /**
@@ -616,7 +631,7 @@ static inline bool cpumask_intersects(const struct cpumask *src1p,
 				     const struct cpumask *src2p)
 {
 	return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p),
-						      nr_cpumask_bits);
+						      small_cpumask_bits);
 }
 
 /**
@@ -630,7 +645,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
 				 const struct cpumask *src2p)
 {
 	return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p),
-						  nr_cpumask_bits);
+						  small_cpumask_bits);
 }
 
 /**
@@ -639,7 +654,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p,
  */
 static inline bool cpumask_empty(const struct cpumask *srcp)
 {
-	return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits);
+	return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
@@ -657,7 +672,7 @@ static inline bool cpumask_full(const struct cpumask *srcp)
  */
 static inline unsigned int cpumask_weight(const struct cpumask *srcp)
 {
-	return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);
+	return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits);
 }
 
 /**
@@ -668,7 +683,7 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp)
 static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
 						const struct cpumask *srcp2)
 {
-	return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits);
+	return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
 }
 
 /**
@@ -681,7 +696,7 @@ static inline void cpumask_shift_right(struct cpumask *dstp,
 				       const struct cpumask *srcp, int n)
 {
 	bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n,
-					       nr_cpumask_bits);
+					       small_cpumask_bits);
 }
 
 /**
@@ -705,7 +720,7 @@ static inline void cpumask_shift_left(struct cpumask *dstp,
 static inline void cpumask_copy(struct cpumask *dstp,
 				const struct cpumask *srcp)
 {
-	bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits);
+	bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits);
 }
 
 /**
@@ -789,7 +804,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
  */
 static inline unsigned int cpumask_size(void)
 {
-	return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long);
+	return BITS_TO_LONGS(large_cpumask_bits) * sizeof(long);
 }
 
 /*
diff --git a/lib/cpumask_kunit.c b/lib/cpumask_kunit.c
index d1fc6ece21f3704566f9296b47bf47d3f04c8296..a105e6369efc912cc910aeb932ae0335688ac485 100644
--- a/lib/cpumask_kunit.c
+++ b/lib/cpumask_kunit.c
@@ -23,16 +23,6 @@
 		KUNIT_EXPECT_EQ_MSG((test), mask_weight, iter, MASK_MSG(mask));	\
 	} while (0)
 
-#define EXPECT_FOR_EACH_CPU_NOT_EQ(test, mask)					\
-	do {									\
-		const cpumask_t *m = (mask);					\
-		int mask_weight = cpumask_weight(m);				\
-		int cpu, iter = 0;						\
-		for_each_cpu_not(cpu, m)					\
-			iter++;							\
-		KUNIT_EXPECT_EQ_MSG((test), nr_cpu_ids - mask_weight, iter, MASK_MSG(mask));	\
-	} while (0)
-
 #define EXPECT_FOR_EACH_CPU_OP_EQ(test, op, mask1, mask2)			\
 	do {									\
 		const cpumask_t *m1 = (mask1);					\
@@ -77,7 +67,7 @@ static void test_cpumask_weight(struct kunit *test)
 	KUNIT_EXPECT_EQ_MSG(test, 0, cpumask_weight(&mask_empty), MASK_MSG(&mask_empty));
 	KUNIT_EXPECT_EQ_MSG(test, nr_cpu_ids, cpumask_weight(cpu_possible_mask),
 			    MASK_MSG(cpu_possible_mask));
-	KUNIT_EXPECT_EQ_MSG(test, nr_cpumask_bits, cpumask_weight(&mask_all), MASK_MSG(&mask_all));
+	KUNIT_EXPECT_EQ_MSG(test, nr_cpu_ids, cpumask_weight(&mask_all), MASK_MSG(&mask_all));
 }
 
 static void test_cpumask_first(struct kunit *test)
@@ -113,14 +103,12 @@ static void test_cpumask_next(struct kunit *test)
 static void test_cpumask_iterators(struct kunit *test)
 {
 	EXPECT_FOR_EACH_CPU_EQ(test, &mask_empty);
-	EXPECT_FOR_EACH_CPU_NOT_EQ(test, &mask_empty);
 	EXPECT_FOR_EACH_CPU_WRAP_EQ(test, &mask_empty);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, &mask_empty, &mask_empty);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, &mask_empty);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, &mask_empty, &mask_empty);
 
 	EXPECT_FOR_EACH_CPU_EQ(test, cpu_possible_mask);
-	EXPECT_FOR_EACH_CPU_NOT_EQ(test, cpu_possible_mask);
 	EXPECT_FOR_EACH_CPU_WRAP_EQ(test, cpu_possible_mask);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, and, cpu_possible_mask, cpu_possible_mask);
 	EXPECT_FOR_EACH_CPU_OP_EQ(test, andnot, cpu_possible_mask, &mask_empty);