diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 426fa892d311a308e38131edf21de630db0654aa..dc3a939da2c0a56567a5dd06d7d1f40b16ee38a1 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3207,6 +3207,7 @@
 					       spectre_v2_user=off [X86]
 					       spec_store_bypass_disable=off [X86,PPC]
 					       ssbd=force-off [ARM64]
+					       nospectre_bhb [ARM64]
 					       l1tf=off [X86]
 					       mds=off [X86]
 					       tsx_async_abort=off [X86]
@@ -3613,7 +3614,7 @@
 
 	nohugeiomap	[KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
 
-	nohugevmalloc	[PPC] Disable kernel huge vmalloc mappings.
+	nohugevmalloc	[KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings.
 
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
@@ -3631,6 +3632,10 @@
 			vulnerability. System may allow data leaks with this
 			option.
 
+	nospectre_bhb	[ARM64] Disable all mitigations for the Spectre-BHB (branch
+			history injection) vulnerability. System may allow data leaks
+			with this option.
+
 	nospec_store_bypass_disable
 			[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 526ab76cd233ce1eb860b53604ea5c344c0aa630..1675310f179121ef0a7ce86c774aa2e365443ad4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -149,6 +149,7 @@ config ARM64
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_BITREVERSE
 	select HAVE_ARCH_COMPILER_H
+	select HAVE_ARCH_HUGE_VMALLOC
 	select HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
@@ -230,6 +231,7 @@ config ARM64
 	select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
 	select TRACE_IRQFLAGS_SUPPORT
 	select TRACE_IRQFLAGS_NMI_SUPPORT
+	select HAVE_SOFTIRQ_ON_OWN_STACK
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
@@ -1575,6 +1577,9 @@ config THUMB2_COMPAT_VDSO
 	  Compile the compat vDSO with '-mthumb -fomit-frame-pointer' if y,
 	  otherwise with '-marm'.
 
+config COMPAT_ALIGNMENT_FIXUPS
+	bool "Fix up misaligned multi-word loads and stores in user space"
+
 menuconfig ARMV8_DEPRECATED
 	bool "Emulate deprecated/obsolete ARMv8 instructions"
 	depends on SYSCTL
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 0278a58abe69aa0ad3adc52aad055317feceb718..19713d0f013b7635341e0ddc34769c076626bfe9 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -71,6 +71,7 @@ void do_sysinstr(unsigned long esr, struct pt_regs *regs);
 void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs);
 void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr);
 void do_cp15instr(unsigned long esr, struct pt_regs *regs);
+int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs);
 void do_el0_svc(struct pt_regs *regs);
 void do_el0_svc_compat(struct pt_regs *regs);
 void do_el0_fpac(struct pt_regs *regs, unsigned long esr);
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index aa523591a44e5635ee1d2ed2fd4f22e418513fa6..760c62f8e22f84f95c48de4f1470d9cea67ca536 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -42,7 +42,9 @@ void mte_sync_tags(pte_t old_pte, pte_t pte);
 void mte_copy_page_tags(void *kto, const void *kfrom);
 void mte_thread_init_user(void);
 void mte_thread_switch(struct task_struct *next);
+void mte_cpu_setup(void);
 void mte_suspend_enter(void);
+void mte_suspend_exit(void);
 long set_mte_ctrl(struct task_struct *task, unsigned long arg);
 long get_mte_ctrl(struct task_struct *task);
 int mte_ptrace_copy_tags(struct task_struct *child, long request,
@@ -72,6 +74,9 @@ static inline void mte_thread_switch(struct task_struct *next)
 static inline void mte_suspend_enter(void)
 {
 }
+static inline void mte_suspend_exit(void)
+{
+}
 static inline long set_mte_ctrl(struct task_struct *task, unsigned long arg)
 {
 	return 0;
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 2fc9f0861769a76e9ee835ab1007be4547a930c8..5c7b2f9d5913759472a1cb79335ca7263c6a5b4d 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -203,9 +203,11 @@ static inline void uaccess_enable_privileged(void)
 }
 
 /*
- * Sanitise a uaccess pointer such that it becomes NULL if above the maximum
- * user address. In case the pointer is tagged (has the top byte set), untag
- * the pointer before checking.
+ * Sanitize a uaccess pointer such that it cannot reach any kernel address.
+ *
+ * Clearing bit 55 ensures the pointer cannot address any portion of the TTBR1
+ * address range (i.e. any kernel address); the resulting pointer either falls
+ * within the TTBR0 address range or must cause a fault.
  */
 #define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr)
 static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
@@ -213,14 +215,12 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
 	void __user *safe_ptr;
 
 	asm volatile(
-	"	bics	xzr, %3, %2\n"
-	"	csel	%0, %1, xzr, eq\n"
-	: "=&r" (safe_ptr)
-	: "r" (ptr), "r" (TASK_SIZE_MAX - 1),
-	  "r" (untagged_addr(ptr))
-	: "cc");
-
-	csdb();
+	"	bic	%0, %1, %2\n"
+	: "=r" (safe_ptr)
+	: "r" (ptr),
+	  "i" (BIT(55))
+	);
+
 	return safe_ptr;
 }
 
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 1add7b01efa72ed016fbfacd88f80bd5289ce39d..38a0b0291edb896796f2eb0ce063b229f6782792 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -45,6 +45,7 @@ $(obj)/%.stub.o: $(obj)/%.o FORCE
 obj-$(CONFIG_COMPAT)			+= sys32.o signal32.o			\
 					   sys_compat.o
 obj-$(CONFIG_COMPAT)			+= sigreturn32.o
+obj-$(CONFIG_COMPAT_ALIGNMENT_FIXUPS)	+= compat_alignment.o
 obj-$(CONFIG_KUSER_HELPERS)		+= kuser32.o
 obj-$(CONFIG_FUNCTION_TRACER)		+= ftrace.o entry-ftrace.o
 obj-$(CONFIG_MODULES)			+= module.o
diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c
new file mode 100644
index 0000000000000000000000000000000000000000..5edec2f49ec98c9b040d7698db52a62be0315b48
--- /dev/null
+++ b/arch/arm64/kernel/compat_alignment.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// based on arch/arm/mm/alignment.c
+
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/perf_event.h>
+#include <linux/uaccess.h>
+
+#include <asm/exception.h>
+#include <asm/ptrace.h>
+#include <asm/traps.h>
+
+/*
+ * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998
+ *
+ * Speed optimisations and better fault handling by Russell King.
+ */
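+
+/* Bits [27:25] of an ARM instruction select the instruction class decoded below */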
+#define CODING_BITS(i)	(i & 0x0e000000)
+
+#define LDST_P_BIT(i)	(i & (1 << 24))		/* Preindex		*/
+#define LDST_U_BIT(i)	(i & (1 << 23))		/* Add offset		*/
+#define LDST_W_BIT(i)	(i & (1 << 21))		/* Writeback		*/
+#define LDST_L_BIT(i)	(i & (1 << 20))		/* Load			*/
+
+#define LDST_P_EQ_U(i)	((((i) ^ ((i) >> 1)) & (1 << 23)) == 0)
+
+#define LDSTHD_I_BIT(i)	(i & (1 << 22))		/* double/half-word immed */
+
+#define RN_BITS(i)	((i >> 16) & 15)	/* Rn			*/
+#define RD_BITS(i)	((i >> 12) & 15)	/* Rd			*/
+#define RM_BITS(i)	(i & 15)		/* Rm			*/
+
+#define REGMASK_BITS(i)	(i & 0xffff)
+
+#define BAD_INSTR 	0xdeadc0de
+
+/* Thumb-2 32-bit format per ARMv7 DDI0406A A6.3: first halfword e800h, f000h or f800h */
+#define IS_T32(hi16) \
+	(((hi16) & 0xe000) == 0xe000 && ((hi16) & 0x1800))
+
+union offset_union {
+	unsigned long un;
+	  signed long sn;
+};
+
+#define TYPE_ERROR	0
+#define TYPE_FAULT	1
+#define TYPE_LDST	2
+#define TYPE_DONE	3
+
+static void
+do_alignment_finish_ldst(unsigned long addr, u32 instr, struct pt_regs *regs,
+			 union offset_union offset)
+{
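+	/*
+	 * Base register writeback: negate the offset for down (U=0) forms,
+	 * apply it for post-indexed (P=0) forms, and update Rn whenever
+	 * post-indexing or the W bit requests writeback.
+	 */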
+	if (!LDST_U_BIT(instr))
+		offset.un = -offset.un;
+
+	if (!LDST_P_BIT(instr))
+		addr += offset.un;
+
+	if (!LDST_P_BIT(instr) || LDST_W_BIT(instr))
+		regs->regs[RN_BITS(instr)] = addr;
+}
+
+static int
+do_alignment_ldrdstrd(unsigned long addr, u32 instr, struct pt_regs *regs)
+{
+	unsigned int rd = RD_BITS(instr);
+	unsigned int rd2;
+	int load;
+
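+	/*
+	 * The (converted) Thumb-2 encoding names the second register in
+	 * bits [11:8]; the ARM encoding implicitly pairs Rd with Rd+1, so Rd
+	 * must be even and must not be r14.
+	 */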
+	if ((instr & 0xfe000000) == 0xe8000000) {
+		/* ARMv7 Thumb-2 32-bit LDRD/STRD */
+		rd2 = (instr >> 8) & 0xf;
+		load = !!(LDST_L_BIT(instr));
+	} else if (((rd & 1) == 1) || (rd == 14)) {
+		return TYPE_ERROR;
+	} else {
+		load = ((instr & 0xf0) == 0xd0);
+		rd2 = rd + 1;
+	}
+
+	if (load) {
+		unsigned int val, val2;
+
+		if (get_user(val, (u32 __user *)addr) ||
+		    get_user(val2, (u32 __user *)(addr + 4)))
+			return TYPE_FAULT;
+		regs->regs[rd] = val;
+		regs->regs[rd2] = val2;
+	} else {
+		if (put_user(regs->regs[rd], (u32 __user *)addr) ||
+		    put_user(regs->regs[rd2], (u32 __user *)(addr + 4)))
+			return TYPE_FAULT;
+	}
+	return TYPE_LDST;
+}
+
+/*
+ * LDM/STM alignment handler.
+ *
+ * There are 4 variants of this instruction:
+ *
+ * B = rn pointer before instruction, A = rn pointer after instruction
+ *              ------ increasing address ----->
+ *	        |    | r0 | r1 | ... | rx |    |
+ * PU = 01             B                    A
+ * PU = 11        B                    A
+ * PU = 00        A                    B
+ * PU = 10             A                    B
+ */
+static int
+do_alignment_ldmstm(unsigned long addr, u32 instr, struct pt_regs *regs)
+{
+	unsigned int rd, rn, nr_regs, regbits;
+	unsigned long eaddr, newaddr;
+	unsigned int val;
+
+	/* count the number of bytes to be transferred (4 per register in the mask) */
+	nr_regs = hweight16(REGMASK_BITS(instr)) * 4;
+
+	rn = RN_BITS(instr);
+	newaddr = eaddr = regs->regs[rn];
+
+	if (!LDST_U_BIT(instr))
+		nr_regs = -nr_regs;
+	newaddr += nr_regs;
+	if (!LDST_U_BIT(instr))
+		eaddr = newaddr;
+
+	if (LDST_P_EQ_U(instr))	/* U = P */
+		eaddr += 4;
+
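+	/* eaddr is now the lowest address accessed by the transfer */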
+	for (regbits = REGMASK_BITS(instr), rd = 0; regbits;
+	     regbits >>= 1, rd += 1)
+		if (regbits & 1) {
+			if (LDST_L_BIT(instr)) {
+				if (get_user(val, (u32 __user *)eaddr))
+					return TYPE_FAULT;
+				if (rd < 15)
+					regs->regs[rd] = val;
+				else
+					regs->pc = val;
+			} else {
+				/*
+				 * The PC register has a bias of +8 in ARM mode
+				 * and +4 in Thumb mode. This means that a read
+				 * of the value of PC should account for this.
+				 * Since Thumb does not permit STM instructions
+				 * to refer to PC, just add 8 here.
+				 */
+				val = (rd < 15) ? regs->regs[rd] : regs->pc + 8;
+				if (put_user(val, (u32 __user *)eaddr))
+					return TYPE_FAULT;
+			}
+			eaddr += 4;
+		}
+
+	if (LDST_W_BIT(instr))
+		regs->regs[rn] = newaddr;
+
+	return TYPE_DONE;
+}
+
+/*
+ * Convert Thumb multi-word load/store instruction forms to equivalent ARM
+ * instructions so we can reuse ARM userland alignment fault fixups for Thumb.
+ *
+ * This implementation was initially based on the algorithm found in
+ * gdb/sim/arm/thumbemu.c. It is basically just a reduced version of that
+ * code, converting only the Thumb ld/st instruction forms to equivalent ARM forms.
+ *
+ * NOTES:
+ * 1. Comments below refer to ARM ARM DDI0100E Thumb Instruction sections.
+ * 2. If for some reason we're passed a non-ld/st Thumb instruction to
+ *    decode, we return 0xdeadc0de. This should never happen under normal
+ *    circumstances but if it does, we've got other problems to deal with
+ *    elsewhere and we obviously can't fix those problems here.
+ */
+
+static unsigned long thumb2arm(u16 tinstr)
+{
+	u32 L = (tinstr & (1<<11)) >> 11;
+
+	switch ((tinstr & 0xf800) >> 11) {
+	/* 6.6.1 Format 1: */
+	case 0xc000 >> 11:				/* 7.1.51 STMIA */
+	case 0xc800 >> 11:				/* 7.1.25 LDMIA */
+		{
+			u32 Rn = (tinstr & (7<<8)) >> 8;
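+			/*
+			 * Writeback (W) is suppressed when an LDMIA includes
+			 * Rn in its register list; Thumb STMIA always writes
+			 * back.
+			 */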
+			u32 W = ((L<<Rn) & (tinstr&255)) ? 0 : 1<<21;
+
+			return 0xe8800000 | W | (L<<20) | (Rn<<16) |
+				(tinstr&255);
+		}
+
+	/* 6.6.1 Format 2: */
+	case 0xb000 >> 11:				/* 7.1.48 PUSH */
+	case 0xb800 >> 11:				/* 7.1.47 POP */
+		if ((tinstr & (3 << 9)) == 0x0400) {
+			static const u32 subset[4] = {
+				0xe92d0000,	/* STMDB sp!,{registers} */
+				0xe92d4000,	/* STMDB sp!,{registers,lr} */
+				0xe8bd0000,	/* LDMIA sp!,{registers} */
+				0xe8bd8000	/* LDMIA sp!,{registers,pc} */
+			};
+			return subset[(L<<1) | ((tinstr & (1<<8)) >> 8)] |
+			    (tinstr & 255);		/* register_list */
+		}
+		fallthrough;	/* for illegal instruction case */
+
+	default:
+		return BAD_INSTR;
+	}
+}
+
+/*
+ * Convert a Thumb-2 32-bit LDM, STM, LDRD or STRD instruction to an equivalent
+ * form that the ARM alignment handler can process, and find the corresponding
+ * handler, so that we can reuse the ARM userland alignment fault fixups for Thumb.
+ *
+ * @pinstr: original Thumb-2 instruction; may be updated in place to an equivalent form
+ * @regs: register context.
+ * @poffset: return offset from faulted addr for later writeback
+ *
+ * NOTES:
+ * 1. Comments below refer to ARMv7 DDI0406A Thumb Instruction sections.
+ * 2. Register name Rt from ARMv7 is same as Rd from ARMv6 (Rd is Rt)
+ */
+static void *
+do_alignment_t32_to_handler(u32 *pinstr, struct pt_regs *regs,
+			    union offset_union *poffset)
+{
+	u32 instr = *pinstr;
+	u16 tinst1 = (instr >> 16) & 0xffff;
+	u16 tinst2 = instr & 0xffff;
+
+	switch (tinst1 & 0xffe0) {
+	/* A6.3.5 Load/Store multiple */
+	case 0xe880:		/* STM/STMIA/STMEA,LDM/LDMIA, PUSH/POP T2 */
+	case 0xe8a0:		/* ...above writeback version */
+	case 0xe900:		/* STMDB/STMFD, LDMDB/LDMEA */
+	case 0xe920:		/* ...above writeback version */
+		/* no offset decision needed since the handler calculates it */
+		return do_alignment_ldmstm;
+
+	case 0xf840:		/* POP/PUSH T3 (single register) */
+		if (RN_BITS(instr) == 13 && (tinst2 & 0x09ff) == 0x0904) {
+			u32 L = !!(LDST_L_BIT(instr));
+			const u32 subset[2] = {
+				0xe92d0000,	/* STMDB sp!,{registers} */
+				0xe8bd0000,	/* LDMIA sp!,{registers} */
+			};
+			*pinstr = subset[L] | (1<<RD_BITS(instr));
+			return do_alignment_ldmstm;
+		}
+		/* Else fall through for illegal instruction case */
+		break;
+
+	/* A6.3.6 Load/store double, STRD/LDRD(immed, lit, reg) */
+	case 0xe860:
+	case 0xe960:
+	case 0xe8e0:
+	case 0xe9e0:
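+		/* the doubleword offset is imm8 scaled by 4 */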
+		poffset->un = (tinst2 & 0xff) << 2;
+		fallthrough;
+
+	case 0xe940:
+	case 0xe9c0:
+		return do_alignment_ldrdstrd;
+
+	/*
+	 * No need to handle load/store instructions up to word size
+	 * since ARMv6 and later CPUs can perform unaligned accesses.
+	 */
+	default:
+		break;
+	}
+	return NULL;
+}
+
+static int alignment_get_arm(struct pt_regs *regs, __le32 __user *ip, u32 *inst)
+{
+	__le32 instr = 0;
+	int fault;
+
+	fault = get_user(instr, ip);
+	if (fault)
+		return fault;
+
+	*inst = __le32_to_cpu(instr);
+	return 0;
+}
+
+static int alignment_get_thumb(struct pt_regs *regs, __le16 __user *ip, u16 *inst)
+{
+	__le16 instr = 0;
+	int fault;
+
+	fault = get_user(instr, ip);
+	if (fault)
+		return fault;
+
+	*inst = __le16_to_cpu(instr);
+	return 0;
+}
+
+int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs)
+{
+	union offset_union offset;
+	unsigned long instrptr;
+	int (*handler)(unsigned long addr, u32 instr, struct pt_regs *regs);
+	unsigned int type;
+	u32 instr = 0;
+	u16 tinstr = 0;
+	int isize = 4;
+	int thumb2_32b = 0;
+	int fault;
+
+	instrptr = instruction_pointer(regs);
+
+	if (compat_thumb_mode(regs)) {
+		__le16 __user *ptr = (__le16 __user *)(instrptr & ~1);
+
+		fault = alignment_get_thumb(regs, ptr, &tinstr);
+		if (!fault) {
+			if (IS_T32(tinstr)) {
+				/* Thumb-2 32-bit */
+				u16 tinst2;
+				fault = alignment_get_thumb(regs, ptr + 1, &tinst2);
+				instr = ((u32)tinstr << 16) | tinst2;
+				thumb2_32b = 1;
+			} else {
+				isize = 2;
+				instr = thumb2arm(tinstr);
+			}
+		}
+	} else {
+		fault = alignment_get_arm(regs, (__le32 __user *)instrptr, &instr);
+	}
+
+	if (fault)
+		return 1;
+
+	switch (CODING_BITS(instr)) {
+	case 0x00000000:	/* 3.13.4 load/store instruction extensions */
+		if (LDSTHD_I_BIT(instr))
+			offset.un = (instr & 0xf00) >> 4 | (instr & 15);
+		else
+			offset.un = regs->regs[RM_BITS(instr)];
+
+		if ((instr & 0x001000f0) == 0x000000d0 || /* LDRD */
+		    (instr & 0x001000f0) == 0x000000f0)   /* STRD */
+			handler = do_alignment_ldrdstrd;
+		else
+			return 1;
+		break;
+
+	case 0x08000000:	/* ldm or stm, or thumb-2 32bit instruction */
+		if (thumb2_32b) {
+			offset.un = 0;
+			handler = do_alignment_t32_to_handler(&instr, regs, &offset);
+		} else {
+			offset.un = 0;
+			handler = do_alignment_ldmstm;
+		}
+		break;
+
+	default:
+		return 1;
+	}
+
+	type = handler(addr, instr, regs);
+
+	if (type == TYPE_ERROR || type == TYPE_FAULT)
+		return 1;
+
+	if (type == TYPE_LDST)
+		do_alignment_finish_ldst(addr, instr, regs, offset);
+
+	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->pc);
+	arm64_skip_faulting_instruction(regs, isize);
+
+	return 0;
+}
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 5d0527ba08049c4d5b83d124b5b189763a93fa53..a51edf3c02145290d17b4ea73f6786f0a25e9136 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2043,7 +2043,8 @@ static void bti_enable(const struct arm64_cpu_capabilities *__unused)
 static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
 {
 	sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0);
-	isb();
+
+	mte_cpu_setup();
 
 	/*
 	 * Clear the tags in the zero page. This needs to be done via the
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c
index bda49430c9ea3c9beb0669689e760bc6bd26c3ce..38dbd3828f139106ff6f1755a79e1d511e5108b2 100644
--- a/arch/arm64/kernel/irq.c
+++ b/arch/arm64/kernel/irq.c
@@ -21,7 +21,9 @@
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
 #include <asm/daifflags.h>
+#include <asm/exception.h>
 #include <asm/vmap_stack.h>
+#include <asm/softirq_stack.h>
 
 /* Only access this in an NMI enter/exit */
 DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts);
@@ -71,6 +73,18 @@ static void init_irq_stacks(void)
 }
 #endif
 
+#ifndef CONFIG_PREEMPT_RT
+static void ____do_softirq(struct pt_regs *regs)
+{
+	__do_softirq();
+}
+
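+/*
+ * Process pending softirqs on the per-CPU IRQ stack rather than on the
+ * current task's stack, keeping task stack usage bounded.
+ */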
+void do_softirq_own_stack(void)
+{
+	call_on_irq_stack(NULL, ____do_softirq);
+}
+#endif
+
 static void default_handle_irq(struct pt_regs *regs)
 {
 	panic("IRQ taken without a root IRQ handler\n");
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index b2b730233274b72f4e1c619d39a0e68d22835f13..aca88470fb69d5dca875d7c4b73591997fc64ff7 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -285,6 +285,49 @@ void mte_thread_switch(struct task_struct *next)
 	mte_check_tfsr_el1();
 }
 
+void mte_cpu_setup(void)
+{
+	u64 rgsr;
+
+	/*
+	 * CnP must be enabled only after the MAIR_EL1 register has been set
+	 * up. Inconsistent MAIR_EL1 between CPUs sharing the same TLB may
+	 * lead to the wrong memory type being used for a brief window during
+	 * CPU power-up.
+	 *
+	 * CnP is not a boot feature so MTE gets enabled before CnP, but let's
+	 * make sure that is the case.
+	 */
+	BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT);
+	BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT);
+
+	/* Normal Tagged memory type at the corresponding MAIR index */
+	sysreg_clear_set(mair_el1,
+			 MAIR_ATTRIDX(MAIR_ATTR_MASK, MT_NORMAL_TAGGED),
+			 MAIR_ATTRIDX(MAIR_ATTR_NORMAL_TAGGED,
+				      MT_NORMAL_TAGGED));
+
+	write_sysreg_s(KERNEL_GCR_EL1, SYS_GCR_EL1);
+
+	/*
+	 * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then
+	 * RGSR_EL1.SEED must be non-zero for IRG to produce
+	 * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we
+	 * must initialize it.
+	 */
+	rgsr = (read_sysreg(CNTVCT_EL0) & SYS_RGSR_EL1_SEED_MASK) <<
+	       SYS_RGSR_EL1_SEED_SHIFT;
+	if (rgsr == 0)
+		rgsr = 1 << SYS_RGSR_EL1_SEED_SHIFT;
+	write_sysreg_s(rgsr, SYS_RGSR_EL1);
+
+	/* clear any pending tag check faults in TFSR*_EL1 */
+	write_sysreg_s(0, SYS_TFSR_EL1);
+	write_sysreg_s(0, SYS_TFSRE0_EL1);
+
+	local_flush_tlb_all();
+}
+
 void mte_suspend_enter(void)
 {
 	if (!system_supports_mte())
@@ -301,6 +344,14 @@ void mte_suspend_enter(void)
 	mte_check_tfsr_el1();
 }
 
+void mte_suspend_exit(void)
+{
+	if (!system_supports_mte())
+		return;
+
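+	/* Re-initialize MTE register state lost across CPU power-down */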
+	mte_cpu_setup();
+}
+
 long set_mte_ctrl(struct task_struct *task, unsigned long arg)
 {
 	u64 mte_ctrl = (~((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT) &
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index d1d1823202453d2259536636099b75b7d0f09d99..c9e4d072028561d8b0730a8da68f193ad14197ee 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -44,13 +44,28 @@ post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *);
 static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
 {
 	kprobe_opcode_t *addr = p->ainsn.api.insn;
-	void *addrs[] = {addr, addr + 1};
-	u32 insns[] = {p->opcode, BRK64_OPCODE_KPROBES_SS};
 
-	/* prepare insn slot */
-	aarch64_insn_patch_text(addrs, insns, 2);
-
-	flush_icache_range((uintptr_t)addr, (uintptr_t)(addr + MAX_INSN_SIZE));
+	/*
+	 * Prepare the insn slot. As Mark Rutland points out, this depends on a
+	 * couple of subtleties:
+	 *
+	 * - The I-cache maintenance for these instructions is complete
+	 *   *before* the kprobe BRK is written (and aarch64_insn_patch_text_nosync()
+	 *   ensures this, but just omits causing a Context-Synchronization-Event
+	 *   on all CPUs).
+	 *
+	 * - The kprobe BRK results in an exception (and consequently a
+	 *   Context-Synchronization-Event), which ensures that the CPU will
+	 *   fetch the single-step slot instructions *after* this, ensuring that
+	 *   the new instructions are used.
+	 *
+	 * An ISB after patching would guarantee that the I-cache maintenance is
+	 * observed on all CPUs; however, since the single-step slot is installed
+	 * in the BRK exception handler, it is unnecessary to generate a
+	 * Context-Synchronization-Event via an ISB again.
+	 */
+	aarch64_insn_patch_text_nosync(addr, p->opcode);
+	aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS);
 
 	/*
 	 * Needs restoring of return address after stepping xol.
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index ea659a382e24a19fb2f27af57ec97a2f05e0afa3..a8ea1637b13798ce49a594d79f410efd4e49703e 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -988,6 +988,14 @@ static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot)
 	isb();
 }
 
+static bool __read_mostly __nospectre_bhb;
+static int __init parse_spectre_bhb_param(char *str)
+{
+	__nospectre_bhb = true;
+	return 0;
+}
+early_param("nospectre_bhb", parse_spectre_bhb_param);
+
 void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
 {
 	bp_hardening_cb_t cpu_cb;
@@ -1001,7 +1009,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
 		/* No point mitigating Spectre-BHB alone. */
 	} else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) {
 		pr_info_once("spectre-bhb mitigation disabled by compile time option\n");
-	} else if (cpu_mitigations_off()) {
+	} else if (cpu_mitigations_off() || __nospectre_bhb) {
 		pr_info_once("spectre-bhb mitigation disabled by command line option\n");
 	} else if (supports_ecbhb(SCOPE_LOCAL_CPU)) {
 		state = SPECTRE_MITIGATED;
diff --git a/arch/arm64/kernel/reloc_test_core.c b/arch/arm64/kernel/reloc_test_core.c
index e87a2b7f20f613830daec851755813e7195bd1c0..99f2ffe9fc0576400fa03e593b9ae7658892031e 100644
--- a/arch/arm64/kernel/reloc_test_core.c
+++ b/arch/arm64/kernel/reloc_test_core.c
@@ -48,7 +48,7 @@ static struct {
 	{ "R_AARCH64_PREL16",		relative_data16, (u64)&sym64_rel },
 };
 
-static int reloc_test_init(void)
+static int __init reloc_test_init(void)
 {
 	int i;
 
@@ -67,7 +67,7 @@ static int reloc_test_init(void)
 	return 0;
 }
 
-static void reloc_test_exit(void)
+static void __exit reloc_test_exit(void)
 {
 }
 
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 9135fe0f3df536f4fe77832f2758ed5815c877b1..8b02d310838f92404c7d61a1083ffa8398194d20 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -43,6 +43,8 @@ void notrace __cpu_suspend_exit(void)
 {
 	unsigned int cpu = smp_processor_id();
 
+	mte_suspend_exit();
+
 	/*
 	 * We are resuming from reset with the idmap active in TTBR0_EL1.
 	 * We must uninstall the idmap and restore the expected MMU
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 599cf81f568514123a3ee5874c2802f082a837a1..83a512a6ff0dd51b1855d0e5dc4d6a22c59fbe60 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -36,7 +36,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size)
 {
 	unsigned long start = (unsigned long)page_address(page);
 
-	dcache_clean_inval_poc(start, start + size);
+	dcache_clean_poc(start, start + size);
 }
 
 #ifdef CONFIG_IOMMU_DMA
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c33f1fad27450307eb0fc4440914355e4de11e02..5b391490e045be91b9cf1e85a2b964474f4d8c4d 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -691,6 +691,9 @@ static int __kprobes do_translation_fault(unsigned long far,
 static int do_alignment_fault(unsigned long far, unsigned long esr,
 			      struct pt_regs *regs)
 {
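+	/*
+	 * For 32-bit (compat) user tasks, hand the fault to the alignment
+	 * fixup code instead of do_bad_area().
+	 */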
+	if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) &&
+	    compat_user_mode(regs))
+		return do_compat_alignment_fixup(far, regs);
 	do_bad_area(far, esr, regs);
 	return 0;
 }
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 5810eddfb48e906eb2b45a3b89dea3208b5cb2bb..01f42c1185eb6a186f1e1601377458604f619f96 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -535,7 +535,7 @@ static void __init map_mem(pgd_t *pgdp)
 	 */
 	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
 
-	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
+	if (can_set_direct_map())
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	/*
@@ -1180,14 +1180,6 @@ static void free_empty_tables(unsigned long addr, unsigned long end,
 }
 #endif
 
-#if !ARM64_KERNEL_USES_PMD_MAPS
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
-		struct vmem_altmap *altmap)
-{
-	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
-	return vmemmap_populate_basepages(start, end, node, altmap);
-}
-#else	/* !ARM64_KERNEL_USES_PMD_MAPS */
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
@@ -1199,6 +1191,10 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 	pmd_t *pmdp;
 
 	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
+
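+	/* Fall back to base pages when PMD-sized vmemmap mappings are not used */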
+	if (!ARM64_KERNEL_USES_PMD_MAPS)
+		return vmemmap_populate_basepages(start, end, node, altmap);
+
 	do {
 		next = pmd_addr_end(addr, end);
 
@@ -1232,7 +1228,6 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 
 	return 0;
 }
-#endif	/* !ARM64_KERNEL_USES_PMD_MAPS */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 void vmemmap_free(unsigned long start, unsigned long end,
@@ -1547,11 +1542,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
 
 	VM_BUG_ON(!mhp_range_allowed(start, size, true));
 
-	/*
-	 * KFENCE requires linear map to be mapped at page granularity, so that
-	 * it is possible to protect/unprotect single pages in the KFENCE pool.
-	 */
-	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
+	if (can_set_direct_map())
 		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
 	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 64e985eaa52d8df35244427a0479c0b173ee8668..d107c3d434e224552ae34011a4df3319cc977516 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -21,7 +21,13 @@ bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED
 
 bool can_set_direct_map(void)
 {
-	return rodata_full || debug_pagealloc_enabled();
+	/*
+	 * rodata_full, DEBUG_PAGEALLOC and KFENCE require linear map to be
+	 * mapped at page granularity, so that it is possible to
+	 * protect/unprotect single pages.
+	 */
+	return rodata_full || debug_pagealloc_enabled() ||
+		IS_ENABLED(CONFIG_KFENCE);
 }
 
 static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 5f7784ee60447f2a2ff7ad869a8ae39625f50eac..f38bccdd374a56a4032b0b49dd3d63c702c068b6 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -48,17 +48,19 @@
 
 #ifdef CONFIG_KASAN_HW_TAGS
 #define TCR_MTE_FLAGS TCR_TCMA1 | TCR_TBI1 | TCR_TBID1
-#else
+#elif defined(CONFIG_ARM64_MTE)
 /*
  * The mte_zero_clear_page_tags() implementation uses DC GZVA, which relies on
  * TBI being enabled at EL1.
  */
 #define TCR_MTE_FLAGS TCR_TBI1 | TCR_TBID1
+#else
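+/* MTE not configured at all: leave TBI1/TBID1/TCMA1 clear */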
+#define TCR_MTE_FLAGS 0
 #endif
 
 /*
  * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and
- * changed during __cpu_setup to Normal Tagged if the system supports MTE.
+ * changed during mte_cpu_setup to Normal Tagged if the system supports MTE.
  */
 #define MAIR_EL1_SET							\
 	(MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) |	\
@@ -426,46 +428,8 @@ SYM_FUNC_START(__cpu_setup)
 	mov_q	mair, MAIR_EL1_SET
 	mov_q	tcr, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
 			TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
-			TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS
-
-#ifdef CONFIG_ARM64_MTE
-	/*
-	 * Update MAIR_EL1, GCR_EL1 and TFSR*_EL1 if MTE is supported
-	 * (ID_AA64PFR1_EL1[11:8] > 1).
-	 */
-	mrs	x10, ID_AA64PFR1_EL1
-	ubfx	x10, x10, #ID_AA64PFR1_EL1_MTE_SHIFT, #4
-	cmp	x10, #ID_AA64PFR1_EL1_MTE_MTE2
-	b.lt	1f
-
-	/* Normal Tagged memory type at the corresponding MAIR index */
-	mov	x10, #MAIR_ATTR_NORMAL_TAGGED
-	bfi	mair, x10, #(8 *  MT_NORMAL_TAGGED), #8
+			TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
 
-	mov	x10, #KERNEL_GCR_EL1
-	msr_s	SYS_GCR_EL1, x10
-
-	/*
-	 * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then
-	 * RGSR_EL1.SEED must be non-zero for IRG to produce
-	 * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we
-	 * must initialize it.
-	 */
-	mrs	x10, CNTVCT_EL0
-	ands	x10, x10, #SYS_RGSR_EL1_SEED_MASK
-	csinc	x10, x10, xzr, ne
-	lsl	x10, x10, #SYS_RGSR_EL1_SEED_SHIFT
-	msr_s	SYS_RGSR_EL1, x10
-
-	/* clear any pending tag check faults in TFSR*_EL1 */
-	msr_s	SYS_TFSR_EL1, xzr
-	msr_s	SYS_TFSRE0_EL1, xzr
-
-	/* set the TCR_EL1 bits */
-	mov_q	x10, TCR_MTE_FLAGS
-	orr	tcr, tcr, x10
-1:
-#endif
 	tcr_clear_errata_bits tcr, x9, x5
 
 #ifdef CONFIG_ARM64_VA_BITS_52