diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index f4acf9c2e90f6e3c3416546227db1a734f3b34b2..382818a7197aa2bea74345755967783fd5e90b89 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -41,8 +41,8 @@ Support
 Architectures
 ~~~~~~~~~~~~~
 
-Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, and
-xtensa, and the tag-based KASAN modes are supported only on arm64.
+Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, xtensa,
+and loongarch, and the tag-based KASAN modes are supported only on arm64.
 
 Compilers
 ~~~~~~~~~
diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt
index bf0124fae643a1bbb13d34a539171354642351b1..c4581c2edb28006def42321db12567f20f4e1b5e 100644
--- a/Documentation/features/debug/KASAN/arch-support.txt
+++ b/Documentation/features/debug/KASAN/arch-support.txt
@@ -13,7 +13,7 @@
     |        csky: | TODO |
     |     hexagon: | TODO |
     |        ia64: | TODO |
-    |   loongarch: | TODO |
+    |   loongarch: |  ok  |
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: | TODO |
diff --git a/Documentation/features/debug/kcov/arch-support.txt b/Documentation/features/debug/kcov/arch-support.txt
index ffcc9f2b1d74d3930302349b3bcf541bb368c6e8..de84cefbcdd36f028e2eb0f1770dde51551a4652 100644
--- a/Documentation/features/debug/kcov/arch-support.txt
+++ b/Documentation/features/debug/kcov/arch-support.txt
@@ -13,7 +13,7 @@
     |        csky: | TODO |
     |     hexagon: | TODO |
     |        ia64: | TODO |
-    |   loongarch: | TODO |
+    |   loongarch: |  ok  |
     |        m68k: | TODO |
     |  microblaze: | TODO |
     |        mips: |  ok  |
diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt
index 958498f9f2a41f2b6c56f2bd3ead11d88b5ef5ae..5e91ec78c80b759cd3af62c696817e7a607e8b93 100644
--- a/Documentation/features/debug/kgdb/arch-support.txt
+++ b/Documentation/features/debug/kgdb/arch-support.txt
@@ -13,7 +13,7 @@
     |        csky: | TODO |
     |     hexagon: |  ok  |
     |        ia64: | TODO |
-    |   loongarch: | TODO |
+    |   loongarch: |  ok  |
     |        m68k: | TODO |
     |  microblaze: |  ok  |
     |        mips: |  ok  |
diff --git a/Documentation/translations/zh_CN/dev-tools/kasan.rst b/Documentation/translations/zh_CN/dev-tools/kasan.rst
index 05ef904dbcfb2e154526cb546d22aa65f58b4d90..8fdb20c9665b4b5e8a9afe12fb361f9a1d2315d7 100644
--- a/Documentation/translations/zh_CN/dev-tools/kasan.rst
+++ b/Documentation/translations/zh_CN/dev-tools/kasan.rst
@@ -42,7 +42,7 @@ KASAN有三种模式:
 体系架构
 ~~~~~~~~
 
-在x86_64、arm、arm64、powerpc、riscv、s390和xtensa上支持通用KASAN,
+在x86_64、arm、arm64、powerpc、riscv、s390、xtensa和loongarch上支持通用KASAN,
 而基于标签的KASAN模式只在arm64上支持。
 
 编译器
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index ecf282dee513fedb807f5b456e6773fd1a633b4f..e14396a2ddcbfc6d6130b63dca258343a03f35cb 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -8,11 +8,13 @@ config LOONGARCH
 	select ACPI_PPTT if ACPI
 	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
 	select ARCH_BINFMT_ELF_STATE
+	select ARCH_DISABLE_KASAN_INLINE
 	select ARCH_ENABLE_MEMORY_HOTPLUG
 	select ARCH_ENABLE_MEMORY_HOTREMOVE
 	select ARCH_HAS_ACPI_TABLE_UPGRADE	if ACPI
 	select ARCH_HAS_CPU_FINALIZE_INIT
 	select ARCH_HAS_FORTIFY_SOURCE
+	select ARCH_HAS_KCOV
 	select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PTE_SPECIAL
@@ -91,6 +93,9 @@ config LOONGARCH
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
+	select HAVE_ARCH_KASAN
+	select HAVE_ARCH_KFENCE
+	select HAVE_ARCH_KGDB if PERF_EVENTS
 	select HAVE_ARCH_MMAP_RND_BITS if MMU
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
@@ -115,6 +120,7 @@ config LOONGARCH
 	select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
+	select HAVE_GCC_PLUGINS
 	select HAVE_GENERIC_VDSO
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS
 	select HAVE_IOREMAP_PROT
@@ -254,6 +260,9 @@ config AS_HAS_LSX_EXTENSION
 config AS_HAS_LASX_EXTENSION
 	def_bool $(as-instr,xvld \$xr0$(comma)\$a0$(comma)0)
 
+config AS_HAS_LBT_EXTENSION
+	def_bool $(as-instr,movscr2gr \$a0$(comma)\$scr0)
+
 menu "Kernel type and options"
 
 source "kernel/Kconfig.hz"
@@ -534,6 +543,18 @@ config CPU_HAS_LASX
 
 	  If unsure, say Y.
 
+config CPU_HAS_LBT
+	bool "Support for the Loongson Binary Translation Extension"
+	depends on AS_HAS_LBT_EXTENSION
+	help
+	  Loongson Binary Translation (LBT) introduces 4 scratch registers (SCR0
+	  to SCR3), an x86/ARM eflags register (eflags) and an x87 FPU stack
+	  pointer (ftop). Enabling this option allows the kernel to allocate and
+	  switch these LBT-specific registers.
+
+	  If you want to use this feature, for example with the Loongson
+	  Architecture Translator (LAT), say Y.
+
 config CPU_HAS_PREFETCH
 	bool
 	default y
@@ -638,6 +659,11 @@ config ARCH_MMAP_RND_BITS_MAX
 config ARCH_SUPPORTS_UPROBES
 	def_bool y
 
+config KASAN_SHADOW_OFFSET
+	hex
+	default 0x0
+	depends on KASAN
+
 menu "Power management options"
 
 config ARCH_SUSPEND_POSSIBLE
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index ef87bab46754e63142761c54f688cd2850b9ebbd..fb0fada43197e4ad520359c8f2a6b6350d48d99d 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -84,7 +84,10 @@ LDFLAGS_vmlinux			+= -static -pie --no-dynamic-linker -z notext
 endif
 
 cflags-y += $(call cc-option, -mno-check-zero-division)
+
+ifndef CONFIG_KASAN
 cflags-y += -fno-builtin-memcpy -fno-builtin-memmove -fno-builtin-memset
+endif
 
 load-y		= 0x9000000000200000
 bootvars-y	= VMLINUX_LOAD_ADDRESS=$(load-y)
diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index d64849b4cba1634849e89f35b2a623c0610c39e9..a3b52aaa83b33634c5be146bc02d50589bf5f762 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -30,7 +30,6 @@ CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_SCHED_AUTOGROUP=y
-CONFIG_SYSFS_DEPRECATED=y
 CONFIG_RELAY=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_EXPERT=y
@@ -47,8 +46,12 @@ CONFIG_SMP=y
 CONFIG_HOTPLUG_CPU=y
 CONFIG_NR_CPUS=64
 CONFIG_NUMA=y
+CONFIG_CPU_HAS_FPU=y
+CONFIG_CPU_HAS_LSX=y
+CONFIG_CPU_HAS_LASX=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
+CONFIG_RANDOMIZE_BASE=y
 CONFIG_SUSPEND=y
 CONFIG_HIBERNATION=y
 CONFIG_ACPI=y
@@ -63,6 +66,7 @@ CONFIG_EFI_ZBOOT=y
 CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y
 CONFIG_EFI_CAPSULE_LOADER=m
 CONFIG_EFI_TEST=m
+CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
@@ -108,7 +112,12 @@ CONFIG_IP_PNP_BOOTP=y
 CONFIG_IP_PNP_RARP=y
 CONFIG_NET_IPIP=m
 CONFIG_NET_IPGRE_DEMUX=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
 CONFIG_IP_MROUTE=y
+CONFIG_IP_MROUTE_MULTIPLE_TABLES=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
 CONFIG_INET_ESP=m
 CONFIG_INET_UDP_DIAG=y
 CONFIG_TCP_CONG_ADVANCED=y
@@ -137,7 +146,6 @@ CONFIG_NFT_MASQ=m
 CONFIG_NFT_REDIR=m
 CONFIG_NFT_NAT=m
 CONFIG_NFT_TUNNEL=m
-CONFIG_NFT_OBJREF=m
 CONFIG_NFT_QUEUE=m
 CONFIG_NFT_QUOTA=m
 CONFIG_NFT_REJECT=m
@@ -208,7 +216,11 @@ CONFIG_IP_VS=m
 CONFIG_IP_VS_IPV6=y
 CONFIG_IP_VS_PROTO_TCP=y
 CONFIG_IP_VS_PROTO_UDP=y
+CONFIG_IP_VS_PROTO_ESP=y
+CONFIG_IP_VS_PROTO_AH=y
+CONFIG_IP_VS_PROTO_SCTP=y
 CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
 CONFIG_IP_VS_NFCT=y
 CONFIG_NF_TABLES_IPV4=y
 CONFIG_NFT_DUP_IPV4=m
@@ -227,7 +239,6 @@ CONFIG_IP_NF_TARGET_MASQUERADE=m
 CONFIG_IP_NF_TARGET_NETMAP=m
 CONFIG_IP_NF_TARGET_REDIRECT=m
 CONFIG_IP_NF_MANGLE=m
-CONFIG_IP_NF_TARGET_CLUSTERIP=m
 CONFIG_IP_NF_TARGET_ECN=m
 CONFIG_IP_NF_TARGET_TTL=m
 CONFIG_IP_NF_RAW=m
@@ -363,6 +374,8 @@ CONFIG_MTD_CFI_AMDSTD=m
 CONFIG_MTD_CFI_STAA=m
 CONFIG_MTD_RAM=m
 CONFIG_MTD_ROM=m
+CONFIG_MTD_UBI=m
+CONFIG_MTD_UBI_BLOCK=y
 CONFIG_PARPORT=y
 CONFIG_PARPORT_PC=y
 CONFIG_PARPORT_SERIAL=y
@@ -370,6 +383,7 @@ CONFIG_PARPORT_PC_FIFO=y
 CONFIG_ZRAM=m
 CONFIG_ZRAM_DEF_COMP_ZSTD=y
 CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
@@ -516,6 +530,8 @@ CONFIG_STMMAC_ETH=y
 # CONFIG_NET_VENDOR_TEHUTI is not set
 # CONFIG_NET_VENDOR_TI is not set
 # CONFIG_NET_VENDOR_VIA is not set
+CONFIG_NGBE=y
+CONFIG_TXGBE=y
 # CONFIG_NET_VENDOR_WIZNET is not set
 # CONFIG_NET_VENDOR_XILINX is not set
 CONFIG_PPP=m
@@ -602,9 +618,15 @@ CONFIG_HW_RANDOM_VIRTIO=m
 CONFIG_I2C_CHARDEV=y
 CONFIG_I2C_PIIX4=y
 CONFIG_I2C_GPIO=y
+CONFIG_I2C_LS2X=y
 CONFIG_SPI=y
+CONFIG_SPI_LOONGSON_PCI=m
+CONFIG_SPI_LOONGSON_PLATFORM=m
+CONFIG_PINCTRL=y
+CONFIG_PINCTRL_LOONGSON2=y
 CONFIG_GPIO_SYSFS=y
 CONFIG_GPIO_LOONGSON=y
+CONFIG_GPIO_LOONGSON_64BIT=y
 CONFIG_POWER_RESET=y
 CONFIG_POWER_RESET_RESTART=y
 CONFIG_POWER_RESET_SYSCON=y
@@ -614,6 +636,7 @@ CONFIG_SENSORS_LM75=m
 CONFIG_SENSORS_LM93=m
 CONFIG_SENSORS_W83795=m
 CONFIG_SENSORS_W83627HF=m
+CONFIG_LOONGSON2_THERMAL=m
 CONFIG_RC_CORE=m
 CONFIG_LIRC=y
 CONFIG_RC_DECODERS=y
@@ -643,6 +666,7 @@ CONFIG_DRM_AMDGPU_USERPTR=y
 CONFIG_DRM_AST=y
 CONFIG_DRM_QXL=m
 CONFIG_DRM_VIRTIO_GPU=m
+CONFIG_DRM_LOONGSON=y
 CONFIG_FB=y
 CONFIG_FB_EFI=y
 CONFIG_FB_RADEON=y
@@ -712,6 +736,7 @@ CONFIG_UCSI_ACPI=m
 CONFIG_INFINIBAND=m
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_EFI=y
+CONFIG_RTC_DRV_LOONGSON=y
 CONFIG_DMADEVICES=y
 CONFIG_UIO=m
 CONFIG_UIO_PDRV_GENIRQ=m
@@ -745,7 +770,9 @@ CONFIG_COMEDI_NI_LABPC_PCI=m
 CONFIG_COMEDI_NI_PCIDIO=m
 CONFIG_COMEDI_NI_PCIMIO=m
 CONFIG_STAGING=y
-CONFIG_R8188EU=m
+CONFIG_COMMON_CLK_LOONGSON2=y
+CONFIG_LOONGSON2_GUTS=y
+CONFIG_LOONGSON2_PM=y
 CONFIG_PM_DEVFREQ=y
 CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y
 CONFIG_DEVFREQ_GOV_PERFORMANCE=y
@@ -759,10 +786,17 @@ CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT3_FS=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_SECURITY=y
 CONFIG_XFS_FS=y
 CONFIG_XFS_QUOTA=y
 CONFIG_XFS_POSIX_ACL=y
+CONFIG_GFS2_FS=m
+CONFIG_GFS2_FS_LOCKING_DLM=y
+CONFIG_OCFS2_FS=m
 CONFIG_BTRFS_FS=y
+CONFIG_BTRFS_FS_POSIX_ACL=y
 CONFIG_FANOTIFY=y
 CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
 CONFIG_QUOTA=y
@@ -771,11 +805,14 @@ CONFIG_QFMT_V1=m
 CONFIG_QFMT_V2=m
 CONFIG_AUTOFS_FS=y
 CONFIG_FUSE_FS=m
+CONFIG_CUSE=m
+CONFIG_VIRTIO_FS=m
 CONFIG_OVERLAY_FS=y
 CONFIG_OVERLAY_FS_INDEX=y
 CONFIG_OVERLAY_FS_XINO_AUTO=y
 CONFIG_OVERLAY_FS_METACOPY=y
 CONFIG_FSCACHE=y
+CONFIG_CACHEFILES=m
 CONFIG_ISO9660_FS=y
 CONFIG_JOLIET=y
 CONFIG_ZISOFS=y
@@ -784,19 +821,42 @@ CONFIG_MSDOS_FS=m
 CONFIG_VFAT_FS=m
 CONFIG_FAT_DEFAULT_CODEPAGE=936
 CONFIG_FAT_DEFAULT_IOCHARSET="gb2312"
+CONFIG_EXFAT_FS=m
+CONFIG_NTFS3_FS=m
+CONFIG_NTFS3_64BIT_CLUSTER=y
+CONFIG_NTFS3_LZX_XPRESS=y
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_HUGETLBFS=y
 CONFIG_CONFIGFS_FS=y
+CONFIG_ORANGEFS_FS=m
+CONFIG_ECRYPT_FS=m
+CONFIG_ECRYPT_FS_MESSAGING=y
 CONFIG_HFS_FS=m
 CONFIG_HFSPLUS_FS=m
+CONFIG_UBIFS_FS=m
+CONFIG_UBIFS_FS_ADVANCED_COMPR=y
 CONFIG_CRAMFS=m
 CONFIG_SQUASHFS=y
 CONFIG_SQUASHFS_XATTR=y
 CONFIG_SQUASHFS_LZ4=y
 CONFIG_SQUASHFS_LZO=y
 CONFIG_SQUASHFS_XZ=y
+CONFIG_MINIX_FS=m
+CONFIG_ROMFS_FS=m
+CONFIG_PSTORE=m
+CONFIG_PSTORE_LZO_COMPRESS=m
+CONFIG_PSTORE_LZ4_COMPRESS=m
+CONFIG_PSTORE_LZ4HC_COMPRESS=m
+CONFIG_PSTORE_842_COMPRESS=y
+CONFIG_PSTORE_ZSTD_COMPRESS=y
+CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y
+CONFIG_SYSV_FS=m
+CONFIG_UFS_FS=m
+CONFIG_EROFS_FS=m
+CONFIG_EROFS_FS_ZIP_LZMA=y
+CONFIG_EROFS_FS_PCPU_KTHREAD=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3_ACL=y
 CONFIG_NFS_V4=y
@@ -807,6 +867,10 @@ CONFIG_NFSD=y
 CONFIG_NFSD_V3_ACL=y
 CONFIG_NFSD_V4=y
 CONFIG_NFSD_BLOCKLAYOUT=y
+CONFIG_CEPH_FS=m
+CONFIG_CEPH_FSCACHE=y
+CONFIG_CEPH_FS_POSIX_ACL=y
+CONFIG_CEPH_FS_SECURITY_LABEL=y
 CONFIG_CIFS=m
 # CONFIG_CIFS_DEBUG is not set
 CONFIG_9P_FS=y
@@ -814,6 +878,7 @@ CONFIG_NLS_CODEPAGE_437=y
 CONFIG_NLS_CODEPAGE_936=y
 CONFIG_NLS_ASCII=y
 CONFIG_NLS_UTF8=y
+CONFIG_DLM=m
 CONFIG_KEY_DH_OPERATIONS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_SELINUX=y
@@ -847,6 +912,7 @@ CONFIG_CRYPTO_USER_API_HASH=m
 CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
+CONFIG_CRYPTO_CRC32_LOONGARCH=m
 CONFIG_CRYPTO_DEV_VIRTIO=m
 CONFIG_PRINTK_TIME=y
 CONFIG_STRIP_ASM_SYMS=y
diff --git a/arch/loongarch/include/asm/asm-prototypes.h b/arch/loongarch/include/asm/asm-prototypes.h
index ed06d3997420833c173c4deab5697ff8749dedb8..cf8e1a4e7c19dad55c6adc9a56820b02e0fae977 100644
--- a/arch/loongarch/include/asm/asm-prototypes.h
+++ b/arch/loongarch/include/asm/asm-prototypes.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/uaccess.h>
 #include <asm/fpu.h>
+#include <asm/lbt.h>
 #include <asm/mmu_context.h>
 #include <asm/page.h>
 #include <asm/ftrace.h>
diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h
index 79e1d53fea89cb998f973ff557d20fb845968128..c9544f358c33991b46b67edec578af1df44939b7 100644
--- a/arch/loongarch/include/asm/asmmacro.h
+++ b/arch/loongarch/include/asm/asmmacro.h
@@ -10,113 +10,6 @@
 #include <asm/fpregdef.h>
 #include <asm/loongarch.h>
 
-	.macro	parse_v var val
-	\var	= \val
-	.endm
-
-	.macro	parse_r var r
-	\var	= -1
-	.ifc	\r, $r0
-	\var	= 0
-	.endif
-	.ifc	\r, $r1
-	\var	= 1
-	.endif
-	.ifc	\r, $r2
-	\var	= 2
-	.endif
-	.ifc	\r, $r3
-	\var	= 3
-	.endif
-	.ifc	\r, $r4
-	\var	= 4
-	.endif
-	.ifc	\r, $r5
-	\var	= 5
-	.endif
-	.ifc	\r, $r6
-	\var	= 6
-	.endif
-	.ifc	\r, $r7
-	\var	= 7
-	.endif
-	.ifc	\r, $r8
-	\var	= 8
-	.endif
-	.ifc	\r, $r9
-	\var	= 9
-	.endif
-	.ifc	\r, $r10
-	\var	= 10
-	.endif
-	.ifc	\r, $r11
-	\var	= 11
-	.endif
-	.ifc	\r, $r12
-	\var	= 12
-	.endif
-	.ifc	\r, $r13
-	\var	= 13
-	.endif
-	.ifc	\r, $r14
-	\var	= 14
-	.endif
-	.ifc	\r, $r15
-	\var	= 15
-	.endif
-	.ifc	\r, $r16
-	\var	= 16
-	.endif
-	.ifc	\r, $r17
-	\var	= 17
-	.endif
-	.ifc	\r, $r18
-	\var	= 18
-	.endif
-	.ifc	\r, $r19
-	\var	= 19
-	.endif
-	.ifc	\r, $r20
-	\var	= 20
-	.endif
-	.ifc	\r, $r21
-	\var	= 21
-	.endif
-	.ifc	\r, $r22
-	\var	= 22
-	.endif
-	.ifc	\r, $r23
-	\var	= 23
-	.endif
-	.ifc	\r, $r24
-	\var	= 24
-	.endif
-	.ifc	\r, $r25
-	\var	= 25
-	.endif
-	.ifc	\r, $r26
-	\var	= 26
-	.endif
-	.ifc	\r, $r27
-	\var	= 27
-	.endif
-	.ifc	\r, $r28
-	\var	= 28
-	.endif
-	.ifc	\r, $r29
-	\var	= 29
-	.endif
-	.ifc	\r, $r30
-	\var	= 30
-	.endif
-	.ifc	\r, $r31
-	\var	= 31
-	.endif
-	.iflt	\var
-	.error	"Unable to parse register name \r"
-	.endif
-	.endm
-
 	.macro	cpu_save_nonscratch thread
 	stptr.d	s0, \thread, THREAD_REG23
 	stptr.d	s1, \thread, THREAD_REG24
@@ -148,12 +41,51 @@
 
 	.macro fpu_save_csr thread tmp
 	movfcsr2gr	\tmp, fcsr0
-	stptr.w	\tmp, \thread, THREAD_FCSR
+	stptr.w		\tmp, \thread, THREAD_FCSR
+#ifdef CONFIG_CPU_HAS_LBT
+	/* TM bit is always 0 if LBT not supported */
+	andi		\tmp, \tmp, FPU_CSR_TM
+	beqz		\tmp, 1f
+	/* Save FTOP */
+	x86mftop	\tmp
+	stptr.w		\tmp, \thread, THREAD_FTOP
+	/* Turn off TM so that the FPR order in memory does not depend on TM */
+	x86clrtm
+1:
+#endif
 	.endm
 
-	.macro fpu_restore_csr thread tmp
-	ldptr.w	\tmp, \thread, THREAD_FCSR
-	movgr2fcsr	fcsr0, \tmp
+	.macro fpu_restore_csr thread tmp0 tmp1
+	ldptr.w		\tmp0, \thread, THREAD_FCSR
+	movgr2fcsr	fcsr0, \tmp0
+#ifdef CONFIG_CPU_HAS_LBT
+	/* TM bit is always 0 if LBT not supported */
+	andi		\tmp0, \tmp0, FPU_CSR_TM
+	beqz		\tmp0, 2f
+	/* Restore FTOP */
+	ldptr.w		\tmp0, \thread, THREAD_FTOP
+	andi		\tmp0, \tmp0, 0x7
+	la.pcrel	\tmp1, 1f
+	alsl.d		\tmp1, \tmp0, \tmp1, 3
+	jr		\tmp1
+1:
+	x86mttop	0
+	b	2f
+	x86mttop	1
+	b	2f
+	x86mttop	2
+	b	2f
+	x86mttop	3
+	b	2f
+	x86mttop	4
+	b	2f
+	x86mttop	5
+	b	2f
+	x86mttop	6
+	b	2f
+	x86mttop	7
+2:
+#endif
 	.endm
 
 	.macro fpu_save_cc thread tmp0 tmp1
@@ -353,7 +285,7 @@
 	.macro	lsx_restore_all	thread tmp0 tmp1
 	lsx_restore_data	\thread, \tmp0
 	fpu_restore_cc		\thread, \tmp0, \tmp1
-	fpu_restore_csr		\thread, \tmp0
+	fpu_restore_csr		\thread, \tmp0, \tmp1
 	.endm
 
 	.macro	lsx_save_upper vd base tmp off
@@ -563,7 +495,7 @@
 	.macro	lasx_restore_all thread tmp0 tmp1
 	lasx_restore_data	\thread, \tmp0
 	fpu_restore_cc		\thread, \tmp0, \tmp1
-	fpu_restore_csr		\thread, \tmp0
+	fpu_restore_csr		\thread, \tmp0, \tmp1
 	.endm
 
 	.macro	lasx_save_upper xd base tmp off
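Note on the FTOP restore added to fpu_restore_csr above: the saved ftop value indexes a table of eight x86mttop/branch pairs (8 bytes per entry, hence the shift by 3 in alsl.d), since x86mttop appears to take only an immediate operand. A rough C analogue of that dispatch, for illustration only (set_ftop() is a hypothetical stand-in, not a real kernel helper):

/* Illustration only: C analogue of the FTOP-restore jump table. */
#define set_ftop(n)	do { /* would execute "x86mttop n" here */ } while (0)

static inline void restore_ftop(unsigned int ftop)
{
	switch (ftop & 0x7) {	/* FTOP is the 3-bit x87 stack-top pointer */
	case 0: set_ftop(0); break;
	case 1: set_ftop(1); break;
	case 2: set_ftop(2); break;
	case 3: set_ftop(3); break;
	case 4: set_ftop(4); break;
	case 5: set_ftop(5); break;
	case 6: set_ftop(6); break;
	case 7: set_ftop(7); break;
	}
}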
diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h
new file mode 100644
index 0000000000000000000000000000000000000000..deeff8158f45ce1df6bdb0c0169cd31ff8132240
--- /dev/null
+++ b/arch/loongarch/include/asm/kasan.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_KASAN_H
+#define __ASM_KASAN_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/linkage.h>
+#include <linux/mmzone.h>
+#include <asm/addrspace.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+
+#define __HAVE_ARCH_SHADOW_MAP
+
+#define KASAN_SHADOW_SCALE_SHIFT 3
+#define KASAN_SHADOW_OFFSET	_AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+
+#define XRANGE_SHIFT (48)
+
+/* Number of valid address bits */
+#define XRANGE_SHADOW_SHIFT	(PGDIR_SHIFT + PAGE_SHIFT - 3)
+/* Mask used to extract the valid address bits */
+#define XRANGE_SHADOW_MASK	GENMASK_ULL(XRANGE_SHADOW_SHIFT - 1, 0)
+/* Size of one segment's whole address space */
+#define XRANGE_SIZE		(XRANGE_SHADOW_MASK + 1)
+
+/* 64-bit segment value. */
+#define XKPRANGE_UC_SEG		(0x8000)
+#define XKPRANGE_CC_SEG		(0x9000)
+#define XKVRANGE_VC_SEG		(0xffff)
+
+/* Cached */
+#define XKPRANGE_CC_START		CACHE_BASE
+#define XKPRANGE_CC_SIZE		XRANGE_SIZE
+#define XKPRANGE_CC_KASAN_OFFSET	(0)
+#define XKPRANGE_CC_SHADOW_SIZE		(XKPRANGE_CC_SIZE >> KASAN_SHADOW_SCALE_SHIFT)
+#define XKPRANGE_CC_SHADOW_END		(XKPRANGE_CC_KASAN_OFFSET + XKPRANGE_CC_SHADOW_SIZE)
+
+/* UnCached */
+#define XKPRANGE_UC_START		UNCACHE_BASE
+#define XKPRANGE_UC_SIZE		XRANGE_SIZE
+#define XKPRANGE_UC_KASAN_OFFSET	XKPRANGE_CC_SHADOW_END
+#define XKPRANGE_UC_SHADOW_SIZE		(XKPRANGE_UC_SIZE >> KASAN_SHADOW_SCALE_SHIFT)
+#define XKPRANGE_UC_SHADOW_END		(XKPRANGE_UC_KASAN_OFFSET + XKPRANGE_UC_SHADOW_SIZE)
+
+/* VMALLOC (Cached or UnCached)  */
+#define XKVRANGE_VC_START		MODULES_VADDR
+#define XKVRANGE_VC_SIZE		round_up(KFENCE_AREA_END - MODULES_VADDR + 1, PGDIR_SIZE)
+#define XKVRANGE_VC_KASAN_OFFSET	XKPRANGE_UC_SHADOW_END
+#define XKVRANGE_VC_SHADOW_SIZE		(XKVRANGE_VC_SIZE >> KASAN_SHADOW_SCALE_SHIFT)
+#define XKVRANGE_VC_SHADOW_END		(XKVRANGE_VC_KASAN_OFFSET + XKVRANGE_VC_SHADOW_SIZE)
+
+/* KASAN shadow memory starts right after the vmalloc area. */
+#define KASAN_SHADOW_START		round_up(KFENCE_AREA_END, PGDIR_SIZE)
+#define KASAN_SHADOW_SIZE		(XKVRANGE_VC_SHADOW_END - XKPRANGE_CC_KASAN_OFFSET)
+#define KASAN_SHADOW_END		round_up(KASAN_SHADOW_START + KASAN_SHADOW_SIZE, PGDIR_SIZE)
+
+#define XKPRANGE_CC_SHADOW_OFFSET	(KASAN_SHADOW_START + XKPRANGE_CC_KASAN_OFFSET)
+#define XKPRANGE_UC_SHADOW_OFFSET	(KASAN_SHADOW_START + XKPRANGE_UC_KASAN_OFFSET)
+#define XKVRANGE_VC_SHADOW_OFFSET	(KASAN_SHADOW_START + XKVRANGE_VC_KASAN_OFFSET)
+
+extern bool kasan_early_stage;
+extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
+
+#define kasan_arch_is_ready kasan_arch_is_ready
+static __always_inline bool kasan_arch_is_ready(void)
+{
+	return !kasan_early_stage;
+}
+
+static inline void *kasan_mem_to_shadow(const void *addr)
+{
+	if (!kasan_arch_is_ready()) {
+		return (void *)(kasan_early_shadow_page);
+	} else {
+		unsigned long maddr = (unsigned long)addr;
+		unsigned long xrange = (maddr >> XRANGE_SHIFT) & 0xffff;
+		unsigned long offset = 0;
+
+		maddr &= XRANGE_SHADOW_MASK;
+		switch (xrange) {
+		case XKPRANGE_CC_SEG:
+			offset = XKPRANGE_CC_SHADOW_OFFSET;
+			break;
+		case XKPRANGE_UC_SEG:
+			offset = XKPRANGE_UC_SHADOW_OFFSET;
+			break;
+		case XKVRANGE_VC_SEG:
+			offset = XKVRANGE_VC_SHADOW_OFFSET;
+			break;
+		default:
+			WARN_ON(1);
+			return NULL;
+		}
+
+		return (void *)((maddr >> KASAN_SHADOW_SCALE_SHIFT) + offset);
+	}
+}
+
+static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
+{
+	unsigned long addr = (unsigned long)shadow_addr;
+
+	if (unlikely(addr > KASAN_SHADOW_END) ||
+		unlikely(addr < KASAN_SHADOW_START)) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (addr >= XKVRANGE_VC_SHADOW_OFFSET)
+		return (void *)(((addr - XKVRANGE_VC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKVRANGE_VC_START);
+	else if (addr >= XKPRANGE_UC_SHADOW_OFFSET)
+		return (void *)(((addr - XKPRANGE_UC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKPRANGE_UC_START);
+	else if (addr >= XKPRANGE_CC_SHADOW_OFFSET)
+		return (void *)(((addr - XKPRANGE_CC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKPRANGE_CC_START);
+	else {
+		WARN_ON(1);
+		return NULL;
+	}
+}
+
+void kasan_init(void);
+asmlinkage void kasan_early_init(void);
+
+#endif
+#endif
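A quick worked example of the mapping defined above (illustration only, not part of the patch): because XKPRANGE_CC_KASAN_OFFSET is 0, cached-window addresses shadow to the very start of the shadow region.

/*
 * Illustration only: shadow address of a cached XKPRANGE access.
 *
 *   addr   = CACHE_BASE + 0x1000         (segment nibble 0x9000, offset 0x1000)
 *   shadow = XKPRANGE_CC_SHADOW_OFFSET + (0x1000 >> KASAN_SHADOW_SCALE_SHIFT)
 *          = KASAN_SHADOW_START + 0x200  (one shadow byte covers 8 bytes)
 *
 * which matches the XKPRANGE_CC_SEG case of kasan_mem_to_shadow() above.
 */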
diff --git a/arch/loongarch/include/asm/kfence.h b/arch/loongarch/include/asm/kfence.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c82aea1c99398c46484a77cc28da1316799affb
--- /dev/null
+++ b/arch/loongarch/include/asm/kfence.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * KFENCE support for LoongArch.
+ *
+ * Author: Enze Li <lienze@kylinos.cn>
+ * Copyright (C) 2022-2023 KylinSoft Corporation.
+ */
+
+#ifndef _ASM_LOONGARCH_KFENCE_H
+#define _ASM_LOONGARCH_KFENCE_H
+
+#include <linux/kfence.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+
+static inline bool arch_kfence_init_pool(void)
+{
+	int err;
+	char *kfence_pool = __kfence_pool;
+	struct vm_struct *area;
+
+	area = __get_vm_area_caller(KFENCE_POOL_SIZE, VM_IOREMAP,
+				    KFENCE_AREA_START, KFENCE_AREA_END,
+				    __builtin_return_address(0));
+	if (!area)
+		return false;
+
+	__kfence_pool = (char *)area->addr;
+	err = ioremap_page_range((unsigned long)__kfence_pool,
+				 (unsigned long)__kfence_pool + KFENCE_POOL_SIZE,
+				 virt_to_phys((void *)kfence_pool), PAGE_KERNEL);
+	if (err) {
+		free_vm_area(area);
+		__kfence_pool = kfence_pool;
+		return false;
+	}
+
+	return true;
+}
+
+/* Protect the given page and flush TLB. */
+static inline bool kfence_protect_page(unsigned long addr, bool protect)
+{
+	pte_t *pte = virt_to_kpte(addr);
+
+	if (WARN_ON(!pte) || pte_none(*pte))
+		return false;
+
+	if (protect)
+		set_pte(pte, __pte(pte_val(*pte) & ~(_PAGE_VALID | _PAGE_PRESENT)));
+	else
+		set_pte(pte, __pte(pte_val(*pte) | (_PAGE_VALID | _PAGE_PRESENT)));
+
+	preempt_disable();
+	local_flush_tlb_one(addr);
+	preempt_enable();
+
+	return true;
+}
+
+#endif /* _ASM_LOONGARCH_KFENCE_H */
diff --git a/arch/loongarch/include/asm/kgdb.h b/arch/loongarch/include/asm/kgdb.h
new file mode 100644
index 0000000000000000000000000000000000000000..2041ae58b161576d8d9bf5e715f8331b8c93c243
--- /dev/null
+++ b/arch/loongarch/include/asm/kgdb.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ */
+
+#ifndef _ASM_LOONGARCH_KGDB_H
+#define _ASM_LOONGARCH_KGDB_H
+
+#define GDB_SIZEOF_REG		sizeof(u64)
+
+/* gdb remote protocol expects the following register layout. */
+
+/*
+ * General purpose registers:
+ *     r0-r31: 64 bit
+ *     orig_a0: 64 bit
+ *     pc : 64 bit
+ *     csr_badvaddr: 64 bit
+ */
+#define DBG_PT_REGS_BASE	0
+#define DBG_PT_REGS_NUM		35
+#define DBG_PT_REGS_END		(DBG_PT_REGS_BASE + DBG_PT_REGS_NUM - 1)
+
+/*
+ * Floating point registers:
+ *     f0-f31: 64 bit
+ */
+#define DBG_FPR_BASE		(DBG_PT_REGS_END + 1)
+#define DBG_FPR_NUM		32
+#define DBG_FPR_END		(DBG_FPR_BASE + DBG_FPR_NUM - 1)
+
+/*
+ * Condition Flag registers:
+ *     fcc0-fcc7: 8 bit
+ */
+#define DBG_FCC_BASE		(DBG_FPR_END + 1)
+#define DBG_FCC_NUM		8
+#define DBG_FCC_END		(DBG_FCC_BASE + DBG_FCC_NUM - 1)
+
+/*
+ * Floating-point Control and Status registers:
+ *     fcsr: 32 bit
+ */
+#define DBG_FCSR_NUM		1
+#define DBG_FCSR		(DBG_FCC_END + 1)
+
+#define DBG_MAX_REG_NUM		(DBG_FCSR + 1)
+
+/*
+ * Size of the I/O buffer for gdb packets,
+ * chosen to be large enough to hold all register contents.
+ */
+#define BUFMAX			2048
+
+/*
+ * Number of bytes required for gdb_regs buffer.
+ * PT_REGS and FPR: 8 bytes each; FCSR: 4 bytes; FCC: 1 byte each.
+ * GDB fails to connect for sizes beyond this with the error
+ * "'g' packet reply is too long".
+ */
+#define NUMREGBYTES		((DBG_PT_REGS_NUM + DBG_FPR_NUM) * GDB_SIZEOF_REG + DBG_FCC_NUM * 1 + DBG_FCSR_NUM * 4)
+
+#define BREAK_INSTR_SIZE	4
+#define CACHE_FLUSH_IS_SAFE	0
+
+/* Register numbers of various important registers. */
+enum dbg_loongarch_regnum {
+	DBG_LOONGARCH_ZERO = 0,
+	DBG_LOONGARCH_RA,
+	DBG_LOONGARCH_TP,
+	DBG_LOONGARCH_SP,
+	DBG_LOONGARCH_A0,
+	DBG_LOONGARCH_FP = 22,
+	DBG_LOONGARCH_S0,
+	DBG_LOONGARCH_S1,
+	DBG_LOONGARCH_S2,
+	DBG_LOONGARCH_S3,
+	DBG_LOONGARCH_S4,
+	DBG_LOONGARCH_S5,
+	DBG_LOONGARCH_S6,
+	DBG_LOONGARCH_S7,
+	DBG_LOONGARCH_S8,
+	DBG_LOONGARCH_ORIG_A0,
+	DBG_LOONGARCH_PC,
+	DBG_LOONGARCH_BADV
+};
+
+void kgdb_breakinst(void);
+void arch_kgdb_breakpoint(void);
+
+#ifdef CONFIG_KGDB
+bool kgdb_breakpoint_handler(struct pt_regs *regs);
+#else /* !CONFIG_KGDB */
+static inline bool kgdb_breakpoint_handler(struct pt_regs *regs) { return false; }
+#endif /* CONFIG_KGDB */
+
+#endif /* _ASM_LOONGARCH_KGDB_H */
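As a quick check of the sizing above (illustration only): 35 pt_regs entries and 32 FPRs at 8 bytes each, 8 one-byte fcc flags and one 4-byte fcsr add up to 548 bytes, and the hex-encoded 'g' reply (two characters per byte) fits comfortably in BUFMAX.

/* Illustration only: the arithmetic behind NUMREGBYTES and BUFMAX. */
_Static_assert((35 + 32) * 8 + 8 * 1 + 1 * 4 == 548, "gdb_regs payload is 548 bytes");
_Static_assert(548 * 2 < 2048, "hex-encoded 'g' reply fits in BUFMAX");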
diff --git a/arch/loongarch/include/asm/lbt.h b/arch/loongarch/include/asm/lbt.h
new file mode 100644
index 0000000000000000000000000000000000000000..e671978bf5523fba8b8f7d9da0657baed66ecd8f
--- /dev/null
+++ b/arch/loongarch/include/asm/lbt.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Author: Qi Hu <huqi@loongson.cn>
+ *         Huacai Chen <chenhuacai@loongson.cn>
+ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
+ */
+#ifndef _ASM_LBT_H
+#define _ASM_LBT_H
+
+#include <asm/cpu.h>
+#include <asm/current.h>
+#include <asm/loongarch.h>
+#include <asm/processor.h>
+
+extern void _init_lbt(void);
+extern void _save_lbt(struct loongarch_lbt *);
+extern void _restore_lbt(struct loongarch_lbt *);
+
+static inline int is_lbt_enabled(void)
+{
+	if (!cpu_has_lbt)
+		return 0;
+
+	return (csr_read32(LOONGARCH_CSR_EUEN) & CSR_EUEN_LBTEN) ?
+		1 : 0;
+}
+
+static inline int is_lbt_owner(void)
+{
+	return test_thread_flag(TIF_USEDLBT);
+}
+
+#ifdef CONFIG_CPU_HAS_LBT
+
+static inline void enable_lbt(void)
+{
+	if (cpu_has_lbt)
+		csr_xchg32(CSR_EUEN_LBTEN, CSR_EUEN_LBTEN, LOONGARCH_CSR_EUEN);
+}
+
+static inline void disable_lbt(void)
+{
+	if (cpu_has_lbt)
+		csr_xchg32(0, CSR_EUEN_LBTEN, LOONGARCH_CSR_EUEN);
+}
+
+static inline void __own_lbt(void)
+{
+	enable_lbt();
+	set_thread_flag(TIF_USEDLBT);
+	KSTK_EUEN(current) |= CSR_EUEN_LBTEN;
+}
+
+static inline void own_lbt_inatomic(int restore)
+{
+	if (cpu_has_lbt && !is_lbt_owner()) {
+		__own_lbt();
+		if (restore)
+			_restore_lbt(&current->thread.lbt);
+	}
+}
+
+static inline void own_lbt(int restore)
+{
+	preempt_disable();
+	own_lbt_inatomic(restore);
+	preempt_enable();
+}
+
+static inline void lose_lbt_inatomic(int save, struct task_struct *tsk)
+{
+	if (cpu_has_lbt && is_lbt_owner()) {
+		if (save)
+			_save_lbt(&tsk->thread.lbt);
+
+		disable_lbt();
+		clear_tsk_thread_flag(tsk, TIF_USEDLBT);
+	}
+	KSTK_EUEN(tsk) &= ~(CSR_EUEN_LBTEN);
+}
+
+static inline void lose_lbt(int save)
+{
+	preempt_disable();
+	lose_lbt_inatomic(save, current);
+	preempt_enable();
+}
+
+static inline void init_lbt(void)
+{
+	__own_lbt();
+	_init_lbt();
+}
+#else
+static inline void own_lbt_inatomic(int restore) {}
+static inline void lose_lbt_inatomic(int save, struct task_struct *tsk) {}
+static inline void init_lbt(void) {}
+static inline void lose_lbt(int save) {}
+#endif
+
+static inline int thread_lbt_context_live(void)
+{
+	if (!cpu_has_lbt)
+		return 0;
+
+	return test_thread_flag(TIF_LBT_CTX_LIVE);
+}
+
+#endif /* _ASM_LBT_H */
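For context, the helpers above follow the lazy-FPU pattern: LBT stays disabled until a task actually uses it, at which point a handler takes ownership and either initializes or restores the context. A rough sketch of that pattern, assuming a hypothetical handler name (the real exception handler is not part of this hunk):

/* Sketch only: lazy LBT ownership on first use, mirroring the FPU pattern. */
static void handle_lbt_unavailable(void)
{
	if (!thread_lbt_context_live()) {
		/* First use in this task: enable LBT and start from a clean state. */
		init_lbt();
		set_thread_flag(TIF_LBT_CTX_LIVE);
	} else {
		/* Context exists but ownership was lost on a switch: re-own and restore. */
		own_lbt_inatomic(1);
	}
}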
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 10748a20a2ab5593f5a19b5b2cbc5dba8ebdda2b..33531d432b492d201f98ae64b941255a94355f91 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -12,49 +12,6 @@
 #ifndef __ASSEMBLY__
 #include <larchintrin.h>
 
-/*
- * parse_r var, r - Helper assembler macro for parsing register names.
- *
- * This converts the register name in $n form provided in \r to the
- * corresponding register number, which is assigned to the variable \var. It is
- * needed to allow explicit encoding of instructions in inline assembly where
- * registers are chosen by the compiler in $n form, allowing us to avoid using
- * fixed register numbers.
- *
- * It also allows newer instructions (not implemented by the assembler) to be
- * transparently implemented using assembler macros, instead of needing separate
- * cases depending on toolchain support.
- *
- * Simple usage example:
- * __asm__ __volatile__("parse_r addr, %0\n\t"
- *			"#invtlb op, 0, %0\n\t"
- *			".word ((0x6498000) | (addr << 10) | (0 << 5) | op)"
- *			: "=r" (status);
- */
-
-/* Match an individual register number and assign to \var */
-#define _IFC_REG(n)				\
-	".ifc	\\r, $r" #n "\n\t"		\
-	"\\var	= " #n "\n\t"			\
-	".endif\n\t"
-
-__asm__(".macro	parse_r var r\n\t"
-	"\\var	= -1\n\t"
-	_IFC_REG(0)  _IFC_REG(1)  _IFC_REG(2)  _IFC_REG(3)
-	_IFC_REG(4)  _IFC_REG(5)  _IFC_REG(6)  _IFC_REG(7)
-	_IFC_REG(8)  _IFC_REG(9)  _IFC_REG(10) _IFC_REG(11)
-	_IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15)
-	_IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19)
-	_IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23)
-	_IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27)
-	_IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31)
-	".iflt	\\var\n\t"
-	".error	\"Unable to parse register name \\r\"\n\t"
-	".endif\n\t"
-	".endm");
-
-#undef _IFC_REG
-
 /* CPUCFG */
 #define read_cpucfg(reg) __cpucfg(reg)
 
@@ -1453,6 +1410,10 @@ __BUILD_CSR_OP(tlbidx)
 #define FPU_CSR_RU	0x200	/* towards +Infinity */
 #define FPU_CSR_RD	0x300	/* towards -Infinity */
 
+/* Bit 6 of the FPU Control and Status Register (FCSR) specifies the LBT TOP simulation mode */
+#define FPU_CSR_TM_SHIFT	0x6
+#define FPU_CSR_TM		(_ULCAST_(1) << FPU_CSR_TM_SHIFT)
+
 #define read_fcsr(source)	\
 ({	\
 	unsigned int __res;	\
diff --git a/arch/loongarch/include/asm/mmzone.h b/arch/loongarch/include/asm/mmzone.h
index fe67d0b4b33ddce60d9b28e72b1c942b6bdca91c..2b9a90727e191224875e23f041dc3f454a2ed540 100644
--- a/arch/loongarch/include/asm/mmzone.h
+++ b/arch/loongarch/include/asm/mmzone.h
@@ -13,6 +13,4 @@ extern struct pglist_data *node_data[];
 
 #define NODE_DATA(nid)	(node_data[(nid)])
 
-extern void setup_zero_pages(void);
-
 #endif /* _ASM_MMZONE_H_ */
diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h
index 26e8dccb661909c2f306f2f002c730abdddaf5af..63f137ce82a41fc4e251f2adceef441e72dcd691 100644
--- a/arch/loongarch/include/asm/page.h
+++ b/arch/loongarch/include/asm/page.h
@@ -84,7 +84,12 @@ typedef struct { unsigned long pgprot; } pgprot_t;
 #define sym_to_pfn(x)		__phys_to_pfn(__pa_symbol(x))
 
 #define virt_to_pfn(kaddr)	PFN_DOWN(PHYSADDR(kaddr))
-#define virt_to_page(kaddr)	pfn_to_page(virt_to_pfn(kaddr))
+
+#define virt_to_page(kaddr)								\
+({											\
+	(likely((unsigned long)kaddr < vm_map_base)) ?					\
+	dmw_virt_to_page((unsigned long)kaddr) : tlb_virt_to_page((unsigned long)kaddr);\
+})
 
 extern int __virt_addr_valid(volatile void *kaddr);
 #define virt_addr_valid(kaddr)	__virt_addr_valid((volatile void *)(kaddr))
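The two-way virt_to_page() above distinguishes direct-mapped-window (DMW) addresses, which sit below vm_map_base and can be converted arithmetically, from TLB-mapped ones (vmalloc, modules, KFENCE), which need a page-table walk. A minimal sketch of what the TLB-mapped half presumably does (the helpers are only declared in pgtable.h below; their implementation lives elsewhere in the tree, not in this hunk):

/* Sketch only: resolving a TLB-mapped kernel address through the page tables. */
#include <linux/pgtable.h>	/* virt_to_kpte() */
#include <asm/pgtable.h>	/* pte_page()     */

struct page *tlb_virt_to_page(unsigned long kaddr)
{
	/* Walk the kernel page tables down to the PTE and take its page. */
	return pte_page(*virt_to_kpte(kaddr));
}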
diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h
index 23f5b1107246a3cb5e0db4ced632e9e6aff8bcfe..79470f0b4f1d8dfbd715e4d1dc2e711949175b30 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -94,4 +94,5 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 
 #endif /* __PAGETABLE_PUD_FOLDED */
 
+extern pte_t * __init populate_kernel_pte(unsigned long addr);
 #endif /* _ASM_PGALLOC_H */
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index 06963a172319da14c298cbeaca2c1f0beb80a5e8..29d9b12298bc843ecf24012d37f9042079d939e5 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -70,12 +70,9 @@ struct vm_area_struct;
  * for zero-mapped memory areas etc..
  */
 
-extern unsigned long empty_zero_page;
-extern unsigned long zero_page_mask;
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 
-#define ZERO_PAGE(vaddr) \
-	(virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask))))
-#define __HAVE_COLOR_ZERO_PAGE
+#define ZERO_PAGE(vaddr)	virt_to_page(empty_zero_page)
 
 /*
  * TLB refill handlers may also map the vmalloc area into xkvrange.
@@ -85,14 +82,30 @@ extern unsigned long zero_page_mask;
 #define MODULES_VADDR	(vm_map_base + PCI_IOSIZE + (2 * PAGE_SIZE))
 #define MODULES_END	(MODULES_VADDR + SZ_256M)
 
+#ifdef CONFIG_KFENCE
+#define KFENCE_AREA_SIZE	(((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 + 2) * PAGE_SIZE)
+#else
+#define KFENCE_AREA_SIZE	0
+#endif
+
 #define VMALLOC_START	MODULES_END
+
+#ifndef CONFIG_KASAN
 #define VMALLOC_END	\
 	(vm_map_base +	\
-	 min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE)
+	 min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE - KFENCE_AREA_SIZE)
+#else
+#define VMALLOC_END	\
+	(vm_map_base +	\
+	 min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits) / 2) - PMD_SIZE - VMEMMAP_SIZE - KFENCE_AREA_SIZE)
+#endif
 
 #define vmemmap		((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK))
 #define VMEMMAP_END	((unsigned long)vmemmap + VMEMMAP_SIZE - 1)
 
+#define KFENCE_AREA_START	(VMEMMAP_END + 1)
+#define KFENCE_AREA_END		(KFENCE_AREA_START + KFENCE_AREA_SIZE - 1)
+
 #define pte_ERROR(e) \
 	pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -350,6 +363,9 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt
 extern pgd_t swapper_pg_dir[];
 extern pgd_t invalid_pg_dir[];
 
+struct page *dmw_virt_to_page(unsigned long kaddr);
+struct page *tlb_virt_to_page(unsigned long kaddr);
+
 /*
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
@@ -596,6 +612,9 @@ static inline long pmd_protnone(pmd_t pmd)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+#define pmd_leaf(pmd)		((pmd_val(pmd) & _PAGE_HUGE) != 0)
+#define pud_leaf(pud)		((pud_val(pud) & _PAGE_HUGE) != 0)
+
 /*
  * We provide our own get_unmapped area to cope with the virtual aliasing
  * constraints placed on us by the cache architecture.
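For orientation, the constants above stack up like this from low to high addresses (illustrative summary only):

	MODULES_VADDR      .. MODULES_END        modules (256 MiB), start of the xkvrange layout
	VMALLOC_START      .. VMALLOC_END        vmalloc area (vabits cap halved when KASAN is enabled)
	vmemmap            .. VMEMMAP_END        struct page array for sparse vmemmap
	KFENCE_AREA_START  .. KFENCE_AREA_END    KFENCE pool mapping (size 0 without KFENCE)
	KASAN_SHADOW_START .. KASAN_SHADOW_END   shadow memory (from asm/kasan.h, KASAN only)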
diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h
index 636e1c66398c17c7bfb930b5b225bd241f831a45..c3bc44b5f5b30b2d43b901da6c339c26594e08fc 100644
--- a/arch/loongarch/include/asm/processor.h
+++ b/arch/loongarch/include/asm/processor.h
@@ -80,11 +80,22 @@ BUILD_FPR_ACCESS(32)
 BUILD_FPR_ACCESS(64)
 
 struct loongarch_fpu {
-	unsigned int	fcsr;
 	uint64_t	fcc;	/* 8x8 */
+	uint32_t	fcsr;
+	uint32_t	ftop;
 	union fpureg	fpr[NUM_FPU_REGS];
 };
 
+struct loongarch_lbt {
+	/* Scratch registers */
+	unsigned long scr0;
+	unsigned long scr1;
+	unsigned long scr2;
+	unsigned long scr3;
+	/* Eflags register */
+	unsigned long eflags;
+};
+
 #define INIT_CPUMASK { \
 	{0,} \
 }
@@ -113,15 +124,6 @@ struct thread_struct {
 	unsigned long csr_ecfg;
 	unsigned long csr_badvaddr;	/* Last user fault */
 
-	/* Scratch registers */
-	unsigned long scr0;
-	unsigned long scr1;
-	unsigned long scr2;
-	unsigned long scr3;
-
-	/* Eflags register */
-	unsigned long eflags;
-
 	/* Other stuff associated with the thread. */
 	unsigned long trap_nr;
 	unsigned long error_code;
@@ -133,6 +135,7 @@ struct thread_struct {
 	 * context because they are conditionally copied at fork().
 	 */
 	struct loongarch_fpu fpu FPU_ALIGN;
+	struct loongarch_lbt lbt; /* Also conditionally copied */
 
 	/* Hardware breakpoints pinned to this task. */
 	struct perf_event *hbp_break[LOONGARCH_MAX_BRP];
@@ -174,8 +177,9 @@ struct thread_struct {
 	 * FPU & vector registers				\
 	 */							\
 	.fpu			= {				\
-		.fcsr		= 0,				\
 		.fcc		= 0,				\
+		.fcsr		= 0,				\
+		.ftop		= 0,				\
 		.fpr		= {{{0,},},},			\
 	},							\
 	.hbp_break		= {0},				\
diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h
index be05c0e706a2e23d1e5c82859f02be62c82ce0b9..a0bc159ce8bdc0348defe27953c07d973dd58b5f 100644
--- a/arch/loongarch/include/asm/setup.h
+++ b/arch/loongarch/include/asm/setup.h
@@ -7,6 +7,7 @@
 #define _LOONGARCH_SETUP_H
 
 #include <linux/types.h>
+#include <asm/sections.h>
 #include <uapi/asm/setup.h>
 
 #define VECSIZE 0x200
@@ -33,8 +34,13 @@ extern long __la_abs_end;
 extern long __rela_dyn_begin;
 extern long __rela_dyn_end;
 
-extern void * __init relocate_kernel(void);
+extern unsigned long __init relocate_kernel(void);
 
 #endif
 
+static inline unsigned long kaslr_offset(void)
+{
+	return (unsigned long)&_text - VMLINUX_LOAD_ADDRESS;
+}
+
 #endif /* __SETUP_H */
diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h
index 7df80e6ae9d2c8d76af305b7478bd52776ed380a..4fb1e6408b982aec130c75413e04bb6d2fb6c52c 100644
--- a/arch/loongarch/include/asm/stackframe.h
+++ b/arch/loongarch/include/asm/stackframe.h
@@ -158,6 +158,10 @@
 	cfi_st  u0, PT_R21, \docfi
 	csrrd	u0, PERCPU_BASE_KS
 9:
+#ifdef CONFIG_KGDB
+	li.w	t0, CSR_CRMD_WE
+	csrxchg	t0, t0, LOONGARCH_CSR_CRMD
+#endif
 	.endm
 
 	.macro	SAVE_ALL docfi=0
diff --git a/arch/loongarch/include/asm/string.h b/arch/loongarch/include/asm/string.h
index 7b29cc9c70aa617049a3790f8cff9632ff100a66..5bb5a90d26815133e46a9c8ca2f304350acac18e 100644
--- a/arch/loongarch/include/asm/string.h
+++ b/arch/loongarch/include/asm/string.h
@@ -7,11 +7,31 @@
 
 #define __HAVE_ARCH_MEMSET
 extern void *memset(void *__s, int __c, size_t __count);
+extern void *__memset(void *__s, int __c, size_t __count);
 
 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *__to, __const__ void *__from, size_t __n);
+extern void *__memcpy(void *__to, __const__ void *__from, size_t __n);
 
 #define __HAVE_ARCH_MEMMOVE
 extern void *memmove(void *__dest, __const__ void *__src, size_t __n);
+extern void *__memmove(void *__dest, __const__ void *__src, size_t __n);
+
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that are not instrumented (e.g. mm/slub.c) we
+ * should use the non-instrumented versions of the mem* functions.
+ */
+
+#define memset(s, c, n) __memset(s, c, n)
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+
+#ifndef __NO_FORTIFY
+#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
+#endif
+
+#endif
 
 #endif /* _ASM_STRING_H */
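In practice the aliasing above means that a file excluded from instrumentation (for example via a KASAN_SANITIZE_foo.o := n make rule, as done for several kernel/ objects in this patch) transparently ends up calling the uninstrumented routines. A minimal sketch, assuming a hypothetical foo.c built without __SANITIZE_ADDRESS__:

/* foo.c: built with KASAN enabled but with instrumentation disabled for this file. */
#include <linux/string.h>

static char scratch[64];

void foo_reset(void)
{
	/* Expands to __memset(scratch, 0, sizeof(scratch)): no shadow checks here. */
	memset(scratch, 0, sizeof(scratch));
}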
diff --git a/arch/loongarch/include/asm/switch_to.h b/arch/loongarch/include/asm/switch_to.h
index 24e3094bebab166c046264280ef9b5929b350377..5b225aff3ba21aa06d0713bc8e73e1b941389630 100644
--- a/arch/loongarch/include/asm/switch_to.h
+++ b/arch/loongarch/include/asm/switch_to.h
@@ -7,6 +7,7 @@
 
 #include <asm/cpu-features.h>
 #include <asm/fpu.h>
+#include <asm/lbt.h>
 
 struct task_struct;
 
@@ -34,6 +35,7 @@ extern asmlinkage struct task_struct *__switch_to(struct task_struct *prev,
 #define switch_to(prev, next, last)						\
 do {										\
 	lose_fpu_inatomic(1, prev);						\
+	lose_lbt_inatomic(1, prev);						\
 	hw_breakpoint_thread_switch(next);					\
 	(last) = __switch_to(prev, next, task_thread_info(next),		\
 		 __builtin_return_address(0), __builtin_frame_address(0));	\
diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h
index 1a3354ca056e9af11e15bfe80ffce5ad97c9e640..8cb653d49a54343ebfa26814e037ddf844c98351 100644
--- a/arch/loongarch/include/asm/thread_info.h
+++ b/arch/loongarch/include/asm/thread_info.h
@@ -84,6 +84,8 @@ register unsigned long current_stack_pointer __asm__("$sp");
 #define TIF_SINGLESTEP		16	/* Single Step */
 #define TIF_LSX_CTX_LIVE	17	/* LSX context must be preserved */
 #define TIF_LASX_CTX_LIVE	18	/* LASX context must be preserved */
+#define TIF_USEDLBT		19	/* LBT was used by this task this quantum (SMP) */
+#define TIF_LBT_CTX_LIVE	20	/* LBT context must be preserved */
 
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
@@ -101,6 +103,8 @@ register unsigned long current_stack_pointer __asm__("$sp");
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_LSX_CTX_LIVE	(1<<TIF_LSX_CTX_LIVE)
 #define _TIF_LASX_CTX_LIVE	(1<<TIF_LASX_CTX_LIVE)
+#define _TIF_USEDLBT		(1<<TIF_USEDLBT)
+#define _TIF_LBT_CTX_LIVE	(1<<TIF_LBT_CTX_LIVE)
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_THREAD_INFO_H */
diff --git a/arch/loongarch/include/asm/xor.h b/arch/loongarch/include/asm/xor.h
new file mode 100644
index 0000000000000000000000000000000000000000..12467fffee46875b444b500532f98e001b59eb8b
--- /dev/null
+++ b/arch/loongarch/include/asm/xor.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ */
+#ifndef _ASM_LOONGARCH_XOR_H
+#define _ASM_LOONGARCH_XOR_H
+
+#include <asm/cpu-features.h>
+#include <asm/xor_simd.h>
+
+#ifdef CONFIG_CPU_HAS_LSX
+static struct xor_block_template xor_block_lsx = {
+	.name = "lsx",
+	.do_2 = xor_lsx_2,
+	.do_3 = xor_lsx_3,
+	.do_4 = xor_lsx_4,
+	.do_5 = xor_lsx_5,
+};
+
+#define XOR_SPEED_LSX()					\
+	do {						\
+		if (cpu_has_lsx)			\
+			xor_speed(&xor_block_lsx);	\
+	} while (0)
+#else /* CONFIG_CPU_HAS_LSX */
+#define XOR_SPEED_LSX()
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+static struct xor_block_template xor_block_lasx = {
+	.name = "lasx",
+	.do_2 = xor_lasx_2,
+	.do_3 = xor_lasx_3,
+	.do_4 = xor_lasx_4,
+	.do_5 = xor_lasx_5,
+};
+
+#define XOR_SPEED_LASX()					\
+	do {							\
+		if (cpu_has_lasx)				\
+			xor_speed(&xor_block_lasx);		\
+	} while (0)
+#else /* CONFIG_CPU_HAS_LASX */
+#define XOR_SPEED_LASX()
+#endif /* CONFIG_CPU_HAS_LASX */
+
+/*
+ * For grins, also test the generic routines.
+ *
+ * More importantly: it cannot be ruled out at this point of time, that some
+ * future (maybe reduced) models could run the vector algorithms slower than
+ * the scalar ones, maybe for errata or micro-op reasons. It may be
+ * appropriate to revisit this after one or two more uarch generations.
+ */
+#include <asm-generic/xor.h>
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES				\
+do {							\
+	xor_speed(&xor_block_8regs);			\
+	xor_speed(&xor_block_8regs_p);			\
+	xor_speed(&xor_block_32regs);			\
+	xor_speed(&xor_block_32regs_p);			\
+	XOR_SPEED_LSX();				\
+	XOR_SPEED_LASX();				\
+} while (0)
+
+#endif /* _ASM_LOONGARCH_XOR_H */
diff --git a/arch/loongarch/include/asm/xor_simd.h b/arch/loongarch/include/asm/xor_simd.h
new file mode 100644
index 0000000000000000000000000000000000000000..471b96332f38176ebc5dff2d075697e169f72f58
--- /dev/null
+++ b/arch/loongarch/include/asm/xor_simd.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ */
+#ifndef _ASM_LOONGARCH_XOR_SIMD_H
+#define _ASM_LOONGARCH_XOR_SIMD_H
+
+#ifdef CONFIG_CPU_HAS_LSX
+void xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2);
+void xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2, const unsigned long * __restrict p3);
+void xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+	       const unsigned long * __restrict p4);
+void xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
+	       const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+	       const unsigned long * __restrict p4, const unsigned long * __restrict p5);
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+void xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
+	        const unsigned long * __restrict p2);
+void xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
+	        const unsigned long * __restrict p2, const unsigned long * __restrict p3);
+void xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
+	        const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+	        const unsigned long * __restrict p4);
+void xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
+	        const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+	        const unsigned long * __restrict p4, const unsigned long * __restrict p5);
+#endif /* CONFIG_CPU_HAS_LASX */
+
+#endif /* _ASM_LOONGARCH_XOR_SIMD_H */
diff --git a/arch/loongarch/include/uapi/asm/ptrace.h b/arch/loongarch/include/uapi/asm/ptrace.h
index 06e3be52cb0427c91e03ac2fdddcb85b28ca860c..ac915f84165053964105383889e4d8f48c72f741 100644
--- a/arch/loongarch/include/uapi/asm/ptrace.h
+++ b/arch/loongarch/include/uapi/asm/ptrace.h
@@ -56,6 +56,12 @@ struct user_lasx_state {
 	uint64_t vregs[32*4];
 };
 
+struct user_lbt_state {
+	uint64_t scr[4];
+	uint32_t eflags;
+	uint32_t ftop;
+};
+
 struct user_watch_state {
 	uint64_t dbg_info;
 	struct {
diff --git a/arch/loongarch/include/uapi/asm/sigcontext.h b/arch/loongarch/include/uapi/asm/sigcontext.h
index 4cd7d16f70377586a1d209dca634367c89713ffa..6c22f616b8f15d746316a62468a9a9a22c736443 100644
--- a/arch/loongarch/include/uapi/asm/sigcontext.h
+++ b/arch/loongarch/include/uapi/asm/sigcontext.h
@@ -59,4 +59,14 @@ struct lasx_context {
 	__u32	fcsr;
 };
 
+/* LBT context */
+#define LBT_CTX_MAGIC		0x42540001
+#define LBT_CTX_ALIGN		8
+struct lbt_context {
+	__u64	regs[4];
+	__u32	eflags;
+	__u32	ftop;
+};
+
+
 #endif /* _UAPI_ASM_SIGCONTEXT_H */
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 8e279f04f9e7a8031784fc46f0a8a41148e1337b..c56ea0b7544899e2b7ccf6aee30c8ae40fe350d0 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -15,6 +15,8 @@ obj-$(CONFIG_EFI) 		+= efi.o
 
 obj-$(CONFIG_CPU_HAS_FPU)	+= fpu.o kfpu.o
 
+obj-$(CONFIG_CPU_HAS_LBT)	+= lbt.o
+
 obj-$(CONFIG_ARCH_STRICT_ALIGN)	+= unaligned.o
 
 ifdef CONFIG_FUNCTION_TRACER
@@ -32,6 +34,12 @@ ifdef CONFIG_FUNCTION_TRACER
   CFLAGS_REMOVE_rethook_trampoline.o = $(CC_FLAGS_FTRACE)
 endif
 
+KASAN_SANITIZE_efi.o := n
+KASAN_SANITIZE_cpu-probe.o := n
+KASAN_SANITIZE_traps.o := n
+KASAN_SANITIZE_smp.o := n
+KASAN_SANITIZE_vdso.o := n
+
 obj-$(CONFIG_MODULES)		+= module.o module-sections.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 
@@ -54,6 +62,7 @@ obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_regs.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 
+obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_RETHOOK)		+= rethook.o rethook_trampoline.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c
index 505e4bf59603148ba85ab979073b36933e6a9927..8da0726777edb41ea66d47f640308c18435f4551 100644
--- a/arch/loongarch/kernel/asm-offsets.c
+++ b/arch/loongarch/kernel/asm-offsets.c
@@ -118,13 +118,6 @@ void output_thread_defines(void)
 	OFFSET(THREAD_CSRECFG, task_struct,
 	       thread.csr_ecfg);
 
-	OFFSET(THREAD_SCR0, task_struct, thread.scr0);
-	OFFSET(THREAD_SCR1, task_struct, thread.scr1);
-	OFFSET(THREAD_SCR2, task_struct, thread.scr2);
-	OFFSET(THREAD_SCR3, task_struct, thread.scr3);
-
-	OFFSET(THREAD_EFLAGS, task_struct, thread.eflags);
-
 	OFFSET(THREAD_FPU, task_struct, thread.fpu);
 
 	OFFSET(THREAD_BVADDR, task_struct, \
@@ -172,6 +165,17 @@ void output_thread_fpu_defines(void)
 
 	OFFSET(THREAD_FCSR, loongarch_fpu, fcsr);
 	OFFSET(THREAD_FCC,  loongarch_fpu, fcc);
+	OFFSET(THREAD_FTOP, loongarch_fpu, ftop);
+	BLANK();
+}
+
+void output_thread_lbt_defines(void)
+{
+	OFFSET(THREAD_SCR0,  loongarch_lbt, scr0);
+	OFFSET(THREAD_SCR1,  loongarch_lbt, scr1);
+	OFFSET(THREAD_SCR2,  loongarch_lbt, scr2);
+	OFFSET(THREAD_SCR3,  loongarch_lbt, scr3);
+	OFFSET(THREAD_EFLAGS, loongarch_lbt, eflags);
 	BLANK();
 }
 
diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c
index e925579c7a71eab8822ff683dd8444f8982e42d6..55320813ee0819f0f23429361b7fc9722b710d65 100644
--- a/arch/loongarch/kernel/cpu-probe.c
+++ b/arch/loongarch/kernel/cpu-probe.c
@@ -144,6 +144,20 @@ static void cpu_probe_common(struct cpuinfo_loongarch *c)
 		c->options |= LOONGARCH_CPU_LVZ;
 		elf_hwcap |= HWCAP_LOONGARCH_LVZ;
 	}
+#ifdef CONFIG_CPU_HAS_LBT
+	if (config & CPUCFG2_X86BT) {
+		c->options |= LOONGARCH_CPU_LBT_X86;
+		elf_hwcap |= HWCAP_LOONGARCH_LBT_X86;
+	}
+	if (config & CPUCFG2_ARMBT) {
+		c->options |= LOONGARCH_CPU_LBT_ARM;
+		elf_hwcap |= HWCAP_LOONGARCH_LBT_ARM;
+	}
+	if (config & CPUCFG2_MIPSBT) {
+		c->options |= LOONGARCH_CPU_LBT_MIPS;
+		elf_hwcap |= HWCAP_LOONGARCH_LBT_MIPS;
+	}
+#endif
 
 	config = read_cpucfg(LOONGARCH_CPUCFG6);
 	if (config & CPUCFG6_PMP)
diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S
index d737e3cf42d3fd8ca882e08113d58d2887db32c1..65518bb8f47285e406b95f8ba4c63a29e2c23a4a 100644
--- a/arch/loongarch/kernel/entry.S
+++ b/arch/loongarch/kernel/entry.S
@@ -58,6 +58,11 @@ SYM_FUNC_START(handle_syscall)
 
 	SAVE_STATIC
 
+#ifdef CONFIG_KGDB
+	li.w		t1, CSR_CRMD_WE
+	csrxchg		t1, t1, LOONGARCH_CSR_CRMD
+#endif
+
 	move		u0, t0
 	li.d		tp, ~_THREAD_MASK
 	and		tp, tp, sp
diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index 501094a09f5d89da2af50522b6c0fc9a9b65ec95..d53ab10f464465e3f88910614afefce92b5af607 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -22,7 +22,7 @@
 
 	.macro	EX insn, reg, src, offs
 .ex\@:	\insn	\reg, \src, \offs
-	_asm_extable .ex\@, fault
+	_asm_extable .ex\@, .L_fpu_fault
 	.endm
 
 	.macro sc_save_fp base
@@ -138,6 +138,13 @@
 	.macro sc_save_fcsr base, tmp0
 	movfcsr2gr	\tmp0, fcsr0
 	EX	st.w	\tmp0, \base, 0
+#if defined(CONFIG_CPU_HAS_LBT)
+	/* TM bit is always 0 if LBT not supported */
+	andi		\tmp0, \tmp0, FPU_CSR_TM
+	beqz		\tmp0, 1f
+	x86clrtm
+1:
+#endif
 	.endm
 
 	.macro sc_restore_fcsr base, tmp0
@@ -309,7 +316,7 @@ EXPORT_SYMBOL(_save_fp)
  */
 SYM_FUNC_START(_restore_fp)
 	fpu_restore_double	a0 t1		# clobbers t1
-	fpu_restore_csr		a0 t1
+	fpu_restore_csr		a0 t1 t2
 	fpu_restore_cc		a0 t1 t2	# clobbers t1, t2
 	jr			ra
 SYM_FUNC_END(_restore_fp)
@@ -514,7 +521,6 @@ SYM_FUNC_START(_restore_lasx_context)
 	jr	ra
 SYM_FUNC_END(_restore_lasx_context)
 
-SYM_FUNC_START(fault)
+.L_fpu_fault:
 	li.w	a0, -EFAULT				# failure
 	jr	ra
-SYM_FUNC_END(fault)
diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S
index 5e828a8bc0a0e0b95006449d2a1ab10b14ca2d4a..53b883db0786207318b2cfc6857a05857b9dd1a9 100644
--- a/arch/loongarch/kernel/head.S
+++ b/arch/loongarch/kernel/head.S
@@ -95,12 +95,17 @@ SYM_CODE_START(kernel_entry)			# kernel entry point
 	PTR_LI		sp, (_THREAD_SIZE - PT_SIZE)
 	PTR_ADD		sp, sp, tp
 	set_saved_sp	sp, t0, t1
-#endif
 
-	/* relocate_kernel() returns the new kernel entry point */
-	jr		a0
-	ASM_BUG()
+	/* Jump to the new kernel: new_pc = current_pc + random_offset */
+	pcaddi		t0, 0
+	add.d		t0, t0, a0
+	jirl		zero, t0, 0xc
+#endif /* CONFIG_RANDOMIZE_BASE */
+
+#endif /* CONFIG_RELOCATABLE */
 
+#ifdef CONFIG_KASAN
+	bl		kasan_early_init
 #endif
 
 	bl		start_kernel
diff --git a/arch/loongarch/kernel/kfpu.c b/arch/loongarch/kernel/kfpu.c
index 5c46ae8c6cac13b433ccbdfbdacf194eba6d0d19..ec5b28e570c963482d18e50f28043b066a425ffc 100644
--- a/arch/loongarch/kernel/kfpu.c
+++ b/arch/loongarch/kernel/kfpu.c
@@ -8,19 +8,40 @@
 #include <asm/fpu.h>
 #include <asm/smp.h>
 
+static unsigned int euen_mask = CSR_EUEN_FPEN;
+
+/*
+ * The critical section between kernel_fpu_begin() and kernel_fpu_end()
+ * is non-reentrant. It is the caller's responsibility to avoid reentrance.
+ * See drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c as an example.
+ */
 static DEFINE_PER_CPU(bool, in_kernel_fpu);
+static DEFINE_PER_CPU(unsigned int, euen_current);
 
 void kernel_fpu_begin(void)
 {
+	unsigned int *euen_curr;
+
 	preempt_disable();
 
 	WARN_ON(this_cpu_read(in_kernel_fpu));
 
 	this_cpu_write(in_kernel_fpu, true);
+	euen_curr = this_cpu_ptr(&euen_current);
 
-	if (!is_fpu_owner())
-		enable_fpu();
+	*euen_curr = csr_xchg32(euen_mask, euen_mask, LOONGARCH_CSR_EUEN);
+
+#ifdef CONFIG_CPU_HAS_LASX
+	if (*euen_curr & CSR_EUEN_LASXEN)
+		_save_lasx(&current->thread.fpu);
+	else
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+	if (*euen_curr & CSR_EUEN_LSXEN)
+		_save_lsx(&current->thread.fpu);
 	else
+#endif
+	if (*euen_curr & CSR_EUEN_FPEN)
 		_save_fp(&current->thread.fpu);
 
 	write_fcsr(LOONGARCH_FCSR0, 0);
@@ -29,15 +50,41 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin);
 
 void kernel_fpu_end(void)
 {
+	unsigned int *euen_curr;
+
 	WARN_ON(!this_cpu_read(in_kernel_fpu));
 
-	if (!is_fpu_owner())
-		disable_fpu();
+	euen_curr = this_cpu_ptr(&euen_current);
+
+#ifdef CONFIG_CPU_HAS_LASX
+	if (*euen_curr & CSR_EUEN_LASXEN)
+		_restore_lasx(&current->thread.fpu);
 	else
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+	if (*euen_curr & CSR_EUEN_LSXEN)
+		_restore_lsx(&current->thread.fpu);
+	else
+#endif
+	if (*euen_curr & CSR_EUEN_FPEN)
 		_restore_fp(&current->thread.fpu);
 
+	*euen_curr = csr_xchg32(*euen_curr, euen_mask, LOONGARCH_CSR_EUEN);
+
 	this_cpu_write(in_kernel_fpu, false);
 
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
+static int __init init_euen_mask(void)
+{
+	if (cpu_has_lsx)
+		euen_mask |= CSR_EUEN_LSXEN;
+
+	if (cpu_has_lasx)
+		euen_mask |= CSR_EUEN_LASXEN;
+
+	return 0;
+}
+arch_initcall(init_euen_mask);
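For reference, a typical caller of the rewritten helpers above brackets its SIMD work with kernel_fpu_begin()/kernel_fpu_end(); the begin path now saves whatever FP/LSX/LASX state the interrupted context had live, and the end path restores it along with the previous EUEN value. A minimal usage sketch (the function name and header are assumptions, only the begin/end calls come from this patch):

/* Sketch only: protect a kernel-mode SIMD section with the helpers above. */
#include <asm/fpu.h>	/* assumed declaration site of kernel_fpu_begin()/end() */

static void checksum_with_simd(void *dst, const void *src, unsigned long len)
{
	kernel_fpu_begin();	/* saves live FP/LSX/LASX state and enables the FPU */
	/* ... vector loads/stores or other FPU-using code goes here ... */
	kernel_fpu_end();	/* restores the saved state and the previous EUEN */
}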
diff --git a/arch/loongarch/kernel/kgdb.c b/arch/loongarch/kernel/kgdb.c
new file mode 100644
index 0000000000000000000000000000000000000000..445c452d72a79cac7f2527319301b7f430258c09
--- /dev/null
+++ b/arch/loongarch/kernel/kgdb.c
@@ -0,0 +1,727 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * LoongArch KGDB support
+ *
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ */
+
+#include <linux/hw_breakpoint.h>
+#include <linux/kdebug.h>
+#include <linux/kgdb.h>
+#include <linux/processor.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+
+#include <asm/cacheflush.h>
+#include <asm/fpu.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/inst.h>
+#include <asm/irq_regs.h>
+#include <asm/ptrace.h>
+#include <asm/sigcontext.h>
+
+int kgdb_watch_activated;
+static unsigned int stepped_opcode;
+static unsigned long stepped_address;
+
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
+	{ "r0", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[0]) },
+	{ "r1", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[1]) },
+	{ "r2", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[2]) },
+	{ "r3", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[3]) },
+	{ "r4", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[4]) },
+	{ "r5", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[5]) },
+	{ "r6", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[6]) },
+	{ "r7", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[7]) },
+	{ "r8", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[8]) },
+	{ "r9", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[9]) },
+	{ "r10", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[10]) },
+	{ "r11", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[11]) },
+	{ "r12", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[12]) },
+	{ "r13", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[13]) },
+	{ "r14", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[14]) },
+	{ "r15", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[15]) },
+	{ "r16", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[16]) },
+	{ "r17", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[17]) },
+	{ "r18", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[18]) },
+	{ "r19", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[19]) },
+	{ "r20", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[20]) },
+	{ "r21", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[21]) },
+	{ "r22", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[22]) },
+	{ "r23", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[23]) },
+	{ "r24", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[24]) },
+	{ "r25", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[25]) },
+	{ "r26", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[26]) },
+	{ "r27", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[27]) },
+	{ "r28", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[28]) },
+	{ "r29", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[29]) },
+	{ "r30", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[30]) },
+	{ "r31", GDB_SIZEOF_REG, offsetof(struct pt_regs, regs[31]) },
+	{ "orig_a0", GDB_SIZEOF_REG, offsetof(struct pt_regs, orig_a0) },
+	{ "pc", GDB_SIZEOF_REG, offsetof(struct pt_regs, csr_era) },
+	{ "badv", GDB_SIZEOF_REG, offsetof(struct pt_regs, csr_badvaddr) },
+	{ "f0", GDB_SIZEOF_REG, 0 },
+	{ "f1", GDB_SIZEOF_REG, 1 },
+	{ "f2", GDB_SIZEOF_REG, 2 },
+	{ "f3", GDB_SIZEOF_REG, 3 },
+	{ "f4", GDB_SIZEOF_REG, 4 },
+	{ "f5", GDB_SIZEOF_REG, 5 },
+	{ "f6", GDB_SIZEOF_REG, 6 },
+	{ "f7", GDB_SIZEOF_REG, 7 },
+	{ "f8", GDB_SIZEOF_REG, 8 },
+	{ "f9", GDB_SIZEOF_REG, 9 },
+	{ "f10", GDB_SIZEOF_REG, 10 },
+	{ "f11", GDB_SIZEOF_REG, 11 },
+	{ "f12", GDB_SIZEOF_REG, 12 },
+	{ "f13", GDB_SIZEOF_REG, 13 },
+	{ "f14", GDB_SIZEOF_REG, 14 },
+	{ "f15", GDB_SIZEOF_REG, 15 },
+	{ "f16", GDB_SIZEOF_REG, 16 },
+	{ "f17", GDB_SIZEOF_REG, 17 },
+	{ "f18", GDB_SIZEOF_REG, 18 },
+	{ "f19", GDB_SIZEOF_REG, 19 },
+	{ "f20", GDB_SIZEOF_REG, 20 },
+	{ "f21", GDB_SIZEOF_REG, 21 },
+	{ "f22", GDB_SIZEOF_REG, 22 },
+	{ "f23", GDB_SIZEOF_REG, 23 },
+	{ "f24", GDB_SIZEOF_REG, 24 },
+	{ "f25", GDB_SIZEOF_REG, 25 },
+	{ "f26", GDB_SIZEOF_REG, 26 },
+	{ "f27", GDB_SIZEOF_REG, 27 },
+	{ "f28", GDB_SIZEOF_REG, 28 },
+	{ "f29", GDB_SIZEOF_REG, 29 },
+	{ "f30", GDB_SIZEOF_REG, 30 },
+	{ "f31", GDB_SIZEOF_REG, 31 },
+	{ "fcc0", 1, 0 },
+	{ "fcc1", 1, 1 },
+	{ "fcc2", 1, 2 },
+	{ "fcc3", 1, 3 },
+	{ "fcc4", 1, 4 },
+	{ "fcc5", 1, 5 },
+	{ "fcc6", 1, 6 },
+	{ "fcc7", 1, 7 },
+	{ "fcsr", 4, 0 },
+};
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	int reg_offset, reg_size;
+
+	if (regno < 0 || regno >= DBG_MAX_REG_NUM)
+		return NULL;
+
+	reg_offset = dbg_reg_def[regno].offset;
+	reg_size = dbg_reg_def[regno].size;
+
+	if (reg_offset == -1)
+		goto out;
+
+	/* Handle general-purpose/orig_a0/pc/badv registers */
+	if (regno <= DBG_PT_REGS_END) {
+		memcpy(mem, (void *)regs + reg_offset, reg_size);
+		goto out;
+	}
+
+	if (!(regs->csr_euen & CSR_EUEN_FPEN))
+		goto out;
+
+	save_fp(current);
+
+	/* Handle FP registers */
+	switch (regno) {
+	case DBG_FCSR:				/* Process the fcsr */
+		memcpy(mem, (void *)&current->thread.fpu.fcsr, reg_size);
+		break;
+	case DBG_FCC_BASE ... DBG_FCC_END:	/* Process the fcc */
+		memcpy(mem, (void *)&current->thread.fpu.fcc + reg_offset, reg_size);
+		break;
+	case DBG_FPR_BASE ... DBG_FPR_END:	/* Process the fpr */
+		memcpy(mem, (void *)&current->thread.fpu.fpr[reg_offset], reg_size);
+		break;
+	default:
+		break;
+	}
+
+out:
+	return dbg_reg_def[regno].name;
+}
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	int reg_offset, reg_size;
+
+	if (regno < 0 || regno >= DBG_MAX_REG_NUM)
+		return -EINVAL;
+
+	reg_offset = dbg_reg_def[regno].offset;
+	reg_size = dbg_reg_def[regno].size;
+
+	if (reg_offset == -1)
+		return 0;
+
+	/* Handle general-purpose/orig_a0/pc/badv registers */
+	if (regno <= DBG_PT_REGS_END) {
+		memcpy((void *)regs + reg_offset, mem, reg_size);
+		return 0;
+	}
+
+	if (!(regs->csr_euen & CSR_EUEN_FPEN))
+		return 0;
+
+	/* Handle FP registers */
+	switch (regno) {
+	case DBG_FCSR:				/* Process the fcsr */
+		memcpy((void *)&current->thread.fpu.fcsr, mem, reg_size);
+		break;
+	case DBG_FCC_BASE ... DBG_FCC_END:	/* Process the fcc */
+		memcpy((void *)&current->thread.fpu.fcc + reg_offset, mem, reg_size);
+		break;
+	case DBG_FPR_BASE ... DBG_FPR_END:	/* Process the fpr */
+		memcpy((void *)&current->thread.fpu.fpr[reg_offset], mem, reg_size);
+		break;
+	default:
+		break;
+	}
+
+	restore_fp(current);
+
+	return 0;
+}
+
+/*
+ * Similar to regs_to_gdb_regs() except that the process is sleeping, so
+ * we may not be able to get all the information.
+ */
+void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
+{
+	/* Initialize to zero */
+	memset((char *)gdb_regs, 0, NUMREGBYTES);
+
+	gdb_regs[DBG_LOONGARCH_RA] = p->thread.reg01;
+	gdb_regs[DBG_LOONGARCH_TP] = (long)p;
+	gdb_regs[DBG_LOONGARCH_SP] = p->thread.reg03;
+
+	/* S0 - S8 */
+	gdb_regs[DBG_LOONGARCH_S0] = p->thread.reg23;
+	gdb_regs[DBG_LOONGARCH_S1] = p->thread.reg24;
+	gdb_regs[DBG_LOONGARCH_S2] = p->thread.reg25;
+	gdb_regs[DBG_LOONGARCH_S3] = p->thread.reg26;
+	gdb_regs[DBG_LOONGARCH_S4] = p->thread.reg27;
+	gdb_regs[DBG_LOONGARCH_S5] = p->thread.reg28;
+	gdb_regs[DBG_LOONGARCH_S6] = p->thread.reg29;
+	gdb_regs[DBG_LOONGARCH_S7] = p->thread.reg30;
+	gdb_regs[DBG_LOONGARCH_S8] = p->thread.reg31;
+
+	/*
+	 * The PC uses the return address (RA), i.e. the point just after the return from __switch_to()
+	 */
+	gdb_regs[DBG_LOONGARCH_PC] = p->thread.reg01;
+}
+
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->csr_era = pc;
+}
+
+void arch_kgdb_breakpoint(void)
+{
+	__asm__ __volatile__ (			\
+		".globl kgdb_breakinst\n\t"	\
+		"nop\n"				\
+		"kgdb_breakinst:\tbreak 2\n\t"); /* BRK_KDB = 2 */
+}
+
+/*
+ * Called via the die notifier chain before the kernel dies. If KGDB is
+ * enabled, try to fall into the debugger.
+ */
+static int kgdb_loongarch_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+	struct die_args *args = (struct die_args *)ptr;
+	struct pt_regs *regs = args->regs;
+
+	/* Userspace events, ignore. */
+	if (user_mode(regs))
+		return NOTIFY_DONE;
+
+	if (!kgdb_io_module_registered)
+		return NOTIFY_DONE;
+
+	if (atomic_read(&kgdb_active) != -1)
+		kgdb_nmicallback(smp_processor_id(), regs);
+
+	if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
+		return NOTIFY_DONE;
+
+	if (atomic_read(&kgdb_setting_breakpoint))
+		if (regs->csr_era == (unsigned long)&kgdb_breakinst)
+			regs->csr_era += LOONGARCH_INSN_SIZE;
+
+	return NOTIFY_STOP;
+}
+
+bool kgdb_breakpoint_handler(struct pt_regs *regs)
+{
+	struct die_args args = {
+		.regs	= regs,
+		.str	= "Break",
+		.err	= BRK_KDB,
+		.trapnr = read_csr_excode(),
+		.signr	= SIGTRAP,
+
+	};
+
+	return (kgdb_loongarch_notify(NULL, DIE_TRAP, &args) == NOTIFY_STOP) ? true : false;
+}
+
+static struct notifier_block kgdb_notifier = {
+	.notifier_call = kgdb_loongarch_notify,
+};
+
+static inline void kgdb_arch_update_addr(struct pt_regs *regs,
+					 char *remcom_in_buffer)
+{
+	unsigned long addr;
+	char *ptr;
+
+	ptr = &remcom_in_buffer[1];
+	if (kgdb_hex2long(&ptr, &addr))
+		regs->csr_era = addr;
+}
+
+/* Calculate the address of the instruction that executes after a single step */
+static int get_step_address(struct pt_regs *regs, unsigned long *next_addr)
+{
+	char cj_val;
+	unsigned int si, si_l, si_h, rd, rj, cj;
+	unsigned long pc = instruction_pointer(regs);
+	union loongarch_instruction *ip = (union loongarch_instruction *)pc;
+
+	if (pc & 3) {
+		pr_warn("%s: invalid pc 0x%lx\n", __func__, pc);
+		return -EINVAL;
+	}
+
+	*next_addr = pc + LOONGARCH_INSN_SIZE;
+
+	si_h = ip->reg0i26_format.immediate_h;
+	si_l = ip->reg0i26_format.immediate_l;
+	switch (ip->reg0i26_format.opcode) {
+	case b_op:
+		*next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 27);
+		return 0;
+	case bl_op:
+		*next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 27);
+		regs->regs[1] = pc + LOONGARCH_INSN_SIZE;
+		return 0;
+	}
+
+	rj = ip->reg1i21_format.rj;
+	cj = (rj & 0x07) + DBG_FCC_BASE;
+	si_l = ip->reg1i21_format.immediate_l;
+	si_h = ip->reg1i21_format.immediate_h;
+	dbg_get_reg(cj, &cj_val, regs);
+	switch (ip->reg1i21_format.opcode) {
+	case beqz_op:
+		if (regs->regs[rj] == 0)
+			*next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 22);
+		return 0;
+	case bnez_op:
+		if (regs->regs[rj] != 0)
+			*next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 22);
+		return 0;
+	case bceqz_op: /* bceqz_op = bcnez_op */
+		if (((rj & 0x18) == 0x00) && !cj_val) /* bceqz */
+			*next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 22);
+		if (((rj & 0x18) == 0x08) && cj_val) /* bcnez */
+			*next_addr = pc + sign_extend64((si_h << 16 | si_l) << 2, 22);
+		return 0;
+	}
+
+	rj = ip->reg2i16_format.rj;
+	rd = ip->reg2i16_format.rd;
+	si = ip->reg2i16_format.immediate;
+	switch (ip->reg2i16_format.opcode) {
+	case beq_op:
+		if (regs->regs[rj] == regs->regs[rd])
+			*next_addr = pc + sign_extend64(si << 2, 17);
+		return 0;
+	case bne_op:
+		if (regs->regs[rj] != regs->regs[rd])
+			*next_addr = pc + sign_extend64(si << 2, 17);
+		return 0;
+	case blt_op:
+		if ((long)regs->regs[rj] < (long)regs->regs[rd])
+			*next_addr = pc + sign_extend64(si << 2, 17);
+		return 0;
+	case bge_op:
+		if ((long)regs->regs[rj] >= (long)regs->regs[rd])
+			*next_addr = pc + sign_extend64(si << 2, 17);
+		return 0;
+	case bltu_op:
+		if (regs->regs[rj] < regs->regs[rd])
+			*next_addr = pc + sign_extend64(si << 2, 17);
+		return 0;
+	case bgeu_op:
+		if (regs->regs[rj] >= regs->regs[rd])
+			*next_addr = pc + sign_extend64(si << 2, 17);
+		return 0;
+	case jirl_op:
+		regs->regs[rd] = pc + LOONGARCH_INSN_SIZE;
+		*next_addr = regs->regs[rj] + sign_extend64(si << 2, 17);
+		return 0;
+	}
+
+	return 0;
+}
+
+static int do_single_step(struct pt_regs *regs)
+{
+	int error = 0;
+	unsigned long addr = 0; /* Address that the stepped instruction will transfer control to */
+
+	error = get_step_address(regs, &addr);
+	if (error)
+		return error;
+
+	/* Store the opcode in the stepped address */
+	error = get_kernel_nofault(stepped_opcode, (void *)addr);
+	if (error)
+		return error;
+
+	stepped_address = addr;
+
+	/* Replace the opcode with the break instruction */
+	error = copy_to_kernel_nofault((void *)stepped_address,
+				       arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+
+	if (error) {
+		stepped_opcode = 0;
+		stepped_address = 0;
+	} else {
+		kgdb_single_step = 1;
+		atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id());
+	}
+
+	return error;
+}
+
+/* Undo a single step */
+static void undo_single_step(struct pt_regs *regs)
+{
+	if (stepped_opcode) {
+		copy_to_kernel_nofault((void *)stepped_address,
+				       (void *)&stepped_opcode, BREAK_INSTR_SIZE);
+		flush_icache_range(stepped_address, stepped_address + BREAK_INSTR_SIZE);
+	}
+
+	stepped_opcode = 0;
+	stepped_address = 0;
+	kgdb_single_step = 0;
+	atomic_set(&kgdb_cpu_doing_single_step, -1);
+}
+
+int kgdb_arch_handle_exception(int vector, int signo, int err_code,
+			       char *remcom_in_buffer, char *remcom_out_buffer,
+			       struct pt_regs *regs)
+{
+	int ret = 0;
+
+	undo_single_step(regs);
+	regs->csr_prmd |= CSR_PRMD_PWE;
+
+	switch (remcom_in_buffer[0]) {
+	case 'D':
+	case 'k':
+		regs->csr_prmd &= ~CSR_PRMD_PWE;
+		fallthrough;
+	case 'c':
+		kgdb_arch_update_addr(regs, remcom_in_buffer);
+		break;
+	case 's':
+		kgdb_arch_update_addr(regs, remcom_in_buffer);
+		ret = do_single_step(regs);
+		break;
+	default:
+		ret = -1;
+	}
+
+	return ret;
+}
+
+static struct hw_breakpoint {
+	unsigned int		enabled;
+	unsigned long		addr;
+	int			len;
+	int			type;
+	struct perf_event	* __percpu *pev;
+} breakinfo[LOONGARCH_MAX_BRP];
+
+static int hw_break_reserve_slot(int breakno)
+{
+	int cpu, cnt = 0;
+	struct perf_event **pevent;
+
+	for_each_online_cpu(cpu) {
+		cnt++;
+		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+		if (dbg_reserve_bp_slot(*pevent))
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	for_each_online_cpu(cpu) {
+		cnt--;
+		if (!cnt)
+			break;
+		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+		dbg_release_bp_slot(*pevent);
+	}
+
+	return -1;
+}
+
+static int hw_break_release_slot(int breakno)
+{
+	int cpu;
+	struct perf_event **pevent;
+
+	if (dbg_is_early)
+		return 0;
+
+	for_each_online_cpu(cpu) {
+		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+		if (dbg_release_bp_slot(*pevent))
+			/*
+			 * The debugger is responsible for handling the retry on
+			 * remove failure.
+			 */
+			return -1;
+	}
+
+	return 0;
+}
+
+static int kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
+{
+	int i;
+
+	for (i = 0; i < LOONGARCH_MAX_BRP; i++)
+		if (!breakinfo[i].enabled)
+			break;
+
+	if (i == LOONGARCH_MAX_BRP)
+		return -1;
+
+	switch (bptype) {
+	case BP_HARDWARE_BREAKPOINT:
+		breakinfo[i].type = HW_BREAKPOINT_X;
+		break;
+	case BP_READ_WATCHPOINT:
+		breakinfo[i].type = HW_BREAKPOINT_R;
+		break;
+	case BP_WRITE_WATCHPOINT:
+		breakinfo[i].type = HW_BREAKPOINT_W;
+		break;
+	case BP_ACCESS_WATCHPOINT:
+		breakinfo[i].type = HW_BREAKPOINT_RW;
+		break;
+	default:
+		return -1;
+	}
+
+	switch (len) {
+	case 1:
+		breakinfo[i].len = HW_BREAKPOINT_LEN_1;
+		break;
+	case 2:
+		breakinfo[i].len = HW_BREAKPOINT_LEN_2;
+		break;
+	case 4:
+		breakinfo[i].len = HW_BREAKPOINT_LEN_4;
+		break;
+	case 8:
+		breakinfo[i].len = HW_BREAKPOINT_LEN_8;
+		break;
+	default:
+		return -1;
+	}
+
+	breakinfo[i].addr = addr;
+	if (hw_break_reserve_slot(i)) {
+		breakinfo[i].addr = 0;
+		return -1;
+	}
+	breakinfo[i].enabled = 1;
+
+	return 0;
+}
+
+static int kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
+{
+	int i;
+
+	for (i = 0; i < LOONGARCH_MAX_BRP; i++)
+		if (breakinfo[i].addr == addr && breakinfo[i].enabled)
+			break;
+
+	if (i == LOONGARCH_MAX_BRP)
+		return -1;
+
+	if (hw_break_release_slot(i)) {
+		pr_err("Cannot remove hw breakpoint at %lx\n", addr);
+		return -1;
+	}
+	breakinfo[i].enabled = 0;
+
+	return 0;
+}
+
+static void kgdb_disable_hw_break(struct pt_regs *regs)
+{
+	int i;
+	int cpu = raw_smp_processor_id();
+	struct perf_event *bp;
+
+	for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
+		if (!breakinfo[i].enabled)
+			continue;
+
+		bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+		if (bp->attr.disabled == 1)
+			continue;
+
+		arch_uninstall_hw_breakpoint(bp);
+		bp->attr.disabled = 1;
+	}
+
+	/* Disable hardware debugging while we are in kgdb */
+	csr_xchg32(0, CSR_CRMD_WE, LOONGARCH_CSR_CRMD);
+}
+
+static void kgdb_remove_all_hw_break(void)
+{
+	int i;
+	int cpu = raw_smp_processor_id();
+	struct perf_event *bp;
+
+	for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
+		if (!breakinfo[i].enabled)
+			continue;
+
+		bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+		if (!bp->attr.disabled) {
+			arch_uninstall_hw_breakpoint(bp);
+			bp->attr.disabled = 1;
+			continue;
+		}
+
+		if (hw_break_release_slot(i))
+			pr_err("KGDB: hw bpt remove failed %lx\n", breakinfo[i].addr);
+		breakinfo[i].enabled = 0;
+	}
+
+	csr_xchg32(0, CSR_CRMD_WE, LOONGARCH_CSR_CRMD);
+	kgdb_watch_activated = 0;
+}
+
+static void kgdb_correct_hw_break(void)
+{
+	int i, activated = 0;
+
+	for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
+		struct perf_event *bp;
+		int val;
+		int cpu = raw_smp_processor_id();
+
+		if (!breakinfo[i].enabled)
+			continue;
+
+		bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+		if (bp->attr.disabled != 1)
+			continue;
+
+		bp->attr.bp_addr = breakinfo[i].addr;
+		bp->attr.bp_len = breakinfo[i].len;
+		bp->attr.bp_type = breakinfo[i].type;
+
+		val = hw_breakpoint_arch_parse(bp, &bp->attr, counter_arch_bp(bp));
+		if (val)
+			return;
+
+		val = arch_install_hw_breakpoint(bp);
+		if (!val)
+			bp->attr.disabled = 0;
+		activated = 1;
+	}
+
+	csr_xchg32(activated ? CSR_CRMD_WE : 0, CSR_CRMD_WE, LOONGARCH_CSR_CRMD);
+	kgdb_watch_activated = activated;
+}
+
+const struct kgdb_arch arch_kgdb_ops = {
+	.gdb_bpt_instr		= {0x02, 0x00, break_op >> 1, 0x00}, /* BRK_KDB = 2 */
+	.flags			= KGDB_HW_BREAKPOINT,
+	.set_hw_breakpoint	= kgdb_set_hw_break,
+	.remove_hw_breakpoint	= kgdb_remove_hw_break,
+	.disable_hw_break	= kgdb_disable_hw_break,
+	.remove_all_hw_break	= kgdb_remove_all_hw_break,
+	.correct_hw_break	= kgdb_correct_hw_break,
+};
+
+int kgdb_arch_init(void)
+{
+	return register_die_notifier(&kgdb_notifier);
+}
+
+void kgdb_arch_late(void)
+{
+	int i, cpu;
+	struct perf_event_attr attr;
+	struct perf_event **pevent;
+
+	hw_breakpoint_init(&attr);
+
+	attr.bp_addr = (unsigned long)kgdb_arch_init;
+	attr.bp_len = HW_BREAKPOINT_LEN_4;
+	attr.bp_type = HW_BREAKPOINT_W;
+	attr.disabled = 1;
+
+	for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
+		if (breakinfo[i].pev)
+			continue;
+
+		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
+		if (IS_ERR((void * __force)breakinfo[i].pev)) {
+			pr_err("kgdb: Could not allocate hw breakpoints.\n");
+			breakinfo[i].pev = NULL;
+			return;
+		}
+
+		for_each_online_cpu(cpu) {
+			pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
+			if (pevent[0]->destroy) {
+				pevent[0]->destroy = NULL;
+				release_bp_slot(*pevent);
+			}
+		}
+	}
+}
+
+void kgdb_arch_exit(void)
+{
+	int i;
+
+	for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
+		if (breakinfo[i].pev) {
+			unregister_wide_hw_breakpoint(breakinfo[i].pev);
+			breakinfo[i].pev = NULL;
+		}
+	}
+
+	unregister_die_notifier(&kgdb_notifier);
+}
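For reference, a stand-alone sketch of the offset reconstruction that get_step_address() performs for the B/BL (reg0i26) encodings; the local sign_extend64() mirrors the kernel helper of the same name, and the input values are made up for illustration.

/* Illustrative user-space sketch -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

static int64_t sign_extend64(uint64_t value, int index)
{
	int shift = 63 - index;	/* the sign bit sits at bit 'index' */

	return ((int64_t)(value << shift)) >> shift;
}

int main(void)
{
	uint64_t pc   = 0x9000000000a0c000ULL;	/* hypothetical PC */
	uint32_t si_h = 0x3ff;			/* upper 10 bits of the 26-bit offset */
	uint32_t si_l = 0xfffc;			/* lower 16 bits of the 26-bit offset */

	/* Same expression as the b_op/bl_op cases: the offset is in 4-byte units */
	uint64_t next = pc + sign_extend64(((uint64_t)si_h << 16 | si_l) << 2, 27);

	printf("next pc: 0x%llx\n", (unsigned long long)next);	/* pc - 16 here */
	return 0;
}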
diff --git a/arch/loongarch/kernel/lbt.S b/arch/loongarch/kernel/lbt.S
new file mode 100644
index 0000000000000000000000000000000000000000..9c75120a26d836d75d3bb08c2b5a0022a8c5d58d
--- /dev/null
+++ b/arch/loongarch/kernel/lbt.S
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Author: Qi Hu <huqi@loongson.cn>
+ *         Huacai Chen <chenhuacai@loongson.cn>
+ *
+ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
+ */
+#include <asm/asm.h>
+#include <asm/asmmacro.h>
+#include <asm/asm-extable.h>
+#include <asm/asm-offsets.h>
+#include <asm/errno.h>
+#include <asm/regdef.h>
+
+#define SCR_REG_WIDTH 8
+
+	.macro	EX insn, reg, src, offs
+.ex\@:	\insn	\reg, \src, \offs
+	_asm_extable .ex\@, .L_lbt_fault
+	.endm
+
+/*
+ * Save a thread's lbt context.
+ */
+SYM_FUNC_START(_save_lbt)
+	movscr2gr	t1, $scr0		# save scr
+	stptr.d		t1, a0, THREAD_SCR0
+	movscr2gr	t1, $scr1
+	stptr.d		t1, a0, THREAD_SCR1
+	movscr2gr	t1, $scr2
+	stptr.d		t1, a0, THREAD_SCR2
+	movscr2gr	t1, $scr3
+	stptr.d		t1, a0, THREAD_SCR3
+
+	x86mfflag	t1, 0x3f		# save eflags
+	stptr.d		t1, a0, THREAD_EFLAGS
+	jr		ra
+SYM_FUNC_END(_save_lbt)
+EXPORT_SYMBOL(_save_lbt)
+
+/*
+ * Restore a thread's lbt context.
+ */
+SYM_FUNC_START(_restore_lbt)
+	ldptr.d		t1, a0, THREAD_SCR0	# restore scr
+	movgr2scr	$scr0, t1
+	ldptr.d		t1, a0, THREAD_SCR1
+	movgr2scr	$scr1, t1
+	ldptr.d		t1, a0, THREAD_SCR2
+	movgr2scr	$scr2, t1
+	ldptr.d		t1, a0, THREAD_SCR3
+	movgr2scr	$scr3, t1
+
+	ldptr.d		t1, a0, THREAD_EFLAGS	# restore eflags
+	x86mtflag	t1, 0x3f
+	jr		ra
+SYM_FUNC_END(_restore_lbt)
+EXPORT_SYMBOL(_restore_lbt)
+
+/*
+ * Load scr/eflags with zero.
+ */
+SYM_FUNC_START(_init_lbt)
+	movgr2scr	$scr0, zero
+	movgr2scr	$scr1, zero
+	movgr2scr	$scr2, zero
+	movgr2scr	$scr3, zero
+
+	x86mtflag	zero, 0x3f
+	jr		ra
+SYM_FUNC_END(_init_lbt)
+
+/*
+ * a0: scr
+ * a1: eflag
+ */
+SYM_FUNC_START(_save_lbt_context)
+	movscr2gr	t1, $scr0		# save scr
+	EX	st.d	t1, a0, (0 * SCR_REG_WIDTH)
+	movscr2gr	t1, $scr1
+	EX	st.d	t1, a0, (1 * SCR_REG_WIDTH)
+	movscr2gr	t1, $scr2
+	EX	st.d	t1, a0, (2 * SCR_REG_WIDTH)
+	movscr2gr	t1, $scr3
+	EX	st.d	t1, a0, (3 * SCR_REG_WIDTH)
+
+	x86mfflag	t1, 0x3f		# save eflags
+	EX 	st.w	t1, a1, 0
+	li.w		a0, 0			# success
+	jr		ra
+SYM_FUNC_END(_save_lbt_context)
+
+/*
+ * a0: scr
+ * a1: eflag
+ */
+SYM_FUNC_START(_restore_lbt_context)
+	EX	ld.d	t1, a0, (0 * SCR_REG_WIDTH)	# restore scr
+	movgr2scr	$scr0, t1
+	EX	ld.d	t1, a0, (1 * SCR_REG_WIDTH)
+	movgr2scr	$scr1, t1
+	EX	ld.d	t1, a0, (2 * SCR_REG_WIDTH)
+	movgr2scr	$scr2, t1
+	EX	ld.d	t1, a0, (3 * SCR_REG_WIDTH)
+	movgr2scr	$scr3, t1
+
+	EX 	ld.w	t1, a1, 0			# restore eflags
+	x86mtflag	t1, 0x3f
+	li.w		a0, 0			# success
+	jr		ra
+SYM_FUNC_END(_restore_lbt_context)
+
+/*
+ * a0: ftop
+ */
+SYM_FUNC_START(_save_ftop_context)
+	x86mftop	t1
+	st.w		t1, a0, 0
+	li.w		a0, 0			# success
+	jr		ra
+SYM_FUNC_END(_save_ftop_context)
+
+/*
+ * a0: ftop
+ */
+SYM_FUNC_START(_restore_ftop_context)
+	ld.w		t1, a0, 0
+	andi		t1, t1, 0x7
+	la.pcrel	a0, 1f
+	alsl.d		a0, t1, a0, 3
+	jr		a0
+1:
+	x86mttop	0
+	b	2f
+	x86mttop	1
+	b	2f
+	x86mttop	2
+	b	2f
+	x86mttop	3
+	b	2f
+	x86mttop	4
+	b	2f
+	x86mttop	5
+	b	2f
+	x86mttop	6
+	b	2f
+	x86mttop	7
+2:
+	li.w		a0, 0			# success
+	jr		ra
+SYM_FUNC_END(_restore_ftop_context)
+
+.L_lbt_fault:
+	li.w		a0, -EFAULT		# failure
+	jr		ra
diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index 708665895b47d314df2f13c13db75c021f7b198a..c7d33c489e048ccc0c5ec41b164b6b70c685e6d1 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -67,39 +67,7 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
 
 void __init pcpu_populate_pte(unsigned long addr)
 {
-	pgd_t *pgd = pgd_offset_k(addr);
-	p4d_t *p4d = p4d_offset(pgd, addr);
-	pud_t *pud;
-	pmd_t *pmd;
-
-	if (p4d_none(*p4d)) {
-		pud_t *new;
-
-		new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		pgd_populate(&init_mm, pgd, new);
-#ifndef __PAGETABLE_PUD_FOLDED
-		pud_init(new);
-#endif
-	}
-
-	pud = pud_offset(p4d, addr);
-	if (pud_none(*pud)) {
-		pmd_t *new;
-
-		new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		pud_populate(&init_mm, pud, new);
-#ifndef __PAGETABLE_PMD_FOLDED
-		pmd_init(new);
-#endif
-	}
-
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd)) {
-		pte_t *new;
-
-		new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-		pmd_populate_kernel(&init_mm, pmd, new);
-	}
+	populate_kernel_pte(addr);
 }
 
 void __init setup_per_cpu_areas(void)
@@ -470,7 +438,6 @@ void __init mem_init(void)
 {
 	high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT);
 	memblock_free_all();
-	setup_zero_pages();	/* This comes from node 0 */
 }
 
 int pcibus_to_node(struct pci_bus *bus)
diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c
index ba457e43f5be535012431151f3269b7b0b36ce0c..3cb082e0c99298c44ebbf062a0e6d42be7073dc6 100644
--- a/arch/loongarch/kernel/process.c
+++ b/arch/loongarch/kernel/process.c
@@ -38,6 +38,7 @@
 #include <asm/cpu.h>
 #include <asm/elf.h>
 #include <asm/fpu.h>
+#include <asm/lbt.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/irq_regs.h>
@@ -82,9 +83,11 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp)
 	euen = regs->csr_euen & ~(CSR_EUEN_FPEN);
 	regs->csr_euen = euen;
 	lose_fpu(0);
+	lose_lbt(0);
 
 	clear_thread_flag(TIF_LSX_CTX_LIVE);
 	clear_thread_flag(TIF_LASX_CTX_LIVE);
+	clear_thread_flag(TIF_LBT_CTX_LIVE);
 	clear_used_math();
 	regs->csr_era = pc;
 	regs->regs[3] = sp;
@@ -121,10 +124,14 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 
 	preempt_enable();
 
-	if (used_math())
-		memcpy(dst, src, sizeof(struct task_struct));
-	else
+	if (!used_math())
 		memcpy(dst, src, offsetof(struct task_struct, thread.fpu.fpr));
+	else
+		memcpy(dst, src, offsetof(struct task_struct, thread.lbt.scr0));
+
+#ifdef CONFIG_CPU_HAS_LBT
+	memcpy(&dst->thread.lbt, &src->thread.lbt, sizeof(struct loongarch_lbt));
+#endif
 
 	return 0;
 }
@@ -189,8 +196,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	ptrace_hw_copy_thread(p);
 	clear_tsk_thread_flag(p, TIF_USEDFPU);
 	clear_tsk_thread_flag(p, TIF_USEDSIMD);
+	clear_tsk_thread_flag(p, TIF_USEDLBT);
 	clear_tsk_thread_flag(p, TIF_LSX_CTX_LIVE);
 	clear_tsk_thread_flag(p, TIF_LASX_CTX_LIVE);
+	clear_tsk_thread_flag(p, TIF_LBT_CTX_LIVE);
 
 	return 0;
 }
diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c
index f72adbf530c6465a15d5f53b1b8e311dc877280b..c114c5ef13325af024be5772b396690bbab7cbc4 100644
--- a/arch/loongarch/kernel/ptrace.c
+++ b/arch/loongarch/kernel/ptrace.c
@@ -38,6 +38,7 @@
 #include <asm/cpu.h>
 #include <asm/cpu-info.h>
 #include <asm/fpu.h>
+#include <asm/lbt.h>
 #include <asm/loongarch.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -338,6 +339,46 @@ static int simd_set(struct task_struct *target,
 
 #endif /* CONFIG_CPU_HAS_LSX */
 
+#ifdef CONFIG_CPU_HAS_LBT
+static int lbt_get(struct task_struct *target,
+		   const struct user_regset *regset,
+		   struct membuf to)
+{
+	int r;
+
+	r = membuf_write(&to, &target->thread.lbt.scr0, sizeof(target->thread.lbt.scr0));
+	r = membuf_write(&to, &target->thread.lbt.scr1, sizeof(target->thread.lbt.scr1));
+	r = membuf_write(&to, &target->thread.lbt.scr2, sizeof(target->thread.lbt.scr2));
+	r = membuf_write(&to, &target->thread.lbt.scr3, sizeof(target->thread.lbt.scr3));
+	r = membuf_write(&to, &target->thread.lbt.eflags, sizeof(u32));
+	r = membuf_write(&to, &target->thread.fpu.ftop, sizeof(u32));
+
+	return r;
+}
+
+static int lbt_set(struct task_struct *target,
+		   const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	int err = 0;
+	const int eflags_start = 4 * sizeof(target->thread.lbt.scr0);
+	const int ftop_start = eflags_start + sizeof(u32);
+
+	err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.lbt.scr0,
+				  0, 4 * sizeof(target->thread.lbt.scr0));
+	err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.lbt.eflags,
+				  eflags_start, ftop_start);
+	err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.fpu.ftop,
+				  ftop_start, ftop_start + sizeof(u32));
+
+	return err;
+}
+#endif /* CONFIG_CPU_HAS_LBT */
+
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
 /*
@@ -802,6 +843,9 @@ enum loongarch_regset {
 #ifdef CONFIG_CPU_HAS_LASX
 	REGSET_LASX,
 #endif
+#ifdef CONFIG_CPU_HAS_LBT
+	REGSET_LBT,
+#endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	REGSET_HW_BREAK,
 	REGSET_HW_WATCH,
@@ -853,6 +897,16 @@ static const struct user_regset loongarch64_regsets[] = {
 		.set		= simd_set,
 	},
 #endif
+#ifdef CONFIG_CPU_HAS_LBT
+	[REGSET_LBT] = {
+		.core_note_type	= NT_LOONGARCH_LBT,
+		.n		= 5,
+		.size		= sizeof(u64),
+		.align		= sizeof(u64),
+		.regset_get	= lbt_get,
+		.set		= lbt_set,
+	},
+#endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 	[REGSET_HW_BREAK] = {
 		.core_note_type = NT_LOONGARCH_HW_BREAK,
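For reference, the NT_LOONGARCH_LBT regset filled in by lbt_get() above is 40 bytes (n = 5, size = sizeof(u64)) and is laid out as sketched below; the struct and field names are illustrative, not a uapi definition.

/* Illustrative layout only -- not a uapi header. */
#include <stdint.h>

struct lbt_regset_view {
	uint64_t scr0;		/* LBT scratch registers scr0..scr3 */
	uint64_t scr1;
	uint64_t scr2;
	uint64_t scr3;
	uint32_t eflags;	/* emulated eflags, from thread.lbt.eflags */
	uint32_t ftop;		/* emulated x87 stack top, from thread.fpu.ftop */
};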
diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c
index 01f94d1e3edf6f8b26b31f30f2494a1ace76f0e8..6c3eff9af9fb1ed4cc4af8ffa9b9ea5490591ee3 100644
--- a/arch/loongarch/kernel/relocate.c
+++ b/arch/loongarch/kernel/relocate.c
@@ -157,12 +157,11 @@ static inline void __init update_reloc_offset(unsigned long *addr, long random_o
 	*new_addr = (unsigned long)reloc_offset;
 }
 
-void * __init relocate_kernel(void)
+unsigned long __init relocate_kernel(void)
 {
 	unsigned long kernel_length;
 	unsigned long random_offset = 0;
 	void *location_new = _text; /* Default to original kernel start */
-	void *kernel_entry = start_kernel; /* Default to original kernel entry point */
 	char *cmdline = early_ioremap(fw_arg1, COMMAND_LINE_SIZE); /* Boot command line is passed in fw_arg1 */
 
 	strscpy(boot_command_line, cmdline, COMMAND_LINE_SIZE);
@@ -190,9 +189,6 @@ void * __init relocate_kernel(void)
 
 		reloc_offset += random_offset;
 
-		/* Return the new kernel's entry point */
-		kernel_entry = RELOCATED_KASLR(start_kernel);
-
 		/* The current thread is now within the relocated kernel */
 		__current_thread_info = RELOCATED_KASLR(__current_thread_info);
 
@@ -204,7 +200,7 @@ void * __init relocate_kernel(void)
 
 	relocate_absolute(random_offset);
 
-	return kernel_entry;
+	return random_offset;
 }
 
 /*
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 9d830ab4e3025e8ab7afacfc21428dfa9665afde..7783f0a3d742c7e0cce68d93cc9252eee4e7f381 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -626,4 +626,8 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 	paging_init();
+
+#ifdef CONFIG_KASAN
+	kasan_init();
+#endif
 }
diff --git a/arch/loongarch/kernel/signal.c b/arch/loongarch/kernel/signal.c
index ceb899366c0a7b2ba2793e8631792f8aaaa03ab3..504fdfe852030f54ca66a00fdde05abd4d577907 100644
--- a/arch/loongarch/kernel/signal.c
+++ b/arch/loongarch/kernel/signal.c
@@ -32,6 +32,7 @@
 #include <asm/cacheflush.h>
 #include <asm/cpu-features.h>
 #include <asm/fpu.h>
+#include <asm/lbt.h>
 #include <asm/ucontext.h>
 #include <asm/vdso.h>
 
@@ -44,6 +45,9 @@
 /* Make sure we will not lose FPU ownership */
 #define lock_fpu_owner()	({ preempt_disable(); pagefault_disable(); })
 #define unlock_fpu_owner()	({ pagefault_enable(); preempt_enable(); })
+/* Make sure we will not lose LBT ownership */
+#define lock_lbt_owner()	({ preempt_disable(); pagefault_disable(); })
+#define unlock_lbt_owner()	({ pagefault_enable(); preempt_enable(); })
 
 /* Assembly functions to move context to/from the FPU */
 extern asmlinkage int
@@ -59,6 +63,13 @@ _save_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
 extern asmlinkage int
 _restore_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr);
 
+#ifdef CONFIG_CPU_HAS_LBT
+extern asmlinkage int _save_lbt_context(void __user *regs, void __user *eflags);
+extern asmlinkage int _restore_lbt_context(void __user *regs, void __user *eflags);
+extern asmlinkage int _save_ftop_context(void __user *ftop);
+extern asmlinkage int _restore_ftop_context(void __user *ftop);
+#endif
+
 struct rt_sigframe {
 	struct siginfo rs_info;
 	struct ucontext rs_uctx;
@@ -75,6 +86,7 @@ struct extctx_layout {
 	struct _ctx_layout fpu;
 	struct _ctx_layout lsx;
 	struct _ctx_layout lasx;
+	struct _ctx_layout lbt;
 	struct _ctx_layout end;
 };
 
@@ -215,6 +227,52 @@ static int copy_lasx_from_sigcontext(struct lasx_context __user *ctx)
 	return err;
 }
 
+#ifdef CONFIG_CPU_HAS_LBT
+static int copy_lbt_to_sigcontext(struct lbt_context __user *ctx)
+{
+	int err = 0;
+	uint64_t __user *regs	= (uint64_t *)&ctx->regs;
+	uint32_t __user *eflags	= (uint32_t *)&ctx->eflags;
+
+	err |= __put_user(current->thread.lbt.scr0, &regs[0]);
+	err |= __put_user(current->thread.lbt.scr1, &regs[1]);
+	err |= __put_user(current->thread.lbt.scr2, &regs[2]);
+	err |= __put_user(current->thread.lbt.scr3, &regs[3]);
+	err |= __put_user(current->thread.lbt.eflags, eflags);
+
+	return err;
+}
+
+static int copy_lbt_from_sigcontext(struct lbt_context __user *ctx)
+{
+	int err = 0;
+	uint64_t __user *regs	= (uint64_t *)&ctx->regs;
+	uint32_t __user *eflags	= (uint32_t *)&ctx->eflags;
+
+	err |= __get_user(current->thread.lbt.scr0, &regs[0]);
+	err |= __get_user(current->thread.lbt.scr1, &regs[1]);
+	err |= __get_user(current->thread.lbt.scr2, &regs[2]);
+	err |= __get_user(current->thread.lbt.scr3, &regs[3]);
+	err |= __get_user(current->thread.lbt.eflags, eflags);
+
+	return err;
+}
+
+static int copy_ftop_to_sigcontext(struct lbt_context __user *ctx)
+{
+	uint32_t  __user *ftop	= &ctx->ftop;
+
+	return __put_user(current->thread.fpu.ftop, ftop);
+}
+
+static int copy_ftop_from_sigcontext(struct lbt_context __user *ctx)
+{
+	uint32_t  __user *ftop	= &ctx->ftop;
+
+	return __get_user(current->thread.fpu.ftop, ftop);
+}
+#endif
+
 /*
  * Wrappers for the assembly _{save,restore}_fp_context functions.
  */
@@ -272,6 +330,41 @@ static int restore_hw_lasx_context(struct lasx_context __user *ctx)
 	return _restore_lasx_context(regs, fcc, fcsr);
 }
 
+/*
+ * Wrappers for the assembly _{save,restore}_lbt_context functions.
+ */
+#ifdef CONFIG_CPU_HAS_LBT
+static int save_hw_lbt_context(struct lbt_context __user *ctx)
+{
+	uint64_t __user *regs	= (uint64_t *)&ctx->regs;
+	uint32_t __user *eflags	= (uint32_t *)&ctx->eflags;
+
+	return _save_lbt_context(regs, eflags);
+}
+
+static int restore_hw_lbt_context(struct lbt_context __user *ctx)
+{
+	uint64_t __user *regs	= (uint64_t *)&ctx->regs;
+	uint32_t __user *eflags	= (uint32_t *)&ctx->eflags;
+
+	return _restore_lbt_context(regs, eflags);
+}
+
+static int save_hw_ftop_context(struct lbt_context __user *ctx)
+{
+	uint32_t __user *ftop	= &ctx->ftop;
+
+	return _save_ftop_context(ftop);
+}
+
+static int restore_hw_ftop_context(struct lbt_context __user *ctx)
+{
+	uint32_t __user *ftop	= &ctx->ftop;
+
+	return _restore_ftop_context(ftop);
+}
+#endif
+
 static int fcsr_pending(unsigned int __user *fcsr)
 {
 	int err, sig = 0;
@@ -519,6 +612,77 @@ static int protected_restore_lasx_context(struct extctx_layout *extctx)
 	return err ?: sig;
 }
 
+#ifdef CONFIG_CPU_HAS_LBT
+static int protected_save_lbt_context(struct extctx_layout *extctx)
+{
+	int err = 0;
+	struct sctx_info __user *info = extctx->lbt.addr;
+	struct lbt_context __user *lbt_ctx =
+		(struct lbt_context *)get_ctx_through_ctxinfo(info);
+	uint64_t __user *regs	= (uint64_t *)&lbt_ctx->regs;
+	uint32_t __user *eflags	= (uint32_t *)&lbt_ctx->eflags;
+
+	while (1) {
+		lock_lbt_owner();
+		if (is_lbt_owner())
+			err |= save_hw_lbt_context(lbt_ctx);
+		else
+			err |= copy_lbt_to_sigcontext(lbt_ctx);
+		if (is_fpu_owner())
+			err |= save_hw_ftop_context(lbt_ctx);
+		else
+			err |= copy_ftop_to_sigcontext(lbt_ctx);
+		unlock_lbt_owner();
+
+		err |= __put_user(LBT_CTX_MAGIC, &info->magic);
+		err |= __put_user(extctx->lbt.size, &info->size);
+
+		if (likely(!err))
+			break;
+		/* Touch the LBT context and try again */
+		err = __put_user(0, &regs[0]) | __put_user(0, eflags);
+
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+
+static int protected_restore_lbt_context(struct extctx_layout *extctx)
+{
+	int err = 0, tmp __maybe_unused;
+	struct sctx_info __user *info = extctx->lbt.addr;
+	struct lbt_context __user *lbt_ctx =
+		(struct lbt_context *)get_ctx_through_ctxinfo(info);
+	uint64_t __user *regs	= (uint64_t *)&lbt_ctx->regs;
+	uint32_t __user *eflags	= (uint32_t *)&lbt_ctx->eflags;
+
+	while (1) {
+		lock_lbt_owner();
+		if (is_lbt_owner())
+			err |= restore_hw_lbt_context(lbt_ctx);
+		else
+			err |= copy_lbt_from_sigcontext(lbt_ctx);
+		if (is_fpu_owner())
+			err |= restore_hw_ftop_context(lbt_ctx);
+		else
+			err |= copy_ftop_from_sigcontext(lbt_ctx);
+		unlock_lbt_owner();
+
+		if (likely(!err))
+			break;
+		/* Touch the LBT context and try again */
+		err = __get_user(tmp, &regs[0]) | __get_user(tmp, eflags);
+
+		if (err)
+			return err;
+	}
+
+	return err;
+}
+#endif
+
 static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 			    struct extctx_layout *extctx)
 {
@@ -539,6 +703,11 @@ static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	else if (extctx->fpu.addr)
 		err |= protected_save_fpu_context(extctx);
 
+#ifdef CONFIG_CPU_HAS_LBT
+	if (extctx->lbt.addr)
+		err |= protected_save_lbt_context(extctx);
+#endif
+
 	/* Set the "end" magic */
 	info = (struct sctx_info *)extctx->end.addr;
 	err |= __put_user(0, &info->magic);
@@ -584,6 +753,13 @@ static int parse_extcontext(struct sigcontext __user *sc, struct extctx_layout *
 			extctx->lasx.addr = info;
 			break;
 
+		case LBT_CTX_MAGIC:
+			if (size < (sizeof(struct sctx_info) +
+				    sizeof(struct lbt_context)))
+				goto invalid;
+			extctx->lbt.addr = info;
+			break;
+
 		default:
 			goto invalid;
 		}
@@ -636,6 +812,11 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc
 	else if (extctx.fpu.addr)
 		err |= protected_restore_fpu_context(&extctx);
 
+#ifdef CONFIG_CPU_HAS_LBT
+	if (extctx.lbt.addr)
+		err |= protected_restore_lbt_context(&extctx);
+#endif
+
 bad:
 	return err;
 }
@@ -700,6 +881,13 @@ static unsigned long setup_extcontext(struct extctx_layout *extctx, unsigned lon
 			  sizeof(struct fpu_context), FPU_CTX_ALIGN, new_sp);
 	}
 
+#ifdef CONFIG_CPU_HAS_LBT
+	if (cpu_has_lbt && thread_lbt_context_live()) {
+		new_sp = extframe_alloc(extctx, &extctx->lbt,
+			  sizeof(struct lbt_context), LBT_CTX_ALIGN, new_sp);
+	}
+#endif
+
 	return new_sp;
 }
 
diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c
index 2463d2fea21f5f4bc9dd762fd5b1ed3c77c0fc40..92270f14db948271b00167ef0887703620b50c31 100644
--- a/arch/loongarch/kernel/stacktrace.c
+++ b/arch/loongarch/kernel/stacktrace.c
@@ -18,17 +18,19 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
 	struct pt_regs dummyregs;
 	struct unwind_state state;
 
-	regs = &dummyregs;
+	if (!regs) {
+		regs = &dummyregs;
 
-	if (task == current) {
-		regs->regs[3] = (unsigned long)__builtin_frame_address(0);
-		regs->csr_era = (unsigned long)__builtin_return_address(0);
-	} else {
-		regs->regs[3] = thread_saved_fp(task);
-		regs->csr_era = thread_saved_ra(task);
+		if (task == current) {
+			regs->regs[3] = (unsigned long)__builtin_frame_address(0);
+			regs->csr_era = (unsigned long)__builtin_return_address(0);
+		} else {
+			regs->regs[3] = thread_saved_fp(task);
+			regs->csr_era = thread_saved_ra(task);
+		}
+		regs->regs[1] = 0;
 	}
 
-	regs->regs[1] = 0;
 	for (unwind_start(&state, task, regs);
 	     !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) {
 		addr = unwind_get_return_address(&state);
diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
index 89699db45cec17ab87724ca16b2d24995b88e99b..65214774ef7c6bade648ccf4f8702cdefc7baac1 100644
--- a/arch/loongarch/kernel/traps.c
+++ b/arch/loongarch/kernel/traps.c
@@ -36,7 +36,9 @@
 #include <asm/break.h>
 #include <asm/cpu.h>
 #include <asm/fpu.h>
+#include <asm/lbt.h>
 #include <asm/inst.h>
+#include <asm/kgdb.h>
 #include <asm/loongarch.h>
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
@@ -702,6 +704,11 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs)
 	 * pertain to them.
 	 */
 	switch (bcode) {
+	case BRK_KDB:
+		if (kgdb_breakpoint_handler(regs))
+			goto out;
+		else
+			break;
 	case BRK_KPROBE_BP:
 		if (kprobe_breakpoint_handler(regs))
 			goto out;
@@ -768,6 +775,9 @@ asmlinkage void noinstr do_watch(struct pt_regs *regs)
 #ifndef CONFIG_HAVE_HW_BREAKPOINT
 	pr_warn("Hardware watch point handler not implemented!\n");
 #else
+	if (kgdb_breakpoint_handler(regs))
+		goto out;
+
 	if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
 		int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
 		unsigned long pc = instruction_pointer(regs);
@@ -966,13 +976,47 @@ asmlinkage void noinstr do_lasx(struct pt_regs *regs)
 	irqentry_exit(regs, state);
 }
 
+static void init_restore_lbt(void)
+{
+	if (!thread_lbt_context_live()) {
+		/* First time LBT context user */
+		init_lbt();
+		set_thread_flag(TIF_LBT_CTX_LIVE);
+	} else {
+		if (!is_lbt_owner())
+			own_lbt_inatomic(1);
+	}
+
+	BUG_ON(!is_lbt_enabled());
+}
+
 asmlinkage void noinstr do_lbt(struct pt_regs *regs)
 {
 	irqentry_state_t state = irqentry_enter(regs);
 
-	local_irq_enable();
-	force_sig(SIGILL);
-	local_irq_disable();
+	/*
+	 * A BTD (Binary Translation Disable) exception can be triggered
+	 * during FP save/restore if TM (Top Mode) is on, which may cause
+	 * interrupts to be enabled in the middle of 'switch_to'. To avoid
+	 * this (including the case where userspace turns on TM via
+	 * 'MOVGR2GCSR', which will not trigger the BTE), check PRMD first.
+	 */
+	if (regs->csr_prmd & CSR_PRMD_PIE)
+		local_irq_enable();
+
+	if (!cpu_has_lbt) {
+		force_sig(SIGILL);
+		goto out;
+	}
+	BUG_ON(is_lbt_enabled());
+
+	preempt_disable();
+	init_restore_lbt();
+	preempt_enable();
+
+out:
+	if (regs->csr_prmd & CSR_PRMD_PIE)
+		local_irq_disable();
 
 	irqentry_exit(regs, state);
 }
diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
index d60d4e096cfa9e4ea1a5b3e1d2f01f1be26bc9b1..a77bf160bfc4246fa34cc666f037ace2eaf01996 100644
--- a/arch/loongarch/lib/Makefile
+++ b/arch/loongarch/lib/Makefile
@@ -6,4 +6,6 @@
 lib-y	+= delay.o memset.o memcpy.o memmove.o \
 	   clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o
 
+obj-$(CONFIG_CPU_HAS_LSX) += xor_simd.o xor_simd_glue.o
+
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S
index 0790eadce166dee8b3b9f96c8a371dadc4259353..be741544e62bf63f7198d451c72c8f47eb0501d7 100644
--- a/arch/loongarch/lib/clear_user.S
+++ b/arch/loongarch/lib/clear_user.S
@@ -11,19 +11,6 @@
 #include <asm/cpu.h>
 #include <asm/regdef.h>
 
-.irp to, 0, 1, 2, 3, 4, 5, 6, 7
-.L_fixup_handle_\to\():
-	sub.d	a0, a2, a0
-	addi.d	a0, a0, (\to) * (-8)
-	jr	ra
-.endr
-
-.irp to, 0, 2, 4
-.L_fixup_handle_s\to\():
-	addi.d	a0, a1, -\to
-	jr	ra
-.endr
-
 SYM_FUNC_START(__clear_user)
 	/*
 	 * Some CPUs support hardware unaligned access
@@ -51,7 +38,7 @@ SYM_FUNC_START(__clear_user_generic)
 2:	move	a0, a1
 	jr	ra
 
-	_asm_extable 1b, .L_fixup_handle_s0
+	_asm_extable 1b, 2b
 SYM_FUNC_END(__clear_user_generic)
 
 /*
@@ -173,33 +160,47 @@ SYM_FUNC_START(__clear_user_fast)
 	jr	ra
 
 	/* fixup and ex_table */
-	_asm_extable 0b, .L_fixup_handle_0
-	_asm_extable 1b, .L_fixup_handle_0
-	_asm_extable 2b, .L_fixup_handle_1
-	_asm_extable 3b, .L_fixup_handle_2
-	_asm_extable 4b, .L_fixup_handle_3
-	_asm_extable 5b, .L_fixup_handle_4
-	_asm_extable 6b, .L_fixup_handle_5
-	_asm_extable 7b, .L_fixup_handle_6
-	_asm_extable 8b, .L_fixup_handle_7
-	_asm_extable 9b, .L_fixup_handle_0
-	_asm_extable 10b, .L_fixup_handle_1
-	_asm_extable 11b, .L_fixup_handle_2
-	_asm_extable 12b, .L_fixup_handle_3
-	_asm_extable 13b, .L_fixup_handle_0
-	_asm_extable 14b, .L_fixup_handle_1
-	_asm_extable 15b, .L_fixup_handle_0
-	_asm_extable 16b, .L_fixup_handle_0
-	_asm_extable 17b, .L_fixup_handle_s0
-	_asm_extable 18b, .L_fixup_handle_s0
-	_asm_extable 19b, .L_fixup_handle_s0
-	_asm_extable 20b, .L_fixup_handle_s2
-	_asm_extable 21b, .L_fixup_handle_s0
-	_asm_extable 22b, .L_fixup_handle_s0
-	_asm_extable 23b, .L_fixup_handle_s4
-	_asm_extable 24b, .L_fixup_handle_s0
-	_asm_extable 25b, .L_fixup_handle_s4
-	_asm_extable 26b, .L_fixup_handle_s0
-	_asm_extable 27b, .L_fixup_handle_s4
-	_asm_extable 28b, .L_fixup_handle_s0
+.Llarge_fixup:
+	sub.d	a1, a2, a0
+
+.Lsmall_fixup:
+29:	st.b	zero, a0, 0
+	addi.d	a0, a0, 1
+	addi.d	a1, a1, -1
+	bgt	a1, zero, 29b
+
+.Lexit:
+	move	a0, a1
+	jr	ra
+
+	_asm_extable 0b, .Lsmall_fixup
+	_asm_extable 1b, .Llarge_fixup
+	_asm_extable 2b, .Llarge_fixup
+	_asm_extable 3b, .Llarge_fixup
+	_asm_extable 4b, .Llarge_fixup
+	_asm_extable 5b, .Llarge_fixup
+	_asm_extable 6b, .Llarge_fixup
+	_asm_extable 7b, .Llarge_fixup
+	_asm_extable 8b, .Llarge_fixup
+	_asm_extable 9b, .Llarge_fixup
+	_asm_extable 10b, .Llarge_fixup
+	_asm_extable 11b, .Llarge_fixup
+	_asm_extable 12b, .Llarge_fixup
+	_asm_extable 13b, .Llarge_fixup
+	_asm_extable 14b, .Llarge_fixup
+	_asm_extable 15b, .Llarge_fixup
+	_asm_extable 16b, .Llarge_fixup
+	_asm_extable 17b, .Lexit
+	_asm_extable 18b, .Lsmall_fixup
+	_asm_extable 19b, .Lsmall_fixup
+	_asm_extable 20b, .Lsmall_fixup
+	_asm_extable 21b, .Lsmall_fixup
+	_asm_extable 22b, .Lsmall_fixup
+	_asm_extable 23b, .Lsmall_fixup
+	_asm_extable 24b, .Lsmall_fixup
+	_asm_extable 25b, .Lsmall_fixup
+	_asm_extable 26b, .Lsmall_fixup
+	_asm_extable 27b, .Lsmall_fixup
+	_asm_extable 28b, .Lsmall_fixup
+	_asm_extable 29b, .Lexit
 SYM_FUNC_END(__clear_user_fast)
diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S
index bfe3d2793d0002cf68338000e202059be9cbc44a..feec3d3628032f433dbf65e006b8a07acfbc161c 100644
--- a/arch/loongarch/lib/copy_user.S
+++ b/arch/loongarch/lib/copy_user.S
@@ -11,19 +11,6 @@
 #include <asm/cpu.h>
 #include <asm/regdef.h>
 
-.irp to, 0, 1, 2, 3, 4, 5, 6, 7
-.L_fixup_handle_\to\():
-	sub.d	a0, a2, a0
-	addi.d	a0, a0, (\to) * (-8)
-	jr	ra
-.endr
-
-.irp to, 0, 2, 4
-.L_fixup_handle_s\to\():
-	addi.d	a0, a2, -\to
-	jr	ra
-.endr
-
 SYM_FUNC_START(__copy_user)
 	/*
 	 * Some CPUs support hardware unaligned access
@@ -54,8 +41,8 @@ SYM_FUNC_START(__copy_user_generic)
 3:	move	a0, a2
 	jr	ra
 
-	_asm_extable 1b, .L_fixup_handle_s0
-	_asm_extable 2b, .L_fixup_handle_s0
+	_asm_extable 1b, 3b
+	_asm_extable 2b, 3b
 SYM_FUNC_END(__copy_user_generic)
 
 /*
@@ -69,10 +56,10 @@ SYM_FUNC_START(__copy_user_fast)
 	sltui	t0, a2, 9
 	bnez	t0, .Lsmall
 
-	add.d	a3, a1, a2
-	add.d	a2, a0, a2
 0:	ld.d	t0, a1, 0
 1:	st.d	t0, a0, 0
+	add.d	a3, a1, a2
+	add.d	a2, a0, a2
 
 	/* align up destination address */
 	andi	t1, a0, 7
@@ -94,7 +81,6 @@ SYM_FUNC_START(__copy_user_fast)
 7:	ld.d	t5, a1, 40
 8:	ld.d	t6, a1, 48
 9:	ld.d	t7, a1, 56
-	addi.d	a1, a1, 64
 10:	st.d	t0, a0, 0
 11:	st.d	t1, a0, 8
 12:	st.d	t2, a0, 16
@@ -103,6 +89,7 @@ SYM_FUNC_START(__copy_user_fast)
 15:	st.d	t5, a0, 40
 16:	st.d	t6, a0, 48
 17:	st.d	t7, a0, 56
+	addi.d	a1, a1, 64
 	addi.d	a0, a0, 64
 	bltu	a1, a4, .Lloop64
 
@@ -114,11 +101,11 @@ SYM_FUNC_START(__copy_user_fast)
 19:	ld.d	t1, a1, 8
 20:	ld.d	t2, a1, 16
 21:	ld.d	t3, a1, 24
-	addi.d	a1, a1, 32
 22:	st.d	t0, a0, 0
 23:	st.d	t1, a0, 8
 24:	st.d	t2, a0, 16
 25:	st.d	t3, a0, 24
+	addi.d	a1, a1, 32
 	addi.d	a0, a0, 32
 
 .Llt32:
@@ -126,9 +113,9 @@ SYM_FUNC_START(__copy_user_fast)
 	bgeu	a1, a4, .Llt16
 26:	ld.d	t0, a1, 0
 27:	ld.d	t1, a1, 8
-	addi.d	a1, a1, 16
 28:	st.d	t0, a0, 0
 29:	st.d	t1, a0, 8
+	addi.d	a1, a1, 16
 	addi.d	a0, a0, 16
 
 .Llt16:
@@ -136,6 +123,7 @@ SYM_FUNC_START(__copy_user_fast)
 	bgeu	a1, a4, .Llt8
 30:	ld.d	t0, a1, 0
 31:	st.d	t0, a0, 0
+	addi.d	a1, a1, 8
 	addi.d	a0, a0, 8
 
 .Llt8:
@@ -214,62 +202,79 @@ SYM_FUNC_START(__copy_user_fast)
 	jr	ra
 
 	/* fixup and ex_table */
-	_asm_extable 0b, .L_fixup_handle_0
-	_asm_extable 1b, .L_fixup_handle_0
-	_asm_extable 2b, .L_fixup_handle_0
-	_asm_extable 3b, .L_fixup_handle_0
-	_asm_extable 4b, .L_fixup_handle_0
-	_asm_extable 5b, .L_fixup_handle_0
-	_asm_extable 6b, .L_fixup_handle_0
-	_asm_extable 7b, .L_fixup_handle_0
-	_asm_extable 8b, .L_fixup_handle_0
-	_asm_extable 9b, .L_fixup_handle_0
-	_asm_extable 10b, .L_fixup_handle_0
-	_asm_extable 11b, .L_fixup_handle_1
-	_asm_extable 12b, .L_fixup_handle_2
-	_asm_extable 13b, .L_fixup_handle_3
-	_asm_extable 14b, .L_fixup_handle_4
-	_asm_extable 15b, .L_fixup_handle_5
-	_asm_extable 16b, .L_fixup_handle_6
-	_asm_extable 17b, .L_fixup_handle_7
-	_asm_extable 18b, .L_fixup_handle_0
-	_asm_extable 19b, .L_fixup_handle_0
-	_asm_extable 20b, .L_fixup_handle_0
-	_asm_extable 21b, .L_fixup_handle_0
-	_asm_extable 22b, .L_fixup_handle_0
-	_asm_extable 23b, .L_fixup_handle_1
-	_asm_extable 24b, .L_fixup_handle_2
-	_asm_extable 25b, .L_fixup_handle_3
-	_asm_extable 26b, .L_fixup_handle_0
-	_asm_extable 27b, .L_fixup_handle_0
-	_asm_extable 28b, .L_fixup_handle_0
-	_asm_extable 29b, .L_fixup_handle_1
-	_asm_extable 30b, .L_fixup_handle_0
-	_asm_extable 31b, .L_fixup_handle_0
-	_asm_extable 32b, .L_fixup_handle_0
-	_asm_extable 33b, .L_fixup_handle_0
-	_asm_extable 34b, .L_fixup_handle_s0
-	_asm_extable 35b, .L_fixup_handle_s0
-	_asm_extable 36b, .L_fixup_handle_s0
-	_asm_extable 37b, .L_fixup_handle_s0
-	_asm_extable 38b, .L_fixup_handle_s0
-	_asm_extable 39b, .L_fixup_handle_s0
-	_asm_extable 40b, .L_fixup_handle_s0
-	_asm_extable 41b, .L_fixup_handle_s2
-	_asm_extable 42b, .L_fixup_handle_s0
-	_asm_extable 43b, .L_fixup_handle_s0
-	_asm_extable 44b, .L_fixup_handle_s0
-	_asm_extable 45b, .L_fixup_handle_s0
-	_asm_extable 46b, .L_fixup_handle_s0
-	_asm_extable 47b, .L_fixup_handle_s4
-	_asm_extable 48b, .L_fixup_handle_s0
-	_asm_extable 49b, .L_fixup_handle_s0
-	_asm_extable 50b, .L_fixup_handle_s0
-	_asm_extable 51b, .L_fixup_handle_s4
-	_asm_extable 52b, .L_fixup_handle_s0
-	_asm_extable 53b, .L_fixup_handle_s0
-	_asm_extable 54b, .L_fixup_handle_s0
-	_asm_extable 55b, .L_fixup_handle_s4
-	_asm_extable 56b, .L_fixup_handle_s0
-	_asm_extable 57b, .L_fixup_handle_s0
+.Llarge_fixup:
+	sub.d	a2, a2, a0
+
+.Lsmall_fixup:
+58:	ld.b	t0, a1, 0
+59:	st.b	t0, a0, 0
+	addi.d	a0, a0, 1
+	addi.d	a1, a1, 1
+	addi.d	a2, a2, -1
+	bgt	a2, zero, 58b
+
+.Lexit:
+	move	a0, a2
+	jr	ra
+
+	_asm_extable 0b, .Lsmall_fixup
+	_asm_extable 1b, .Lsmall_fixup
+	_asm_extable 2b, .Llarge_fixup
+	_asm_extable 3b, .Llarge_fixup
+	_asm_extable 4b, .Llarge_fixup
+	_asm_extable 5b, .Llarge_fixup
+	_asm_extable 6b, .Llarge_fixup
+	_asm_extable 7b, .Llarge_fixup
+	_asm_extable 8b, .Llarge_fixup
+	_asm_extable 9b, .Llarge_fixup
+	_asm_extable 10b, .Llarge_fixup
+	_asm_extable 11b, .Llarge_fixup
+	_asm_extable 12b, .Llarge_fixup
+	_asm_extable 13b, .Llarge_fixup
+	_asm_extable 14b, .Llarge_fixup
+	_asm_extable 15b, .Llarge_fixup
+	_asm_extable 16b, .Llarge_fixup
+	_asm_extable 17b, .Llarge_fixup
+	_asm_extable 18b, .Llarge_fixup
+	_asm_extable 19b, .Llarge_fixup
+	_asm_extable 20b, .Llarge_fixup
+	_asm_extable 21b, .Llarge_fixup
+	_asm_extable 22b, .Llarge_fixup
+	_asm_extable 23b, .Llarge_fixup
+	_asm_extable 24b, .Llarge_fixup
+	_asm_extable 25b, .Llarge_fixup
+	_asm_extable 26b, .Llarge_fixup
+	_asm_extable 27b, .Llarge_fixup
+	_asm_extable 28b, .Llarge_fixup
+	_asm_extable 29b, .Llarge_fixup
+	_asm_extable 30b, .Llarge_fixup
+	_asm_extable 31b, .Llarge_fixup
+	_asm_extable 32b, .Llarge_fixup
+	_asm_extable 33b, .Llarge_fixup
+	_asm_extable 34b, .Lexit
+	_asm_extable 35b, .Lexit
+	_asm_extable 36b, .Lsmall_fixup
+	_asm_extable 37b, .Lsmall_fixup
+	_asm_extable 38b, .Lsmall_fixup
+	_asm_extable 39b, .Lsmall_fixup
+	_asm_extable 40b, .Lsmall_fixup
+	_asm_extable 41b, .Lsmall_fixup
+	_asm_extable 42b, .Lsmall_fixup
+	_asm_extable 43b, .Lsmall_fixup
+	_asm_extable 44b, .Lsmall_fixup
+	_asm_extable 45b, .Lsmall_fixup
+	_asm_extable 46b, .Lsmall_fixup
+	_asm_extable 47b, .Lsmall_fixup
+	_asm_extable 48b, .Lsmall_fixup
+	_asm_extable 49b, .Lsmall_fixup
+	_asm_extable 50b, .Lsmall_fixup
+	_asm_extable 51b, .Lsmall_fixup
+	_asm_extable 52b, .Lsmall_fixup
+	_asm_extable 53b, .Lsmall_fixup
+	_asm_extable 54b, .Lsmall_fixup
+	_asm_extable 55b, .Lsmall_fixup
+	_asm_extable 56b, .Lsmall_fixup
+	_asm_extable 57b, .Lsmall_fixup
+	_asm_extable 58b, .Lexit
+	_asm_extable 59b, .Lexit
 SYM_FUNC_END(__copy_user_fast)
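For reference, a C-level model of the contract that the new .Llarge_fixup/.Lsmall_fixup/.Lexit paths implement for __copy_user_fast (and, analogously, __clear_user_fast earlier): on a fault the copy degrades to a byte-by-byte loop and the function returns the number of bytes left uncopied; the function below is illustrative only.

/* Illustrative model only -- not part of the patch. */
static unsigned long copy_user_fixup_model(void *to, const void *from,
					   unsigned long n)
{
	unsigned char *d = to;
	const unsigned char *s = from;

	while (n > 0) {
		/*
		 * In the real code each access may fault; the extable entry
		 * then redirects to the byte loop, and a fault inside the
		 * byte loop exits with the remaining count still in n.
		 */
		*d++ = *s++;
		n--;
	}

	return n;	/* 0 on success, number of uncopied bytes otherwise */
}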
diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S
index cc30b3b6252f7fd2ecc70fca7b0ffed82d64714f..fa1148878d2b9d06ccdfcf6f2c14fd302fd14114 100644
--- a/arch/loongarch/lib/memcpy.S
+++ b/arch/loongarch/lib/memcpy.S
@@ -10,6 +10,8 @@
 #include <asm/cpu.h>
 #include <asm/regdef.h>
 
+.section .noinstr.text, "ax"
+
 SYM_FUNC_START(memcpy)
 	/*
 	 * Some CPUs support hardware unaligned access
@@ -17,9 +19,13 @@ SYM_FUNC_START(memcpy)
 	ALTERNATIVE	"b __memcpy_generic", \
 			"b __memcpy_fast", CPU_FEATURE_UAL
 SYM_FUNC_END(memcpy)
-_ASM_NOKPROBE(memcpy)
+SYM_FUNC_ALIAS(__memcpy, memcpy)
 
 EXPORT_SYMBOL(memcpy)
+EXPORT_SYMBOL(__memcpy)
+
+_ASM_NOKPROBE(memcpy)
+_ASM_NOKPROBE(__memcpy)
 
 /*
  * void *__memcpy_generic(void *dst, const void *src, size_t n)
diff --git a/arch/loongarch/lib/memmove.S b/arch/loongarch/lib/memmove.S
index 7dc76d1484b6a84252a76c5f334a69d252d24465..82dae062fec85497a3c01d4fb3084df8ebb277c9 100644
--- a/arch/loongarch/lib/memmove.S
+++ b/arch/loongarch/lib/memmove.S
@@ -10,23 +10,29 @@
 #include <asm/cpu.h>
 #include <asm/regdef.h>
 
+.section .noinstr.text, "ax"
+
 SYM_FUNC_START(memmove)
-	blt	a0, a1, memcpy	/* dst < src, memcpy */
-	blt	a1, a0, rmemcpy	/* src < dst, rmemcpy */
-	jr	ra		/* dst == src, return */
+	blt	a0, a1, __memcpy	/* dst < src, memcpy */
+	blt	a1, a0, __rmemcpy	/* src < dst, rmemcpy */
+	jr	ra			/* dst == src, return */
 SYM_FUNC_END(memmove)
-_ASM_NOKPROBE(memmove)
+SYM_FUNC_ALIAS(__memmove, memmove)
 
 EXPORT_SYMBOL(memmove)
+EXPORT_SYMBOL(__memmove)
+
+_ASM_NOKPROBE(memmove)
+_ASM_NOKPROBE(__memmove)
 
-SYM_FUNC_START(rmemcpy)
+SYM_FUNC_START(__rmemcpy)
 	/*
 	 * Some CPUs support hardware unaligned access
 	 */
 	ALTERNATIVE	"b __rmemcpy_generic", \
 			"b __rmemcpy_fast", CPU_FEATURE_UAL
-SYM_FUNC_END(rmemcpy)
-_ASM_NOKPROBE(rmemcpy)
+SYM_FUNC_END(__rmemcpy)
+_ASM_NOKPROBE(__rmemcpy)
 
 /*
  * void *__rmemcpy_generic(void *dst, const void *src, size_t n)
diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S
index 3f20f7996e8ed6e3a38ffcffb1eab7629275da17..06d3ca54cbfe7d73c6cc7a5cadeca4d558b3e6dc 100644
--- a/arch/loongarch/lib/memset.S
+++ b/arch/loongarch/lib/memset.S
@@ -16,6 +16,8 @@
 	bstrins.d \r0, \r0, 63, 32
 .endm
 
+.section .noinstr.text, "ax"
+
 SYM_FUNC_START(memset)
 	/*
 	 * Some CPUs support hardware unaligned access
@@ -23,9 +25,13 @@ SYM_FUNC_START(memset)
 	ALTERNATIVE	"b __memset_generic", \
 			"b __memset_fast", CPU_FEATURE_UAL
 SYM_FUNC_END(memset)
-_ASM_NOKPROBE(memset)
+SYM_FUNC_ALIAS(__memset, memset)
 
 EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+
+_ASM_NOKPROBE(memset)
+_ASM_NOKPROBE(__memset)
 
 /*
  * void *__memset_generic(void *s, int c, size_t n)
diff --git a/arch/loongarch/lib/xor_simd.c b/arch/loongarch/lib/xor_simd.c
new file mode 100644
index 0000000000000000000000000000000000000000..84cd24b728c47968344ef2b771895e0e321262dd
--- /dev/null
+++ b/arch/loongarch/lib/xor_simd.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * LoongArch SIMD XOR operations
+ *
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ */
+
+#include "xor_simd.h"
+
+/*
+ * Process one cache line (64 bytes) per loop. This assumes that future
+ * popular LoongArch cores will have performance characteristics similar
+ * to the current models.
+ */
+#define LINE_WIDTH 64
+
+#ifdef CONFIG_CPU_HAS_LSX
+
+#define LD(reg, base, offset)	\
+	"vld $vr" #reg ", %[" #base "], " #offset "\n\t"
+#define ST(reg, base, offset)	\
+	"vst $vr" #reg ", %[" #base "], " #offset "\n\t"
+#define XOR(dj, k)	"vxor.v $vr" #dj ", $vr" #dj ", $vr" #k "\n\t"
+
+#define LD_INOUT_LINE(base)	\
+	LD(0, base, 0)		\
+	LD(1, base, 16)		\
+	LD(2, base, 32)		\
+	LD(3, base, 48)
+
+#define LD_AND_XOR_LINE(base)	\
+	LD(4, base, 0)		\
+	LD(5, base, 16)		\
+	LD(6, base, 32)		\
+	LD(7, base, 48)		\
+	XOR(0, 4)		\
+	XOR(1, 5)		\
+	XOR(2, 6)		\
+	XOR(3, 7)
+
+#define ST_LINE(base)		\
+	ST(0, base, 0)		\
+	ST(1, base, 16)		\
+	ST(2, base, 32)		\
+	ST(3, base, 48)
+
+#define XOR_FUNC_NAME(nr) __xor_lsx_##nr
+#include "xor_template.c"
+
+#undef LD
+#undef ST
+#undef XOR
+#undef LD_INOUT_LINE
+#undef LD_AND_XOR_LINE
+#undef ST_LINE
+#undef XOR_FUNC_NAME
+
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+
+#define LD(reg, base, offset)	\
+	"xvld $xr" #reg ", %[" #base "], " #offset "\n\t"
+#define ST(reg, base, offset)	\
+	"xvst $xr" #reg ", %[" #base "], " #offset "\n\t"
+#define XOR(dj, k)	"xvxor.v $xr" #dj ", $xr" #dj ", $xr" #k "\n\t"
+
+#define LD_INOUT_LINE(base)	\
+	LD(0, base, 0)		\
+	LD(1, base, 32)
+
+#define LD_AND_XOR_LINE(base)	\
+	LD(2, base, 0)		\
+	LD(3, base, 32)		\
+	XOR(0, 2)		\
+	XOR(1, 3)
+
+#define ST_LINE(base)		\
+	ST(0, base, 0)		\
+	ST(1, base, 32)
+
+#define XOR_FUNC_NAME(nr) __xor_lasx_##nr
+#include "xor_template.c"
+
+#undef LD
+#undef ST
+#undef XOR
+#undef LD_INOUT_LINE
+#undef LD_AND_XOR_LINE
+#undef ST_LINE
+#undef XOR_FUNC_NAME
+
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/arch/loongarch/lib/xor_simd.h b/arch/loongarch/lib/xor_simd.h
new file mode 100644
index 0000000000000000000000000000000000000000..f50f32514d80481519b4a5eed0e688fcbb232f47
--- /dev/null
+++ b/arch/loongarch/lib/xor_simd.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Simple interface to link xor_simd.c and xor_simd_glue.c
+ *
+ * Separating these files ensures that no SIMD instructions are run outside of
+ * the kfpu critical section.
+ */
+
+#ifndef __LOONGARCH_LIB_XOR_SIMD_H
+#define __LOONGARCH_LIB_XOR_SIMD_H
+
+#ifdef CONFIG_CPU_HAS_LSX
+void __xor_lsx_2(unsigned long bytes, unsigned long * __restrict p1,
+		 const unsigned long * __restrict p2);
+void __xor_lsx_3(unsigned long bytes, unsigned long * __restrict p1,
+		 const unsigned long * __restrict p2, const unsigned long * __restrict p3);
+void __xor_lsx_4(unsigned long bytes, unsigned long * __restrict p1,
+		 const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+		 const unsigned long * __restrict p4);
+void __xor_lsx_5(unsigned long bytes, unsigned long * __restrict p1,
+		 const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+		 const unsigned long * __restrict p4, const unsigned long * __restrict p5);
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+void __xor_lasx_2(unsigned long bytes, unsigned long * __restrict p1,
+		  const unsigned long * __restrict p2);
+void __xor_lasx_3(unsigned long bytes, unsigned long * __restrict p1,
+		  const unsigned long * __restrict p2, const unsigned long * __restrict p3);
+void __xor_lasx_4(unsigned long bytes, unsigned long * __restrict p1,
+		  const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+		  const unsigned long * __restrict p4);
+void __xor_lasx_5(unsigned long bytes, unsigned long * __restrict p1,
+		  const unsigned long * __restrict p2, const unsigned long * __restrict p3,
+		  const unsigned long * __restrict p4, const unsigned long * __restrict p5);
+#endif /* CONFIG_CPU_HAS_LASX */
+
+#endif /* __LOONGARCH_LIB_XOR_SIMD_H */
diff --git a/arch/loongarch/lib/xor_simd_glue.c b/arch/loongarch/lib/xor_simd_glue.c
new file mode 100644
index 0000000000000000000000000000000000000000..393f689dbcf67827a034e07f4c2b4ce43b44b721
--- /dev/null
+++ b/arch/loongarch/lib/xor_simd_glue.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * LoongArch SIMD XOR operations
+ *
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ */
+
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <asm/fpu.h>
+#include <asm/xor_simd.h>
+#include "xor_simd.h"
+
+#define MAKE_XOR_GLUE_2(flavor)							\
+void xor_##flavor##_2(unsigned long bytes, unsigned long * __restrict p1,	\
+		      const unsigned long * __restrict p2)			\
+{										\
+	kernel_fpu_begin();							\
+	__xor_##flavor##_2(bytes, p1, p2);					\
+	kernel_fpu_end();							\
+}										\
+EXPORT_SYMBOL_GPL(xor_##flavor##_2)
+
+#define MAKE_XOR_GLUE_3(flavor)							\
+void xor_##flavor##_3(unsigned long bytes, unsigned long * __restrict p1,	\
+		      const unsigned long * __restrict p2,			\
+		      const unsigned long * __restrict p3)			\
+{										\
+	kernel_fpu_begin();							\
+	__xor_##flavor##_3(bytes, p1, p2, p3);					\
+	kernel_fpu_end();							\
+}										\
+EXPORT_SYMBOL_GPL(xor_##flavor##_3)
+
+#define MAKE_XOR_GLUE_4(flavor)							\
+void xor_##flavor##_4(unsigned long bytes, unsigned long * __restrict p1,	\
+		      const unsigned long * __restrict p2,			\
+		      const unsigned long * __restrict p3,			\
+		      const unsigned long * __restrict p4)			\
+{										\
+	kernel_fpu_begin();							\
+	__xor_##flavor##_4(bytes, p1, p2, p3, p4);				\
+	kernel_fpu_end();							\
+}										\
+EXPORT_SYMBOL_GPL(xor_##flavor##_4)
+
+#define MAKE_XOR_GLUE_5(flavor)							\
+void xor_##flavor##_5(unsigned long bytes, unsigned long * __restrict p1,	\
+		      const unsigned long * __restrict p2,			\
+		      const unsigned long * __restrict p3,			\
+		      const unsigned long * __restrict p4,			\
+		      const unsigned long * __restrict p5)			\
+{										\
+	kernel_fpu_begin();							\
+	__xor_##flavor##_5(bytes, p1, p2, p3, p4, p5);				\
+	kernel_fpu_end();							\
+}										\
+EXPORT_SYMBOL_GPL(xor_##flavor##_5)
+
+#define MAKE_XOR_GLUES(flavor)		\
+	MAKE_XOR_GLUE_2(flavor);	\
+	MAKE_XOR_GLUE_3(flavor);	\
+	MAKE_XOR_GLUE_4(flavor);	\
+	MAKE_XOR_GLUE_5(flavor)
+
+#ifdef CONFIG_CPU_HAS_LSX
+MAKE_XOR_GLUES(lsx);
+#endif
+
+#ifdef CONFIG_CPU_HAS_LASX
+MAKE_XOR_GLUES(lasx);
+#endif
diff --git a/arch/loongarch/lib/xor_template.c b/arch/loongarch/lib/xor_template.c
new file mode 100644
index 0000000000000000000000000000000000000000..0358ced7fe333838e4c0ca4c144a13b12925d062
--- /dev/null
+++ b/arch/loongarch/lib/xor_template.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Template for XOR operations, instantiated in xor_simd.c.
+ *
+ * Expected preprocessor definitions:
+ *
+ * - LINE_WIDTH
+ * - XOR_FUNC_NAME(nr)
+ * - LD_INOUT_LINE(buf)
+ * - LD_AND_XOR_LINE(buf)
+ * - ST_LINE(buf)
+ */
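+/*
+ * Note: each invocation processes whole LINE_WIDTH-byte lines; callers are
+ * expected to pass a byte count of at least one full line, since the
+ * do-while loops below always execute at least once.
+ */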
+
+void XOR_FUNC_NAME(2)(unsigned long bytes,
+		      unsigned long * __restrict v1,
+		      const unsigned long * __restrict v2)
+{
+	unsigned long lines = bytes / LINE_WIDTH;
+
+	do {
+		__asm__ __volatile__ (
+			LD_INOUT_LINE(v1)
+			LD_AND_XOR_LINE(v2)
+			ST_LINE(v1)
+		: : [v1] "r"(v1), [v2] "r"(v2) : "memory"
+		);
+
+		v1 += LINE_WIDTH / sizeof(unsigned long);
+		v2 += LINE_WIDTH / sizeof(unsigned long);
+	} while (--lines > 0);
+}
+
+void XOR_FUNC_NAME(3)(unsigned long bytes,
+		      unsigned long * __restrict v1,
+		      const unsigned long * __restrict v2,
+		      const unsigned long * __restrict v3)
+{
+	unsigned long lines = bytes / LINE_WIDTH;
+
+	do {
+		__asm__ __volatile__ (
+			LD_INOUT_LINE(v1)
+			LD_AND_XOR_LINE(v2)
+			LD_AND_XOR_LINE(v3)
+			ST_LINE(v1)
+		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3) : "memory"
+		);
+
+		v1 += LINE_WIDTH / sizeof(unsigned long);
+		v2 += LINE_WIDTH / sizeof(unsigned long);
+		v3 += LINE_WIDTH / sizeof(unsigned long);
+	} while (--lines > 0);
+}
+
+void XOR_FUNC_NAME(4)(unsigned long bytes,
+		      unsigned long * __restrict v1,
+		      const unsigned long * __restrict v2,
+		      const unsigned long * __restrict v3,
+		      const unsigned long * __restrict v4)
+{
+	unsigned long lines = bytes / LINE_WIDTH;
+
+	do {
+		__asm__ __volatile__ (
+			LD_INOUT_LINE(v1)
+			LD_AND_XOR_LINE(v2)
+			LD_AND_XOR_LINE(v3)
+			LD_AND_XOR_LINE(v4)
+			ST_LINE(v1)
+		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4)
+		: "memory"
+		);
+
+		v1 += LINE_WIDTH / sizeof(unsigned long);
+		v2 += LINE_WIDTH / sizeof(unsigned long);
+		v3 += LINE_WIDTH / sizeof(unsigned long);
+		v4 += LINE_WIDTH / sizeof(unsigned long);
+	} while (--lines > 0);
+}
+
+void XOR_FUNC_NAME(5)(unsigned long bytes,
+		      unsigned long * __restrict v1,
+		      const unsigned long * __restrict v2,
+		      const unsigned long * __restrict v3,
+		      const unsigned long * __restrict v4,
+		      const unsigned long * __restrict v5)
+{
+	unsigned long lines = bytes / LINE_WIDTH;
+
+	do {
+		__asm__ __volatile__ (
+			LD_INOUT_LINE(v1)
+			LD_AND_XOR_LINE(v2)
+			LD_AND_XOR_LINE(v3)
+			LD_AND_XOR_LINE(v4)
+			LD_AND_XOR_LINE(v5)
+			ST_LINE(v1)
+		: : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4),
+		    [v5] "r"(v5) : "memory"
+		);
+
+		v1 += LINE_WIDTH / sizeof(unsigned long);
+		v2 += LINE_WIDTH / sizeof(unsigned long);
+		v3 += LINE_WIDTH / sizeof(unsigned long);
+		v4 += LINE_WIDTH / sizeof(unsigned long);
+		v5 += LINE_WIDTH / sizeof(unsigned long);
+	} while (--lines > 0);
+}
diff --git a/arch/loongarch/mm/Makefile b/arch/loongarch/mm/Makefile
index 8ffc6383f8360e33a833be7a09de3934c89a5d91..e4d1e581dbae3aee294ad1eb81ec72516dbcd9ae 100644
--- a/arch/loongarch/mm/Makefile
+++ b/arch/loongarch/mm/Makefile
@@ -7,3 +7,6 @@ obj-y				+= init.o cache.o tlb.o tlbex.o extable.o \
 				   fault.o ioremap.o maccess.o mmap.o pgtable.o page.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+obj-$(CONFIG_KASAN)		+= kasan_init.o
+
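+# KASAN's own initialization code must not itself be KASAN-instrumented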
+KASAN_SANITIZE_kasan_init.o     := n
diff --git a/arch/loongarch/mm/cache.c b/arch/loongarch/mm/cache.c
index 72685a48eaf084f61ce967bcd16fd0bb79bf9ce4..6be04d36ca0769658a2b52d25af50dd6ad7e07e0 100644
--- a/arch/loongarch/mm/cache.c
+++ b/arch/loongarch/mm/cache.c
@@ -156,7 +156,6 @@ void cpu_cache_init(void)
 
 	current_cpu_data.cache_leaves_present = leaf;
 	current_cpu_data.options |= LOONGARCH_CPU_PREFETCH;
-	shm_align_mask = PAGE_SIZE - 1;
 }
 
 static const pgprot_t protection_map[16] = {
diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c
index da5b6d518cdb1d6c5ec550083f52f588ccf29066..e6376e3dce862ff83d5995154ab48331a59d4586 100644
--- a/arch/loongarch/mm/fault.c
+++ b/arch/loongarch/mm/fault.c
@@ -23,6 +23,7 @@
 #include <linux/kprobes.h>
 #include <linux/perf_event.h>
 #include <linux/uaccess.h>
+#include <linux/kfence.h>
 
 #include <asm/branch.h>
 #include <asm/mmu_context.h>
@@ -30,7 +31,8 @@
 
 int show_unhandled_signals = 1;
 
-static void __kprobes no_context(struct pt_regs *regs, unsigned long address)
+static void __kprobes no_context(struct pt_regs *regs,
+			unsigned long write, unsigned long address)
 {
 	const int field = sizeof(unsigned long) * 2;
 
@@ -38,6 +40,9 @@ static void __kprobes no_context(struct pt_regs *regs, unsigned long address)
 	if (fixup_exception(regs))
 		return;
 
+	if (kfence_handle_page_fault(address, write, regs))
+		return;
+
 	/*
 	 * Oops. The kernel tried to access some bad page. We'll have to
 	 * terminate things with extreme prejudice.
@@ -51,14 +56,15 @@ static void __kprobes no_context(struct pt_regs *regs, unsigned long address)
 	die("Oops", regs);
 }
 
-static void __kprobes do_out_of_memory(struct pt_regs *regs, unsigned long address)
+static void __kprobes do_out_of_memory(struct pt_regs *regs,
+			unsigned long write, unsigned long address)
 {
 	/*
 	 * We ran out of memory, call the OOM killer, and return the userspace
 	 * (which will retry the fault, or kill us if we got oom-killed).
 	 */
 	if (!user_mode(regs)) {
-		no_context(regs, address);
+		no_context(regs, write, address);
 		return;
 	}
 	pagefault_out_of_memory();
@@ -69,7 +75,7 @@ static void __kprobes do_sigbus(struct pt_regs *regs,
 {
 	/* Kernel mode? Handle exceptions or die */
 	if (!user_mode(regs)) {
-		no_context(regs, address);
+		no_context(regs, write, address);
 		return;
 	}
 
@@ -90,7 +96,7 @@ static void __kprobes do_sigsegv(struct pt_regs *regs,
 
 	/* Kernel mode? Handle exceptions or die */
 	if (!user_mode(regs)) {
-		no_context(regs, address);
+		no_context(regs, write, address);
 		return;
 	}
 
@@ -149,7 +155,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 	 */
 	if (address & __UA_LIMIT) {
 		if (!user_mode(regs))
-			no_context(regs, address);
+			no_context(regs, write, address);
 		else
 			do_sigsegv(regs, write, address, si_code);
 		return;
@@ -211,7 +217,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 
 	if (fault_signal_pending(fault, regs)) {
 		if (!user_mode(regs))
-			no_context(regs, address);
+			no_context(regs, write, address);
 		return;
 	}
 
@@ -232,7 +238,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		mmap_read_unlock(mm);
 		if (fault & VM_FAULT_OOM) {
-			do_out_of_memory(regs, address);
+			do_out_of_memory(regs, write, address);
 			return;
 		} else if (fault & VM_FAULT_SIGSEGV) {
 			do_sigsegv(regs, write, address, si_code);
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index 3b7d8129570b83ac87455e8bb1c2f7c8a0ee35b1..f3fe8c06ba4db352ca373824254b782dc9893226 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -35,33 +35,8 @@
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
-/*
- * We have up to 8 empty zeroed pages so we can map one of the right colour
- * when needed.	 Since page is never written to after the initialization we
- * don't have to care about aliases on other CPUs.
- */
-unsigned long empty_zero_page, zero_page_mask;
+unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
 EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(zero_page_mask);
-
-void setup_zero_pages(void)
-{
-	unsigned int order, i;
-	struct page *page;
-
-	order = 0;
-
-	empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-	if (!empty_zero_page)
-		panic("Oh boy, that early out of memory?");
-
-	page = virt_to_page((void *)empty_zero_page);
-	split_page(page, order);
-	for (i = 0; i < (1 << order); i++, page++)
-		mark_page_reserved(page);
-
-	zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;
-}
 
 void copy_user_highpage(struct page *to, struct page *from,
 	unsigned long vaddr, struct vm_area_struct *vma)
@@ -106,7 +81,6 @@ void __init mem_init(void)
 	high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
 
 	memblock_free_all();
-	setup_zero_pages();	/* Setup zeroed pages.  */
 }
 #endif /* !CONFIG_NUMA */
 
@@ -191,43 +165,42 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *al
 #endif
 #endif
 
-static pte_t *fixmap_pte(unsigned long addr)
+pte_t * __init populate_kernel_pte(unsigned long addr)
 {
-	pgd_t *pgd;
-	p4d_t *p4d;
+	pgd_t *pgd = pgd_offset_k(addr);
+	p4d_t *p4d = p4d_offset(pgd, addr);
 	pud_t *pud;
 	pmd_t *pmd;
 
-	pgd = pgd_offset_k(addr);
-	p4d = p4d_offset(pgd, addr);
-
-	if (pgd_none(*pgd)) {
-		pud_t *new __maybe_unused;
-
-		new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
-		pgd_populate(&init_mm, pgd, new);
+	if (p4d_none(*p4d)) {
+		pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+		if (!pud)
+			panic("%s: Failed to allocate memory\n", __func__);
+		p4d_populate(&init_mm, p4d, pud);
 #ifndef __PAGETABLE_PUD_FOLDED
-		pud_init(new);
+		pud_init(pud);
 #endif
 	}
 
 	pud = pud_offset(p4d, addr);
 	if (pud_none(*pud)) {
-		pmd_t *new __maybe_unused;
-
-		new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
-		pud_populate(&init_mm, pud, new);
+		pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+		if (!pmd)
+			panic("%s: Failed to allocate memory\n", __func__);
+		pud_populate(&init_mm, pud, pmd);
 #ifndef __PAGETABLE_PMD_FOLDED
-		pmd_init(new);
+		pmd_init(pmd);
 #endif
 	}
 
 	pmd = pmd_offset(pud, addr);
-	if (pmd_none(*pmd)) {
-		pte_t *new __maybe_unused;
+	if (!pmd_present(*pmd)) {
+		pte_t *pte;
 
-		new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
-		pmd_populate_kernel(&init_mm, pmd, new);
+		pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+		if (!pte)
+			panic("%s: Failed to allocate memory\n", __func__);
+		pmd_populate_kernel(&init_mm, pmd, pte);
 	}
 
 	return pte_offset_kernel(pmd, addr);
@@ -241,7 +214,7 @@ void __init __set_fixmap(enum fixed_addresses idx,
 
 	BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
 
-	ptep = fixmap_pte(addr);
+	ptep = populate_kernel_pte(addr);
 	if (!pte_none(*ptep)) {
 		pte_ERROR(*ptep);
 		return;
diff --git a/arch/loongarch/mm/kasan_init.c b/arch/loongarch/mm/kasan_init.c
new file mode 100644
index 0000000000000000000000000000000000000000..da68bc1a464348affc033b559759a784a6080b2b
--- /dev/null
+++ b/arch/loongarch/mm/kasan_init.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ */
+#define pr_fmt(fmt) "kasan: " fmt
+#include <linux/kasan.h>
+#include <linux/memblock.h>
+#include <linux/sched/task.h>
+
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm-generic/sections.h>
+
+static pgd_t kasan_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
+
+#ifdef __PAGETABLE_PUD_FOLDED
+#define __p4d_none(early, p4d) (0)
+#else
+#define __p4d_none(early, p4d) (early ? (p4d_val(p4d) == 0) : \
+(__pa(p4d_val(p4d)) == (unsigned long)__pa(kasan_early_shadow_pud)))
+#endif
+
+#ifdef __PAGETABLE_PMD_FOLDED
+#define __pud_none(early, pud) (0)
+#else
+#define __pud_none(early, pud) (early ? (pud_val(pud) == 0) : \
+(__pa(pud_val(pud)) == (unsigned long)__pa(kasan_early_shadow_pmd)))
+#endif
+
+#define __pmd_none(early, pmd) (early ? (pmd_val(pmd) == 0) : \
+(__pa(pmd_val(pmd)) == (unsigned long)__pa(kasan_early_shadow_pte)))
+
+#define __pte_none(early, pte) (early ? pte_none(pte) : \
+((pte_val(pte) & _PFN_MASK) == (unsigned long)__pa(kasan_early_shadow_page)))
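+
+/*
+ * With "early" set, an entry is considered "none" only when it is completely
+ * empty; once the early stage is over, it is considered "none" while it still
+ * points at the shared early shadow tables, telling the populate helpers
+ * below to replace it with a freshly allocated table.
+ */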
+
+bool kasan_early_stage = true;
+
+/*
+ * Allocate memory for the shadow memory page tables.
+ */
+static phys_addr_t __init kasan_alloc_zeroed_page(int node)
+{
+	void *p = memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE,
+					__pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, node);
+	if (!p)
+		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%llx\n",
+			__func__, PAGE_SIZE, PAGE_SIZE, node, __pa(MAX_DMA_ADDRESS));
+
+	return __pa(p);
+}
+
+static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, bool early)
+{
+	if (__pmd_none(early, READ_ONCE(*pmdp))) {
+		phys_addr_t pte_phys = early ?
+				__pa_symbol(kasan_early_shadow_pte) : kasan_alloc_zeroed_page(node);
+		if (!early)
+			memcpy(__va(pte_phys), kasan_early_shadow_pte, sizeof(kasan_early_shadow_pte));
+		pmd_populate_kernel(NULL, pmdp, (pte_t *)__va(pte_phys));
+	}
+
+	return pte_offset_kernel(pmdp, addr);
+}
+
+static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, bool early)
+{
+	if (__pud_none(early, READ_ONCE(*pudp))) {
+		phys_addr_t pmd_phys = early ?
+				__pa_symbol(kasan_early_shadow_pmd) : kasan_alloc_zeroed_page(node);
+		if (!early)
+			memcpy(__va(pmd_phys), kasan_early_shadow_pmd, sizeof(kasan_early_shadow_pmd));
+		pud_populate(&init_mm, pudp, (pmd_t *)__va(pmd_phys));
+	}
+
+	return pmd_offset(pudp, addr);
+}
+
+static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early)
+{
+	if (__p4d_none(early, READ_ONCE(*p4dp))) {
+		phys_addr_t pud_phys = early ?
+			__pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node);
+		if (!early)
+			memcpy(__va(pud_phys), kasan_early_shadow_pud, sizeof(kasan_early_shadow_pud));
+		p4d_populate(&init_mm, p4dp, (pud_t *)__va(pud_phys));
+	}
+
+	return pud_offset(p4dp, addr);
+}
+
+static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
+				      unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	pte_t *ptep = kasan_pte_offset(pmdp, addr, node, early);
+
+	do {
+		phys_addr_t page_phys = early ?
+					__pa_symbol(kasan_early_shadow_page)
+					      : kasan_alloc_zeroed_page(node);
+		next = addr + PAGE_SIZE;
+		set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
+	} while (ptep++, addr = next, addr != end && __pte_none(early, READ_ONCE(*ptep)));
+}
+
+static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr,
+				      unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	pmd_t *pmdp = kasan_pmd_offset(pudp, addr, node, early);
+
+	do {
+		next = pmd_addr_end(addr, end);
+		kasan_pte_populate(pmdp, addr, next, node, early);
+	} while (pmdp++, addr = next, addr != end && __pmd_none(early, READ_ONCE(*pmdp)));
+}
+
+static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr,
+					    unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	pud_t *pudp = kasan_pud_offset(p4dp, addr, node, early);
+
+	do {
+		next = pud_addr_end(addr, end);
+		kasan_pmd_populate(pudp, addr, next, node, early);
+	} while (pudp++, addr = next, addr != end);
+}
+
+static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr,
+					    unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	p4d_t *p4dp = p4d_offset(pgdp, addr);
+
+	do {
+		next = p4d_addr_end(addr, end);
+		kasan_pud_populate(p4dp, addr, next, node, early);
+	} while (p4dp++, addr = next, addr != end);
+}
+
+static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
+				      int node, bool early)
+{
+	unsigned long next;
+	pgd_t *pgdp;
+
+	pgdp = pgd_offset_k(addr);
+
+	do {
+		next = pgd_addr_end(addr, end);
+		kasan_p4d_populate(pgdp, addr, next, node, early);
+	} while (pgdp++, addr = next, addr != end);
+
+}
+
+/* Set up full kasan mappings, ensuring that the mapped pages are zeroed */
+static void __init kasan_map_populate(unsigned long start, unsigned long end,
+				      int node)
+{
+	kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false);
+}
+
+asmlinkage void __init kasan_early_init(void)
+{
+	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
+	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
+}
+
+static inline void kasan_set_pgd(pgd_t *pgdp, pgd_t pgdval)
+{
+	WRITE_ONCE(*pgdp, pgdval);
+}
+
+static void __init clear_pgds(unsigned long start, unsigned long end)
+{
+	/*
+	 * Remove references to kasan page tables from
+	 * swapper_pg_dir. pgd_clear() can't be used
+	 * here because it's nop on 2,3-level pagetable setups
+	 */
+	for (; start < end; start += PGDIR_SIZE)
+		kasan_set_pgd((pgd_t *)pgd_offset_k(start), __pgd(0));
+}
+
+void __init kasan_init(void)
+{
+	u64 i;
+	phys_addr_t pa_start, pa_end;
+
+	/*
+	 * PGD was populated as invalid_pmd_table or invalid_pud_table
+	 * in pagetable_init() (which one depends on how many levels of
+	 * page table are in use), so the pgd entries covering the KASAN
+	 * shadow range are non-zero. They have to be cleared here:
+	 * otherwise pgd_none() would be false and the populate calls
+	 * below would never install new pgd-level tables.
+	 */
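+	/*
+	 * Switch to a throwaway copy of swapper_pg_dir so the kernel keeps
+	 * running while the shadow entries in swapper_pg_dir itself are
+	 * cleared and repopulated; PGDH is pointed back at swapper_pg_dir
+	 * once the real shadow mappings are in place.
+	 */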
+	memcpy(kasan_pg_dir, swapper_pg_dir, sizeof(kasan_pg_dir));
+	csr_write64(__pa_symbol(kasan_pg_dir), LOONGARCH_CSR_PGDH);
+	local_flush_tlb_all();
+
+	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+	/* Maps everything to a single page of zeroes */
+	kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE, true);
+
+	kasan_populate_early_shadow(kasan_mem_to_shadow((void *)VMALLOC_START),
+					kasan_mem_to_shadow((void *)KFENCE_AREA_END));
+
+	kasan_early_stage = false;
+
+	/* Populate the linear mapping */
+	for_each_mem_range(i, &pa_start, &pa_end) {
+		void *start = (void *)phys_to_virt(pa_start);
+		void *end   = (void *)phys_to_virt(pa_end);
+
+		if (start >= end)
+			break;
+
+		kasan_map_populate((unsigned long)kasan_mem_to_shadow(start),
+			(unsigned long)kasan_mem_to_shadow(end), NUMA_NO_NODE);
+	}
+
+	/* Populate modules mapping */
+	kasan_map_populate((unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR),
+		(unsigned long)kasan_mem_to_shadow((void *)MODULES_END), NUMA_NO_NODE);
+	/*
+	 * KASAN may reuse the contents of kasan_early_shadow_pte directly, so
+	 * make sure that it maps the zero page read-only.
+	 */
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		set_pte(&kasan_early_shadow_pte[i],
+			pfn_pte(__phys_to_pfn(__pa_symbol(kasan_early_shadow_page)), PAGE_KERNEL_RO));
+
+	memset(kasan_early_shadow_page, 0, PAGE_SIZE);
+	csr_write64(__pa_symbol(swapper_pg_dir), LOONGARCH_CSR_PGDH);
+	local_flush_tlb_all();
+
+	/* At this point kasan is fully initialized. Enable error messages */
+	init_task.kasan_depth = 0;
+	pr_info("KernelAddressSanitizer initialized.\n");
+}
diff --git a/arch/loongarch/mm/mmap.c b/arch/loongarch/mm/mmap.c
index fbe1a4856fc42d4f255d9e9c3a9e349c3fa09e9a..a9630a81b38abbfc575ea4174af049ccd5a9a888 100644
--- a/arch/loongarch/mm/mmap.c
+++ b/arch/loongarch/mm/mmap.c
@@ -8,12 +8,11 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 
-unsigned long shm_align_mask = PAGE_SIZE - 1;	/* Sane caches */
-EXPORT_SYMBOL(shm_align_mask);
+#define SHM_ALIGN_MASK	(SHMLBA - 1)
 
-#define COLOUR_ALIGN(addr, pgoff)				\
-	((((addr) + shm_align_mask) & ~shm_align_mask) +	\
-	 (((pgoff) << PAGE_SHIFT) & shm_align_mask))
+#define COLOUR_ALIGN(addr, pgoff)			\
+	((((addr) + SHM_ALIGN_MASK) & ~SHM_ALIGN_MASK)	\
+	 + (((pgoff) << PAGE_SHIFT) & SHM_ALIGN_MASK))
 
 enum mmap_allocation_direction {UP, DOWN};
 
@@ -40,7 +39,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
 		 * cache aliasing constraints.
 		 */
 		if ((flags & MAP_SHARED) &&
-		    ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask))
+		    ((addr - (pgoff << PAGE_SHIFT)) & SHM_ALIGN_MASK))
 			return -EINVAL;
 		return addr;
 	}
@@ -63,7 +62,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
 	}
 
 	info.length = len;
-	info.align_mask = do_color_align ? (PAGE_MASK & shm_align_mask) : 0;
+	info.align_mask = do_color_align ? (PAGE_MASK & SHM_ALIGN_MASK) : 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
 
 	if (dir == DOWN) {
diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c
index b14343e211b63f492b7aedd6ee0db9071156aa4a..71d0539e2d0b0207f901e3ef75679bc4e6bc2fa3 100644
--- a/arch/loongarch/mm/pgtable.c
+++ b/arch/loongarch/mm/pgtable.c
@@ -9,6 +9,18 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
+struct page *dmw_virt_to_page(unsigned long kaddr)
+{
+	return pfn_to_page(virt_to_pfn(kaddr));
+}
+EXPORT_SYMBOL_GPL(dmw_virt_to_page);
+
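+/*
+ * Unlike dmw_virt_to_page() above, which handles kernel addresses inside a
+ * Direct Mapped Window via a fixed linear translation, this walks the kernel
+ * page table and therefore also works for TLB-mapped (e.g. vmalloc) addresses.
+ */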
+struct page *tlb_virt_to_page(unsigned long kaddr)
+{
+	return pfn_to_page(pte_pfn(*virt_to_kpte(kaddr)));
+}
+EXPORT_SYMBOL_GPL(tlb_virt_to_page);
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *init, *ret = NULL;
diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index a50308b6fc259082fb61e29e7b5b9fb99082fb83..5c97d14633282186534b2aea039add1af77b0518 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0
 # Objects to go into the VDSO.
 
+KASAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
 # Include the generic Makefile to check the built vdso.
 include $(srctree)/lib/vdso/Makefile
 
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 819b6bc8ac0886d8d671a117ac9d9c694baebe95..3df5499f79369df25448c3069f00888d0cbb31f2 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -54,11 +54,13 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 int kasan_populate_early_shadow(const void *shadow_start,
 				const void *shadow_end);
 
+#ifndef __HAVE_ARCH_SHADOW_MAP
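+/* Architectures that define __HAVE_ARCH_SHADOW_MAP supply their own kasan_mem_to_shadow() */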
 static inline void *kasan_mem_to_shadow(const void *addr)
 {
 	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
 		+ KASAN_SHADOW_OFFSET;
 }
+#endif
 
 int kasan_add_zero_shadow(void *start, unsigned long size);
 void kasan_remove_zero_shadow(void *start, unsigned long size);
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index f29aaaf2eb21e9bdab1943c50309919bec16effe..006e18decfad0796a9b8d14cc09170c074493c27 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -108,6 +108,8 @@ extern const struct raid6_calls raid6_vpermxor1;
 extern const struct raid6_calls raid6_vpermxor2;
 extern const struct raid6_calls raid6_vpermxor4;
 extern const struct raid6_calls raid6_vpermxor8;
+extern const struct raid6_calls raid6_lsx;
+extern const struct raid6_calls raid6_lasx;
 
 struct raid6_recov_calls {
 	void (*data2)(int, size_t, int, int, void **);
@@ -123,6 +125,8 @@ extern const struct raid6_recov_calls raid6_recov_avx2;
 extern const struct raid6_recov_calls raid6_recov_avx512;
 extern const struct raid6_recov_calls raid6_recov_s390xc;
 extern const struct raid6_recov_calls raid6_recov_neon;
+extern const struct raid6_recov_calls raid6_recov_lsx;
+extern const struct raid6_recov_calls raid6_recov_lasx;
 
 extern const struct raid6_calls raid6_neonx1;
 extern const struct raid6_calls raid6_neonx2;
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 45e17619422b4ef4b3c278b242806668d034b19f..035b0a4db476a1fba02620c2a422323dc5a1f560 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -9,6 +9,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \
                               vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
+raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o
 
 hostprogs	+= mktables
 
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index a22a05c9af8a3a75610fd2eca0c0b195009e2fe9..0ec534faf019bc966095f9cecf082720a834d412 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -73,6 +73,14 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_neonx2,
 	&raid6_neonx1,
 #endif
+#ifdef CONFIG_LOONGARCH
+#ifdef CONFIG_CPU_HAS_LASX
+	&raid6_lasx,
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+	&raid6_lsx,
+#endif
+#endif
 #if defined(__ia64__)
 	&raid6_intx32,
 	&raid6_intx16,
@@ -103,6 +111,14 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 #endif
 #if defined(CONFIG_KERNEL_MODE_NEON)
 	&raid6_recov_neon,
+#endif
+#ifdef CONFIG_LOONGARCH
+#ifdef CONFIG_CPU_HAS_LASX
+	&raid6_recov_lasx,
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+	&raid6_recov_lsx,
+#endif
 #endif
 	&raid6_recov_intx1,
 	NULL
diff --git a/lib/raid6/loongarch.h b/lib/raid6/loongarch.h
new file mode 100644
index 0000000000000000000000000000000000000000..acfc33ce705625058131f4fd54bd4c5a64af0d30
--- /dev/null
+++ b/lib/raid6/loongarch.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * raid6/loongarch.h
+ *
+ * Definitions common to LoongArch RAID-6 code only
+ */
+
+#ifndef _LIB_RAID6_LOONGARCH_H
+#define _LIB_RAID6_LOONGARCH_H
+
+#ifdef __KERNEL__
+
+#include <asm/cpu-features.h>
+#include <asm/fpu.h>
+
+#else /* for user-space testing */
+
+#include <sys/auxv.h>
+
+/* Have to supply these defines ourselves for glibc 2.37 and earlier, and for musl */
+#ifndef HWCAP_LOONGARCH_LSX
+#define HWCAP_LOONGARCH_LSX	(1 << 4)
+#endif
+#ifndef HWCAP_LOONGARCH_LASX
+#define HWCAP_LOONGARCH_LASX	(1 << 5)
+#endif
+
+#define kernel_fpu_begin()
+#define kernel_fpu_end()
+
+#define cpu_has_lsx	(getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LSX)
+#define cpu_has_lasx	(getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LASX)
+
+#endif /* __KERNEL__ */
+
+#endif /* _LIB_RAID6_LOONGARCH_H */
diff --git a/lib/raid6/loongarch_simd.c b/lib/raid6/loongarch_simd.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa5d9f924ca39694f7487cf3efc7295c78574c66
--- /dev/null
+++ b/lib/raid6/loongarch_simd.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
+ *
+ * Copyright 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Based on the generic RAID-6 code (int.uc):
+ *
+ * Copyright 2002-2004 H. Peter Anvin
+ */
+
+#include <linux/raid/pq.h>
+#include "loongarch.h"
+
+/*
+ * The vector algorithms are currently priority 0, which means the generic
+ * scalar algorithms are not disabled when vector support is present.
+ * This mirrors the LoongArch RAID5 XOR code, and the main reason is repeated
+ * here: it cannot be ruled out at this point that some future (maybe reduced)
+ * models could run the vector algorithms slower than the scalar ones, perhaps
+ * for errata or micro-op reasons. It may be appropriate to revisit this after
+ * one or two more uarch generations.
+ */
+
+#ifdef CONFIG_CPU_HAS_LSX
+#define NSIZE 16
+
+static int raid6_has_lsx(void)
+{
+	return cpu_has_lsx;
+}
+
+static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	/*
+	 * $vr0, $vr1, $vr2, $vr3: wp
+	 * $vr4, $vr5, $vr6, $vr7: wq
+	 * $vr8, $vr9, $vr10, $vr11: wd
+	 * $vr12, $vr13, $vr14, $vr15: w2
+	 * $vr16, $vr17, $vr18, $vr19: w1
+	 */
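+	/*
+	 * Within the inner loop, the running Q (wq) is multiplied by 2 in
+	 * GF(2^8) with the RAID-6 polynomial 0x11d: vslti.b turns each byte's
+	 * sign bit into an all-ones mask, vslli.b shifts every byte left by
+	 * one, and the mask ANDed with 0x1d is XORed back in to reduce the
+	 * carried-out bit.
+	 */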
+	for (d = 0; d < bytes; d += NSIZE*4) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
+		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
+		asm volatile("vori.b $vr4, $vr0, 0");
+		asm volatile("vori.b $vr5, $vr1, 0");
+		asm volatile("vori.b $vr6, $vr2, 0");
+		asm volatile("vori.b $vr7, $vr3, 0");
+		for (z = z0-1; z >= 0; z--) {
+			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
+			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
+			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
+			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
+			/* wp$$ ^= wd$$; */
+			asm volatile("vxor.v $vr0, $vr0, $vr8");
+			asm volatile("vxor.v $vr1, $vr1, $vr9");
+			asm volatile("vxor.v $vr2, $vr2, $vr10");
+			asm volatile("vxor.v $vr3, $vr3, $vr11");
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("vslti.b $vr12, $vr4, 0");
+			asm volatile("vslti.b $vr13, $vr5, 0");
+			asm volatile("vslti.b $vr14, $vr6, 0");
+			asm volatile("vslti.b $vr15, $vr7, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("vslli.b $vr16, $vr4, 1");
+			asm volatile("vslli.b $vr17, $vr5, 1");
+			asm volatile("vslli.b $vr18, $vr6, 1");
+			asm volatile("vslli.b $vr19, $vr7, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("vandi.b $vr12, $vr12, 0x1d");
+			asm volatile("vandi.b $vr13, $vr13, 0x1d");
+			asm volatile("vandi.b $vr14, $vr14, 0x1d");
+			asm volatile("vandi.b $vr15, $vr15, 0x1d");
+			/* w1$$ ^= w2$$; */
+			asm volatile("vxor.v $vr16, $vr16, $vr12");
+			asm volatile("vxor.v $vr17, $vr17, $vr13");
+			asm volatile("vxor.v $vr18, $vr18, $vr14");
+			asm volatile("vxor.v $vr19, $vr19, $vr15");
+			/* wq$$ = w1$$ ^ wd$$; */
+			asm volatile("vxor.v $vr4, $vr16, $vr8");
+			asm volatile("vxor.v $vr5, $vr17, $vr9");
+			asm volatile("vxor.v $vr6, $vr18, $vr10");
+			asm volatile("vxor.v $vr7, $vr19, $vr11");
+		}
+		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
+		asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
+		asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
+		asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
+		asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
+		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
+		asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
+		asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
+		asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
+		asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
+				   size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	/*
+	 * $vr0, $vr1, $vr2, $vr3: wp
+	 * $vr4, $vr5, $vr6, $vr7: wq
+	 * $vr8, $vr9, $vr10, $vr11: wd
+	 * $vr12, $vr13, $vr14, $vr15: w2
+	 * $vr16, $vr17, $vr18, $vr19: w1
+	 */
+	for (d = 0; d < bytes; d += NSIZE*4) {
+		/* P/Q data pages */
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
+		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
+		asm volatile("vori.b $vr4, $vr0, 0");
+		asm volatile("vori.b $vr5, $vr1, 0");
+		asm volatile("vori.b $vr6, $vr2, 0");
+		asm volatile("vori.b $vr7, $vr3, 0");
+		for (z = z0-1; z >= start; z--) {
+			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
+			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
+			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
+			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
+			/* wp$$ ^= wd$$; */
+			asm volatile("vxor.v $vr0, $vr0, $vr8");
+			asm volatile("vxor.v $vr1, $vr1, $vr9");
+			asm volatile("vxor.v $vr2, $vr2, $vr10");
+			asm volatile("vxor.v $vr3, $vr3, $vr11");
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("vslti.b $vr12, $vr4, 0");
+			asm volatile("vslti.b $vr13, $vr5, 0");
+			asm volatile("vslti.b $vr14, $vr6, 0");
+			asm volatile("vslti.b $vr15, $vr7, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("vslli.b $vr16, $vr4, 1");
+			asm volatile("vslli.b $vr17, $vr5, 1");
+			asm volatile("vslli.b $vr18, $vr6, 1");
+			asm volatile("vslli.b $vr19, $vr7, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("vandi.b $vr12, $vr12, 0x1d");
+			asm volatile("vandi.b $vr13, $vr13, 0x1d");
+			asm volatile("vandi.b $vr14, $vr14, 0x1d");
+			asm volatile("vandi.b $vr15, $vr15, 0x1d");
+			/* w1$$ ^= w2$$; */
+			asm volatile("vxor.v $vr16, $vr16, $vr12");
+			asm volatile("vxor.v $vr17, $vr17, $vr13");
+			asm volatile("vxor.v $vr18, $vr18, $vr14");
+			asm volatile("vxor.v $vr19, $vr19, $vr15");
+			/* wq$$ = w1$$ ^ wd$$; */
+			asm volatile("vxor.v $vr4, $vr16, $vr8");
+			asm volatile("vxor.v $vr5, $vr17, $vr9");
+			asm volatile("vxor.v $vr6, $vr18, $vr10");
+			asm volatile("vxor.v $vr7, $vr19, $vr11");
+		}
+
+		/* P/Q left side optimization */
+		for (z = start-1; z >= 0; z--) {
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("vslti.b $vr12, $vr4, 0");
+			asm volatile("vslti.b $vr13, $vr5, 0");
+			asm volatile("vslti.b $vr14, $vr6, 0");
+			asm volatile("vslti.b $vr15, $vr7, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("vslli.b $vr16, $vr4, 1");
+			asm volatile("vslli.b $vr17, $vr5, 1");
+			asm volatile("vslli.b $vr18, $vr6, 1");
+			asm volatile("vslli.b $vr19, $vr7, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("vandi.b $vr12, $vr12, 0x1d");
+			asm volatile("vandi.b $vr13, $vr13, 0x1d");
+			asm volatile("vandi.b $vr14, $vr14, 0x1d");
+			asm volatile("vandi.b $vr15, $vr15, 0x1d");
+			/* wq$$ = w1$$ ^ w2$$; */
+			asm volatile("vxor.v $vr4, $vr16, $vr12");
+			asm volatile("vxor.v $vr5, $vr17, $vr13");
+			asm volatile("vxor.v $vr6, $vr18, $vr14");
+			asm volatile("vxor.v $vr7, $vr19, $vr15");
+		}
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 */
+		asm volatile(
+			"vld $vr20, %0\n\t"
+			"vld $vr21, %1\n\t"
+			"vld $vr22, %2\n\t"
+			"vld $vr23, %3\n\t"
+			"vld $vr24, %4\n\t"
+			"vld $vr25, %5\n\t"
+			"vld $vr26, %6\n\t"
+			"vld $vr27, %7\n\t"
+			"vxor.v $vr20, $vr20, $vr0\n\t"
+			"vxor.v $vr21, $vr21, $vr1\n\t"
+			"vxor.v $vr22, $vr22, $vr2\n\t"
+			"vxor.v $vr23, $vr23, $vr3\n\t"
+			"vxor.v $vr24, $vr24, $vr4\n\t"
+			"vxor.v $vr25, $vr25, $vr5\n\t"
+			"vxor.v $vr26, $vr26, $vr6\n\t"
+			"vxor.v $vr27, $vr27, $vr7\n\t"
+			"vst $vr20, %0\n\t"
+			"vst $vr21, %1\n\t"
+			"vst $vr22, %2\n\t"
+			"vst $vr23, %3\n\t"
+			"vst $vr24, %4\n\t"
+			"vst $vr25, %5\n\t"
+			"vst $vr26, %6\n\t"
+			"vst $vr27, %7\n\t"
+			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
+			  "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
+			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
+			  "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
+		);
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_lsx = {
+	raid6_lsx_gen_syndrome,
+	raid6_lsx_xor_syndrome,
+	raid6_has_lsx,
+	"lsx",
+	.priority = 0 /* see the comment near the top of the file for reason */
+};
+
+#undef NSIZE
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+#define NSIZE 32
+
+static int raid6_has_lasx(void)
+{
+	return cpu_has_lasx;
+}
+
+static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0+1];		/* XOR parity */
+	q = dptr[z0+2];		/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	/*
+	 * $xr0, $xr1: wp
+	 * $xr2, $xr3: wq
+	 * $xr4, $xr5: wd
+	 * $xr6, $xr7: w2
+	 * $xr8, $xr9: w1
+	 */
+	for (d = 0; d < bytes; d += NSIZE*2) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+		asm volatile("xvori.b $xr2, $xr0, 0");
+		asm volatile("xvori.b $xr3, $xr1, 0");
+		for (z = z0-1; z >= 0; z--) {
+			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
+			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
+			/* wp$$ ^= wd$$; */
+			asm volatile("xvxor.v $xr0, $xr0, $xr4");
+			asm volatile("xvxor.v $xr1, $xr1, $xr5");
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("xvslti.b $xr6, $xr2, 0");
+			asm volatile("xvslti.b $xr7, $xr3, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("xvslli.b $xr8, $xr2, 1");
+			asm volatile("xvslli.b $xr9, $xr3, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+			/* w1$$ ^= w2$$; */
+			asm volatile("xvxor.v $xr8, $xr8, $xr6");
+			asm volatile("xvxor.v $xr9, $xr9, $xr7");
+			/* wq$$ = w1$$ ^ wd$$; */
+			asm volatile("xvxor.v $xr2, $xr8, $xr4");
+			asm volatile("xvxor.v $xr3, $xr9, $xr5");
+		}
+		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
+		asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
+		asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
+		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
+		asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
+		asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
+				    size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	/*
+	 * $xr0, $xr1: wp
+	 * $xr2, $xr3: wq
+	 * $xr4, $xr5: wd
+	 * $xr6, $xr7: w2
+	 * $xr8, $xr9: w1
+	 */
+	for (d = 0; d < bytes; d += NSIZE*2) {
+		/* P/Q data pages */
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+		asm volatile("xvori.b $xr2, $xr0, 0");
+		asm volatile("xvori.b $xr3, $xr1, 0");
+		for (z = z0-1; z >= start; z--) {
+			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
+			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
+			/* wp$$ ^= wd$$; */
+			asm volatile("xvxor.v $xr0, $xr0, $xr4");
+			asm volatile("xvxor.v $xr1, $xr1, $xr5");
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("xvslti.b $xr6, $xr2, 0");
+			asm volatile("xvslti.b $xr7, $xr3, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("xvslli.b $xr8, $xr2, 1");
+			asm volatile("xvslli.b $xr9, $xr3, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+			/* w1$$ ^= w2$$; */
+			asm volatile("xvxor.v $xr8, $xr8, $xr6");
+			asm volatile("xvxor.v $xr9, $xr9, $xr7");
+			/* wq$$ = w1$$ ^ wd$$; */
+			asm volatile("xvxor.v $xr2, $xr8, $xr4");
+			asm volatile("xvxor.v $xr3, $xr9, $xr5");
+		}
+
+		/* P/Q left side optimization */
+		for (z = start-1; z >= 0; z--) {
+			/* w2$$ = MASK(wq$$); */
+			asm volatile("xvslti.b $xr6, $xr2, 0");
+			asm volatile("xvslti.b $xr7, $xr3, 0");
+			/* w1$$ = SHLBYTE(wq$$); */
+			asm volatile("xvslli.b $xr8, $xr2, 1");
+			asm volatile("xvslli.b $xr9, $xr3, 1");
+			/* w2$$ &= NBYTES(0x1d); */
+			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+			/* wq$$ = w1$$ ^ w2$$; */
+			asm volatile("xvxor.v $xr2, $xr8, $xr6");
+			asm volatile("xvxor.v $xr3, $xr9, $xr7");
+		}
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 */
+		asm volatile(
+			"xvld $xr10, %0\n\t"
+			"xvld $xr11, %1\n\t"
+			"xvld $xr12, %2\n\t"
+			"xvld $xr13, %3\n\t"
+			"xvxor.v $xr10, $xr10, $xr0\n\t"
+			"xvxor.v $xr11, $xr11, $xr1\n\t"
+			"xvxor.v $xr12, $xr12, $xr2\n\t"
+			"xvxor.v $xr13, $xr13, $xr3\n\t"
+			"xvst $xr10, %0\n\t"
+			"xvst $xr11, %1\n\t"
+			"xvst $xr12, %2\n\t"
+			"xvst $xr13, %3\n\t"
+			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
+			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
+		);
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_lasx = {
+	raid6_lasx_gen_syndrome,
+	raid6_lasx_xor_syndrome,
+	raid6_has_lasx,
+	"lasx",
+	.priority = 0 /* see the comment near the top of the file for reason */
+};
+#undef NSIZE
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c
new file mode 100644
index 0000000000000000000000000000000000000000..94aeac85e6f75e393363868c6306aff8d4521a5b
--- /dev/null
+++ b/lib/raid6/recov_loongarch_simd.c
@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
+ *
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Originally based on recov_avx2.c and recov_ssse3.c:
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ */
+
+#include <linux/raid/pq.h>
+#include "loongarch.h"
+
+/*
+ * Unlike the syndrome calculation algorithms, recovery algorithms are not
+ * selected by boot-time benchmarking, so we have to specify the priorities
+ * up front and hope that future cores will all have decent vector support
+ * (i.e. LASX never slower than LSX, and neither slower than scalar code).
+ */
+
+#ifdef CONFIG_CPU_HAS_LSX
+static int raid6_has_lsx(void)
+{
+	return cpu_has_lsx;
+}
+
+static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
+				  int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dp;
+	ptrs[failb] = dq;
+	ptrs[disks - 2] = p;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	/*
+	 * vr20, vr21: qmul
+	 * vr22, vr23: pbmul
+	 */
+	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
+	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
+	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
+	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
+
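+	/*
+	 * The GF(2^8) multiplications below are done as two 16-entry table
+	 * lookups with vshuf.b: the low nibble of each byte indexes the first
+	 * half of the multiplier table and the high nibble indexes the second
+	 * half, and the two partial products are XORed together.
+	 */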
+	while (bytes) {
+		/* vr4 - vr7: Q */
+		asm volatile("vld $vr4, %0" : : "m" (q[0]));
+		asm volatile("vld $vr5, %0" : : "m" (q[16]));
+		asm volatile("vld $vr6, %0" : : "m" (q[32]));
+		asm volatile("vld $vr7, %0" : : "m" (q[48]));
+		/*  vr4 - vr7: Q + Qxy */
+		asm volatile("vld $vr8, %0" : : "m" (dq[0]));
+		asm volatile("vld $vr9, %0" : : "m" (dq[16]));
+		asm volatile("vld $vr10, %0" : : "m" (dq[32]));
+		asm volatile("vld $vr11, %0" : : "m" (dq[48]));
+		asm volatile("vxor.v $vr4, $vr4, $vr8");
+		asm volatile("vxor.v $vr5, $vr5, $vr9");
+		asm volatile("vxor.v $vr6, $vr6, $vr10");
+		asm volatile("vxor.v $vr7, $vr7, $vr11");
+		/* vr0 - vr3: P */
+		asm volatile("vld $vr0, %0" : : "m" (p[0]));
+		asm volatile("vld $vr1, %0" : : "m" (p[16]));
+		asm volatile("vld $vr2, %0" : : "m" (p[32]));
+		asm volatile("vld $vr3, %0" : : "m" (p[48]));
+		/* vr0 - vr3: P + Pxy */
+		asm volatile("vld $vr8, %0" : : "m" (dp[0]));
+		asm volatile("vld $vr9, %0" : : "m" (dp[16]));
+		asm volatile("vld $vr10, %0" : : "m" (dp[32]));
+		asm volatile("vld $vr11, %0" : : "m" (dp[48]));
+		asm volatile("vxor.v $vr0, $vr0, $vr8");
+		asm volatile("vxor.v $vr1, $vr1, $vr9");
+		asm volatile("vxor.v $vr2, $vr2, $vr10");
+		asm volatile("vxor.v $vr3, $vr3, $vr11");
+
+		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
+		asm volatile("vsrli.b $vr8, $vr4, 4");
+		asm volatile("vsrli.b $vr9, $vr5, 4");
+		asm volatile("vsrli.b $vr10, $vr6, 4");
+		asm volatile("vsrli.b $vr11, $vr7, 4");
+		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
+		asm volatile("vandi.b $vr4, $vr4, 0x0f");
+		asm volatile("vandi.b $vr5, $vr5, 0x0f");
+		asm volatile("vandi.b $vr6, $vr6, 0x0f");
+		asm volatile("vandi.b $vr7, $vr7, 0x0f");
+		/* lookup from qmul[0] */
+		asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
+		asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
+		asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
+		asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
+		/* lookup from qmul[16] */
+		asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
+		asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
+		asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
+		asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
+		/* vr16 - vr19: B(Q + Qxy) */
+		asm volatile("vxor.v $vr16, $vr8, $vr4");
+		asm volatile("vxor.v $vr17, $vr9, $vr5");
+		asm volatile("vxor.v $vr18, $vr10, $vr6");
+		asm volatile("vxor.v $vr19, $vr11, $vr7");
+
+		/* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
+		asm volatile("vsrli.b $vr4, $vr0, 4");
+		asm volatile("vsrli.b $vr5, $vr1, 4");
+		asm volatile("vsrli.b $vr6, $vr2, 4");
+		asm volatile("vsrli.b $vr7, $vr3, 4");
+		/* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
+		asm volatile("vandi.b $vr12, $vr0, 0x0f");
+		asm volatile("vandi.b $vr13, $vr1, 0x0f");
+		asm volatile("vandi.b $vr14, $vr2, 0x0f");
+		asm volatile("vandi.b $vr15, $vr3, 0x0f");
+		/* lookup from pbmul[0] */
+		asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
+		asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
+		asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
+		asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
+		/* lookup from pbmul[16] */
+		asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
+		asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
+		asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
+		asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
+		/* vr4 - vr7: A(P + Pxy) */
+		asm volatile("vxor.v $vr4, $vr4, $vr12");
+		asm volatile("vxor.v $vr5, $vr5, $vr13");
+		asm volatile("vxor.v $vr6, $vr6, $vr14");
+		asm volatile("vxor.v $vr7, $vr7, $vr15");
+
+		/* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */
+		asm volatile("vxor.v $vr4, $vr4, $vr16");
+		asm volatile("vxor.v $vr5, $vr5, $vr17");
+		asm volatile("vxor.v $vr6, $vr6, $vr18");
+		asm volatile("vxor.v $vr7, $vr7, $vr19");
+		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
+		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
+		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
+		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
+
+		/* vr0 - vr3: P + Pxy + Dx = Dy */
+		asm volatile("vxor.v $vr0, $vr0, $vr4");
+		asm volatile("vxor.v $vr1, $vr1, $vr5");
+		asm volatile("vxor.v $vr2, $vr2, $vr6");
+		asm volatile("vxor.v $vr3, $vr3, $vr7");
+		asm volatile("vst $vr0, %0" : "=m" (dp[0]));
+		asm volatile("vst $vr1, %0" : "=m" (dp[16]));
+		asm volatile("vst $vr2, %0" : "=m" (dp[32]));
+		asm volatile("vst $vr3, %0" : "=m" (dp[48]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
+				  void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dq;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	/* vr22, vr23: qmul */
+	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
+	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
+
+	while (bytes) {
+		/* vr0 - vr3: P + Dx */
+		asm volatile("vld $vr0, %0" : : "m" (p[0]));
+		asm volatile("vld $vr1, %0" : : "m" (p[16]));
+		asm volatile("vld $vr2, %0" : : "m" (p[32]));
+		asm volatile("vld $vr3, %0" : : "m" (p[48]));
+		/* vr4 - vr7: Qx */
+		asm volatile("vld $vr4, %0" : : "m" (dq[0]));
+		asm volatile("vld $vr5, %0" : : "m" (dq[16]));
+		asm volatile("vld $vr6, %0" : : "m" (dq[32]));
+		asm volatile("vld $vr7, %0" : : "m" (dq[48]));
+		/* vr4 - vr7: Q + Qx */
+		asm volatile("vld $vr8, %0" : : "m" (q[0]));
+		asm volatile("vld $vr9, %0" : : "m" (q[16]));
+		asm volatile("vld $vr10, %0" : : "m" (q[32]));
+		asm volatile("vld $vr11, %0" : : "m" (q[48]));
+		asm volatile("vxor.v $vr4, $vr4, $vr8");
+		asm volatile("vxor.v $vr5, $vr5, $vr9");
+		asm volatile("vxor.v $vr6, $vr6, $vr10");
+		asm volatile("vxor.v $vr7, $vr7, $vr11");
+
+		/* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
+		asm volatile("vsrli.b $vr8, $vr4, 4");
+		asm volatile("vsrli.b $vr9, $vr5, 4");
+		asm volatile("vsrli.b $vr10, $vr6, 4");
+		asm volatile("vsrli.b $vr11, $vr7, 4");
+		/* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
+		asm volatile("vandi.b $vr4, $vr4, 0x0f");
+		asm volatile("vandi.b $vr5, $vr5, 0x0f");
+		asm volatile("vandi.b $vr6, $vr6, 0x0f");
+		asm volatile("vandi.b $vr7, $vr7, 0x0f");
+		/* lookup from qmul[0] */
+		asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
+		asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
+		asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
+		asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
+		/* lookup from qmul[16] */
+		asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
+		asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
+		asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
+		asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
+		/* vr4 - vr7: qmul(Q + Qx) = Dx */
+		asm volatile("vxor.v $vr4, $vr4, $vr8");
+		asm volatile("vxor.v $vr5, $vr5, $vr9");
+		asm volatile("vxor.v $vr6, $vr6, $vr10");
+		asm volatile("vxor.v $vr7, $vr7, $vr11");
+		asm volatile("vst $vr4, %0" : "=m" (dq[0]));
+		asm volatile("vst $vr5, %0" : "=m" (dq[16]));
+		asm volatile("vst $vr6, %0" : "=m" (dq[32]));
+		asm volatile("vst $vr7, %0" : "=m" (dq[48]));
+
+		/* vr0 - vr3: P + Dx + Dx = P */
+		asm volatile("vxor.v $vr0, $vr0, $vr4");
+		asm volatile("vxor.v $vr1, $vr1, $vr5");
+		asm volatile("vxor.v $vr2, $vr2, $vr6");
+		asm volatile("vxor.v $vr3, $vr3, $vr7");
+		asm volatile("vst $vr0, %0" : "=m" (p[0]));
+		asm volatile("vst $vr1, %0" : "=m" (p[16]));
+		asm volatile("vst $vr2, %0" : "=m" (p[32]));
+		asm volatile("vst $vr3, %0" : "=m" (p[48]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_lsx = {
+	.data2 = raid6_2data_recov_lsx,
+	.datap = raid6_datap_recov_lsx,
+	.valid = raid6_has_lsx,
+	.name = "lsx",
+	.priority = 1,
+};
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+static int raid6_has_lasx(void)
+{
+	return cpu_has_lasx;
+}
+
+static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
+				   int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dp;
+	ptrs[failb] = dq;
+	ptrs[disks - 2] = p;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	/*
+	 * xr20, xr21: qmul
+	 * xr22, xr23: pbmul
+	 */
+	asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
+	asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
+	asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
+	asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
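+	/* xvshuf.b works per 128-bit lane, so replicate the 16-byte tables into both lanes */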
+	asm volatile("xvreplve0.q $xr20, $xr20");
+	asm volatile("xvreplve0.q $xr21, $xr21");
+	asm volatile("xvreplve0.q $xr22, $xr22");
+	asm volatile("xvreplve0.q $xr23, $xr23");
+
+	while (bytes) {
+		/* xr0, xr1: Q */
+		asm volatile("xvld $xr0, %0" : : "m" (q[0]));
+		asm volatile("xvld $xr1, %0" : : "m" (q[32]));
+		/* xr0, xr1: Q + Qxy */
+		asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
+		asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
+		asm volatile("xvxor.v $xr0, $xr0, $xr4");
+		asm volatile("xvxor.v $xr1, $xr1, $xr5");
+		/* xr2, xr3: P */
+		asm volatile("xvld $xr2, %0" : : "m" (p[0]));
+		asm volatile("xvld $xr3, %0" : : "m" (p[32]));
+		/* xr2, xr3: P + Pxy */
+		asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
+		asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
+		asm volatile("xvxor.v $xr2, $xr2, $xr4");
+		asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+		/* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
+		asm volatile("xvsrli.b $xr4, $xr0, 4");
+		asm volatile("xvsrli.b $xr5, $xr1, 4");
+		/* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
+		asm volatile("xvandi.b $xr0, $xr0, 0x0f");
+		asm volatile("xvandi.b $xr1, $xr1, 0x0f");
+		/* lookup from qmul[0] */
+		asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
+		asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
+		/* lookup from qmul[16] */
+		asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
+		asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
+		/* xr6, xr7: B(Q + Qxy) */
+		asm volatile("xvxor.v $xr6, $xr4, $xr0");
+		asm volatile("xvxor.v $xr7, $xr5, $xr1");
+
+		/* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
+		asm volatile("xvsrli.b $xr4, $xr2, 4");
+		asm volatile("xvsrli.b $xr5, $xr3, 4");
+		/* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
+		asm volatile("xvandi.b $xr0, $xr2, 0x0f");
+		asm volatile("xvandi.b $xr1, $xr3, 0x0f");
+		/* lookup from pbmul[0] */
+		asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
+		asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
+		/* lookup from pbmul[16] */
+		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
+		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
+		/* xr0, xr1: A(P + Pxy) */
+		asm volatile("xvxor.v $xr0, $xr0, $xr4");
+		asm volatile("xvxor.v $xr1, $xr1, $xr5");
+
+		/* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
+		asm volatile("xvxor.v $xr0, $xr0, $xr6");
+		asm volatile("xvxor.v $xr1, $xr1, $xr7");
+
+		/* xr2, xr3: P + Pxy + Dx = Dy */
+		asm volatile("xvxor.v $xr2, $xr2, $xr0");
+		asm volatile("xvxor.v $xr3, $xr3, $xr1");
+
+		asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
+		asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
+		asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
+		asm volatile("xvst $xr3, %0" : "=m" (dp[32]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
+				   void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+
+	p = (u8 *)ptrs[disks - 2];
+	q = (u8 *)ptrs[disks - 1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila] = dq;
+	ptrs[disks - 1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	/* xr22, xr23: qmul */
+	asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
+	asm volatile("xvreplve0.q $xr22, $xr22");
+	asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
+	asm volatile("xvreplve0.q $xr23, $xr23");
+
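+	/*
+	 * Roughly the scalar form of raid6_datap_recov_intx1() from
+	 * lib/raid6/recov.c, applied 64 bytes at a time:
+	 *
+	 *	*dq  = qmul[*q ^ *dq];		(Dx)
+	 *	*p  ^= *dq;			(P)
+	 */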
+	while (bytes) {
+		/* xr0, xr1: P + Dx */
+		asm volatile("xvld $xr0, %0" : : "m" (p[0]));
+		asm volatile("xvld $xr1, %0" : : "m" (p[32]));
+		/* xr2, xr3: Qx */
+		asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
+		asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
+		/* xr2, xr3: Q + Qx */
+		asm volatile("xvld $xr4, %0" : : "m" (q[0]));
+		asm volatile("xvld $xr5, %0" : : "m" (q[32]));
+		asm volatile("xvxor.v $xr2, $xr2, $xr4");
+		asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+		/* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
+		asm volatile("xvsrli.b $xr4, $xr2, 4");
+		asm volatile("xvsrli.b $xr5, $xr3, 4");
+		/* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
+		asm volatile("xvandi.b $xr2, $xr2, 0x0f");
+		asm volatile("xvandi.b $xr3, $xr3, 0x0f");
+		/* lookup from qmul[0] */
+		asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
+		asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
+		/* lookup from qmul[16] */
+		asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
+		asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
+		/* xr2, xr3: qmul(Q + Qx) = Dx */
+		asm volatile("xvxor.v $xr2, $xr2, $xr4");
+		asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+		/* xr0, xr1: P + Dx + Dx = P */
+		asm volatile("xvxor.v $xr0, $xr0, $xr2");
+		asm volatile("xvxor.v $xr1, $xr1, $xr3");
+
+		asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
+		asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
+		asm volatile("xvst $xr0, %0" : "=m" (p[0]));
+		asm volatile("xvst $xr1, %0" : "=m" (p[32]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_lasx = {
+	.data2 = raid6_2data_recov_lasx,
+	.datap = raid6_datap_recov_lasx,
+	.valid = raid6_has_lasx,
+	.name = "lasx",
+	.priority = 2,
+};
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 1f693ea3b980cef7f229ba19c5a794d4e24377b8..2abe0076a636c3816678d13df2b0c7d036da154a 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -41,6 +41,16 @@ ifeq ($(findstring ppc,$(ARCH)),ppc)
                          gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
 endif
 
+ifeq ($(ARCH),loongarch64)
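+        # Probe the host assembler: CONFIG_CPU_HAS_LSX/LASX are only defined
+        # when it accepts a sample LSX/LASX instruction.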
+        CFLAGS += -I../../../arch/loongarch/include -DCONFIG_LOONGARCH=1
+        CFLAGS += $(shell echo 'vld $$vr0, $$zero, 0' |         \
+                    gcc -c -x assembler - >/dev/null 2>&1 &&    \
+                    rm ./-.o && echo -DCONFIG_CPU_HAS_LSX=1)
+        CFLAGS += $(shell echo 'xvld $$xr0, $$zero, 0' |        \
+                    gcc -c -x assembler - >/dev/null 2>&1 &&    \
+                    rm ./-.o && echo -DCONFIG_CPU_HAS_LASX=1)
+endif
+
 ifeq ($(IS_X86),yes)
         OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += -DCONFIG_X86
@@ -54,6 +64,8 @@ else ifeq ($(HAS_ALTIVEC),yes)
         CFLAGS += -DCONFIG_ALTIVEC
         OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
                 vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
+else ifeq ($(ARCH),loongarch64)
+        OBJS += loongarch_simd.o recov_loongarch_simd.o
 endif
 
 .c.o:
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index dcfec277e8394be8312058c3cab2dcdacd5b105b..89895f38f722423c02744f359799db4bb7f12dc2 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -139,6 +139,10 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
 	return 0;
 }
 
+void __weak __meminit pmd_init(void *addr)
+{
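+	/*
+	 * Weak stub: overridden by architectures (such as LoongArch) whose
+	 * empty page-table entries are not all-zero and therefore need a
+	 * newly allocated PMD page to be explicitly initialized.
+	 */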
+}
+
 static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
 				unsigned long end)
 {
@@ -166,8 +170,9 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
 				if (!p)
 					return -ENOMEM;
 			} else {
-				pud_populate(&init_mm, pud,
-					early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+				p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
+				pmd_init(p);
+				pud_populate(&init_mm, pud, p);
 			}
 		}
 		zero_pmd_populate(pud, addr, next);
@@ -176,6 +181,10 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
 	return 0;
 }
 
+void __weak __meminit pud_init(void *addr)
+{
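+	/* Weak stub: see pmd_init() above; likewise for a new PUD page. */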
+}
+
 static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
 				unsigned long end)
 {
@@ -207,8 +216,9 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
 				if (!p)
 					return -ENOMEM;
 			} else {
-				p4d_populate(&init_mm, p4d,
-					early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+				p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
+				pud_init(p);
+				p4d_populate(&init_mm, p4d, p);
 			}
 		}
 		zero_pud_populate(p4d, addr, next);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 2e973b36fe0725339c04eda6f10446b8b28244ca..f70e3d7a602e1e816abe7357793fcb16fdfa3f91 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -291,16 +291,22 @@ struct kasan_stack_ring {
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 
+#ifndef __HAVE_ARCH_SHADOW_MAP
 static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 {
 	return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
 		<< KASAN_SHADOW_SCALE_SHIFT);
 }
+#endif
 
 static __always_inline bool addr_has_metadata(const void *addr)
 {
+#ifdef __HAVE_ARCH_SHADOW_MAP
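+	/* The arch-defined shadow lookup returns NULL when there is no shadow. */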
+	return (kasan_mem_to_shadow((void *)addr) != NULL);
+#else
 	return (kasan_reset_tag(addr) >=
 		kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+#endif
 }
 
 /**
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 96fd0411f5c5866d69913f6e6e8a87285e830aad..3872528d096380b0c412897e7cc7b56b4cb59eb9 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -574,13 +574,14 @@ static void rcu_guarded_free(struct rcu_head *h)
  */
 static unsigned long kfence_init_pool(void)
 {
-	unsigned long addr = (unsigned long)__kfence_pool;
+	unsigned long addr;
 	struct page *pages;
 	int i;
 
 	if (!arch_kfence_init_pool())
-		return addr;
+		return (unsigned long)__kfence_pool;
 
+	addr = (unsigned long)__kfence_pool;
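+	/* __kfence_pool may have been re-assigned by arch_kfence_init_pool(). */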
 	pages = virt_to_page(__kfence_pool);
 
 	/*