diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c
index 335a6ae3d594044fa0ca99e44ee189631238b609..11c92dc53791bc2d6b62c727c09fe48cc032a7de 100644
--- a/arch/mips/kernel/cpu-probe.c
+++ b/arch/mips/kernel/cpu-probe.c
@@ -45,18 +45,7 @@ static void r39xx_wait(void)
 	local_irq_enable();
 }
 
-/*
- * There is a race when WAIT instruction executed with interrupt
- * enabled.
- * But it is implementation-dependent wheter the pipelie restarts when
- * a non-enabled interrupt is requested.
- */
-static void r4k_wait(void)
-{
-	__asm__("	.set	mips3			\n"
-		"	wait				\n"
-		"	.set	mips0			\n");
-}
+extern void r4k_wait(void);
 
 /*
  * This variant is preferable as it allows testing need_resched and going to
@@ -128,7 +117,7 @@ static int __init wait_disable(char *s)
 
 __setup("nowait", wait_disable);
 
-static inline void check_wait(void)
+void __init check_wait(void)
 {
 	struct cpuinfo_mips *c = &current_cpu_data;
 
@@ -242,7 +231,6 @@ static inline void check_errata(void)
 
 void __init check_bugs32(void)
 {
-	check_wait();
 	check_errata();
 }
 
diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S
index c6ada98ee042039a968e57d57f800ee1c8313ed6..f886dd7f708e1d13f833fce5db545b737f15e684 100644
--- a/arch/mips/kernel/genex.S
+++ b/arch/mips/kernel/genex.S
@@ -20,6 +20,7 @@
 #include <asm/stackframe.h>
 #include <asm/war.h>
 #include <asm/page.h>
+#include <asm/thread_info.h>
 
 #define PANIC_PIC(msg)					\
 		.set push;				\
@@ -126,7 +127,42 @@ handle_vcei:
 
 	__FINIT
 
+	.align	5	/* 32 byte rollback region */
+LEAF(r4k_wait)
+	.set	push
+	.set	noreorder
+	/* start of rollback region */
+	LONG_L	t0, TI_FLAGS($28)
+	nop
+	andi	t0, _TIF_NEED_RESCHED
+	bnez	t0, 1f
+	 nop
+	nop
+	nop
+	.set	mips3
+	wait
+	/* end of rollback region (the region size must be power of two) */
+	.set	pop
+1:
+	jr	ra
+	END(r4k_wait)
+
+	.macro	BUILD_ROLLBACK_PROLOGUE handler
+	FEXPORT(rollback_\handler)
+	.set	push
+	.set	noat
+	MFC0	k0, CP0_EPC
+	PTR_LA	k1, r4k_wait
+	ori	k0, 0x1f	/* 32 byte rollback region */
+	xori	k0, 0x1f
+	bne	k0, k1, 9f
+	MTC0	k0, CP0_EPC
+9:
+	.set pop
+	.endm
+
 	.align  5
+BUILD_ROLLBACK_PROLOGUE handle_int
 NESTED(handle_int, PT_SIZE, sp)
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/*
@@ -201,6 +237,7 @@ NESTED(except_vec_ejtag_debug, 0, sp)
  * This prototype is copied to ebase + n*IntCtl.VS and patched
  * to invoke the handler
  */
+BUILD_ROLLBACK_PROLOGUE except_vec_vi
 NESTED(except_vec_vi, 0, sp)
 	SAVE_SOME
 	SAVE_AT
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index b16facd9ea8ecae7b27116952090beca25450c78..ce7684335a415fb98de1781058d9db9f6d5271eb 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -148,6 +148,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
 	clear_tsk_thread_flag(p, TIF_USEDFPU);
 
 #ifdef CONFIG_MIPS_MT_FPAFF
+	clear_tsk_thread_flag(p, TIF_FPUBOUND);
+
 	/*
 	 * FPU affinity support is cleaner if we track the
 	 * user-visible CPU affinity from the very beginning.
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 6bee29097a565e71459f80ee75c3affd3d0ed25b..5fd0cd020af58e3f2fa5c3753be3098e710ba109 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -46,6 +46,9 @@
 #include <asm/types.h>
 #include <asm/stacktrace.h>
 
+extern void check_wait(void);
+extern asmlinkage void r4k_wait(void);
+extern asmlinkage void rollback_handle_int(void);
 extern asmlinkage void handle_int(void);
 extern asmlinkage void handle_tlbm(void);
 extern asmlinkage void handle_tlbl(void);
@@ -1251,6 +1254,9 @@ static void *set_vi_srs_handler(int n, vi_handler_t addr, int srs)
 
 		extern char except_vec_vi, except_vec_vi_lui;
 		extern char except_vec_vi_ori, except_vec_vi_end;
+		extern char rollback_except_vec_vi;
+		char *vec_start = (cpu_wait == r4k_wait) ?
+			&rollback_except_vec_vi : &except_vec_vi;
 #ifdef CONFIG_MIPS_MT_SMTC
 		/*
 		 * We need to provide the SMTC vectored interrupt handler
@@ -1258,11 +1264,11 @@ static void *set_vi_srs_handler(int n, vi_handler_t addr, int srs)
 		 * Status.IM bit to be masked before going there.
 		 */
 		extern char except_vec_vi_mori;
-		const int mori_offset = &except_vec_vi_mori - &except_vec_vi;
+		const int mori_offset = &except_vec_vi_mori - vec_start;
 #endif /* CONFIG_MIPS_MT_SMTC */
-		const int handler_len = &except_vec_vi_end - &except_vec_vi;
-		const int lui_offset = &except_vec_vi_lui - &except_vec_vi;
-		const int ori_offset = &except_vec_vi_ori - &except_vec_vi;
+		const int handler_len = &except_vec_vi_end - vec_start;
+		const int lui_offset = &except_vec_vi_lui - vec_start;
+		const int ori_offset = &except_vec_vi_ori - vec_start;
 
 		if (handler_len > VECTORSPACING) {
 			/*
@@ -1272,7 +1278,7 @@ static void *set_vi_srs_handler(int n, vi_handler_t addr, int srs)
 			panic("VECTORSPACING too small");
 		}
 
-		memcpy(b, &except_vec_vi, handler_len);
+		memcpy(b, vec_start, handler_len);
 #ifdef CONFIG_MIPS_MT_SMTC
 		BUG_ON(n > 7);	/* Vector index %d exceeds SMTC maximum. */
 
@@ -1554,6 +1560,10 @@ void __init trap_init(void)
 	extern char except_vec3_generic, except_vec3_r4000;
 	extern char except_vec4;
 	unsigned long i;
+	int rollback;
+
+	check_wait();
+	rollback = (cpu_wait == r4k_wait);
 
 #if defined(CONFIG_KGDB)
 	if (kgdb_early_setup)
@@ -1618,7 +1628,7 @@ void __init trap_init(void)
 	if (board_be_init)
 		board_be_init();
 
-	set_except_vector(0, handle_int);
+	set_except_vector(0, rollback ? rollback_handle_int : handle_int);
 	set_except_vector(1, handle_tlbm);
 	set_except_vector(2, handle_tlbl);
 	set_except_vector(3, handle_tlbs);
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index b5470ceb418b301847f385d72e306eff0b1bf6dc..afb119f35682680c2eb5eeddb20137aab7e64aa9 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -36,6 +36,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		*(.text.*)
 		*(.fixup)
 		*(.gnu.warning)
 	} :text = 0
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 8d7784122c143b7a36689d82dbe057928244a355..edac9892c51a19a5e8084ad26ca5584d54c78499 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -39,12 +39,14 @@
 #ifdef USE_DOUBLE
 
 #define LOAD   ld
+#define LOAD32 lwu
 #define ADD    daddu
 #define NBYTES 8
 
 #else
 
 #define LOAD   lw
+#define LOAD32 lw
 #define ADD    addu
 #define NBYTES 4
 
@@ -60,6 +62,14 @@
 	ADD	sum, v1;					\
 	.set	pop
 
+#define ADDC32(sum,reg)						\
+	.set	push;						\
+	.set	noat;						\
+	addu	sum, reg;					\
+	sltu	v1, sum, reg;					\
+	addu	sum, v1;					\
+	.set	pop
+
 #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
 	LOAD	_t0, (offset + UNIT(0))(src);			\
 	LOAD	_t1, (offset + UNIT(1))(src);			\
@@ -132,7 +142,7 @@ LEAF(csum_partial)
 	beqz	t8, .Lqword_align
 	 andi	t8, src, 0x8
 
-	lw	t0, 0x00(src)
+	LOAD32	t0, 0x00(src)
 	LONG_SUBU	a1, a1, 0x4
 	ADDC(sum, t0)
 	PTR_ADDU	src, src, 0x4
@@ -211,7 +221,7 @@ LEAF(csum_partial)
 	LONG_SRL	t8, t8, 0x2
 
 .Lend_words:
-	lw	t0, (src)
+	LOAD32	t0, (src)
 	LONG_SUBU	t8, t8, 0x1
 	ADDC(sum, t0)
 	.set	reorder				/* DADDI_WAR */
@@ -230,6 +240,9 @@ LEAF(csum_partial)
 	/* Still a full word to go  */
 	ulw	t1, (src)
 	PTR_ADDIU	src, 4
+#ifdef USE_DOUBLE
+	dsll	t1, t1, 32			/* clear lower 32bit */
+#endif
 	ADDC(sum, t1)
 
 1:	move	t1, zero
@@ -280,7 +293,7 @@ LEAF(csum_partial)
 1:
 	.set	reorder
 	/* Add the passed partial csum.  */
-	ADDC(sum, a2)
+	ADDC32(sum, a2)
 	jr	ra
 	.set	noreorder
 	END(csum_partial)
@@ -681,7 +694,7 @@ EXC(	sb	t0, NBYTES-2(dst), .Ls_exc)
 	.set	pop
 1:
 	.set reorder
-	ADDC(sum, psum)
+	ADDC32(sum, psum)
 	jr	ra
 	.set noreorder