diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index 3580d57ea21841285bc687d928f39268a325edc1..79e9bdbfc491a29521939aa2747862fc491c6d6d 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -124,7 +124,7 @@ KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
 endif
 
 ccflags-y := -fpic -mno-single-pic-base -fno-builtin -I$(obj)
-asflags-y := -Wa,-march=all -DZIMAGE
+asflags-y := -DZIMAGE
 
 # Supply kernel BSS size to the decompressor via a linker symbol.
 KBSS_SZ = $(shell $(CROSS_COMPILE)size $(obj)/../../../../vmlinux | \
diff --git a/arch/arm/boot/compressed/debug.S b/arch/arm/boot/compressed/debug.S
index 6e8382d5b7a4d31418a934565a473b823ffe5b6f..5392ee63338fac3453f30b125366e03241158133 100644
--- a/arch/arm/boot/compressed/debug.S
+++ b/arch/arm/boot/compressed/debug.S
@@ -1,6 +1,8 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
+#ifndef CONFIG_DEBUG_SEMIHOSTING
+
 #include CONFIG_DEBUG_LL_INCLUDE
 
 ENTRY(putc)
@@ -10,3 +12,29 @@ ENTRY(putc)
 	busyuart r3, r1
 	mov	 pc, lr
 ENDPROC(putc)
+
+#else
+
+ENTRY(putc)
+	adr	r1, 1f
+	ldmia	r1, {r2, r3}
+	add	r2, r2, r1
+	ldr	r1, [r2, r3]
+	strb	r0, [r1]
+	mov	r0, #0x03		@ SYS_WRITEC
+   ARM(	svc	#0x123456	)
+ THUMB(	svc	#0xab		)
+	mov	pc, lr
+	.align	2
+1:	.word	_GLOBAL_OFFSET_TABLE_ - .
+	.word	semi_writec_buf(GOT)
+ENDPROC(putc)
+
+	.bss
+	.global	semi_writec_buf
+	.type   semi_writec_buf, %object
+semi_writec_buf:
+	.space	4
+	.size	semi_writec_buf, 4
+
+#endif
diff --git a/arch/arm/boot/compressed/head-sa1100.S b/arch/arm/boot/compressed/head-sa1100.S
index 6179d94dd5c665a634e5e2913a916c96c9652c84..3115e313d9f65a31ad746b7d8d282741a5b1f765 100644
--- a/arch/arm/boot/compressed/head-sa1100.S
+++ b/arch/arm/boot/compressed/head-sa1100.S
@@ -11,6 +11,7 @@
 #include <asm/mach-types.h>
 
 		.section        ".start", "ax"
+		.arch	armv4
 
 __SA1100_start:
 
diff --git a/arch/arm/boot/compressed/head-shark.S b/arch/arm/boot/compressed/head-shark.S
index 089c560e07f13947ed68ab446edcc9de40c30604..92b56897ed64014037b48eb0e677bb633d646be1 100644
--- a/arch/arm/boot/compressed/head-shark.S
+++ b/arch/arm/boot/compressed/head-shark.S
@@ -18,6 +18,7 @@
 	
 		.section	".start", "ax"
 
+		.arch armv4
 		b	__beginning
 	
 __ofw_data:	.long	0				@ the number of memory blocks
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index fe4d9c3ad761c8dfaadce6e214d709417947f396..032a8d987148b6a24c97d7ec05467bef14b82ab0 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -11,6 +11,7 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
+	.arch	armv7-a
 /*
  * Debugging stuff
  *
@@ -805,8 +806,8 @@ call_cache_fn:	adr	r12, proc_types
 		.align	2
 		.type	proc_types,#object
 proc_types:
-		.word	0x00000000		@ old ARM ID
-		.word	0x0000f000
+		.word	0x41000000		@ old ARM ID
+		.word	0xff00f000
 		mov	pc, lr
  THUMB(		nop				)
 		mov	pc, lr
diff --git a/arch/arm/include/asm/percpu.h b/arch/arm/include/asm/percpu.h
index 968c0a14e0a36b12565a0bf05f2393b61ed0a27f..209e6504922e1c0b5a63733572055af723e37119 100644
--- a/arch/arm/include/asm/percpu.h
+++ b/arch/arm/include/asm/percpu.h
@@ -30,8 +30,15 @@ static inline void set_my_cpu_offset(unsigned long off)
 static inline unsigned long __my_cpu_offset(void)
 {
 	unsigned long off;
-	/* Read TPIDRPRW */
-	asm("mrc p15, 0, %0, c13, c0, 4" : "=r" (off) : : "memory");
+	register unsigned long *sp asm ("sp");
+
+	/*
+	 * Read TPIDRPRW.
+	 * We want to allow caching the value, so avoid using volatile and
+	 * instead use a fake stack read to hazard against barrier().
+	 */
+	asm("mrc p15, 0, %0, c13, c0, 4" : "=r" (off) : "Q" (*sp));
+
 	return off;
 }
 #define __my_cpu_offset __my_cpu_offset()
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index f10316b4ecdc7558b05644d3758532a26fa6de58..c5a59546a256c9b3720022265eb08fcde0dc4f68 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -13,6 +13,7 @@
 
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
+#include <linux/export.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
 #include <linux/node.h>
@@ -200,6 +201,7 @@ static inline void update_cpu_power(unsigned int cpuid, unsigned int mpidr) {}
  * cpu topology table
  */
 struct cputopo_arm cpu_topology[NR_CPUS];
+EXPORT_SYMBOL_GPL(cpu_topology);
 
 const struct cpumask *cpu_coregroup_mask(int cpu)
 {