diff --git a/src/cpu/intel/haswell/haswell.h b/src/cpu/intel/haswell/haswell.h
index 26807e9f3a..a510e7de58 100644
--- a/src/cpu/intel/haswell/haswell.h
+++ b/src/cpu/intel/haswell/haswell.h
@@ -144,6 +144,7 @@ int cpu_config_tdp_levels(void);
 /* Returns 0 on success, < 0 on failure. */
 int smm_initialize(void);
 void smm_initiate_relocation(void);
+void smm_initiate_relocation_parallel(void);
 struct bus;
 void bsp_init_and_start_aps(struct bus *cpu_bus);
 /* Returns 0 on succes. < 0 on failure. */
@@ -151,7 +152,7 @@ int setup_ap_init(struct bus *cpu_bus, int *max_cpus,
 	const void *microcode_patch);
 /* Returns 0 on success, < 0 on failure. */
 int start_aps(struct bus *cpu_bus, int max_cpus);
-void release_aps_for_smm_relocation(void);
+void release_aps_for_smm_relocation(int do_parallel_relocation);
 #endif
 
 #endif
diff --git a/src/cpu/intel/haswell/haswell_init.c b/src/cpu/intel/haswell/haswell_init.c
index 82430b750d..c7f89ee646 100644
--- a/src/cpu/intel/haswell/haswell_init.c
+++ b/src/cpu/intel/haswell/haswell_init.c
@@ -549,9 +549,6 @@ void bsp_init_and_start_aps(struct bus *cpu_bus)
 		return;
 	}
 
-	/* Release APs to perform SMM relocation. */
-	release_aps_for_smm_relocation();
-
 	/* After SMM relocation a 2nd microcode load is required. */
 	intel_microcode_load_unlocked(microcode_patch);
 }
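The release call removed above does not disappear: it moves into smm_initialize() (last hunk of this patch), which lets the BSP relocate first and only then decides whether the APs may relocate in parallel. A condensed, illustrative sketch of that ordering, using the prototypes from haswell.h; the save_state_in_msrs parameter stands in for the smm_reloc_params.smm_save_state_in_msrs flag introduced in smmrelocate.c and is not itself part of the patch:

/* Prototypes as declared in haswell.h above. */
void smm_initiate_relocation(void);
void smm_initiate_relocation_parallel(void);
void release_aps_for_smm_relocation(int do_parallel_relocation);

/* Sketch only: mirrors the ordering implemented in smm_initialize(). */
static void smm_relocation_flow_sketch(int save_state_in_msrs)
{
	/* BSP relocates itself first through the serial, spin-locked path. */
	smm_initiate_relocation();

	if (save_state_in_msrs) {
		/* MSR-based save states are available: release the APs in
		 * parallel mode, then run the BSP's second pass. */
		release_aps_for_smm_relocation(1);
		smm_initiate_relocation_parallel();
	} else {
		/* Otherwise the APs still relocate one at a time. */
		release_aps_for_smm_relocation(0);
	}
}

The timeout-bounded wait added to release_aps_for_smm_relocation() in the next file is what keeps this sequence bounded even if an AP never reports back.
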
diff --git a/src/cpu/intel/haswell/mp_init.c b/src/cpu/intel/haswell/mp_init.c
index 7f15c391d7..c8bd5c22be 100644
--- a/src/cpu/intel/haswell/mp_init.c
+++ b/src/cpu/intel/haswell/mp_init.c
@@ -75,9 +75,16 @@ static device_t cpu_devs[CONFIG_MAX_CPUS];
 
 /* Number of APs checked that have checked in. */
 static atomic_t num_aps;
+/* Number of APs that have relocated their SMM handler. */
+static atomic_t num_aps_relocated_smm;
 /* Barrier to stop APs from performing SMM relcoation. */
 static int smm_relocation_barrier_begin __attribute__ ((aligned (64)));
 
+static inline void mfence(void)
+{
+	__asm__ __volatile__("mfence\t\n": : :"memory");
+}
+
 static inline void wait_for_barrier(volatile int *barrier)
 {
 	while (*barrier == 0) {
@@ -95,13 +102,18 @@ static void ap_wait_for_smm_relocation_begin(void)
 	wait_for_barrier(&smm_relocation_barrier_begin);
 }
 
+/* This function pointer is used by the non-BSP CPUs to initiate relocation. It
+ * points to either a serial or parallel SMM initiation. */
+static void (*ap_initiate_smm_relocation)(void) = &smm_initiate_relocation;
+
 /* Returns 1 if timeout waiting for APs. 0 if target aps found. */
-static int wait_for_aps(int target, int total_delay, int delay_step)
+static int wait_for_aps(atomic_t *val, int target, int total_delay,
+			int delay_step)
 {
 	int timeout = 0;
 	int delayed = 0;
 
-	while (atomic_read(&num_aps) != target) {
+	while (atomic_read(val) != target) {
 		udelay(delay_step);
 		delayed += delay_step;
 		if (delayed >= total_delay) {
@@ -113,9 +125,19 @@ static int wait_for_aps(int target, int total_delay, int delay_step)
 	return timeout;
 }
 
-void release_aps_for_smm_relocation(void)
+void release_aps_for_smm_relocation(int do_parallel)
 {
+	/* Change the AP SMM initiation function, and ensure it is visible
+	 * before releasing the APs. */
+	if (do_parallel) {
+		ap_initiate_smm_relocation = &smm_initiate_relocation_parallel;
+		mfence();
+	}
 	release_barrier(&smm_relocation_barrier_begin);
+	/* Wait for CPUs to relocate their SMM handler up to 100ms. */
+	if (wait_for_aps(&num_aps_relocated_smm, atomic_read(&num_aps),
+			 100000 /* 100 ms */, 200 /* us */))
+		printk(BIOS_DEBUG, "Timed out waiting for AP SMM relocation\n");
 }
 
 /* The mtrr code sets up ROM caching on the BSP, but not the others. However,
@@ -172,7 +194,10 @@ ap_init(unsigned int cpu, void *microcode_ptr)
 
 	ap_wait_for_smm_relocation_begin();
 
-	smm_initiate_relocation();
+	ap_initiate_smm_relocation();
+
+	/* Indicate that SMM relocation has occured on this thread. */
+	atomic_inc(&num_aps_relocated_smm);
 
 	/* After SMM relocation a 2nd microcode load is required. */
 	intel_microcode_load_unlocked(microcode_ptr);
@@ -483,7 +508,7 @@ int start_aps(struct bus *cpu_bus, int ap_count)
 			printk(BIOS_DEBUG, "done.\n");
 	}
 	/* Wait for CPUs to check in up to 200 us. */
-	wait_for_aps(ap_count, 200 /* us */, 15 /* us */);
+	wait_for_aps(&num_aps, ap_count, 200 /* us */, 15 /* us */);
 
 	/* Send 2nd SIPI */
 	if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) {
@@ -507,7 +532,7 @@
 	}
 
 	/* Wait for CPUs to check in. */
-	if (wait_for_aps(ap_count, 10000 /* 10 ms */, 50 /* us */)) {
+	if (wait_for_aps(&num_aps, ap_count, 10000 /* 10 ms */, 50 /* us */)) {
 		printk(BIOS_DEBUG, "Not all APs checked in: %d/%d.\n",
 		       atomic_read(&num_aps), ap_count);
 		return -1;
@@ -516,17 +541,12 @@ int start_aps(struct bus *cpu_bus, int ap_count)
 	return 0;
 }
 
-DECLARE_SPIN_LOCK(smm_relocation_lock);
-
-void smm_initiate_relocation(void)
+void smm_initiate_relocation_parallel(void)
 {
-	spin_lock(&smm_relocation_lock);
-
 	if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) {
 		printk(BIOS_DEBUG, "Waiting for ICR not to be busy...");
 		if (apic_wait_timeout(1000 /* 1 ms */, 50)) {
 			printk(BIOS_DEBUG, "timed out. Aborting.\n");
-			spin_unlock(&smm_relocation_lock);
 			return;
 		} else
 			printk(BIOS_DEBUG, "done.\n");
@@ -539,6 +559,14 @@
 	} else
 		printk(BIOS_DEBUG, "Relocation complete.\n");
 
+}
+
+DECLARE_SPIN_LOCK(smm_relocation_lock);
+
+void smm_initiate_relocation(void)
+{
+	spin_lock(&smm_relocation_lock);
+	smm_initiate_relocation_parallel();
 	spin_unlock(&smm_relocation_lock);
 }
 
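The mfence() helper above exists to order the store to ap_initiate_smm_relocation ahead of the barrier release, so an AP woken by release_barrier() can never call through a stale pointer. A minimal, self-contained sketch of that publish-then-release pattern; all names here (relocate_fn, release_flag, bsp_release, ap_side) are illustrative stand-ins, not code from the patch:

static void serial_relocate(void) { /* e.g. smm_initiate_relocation() */ }
static void parallel_relocate(void) { /* e.g. smm_initiate_relocation_parallel() */ }

static void (*relocate_fn)(void) = serial_relocate;
static volatile int release_flag;

static inline void fence(void)
{
	__asm__ __volatile__("mfence" : : : "memory");
}

/* BSP side: publish the chosen initiation routine, then release the APs. */
static void bsp_release(int parallel)
{
	if (parallel) {
		relocate_fn = parallel_relocate;
		fence();	/* pointer store must be visible before the release */
	}
	release_flag = 1;
}

/* AP side: spin until released, then call through the published pointer. */
static void ap_side(void)
{
	while (release_flag == 0)
		__asm__ __volatile__("pause");
	relocate_fn();
}

The patch applies the same idea with release_barrier()/wait_for_barrier(), and additionally has the BSP poll num_aps_relocated_smm so it knows when every AP has finished its handler move.
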
diff --git a/src/cpu/intel/haswell/smmrelocate.c b/src/cpu/intel/haswell/smmrelocate.c
index 2bf304ebbf..2a322a7f9a 100644
--- a/src/cpu/intel/haswell/smmrelocate.c
+++ b/src/cpu/intel/haswell/smmrelocate.c
@@ -36,6 +36,14 @@
 #define EMRRphysMask_MSR 0x1f5
 #define UNCORE_EMRRphysBase_MSR 0x2f4
 #define UNCORE_EMRRphysMask_MSR 0x2f5
+#define SMM_MCA_CAP_MSR 0x17d
+#define SMM_CPU_SVRSTR_BIT 57
+#define SMM_CPU_SVRSTR_MASK (1 << (SMM_CPU_SVRSTR_BIT - 32))
+#define SMM_FEATURE_CONTROL_MSR 0x4e0
+#define SMM_CPU_SAVE_EN (1 << 1)
+/* SMM save state MSRs */
+#define SMBASE_MSR 0xc20
+#define IEDBASE_MSR 0xc22
 
 #define SMRR_SUPPORTED (1<<11)
 #define EMRR_SUPPORTED (1<<12)
@@ -51,6 +59,10 @@
 	msr_t emrr_mask;
 	msr_t uncore_emrr_base;
 	msr_t uncore_emrr_mask;
+	/* The smm_save_state_in_msrs field indicates if SMM save state
+	 * locations live in MSRs. This indicates to the CPUs how to adjust
+	 * the SMMBASE and IEDBASE */
+	int smm_save_state_in_msrs;
 };
 
 /* This gets filled in and used during relocation. */
@@ -82,13 +94,79 @@ static inline void write_uncore_emrr(struct smm_relocation_params *relo_params)
 	wrmsr(UNCORE_EMRRphysMask_MSR, relo_params->uncore_emrr_mask);
 }
 
+static void update_save_state(int cpu,
+			      struct smm_relocation_params *relo_params,
+			      const struct smm_runtime *runtime)
+{
+	u32 smbase;
+	u32 iedbase;
+
+	/* The relocated handler runs with all CPUs concurrently. Therefore
+	 * stagger the entry points adjusting SMBASE downwards by save state
+	 * size * CPU num. */
+	smbase = relo_params->smram_base - cpu * runtime->save_state_size;
+	iedbase = relo_params->ied_base;
+
+	printk(BIOS_DEBUG, "New SMBASE=0x%08x IEDBASE=0x%08x\n",
+	       smbase, iedbase);
+
+	/* All threads need to set IEDBASE and SMBASE to the relocated
+	 * handler region. However, the save state location depends on the
+	 * smm_save_state_in_msrs field in the relocation parameters. If
+	 * smm_save_state_in_msrs is non-zero then the CPUs are relocating
+	 * the SMM handler in parallel, and each CPUs save state area is
+	 * located in their respective MSR space. If smm_save_state_in_msrs
+	 * is zero then the SMM relocation is happening serially so the
+	 * save state is at the same default location for all CPUs. */
+	if (relo_params->smm_save_state_in_msrs) {
+		msr_t smbase_msr;
+		msr_t iedbase_msr;
+
+		smbase_msr.lo = smbase;
+		smbase_msr.hi = 0;
+
+		/* According the BWG the IEDBASE MSR is in bits 63:32. It's
+		 * not clear why it differs from the SMBASE MSR. */
+		iedbase_msr.lo = 0;
+		iedbase_msr.hi = iedbase;
+
+		wrmsr(SMBASE_MSR, smbase_msr);
+		wrmsr(IEDBASE_MSR, iedbase_msr);
+	} else {
+		em64t101_smm_state_save_area_t *save_state;
+
+		save_state = (void *)(runtime->smbase + SMM_DEFAULT_SIZE -
+				      runtime->save_state_size);
+
+		save_state->smbase = smbase;
+		save_state->iedbase = iedbase;
+	}
+}
+
+/* Returns 1 if SMM MSR save state was set. */
+static int bsp_setup_msr_save_state(struct smm_relocation_params *relo_params)
+{
+	msr_t smm_mca_cap;
+
+	smm_mca_cap = rdmsr(SMM_MCA_CAP_MSR);
+	if (smm_mca_cap.hi & SMM_CPU_SVRSTR_MASK) {
+		msr_t smm_feature_control;
+
+		smm_feature_control = rdmsr(SMM_FEATURE_CONTROL_MSR);
+		smm_feature_control.hi = 0;
+		smm_feature_control.lo |= SMM_CPU_SAVE_EN;
+		wrmsr(SMM_FEATURE_CONTROL_MSR, smm_feature_control);
+		relo_params->smm_save_state_in_msrs = 1;
+	}
+	return relo_params->smm_save_state_in_msrs;
+}
+
 /* The relocation work is actually performed in SMM context, but the code
  * resides in the ramstage module. This occurs by trampolining from the default
  * SMRAM entry point to here. */
 static void __attribute__((cdecl))
 cpu_smm_do_relocation(void *arg, int cpu, const struct smm_runtime *runtime)
 {
-	em64t101_smm_state_save_area_t *save_state;
 	msr_t mtrr_cap;
 	struct smm_relocation_params *relo_params = arg;
 
@@ -100,21 +178,32 @@ cpu_smm_do_relocation(void *arg, int cpu, const struct smm_runtime *runtime)
 
 	printk(BIOS_DEBUG, "In relocation handler: cpu %d\n", cpu);
 
-	/* All threads need to set IEDBASE and SMBASE in the save state area.
-	 * Since one thread runs at a time during the relocation the save state
-	 * is the same for all cpus. */
-	save_state = (void *)(runtime->smbase + SMM_DEFAULT_SIZE -
-			      runtime->save_state_size);
+	/* Determine if the processor supports saving state in MSRs. If so,
+	 * enable it before the non-BSPs run so that SMM relocation can occur
+	 * in parallel in the non-BSP CPUs. */
+	if (cpu == 0) {
+		/* If smm_save_state_in_msrs is 1 then that means this is the
+		 * 2nd time through the relocation handler for the BSP.
+		 * Parallel SMM handler relocation is taking place. However,
+		 * it is desired to access other CPUs save state in the real
+		 * SMM handler. Therefore, disable the SMM save state in MSRs
+		 * feature. */
+		if (relo_params->smm_save_state_in_msrs) {
+			msr_t smm_feature_control;
 
-	/* The relocated handler runs with all CPUs concurrently. Therefore
-	 * stagger the entry points adjusting SMBASE downwards by save state
-	 * size * CPU num. */
-	save_state->smbase = relo_params->smram_base -
-			     cpu * runtime->save_state_size;
-	save_state->iedbase = relo_params->ied_base;
+			smm_feature_control = rdmsr(SMM_FEATURE_CONTROL_MSR);
+			smm_feature_control.lo &= ~SMM_CPU_SAVE_EN;
+			wrmsr(SMM_FEATURE_CONTROL_MSR, smm_feature_control);
+		} else if (bsp_setup_msr_save_state(relo_params))
+			/* Just return from relocation handler if MSR save
+			 * state is enabled. In that case the BSP will come
+			 * back into the relocation handler to setup the new
+			 * SMBASE as well disabling SMM save state in MSRs. */
+			return;
+	}
 
-	printk(BIOS_DEBUG, "New SMBASE=0x%08x IEDBASE=0x%08x @ %p\n",
-	       save_state->smbase, save_state->iedbase, save_state);
+	/* Make appropriate changes to the save state map. */
+	update_save_state(cpu, relo_params, runtime);
 
 	/* Write EMRR and SMRR MSRs based on indicated support. */
 	mtrr_cap = rdmsr(MTRRcap_MSR);
@@ -128,8 +217,6 @@ cpu_smm_do_relocation(void *arg, int cpu, const struct smm_runtime *runtime)
 		if (cpu == 0)
 			write_uncore_emrr(relo_params);
 	}
-
-	southbridge_clear_smi_status();
 }
 
 static u32 northbridge_get_base_reg(device_t dev, int reg)
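update_save_state() above packs the two bases asymmetrically (SMBASE in bits 31:0 of MSR 0xc20, IEDBASE in bits 63:32 of MSR 0xc22, per the comment citing the BWG) and staggers SMBASE downward by one save state size per CPU so concurrent SMM entries get disjoint save state areas. A small, runnable illustration of that arithmetic; the base addresses and save state size are made-up example values, not values from the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t smram_base = 0x80000000u;	/* example value only */
	const uint32_t ied_base = 0x80400000u;		/* example value only */
	const uint32_t save_state_size = 0x400u;	/* example value only */

	for (int cpu = 0; cpu < 4; cpu++) {
		/* Each CPU's SMBASE drops by save_state_size, as in
		 * update_save_state() above. */
		uint32_t smbase = smram_base - cpu * save_state_size;
		/* SMBASE_MSR (0xc20): value in bits 31:0.
		 * IEDBASE_MSR (0xc22): value in bits 63:32. */
		uint64_t smbase_msr = (uint64_t)smbase;
		uint64_t iedbase_msr = (uint64_t)ied_base << 32;

		printf("cpu %d: SMBASE=0x%08x SMBASE_MSR=0x%016llx IEDBASE_MSR=0x%016llx\n",
		       cpu, smbase,
		       (unsigned long long)smbase_msr,
		       (unsigned long long)iedbase_msr);
	}
	return 0;
}
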
@@ -199,10 +286,12 @@ static void fill_in_relocation_params(device_t dev,
 static int install_relocation_handler(int num_cpus,
 				      struct smm_relocation_params *relo_params)
 {
-	/* The default SMM entry happens serially at the default location.
-	 * Therefore, there is only 1 concurrent save state area. Set the
-	 * stack size to the save state size, and call into the
-	 * do_relocation handler. */
+	/* The default SMM entry can happen in parallel or serially. If the
+	 * default SMM entry is done in parallel the BSP has already setup
+	 * the saving state to each CPU's MSRs. At least one save state size
+	 * is required for the initial SMM entry for the BSP to determine if
+	 * parallel SMM relocation is even feasible. Set the stack size to
+	 * the save state size, and call into the do_relocation handler. */
 	int save_state_size = sizeof(em64t101_smm_state_save_area_t);
 	struct smm_loader_params smm_params = {
 		.per_cpu_stack_size = save_state_size,
@@ -309,6 +398,17 @@ int smm_initialize(void)
 
 	/* Run the relocation handler. */
 	smm_initiate_relocation();
 
+	/* If smm_save_state_in_msrs is non-zero then parallel SMM relocation
+	 * shall take place. Run the relocation handler a second time to do
+	 * the final move. */
+	if (smm_reloc_params.smm_save_state_in_msrs) {
+		printk(BIOS_DEBUG, "Doing parallel SMM relocation.\n");
+		release_aps_for_smm_relocation(1);
+		smm_initiate_relocation_parallel();
+	} else {
+		release_aps_for_smm_relocation(0);
+	}
+
 	/* Lock down the SMRAM space. */
 	smm_lock();
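The parallel path in smm_initialize() above is only taken when the BSP's first pass through the relocation handler found and enabled MSR-based save states. That probe is the SMM_MCA_CAP_MSR / SMM_FEATURE_CONTROL_MSR handshake defined earlier in smmrelocate.c: bit 57 of MSR 0x17d advertises the feature, and SMM_CPU_SAVE_EN (bit 1) of MSR 0x4e0 switches it on. A compact sketch of that probe; rdmsr64() and wrmsr64() are hypothetical 64-bit MSR accessors used only here, not coreboot functions:

#include <stdint.h>

/* Hypothetical 64-bit MSR accessors, used only for this sketch. */
uint64_t rdmsr64(uint32_t msr);
void wrmsr64(uint32_t msr, uint64_t value);

#define SMM_MCA_CAP_MSR		0x17d
#define SMM_CPU_SVRSTR_BIT	57
#define SMM_FEATURE_CONTROL_MSR	0x4e0
#define SMM_CPU_SAVE_EN		(1u << 1)

/* Returns 1 if MSR-based SMM save states were advertised and enabled. */
static int enable_msr_save_state(void)
{
	if (!(rdmsr64(SMM_MCA_CAP_MSR) & (1ULL << SMM_CPU_SVRSTR_BIT)))
		return 0;

	/* Mirror the patch: keep only the low word and set SMM_CPU_SAVE_EN. */
	uint64_t ctl = rdmsr64(SMM_FEATURE_CONTROL_MSR);
	wrmsr64(SMM_FEATURE_CONTROL_MSR, (uint32_t)ctl | SMM_CPU_SAVE_EN);
	return 1;
}

On the BSP's second pass through the relocation handler the patch clears SMM_CPU_SAVE_EN again, so the permanent SMM handler can keep reaching every CPU's save state through the in-memory map.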