commonlib: Add assembly optimization for ipchksum() on arm64

This patch adds a bit of optimized assembly code to the ipchksum() algorithm for arm64 targets in order to take advantage of larger load sizes and the add-with-carry instruction. This improves execution speed on a Cortex-A75 by more than 20x.

Change-Id: I9c7bbc9d7a1cd083ced62fe9222592243a796077
Signed-off-by: Julius Werner <jwerner@chromium.org>
Reviewed-on: https://review.coreboot.org/c/coreboot/+/80254
Tested-by: build bot (Jenkins) <no-reply@coreboot.org>
Reviewed-by: Arthur Heymans <arthur@aheymans.xyz>
Reviewed-by: Yidi Lin <yidilin@google.com>
2024-01-30 17:26:52 -08:00 · 2024-01-30 17:26:52 -08:00 · 89fae18bf4
parent 177aee2c1f
commit 89fae18bf4
1 changed files with 25 additions and 0 deletions
--- a/src/commonlib/bsd/ipchksum.c
+++ b/src/commonlib/bsd/ipchksum.c
@@ -11,6 +11,31 @@ uint16_t ipchksum(const void *data, size_t size)
 	uint32_t sum = 0;
 	size_t i = 0;

+#if defined(__aarch64__)
+	size_t size16 = size / 16;
+	const uint64_t *p8 = data;
+	if (size16) {
+		unsigned long tmp1, tmp2;
+		i = size16 * 16;
+		asm (
+			"adds	xzr, xzr, xzr\n\t"	/* init carry flag for addition */
+			"1:\n\t"
+			"ldp	%[v1], %[v2], [%[p8]], #16\n\t"
+			"adcs	%[wsum], %[wsum], %[v1]\n\t"
+			"adcs	%[wsum], %[wsum], %[v2]\n\t"
+			"sub	%[size16], %[size16], #1\n\t"
+			"cbnz	%[size16], 1b\n\t"
+			"adcs	%[wsum], %[wsum], xzr\n\t"	/* use up last carry */
+		: [v1] "=r" (tmp1),
+		  [v2] "=r" (tmp2),
+		  [wsum] "+r" (wide_sum),
+		  [p8] "+r" (p8),
+		  [size16] "+r" (size16)
+		:: "cc"
+		);
+	}
+#endif
+
 	while (wide_sum) {
 		sum += wide_sum & 0xFFFF;
 		wide_sum >>= 16;