diff mbox

Adding support for hardware crc to ARM aarch64

Message ID 1403687030.3355.19.camel@localhost.localdomain
State New
Headers show

Commit Message

Edward Nevill June 25, 2014, 9:03 a.m. UTC
Hi,

I would like to add support for hardware crc for ARM's new 64 bit architecture, aarch64.

I would be grateful if some committer could help me through the process of getting this change pushed into the trunk.

I have prepared an initial patch below.

The patch is completely conditionalized on __aarch64__

For the moment I have only done the non-pipelined version, as the hardware I have has only one CRC execution unit.

Some initial benchmarks on terasort give

sw crc: 107 sec
hw crc: 103 sec

The performance improvement is quite small, but this is limited by the fact that I am using early stage hw which is not performant.

I have also built it on x86, and I think the change is fairly safe for other architectures because, once the conditionals are resolved, the source is identical on those architectures.

Thanks for your help,
Ed.


--- CUT HERE ---
Index: hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c

Comments

Vinod Kumar Vavilapalli June 26, 2014, 3:16 a.m. UTC | #1
Edward,

Thanks for your contribution. Please file a JIRA ticket at http://issues.apache.org/jira/browse/HADOOP and upload your patch there. Other reviewers/committers can take it from there and provide feedback. For more instructions, you can see http://wiki.apache.org/hadoop/HowToContribute.

HTH
+Vinod

On Jun 25, 2014, at 2:03 AM, Edward Nevill <edward.nevill@linaro.org> wrote:

> Hi,
> 
> I would like to add support for hardware crc for ARM's new 64 bit architecture, aarch64.
> 
> I would be grateful if some committer could help me though the process of getting this change pushed into the trunk.
> 
> I have prepared an initial patch below.
> 
> The patch is completely conditionalized on __arch64__
> 
> For the moment I have only done the non pipelined version as the hw I have only has 1 crc execute unit.
> 
> Some initial benchmarks on terasort give
> 
> sw crc: 107 sec
> hw crc: 103 sec
> 
> The performance improvement is quite small, but this is limited by the fact that I am using early stage hw which is not performant.
> 
> I have also built it on x86 and I think the change is fairly safe for other architectures because post conditionalization the src is identical on other architectures.
> 
> Thanks for you help,
> Ed.
> 
> 
> --- CUT HERE ---
> Index: hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c
> ===================================================================
> --- hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c	(revision 1605031)
> +++ hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c	(working copy)
> @@ -38,7 +38,7 @@
> #include "bulk_crc32.h"
> #include "gcc_optimizations.h"
> 
> -#if (!defined(__FreeBSD__) && !defined(WINDOWS))
> +#if (!defined(__FreeBSD__) && !defined(WINDOWS)) && !defined(__aarch64__)
> #define USE_PIPELINED
> #endif
> 
> @@ -672,8 +672,61 @@
> 
> # endif // 64-bit vs 32-bit
> 
> -#else // end x86 architecture
> +#elif defined(__aarch64__) // end x86 architecture
> 
> +#include <sys/auxv.h>
> +#include <asm/hwcap.h>
> +
> +#ifndef HWCAP_CRC32
> +#define HWCAP_CRC32 (1<<7)
> +#endif
> +
> +/**
> + * On library load, determine what sort of crc we are going to do
> + * and set cached_cpu_supports_crc32 appropriately.
> + */
> +void __attribute__ ((constructor)) init_cpu_support_flag(void) {
> +  unsigned long auxv = getauxval(AT_HWCAP);
> +  cached_cpu_supports_crc32 = auxv & HWCAP_CRC32;
> +}
> +
> +#define CRC32X(crc,value) asm("crc32cx %w[c], %w[c], %x[v]" : [c]"+r"(crc) : [v]"r"(value))
> +#define CRC32W(crc,value) asm("crc32cw %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value))
> +#define CRC32H(crc,value) asm("crc32ch %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value))
> +#define CRC32B(crc,value) asm("crc32cb %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value))
> +
> +/**
> + * Hardware-accelerated CRC32C calculation using the 64-bit instructions.
> + */
> +static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) {
> +  int64_t len = length;
> +  asm(".cpu generic+crc");	// Allow crc instructions in asm
> +  if ((len -= sizeof(uint64_t)) >= 0) {
> +    do {
> +      CRC32X(crc, *(uint64_t*)p_buf);
> +      p_buf += sizeof(uint64_t);
> +    } while ((len -= sizeof(uint64_t)) >= 0);
> +  }
> +
> +  // The following is more efficient than the straight loop
> +  if (len & sizeof(uint32_t)) {
> +      CRC32W(crc, *(uint32_t*)p_buf);
> +      p_buf += sizeof(uint32_t);
> +  }
> +  if (len & sizeof(uint16_t)) {
> +      CRC32H(crc, *(uint16_t*)p_buf);
> +      p_buf += sizeof(uint16_t);
> +  }
> +  if (len & sizeof(uint8_t)) {
> +      CRC32B(crc, *p_buf);
> +      p_buf++;
> +  }
> +
> +  return crc;
> +}
> +
> +#else
> +
> static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length) {
>   // never called!
>   assert(0 && "hardware crc called on an unsupported platform");
> --- CUT HERE ---
> 
>
diff mbox

Patch

===================================================================
--- hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c	(revision 1605031)
+++ hadoop-common-project/hadoop-common/src/main/native/src/org/apache/hadoop/util/bulk_crc32.c	(working copy)
@@ -38,7 +38,7 @@ 
 #include "bulk_crc32.h"
 #include "gcc_optimizations.h"
 
-#if (!defined(__FreeBSD__) && !defined(WINDOWS))
+#if (!defined(__FreeBSD__) && !defined(WINDOWS)) && !defined(__aarch64__)
 #define USE_PIPELINED
 #endif
 
@@ -672,8 +672,61 @@ 
 
 # endif // 64-bit vs 32-bit
 
-#else // end x86 architecture
+#elif defined(__aarch64__) // end x86 architecture
 
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1<<7)
+#endif
+
+/**
+ * On library load, determine what sort of crc we are going to do
+ * and set cached_cpu_supports_crc32 appropriately.
+ */
+void __attribute__ ((constructor)) init_cpu_support_flag(void) {
+  unsigned long auxv = getauxval(AT_HWCAP);
+  cached_cpu_supports_crc32 = auxv & HWCAP_CRC32;
+}
+
+#define CRC32X(crc,value) asm("crc32cx %w[c], %w[c], %x[v]" : [c]"+r"(crc) : [v]"r"(value))
+#define CRC32W(crc,value) asm("crc32cw %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value))
+#define CRC32H(crc,value) asm("crc32ch %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value))
+#define CRC32B(crc,value) asm("crc32cb %w[c], %w[c], %w[v]" : [c]"+r"(crc) : [v]"r"(value))
+
+/**
+ * Hardware-accelerated CRC32C calculation using the 64-bit instructions.
+ */
+static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* p_buf, size_t length) {
+  int64_t len = length;
+  asm(".cpu generic+crc");	// Allow crc instructions in asm
+  if ((len -= sizeof(uint64_t)) >= 0) {
+    do {
+      CRC32X(crc, *(uint64_t*)p_buf);
+      p_buf += sizeof(uint64_t);
+    } while ((len -= sizeof(uint64_t)) >= 0);
+  }
+
+  // The following is more efficient than the straight loop
+  if (len & sizeof(uint32_t)) {
+      CRC32W(crc, *(uint32_t*)p_buf);
+      p_buf += sizeof(uint32_t);
+  }
+  if (len & sizeof(uint16_t)) {
+      CRC32H(crc, *(uint16_t*)p_buf);
+      p_buf += sizeof(uint16_t);
+  }
+  if (len & sizeof(uint8_t)) {
+      CRC32B(crc, *p_buf);
+      p_buf++;
+  }
+
+  return crc;
+}
+
+#else
+
 static uint32_t crc32c_hardware(uint32_t crc, const uint8_t* data, size_t length) {
   // never called!
   assert(0 && "hardware crc called on an unsupported platform");
--- CUT HERE ---