Index: generated/matmul_c10.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_c10 (gfc_array_c10 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_c10);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_c10.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c10 (gfc_array_c10 * const restrict retarray,
+ gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_10 * restrict abase;
const GFC_COMPLEX_10 * restrict bbase;
GFC_COMPLEX_10 * restrict dest;
Index: generated/matmul_c16.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_c16 (gfc_array_c16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_c16);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_c16.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c16 (gfc_array_c16 * const restrict retarray,
+ gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_16 * restrict abase;
const GFC_COMPLEX_16 * restrict bbase;
GFC_COMPLEX_16 * restrict dest;
Index: generated/matmul_c4.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_c4 (gfc_array_c4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_c4);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_c4.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c4 (gfc_array_c4 * const restrict retarray,
+ gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_4 * restrict abase;
const GFC_COMPLEX_4 * restrict bbase;
GFC_COMPLEX_4 * restrict dest;
Index: generated/matmul_c8.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_c8 (gfc_array_c8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_c8);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_c8.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_c8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_c8 (gfc_array_c8 * const restrict retarray,
+ gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_COMPLEX_8 * restrict abase;
const GFC_COMPLEX_8 * restrict bbase;
GFC_COMPLEX_8 * restrict dest;
Index: generated/matmul_i1.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_i1 (gfc_array_i1 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i1);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_i1.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_i1 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i1 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i1 (gfc_array_i1 * const restrict retarray,
+ gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_1 * restrict abase;
const GFC_INTEGER_1 * restrict bbase;
GFC_INTEGER_1 * restrict dest;
Index: generated/matmul_i16.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_i16 (gfc_array_i16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_i16);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_i16.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_i16 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i16 (gfc_array_i16 * const restrict retarray,
+ gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_16 * restrict abase;
const GFC_INTEGER_16 * restrict bbase;
GFC_INTEGER_16 * restrict dest;
Index: generated/matmul_i2.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_i2 (gfc_array_i2 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i2);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_i2.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_i2 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i2 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i2 (gfc_array_i2 * const restrict retarray,
+ gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_2 * restrict abase;
const GFC_INTEGER_2 * restrict bbase;
GFC_INTEGER_2 * restrict dest;
Index: generated/matmul_i4.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_i4 (gfc_array_i4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i4);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_i4.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_i4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i4 (gfc_array_i4 * const restrict retarray,
+ gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_4 * restrict abase;
const GFC_INTEGER_4 * restrict bbase;
GFC_INTEGER_4 * restrict dest;
Index: generated/matmul_i8.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_i8 (gfc_array_i8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_i8);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_i8.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_i8 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_i8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_i8 (gfc_array_i8 * const restrict retarray,
+ gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_INTEGER_8 * restrict abase;
const GFC_INTEGER_8 * restrict bbase;
GFC_INTEGER_8 * restrict dest;
Index: generated/matmul_r10.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_r10 (gfc_array_r10 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_r10);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_r10.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r10 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r10 (gfc_array_r10 * const restrict retarray,
+ gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_10 * restrict abase;
const GFC_REAL_10 * restrict bbase;
GFC_REAL_10 * restrict dest;
Index: generated/matmul_r16.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_r16 (gfc_array_r16 * const rest
int blas_limit, blas_call gemm);
export_proto(matmul_r16);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_r16.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r16 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r16 (gfc_array_r16 * const restrict retarray,
+ gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_16 * restrict abase;
const GFC_REAL_16 * restrict bbase;
GFC_REAL_16 * restrict dest;
Index: generated/matmul_r4.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_r4 (gfc_array_r4 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_r4);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_r4.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r4 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r4 (gfc_array_r4 * const restrict retarray,
+ gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_4 * restrict abase;
const GFC_REAL_4 * restrict bbase;
GFC_REAL_4 * restrict dest;
Index: generated/matmul_r8.c
===================================================================
@@ -75,11 +75,38 @@ extern void matmul_r8 (gfc_array_r8 * const restri
int blas_limit, blas_call gemm);
export_proto(matmul_r8);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_r8.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_r8 (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_r8 (gfc_array_r8 * const restrict retarray,
+ gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const GFC_REAL_8 * restrict abase;
const GFC_REAL_8 * restrict bbase;
GFC_REAL_8 * restrict dest;
Index: m4/matmul.m4
===================================================================
@@ -76,11 +76,38 @@ extern void matmul_'rtype_code` ('rtype` * const r
int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);
+#ifdef __x86_64__
+
+/* For x86_64, we switch to AVX if that is available. For this, we
+ let the actual work be done by the static function aux_matmul_'rtype_code`.
+ The user-callable function will then automatically contain the
+ selection code for the right architecture. This is done to avoid
+ knowledge of architecture details in the front end. */
+
+static void aux_matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+ __attribute__ ((target_clones("avx,default")));
+
void
matmul_'rtype_code` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
+ aux_matmul_'rtype_code` (retarray, a, b, try_blas, blas_limit, gemm);
+}
+
+static void
+aux_matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#else
+void
+matmul_'rtype_code` ('rtype` * const restrict retarray,
+ 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+ int blas_limit, blas_call gemm)
+#endif
+{
const 'rtype_name` * restrict abase;
const 'rtype_name` * restrict bbase;
'rtype_name` * restrict dest;
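
Commentary, not part of the patch: the dispatch pattern the hunks above
repeat for each type can be seen in isolation in the following minimal
sketch (the names work and aux_work are placeholders, not from the patch).
A public wrapper delegates to a static worker declared with
target_clones("avx,default"); GCC compiles the worker once per listed
target and emits an ifunc resolver that picks the matching clone at load
time, so the user-callable entry point needs no architecture knowledge.

/* Illustrative sketch only; work/aux_work are placeholder names.  */
#include <stdio.h>

static void aux_work (const double * restrict x, double * restrict y, int n)
  __attribute__ ((target_clones("avx,default")));

static void
aux_work (const double * restrict x, double * restrict y, int n)
{
  /* This loop is compiled twice; the AVX clone may be vectorized
     with 256-bit registers.  */
  for (int i = 0; i < n; i++)
    y[i] = 2.0 * x[i];
}

/* User-callable wrapper, analogous to matmul_r4 above.  */
void
work (const double * restrict x, double * restrict y, int n)
{
  aux_work (x, y, n);
}

int
main (void)
{
  double x[4] = { 1.0, 2.0, 3.0, 4.0 }, y[4];

  work (x, y, 4);
  printf ("%g %g %g %g\n", y[0], y[1], y[2], y[3]);

  /* Nonzero if this machine has AVX, i.e. if the resolver will have
     selected the AVX clone.  */
  printf ("AVX available: %d\n", __builtin_cpu_supports ("avx"));
  return 0;
}

Building needs no special flags (gcc -O2 sketch.c on x86_64 with ifunc
support, e.g. glibc); on hardware without AVX the default clone runs,
which is why no -mavx option is required and the binary stays portable.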