===================================================================
@@ -652,14 +652,14 @@ (define_insn "sve_ld1r<mode>"
;; Load 128 bits from memory and duplicate to fill a vector. Since there
;; are so few operations on 128-bit "elements", we don't define a VNx1TI
;; and simply use vectors of bytes instead.
-(define_insn "sve_ld1rq"
- [(set (match_operand:VNx16QI 0 "register_operand" "=w")
- (unspec:VNx16QI
- [(match_operand:VNx16BI 1 "register_operand" "Upl")
+(define_insn "*sve_ld1rq<Vesize>"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (unspec:SVE_ALL
+ [(match_operand:<VPRED> 1 "register_operand" "Upl")
(match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")]
UNSPEC_LD1RQ))]
"TARGET_SVE"
- "ld1rqb\t%0.b, %1/z, %2"
+ "ld1rq<Vesize>\t%0.<Vetype>, %1/z, %2"
)
;; Implement a predicate broadcast by shifting the low bit of the scalar
===================================================================
@@ -2787,16 +2787,7 @@ aarch64_expand_sve_widened_duplicate (rt
return true;
}
- /* The bytes are loaded in little-endian order, so do a byteswap on
- big-endian targets. */
- if (BYTES_BIG_ENDIAN)
- {
- src = simplify_unary_operation (BSWAP, src_mode, src, src_mode);
- if (!src)
- return NULL_RTX;
- }
-
- /* Use LD1RQ to load the 128 bits from memory. */
+ /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
src = force_const_mem (src_mode, src);
if (!src)
return false;
@@ -2808,8 +2799,12 @@ aarch64_expand_sve_widened_duplicate (rt
src = replace_equiv_address (src, addr);
}
- rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
- emit_insn (gen_sve_ld1rq (gen_lowpart (VNx16QImode, dest), ptrue, src));
+ machine_mode mode = GET_MODE (dest);
+ unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
+ machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
+ rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
+ src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
+ emit_insn (gen_rtx_SET (dest, src));
return true;
}
===================================================================
@@ -32,7 +32,7 @@ TEST_ALL (VEC_PERM)
/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 } } */
/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */
/* { dg-final { scan-assembler-not {\tzip1\t} } } */
/* { dg-final { scan-assembler-not {\tzip2\t} } } */
===================================================================
@@ -36,7 +36,7 @@ TEST_ALL (VEC_PERM)
/* 1 for each 16-bit type and 4 for double. */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 } } */
/* 1 for each 32-bit type. */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
===================================================================
@@ -38,7 +38,7 @@ TEST_ALL (VEC_PERM)
/* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double. */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 } } */
/* 1 for each 16-bit type. */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */