Message ID | 20200815013145.539409-16-richard.henderson@linaro.org
---|---
State | Superseded |
Series | target/arm: SVE2 preparatory patches
On Sat, 15 Aug 2020 at 02:32, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Missed out on compressing the second half of a predicate
> with length vl % 512 > 256.
>
> Adjust all of the x + (y << s) to x | (y << s) as a
> general style fix.
>
> Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/arm/sve_helper.c | 30 +++++++++++++++++++++---------
>  1 file changed, 21 insertions(+), 9 deletions(-)
>
> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> index 4758d46f34..fcb46f150f 100644
> --- a/target/arm/sve_helper.c
> +++ b/target/arm/sve_helper.c
> @@ -1938,7 +1938,7 @@ void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
>      if (oprsz <= 8) {
>          l = compress_bits(n[0] >> odd, esz);
>          h = compress_bits(m[0] >> odd, esz);
> -        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
> +        d[0] = l | (h << (4 * oprsz));

Why did we drop the extract64() here? This doesn't seem
to correspond to either of the things the commit message
says we're doing.

Also, if oprsz is < 8, don't we need to mask out the high
bits in l that would otherwise overlap with h << (4 * oprsz)?
Are they guaranteed zeroes somehow?

>      } else {
>          ARMPredicateReg tmp_m;
>          intptr_t oprsz_16 = oprsz / 16;
> @@ -1952,23 +1952,35 @@ void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
>              h = n[2 * i + 1];
>              l = compress_bits(l >> odd, esz);
>              h = compress_bits(h >> odd, esz);
> -            d[i] = l + (h << 32);
> +            d[i] = l | (h << 32);
>          }
>
> -        /* For VL which is not a power of 2, the results from M do not
> -           align nicely with the uint64_t for D.  Put the aligned results
> -           from M into TMP_M and then copy it into place afterward.  */
> +        /*
> +         * For VL which is not a multiple of 512, the results from M do not
> +         * align nicely with the uint64_t for D.  Put the aligned results
> +         * from M into TMP_M and then copy it into place afterward.
> +         */
>          if (oprsz & 15) {
> -            d[i] = compress_bits(n[2 * i] >> odd, esz);
> +            int final_shift = (oprsz & 15) * 2;
> +
> +            l = n[2 * i + 0];
> +            h = n[2 * i + 1];
> +            l = compress_bits(l >> odd, esz);
> +            h = compress_bits(h >> odd, esz);
> +            d[i] = l | (h << final_shift);

Similarly here, why don't we need to mask out the top
parts of l and h?

>
>              for (i = 0; i < oprsz_16; i++) {
>                  l = m[2 * i + 0];
>                  h = m[2 * i + 1];
>                  l = compress_bits(l >> odd, esz);
>                  h = compress_bits(h >> odd, esz);
> -                tmp_m.p[i] = l + (h << 32);
> +                tmp_m.p[i] = l | (h << 32);
>              }
> -            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
> +            l = m[2 * i + 0];
> +            h = m[2 * i + 1];
> +            l = compress_bits(l >> odd, esz);
> +            h = compress_bits(h >> odd, esz);
> +            tmp_m.p[i] = l | (h << final_shift);
>
>              swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);

Aren't there cases where the 'n' part of the result doesn't
end up a whole number of bytes and we have to do a shift as
well as a byte copy?

>          } else {
> @@ -1977,7 +1989,7 @@ void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
>                  h = m[2 * i + 1];
>                  l = compress_bits(l >> odd, esz);
>                  h = compress_bits(h >> odd, esz);
> -                d[oprsz_16 + i] = l + (h << 32);
> +                d[oprsz_16 + i] = l | (h << 32);
>              }
>          }
>      }

thanks
-- PMM
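compress_bits() itself is not quoted anywhere in the thread, which makes the masking questions above harder to follow in isolation. For the esz == 0 case it gathers every other bit of its argument into the low half of the word and zeroes the high half. A minimal standalone sketch of that operation, assuming the classic bit-gather from Hacker's Delight, section 7-2; the name even_bits and the test values are illustrative, not QEMU's:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Gather bits 0, 2, 4, ... of x into the low 32 bits; the high 32
 * bits of the result are always zero. */
static uint64_t even_bits(uint64_t x)
{
    x &= 0x5555555555555555ull;                  /* keep even-numbered bits */
    x = (x | (x >> 1)) & 0x3333333333333333ull;  /* pack into pairs */
    x = (x | (x >> 2)) & 0x0f0f0f0f0f0f0f0full;  /* pack into nibbles */
    x = (x | (x >> 4)) & 0x00ff00ff00ff00ffull;  /* pack into bytes */
    x = (x | (x >> 8)) & 0x0000ffff0000ffffull;  /* pack into half-words */
    x = (x | (x >> 16)) & 0x00000000ffffffffull; /* pack into the low word */
    return x;
}

int main(void)
{
    /* 32 alternating input bits compact into 32 contiguous ones. */
    printf("%016" PRIx64 "\n", even_bits(0x5555555555555555ull)); /* ffffffff */
    /* Input confined to the low 32 bits compacts into 16 bits. */
    printf("%016" PRIx64 "\n", even_bits(0x00000000ffffffffull)); /* 0000ffff */
    return 0;
}

The second print shows the property the review question turns on: zero high bits in the input can only ever produce zero high bits in the output.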
On 8/25/20 6:43 AM, Peter Maydell wrote:
> On Sat, 15 Aug 2020 at 02:32, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> Missed out on compressing the second half of a predicate
>> with length vl % 512 > 256.
>>
>> Adjust all of the x + (y << s) to x | (y << s) as a
>> general style fix.
>>
>> Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>  target/arm/sve_helper.c | 30 +++++++++++++++++++++---------
>>  1 file changed, 21 insertions(+), 9 deletions(-)
>>
>> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
>> index 4758d46f34..fcb46f150f 100644
>> --- a/target/arm/sve_helper.c
>> +++ b/target/arm/sve_helper.c
>> @@ -1938,7 +1938,7 @@ void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
>>      if (oprsz <= 8) {
>>          l = compress_bits(n[0] >> odd, esz);
>>          h = compress_bits(m[0] >> odd, esz);
>> -        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
>> +        d[0] = l | (h << (4 * oprsz));
>
> Why did we drop the extract64() here? This doesn't seem
> to correspond to either of the things the commit message
> says we're doing.

Indeed, the commit message could use expansion.

> Also, if oprsz is < 8, don't we need to mask out the high
> bits in l that would otherwise overlap with h << (4 * oprsz)?
> Are they guaranteed zeroes somehow?

They are guaranteed zeros.  See aarch64_sve_narrow_vq.

>>              for (i = 0; i < oprsz_16; i++) {
>>                  l = m[2 * i + 0];
>>                  h = m[2 * i + 1];
>>                  l = compress_bits(l >> odd, esz);
>>                  h = compress_bits(h >> odd, esz);
>> -                tmp_m.p[i] = l + (h << 32);
>> +                tmp_m.p[i] = l | (h << 32);
>>              }
>> -            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
>> +            l = m[2 * i + 0];
>> +            h = m[2 * i + 1];
>> +            l = compress_bits(l >> odd, esz);
>> +            h = compress_bits(h >> odd, esz);
>> +            tmp_m.p[i] = l | (h << final_shift);
>>
>>              swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
>
> Aren't there cases where the 'n' part of the result doesn't
> end up a whole number of bytes and we have to do a shift as
> well as a byte copy?

No, oprsz will always be a multiple of 2 for predicates.
Just like oprsz will always be a multiple of 16 for sve vectors.

r~
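Richard's first answer can be checked concretely. If the stored predicate word is zero above bit 8 * oprsz — the invariant he attributes to aarch64_sve_narrow_vq — then its compressed form is zero at and above bit 4 * oprsz, so l | (h << (4 * oprsz)) has no overlapping bits and neither a mask nor the dropped extract64() is needed. A toy check under that assumption, reusing the even_bits() sketch from above (oprsz == 4 and the operand values are made up):

#include <assert.h>
#include <stdint.h>

/* Same illustrative bit-gather as sketched earlier. */
static uint64_t even_bits(uint64_t x)
{
    x &= 0x5555555555555555ull;
    x = (x | (x >> 1)) & 0x3333333333333333ull;
    x = (x | (x >> 2)) & 0x0f0f0f0f0f0f0f0full;
    x = (x | (x >> 4)) & 0x00ff00ff00ff00ffull;
    x = (x | (x >> 8)) & 0x0000ffff0000ffffull;
    x = (x | (x >> 16)) & 0x00000000ffffffffull;
    return x;
}

int main(void)
{
    int oprsz = 4;                        /* a 32-bit predicate */
    uint64_t n0 = 0x00000000deadbeefull;  /* bits >= 8 * oprsz already zero */
    uint64_t m0 = 0x00000000cafef00dull;

    uint64_t l = even_bits(n0);
    uint64_t h = even_bits(m0);

    /* The guaranteed-zero input bits keep l below the shift point. */
    assert(l < (1ull << (4 * oprsz)));
    uint64_t d0 = l | (h << (4 * oprsz)); /* the halves cannot collide */
    (void)d0;
    return 0;
}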
On Tue, 25 Aug 2020 at 15:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 8/25/20 6:43 AM, Peter Maydell wrote:
> > On Sat, 15 Aug 2020 at 02:32, Richard Henderson
> > <richard.henderson@linaro.org> wrote:
> >>
> >> Missed out on compressing the second half of a predicate
> >> with length vl % 512 > 256.
> >>
> >> Adjust all of the x + (y << s) to x | (y << s) as a
> >> general style fix.
> >>
> >> Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>
> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> >> ---
> >>  target/arm/sve_helper.c | 30 +++++++++++++++++++++---------
> >>  1 file changed, 21 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
> >> index 4758d46f34..fcb46f150f 100644
> >> --- a/target/arm/sve_helper.c
> >> +++ b/target/arm/sve_helper.c
> >> @@ -1938,7 +1938,7 @@ void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
> >>      if (oprsz <= 8) {
> >>          l = compress_bits(n[0] >> odd, esz);
> >>          h = compress_bits(m[0] >> odd, esz);
> >> -        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
> >> +        d[0] = l | (h << (4 * oprsz));
> >
> > Why did we drop the extract64() here? This doesn't seem
> > to correspond to either of the things the commit message
> > says we're doing.
>
> Indeed, the commit message could use expansion.
>
> > Also, if oprsz is < 8, don't we need to mask out the high
> > bits in l that would otherwise overlap with h << (4 * oprsz)?
> > Are they guaranteed zeroes somehow?
>
> They are guaranteed zeros.  See aarch64_sve_narrow_vq.
>
> >>              for (i = 0; i < oprsz_16; i++) {
> >>                  l = m[2 * i + 0];
> >>                  h = m[2 * i + 1];
> >>                  l = compress_bits(l >> odd, esz);
> >>                  h = compress_bits(h >> odd, esz);
> >> -                tmp_m.p[i] = l + (h << 32);
> >> +                tmp_m.p[i] = l | (h << 32);
> >>              }
> >> -            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
> >> +            l = m[2 * i + 0];
> >> +            h = m[2 * i + 1];
> >> +            l = compress_bits(l >> odd, esz);
> >> +            h = compress_bits(h >> odd, esz);
> >> +            tmp_m.p[i] = l | (h << final_shift);
> >>
> >>              swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
> >
> > Aren't there cases where the 'n' part of the result doesn't
> > end up a whole number of bytes and we have to do a shift as
> > well as a byte copy?
>
> No, oprsz will always be a multiple of 2 for predicates.

Ah, I see, so final_shift is a multiple of 4, and (if it's not
also a multiple of 8) the last byte of the 'n' part of the result
is then 4 bits from n[2 * i] and 4 bits from n[2 * i + 1] making
up a complete byte.

thanks
-- PMM
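Peter's closing remark is purely about how l | (h << final_shift) composes, assuming, as the thread concludes, that the significant bits of l stop exactly at final_shift. When the shift is a multiple of 4 but not of 8, one byte of the result is stitched together from a nibble of each source word. A made-up example with oprsz & 15 == 6:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int final_shift = 6 * 2;   /* (oprsz & 15) * 2: multiple of 4, not of 8 */
    uint64_t l = 0xabc;        /* 12 significant bits, ending at the shift */
    uint64_t h = 0x5;          /* bits contributed by the next word */
    uint64_t d = l | (h << final_shift);

    /* Byte 1 of d is 0x5a: low nibble from l, high nibble from h. */
    printf("d = 0x%" PRIx64 ", byte 1 = 0x%02x\n",
           d, (unsigned)((d >> 8) & 0xff));
    return 0;
}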
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 4758d46f34..fcb46f150f 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -1938,7 +1938,7 @@ void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
     if (oprsz <= 8) {
         l = compress_bits(n[0] >> odd, esz);
         h = compress_bits(m[0] >> odd, esz);
-        d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
+        d[0] = l | (h << (4 * oprsz));
     } else {
         ARMPredicateReg tmp_m;
         intptr_t oprsz_16 = oprsz / 16;
@@ -1952,23 +1952,35 @@ void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
             h = n[2 * i + 1];
             l = compress_bits(l >> odd, esz);
             h = compress_bits(h >> odd, esz);
-            d[i] = l + (h << 32);
+            d[i] = l | (h << 32);
         }

-        /* For VL which is not a power of 2, the results from M do not
-           align nicely with the uint64_t for D.  Put the aligned results
-           from M into TMP_M and then copy it into place afterward.  */
+        /*
+         * For VL which is not a multiple of 512, the results from M do not
+         * align nicely with the uint64_t for D.  Put the aligned results
+         * from M into TMP_M and then copy it into place afterward.
+         */
         if (oprsz & 15) {
-            d[i] = compress_bits(n[2 * i] >> odd, esz);
+            int final_shift = (oprsz & 15) * 2;
+
+            l = n[2 * i + 0];
+            h = n[2 * i + 1];
+            l = compress_bits(l >> odd, esz);
+            h = compress_bits(h >> odd, esz);
+            d[i] = l | (h << final_shift);

             for (i = 0; i < oprsz_16; i++) {
                 l = m[2 * i + 0];
                 h = m[2 * i + 1];
                 l = compress_bits(l >> odd, esz);
                 h = compress_bits(h >> odd, esz);
-                tmp_m.p[i] = l + (h << 32);
+                tmp_m.p[i] = l | (h << 32);
             }
-            tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
+            l = m[2 * i + 0];
+            h = m[2 * i + 1];
+            l = compress_bits(l >> odd, esz);
+            h = compress_bits(h >> odd, esz);
+            tmp_m.p[i] = l | (h << final_shift);

             swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
         } else {
@@ -1977,7 +1989,7 @@ void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
                 h = m[2 * i + 1];
                 l = compress_bits(l >> odd, esz);
                 h = compress_bits(h >> odd, esz);
-                d[oprsz_16 + i] = l + (h << 32);
+                d[oprsz_16 + i] = l | (h << 32);
             }
         }
     }
Missed out on compressing the second half of a predicate
with length vl % 512 > 256.

Adjust all of the x + (y << s) to x | (y << s) as a
general style fix.

Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/sve_helper.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

-- 
2.25.1