diff mbox

RFR: Add support for String.indexOf

Message ID 1408379268.20323.13.camel@localhost.localdomain
State New
Headers show

Commit Message

Edward Nevill Aug. 18, 2014, 4:27 p.m. UTC
Hi,

The following patch adds support for the String.indexOf intrinsic.

It uses a combination of 2 algorithms, a simplified Boyer Moore, and a straightforward linear scan.

Boyer Moore is used when the pattern length is >= 8 and the source length is >= 4 * pattern length. This heuristic seems to produce the best results from benchmarking.

Essentially Boyer Moore generates the best results when the pattern length is short relative to the source but is sufficiently long to make the skip span worthwhile. As the pattern length grows towards the source length the initialisation of the pattern string tends to dominate.

The code also contains special case code for 1, 2, 3, and 4 chars in the pattern.

Benchmarking shows 1.96 X improvement on my synthetic benchmark which searches for patterns of length { 1, 2, 3, 4, 8, 12, 16, 24, 32, 64 } in a source string of length { 65, 129, 964 }. None of the patterns match in the benchmark but the patterns are contrived so that the initial substring of the pattern does match at several places in the source string.

On the hotspot indexof test it show 1.89 X improvement.

Tests as usual with jtreg/hotspot.

OK to push?
Ed.

--- CUT HERE ---
# HG changeset patch
# User Edward Nevill edward.nevill@linaro.org
# Date 1408378229 -3600
#      Mon Aug 18 17:10:29 2014 +0100
# Node ID 978b74cb5c7b233477d5ce9346c038fbccd80869
# Parent  2dfe9abe27fe6c9231e64ebd66e7ca3e6fb5b2bf
Add support for String.indexOf intrinsic

Comments

Andrew Haley Aug. 19, 2014, 9:18 a.m. UTC | #1
Hi,

On 18/08/14 17:27, Edward Nevill wrote:
> 
> The following patch adds support for the String.indexOf intrinsic.
> 
> It uses a combination of 2 algorithms, a simplified Boyer Moore, and
> a straightforward linear scan.

In what way is this algorithm simplified from standard Boyer-Moore?
How can we have confidence it is correct?

> Boyer Moore is used when the pattern length is >= 8 and the source
> length is >= 4 * pattern length. This heuristic seems to produce the
> best results from benchmarking.
> 
> Essentially Boyer Moore generates the best results when the pattern
> length is short relative to the source but is sufficiently long to
> make the skip span worthwhile. As the pattern length grows towards
> the source length the initialisation of the pattern string tends to
> dominate.
> 
> The code also contains special case code for 1, 2, 3, and 4 chars in
> the pattern.
> 
> Benchmarking shows 1.96 X improvement on my synthetic benchmark
> which searches for patterns of length { 1, 2, 3, 4, 8, 12, 16, 24,
> 32, 64 } in a source string of length { 65, 129, 964 }. None of the
> patterns match in the benchmark but the patterns are contrived so
> that the initial substring of the pattern does match at several
> places in the source string.
> 
> On the hotspot indexof test it show 1.89 X improvement.
> 
> Tests as usual with jtreg/hotspot.
> 
> OK to push?

Some comments inline.

I'm a bit concerned that we always inline this code.  AFAICS if
icnt1 == -1 we might as well branch to a stub.

Andrew.


> --- CUT HERE ---
> # HG changeset patch
> # User Edward Nevill edward.nevill@linaro.org
> # Date 1408378229 -3600
> #      Mon Aug 18 17:10:29 2014 +0100
> # Node ID 978b74cb5c7b233477d5ce9346c038fbccd80869
> # Parent  2dfe9abe27fe6c9231e64ebd66e7ca3e6fb5b2bf
> Add support for String.indexOf intrinsic
> 
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/aarch64.ad
> --- a/src/cpu/aarch64/vm/aarch64.ad	Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/aarch64.ad	Mon Aug 18 17:10:29 2014 +0100
> @@ -3415,6 +3415,16 @@
>    interface(CONST_INTER);
>  %}
>  
> +operand immI_le_4()
> +%{
> +  predicate(n->get_int() <= 4);
> +  match(ConI);
> +
> +  op_cost(0);
> +  format %{ %}
> +  interface(CONST_INTER);
> +%}
> +
>  operand immI_31()
>  %{
>    predicate(n->get_int() == 31);
> @@ -11733,6 +11743,44 @@
>    ins_pipe(pipe_class_memory);
>  %}
>  
> +instruct string_indexof(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
> +       iRegI_R0 result, iRegI tmp1, iRegI tmp2, iRegI tmp3, iRegI tmp4, rFlagsReg cr)
> +%{
> +  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
> +  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
> +         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
> +  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result" %}
> +
> +  ins_encode %{
> +    __ string_indexof($str1$$Register, $str2$$Register,
> +                      $cnt1$$Register, $cnt2$$Register,
> +                      $tmp1$$Register, $tmp2$$Register,
> +                      $tmp3$$Register, $tmp4$$Register,
> +                      -1, $result$$Register);
> +  %}
> +  ins_pipe(pipe_class_memory);
> +%}
> +
> +instruct string_indexof_con(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
> +                 immI_le_4 int_cnt2, iRegI_R0 result, iRegI tmp1, iRegI tmp2,
> +                 iRegI tmp3, iRegI tmp4, rFlagsReg cr)
> +%{
> +  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
> +  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1,
> +         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
> +  format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result" %}
> +
> +  ins_encode %{
> +    int icnt2 = (int)$int_cnt2$$constant;
> +    __ string_indexof($str1$$Register, $str2$$Register,
> +                      $cnt1$$Register, zr,
> +                      $tmp1$$Register, $tmp2$$Register,
> +                      $tmp3$$Register, $tmp4$$Register,
> +                      icnt2, $result$$Register);
> +  %}
> +  ins_pipe(pipe_class_memory);
> +%}
> +
>  instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
>                          iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
>  %{
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/vm_version_aarch64.cpp
> --- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Aug 18 17:10:29 2014 +0100
> @@ -105,6 +105,7 @@
>    FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
>    FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
>    FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
> +  FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
>  
>  #ifndef BUILTIN_SIM
>    unsigned long auxv = getauxval(AT_HWCAP);
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
> --- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Aug 18 17:10:29 2014 +0100
> @@ -1,3 +1,4 @@
> +/*
>  /*
>   * Copyright (c) 2013, Red Hat Inc.
>   * Copyright (c) 1997, 2012, Oracle and/or its affiliates.
> @@ -3386,6 +3387,338 @@
>    }
>  }
>  
> +// Search for str1 in str2 and return index or -1
> +void MacroAssembler::string_indexof(Register str2, Register str1,
> +                                    Register cnt2, Register cnt1,
> +                                    Register tmp1, Register tmp2,
> +                                    Register tmp3, Register tmp4,
> +                                    int icnt1, Register result) {
> +  Label BM, LS, DONE, NOMATCH, MATCH;
> +
> +  Register ch1 = rscratch1;
> +  Register ch2 = rscratch2;
> +  Register cnt1tmp = tmp1;
> +  Register cnt2tmp = tmp2;
> +  Register cnt1_neg = cnt1;
> +  Register cnt2_neg = cnt2;
> +  Register result_tmp = tmp4;
> +
> +  // Note, inline_string_indexOf() generates checks:
> +  // if (substr.count > string.count) return -1;
> +  // if (substr.count == 0) return 0;
> +
> +// We have two strings, a source string in str2, cnt2 and a pattern string
> +// in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
> +
> +// For larger pattern and source we use a simplified Boyer Moore algorithm.
> +// With a small pattern and source we use linear scan.
> +
> +  if (icnt1 == -1) {
> +    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
> +    ccmp(cnt1, 8, 0b0000, CC);  // Can't handle skip >= 256 because we use

Can this CC be LO, please.  This is really important because the ARM
carry flag is inverted compared with e.g. Intel.

> +    br(CC, LS);                 // a byte array.
> +    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
> +    br(CS, LS);

No, you may not give a label the same name as one of the condition
codes.  This is an unexploded bomb.

> +  }
> +
> +// Larger pattern and source use the following Boyer Moore alogorithm.
> +//
> +// #define ASIZE 128
> +//
> +//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
> +//       int i, j;
> +//       unsigned c;
> +//       unsigned char bc[ASIZE];
> +//    
> +//       /* Preprocessing */
> +//       for (i = 0; i < ASIZE; ++i)
> +//          bc[i] = 0;
> +//       for (i = 0; i < m - 1; ) {
> +//          c = x[i];
> +//          ++i;
> +//          if (c < ASIZE) bc[c] = i;
> +//       }
> +//    
> +//       /* Searching */
> +//       j = 0;
> +//       while (j <= n - m) {
> +//          c = y[i+j];
> +//          if (x[m-1] == c)
> +//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
> +//          if (i < 0) return j;
> +//          if (c < ASIZE)
> +//            j = j - bc[y[j+m-1]] + m;
> +//          else
> +//            j += 1; // Advance by 1 only if char >= ASIZE
> +//       }
> +//    }
> +
> +  if (icnt1 == -1) {
> +    BIND(BM);
> +
> +    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
> +    Label BMADV, BMMATCH, BMCHECKEND;
> +
> +    Register cnt1end = tmp2;
> +    Register str2end = cnt2;
> +    Register skipch = tmp2;
> +
> +    // Restrict ASIZE to 128 to reduce stack space/initialisation.
> +    // The presence of chars >= ASIZE in the target string does not affect
> +    // performance, but we must be careful not to initialise them in the stack
> +    // array.
> +    // The presence of chars >= ASIZE in the source string may adversely affect
> +    // performance since we can only advance by one when we encounter one.
> +
> +      stp(zr, zr, pre(sp, -128));
> +      stp(zr, zr, Address(sp, 1*16));
> +      stp(zr, zr, Address(sp, 2*16));
> +      stp(zr, zr, Address(sp, 3*16));
> +      stp(zr, zr, Address(sp, 4*16));
> +      stp(zr, zr, Address(sp, 5*16));
> +      stp(zr, zr, Address(sp, 6*16));
> +      stp(zr, zr, Address(sp, 7*16));

Why is this not a loop?

> +
> +      mov(cnt1tmp, 0);
> +      sub(cnt1end, cnt1, 1);
> +    BIND(BCLOOP);
> +      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> +      cmp(ch1, 128);
> +      add(cnt1tmp, cnt1tmp, 1);
> +      br(HS, BCSKIP);
> +      strb(cnt1tmp, Address(sp, ch1));
> +    BIND(BCSKIP);
> +      cmp(cnt1tmp, cnt1end);
> +      br(LT, BCLOOP);
> +
> +      mov(result_tmp, str2);
> +
> +      sub(cnt2, cnt2, cnt1);
> +      add(str2end, str2, cnt2, LSL, 1);
> +    BIND(BMLOOPSTR2);
> +      sub(cnt1tmp, cnt1, 1);
> +      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> +      ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
> +      cmp(ch1, skipch);
> +      br(NE, BMSKIP);
> +      subs(cnt1tmp, cnt1tmp, 1);
> +      br(LT, BMMATCH);
> +    BIND(BMLOOPSTR1);
> +      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
> +      ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
> +      cmp(ch1, ch2);
> +      br(NE, BMSKIP);
> +      subs(cnt1tmp, cnt1tmp, 1);
> +      br(GE, BMLOOPSTR1);
> +    BIND(BMMATCH);
> +      sub(result_tmp, str2, result_tmp);
> +      lsr(result, result_tmp, 1);
> +      add(sp, sp, 128);
> +      b(DONE);
> +    BIND(BMADV);
> +      add(str2, str2, 2);
> +      b(BMCHECKEND);
> +    BIND(BMSKIP);
> +      cmp(skipch, 128);
> +      br(HS, BMADV);
> +      ldrb(ch2, Address(sp, skipch));
> +      add(str2, str2, cnt1, LSL, 1);
> +      sub(str2, str2, ch2, LSL, 1);
> +    BIND(BMCHECKEND);
> +      cmp(str2, str2end);
> +      br(LE, BMLOOPSTR2);
> +      add(sp, sp, 128);
> +      b(NOMATCH);
> +  }
> +
> +  BIND(LS);
> +  {
> +    Label DO1, DO2, DO3;
> +
> +    Register str2tmp = tmp2;
> +    Register first = tmp3;
> +
> +    if (icnt1 == -1)
> +    {
> +        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
> +
> +        cmp(cnt1, 4);
> +        br(LT, DOSHORT);
> +
> +        sub(cnt2, cnt2, cnt1);
> +        sub(cnt1, cnt1, 4);
> +        mov(result_tmp, cnt2);
> +
> +        lea(str1, Address(str1, cnt1, Address::uxtw(1)));
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt1_neg, zr, cnt1, LSL, 1);
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +        ldr(first, Address(str1, cnt1_neg));
> +
> +      BIND(FIRST_LOOP);
> +        ldr(ch2, Address(str2, cnt2_neg));
> +        cmp(first, ch2);
> +        br(EQ, STR1_LOOP);
> +      BIND(STR2_NEXT);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LE, FIRST_LOOP);
> +        b(NOMATCH);
> +
> +      BIND(STR1_LOOP);
> +        adds(cnt1tmp, cnt1_neg, 8);
> +        add(cnt2tmp, cnt2_neg, 8);
> +        br(GE, LAST_WORD);
> +
> +      BIND(STR1_NEXT);
> +        ldr(ch1, Address(str1, cnt1tmp));
> +        ldr(ch2, Address(str2, cnt2tmp));
> +        cmp(ch1, ch2);
> +        br(NE, STR2_NEXT);
> +        adds(cnt1tmp, cnt1tmp, 8);
> +        add(cnt2tmp, cnt2tmp, 8);
> +        br(LT, STR1_NEXT);
> +
> +      BIND(LAST_WORD);
> +        ldr(ch1, Address(str1));
> +        sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
> +        ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
> +        cmp(ch1, ch2);
> +        br(NE, STR2_NEXT);
> +        b(MATCH);
> +
> +      BIND(DOSHORT);
> +        cmp(cnt1, 2);
> +        br(LT, DO1);
> +        br(GT, DO3);
> +    }
> +
> +    if (icnt1 == 4) {
> +      Label CH1_LOOP;
> +
> +        ldr(ch1, str1);
> +        sub(cnt2, cnt2, 4);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> +      BIND(CH1_LOOP);
> +        ldr(ch2, Address(str2, cnt2_neg));
> +        cmp(ch1, ch2);
> +        br(EQ, MATCH);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LE, CH1_LOOP);
> +        b(NOMATCH);
> +    }
> +
> +    if (icnt1 == -1 || icnt1 == 2) {
> +      Label CH1_LOOP;
> +
> +      BIND(DO2);
> +        ldrw(ch1, str1);
> +        sub(cnt2, cnt2, 2);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> +      BIND(CH1_LOOP);
> +        ldrw(ch2, Address(str2, cnt2_neg));
> +        cmp(ch1, ch2);
> +        br(EQ, MATCH);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LE, CH1_LOOP);
> +        b(NOMATCH);
> +    }
> +
> +    if (icnt1 == -1 || icnt1 == 3) {
> +      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
> +
> +      BIND(DO3);
> +        ldrw(first, str1);
> +        ldrh(ch1, Address(str1, 4));
> +
> +        sub(cnt2, cnt2, 3);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> +      BIND(FIRST_LOOP);
> +        ldrw(ch2, Address(str2, cnt2_neg));
> +        cmpw(first, ch2);
> +        br(EQ, STR1_LOOP);
> +      BIND(STR2_NEXT);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LE, FIRST_LOOP);
> +        b(NOMATCH);
> +
> +      BIND(STR1_LOOP);
> +        add(cnt2tmp, cnt2_neg, 4);
> +        ldrh(ch2, Address(str2, cnt2tmp));
> +        cmp(ch1, ch2);
> +        br(NE, STR2_NEXT);
> +        add(result, result_tmp, cnt2_neg, ASR, 1);
> +        b(DONE);
> +    }
> +
> +    if (icnt1 == -1 || icnt1 == 1) {
> +      Label CH1_LOOP, HAS_ZERO;
> +      Label DO1_SHORT, DO1_LOOP;
> +
> +      BIND(DO1);
> +        ldrh(ch1, str1);
> +        cmp(cnt2, 4);
> +        br(LT, DO1_SHORT);
> +
> +        orr(ch1, ch1, ch1, LSL, 16);
> +        orr(ch1, ch1, ch1, LSL, 32);
> +
> +        sub(cnt2, cnt2, 4);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +
> +        mov(tmp3, 0x0001000100010001);
> +      BIND(CH1_LOOP);
> +        ldr(ch2, Address(str2, cnt2_neg));
> +        eor(ch2, ch1, ch2);
> +        sub(tmp1, ch2, tmp3);
> +        orr(tmp2, ch2, 0x7fff7fff7fff7fff);
> +        bics(tmp1, tmp1, tmp2);
> +        br(NE, HAS_ZERO);
> +        adds(cnt2_neg, cnt2_neg, 8);
> +        br(LT, CH1_LOOP);
> +
> +        cmp(cnt2_neg, 8);
> +        mov(cnt2_neg, 0);
> +        br(LT, CH1_LOOP);
> +        b(NOMATCH);
> +
> +      BIND(HAS_ZERO);
> +        rev(tmp1, tmp1);
> +        clz(tmp1, tmp1);
> +        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
> +        add(result, result_tmp, cnt2_neg, ASR, 1);
> +        b(DONE);
> +
> +      BIND(DO1_SHORT);
> +        mov(result_tmp, cnt2);
> +        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
> +        sub(cnt2_neg, zr, cnt2, LSL, 1);
> +      BIND(DO1_LOOP);
> +        ldrh(ch2, Address(str2, cnt2_neg));
> +        cmpw(ch1, ch2);
> +        br(EQ, MATCH);
> +        adds(cnt2_neg, cnt2_neg, 2);
> +        br(LT, DO1_LOOP);
> +    }
> +  }
> +  BIND(NOMATCH);
> +    mov(result, -1);
> +    b(DONE);
> +  BIND(MATCH);
> +    add(result, result_tmp, cnt2_neg, ASR, 1);
> +  BIND(DONE);
> +}
> +
>  // Compare strings.
>  void MacroAssembler::string_compare(Register str1, Register str2,
>                                      Register cnt1, Register cnt2, Register result,
> diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
> --- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Aug 05 15:56:26 2014 +0100
> +++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Aug 18 17:10:29 2014 +0100
> @@ -1091,6 +1091,11 @@
>                          Register len, Register result,
>                          FloatRegister Vtmp1, FloatRegister Vtmp2,
>                          FloatRegister Vtmp3, FloatRegister Vtmp4);
> +  void string_indexof(Register str1, Register str2,
> +                      Register cnt1, Register cnt2,
> +                      Register tmp1, Register tmp2,
> +                      Register tmp3, Register tmp4,
> +                      int int_cnt1, Register result);
>  };
>  
>  // Used by aarch64.ad to control code generation
> --- CUT HERE ---
> 
>
Edward Nevill Aug. 19, 2014, 10:31 a.m. UTC | #2
On Tue, 2014-08-19 at 10:18 +0100, Andrew Haley wrote:
> Hi,
> 
> On 18/08/14 17:27, Edward Nevill wrote:
> > 
> > The following patch adds support for the String.indexOf intrinsic.
> > 
> > It uses a combination of 2 algorithms, a simplified Boyer Moore, and
> > a straightforward linear scan.
> 
> In what way is this algorithm simplified from standard Boyer-Moore?
> How can we have confidence it is correct?

OK. I based the algorithm on the following

http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm

This describes an algorithm with 2 shift rules. The 'Bad Character' rule and the 'Good Suffix' rule.

These rules are essentially heuristics for how far we can shift the pattern along the search string.

I have implemented it using the 'Bad Character' rule only because as the entry says "The Good Suffix rule is markedly more complex in both concept and implementation than the Bad Character rule.'

> 
> I'm a bit concerned that we always inline this code.  AFAICS if
> icnt1 == -1 we might as well branch to a stub.

It is 600 bytes which is on the limit of what is sensible. On a code heap size of 128M I think this is OK. It isn't inlined that often, but when it is we would like it to be as fast as possible.


> > +  if (icnt1 == -1) {
> > +    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
> > +    ccmp(cnt1, 8, 0b0000, CC);  // Can't handle skip >= 256 because we use
> 
> Can this CC be LO, please.  This is really important because the ARM
> carry flag is inverted compared with e.g. Intel.

Yes. For those of us brought up in the 6502 world there is only one way the carry flag should ever be.

There are other instances of CC and CS throughout. I will change those as well.

> 
> > +    br(CC, LS);                 // a byte array.
> > +    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
> > +    br(CS, LS);
> 
> No, you may not give a label the same name as one of the condition
> codes.  This is an unexploded bomb.

What could possible go wrong? Especially when I change the CS to HS, then we have br(HS, LS) for max confusion.

I'll change it.

> > +      stp(zr, zr, pre(sp, -128));
> > +      stp(zr, zr, Address(sp, 1*16));
> > +      stp(zr, zr, Address(sp, 2*16));
> > +      stp(zr, zr, Address(sp, 3*16));
> > +      stp(zr, zr, Address(sp, 4*16));
> > +      stp(zr, zr, Address(sp, 5*16));
> > +      stp(zr, zr, Address(sp, 6*16));
> > +      stp(zr, zr, Address(sp, 7*16));
> 
> Why is this not a loop?

Mistaken sense of efficiency.

Something like this better?

  sub sp, sp, #128
  mov tmp1, #128
loop
  subs tmp1, tmp1, #16
  stp zr, zr, [sp, tmp]
  bne loop

or this?

  mov tmp1, #8
loop
  subs tmp1, tmp1, #1
  stp zr, zr, [sp, #-16]!
  bne loop

which is one instruction shorter but I am concerned about the efficiency of repeated stores with pre-decrement writeback to the base register. I think this could take 1 additional cycle per store on some implementations.

Regards,
Ed.
Andrew Haley Aug. 19, 2014, 12:29 p.m. UTC | #3
On 08/19/2014 11:31 AM, Edward Nevill wrote:
> On Tue, 2014-08-19 at 10:18 +0100, Andrew Haley wrote:
>> Hi,
>>
>> On 18/08/14 17:27, Edward Nevill wrote:
>>>
>>> The following patch adds support for the String.indexOf intrinsic.
>>>
>>> It uses a combination of 2 algorithms, a simplified Boyer Moore, and
>>> a straightforward linear scan.
>>
>> In what way is this algorithm simplified from standard Boyer-Moore?
>> How can we have confidence it is correct?
> 
> OK. I based the algorithm on the following
> 
> http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
> 
> This describes an algorithm with 2 shift rules. The 'Bad Character'
> rule and the 'Good Suffix' rule.
> 
> These rules are essentially heuristics for how far we can shift the
> pattern along the search string.
> 
> I have implemented it using the 'Bad Character' rule only because as
> the entry says "The Good Suffix rule is markedly more complex in
> both concept and implementation than the Bad Character rule.'

OK.  This deserves a comment.  And a pointer to Wikipedia.

>> I'm a bit concerned that we always inline this code.  AFAICS if
>> icnt1 == -1 we might as well branch to a stub.
> 
> It is 600 bytes which is on the limit of what is sensible. On a code
> heap size of 128M I think this is OK. It isn't inlined that often,
> but when it is we would like it to be as fast as possible.

Right, but if this routine is called twice in the same method it'll
read exactly the same routine into the icache twice.  And that's
slowing you down as well as wasting memory.

>>> +      stp(zr, zr, pre(sp, -128));
>>> +      stp(zr, zr, Address(sp, 1*16));
>>> +      stp(zr, zr, Address(sp, 2*16));
>>> +      stp(zr, zr, Address(sp, 3*16));
>>> +      stp(zr, zr, Address(sp, 4*16));
>>> +      stp(zr, zr, Address(sp, 5*16));
>>> +      stp(zr, zr, Address(sp, 6*16));
>>> +      stp(zr, zr, Address(sp, 7*16));
>>
>> Why is this not a loop?
> 
> Mistaken sense of efficiency.
> 
> Something like this better?
> 
>   sub sp, sp, #128
>   mov tmp1, #128
> loop
>   subs tmp1, tmp1, #16
>   stp zr, zr, [sp, tmp]
>   bne loop
> 
> or this?
> 
>   mov tmp1, #8
> loop
>   subs tmp1, tmp1, #1
>   stp zr, zr, [sp, #-16]!
>   bne loop
> 
> which is one instruction shorter but I am concerned about the
> efficiency of repeated stores with pre-decrement writeback to the
> base register. I think this could take 1 additional cycle per store
> on some implementations.

No, that's not what I meant, sorry.  Why not

  for (int i = 1; i < 8; i++)
      stp(zr, zr, Address(sp, i*16));

?

It's just a stylistic thing.

Andrew.
diff mbox

Patch

diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad	Tue Aug 05 15:56:26 2014 +0100
+++ b/src/cpu/aarch64/vm/aarch64.ad	Mon Aug 18 17:10:29 2014 +0100
@@ -3415,6 +3415,16 @@ 
   interface(CONST_INTER);
 %}
 
+operand immI_le_4()
+%{
+  predicate(n->get_int() <= 4);
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 operand immI_31()
 %{
   predicate(n->get_int() == 31);
@@ -11733,6 +11743,44 @@ 
   ins_pipe(pipe_class_memory);
 %}
 
+instruct string_indexof(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
+       iRegI_R0 result, iRegI tmp1, iRegI tmp2, iRegI tmp3, iRegI tmp4, rFlagsReg cr)
+%{
+  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result" %}
+
+  ins_encode %{
+    __ string_indexof($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, $cnt2$$Register,
+                      $tmp1$$Register, $tmp2$$Register,
+                      $tmp3$$Register, $tmp4$$Register,
+                      -1, $result$$Register);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
+instruct string_indexof_con(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2,
+                 immI_le_4 int_cnt2, iRegI_R0 result, iRegI tmp1, iRegI tmp2,
+                 iRegI tmp3, iRegI tmp4, rFlagsReg cr)
+%{
+  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1,
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
+  format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result" %}
+
+  ins_encode %{
+    int icnt2 = (int)$int_cnt2$$constant;
+    __ string_indexof($str1$$Register, $str2$$Register,
+                      $cnt1$$Register, zr,
+                      $tmp1$$Register, $tmp2$$Register,
+                      $tmp3$$Register, $tmp4$$Register,
+                      icnt2, $result$$Register);
+  %}
+  ins_pipe(pipe_class_memory);
+%}
+
 instruct string_equals(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt,
                         iRegI_R0 result, iRegP_R10 tmp, rFlagsReg cr)
 %{
diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/vm_version_aarch64.cpp
--- a/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Aug 05 15:56:26 2014 +0100
+++ b/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Mon Aug 18 17:10:29 2014 +0100
@@ -105,6 +105,7 @@ 
   FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
   FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
   FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
+  FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
 
 #ifndef BUILTIN_SIM
   unsigned long auxv = getauxval(AT_HWCAP);
diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Aug 05 15:56:26 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Mon Aug 18 17:10:29 2014 +0100
@@ -1,3 +1,4 @@ 
+/*
 /*
  * Copyright (c) 2013, Red Hat Inc.
  * Copyright (c) 1997, 2012, Oracle and/or its affiliates.
@@ -3386,6 +3387,338 @@ 
   }
 }
 
+// Search for str1 in str2 and return index or -1
+void MacroAssembler::string_indexof(Register str2, Register str1,
+                                    Register cnt2, Register cnt1,
+                                    Register tmp1, Register tmp2,
+                                    Register tmp3, Register tmp4,
+                                    int icnt1, Register result) {
+  Label BM, LS, DONE, NOMATCH, MATCH;
+
+  Register ch1 = rscratch1;
+  Register ch2 = rscratch2;
+  Register cnt1tmp = tmp1;
+  Register cnt2tmp = tmp2;
+  Register cnt1_neg = cnt1;
+  Register cnt2_neg = cnt2;
+  Register result_tmp = tmp4;
+
+  // Note, inline_string_indexOf() generates checks:
+  // if (substr.count > string.count) return -1;
+  // if (substr.count == 0) return 0;
+
+// We have two strings, a source string in str2, cnt2 and a pattern string
+// in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
+
+// For larger pattern and source we use a simplified Boyer Moore algorithm.
+// With a small pattern and source we use linear scan.
+
+  if (icnt1 == -1) {
+    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
+    ccmp(cnt1, 8, 0b0000, CC);  // Can't handle skip >= 256 because we use
+    br(CC, LS);                 // a byte array.
+    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
+    br(CS, LS);
+  }
+
+// Larger pattern and source use the following Boyer Moore alogorithm.
+//
+// #define ASIZE 128
+//
+//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
+//       int i, j;
+//       unsigned c;
+//       unsigned char bc[ASIZE];
+//    
+//       /* Preprocessing */
+//       for (i = 0; i < ASIZE; ++i)
+//          bc[i] = 0;
+//       for (i = 0; i < m - 1; ) {
+//          c = x[i];
+//          ++i;
+//          if (c < ASIZE) bc[c] = i;
+//       }
+//    
+//       /* Searching */
+//       j = 0;
+//       while (j <= n - m) {
+//          c = y[i+j];
+//          if (x[m-1] == c)
+//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
+//          if (i < 0) return j;
+//          if (c < ASIZE)
+//            j = j - bc[y[j+m-1]] + m;
+//          else
+//            j += 1; // Advance by 1 only if char >= ASIZE
+//       }
+//    }
+
+  if (icnt1 == -1) {
+    BIND(BM);
+
+    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
+    Label BMADV, BMMATCH, BMCHECKEND;
+
+    Register cnt1end = tmp2;
+    Register str2end = cnt2;
+    Register skipch = tmp2;
+
+    // Restrict ASIZE to 128 to reduce stack space/initialisation.
+    // The presence of chars >= ASIZE in the target string does not affect
+    // performance, but we must be careful not to initialise them in the stack
+    // array.
+    // The presence of chars >= ASIZE in the source string may adversely affect
+    // performance since we can only advance by one when we encounter one.
+
+      stp(zr, zr, pre(sp, -128));
+      stp(zr, zr, Address(sp, 1*16));
+      stp(zr, zr, Address(sp, 2*16));
+      stp(zr, zr, Address(sp, 3*16));
+      stp(zr, zr, Address(sp, 4*16));
+      stp(zr, zr, Address(sp, 5*16));
+      stp(zr, zr, Address(sp, 6*16));
+      stp(zr, zr, Address(sp, 7*16));
+
+      mov(cnt1tmp, 0);
+      sub(cnt1end, cnt1, 1);
+    BIND(BCLOOP);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, 128);
+      add(cnt1tmp, cnt1tmp, 1);
+      br(HS, BCSKIP);
+      strb(cnt1tmp, Address(sp, ch1));
+    BIND(BCSKIP);
+      cmp(cnt1tmp, cnt1end);
+      br(LT, BCLOOP);
+
+      mov(result_tmp, str2);
+
+      sub(cnt2, cnt2, cnt1);
+      add(str2end, str2, cnt2, LSL, 1);
+    BIND(BMLOOPSTR2);
+      sub(cnt1tmp, cnt1, 1);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, skipch);
+      br(NE, BMSKIP);
+      subs(cnt1tmp, cnt1tmp, 1);
+      br(LT, BMMATCH);
+    BIND(BMLOOPSTR1);
+      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+      ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
+      cmp(ch1, ch2);
+      br(NE, BMSKIP);
+      subs(cnt1tmp, cnt1tmp, 1);
+      br(GE, BMLOOPSTR1);
+    BIND(BMMATCH);
+      sub(result_tmp, str2, result_tmp);
+      lsr(result, result_tmp, 1);
+      add(sp, sp, 128);
+      b(DONE);
+    BIND(BMADV);
+      add(str2, str2, 2);
+      b(BMCHECKEND);
+    BIND(BMSKIP);
+      cmp(skipch, 128);
+      br(HS, BMADV);
+      ldrb(ch2, Address(sp, skipch));
+      add(str2, str2, cnt1, LSL, 1);
+      sub(str2, str2, ch2, LSL, 1);
+    BIND(BMCHECKEND);
+      cmp(str2, str2end);
+      br(LE, BMLOOPSTR2);
+      add(sp, sp, 128);
+      b(NOMATCH);
+  }
+
+  BIND(LS);
+  {
+    Label DO1, DO2, DO3;
+
+    Register str2tmp = tmp2;
+    Register first = tmp3;
+
+    if (icnt1 == -1)
+    {
+        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
+
+        cmp(cnt1, 4);
+        br(LT, DOSHORT);
+
+        sub(cnt2, cnt2, cnt1);
+        sub(cnt1, cnt1, 4);
+        mov(result_tmp, cnt2);
+
+        lea(str1, Address(str1, cnt1, Address::uxtw(1)));
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt1_neg, zr, cnt1, LSL, 1);
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+        ldr(first, Address(str1, cnt1_neg));
+
+      BIND(FIRST_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        cmp(first, ch2);
+        br(EQ, STR1_LOOP);
+      BIND(STR2_NEXT);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, FIRST_LOOP);
+        b(NOMATCH);
+
+      BIND(STR1_LOOP);
+        adds(cnt1tmp, cnt1_neg, 8);
+        add(cnt2tmp, cnt2_neg, 8);
+        br(GE, LAST_WORD);
+
+      BIND(STR1_NEXT);
+        ldr(ch1, Address(str1, cnt1tmp));
+        ldr(ch2, Address(str2, cnt2tmp));
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        adds(cnt1tmp, cnt1tmp, 8);
+        add(cnt2tmp, cnt2tmp, 8);
+        br(LT, STR1_NEXT);
+
+      BIND(LAST_WORD);
+        ldr(ch1, Address(str1));
+        sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
+        ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        b(MATCH);
+
+      BIND(DOSHORT);
+        cmp(cnt1, 2);
+        br(LT, DO1);
+        br(GT, DO3);
+    }
+
+    if (icnt1 == 4) {
+      Label CH1_LOOP;
+
+        ldr(ch1, str1);
+        sub(cnt2, cnt2, 4);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(CH1_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        cmp(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, CH1_LOOP);
+        b(NOMATCH);
+    }
+
+    if (icnt1 == -1 || icnt1 == 2) {
+      Label CH1_LOOP;
+
+      BIND(DO2);
+        ldrw(ch1, str1);
+        sub(cnt2, cnt2, 2);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(CH1_LOOP);
+        ldrw(ch2, Address(str2, cnt2_neg));
+        cmp(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, CH1_LOOP);
+        b(NOMATCH);
+    }
+
+    if (icnt1 == -1 || icnt1 == 3) {
+      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
+
+      BIND(DO3);
+        ldrw(first, str1);
+        ldrh(ch1, Address(str1, 4));
+
+        sub(cnt2, cnt2, 3);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+      BIND(FIRST_LOOP);
+        ldrw(ch2, Address(str2, cnt2_neg));
+        cmpw(first, ch2);
+        br(EQ, STR1_LOOP);
+      BIND(STR2_NEXT);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LE, FIRST_LOOP);
+        b(NOMATCH);
+
+      BIND(STR1_LOOP);
+        add(cnt2tmp, cnt2_neg, 4);
+        ldrh(ch2, Address(str2, cnt2tmp));
+        cmp(ch1, ch2);
+        br(NE, STR2_NEXT);
+        add(result, result_tmp, cnt2_neg, ASR, 1);
+        b(DONE);
+    }
+
+    if (icnt1 == -1 || icnt1 == 1) {
+      Label CH1_LOOP, HAS_ZERO;
+      Label DO1_SHORT, DO1_LOOP;
+
+      BIND(DO1);
+        ldrh(ch1, str1);
+        cmp(cnt2, 4);
+        br(LT, DO1_SHORT);
+
+        orr(ch1, ch1, ch1, LSL, 16);
+        orr(ch1, ch1, ch1, LSL, 32);
+
+        sub(cnt2, cnt2, 4);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+        mov(tmp3, 0x0001000100010001);
+      BIND(CH1_LOOP);
+        ldr(ch2, Address(str2, cnt2_neg));
+        eor(ch2, ch1, ch2);
+        sub(tmp1, ch2, tmp3);
+        orr(tmp2, ch2, 0x7fff7fff7fff7fff);
+        bics(tmp1, tmp1, tmp2);
+        br(NE, HAS_ZERO);
+        adds(cnt2_neg, cnt2_neg, 8);
+        br(LT, CH1_LOOP);
+
+        cmp(cnt2_neg, 8);
+        mov(cnt2_neg, 0);
+        br(LT, CH1_LOOP);
+        b(NOMATCH);
+
+      BIND(HAS_ZERO);
+        rev(tmp1, tmp1);
+        clz(tmp1, tmp1);
+        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
+        add(result, result_tmp, cnt2_neg, ASR, 1);
+        b(DONE);
+
+      BIND(DO1_SHORT);
+        mov(result_tmp, cnt2);
+        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+        sub(cnt2_neg, zr, cnt2, LSL, 1);
+      BIND(DO1_LOOP);
+        ldrh(ch2, Address(str2, cnt2_neg));
+        cmpw(ch1, ch2);
+        br(EQ, MATCH);
+        adds(cnt2_neg, cnt2_neg, 2);
+        br(LT, DO1_LOOP);
+    }
+  }
+  BIND(NOMATCH);
+    mov(result, -1);
+    b(DONE);
+  BIND(MATCH);
+    add(result, result_tmp, cnt2_neg, ASR, 1);
+  BIND(DONE);
+}
+
 // Compare strings.
 void MacroAssembler::string_compare(Register str1, Register str2,
                                     Register cnt1, Register cnt2, Register result,
diff -r 2dfe9abe27fe -r 978b74cb5c7b src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Aug 05 15:56:26 2014 +0100
+++ b/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Mon Aug 18 17:10:29 2014 +0100
@@ -1091,6 +1091,11 @@ 
                         Register len, Register result,
                         FloatRegister Vtmp1, FloatRegister Vtmp2,
                         FloatRegister Vtmp3, FloatRegister Vtmp4);
+  void string_indexof(Register str1, Register str2,
+                      Register cnt1, Register cnt2,
+                      Register tmp1, Register tmp2,
+                      Register tmp3, Register tmp4,
+                      int int_cnt1, Register result);
 };
 
 // Used by aarch64.ad to control code generation
--- CUT HERE ---