@@ -358,8 +358,10 @@ class Context(object):
self.add_config(arch='sparc64',
os_name='linux-gnu',
glibcs=[{},
+ {'arch': 'sparcv8',
+ 'ccopts': '-m32 -mlong-double-128 -mcpu=leon3'}],
{'arch': 'sparcv9',
- 'ccopts': '-m32 -mlong-double-128'}],
+ 'ccopts': '-m32 -mlong-double-128 -mcpu=v9'}],
extra_glibcs=[{'variant': 'disable-multi-arch',
'cfg': ['--disable-multi-arch']},
{'variant': 'disable-multi-arch',
@@ -1,24 +1,10 @@
# preconfigure fragment for sparc.
case "$machine" in
-sparc | sparcv[67])
+sparc | sparcv8 | superspac | hyperspace)
base_machine=sparc machine=sparc/sparc32 ;;
-sparcv8 | supersparc | hypersparc)
- base_machine=sparc machine=sparc/sparc32/sparcv8 ;;
-sparcv8plus | sparcv8plusa | sparcv9)
+sparcv8plus | sparcv8plus* | sparcv9* )
base_machine=sparc machine=sparc/sparc32/sparcv9 ;;
-sparcv8plusb | sparcv9b)
- base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9b ;;
-sparcv9v)
- base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9v ;;
-sparcv9v2)
- base_machine=sparc machine=sparc/sparc32/sparcv9/sparcv9v2 ;;
-sparc64)
+sparc64*)
base_machine=sparc machine=sparc/sparc64 ;;
-sparc64b)
- base_machine=sparc machine=sparc/sparc64/sparcv9b ;;
-sparc64v)
- base_machine=sparc machine=sparc/sparc64/sparcv9v ;;
-sparc64v2)
- base_machine=sparc machine=sparc/sparc64/sparcv9v2 ;;
esac
@@ -19,35 +19,8 @@ ifeq ($(subdir),gnulib)
sysdep_routines = dotmul umul $(divrem) alloca
endif # gnulib
-# We distribute these files, even though they are generated,
-# so as to avoid the need for a functioning m4 to build the library.
divrem := sdiv udiv rem urem
-+divrem-NAME-sdiv := div
-+divrem-NAME-udiv := udiv
-+divrem-NAME-rem := rem
-+divrem-NAME-urem := urem
-+divrem-NAME = $(+divrem-NAME-$(basename $(notdir $@)))
-+divrem-OP-div := div
-+divrem-OP-udiv := div
-+divrem-OP-rem := rem
-+divrem-OP-urem := rem
-+divrem-S-div := true
-+divrem-S-rem := true
-+divrem-S-udiv := false
-+divrem-S-urem := false
-$(divrem:%=$(sysdep_dir)/sparc/sparc32/%.S): $(sysdep_dir)/sparc/sparc32/divrem.m4
- (echo -n "define(NAME,\`.$(+divrem-NAME)')"; \
- echo -n " define(OP,\`$(+divrem-OP-$(+divrem-NAME))')"; \
- echo -n " define(S,\`$(+divrem-S-$(+divrem-NAME))')"; \
- echo " /* This file is generated from divrem.m4; DO NOT EDIT! */"; \
- cat $<) | $(M4) > $@-tmp
-# Make it unwritable so noone will edit it by mistake.
- -chmod a-w $@-tmp
- mv -f $@-tmp $@
-
-sysdep-realclean := $(sysdep-realclean) $(divrem:%=sysdeps/sparc/sparc32/%.S)
-
# libgcc __divdi3 and __moddi3 uses .udiv and since it is also exported by
# libc.so linker will create PLTs for the symbol. To avoid it we strong alias
# the exported libc one to __wrap_.udiv and use linker option --wrap to make any
@@ -1,146 +1,118 @@
-! SPARC __mpn_addmul_1 -- Multiply a limb vector with a limb and add
-! the result to a second limb vector.
-!
+! SPARC v8 __mpn_addmul_1 -- Multiply a limb vector with a limb and
+! add the result to a second limb vector.
+
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
-!
+
! This file is part of the GNU MP Library.
-!
+
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
-!
+
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
-!
+
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
-! RES_PTR o0
-! S1_PTR o1
-! SIZE o2
-! S2_LIMB o3
+! res_ptr o0
+! s1_ptr o1
+! size o2
+! s2_limb o3
#include <sysdep.h>
ENTRY(__mpn_addmul_1)
- ! Make S1_PTR and RES_PTR point at the end of their blocks
- ! and put (- 4 x SIZE) in index/loop counter.
- sll %o2,2,%o2
- add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval
- add %o1,%o2,%o1
- sub %g0,%o2,%o2
+ ld [%o1+0],%o4 ! 1
+ sll %o2,4,%g1
+ orcc %g0,%g0,%g2
+ mov %o7,%g4 ! Save return address register
+ and %g1,(4-1)<<4,%g1
+1: call 2f
+ add %o7,3f-1b,%g3
+2: jmp %g3+%g1
+ mov %g4,%o7 ! Restore return address register
- cmp %o3,0xfff
- bgu LOC(large)
+ .align 4
+3:
+LOC(00):
+ add %o0,-4,%o0
+ b LOC(loop00) /* 4, 8, 12, ... */
+ add %o1,-4,%o1
+ nop
+LOC(01):
+ b LOC(loop01) /* 1, 5, 9, ... */
+ nop
+ nop
+ nop
+LOC(10):
+ add %o0,-12,%o0 /* 2, 6, 10, ... */
+ b LOC(loop10)
+ add %o1,4,%o1
+ nop
+LOC(11):
+ add %o0,-8,%o0 /* 3, 7, 11, ... */
+ b LOC(loop11)
+ add %o1,-8,%o1
nop
- ld [%o1+%o2],%o5
- mov 0,%o0
- b LOC(0)
- add %o4,-4,%o4
-LOC(loop0):
- addcc %o5,%g1,%g1
- ld [%o1+%o2],%o5
- addx %o0,%g0,%o0
- st %g1,[%o4+%o2]
-LOC(0): wr %g0,%o3,%y
- sra %o5,31,%g2
- and %o3,%g2,%g2
- andcc %g1,0,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,0,%g1
- sra %g1,20,%g4
- sll %g1,12,%g1
- rd %y,%g3
- srl %g3,20,%g3
- or %g1,%g3,%g1
-
- addcc %g1,%o0,%g1
- addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb
- addcc %o2,4,%o2 ! loop counter
- bne LOC(loop0)
- ld [%o4+%o2],%o5
-
- addcc %o5,%g1,%g1
- addx %o0,%g0,%o0
- retl
- st %g1,[%o4+%o2]
-
-
-LOC(large):
- ld [%o1+%o2],%o5
- mov 0,%o0
- sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0
- b LOC(1)
- add %o4,-4,%o4
LOC(loop):
- addcc %o5,%g3,%g3
- ld [%o1+%o2],%o5
- addx %o0,%g0,%o0
- st %g3,[%o4+%o2]
-LOC(1): wr %g0,%o5,%y
- and %o5,%g4,%g2
- andcc %g0,%g0,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%g0,%g1
- rd %y,%g3
- addcc %g3,%o0,%g3
- addx %g2,%g1,%o0
- addcc %o2,4,%o2
- bne LOC(loop)
- ld [%o4+%o2],%o5
+ addcc %g3,%g2,%g3 ! 1
+ ld [%o1+4],%o4 ! 2
+ rd %y,%g2 ! 1
+ addx %g0,%g2,%g2
+ ld [%o0+0],%g1 ! 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+0] ! 1
+LOC(loop00):
+ umul %o4,%o3,%g3 ! 2
+ ld [%o0+4],%g1 ! 2
+ addxcc %g3,%g2,%g3 ! 2
+ ld [%o1+8],%o4 ! 3
+ rd %y,%g2 ! 2
+ addx %g0,%g2,%g2
+ nop
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+4] ! 2
+LOC(loop11):
+ umul %o4,%o3,%g3 ! 3
+ addxcc %g3,%g2,%g3 ! 3
+ ld [%o1+12],%o4 ! 4
+ rd %y,%g2 ! 3
+ add %o1,16,%o1
+ addx %g0,%g2,%g2
+ ld [%o0+8],%g1 ! 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+8] ! 3
+LOC(loop10):
+ umul %o4,%o3,%g3 ! 4
+ addxcc %g3,%g2,%g3 ! 4
+ ld [%o1+0],%o4 ! 1
+ rd %y,%g2 ! 4
+ addx %g0,%g2,%g2
+ ld [%o0+12],%g1 ! 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+12] ! 4
+ add %o0,16,%o0
+ addx %g0,%g2,%g2
+LOC(loop01):
+ addcc %o2,-4,%o2
+ bg LOC(loop)
+ umul %o4,%o3,%g3 ! 1
- addcc %o5,%g3,%g3
- addx %o0,%g0,%o0
+ addcc %g3,%g2,%g3 ! 4
+ rd %y,%g2 ! 4
+ addx %g0,%g2,%g2
+ ld [%o0+0],%g1 ! 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+0] ! 4
retl
- st %g3,[%o4+%o2]
+ addx %g0,%g2,%o0
END(__mpn_addmul_1)
new file mode 100644
@@ -0,0 +1,162 @@
+# This file is generated from configure.ac by Autoconf. DO NOT EDIT!
+ # Local configure fragment for sysdeps/sparc/sparc32
+
+# Test if compiler targets at least sparcv8.
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
+$as_echo_n "checking for grep that handles long lines and -e... " >&6; }
+if ${ac_cv_path_GREP+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -z "$GREP"; then
+ ac_path_GREP_found=false
+ # Loop through the user's path and test for each of PROGNAME-LIST
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_prog in grep ggrep; do
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+ as_fn_executable_p "$ac_path_GREP" || continue
+# Check for GNU ac_path_GREP and select it if it is found.
+ # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+ ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+ ac_count=0
+ $as_echo_n 0123456789 >"conftest.in"
+ while :
+ do
+ cat "conftest.in" "conftest.in" >"conftest.tmp"
+ mv "conftest.tmp" "conftest.in"
+ cp "conftest.in" "conftest.nl"
+ $as_echo 'GREP' >> "conftest.nl"
+ "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+ as_fn_arith $ac_count + 1 && ac_count=$as_val
+ if test $ac_count -gt ${ac_path_GREP_max-0}; then
+ # Best one so far, save it but keep looking for a better one
+ ac_cv_path_GREP="$ac_path_GREP"
+ ac_path_GREP_max=$ac_count
+ fi
+ # 10*(2^10) chars as input seems more than enough
+ test $ac_count -gt 10 && break
+ done
+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+ $ac_path_GREP_found && break 3
+ done
+ done
+ done
+IFS=$as_save_IFS
+ if test -z "$ac_cv_path_GREP"; then
+ as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+ fi
+else
+ ac_cv_path_GREP=$GREP
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
+$as_echo "$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
+$as_echo_n "checking for egrep... " >&6; }
+if ${ac_cv_path_EGREP+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+ then ac_cv_path_EGREP="$GREP -E"
+ else
+ if test -z "$EGREP"; then
+ ac_path_EGREP_found=false
+ # Loop through the user's path and test for each of PROGNAME-LIST
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_prog in egrep; do
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+ as_fn_executable_p "$ac_path_EGREP" || continue
+# Check for GNU ac_path_EGREP and select it if it is found.
+ # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+ ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+ ac_count=0
+ $as_echo_n 0123456789 >"conftest.in"
+ while :
+ do
+ cat "conftest.in" "conftest.in" >"conftest.tmp"
+ mv "conftest.tmp" "conftest.in"
+ cp "conftest.in" "conftest.nl"
+ $as_echo 'EGREP' >> "conftest.nl"
+ "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+ as_fn_arith $ac_count + 1 && ac_count=$as_val
+ if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+ # Best one so far, save it but keep looking for a better one
+ ac_cv_path_EGREP="$ac_path_EGREP"
+ ac_path_EGREP_max=$ac_count
+ fi
+ # 10*(2^10) chars as input seems more than enough
+ test $ac_count -gt 10 && break
+ done
+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+ $ac_path_EGREP_found && break 3
+ done
+ done
+ done
+IFS=$as_save_IFS
+ if test -z "$ac_cv_path_EGREP"; then
+ as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+ fi
+else
+ ac_cv_path_EGREP=$EGREP
+fi
+
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5
+$as_echo "$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for at least sparcv8 support" >&5
+$as_echo_n "checking for at least sparcv8 support... " >&6; }
+if ${libc_cv_sparcv8+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#if defined (__sparc_v8__) || defined (__sparc_v9__)
+ yes
+ #endif
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ $EGREP "yes" >/dev/null 2>&1; then :
+ libc_cv_sparcv8=yes
+else
+ libc_sparcv8=no
+fi
+rm -f conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_sparcv8" >&5
+$as_echo "$libc_cv_sparcv8" >&6; }
+if test $libc_cv_sparcv8 = no; then
+ as_fn_error $? "no support for pre-v8 sparc" "$LINENO" 5
+fi
new file mode 100644
@@ -0,0 +1,13 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/sparc/sparc32
+
+# Test if compiler targets at least sparcv8.
+AC_CACHE_CHECK([for at least sparcv8 support],
+ [libc_cv_sparcv8],
+ [AC_EGREP_CPP(yes,[#if defined (__sparc_v8__) || defined (__sparc_v9__)
+ yes
+ #endif
+ ], libc_cv_sparcv8=yes, libc_sparcv8=no)])
+if test $libc_cv_sparcv8 = no; then
+ AC_MSG_ERROR([no support for pre-v8 sparc])
+fi
deleted file mode 100644
@@ -1,234 +0,0 @@
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- * NAME name of function to generate
- * OP OP=div => %o0 / %o1; OP=rem => %o0 % %o1
- * S S=true => signed; S=false => unsigned
- *
- * Algorithm parameters:
- * N how many bits per iteration we try to get (4)
- * WORDSIZE total number of bits (32)
- *
- * Derived constants:
- * TOPBITS number of bits in the top `decade' of a number
- *
- * Important variables:
- * Q the partial quotient under development (initially 0)
- * R the remainder so far, initially the dividend
- * ITER number of main division loop iterations required;
- * equal to ceil(log2(quotient) / N). Note that this
- * is the log base (2^N) of the quotient.
- * V the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- * Current estimate for non-large dividend is
- * ceil(log2(quotient) / N) * (10 + 7N/2) + C
- * A large dividend is one greater than 2^(31-TOPBITS) and takes a
- * different path, as the upper bits of the quotient must be developed
- * one bit at a time.
- */
-
-define(N, `4')dnl
-define(WORDSIZE, `32')dnl
-define(TOPBITS, eval(WORDSIZE - N*((WORDSIZE-1)/N)))dnl
-dnl
-define(dividend, `%o0')dnl
-define(divisor, `%o1')dnl
-define(Q, `%o2')dnl
-define(R, `%o3')dnl
-define(ITER, `%o4')dnl
-define(V, `%o5')dnl
-dnl
-dnl m4 reminder: ifelse(a,b,c,d) => if a is b, then c, else d
-define(T, `%g1')dnl
-define(SC, `%g2')dnl
-ifelse(S, `true', `define(SIGN, `%g3')')dnl
-
-dnl
-dnl This is the recursive definition for developing quotient digits.
-dnl
-dnl Parameters:
-dnl $1 the current depth, 1 <= $1 <= N
-dnl $2 the current accumulation of quotient bits
-dnl N max depth
-dnl
-dnl We add a new bit to $2 and either recurse or insert the bits in
-dnl the quotient. R, Q, and V are inputs and outputs as defined above;
-dnl the condition codes are expected to reflect the input R, and are
-dnl modified to reflect the output R.
-dnl
-define(DEVELOP_QUOTIENT_BITS,
-` ! depth $1, accumulated bits $2
- bl LOC($1.eval(2**N+$2))
- srl V,1,V
- ! remainder is positive
- subcc R,V,R
- ifelse($1, N,
- ` b 9f
- add Q, ($2*2+1), Q
-', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2+1)')')
-LOC($1.eval(2**N+$2)):
- ! remainder is negative
- addcc R,V,R
- ifelse($1, N,
- ` b 9f
- add Q, ($2*2-1), Q
-', ` DEVELOP_QUOTIENT_BITS(incr($1), `eval(2*$2-1)')')
-ifelse($1, 1, `9:')')dnl
-
-#include <sysdep.h>
-#include <sys/trap.h>
-
-ENTRY(NAME)
-ifelse(S, `true',
-` ! compute sign of result; if neither is negative, no problem
- orcc divisor, dividend, %g0 ! either negative?
- bge 2f ! no, go do the divide
-ifelse(OP, `div',
-` xor divisor, dividend, SIGN ! compute sign in any case',
-` mov dividend, SIGN ! sign of remainder matches dividend')
- tst divisor
- bge 1f
- tst dividend
- ! divisor is definitely negative; dividend might also be negative
- bge 2f ! if dividend not negative...
- sub %g0, divisor, divisor ! in any case, make divisor nonneg
-1: ! dividend is negative, divisor is nonnegative
- sub %g0, dividend, dividend ! make dividend nonnegative
-2:
-')
- ! Ready to divide. Compute size of quotient; scale comparand.
- orcc divisor, %g0, V
- bne 1f
- mov dividend, R
-
- ! Divide by zero trap. If it returns, return 0 (about as
- ! wrong as possible, but that is what SunOS does...).
- ta ST_DIV0
- retl
- clr %o0
-
-1:
- cmp R, V ! if divisor exceeds dividend, done
- blu LOC(got_result) ! (and algorithm fails otherwise)
- clr Q
- sethi %hi(1 << (WORDSIZE - TOPBITS - 1)), T
- cmp R, T
- blu LOC(not_really_big)
- clr ITER
-
- ! `Here the dividend is >= 2**(31-N) or so. We must be careful here,
- ! as our usual N-at-a-shot divide step will cause overflow and havoc.
- ! The number of bits in the result here is N*ITER+SC, where SC <= N.
- ! Compute ITER in an unorthodox manner: know we need to shift V into
- ! the top decade: so do not even bother to compare to R.'
- 1:
- cmp V, T
- bgeu 3f
- mov 1, SC
- sll V, N, V
- b 1b
- add ITER, 1, ITER
-
- ! Now compute SC.
- 2: addcc V, V, V
- bcc LOC(not_too_big)
- add SC, 1, SC
-
- ! We get here if the divisor overflowed while shifting.
- ! This means that R has the high-order bit set.
- ! Restore V and subtract from R.
- sll T, TOPBITS, T ! high order bit
- srl V, 1, V ! rest of V
- add V, T, V
- b LOC(do_single_div)
- sub SC, 1, SC
-
- LOC(not_too_big):
- 3: cmp V, R
- blu 2b
- nop
- be LOC(do_single_div)
- nop
- /* NB: these are commented out in the V8-Sparc manual as well */
- /* (I do not understand this) */
- ! V > R: went too far: back up 1 step
- ! srl V, 1, V
- ! dec SC
- ! do single-bit divide steps
- !
- ! We have to be careful here. We know that R >= V, so we can do the
- ! first divide step without thinking. BUT, the others are conditional,
- ! and are only done if R >= 0. Because both R and V may have the high-
- ! order bit set in the first step, just falling into the regular
- ! division loop will mess up the first time around.
- ! So we unroll slightly...
- LOC(do_single_div):
- subcc SC, 1, SC
- bl LOC(end_regular_divide)
- nop
- sub R, V, R
- mov 1, Q
- b LOC(end_single_divloop)
- nop
- LOC(single_divloop):
- sll Q, 1, Q
- bl 1f
- srl V, 1, V
- ! R >= 0
- sub R, V, R
- b 2f
- add Q, 1, Q
- 1: ! R < 0
- add R, V, R
- sub Q, 1, Q
- 2:
- LOC(end_single_divloop):
- subcc SC, 1, SC
- bge LOC(single_divloop)
- tst R
- b,a LOC(end_regular_divide)
-
-LOC(not_really_big):
-1:
- sll V, N, V
- cmp V, R
- bleu 1b
- addcc ITER, 1, ITER
- be LOC(got_result)
- sub ITER, 1, ITER
-
- tst R ! set up for initial iteration
-LOC(divloop):
- sll Q, N, Q
- DEVELOP_QUOTIENT_BITS(1, 0)
-LOC(end_regular_divide):
- subcc ITER, 1, ITER
- bge LOC(divloop)
- tst R
- bl,a LOC(got_result)
- ! non-restoring fixup here (one instruction only!)
-ifelse(OP, `div',
-` sub Q, 1, Q
-', ` add R, divisor, R
-')
-
-LOC(got_result):
-ifelse(S, `true',
-` ! check to see if answer should be < 0
- tst SIGN
- bl,a 1f
- ifelse(OP, `div', `sub %g0, Q, Q', `sub %g0, R, R')
-1:')
- retl
- ifelse(OP, `div', `mov Q, %o0', `mov R, %o0')
-
-END(NAME)
-ifelse(OP, `div', ifelse(S, `false', `strong_alias (.udiv, __wrap_.udiv)
-'))dnl
@@ -1,127 +1,13 @@
/*
- * Signed multiply, from Appendix E of the Sparc Version 8
- * Architecture Manual.
- */
-
-/*
- * Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the upper 32 bits of
- * the 64-bit product).
- *
- * This code optimizes short (less than 13-bit) multiplies.
+ * Sparc v8 has multiply.
*/
#include <sysdep.h>
-
ENTRY(.mul)
- mov %o0, %y ! multiplier -> Y
- andncc %o0, 0xfff, %g0 ! test bits 12..31
- be LOC(mul_shortway) ! if zero, can do it the short way
- andcc %g0, %g0, %o4 ! zero the partial product and clear N and V
-
- /*
- * Long multiply. 32 steps, followed by a final shift step.
- */
- mulscc %o4, %o1, %o4 ! 1
- mulscc %o4, %o1, %o4 ! 2
- mulscc %o4, %o1, %o4 ! 3
- mulscc %o4, %o1, %o4 ! 4
- mulscc %o4, %o1, %o4 ! 5
- mulscc %o4, %o1, %o4 ! 6
- mulscc %o4, %o1, %o4 ! 7
- mulscc %o4, %o1, %o4 ! 8
- mulscc %o4, %o1, %o4 ! 9
- mulscc %o4, %o1, %o4 ! 10
- mulscc %o4, %o1, %o4 ! 11
- mulscc %o4, %o1, %o4 ! 12
- mulscc %o4, %o1, %o4 ! 13
- mulscc %o4, %o1, %o4 ! 14
- mulscc %o4, %o1, %o4 ! 15
- mulscc %o4, %o1, %o4 ! 16
- mulscc %o4, %o1, %o4 ! 17
- mulscc %o4, %o1, %o4 ! 18
- mulscc %o4, %o1, %o4 ! 19
- mulscc %o4, %o1, %o4 ! 20
- mulscc %o4, %o1, %o4 ! 21
- mulscc %o4, %o1, %o4 ! 22
- mulscc %o4, %o1, %o4 ! 23
- mulscc %o4, %o1, %o4 ! 24
- mulscc %o4, %o1, %o4 ! 25
- mulscc %o4, %o1, %o4 ! 26
- mulscc %o4, %o1, %o4 ! 27
- mulscc %o4, %o1, %o4 ! 28
- mulscc %o4, %o1, %o4 ! 29
- mulscc %o4, %o1, %o4 ! 30
- mulscc %o4, %o1, %o4 ! 31
- mulscc %o4, %o1, %o4 ! 32
- mulscc %o4, %g0, %o4 ! final shift
-
- ! If %o0 was negative, the result is
- ! (%o0 * %o1) + (%o1 << 32))
- ! We fix that here.
-
-#if 0
- tst %o0
- bge 1f
- rd %y, %o0
-
- ! %o0 was indeed negative; fix upper 32 bits of result by subtracting
- ! %o1 (i.e., return %o4 - %o1 in %o1).
- retl
- sub %o4, %o1, %o1
-
-1:
- retl
- mov %o4, %o1
-#else
- /* Faster code adapted from tege@sics.se's code for umul.S. */
- sra %o0, 31, %o2 ! make mask from sign bit
- and %o1, %o2, %o2 ! %o2 = 0 or %o1, depending on sign of %o0
- rd %y, %o0 ! get lower half of product
- retl
- sub %o4, %o2, %o1 ! subtract compensation
- ! and put upper half in place
-#endif
-
-LOC(mul_shortway):
- /*
- * Short multiply. 12 steps, followed by a final shift step.
- * The resulting bits are off by 12 and (32-12) = 20 bit positions,
- * but there is no problem with %o0 being negative (unlike above).
- */
- mulscc %o4, %o1, %o4 ! 1
- mulscc %o4, %o1, %o4 ! 2
- mulscc %o4, %o1, %o4 ! 3
- mulscc %o4, %o1, %o4 ! 4
- mulscc %o4, %o1, %o4 ! 5
- mulscc %o4, %o1, %o4 ! 6
- mulscc %o4, %o1, %o4 ! 7
- mulscc %o4, %o1, %o4 ! 8
- mulscc %o4, %o1, %o4 ! 9
- mulscc %o4, %o1, %o4 ! 10
- mulscc %o4, %o1, %o4 ! 11
- mulscc %o4, %o1, %o4 ! 12
- mulscc %o4, %g0, %o4 ! final shift
-
- /*
- * %o4 has 20 of the bits that should be in the low part of the
- * result; %y has the bottom 12 (as %y's top 12). That is:
- *
- * %o4 %y
- * +----------------+----------------+
- * | -12- | -20- | -12- | -20- |
- * +------(---------+------)---------+
- * --hi-- ----low-part----
- *
- * The upper 12 bits of %o4 should be sign-extended to form the
- * high part of the product (i.e., highpart = %o4 >> 20).
- */
- rd %y, %o5
- sll %o4, 12, %o0 ! shift middle bits left 12
- srl %o5, 20, %o5 ! shift low bits right 20, zero fill at left
- or %o5, %o0, %o0 ! construct low part of result
+ smul %o0, %o1, %o0
retl
- sra %o4, 20, %o1 ! ... and extract high part of result
+ rd %y, %o1
END(.mul)
@@ -1,198 +1,102 @@
-! SPARC __mpn_mul_1 -- Multiply a limb vector with a limb and store
-! the result in a second limb vector.
-!
+! SPARC v8 __mpn_mul_1 -- Multiply a limb vector with a single limb and
+! store the product in a second limb vector.
+
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
-!
+
! This file is part of the GNU MP Library.
-!
+
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
-!
+
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
-!
+
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
-! RES_PTR o0
-! S1_PTR o1
-! SIZE o2
-! S2_LIMB o3
-
-! ADD CODE FOR SMALL MULTIPLIERS!
-!1: ld
-! st
-!
-!2: ld ,a
-! addxcc a,a,x
-! st x,
-!
-!3_unrolled:
-! ld ,a
-! addxcc a,a,x1 ! 2a + cy
-! addx %g0,%g0,x2
-! addcc a,x1,x ! 3a + c
-! st x,
-!
-! ld ,a
-! addxcc a,a,y1
-! addx %g0,%g0,y2
-! addcc a,y1,x
-! st x,
-!
-!4_unrolled:
-! ld ,a
-! srl a,2,x1 ! 4a
-! addxcc y2,x1,x
-! sll a,30,x2
-! st x,
-!
-! ld ,a
-! srl a,2,y1
-! addxcc x2,y1,y
-! sll a,30,y2
-! st x,
-!
-!5_unrolled:
-! ld ,a
-! srl a,2,x1 ! 4a
-! addxcc a,x1,x ! 5a + c
-! sll a,30,x2
-! addx %g0,x2,x2
-! st x,
-!
-! ld ,a
-! srl a,2,y1
-! addxcc a,y1,x
-! sll a,30,y2
-! addx %g0,y2,y2
-! st x,
-!
-!8_unrolled:
-! ld ,a
-! srl a,3,x1 ! 8a
-! addxcc y2,x1,x
-! sll a,29,x2
-! st x,
-!
-! ld ,a
-! srl a,3,y1
-! addxcc x2,y1,y
-! sll a,29,y2
-! st x,
+! res_ptr o0
+! s1_ptr o1
+! size o2
+! s2_limb o3
#include <sysdep.h>
ENTRY(__mpn_mul_1)
- ! Make S1_PTR and RES_PTR point at the end of their blocks
- ! and put (- 4 x SIZE) in index/loop counter.
- sll %o2,2,%o2
- add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval
- add %o1,%o2,%o1
- sub %g0,%o2,%o2
+ sll %o2,4,%g1
+ mov %o7,%g4 ! Save return address register
+ and %g1,(4-1)<<4,%g1
+1: call 2f
+ add %o7,3f-1b,%g3
+2: mov %g4,%o7 ! Restore return address register
+ jmp %g3+%g1
+ ld [%o1+0],%o4 ! 1
- cmp %o3,0xfff
- bgu LOC(large)
+ .align 4
+3:
+LOC(00):
+ add %o0,-4,%o0
+ add %o1,-4,%o1
+ b LOC(loop00) /* 4, 8, 12, ... */
+ orcc %g0,%g0,%g2
+LOC(01):
+ b LOC(loop01) /* 1, 5, 9, ... */
+ orcc %g0,%g0,%g2
nop
+ nop
+LOC(10):
+ add %o0,-12,%o0 /* 2, 6, 10, ... */
+ add %o1,4,%o1
+ b LOC(loop10)
+ orcc %g0,%g0,%g2
+ nop
+LOC(11):
+ add %o0,-8,%o0 /* 3, 7, 11, ... */
+ add %o1,-8,%o1
+ b LOC(loop11)
+ orcc %g0,%g0,%g2
- ld [%o1+%o2],%o5
- mov 0,%o0
- b LOC(0)
- add %o4,-4,%o4
-LOC(loop0):
- st %g1,[%o4+%o2]
-LOC(0): wr %g0,%o3,%y
- sra %o5,31,%g2
- and %o3,%g2,%g2
- andcc %g1,0,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,0,%g1
- sra %g1,20,%g4
- sll %g1,12,%g1
- rd %y,%g3
- srl %g3,20,%g3
- or %g1,%g3,%g1
-
- addcc %g1,%o0,%g1
- addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb
- addcc %o2,4,%o2 ! loop counter
- bne,a LOC(loop0)
- ld [%o1+%o2],%o5
-
- retl
- st %g1,[%o4+%o2]
-
-
-LOC(large):
- ld [%o1+%o2],%o5
- mov 0,%o0
- sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0
- b LOC(1)
- add %o4,-4,%o4
LOC(loop):
- st %g3,[%o4+%o2]
-LOC(1): wr %g0,%o5,%y
- and %o5,%g4,%g2 ! g2 = S1_LIMB iff S2_LIMB < 0, else 0
- andcc %g0,%g0,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%g0,%g1
- rd %y,%g3
- addcc %g3,%o0,%g3
- addx %g2,%g1,%o0 ! add sign-compensation and cy to hi limb
- addcc %o2,4,%o2 ! loop counter
- bne,a LOC(loop)
- ld [%o1+%o2],%o5
+ addcc %g3,%g2,%g3 ! 1
+ ld [%o1+4],%o4 ! 2
+ st %g3,[%o0+0] ! 1
+ rd %y,%g2 ! 1
+LOC(loop00):
+ umul %o4,%o3,%g3 ! 2
+ addxcc %g3,%g2,%g3 ! 2
+ ld [%o1+8],%o4 ! 3
+ st %g3,[%o0+4] ! 2
+ rd %y,%g2 ! 2
+LOC(loop11):
+ umul %o4,%o3,%g3 ! 3
+ addxcc %g3,%g2,%g3 ! 3
+ ld [%o1+12],%o4 ! 4
+ add %o1,16,%o1
+ st %g3,[%o0+8] ! 3
+ rd %y,%g2 ! 3
+LOC(loop10):
+ umul %o4,%o3,%g3 ! 4
+ addxcc %g3,%g2,%g3 ! 4
+ ld [%o1+0],%o4 ! 1
+ st %g3,[%o0+12] ! 4
+ add %o0,16,%o0
+ rd %y,%g2 ! 4
+ addx %g0,%g2,%g2
+LOC(loop01):
+ addcc %o2,-4,%o2
+ bg LOC(loop)
+ umul %o4,%o3,%g3 ! 1
+ addcc %g3,%g2,%g3 ! 4
+ st %g3,[%o0+0] ! 4
+ rd %y,%g2 ! 4
retl
- st %g3,[%o4+%o2]
+ addx %g0,%g2,%o0
END(__mpn_mul_1)
@@ -1,363 +1,21 @@
- /* This file is generated from divrem.m4; DO NOT EDIT! */
/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
+ * Sparc v8 has divide.
*/
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- * .rem name of function to generate
- * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1
- * true true=true => signed; true=false => unsigned
- *
- * Algorithm parameters:
- * N how many bits per iteration we try to get (4)
- * WORDSIZE total number of bits (32)
- *
- * Derived constants:
- * TOPBITS number of bits in the top decade of a number
- *
- * Important variables:
- * Q the partial quotient under development (initially 0)
- * R the remainder so far, initially the dividend
- * ITER number of main division loop iterations required;
- * equal to ceil(log2(quotient) / N). Note that this
- * is the log base (2^N) of the quotient.
- * V the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- * Current estimate for non-large dividend is
- * ceil(log2(quotient) / N) * (10 + 7N/2) + C
- * A large dividend is one greater than 2^(31-TOPBITS) and takes a
- * different path, as the upper bits of the quotient must be developed
- * one bit at a time.
- */
-
-
-
#include <sysdep.h>
-#include <sys/trap.h>
ENTRY(.rem)
- ! compute sign of result; if neither is negative, no problem
- orcc %o1, %o0, %g0 ! either negative?
- bge 2f ! no, go do the divide
- mov %o0, %g3 ! sign of remainder matches %o0
- tst %o1
- bge 1f
- tst %o0
- ! %o1 is definitely negative; %o0 might also be negative
- bge 2f ! if %o0 not negative...
- sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
-1: ! %o0 is negative, %o1 is nonnegative
- sub %g0, %o0, %o0 ! make %o0 nonnegative
-2:
-
- ! Ready to divide. Compute size of quotient; scale comparand.
- orcc %o1, %g0, %o5
- bne 1f
- mov %o0, %o3
-
- ! Divide by zero trap. If it returns, return 0 (about as
- ! wrong as possible, but that is what SunOS does...).
- ta ST_DIV0
- retl
- clr %o0
-
-1:
- cmp %o3, %o5 ! if %o1 exceeds %o0, done
- blu LOC(got_result) ! (and algorithm fails otherwise)
- clr %o2
- sethi %hi(1 << (32 - 4 - 1)), %g1
- cmp %o3, %g1
- blu LOC(not_really_big)
- clr %o4
-
- ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
- ! as our usual N-at-a-shot divide step will cause overflow and havoc.
- ! The number of bits in the result here is N*ITER+SC, where SC <= N.
- ! Compute ITER in an unorthodox manner: know we need to shift V into
- ! the top decade: so do not even bother to compare to R.
- 1:
- cmp %o5, %g1
- bgeu 3f
- mov 1, %g2
- sll %o5, 4, %o5
- b 1b
- add %o4, 1, %o4
-
- ! Now compute %g2.
- 2: addcc %o5, %o5, %o5
- bcc LOC(not_too_big)
- add %g2, 1, %g2
-
- ! We get here if the %o1 overflowed while shifting.
- ! This means that %o3 has the high-order bit set.
- ! Restore %o5 and subtract from %o3.
- sll %g1, 4, %g1 ! high order bit
- srl %o5, 1, %o5 ! rest of %o5
- add %o5, %g1, %o5
- b LOC(do_single_div)
- sub %g2, 1, %g2
-
- LOC(not_too_big):
- 3: cmp %o5, %o3
- blu 2b
- nop
- be LOC(do_single_div)
- nop
- /* NB: these are commented out in the V8-Sparc manual as well */
- /* (I do not understand this) */
- ! %o5 > %o3: went too far: back up 1 step
- ! srl %o5, 1, %o5
- ! dec %g2
- ! do single-bit divide steps
- !
- ! We have to be careful here. We know that %o3 >= %o5, so we can do the
- ! first divide step without thinking. BUT, the others are conditional,
- ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
- ! order bit set in the first step, just falling into the regular
- ! division loop will mess up the first time around.
- ! So we unroll slightly...
- LOC(do_single_div):
- subcc %g2, 1, %g2
- bl LOC(end_regular_divide)
- nop
- sub %o3, %o5, %o3
- mov 1, %o2
- b LOC(end_single_divloop)
- nop
- LOC(single_divloop):
- sll %o2, 1, %o2
- bl 1f
- srl %o5, 1, %o5
- ! %o3 >= 0
- sub %o3, %o5, %o3
- b 2f
- add %o2, 1, %o2
- 1: ! %o3 < 0
- add %o3, %o5, %o3
- sub %o2, 1, %o2
- 2:
- LOC(end_single_divloop):
- subcc %g2, 1, %g2
- bge LOC(single_divloop)
- tst %o3
- b,a LOC(end_regular_divide)
-
-LOC(not_really_big):
-1:
- sll %o5, 4, %o5
- cmp %o5, %o3
- bleu 1b
- addcc %o4, 1, %o4
- be LOC(got_result)
- sub %o4, 1, %o4
-
- tst %o3 ! set up for initial iteration
-LOC(divloop):
- sll %o2, 4, %o2
- ! depth 1, accumulated bits 0
- bl LOC(1.16)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 2, accumulated bits 1
- bl LOC(2.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 3, accumulated bits 3
- bl LOC(3.19)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits 7
- bl LOC(4.23)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
-LOC(4.23):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
-LOC(3.19):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits 5
- bl LOC(4.21)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
-LOC(4.21):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
-LOC(2.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 3, accumulated bits 1
- bl LOC(3.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits 3
- bl LOC(4.19)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
-LOC(4.19):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
-LOC(3.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits 1
- bl LOC(4.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
-LOC(4.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
-LOC(1.16):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 2, accumulated bits -1
- bl LOC(2.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 3, accumulated bits -1
- bl LOC(3.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits -1
- bl LOC(4.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
-LOC(4.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
-LOC(3.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits -3
- bl LOC(4.13)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
-LOC(4.13):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
-LOC(2.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 3, accumulated bits -3
- bl LOC(3.13)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits -5
- bl LOC(4.11)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
-LOC(4.11):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
-LOC(3.13):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits -7
- bl LOC(4.9)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
-LOC(4.9):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
-9:
-LOC(end_regular_divide):
- subcc %o4, 1, %o4
- bge LOC(divloop)
- tst %o3
- bl,a LOC(got_result)
- ! non-restoring fixup here (one instruction only!)
- add %o3, %o1, %o3
-
-LOC(got_result):
- ! check to see if answer should be < 0
- tst %g3
- bl,a 1f
- sub %g0, %o3, %o3
-1:
+ sra %o0, 31, %o2
+ wr %o2, 0, %y
+ nop
+ nop
+ nop
+ sdivcc %o0, %o1, %o2
+ bvs,a 1f
+ xnor %o2, %g0, %o2
+1: smul %o2, %o1, %o2
retl
- mov %o3, %o0
+ sub %o0, %o2, %o0
END(.rem)
@@ -1,363 +1,20 @@
- /* This file is generated from divrem.m4; DO NOT EDIT! */
/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
+ * Sparc v8 has divide.
*/
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- * .div name of function to generate
- * div div=div => %o0 / %o1; div=rem => %o0 % %o1
- * true true=true => signed; true=false => unsigned
- *
- * Algorithm parameters:
- * N how many bits per iteration we try to get (4)
- * WORDSIZE total number of bits (32)
- *
- * Derived constants:
- * TOPBITS number of bits in the top decade of a number
- *
- * Important variables:
- * Q the partial quotient under development (initially 0)
- * R the remainder so far, initially the dividend
- * ITER number of main division loop iterations required;
- * equal to ceil(log2(quotient) / N). Note that this
- * is the log base (2^N) of the quotient.
- * V the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- * Current estimate for non-large dividend is
- * ceil(log2(quotient) / N) * (10 + 7N/2) + C
- * A large dividend is one greater than 2^(31-TOPBITS) and takes a
- * different path, as the upper bits of the quotient must be developed
- * one bit at a time.
- */
-
-
-
#include <sysdep.h>
-#include <sys/trap.h>
ENTRY(.div)
- ! compute sign of result; if neither is negative, no problem
- orcc %o1, %o0, %g0 ! either negative?
- bge 2f ! no, go do the divide
- xor %o1, %o0, %g3 ! compute sign in any case
- tst %o1
- bge 1f
- tst %o0
- ! %o1 is definitely negative; %o0 might also be negative
- bge 2f ! if %o0 not negative...
- sub %g0, %o1, %o1 ! in any case, make %o1 nonneg
-1: ! %o0 is negative, %o1 is nonnegative
- sub %g0, %o0, %o0 ! make %o0 nonnegative
-2:
-
- ! Ready to divide. Compute size of quotient; scale comparand.
- orcc %o1, %g0, %o5
- bne 1f
- mov %o0, %o3
-
- ! Divide by zero trap. If it returns, return 0 (about as
- ! wrong as possible, but that is what SunOS does...).
- ta ST_DIV0
- retl
- clr %o0
-
-1:
- cmp %o3, %o5 ! if %o1 exceeds %o0, done
- blu LOC(got_result) ! (and algorithm fails otherwise)
- clr %o2
- sethi %hi(1 << (32 - 4 - 1)), %g1
- cmp %o3, %g1
- blu LOC(not_really_big)
- clr %o4
-
- ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
- ! as our usual N-at-a-shot divide step will cause overflow and havoc.
- ! The number of bits in the result here is N*ITER+SC, where SC <= N.
- ! Compute ITER in an unorthodox manner: know we need to shift V into
- ! the top decade: so do not even bother to compare to R.
- 1:
- cmp %o5, %g1
- bgeu 3f
- mov 1, %g2
- sll %o5, 4, %o5
- b 1b
- add %o4, 1, %o4
-
- ! Now compute %g2.
- 2: addcc %o5, %o5, %o5
- bcc LOC(not_too_big)
- add %g2, 1, %g2
-
- ! We get here if the %o1 overflowed while shifting.
- ! This means that %o3 has the high-order bit set.
- ! Restore %o5 and subtract from %o3.
- sll %g1, 4, %g1 ! high order bit
- srl %o5, 1, %o5 ! rest of %o5
- add %o5, %g1, %o5
- b LOC(do_single_div)
- sub %g2, 1, %g2
-
- LOC(not_too_big):
- 3: cmp %o5, %o3
- blu 2b
- nop
- be LOC(do_single_div)
- nop
- /* NB: these are commented out in the V8-Sparc manual as well */
- /* (I do not understand this) */
- ! %o5 > %o3: went too far: back up 1 step
- ! srl %o5, 1, %o5
- ! dec %g2
- ! do single-bit divide steps
- !
- ! We have to be careful here. We know that %o3 >= %o5, so we can do the
- ! first divide step without thinking. BUT, the others are conditional,
- ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
- ! order bit set in the first step, just falling into the regular
- ! division loop will mess up the first time around.
- ! So we unroll slightly...
- LOC(do_single_div):
- subcc %g2, 1, %g2
- bl LOC(end_regular_divide)
- nop
- sub %o3, %o5, %o3
- mov 1, %o2
- b LOC(end_single_divloop)
- nop
- LOC(single_divloop):
- sll %o2, 1, %o2
- bl 1f
- srl %o5, 1, %o5
- ! %o3 >= 0
- sub %o3, %o5, %o3
- b 2f
- add %o2, 1, %o2
- 1: ! %o3 < 0
- add %o3, %o5, %o3
- sub %o2, 1, %o2
- 2:
- LOC(end_single_divloop):
- subcc %g2, 1, %g2
- bge LOC(single_divloop)
- tst %o3
- b,a LOC(end_regular_divide)
-
-LOC(not_really_big):
-1:
- sll %o5, 4, %o5
- cmp %o5, %o3
- bleu 1b
- addcc %o4, 1, %o4
- be LOC(got_result)
- sub %o4, 1, %o4
-
- tst %o3 ! set up for initial iteration
-LOC(divloop):
- sll %o2, 4, %o2
- ! depth 1, accumulated bits 0
- bl LOC(1.16)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 2, accumulated bits 1
- bl LOC(2.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 3, accumulated bits 3
- bl LOC(3.19)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits 7
- bl LOC(4.23)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
-LOC(4.23):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
-LOC(3.19):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits 5
- bl LOC(4.21)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
-LOC(4.21):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
-LOC(2.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 3, accumulated bits 1
- bl LOC(3.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits 3
- bl LOC(4.19)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
-LOC(4.19):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
-LOC(3.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits 1
- bl LOC(4.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
-LOC(4.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
-LOC(1.16):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 2, accumulated bits -1
- bl LOC(2.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 3, accumulated bits -1
- bl LOC(3.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits -1
- bl LOC(4.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
-LOC(4.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
-LOC(3.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits -3
- bl LOC(4.13)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
-LOC(4.13):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
-LOC(2.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 3, accumulated bits -3
- bl LOC(3.13)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits -5
- bl LOC(4.11)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
-LOC(4.11):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
-LOC(3.13):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits -7
- bl LOC(4.9)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
-LOC(4.9):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
-9:
-LOC(end_regular_divide):
- subcc %o4, 1, %o4
- bge LOC(divloop)
- tst %o3
- bl,a LOC(got_result)
- ! non-restoring fixup here (one instruction only!)
- sub %o2, 1, %o2
-
-LOC(got_result):
- ! check to see if answer should be < 0
- tst %g3
- bl,a 1f
- sub %g0, %o2, %o2
-1:
- retl
- mov %o2, %o0
+ sra %o0, 31, %o2
+ wr %o2, 0, %y
+ nop
+ nop
+ nop
+ sdivcc %o0, %o1, %o0
+ bvs,a 1f
+ xnor %o0, %g0, %o0
+1: retl
+ nop
END(.div)
deleted file mode 100644
@@ -1 +0,0 @@
-sysdep-CFLAGS += -mcpu=v8
deleted file mode 100644
@@ -1,118 +0,0 @@
-! SPARC v8 __mpn_addmul_1 -- Multiply a limb vector with a limb and
-! add the result to a second limb vector.
-
-! Copyright (C) 1992-2019 Free Software Foundation, Inc.
-
-! This file is part of the GNU MP Library.
-
-! The GNU MP Library is free software; you can redistribute it and/or modify
-! it under the terms of the GNU Lesser General Public License as published by
-! the Free Software Foundation; either version 2.1 of the License, or (at your
-! option) any later version.
-
-! The GNU MP Library is distributed in the hope that it will be useful, but
-! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-! License for more details.
-
-! You should have received a copy of the GNU Lesser General Public License
-! along with the GNU MP Library; see the file COPYING.LIB. If not,
-! see <https://www.gnu.org/licenses/>.
-
-
-! INPUT PARAMETERS
-! res_ptr o0
-! s1_ptr o1
-! size o2
-! s2_limb o3
-
-#include <sysdep.h>
-
-ENTRY(__mpn_addmul_1)
- ld [%o1+0],%o4 ! 1
- sll %o2,4,%g1
- orcc %g0,%g0,%g2
- mov %o7,%g4 ! Save return address register
- and %g1,(4-1)<<4,%g1
-1: call 2f
- add %o7,3f-1b,%g3
-2: jmp %g3+%g1
- mov %g4,%o7 ! Restore return address register
-
- .align 4
-3:
-LOC(00):
- add %o0,-4,%o0
- b LOC(loop00) /* 4, 8, 12, ... */
- add %o1,-4,%o1
- nop
-LOC(01):
- b LOC(loop01) /* 1, 5, 9, ... */
- nop
- nop
- nop
-LOC(10):
- add %o0,-12,%o0 /* 2, 6, 10, ... */
- b LOC(loop10)
- add %o1,4,%o1
- nop
-LOC(11):
- add %o0,-8,%o0 /* 3, 7, 11, ... */
- b LOC(loop11)
- add %o1,-8,%o1
- nop
-
-LOC(loop):
- addcc %g3,%g2,%g3 ! 1
- ld [%o1+4],%o4 ! 2
- rd %y,%g2 ! 1
- addx %g0,%g2,%g2
- ld [%o0+0],%g1 ! 2
- addcc %g1,%g3,%g3
- st %g3,[%o0+0] ! 1
-LOC(loop00):
- umul %o4,%o3,%g3 ! 2
- ld [%o0+4],%g1 ! 2
- addxcc %g3,%g2,%g3 ! 2
- ld [%o1+8],%o4 ! 3
- rd %y,%g2 ! 2
- addx %g0,%g2,%g2
- nop
- addcc %g1,%g3,%g3
- st %g3,[%o0+4] ! 2
-LOC(loop11):
- umul %o4,%o3,%g3 ! 3
- addxcc %g3,%g2,%g3 ! 3
- ld [%o1+12],%o4 ! 4
- rd %y,%g2 ! 3
- add %o1,16,%o1
- addx %g0,%g2,%g2
- ld [%o0+8],%g1 ! 2
- addcc %g1,%g3,%g3
- st %g3,[%o0+8] ! 3
-LOC(loop10):
- umul %o4,%o3,%g3 ! 4
- addxcc %g3,%g2,%g3 ! 4
- ld [%o1+0],%o4 ! 1
- rd %y,%g2 ! 4
- addx %g0,%g2,%g2
- ld [%o0+12],%g1 ! 2
- addcc %g1,%g3,%g3
- st %g3,[%o0+12] ! 4
- add %o0,16,%o0
- addx %g0,%g2,%g2
-LOC(loop01):
- addcc %o2,-4,%o2
- bg LOC(loop)
- umul %o4,%o3,%g3 ! 1
-
- addcc %g3,%g2,%g3 ! 4
- rd %y,%g2 ! 4
- addx %g0,%g2,%g2
- ld [%o0+0],%g1 ! 2
- addcc %g1,%g3,%g3
- st %g3,[%o0+0] ! 4
- retl
- addx %g0,%g2,%o0
-
-END(__mpn_addmul_1)
deleted file mode 100644
@@ -1,13 +0,0 @@
-/*
- * Sparc v8 has multiply.
- */
-
-#include <sysdep.h>
-
-ENTRY(.mul)
-
- smul %o0, %o1, %o0
- retl
- rd %y, %o1
-
-END(.mul)
deleted file mode 100644
@@ -1,102 +0,0 @@
-! SPARC v8 __mpn_mul_1 -- Multiply a limb vector with a single limb and
-! store the product in a second limb vector.
-
-! Copyright (C) 1992-2019 Free Software Foundation, Inc.
-
-! This file is part of the GNU MP Library.
-
-! The GNU MP Library is free software; you can redistribute it and/or modify
-! it under the terms of the GNU Lesser General Public License as published by
-! the Free Software Foundation; either version 2.1 of the License, or (at your
-! option) any later version.
-
-! The GNU MP Library is distributed in the hope that it will be useful, but
-! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-! License for more details.
-
-! You should have received a copy of the GNU Lesser General Public License
-! along with the GNU MP Library; see the file COPYING.LIB. If not,
-! see <https://www.gnu.org/licenses/>.
-
-
-! INPUT PARAMETERS
-! res_ptr o0
-! s1_ptr o1
-! size o2
-! s2_limb o3
-
-#include <sysdep.h>
-
-ENTRY(__mpn_mul_1)
- sll %o2,4,%g1
- mov %o7,%g4 ! Save return address register
- and %g1,(4-1)<<4,%g1
-1: call 2f
- add %o7,3f-1b,%g3
-2: mov %g4,%o7 ! Restore return address register
- jmp %g3+%g1
- ld [%o1+0],%o4 ! 1
-
- .align 4
-3:
-LOC(00):
- add %o0,-4,%o0
- add %o1,-4,%o1
- b LOC(loop00) /* 4, 8, 12, ... */
- orcc %g0,%g0,%g2
-LOC(01):
- b LOC(loop01) /* 1, 5, 9, ... */
- orcc %g0,%g0,%g2
- nop
- nop
-LOC(10):
- add %o0,-12,%o0 /* 2, 6, 10, ... */
- add %o1,4,%o1
- b LOC(loop10)
- orcc %g0,%g0,%g2
- nop
-LOC(11):
- add %o0,-8,%o0 /* 3, 7, 11, ... */
- add %o1,-8,%o1
- b LOC(loop11)
- orcc %g0,%g0,%g2
-
-LOC(loop):
- addcc %g3,%g2,%g3 ! 1
- ld [%o1+4],%o4 ! 2
- st %g3,[%o0+0] ! 1
- rd %y,%g2 ! 1
-LOC(loop00):
- umul %o4,%o3,%g3 ! 2
- addxcc %g3,%g2,%g3 ! 2
- ld [%o1+8],%o4 ! 3
- st %g3,[%o0+4] ! 2
- rd %y,%g2 ! 2
-LOC(loop11):
- umul %o4,%o3,%g3 ! 3
- addxcc %g3,%g2,%g3 ! 3
- ld [%o1+12],%o4 ! 4
- add %o1,16,%o1
- st %g3,[%o0+8] ! 3
- rd %y,%g2 ! 3
-LOC(loop10):
- umul %o4,%o3,%g3 ! 4
- addxcc %g3,%g2,%g3 ! 4
- ld [%o1+0],%o4 ! 1
- st %g3,[%o0+12] ! 4
- add %o0,16,%o0
- rd %y,%g2 ! 4
- addx %g0,%g2,%g2
-LOC(loop01):
- addcc %o2,-4,%o2
- bg LOC(loop)
- umul %o4,%o3,%g3 ! 1
-
- addcc %g3,%g2,%g3 ! 4
- st %g3,[%o0+0] ! 4
- rd %y,%g2 ! 4
- retl
- addx %g0,%g2,%o0
-
-END(__mpn_mul_1)
deleted file mode 100644
@@ -1,21 +0,0 @@
-/*
- * Sparc v8 has divide.
- */
-
-#include <sysdep.h>
-
-ENTRY(.rem)
-
- sra %o0, 31, %o2
- wr %o2, 0, %y
- nop
- nop
- nop
- sdivcc %o0, %o1, %o2
- bvs,a 1f
- xnor %o2, %g0, %o2
-1: smul %o2, %o1, %o2
- retl
- sub %o0, %o2, %o0
-
-END(.rem)
deleted file mode 100644
@@ -1,20 +0,0 @@
-/*
- * Sparc v8 has divide.
- */
-
-#include <sysdep.h>
-
-ENTRY(.div)
-
- sra %o0, 31, %o2
- wr %o2, 0, %y
- nop
- nop
- nop
- sdivcc %o0, %o1, %o0
- bvs,a 1f
- xnor %o0, %g0, %o0
-1: retl
- nop
-
-END(.div)
deleted file mode 100644
@@ -1,57 +0,0 @@
-! SPARC v8 __mpn_submul_1 -- Multiply a limb vector with a limb and
-! subtract the result from a second limb vector.
-
-! Copyright (C) 1992-2019 Free Software Foundation, Inc.
-
-! This file is part of the GNU MP Library.
-
-! The GNU MP Library is free software; you can redistribute it and/or modify
-! it under the terms of the GNU Lesser General Public License as published by
-! the Free Software Foundation; either version 2.1 of the License, or (at your
-! option) any later version.
-
-! The GNU MP Library is distributed in the hope that it will be useful, but
-! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-! License for more details.
-
-! You should have received a copy of the GNU Lesser General Public License
-! along with the GNU MP Library; see the file COPYING.LIB. If not,
-! see <https://www.gnu.org/licenses/>.
-
-
-! INPUT PARAMETERS
-! res_ptr o0
-! s1_ptr o1
-! size o2
-! s2_limb o3
-
-#include <sysdep.h>
-
-ENTRY(__mpn_submul_1)
- sub %g0,%o2,%o2 ! negate ...
- sll %o2,2,%o2 ! ... and scale size
- sub %o1,%o2,%o1 ! o1 is offset s1_ptr
- sub %o0,%o2,%g1 ! g1 is offset res_ptr
-
- mov 0,%o0 ! clear cy_limb
-
-LOC(loop):
- ld [%o1+%o2],%o4
- ld [%g1+%o2],%g2
- umul %o4,%o3,%o5
- rd %y,%g3
- addcc %o5,%o0,%o5
- addx %g3,0,%o0
- subcc %g2,%o5,%g2
- addx %o0,0,%o0
- st %g2,[%g1+%o2]
-
- addcc %o2,4,%o2
- bne LOC(loop)
- nop
-
- retl
- nop
-
-END(__mpn_submul_1)
deleted file mode 100644
@@ -1,16 +0,0 @@
-/*
- * Sparc v8 has divide.
- */
-
-#include <sysdep.h>
-
-ENTRY(.udiv)
-
- wr %g0, 0, %y
- nop
- nop
- retl
- udiv %o0, %o1, %o0
-
-END(.udiv)
-strong_alias (.udiv, __wrap_.udiv)
deleted file mode 100644
@@ -1,13 +0,0 @@
-/*
- * Sparc v8 has multiply.
- */
-
-#include <sysdep.h>
-
-ENTRY(.umul)
-
- umul %o0, %o1, %o0
- retl
- rd %y, %o1
-
-END(.umul)
deleted file mode 100644
@@ -1,18 +0,0 @@
-/*
- * Sparc v8 has divide.
- */
-
-#include <sysdep.h>
-
-ENTRY(.urem)
-
- wr %g0, 0, %y
- nop
- nop
- nop
- udiv %o0, %o1, %o2
- umul %o2, %o1, %o2
- retl
- sub %o0, %o2, %o0
-
-END(.urem)
@@ -1,146 +1,57 @@
-! SPARC __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
-! the result from a second limb vector.
-!
+! SPARC v8 __mpn_submul_1 -- Multiply a limb vector with a limb and
+! subtract the result from a second limb vector.
+
! Copyright (C) 1992-2019 Free Software Foundation, Inc.
-!
+
! This file is part of the GNU MP Library.
-!
+
! The GNU MP Library is free software; you can redistribute it and/or modify
! it under the terms of the GNU Lesser General Public License as published by
! the Free Software Foundation; either version 2.1 of the License, or (at your
! option) any later version.
-!
+
! The GNU MP Library is distributed in the hope that it will be useful, but
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
! License for more details.
-!
+
! You should have received a copy of the GNU Lesser General Public License
! along with the GNU MP Library; see the file COPYING.LIB. If not,
! see <https://www.gnu.org/licenses/>.
! INPUT PARAMETERS
-! RES_PTR o0
-! S1_PTR o1
-! SIZE o2
-! S2_LIMB o3
+! res_ptr o0
+! s1_ptr o1
+! size o2
+! s2_limb o3
#include <sysdep.h>
ENTRY(__mpn_submul_1)
- ! Make S1_PTR and RES_PTR point at the end of their blocks
- ! and put (- 4 x SIZE) in index/loop counter.
- sll %o2,2,%o2
- add %o0,%o2,%o4 ! RES_PTR in o4 since o0 is retval
- add %o1,%o2,%o1
- sub %g0,%o2,%o2
-
- cmp %o3,0xfff
- bgu LOC(large)
- nop
+ sub %g0,%o2,%o2 ! negate ...
+ sll %o2,2,%o2 ! ... and scale size
+ sub %o1,%o2,%o1 ! o1 is offset s1_ptr
+ sub %o0,%o2,%g1 ! g1 is offset res_ptr
- ld [%o1+%o2],%o5
- mov 0,%o0
- b LOC(0)
- add %o4,-4,%o4
-LOC(loop0):
- subcc %o5,%g1,%g1
- ld [%o1+%o2],%o5
- addx %o0,%g0,%o0
- st %g1,[%o4+%o2]
-LOC(0): wr %g0,%o3,%y
- sra %o5,31,%g2
- and %o3,%g2,%g2
- andcc %g1,0,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,%o5,%g1
- mulscc %g1,0,%g1
- sra %g1,20,%g4
- sll %g1,12,%g1
- rd %y,%g3
- srl %g3,20,%g3
- or %g1,%g3,%g1
+ mov 0,%o0 ! clear cy_limb
- addcc %g1,%o0,%g1
- addx %g2,%g4,%o0 ! add sign-compensation and cy to hi limb
- addcc %o2,4,%o2 ! loop counter
- bne LOC(loop0)
- ld [%o4+%o2],%o5
-
- subcc %o5,%g1,%g1
- addx %o0,%g0,%o0
- retl
- st %g1,[%o4+%o2]
-
-
-LOC(large):
- ld [%o1+%o2],%o5
- mov 0,%o0
- sra %o3,31,%g4 ! g4 = mask of ones iff S2_LIMB < 0
- b LOC(1)
- add %o4,-4,%o4
LOC(loop):
- subcc %o5,%g3,%g3
- ld [%o1+%o2],%o5
- addx %o0,%g0,%o0
- st %g3,[%o4+%o2]
-LOC(1): wr %g0,%o5,%y
- and %o5,%g4,%g2
- andcc %g0,%g0,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%o3,%g1
- mulscc %g1,%g0,%g1
+ ld [%o1+%o2],%o4
+ ld [%g1+%o2],%g2
+ umul %o4,%o3,%o5
rd %y,%g3
- addcc %g3,%o0,%g3
- addx %g2,%g1,%o0
+ addcc %o5,%o0,%o5
+ addx %g3,0,%o0
+ subcc %g2,%o5,%g2
+ addx %o0,0,%o0
+ st %g2,[%g1+%o2]
+
addcc %o2,4,%o2
bne LOC(loop)
- ld [%o4+%o2],%o5
+ nop
- subcc %o5,%g3,%g3
- addx %o0,%g0,%o0
retl
- st %g3,[%o4+%o2]
+ nop
END(__mpn_submul_1)
@@ -1,347 +1,16 @@
- /* This file is generated from divrem.m4; DO NOT EDIT! */
/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
+ * Sparc v8 has divide.
*/
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- * .udiv name of function to generate
- * div div=div => %o0 / %o1; div=rem => %o0 % %o1
- * false false=true => signed; false=false => unsigned
- *
- * Algorithm parameters:
- * N how many bits per iteration we try to get (4)
- * WORDSIZE total number of bits (32)
- *
- * Derived constants:
- * TOPBITS number of bits in the top decade of a number
- *
- * Important variables:
- * Q the partial quotient under development (initially 0)
- * R the remainder so far, initially the dividend
- * ITER number of main division loop iterations required;
- * equal to ceil(log2(quotient) / N). Note that this
- * is the log base (2^N) of the quotient.
- * V the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- * Current estimate for non-large dividend is
- * ceil(log2(quotient) / N) * (10 + 7N/2) + C
- * A large dividend is one greater than 2^(31-TOPBITS) and takes a
- * different path, as the upper bits of the quotient must be developed
- * one bit at a time.
- */
-
-
-
#include <sysdep.h>
-#include <sys/trap.h>
ENTRY(.udiv)
- ! Ready to divide. Compute size of quotient; scale comparand.
- orcc %o1, %g0, %o5
- bne 1f
- mov %o0, %o3
-
- ! Divide by zero trap. If it returns, return 0 (about as
- ! wrong as possible, but that is what SunOS does...).
- ta ST_DIV0
- retl
- clr %o0
-
-1:
- cmp %o3, %o5 ! if %o1 exceeds %o0, done
- blu LOC(got_result) ! (and algorithm fails otherwise)
- clr %o2
- sethi %hi(1 << (32 - 4 - 1)), %g1
- cmp %o3, %g1
- blu LOC(not_really_big)
- clr %o4
-
- ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
- ! as our usual N-at-a-shot divide step will cause overflow and havoc.
- ! The number of bits in the result here is N*ITER+SC, where SC <= N.
- ! Compute ITER in an unorthodox manner: know we need to shift V into
- ! the top decade: so do not even bother to compare to R.
- 1:
- cmp %o5, %g1
- bgeu 3f
- mov 1, %g2
- sll %o5, 4, %o5
- b 1b
- add %o4, 1, %o4
-
- ! Now compute %g2.
- 2: addcc %o5, %o5, %o5
- bcc LOC(not_too_big)
- add %g2, 1, %g2
-
- ! We get here if the %o1 overflowed while shifting.
- ! This means that %o3 has the high-order bit set.
- ! Restore %o5 and subtract from %o3.
- sll %g1, 4, %g1 ! high order bit
- srl %o5, 1, %o5 ! rest of %o5
- add %o5, %g1, %o5
- b LOC(do_single_div)
- sub %g2, 1, %g2
-
- LOC(not_too_big):
- 3: cmp %o5, %o3
- blu 2b
- nop
- be LOC(do_single_div)
- nop
- /* NB: these are commented out in the V8-Sparc manual as well */
- /* (I do not understand this) */
- ! %o5 > %o3: went too far: back up 1 step
- ! srl %o5, 1, %o5
- ! dec %g2
- ! do single-bit divide steps
- !
- ! We have to be careful here. We know that %o3 >= %o5, so we can do the
- ! first divide step without thinking. BUT, the others are conditional,
- ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
- ! order bit set in the first step, just falling into the regular
- ! division loop will mess up the first time around.
- ! So we unroll slightly...
- LOC(do_single_div):
- subcc %g2, 1, %g2
- bl LOC(end_regular_divide)
- nop
- sub %o3, %o5, %o3
- mov 1, %o2
- b LOC(end_single_divloop)
- nop
- LOC(single_divloop):
- sll %o2, 1, %o2
- bl 1f
- srl %o5, 1, %o5
- ! %o3 >= 0
- sub %o3, %o5, %o3
- b 2f
- add %o2, 1, %o2
- 1: ! %o3 < 0
- add %o3, %o5, %o3
- sub %o2, 1, %o2
- 2:
- LOC(end_single_divloop):
- subcc %g2, 1, %g2
- bge LOC(single_divloop)
- tst %o3
- b,a LOC(end_regular_divide)
-
-LOC(not_really_big):
-1:
- sll %o5, 4, %o5
- cmp %o5, %o3
- bleu 1b
- addcc %o4, 1, %o4
- be LOC(got_result)
- sub %o4, 1, %o4
-
- tst %o3 ! set up for initial iteration
-LOC(divloop):
- sll %o2, 4, %o2
- ! depth 1, accumulated bits 0
- bl LOC(1.16)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 2, accumulated bits 1
- bl LOC(2.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 3, accumulated bits 3
- bl LOC(3.19)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits 7
- bl LOC(4.23)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
-LOC(4.23):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
-LOC(3.19):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits 5
- bl LOC(4.21)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
-LOC(4.21):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
-LOC(2.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 3, accumulated bits 1
- bl LOC(3.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits 3
- bl LOC(4.19)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
-LOC(4.19):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
-LOC(3.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits 1
- bl LOC(4.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
-LOC(4.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
-LOC(1.16):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 2, accumulated bits -1
- bl LOC(2.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 3, accumulated bits -1
- bl LOC(3.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits -1
- bl LOC(4.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
-LOC(4.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
-LOC(3.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits -3
- bl LOC(4.13)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
-LOC(4.13):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
-LOC(2.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 3, accumulated bits -3
- bl LOC(3.13)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits -5
- bl LOC(4.11)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
-LOC(4.11):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
-LOC(3.13):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits -7
- bl LOC(4.9)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
-LOC(4.9):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
-9:
-LOC(end_regular_divide):
- subcc %o4, 1, %o4
- bge LOC(divloop)
- tst %o3
- bl,a LOC(got_result)
- ! non-restoring fixup here (one instruction only!)
- sub %o2, 1, %o2
-
-
-LOC(got_result):
-
+ wr %g0, 0, %y
+ nop
+ nop
retl
- mov %o2, %o0
+ udiv %o0, %o1, %o0
END(.udiv)
strong_alias (.udiv, __wrap_.udiv)
@@ -1,155 +1,13 @@
/*
- * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
- * upper 32 bits of the 64-bit product).
- *
- * This code optimizes short (less than 13-bit) multiplies. Short
- * multiplies require 25 instruction cycles, and long ones require
- * 45 instruction cycles.
- *
- * On return, overflow has occurred (%o1 is not zero) if and only if
- * the Z condition code is clear, allowing, e.g., the following:
- *
- * call .umul
- * nop
- * bnz overflow (or tnz)
+ * Sparc v8 has multiply.
*/
#include <sysdep.h>
ENTRY(.umul)
- or %o0, %o1, %o4
- mov %o0, %y ! multiplier -> Y
- andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
- be LOC(mul_shortway) ! if zero, can do it the short way
- andcc %g0, %g0, %o4 ! zero the partial product; clear N & V
- /*
- * Long multiply. 32 steps, followed by a final shift step.
- */
- mulscc %o4, %o1, %o4 ! 1
- mulscc %o4, %o1, %o4 ! 2
- mulscc %o4, %o1, %o4 ! 3
- mulscc %o4, %o1, %o4 ! 4
- mulscc %o4, %o1, %o4 ! 5
- mulscc %o4, %o1, %o4 ! 6
- mulscc %o4, %o1, %o4 ! 7
- mulscc %o4, %o1, %o4 ! 8
- mulscc %o4, %o1, %o4 ! 9
- mulscc %o4, %o1, %o4 ! 10
- mulscc %o4, %o1, %o4 ! 11
- mulscc %o4, %o1, %o4 ! 12
- mulscc %o4, %o1, %o4 ! 13
- mulscc %o4, %o1, %o4 ! 14
- mulscc %o4, %o1, %o4 ! 15
- mulscc %o4, %o1, %o4 ! 16
- mulscc %o4, %o1, %o4 ! 17
- mulscc %o4, %o1, %o4 ! 18
- mulscc %o4, %o1, %o4 ! 19
- mulscc %o4, %o1, %o4 ! 20
- mulscc %o4, %o1, %o4 ! 21
- mulscc %o4, %o1, %o4 ! 22
- mulscc %o4, %o1, %o4 ! 23
- mulscc %o4, %o1, %o4 ! 24
- mulscc %o4, %o1, %o4 ! 25
- mulscc %o4, %o1, %o4 ! 26
- mulscc %o4, %o1, %o4 ! 27
- mulscc %o4, %o1, %o4 ! 28
- mulscc %o4, %o1, %o4 ! 29
- mulscc %o4, %o1, %o4 ! 30
- mulscc %o4, %o1, %o4 ! 31
- mulscc %o4, %o1, %o4 ! 32
- mulscc %o4, %g0, %o4 ! final shift
-
- /*
- * Normally, with the shift-and-add approach, if both numbers are
- * positive you get the correct result. With 32-bit two's-complement
- * numbers, -x is represented as
- *
- * x 32
- * ( 2 - ------ ) mod 2 * 2
- * 32
- * 2
- *
- * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,
- * we can treat this as if the radix point were just to the left
- * of the sign bit (multiply by 2^32), and get
- *
- * -x = (2 - x) mod 2
- *
- * Then, ignoring the `mod 2's for convenience:
- *
- * x * y = xy
- * -x * y = 2y - xy
- * x * -y = 2x - xy
- * -x * -y = 4 - 2x - 2y + xy
- *
- * For signed multiplies, we subtract (x << 32) from the partial
- * product to fix this problem for negative multipliers (see mul.s).
- * Because of the way the shift into the partial product is calculated
- * (N xor V), this term is automatically removed for the multiplicand,
- * so we don't have to adjust.
- *
- * But for unsigned multiplies, the high order bit wasn't a sign bit,
- * and the correction is wrong. So for unsigned multiplies where the
- * high order bit is one, we end up with xy - (y << 32). To fix it
- * we add y << 32.
- */
-#if 0
- tst %o1
- bl,a 1f ! if %o1 < 0 (high order bit = 1),
- add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half)
-1: rd %y, %o0 ! get lower half of product
- retl
- addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0
-#else
- /* Faster code from tege@sics.se. */
- sra %o1, 31, %o2 ! make mask from sign bit
- and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1
- rd %y, %o0 ! get lower half of product
- retl
- addcc %o4, %o2, %o1 ! add compensation and put upper half in place
-#endif
-
-LOC(mul_shortway):
- /*
- * Short multiply. 12 steps, followed by a final shift step.
- * The resulting bits are off by 12 and (32-12) = 20 bit positions,
- * but there is no problem with %o0 being negative (unlike above),
- * and overflow is impossible (the answer is at most 24 bits long).
- */
- mulscc %o4, %o1, %o4 ! 1
- mulscc %o4, %o1, %o4 ! 2
- mulscc %o4, %o1, %o4 ! 3
- mulscc %o4, %o1, %o4 ! 4
- mulscc %o4, %o1, %o4 ! 5
- mulscc %o4, %o1, %o4 ! 6
- mulscc %o4, %o1, %o4 ! 7
- mulscc %o4, %o1, %o4 ! 8
- mulscc %o4, %o1, %o4 ! 9
- mulscc %o4, %o1, %o4 ! 10
- mulscc %o4, %o1, %o4 ! 11
- mulscc %o4, %o1, %o4 ! 12
- mulscc %o4, %g0, %o4 ! final shift
-
- /*
- * %o4 has 20 of the bits that should be in the result; %y has
- * the bottom 12 (as %y's top 12). That is:
- *
- * %o4 %y
- * +----------------+----------------+
- * | -12- | -20- | -12- | -20- |
- * +------(---------+------)---------+
- * -----result-----
- *
- * The 12 bits of %o4 left of the `result' area are all zero;
- * in fact, all top 20 bits of %o4 are zero.
- */
-
- rd %y, %o5
- sll %o4, 12, %o0 ! shift middle bits left 12
- srl %o5, 20, %o5 ! shift low bits right 20
- or %o5, %o0, %o0
+ umul %o0, %o1, %o0
retl
- addcc %g0, %g0, %o1 ! %o1 = zero, and set Z
+ rd %y, %o1
END(.umul)
@@ -1,346 +1,18 @@
- /* This file is generated from divrem.m4; DO NOT EDIT! */
/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
+ * Sparc v8 has divide.
*/
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- * .urem name of function to generate
- * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1
- * false false=true => signed; false=false => unsigned
- *
- * Algorithm parameters:
- * N how many bits per iteration we try to get (4)
- * WORDSIZE total number of bits (32)
- *
- * Derived constants:
- * TOPBITS number of bits in the top decade of a number
- *
- * Important variables:
- * Q the partial quotient under development (initially 0)
- * R the remainder so far, initially the dividend
- * ITER number of main division loop iterations required;
- * equal to ceil(log2(quotient) / N). Note that this
- * is the log base (2^N) of the quotient.
- * V the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- * Current estimate for non-large dividend is
- * ceil(log2(quotient) / N) * (10 + 7N/2) + C
- * A large dividend is one greater than 2^(31-TOPBITS) and takes a
- * different path, as the upper bits of the quotient must be developed
- * one bit at a time.
- */
-
-
-
#include <sysdep.h>
-#include <sys/trap.h>
ENTRY(.urem)
- ! Ready to divide. Compute size of quotient; scale comparand.
- orcc %o1, %g0, %o5
- bne 1f
- mov %o0, %o3
-
- ! Divide by zero trap. If it returns, return 0 (about as
- ! wrong as possible, but that is what SunOS does...).
- ta ST_DIV0
- retl
- clr %o0
-
-1:
- cmp %o3, %o5 ! if %o1 exceeds %o0, done
- blu LOC(got_result) ! (and algorithm fails otherwise)
- clr %o2
- sethi %hi(1 << (32 - 4 - 1)), %g1
- cmp %o3, %g1
- blu LOC(not_really_big)
- clr %o4
-
- ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
- ! as our usual N-at-a-shot divide step will cause overflow and havoc.
- ! The number of bits in the result here is N*ITER+SC, where SC <= N.
- ! Compute ITER in an unorthodox manner: know we need to shift V into
- ! the top decade: so do not even bother to compare to R.
- 1:
- cmp %o5, %g1
- bgeu 3f
- mov 1, %g2
- sll %o5, 4, %o5
- b 1b
- add %o4, 1, %o4
-
- ! Now compute %g2.
- 2: addcc %o5, %o5, %o5
- bcc LOC(not_too_big)
- add %g2, 1, %g2
-
- ! We get here if the %o1 overflowed while shifting.
- ! This means that %o3 has the high-order bit set.
- ! Restore %o5 and subtract from %o3.
- sll %g1, 4, %g1 ! high order bit
- srl %o5, 1, %o5 ! rest of %o5
- add %o5, %g1, %o5
- b LOC(do_single_div)
- sub %g2, 1, %g2
-
- LOC(not_too_big):
- 3: cmp %o5, %o3
- blu 2b
- nop
- be LOC(do_single_div)
- nop
- /* NB: these are commented out in the V8-Sparc manual as well */
- /* (I do not understand this) */
- ! %o5 > %o3: went too far: back up 1 step
- ! srl %o5, 1, %o5
- ! dec %g2
- ! do single-bit divide steps
- !
- ! We have to be careful here. We know that %o3 >= %o5, so we can do the
- ! first divide step without thinking. BUT, the others are conditional,
- ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
- ! order bit set in the first step, just falling into the regular
- ! division loop will mess up the first time around.
- ! So we unroll slightly...
- LOC(do_single_div):
- subcc %g2, 1, %g2
- bl LOC(end_regular_divide)
- nop
- sub %o3, %o5, %o3
- mov 1, %o2
- b LOC(end_single_divloop)
- nop
- LOC(single_divloop):
- sll %o2, 1, %o2
- bl 1f
- srl %o5, 1, %o5
- ! %o3 >= 0
- sub %o3, %o5, %o3
- b 2f
- add %o2, 1, %o2
- 1: ! %o3 < 0
- add %o3, %o5, %o3
- sub %o2, 1, %o2
- 2:
- LOC(end_single_divloop):
- subcc %g2, 1, %g2
- bge LOC(single_divloop)
- tst %o3
- b,a LOC(end_regular_divide)
-
-LOC(not_really_big):
-1:
- sll %o5, 4, %o5
- cmp %o5, %o3
- bleu 1b
- addcc %o4, 1, %o4
- be LOC(got_result)
- sub %o4, 1, %o4
-
- tst %o3 ! set up for initial iteration
-LOC(divloop):
- sll %o2, 4, %o2
- ! depth 1, accumulated bits 0
- bl LOC(1.16)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 2, accumulated bits 1
- bl LOC(2.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 3, accumulated bits 3
- bl LOC(3.19)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits 7
- bl LOC(4.23)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2+1), %o2
-
-LOC(4.23):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (7*2-1), %o2
-
-
-LOC(3.19):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits 5
- bl LOC(4.21)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2+1), %o2
-
-LOC(4.21):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (5*2-1), %o2
-
-
-
-LOC(2.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 3, accumulated bits 1
- bl LOC(3.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits 3
- bl LOC(4.19)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2+1), %o2
-
-LOC(4.19):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (3*2-1), %o2
-
-
-LOC(3.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits 1
- bl LOC(4.17)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2+1), %o2
-
-LOC(4.17):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (1*2-1), %o2
-
-
-
-
-LOC(1.16):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 2, accumulated bits -1
- bl LOC(2.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 3, accumulated bits -1
- bl LOC(3.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits -1
- bl LOC(4.15)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2+1), %o2
-
-LOC(4.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-1*2-1), %o2
-
-
-LOC(3.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits -3
- bl LOC(4.13)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2+1), %o2
-
-LOC(4.13):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-3*2-1), %o2
-
-
-
-LOC(2.15):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 3, accumulated bits -3
- bl LOC(3.13)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- ! depth 4, accumulated bits -5
- bl LOC(4.11)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2+1), %o2
-
-LOC(4.11):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-5*2-1), %o2
-
-
-LOC(3.13):
- ! remainder is negative
- addcc %o3,%o5,%o3
- ! depth 4, accumulated bits -7
- bl LOC(4.9)
- srl %o5,1,%o5
- ! remainder is positive
- subcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2+1), %o2
-
-LOC(4.9):
- ! remainder is negative
- addcc %o3,%o5,%o3
- b 9f
- add %o2, (-7*2-1), %o2
-
-
-
-
-9:
-LOC(end_regular_divide):
- subcc %o4, 1, %o4
- bge LOC(divloop)
- tst %o3
- bl,a LOC(got_result)
- ! non-restoring fixup here (one instruction only!)
- add %o3, %o1, %o3
-
-
-LOC(got_result):
-
+ wr %g0, 0, %y
+ nop
+ nop
+ nop
+ udiv %o0, %o1, %o2
+ umul %o2, %o1, %o2
retl
- mov %o3, %o0
+ sub %o0, %o2, %o0
END(.urem)