diff mbox

[libstdc++] Optimize selection sampling by using generator to get two values at once

Message ID 58074F89.4050403@eelis.net
State New
Headers show

Commit Message

Eelis van der Weegen Oct. 19, 2016, 10:48 a.m. UTC
This is the same optimization as was recently applied to std::shuffle.

It reduces the runtime of the following program by 20% on my machine:

	int main()
	{
		std::vector<uint64_t> a(10000), b(1000);
		std::mt19937 gen;

		uint64_t c = 0;

		for (int i = 0; i != 1000; ++i)
		{
			std::sample(a.begin(), a.end(), b.begin(), b.size(), gen);
			c += uint64_t(b[32]);
		}

		std::cout << c;
	}
diff mbox

Patch

Index: bits/stl_algo.h
===================================================================
--- bits/stl_algo.h	(revision 241299)
+++ bits/stl_algo.h	(working copy)
@@ -3741,6 +3741,37 @@ 
 
 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
   /**
+   *  @brief Generate two uniformly distributed integers using a
+   *         single distribution invocation.
+   *  @param  __b0    The upper bound for the first integer.
+   *  @param  __b1    The upper bound for the second integer.
+   *  @param  __g     A UniformRandomBitGenerator.
+   *  @return  A pair (i, j) with i and j uniformly distributed
+   *           over [0, __b0) and [0, __b1), respectively.
+   *
+   *  Requires: __b0 * __b1 <= __g.max() - __g.min().
+   *
+   *  Using uniform_int_distribution with a range that is very
+   *  small relative to the range of the generator ends up wasting
+   *  potentially expensively generated randomness, since
+   *  uniform_int_distribution does not store leftover randomness
+   *  between invocations.
+   *
+   *  If we know we want two integers in ranges that are sufficiently
+   *  small, we can compose the ranges, use a single distribution
+   *  invocation, and significantly reduce the waste.
+  */
+  template<typename _IntType,
+	   typename _UniformRandomBitGenerator>
+    std::pair<_IntType, _IntType>
+    __gen_two_uniform_ints(_IntType __b0, _IntType __b1,
+			   _UniformRandomBitGenerator&& __g)
+    {
+      _IntType __x = uniform_int_distribution<_IntType>{0, (__b0 * __b1) - 1}(__g);
+      return std::make_pair(__x / __b1, __x % __b1);
+    }
+
+  /**
    *  @brief Shuffle the elements of a sequence using a uniform random
    *         number generator.
    *  @ingroup mutating_algorithms
@@ -3801,13 +3832,12 @@ 
 	while (__i != __last)
 	{
 	  const __uc_type __swap_range = __uc_type(__i - __first) + 1;
-	  const __uc_type __comp_range = __swap_range * (__swap_range + 1);
 
-	  std::uniform_int_distribution<__uc_type> __d{0, __comp_range - 1};
-	  const __uc_type __pospos = __d(__g);
+	  const std::pair<__uc_type, __uc_type> __pospos =
+	    __gen_two_uniform_ints(__swap_range, __swap_range + 1);
 
-	  std::iter_swap(__i++, __first + (__pospos % __swap_range));
-	  std::iter_swap(__i++, __first + (__pospos / __swap_range));
+	  std::iter_swap(__i++, __first + __pospos.first);
+	  std::iter_swap(__i++, __first + __pospos.second);
 	}
 
 	return;
@@ -5695,9 +5725,51 @@ 
     {
       using __distrib_type = uniform_int_distribution<_Size>;
       using __param_type = typename __distrib_type::param_type;
+      using _USize = typename std::make_unsigned<_Size>::type;
+      using _Gen = typename std::remove_reference<_UniformRandomBitGenerator>::type;
+      using __uc_type = typename std::common_type<typename _Gen::result_type, _USize>::type;
+
       __distrib_type __d{};
       _Size __unsampled_sz = std::distance(__first, __last);
-      for (__n = std::min(__n, __unsampled_sz); __n != 0; ++__first)
+      __n = std::min(__n, __unsampled_sz);
+
+      // If possible, we use __gen_two_uniform_ints to efficiently produce
+      // two random numbers using a single distribution invocation:
+
+      const __uc_type __urngrange = __g.max() - __g.min();
+      if (__urngrange / __uc_type(__unsampled_sz) >= __uc_type(__unsampled_sz))
+        // I.e. (__urngrange >= __unsampled_sz * __unsampled_sz) but without wrap issues.
+        {
+	  while (__n != 0 && __unsampled_sz >= 2)
+	    {
+	      const std::pair<_Size, _Size> p =
+		__gen_two_uniform_ints(__unsampled_sz, __unsampled_sz - 1, __g);
+
+	      --__unsampled_sz;
+	      if (p.first < __n)
+		{
+		  *__out++ = *__first;
+		  --__n;
+		}
+
+	      ++__first;
+
+	      if (__n == 0) break;
+
+	      --__unsampled_sz;
+	      if (p.second < __n)
+		{
+		  *__out++ = *__first;
+		  --__n;
+		}
+
+	      ++__first;
+	    }
+        }
+
+      // The loop above is otherwise equivalent to this one-at-a-time version:
+
+      for (; __n != 0; ++__first)
 	if (__d(__g, __param_type{0, --__unsampled_sz}) < __n)
 	  {
 	    *__out++ = *__first;