Add in optimizations for 64-bit integers.

Alexhuszagh · Alexhuszagh · commit 6b518e5657f4 · 2024-12-07T02:29:32.000-06:00
This uses the partial jeaiii algorithm on its own only for values <= u32::MAX; otherwise it uses a mix of that along with a fixed-width Alexandrescu unfolding algorithm. The latter requires more multiplications but has fewer branches, which are the major performance issue.
diff --git a/lexical-write-integer/src/decimal.rs b/lexical-write-integer/src/decimal.rs
@@ -16,7 +16,7 @@
 use lexical_util::format::{RADIX, RADIX_SHIFT, STANDARD};
 use lexical_util::num::UnsignedInteger;
 
-use crate::algorithm::{algorithm, algorithm_u128};
+use crate::algorithm::algorithm_u128;
 use crate::digit_count::fast_log2;
 use crate::jeaiii;
 use crate::table::DIGIT_TO_BASE10_SQUARED;
@@ -254,37 +254,21 @@ pub trait Decimal: DecimalCount {
 
 // Implement decimal for type.
 macro_rules! decimal_impl {
-    ($($t:ty)*) => ($(
+    ($($t:ty; $f:ident)*) => ($(
         impl Decimal for $t {
             #[inline(always)]
             fn decimal(self, buffer: &mut [u8]) -> usize {
-                algorithm(self, 10, &DIGIT_TO_BASE10_SQUARED, buffer)
+                jeaiii::$f(self, buffer)
             }
         }
     )*);
 }
 
-decimal_impl! { u64 }
-
-impl Decimal for u8 {
-    #[inline(always)]
-    fn decimal(self, buffer: &mut [u8]) -> usize {
-        jeaiii::from_u8(self, buffer)
-    }
-}
-
-impl Decimal for u16 {
-    #[inline(always)]
-    fn decimal(self, buffer: &mut [u8]) -> usize {
-        jeaiii::from_u16(self, buffer)
-    }
-}
-
-impl Decimal for u32 {
-    #[inline(always)]
-    fn decimal(self, buffer: &mut [u8]) -> usize {
-        jeaiii::from_u32(self, buffer)
-    }
+decimal_impl! {
+    u8; from_u8
+    u16; from_u16
+    u32; from_u32
+    u64; from_u64
 }
 
 impl Decimal for u128 {
diff --git a/lexical-write-integer/src/jeaiii.rs b/lexical-write-integer/src/jeaiii.rs
@@ -66,6 +66,23 @@ macro_rules! write_n {
         $buffer[index + 1] = i!(DIGIT_TO_BASE10_SQUARED[r + 1]);
         index + 2
     }};
+
+    // Identical to `@2` except it's writing from the end, not front.
+    // This is for our Alexandrescu-popularized algorithm.
+    (@2sub $buffer:ident, $index:ident, $r:expr) => {{
+        $index -= 2;
+        _ = write_n!(@2 $buffer, $index, $r);
+    }};
+
+    // This writes 4 digits, using 2sub twice after getting the high and low.
+    (@4sub $buffer:ident, $index:ident, $value:ident) => {{
+        let r = $value % 10000;
+        $value /= 10000;
+        let r1 = 2 * (r / 100);
+        let r2 = 2 * (r % 100);
+        write_n!(@2sub $buffer, $index, r2);
+        write_n!(@2sub $buffer, $index, r1);
+    }};
 }
 
 // Print the next 2 digits, using `next2`.
@@ -159,6 +176,35 @@ macro_rules! write_digits {
         _ = write_n!(@2 $buffer, 6, next2(&mut y) * 2);
         write_n!(@2 $buffer, 8, next2(&mut y) * 2)
     }};
+
+    (@10u64 $buffer:ident, $n:ident) => {{
+        // Unfortunately, there is no good way without using 128 bits,
+        // since the smallest interval overflows a 64-bit integer at
+        // ~>= 5.5e9. This requires the value to be in `[1e9, 1e10)`,
+        // since there's no lower bound for the calculation and so it
+        // will not work with smaller values.
+        // D = 32, k = 8, L = 28
+        // `11529215047 = ceil(2^60 / 10^8)`
+        let prod = ($n as u128) * 11529215047u128;
+        let mut y = (prod >> 28) as u64;
+        _ = write_n!(@2 $buffer, 0, (y >> 32) * 2);
+        _ = write_n!(@2 $buffer, 2, next2(&mut y) * 2);
+        _ = write_n!(@2 $buffer, 4, next2(&mut y) * 2);
+        _ = write_n!(@2 $buffer, 6, next2(&mut y) * 2);
+        write_n!(@2 $buffer, 8, next2(&mut y) * 2)
+    }};
+
+    (@10alex $buffer:ident, $n:ident, $offset:ident) => {{
+        // This always writes 10 digits for any value `[0, 1e10)`,
+        // but it uses a slower algorithm to do so. Since the digit
+        // count is fixed, we don't have to branch on the magnitude.
+        let mut value = $n;
+        let mut index = 10 + $offset;
+        write_n!(@4sub $buffer, index, value);
+        write_n!(@4sub $buffer, index, value);
+        write_n!(@2sub $buffer, index, value * 2);
+        10 + $offset
+    }};
 }
 
 /// Optimized jeaiii algorithm for u8.
@@ -225,8 +271,51 @@ pub fn from_u32(n: u32, buffer: &mut [u8]) -> usize {
     }
 }
 
+/// Optimized jeaiii algorithm for u64.
+#[inline(always)]
+#[allow(clippy::collapsible_else_if)] // reason = "branching is fine-tuned for performance"
+pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
+    // NOTE: Like before, this optimizes better for large and small
+    // values if there's a flat comparison with larger values first.
+    const FACTOR: u64 = 100_0000_0000; // 10^10: splits off the low 10 digits
+    let buffer = &mut buffer[..20]; // panics if `buffer` is shorter than 20 bytes
+    if n < 1_0000 {
+        // 1 to 4 digits
+        if n >= 100 {
+            write_digits!(@3-4 buffer, n)
+        } else if n >= 10 {
+            write_digits!(@2 buffer, n)
+        } else {
+            write_digits!(@1 buffer, n)
+        }
+    } else if n < 100_0000_0000 {
+        // 5 to 10 digits
+        if n >= 10_0000_0000 {
+            // NOTE: We DO NOT know if this is >= u32::MAX, and
+            // `write_digits!(@10)` is only accurate if `n <= 5.5e9`,
+            // so we use the variant with a 128-bit product instead.
+            write_digits!(@10u64 buffer, n)
+        } else if n >= 1_0000_0000 {
+            write_digits!(@9 buffer, n)
+        } else if n >= 100_0000 {
+            write_digits!(@7-8 buffer, n)
+        } else {
+            write_digits!(@5-6 buffer, n)
+        }
+    } else {
+        // 11-20 digits, can do in 2 steps
+        // NOTE: `hi` is in `[1, 2^31)` (`u64::MAX / 10^10 < 2^31`), so the
+        // `u32` cast is lossless, while `lo` is in `[0, 10^10)`. So, we can
+        // use `from_u32` for `hi`. For `lo`, we always need to write exactly
+        // 10 digits, including leading zeros. The `jeaiii` algorithm is too
+        // slow for that, so we use a fixed-width variant of our 2-digit
+        // unfolding instead, done as 2x 4 digits and 1x 2 digits.
+        let hi = (n / FACTOR) as u32;
+        let lo = n % FACTOR;
+        let offset = from_u32(hi, buffer);
+        write_digits!(@10alex buffer, lo, offset)
+    }
+}
+
 // TODO: Implement for:
-//  from_u64
 //  from_u128
-//  from_mant32 (23 bits)
-//  from_mant64 (53 bits)
diff --git a/lexical-write-integer/tests/api_tests.rs b/lexical-write-integer/tests/api_tests.rs
@@ -1443,12 +1443,6 @@ fn u64_buffer_test() {
     123456i64.to_lexical(&mut buffer);
 }
 
-#[test]
-fn u64_buffer_no_panic_test() {
-    let mut buffer = [b'\x00'; 6];
-    12345i64.to_lexical(&mut buffer);
-}
-
 #[test]
 #[should_panic]
 fn u128_buffer_test() {