Skip to content

Commit 6b518e5

Browse files
committed
Add in optimizations for 64-bit integers.
This uses the partial jeaiii algorithm on its own for values below 10^10. Larger values use it for the leading digits, along with a fixed-width Alexandrescu unfolding algorithm for the trailing 10 digits. The latter requires more multiplications but has fewer branches, and branches are the major performance issue.
1 parent 9090ffa commit 6b518e5

3 files changed

Lines changed: 100 additions & 33 deletions

File tree

lexical-write-integer/src/decimal.rs

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
use lexical_util::format::{RADIX, RADIX_SHIFT, STANDARD};
1717
use lexical_util::num::UnsignedInteger;
1818

19-
use crate::algorithm::{algorithm, algorithm_u128};
19+
use crate::algorithm::algorithm_u128;
2020
use crate::digit_count::fast_log2;
2121
use crate::jeaiii;
2222
use crate::table::DIGIT_TO_BASE10_SQUARED;
@@ -254,37 +254,21 @@ pub trait Decimal: DecimalCount {
254254

255255
// Implement decimal for type.
256256
macro_rules! decimal_impl {
257-
($($t:ty)*) => ($(
257+
($($t:ty; $f:ident)*) => ($(
258258
impl Decimal for $t {
259259
#[inline(always)]
260260
fn decimal(self, buffer: &mut [u8]) -> usize {
261-
algorithm(self, 10, &DIGIT_TO_BASE10_SQUARED, buffer)
261+
jeaiii::$f(self, buffer)
262262
}
263263
}
264264
)*);
265265
}
266266

267-
decimal_impl! { u64 }
268-
269-
impl Decimal for u8 {
270-
#[inline(always)]
271-
fn decimal(self, buffer: &mut [u8]) -> usize {
272-
jeaiii::from_u8(self, buffer)
273-
}
274-
}
275-
276-
impl Decimal for u16 {
277-
#[inline(always)]
278-
fn decimal(self, buffer: &mut [u8]) -> usize {
279-
jeaiii::from_u16(self, buffer)
280-
}
281-
}
282-
283-
impl Decimal for u32 {
284-
#[inline(always)]
285-
fn decimal(self, buffer: &mut [u8]) -> usize {
286-
jeaiii::from_u32(self, buffer)
287-
}
267+
decimal_impl! {
268+
u8; from_u8
269+
u16; from_u16
270+
u32; from_u32
271+
u64; from_u64
288272
}
289273

290274
impl Decimal for u128 {

lexical-write-integer/src/jeaiii.rs

Lines changed: 92 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,23 @@ macro_rules! write_n {
6666
$buffer[index + 1] = i!(DIGIT_TO_BASE10_SQUARED[r + 1]);
6767
index + 2
6868
}};
69+
70+
// Identical to `@2` except it's writing from the end, not front.
71+
// This is for our Alexandrescu-popularized algorithm.
72+
(@2sub $buffer:ident, $index:ident, $r:expr) => {{
73+
$index -= 2;
74+
_ = write_n!(@2 $buffer, $index, $r);
75+
}};
76+
77+
// This writes the low 4 digits of `$value` backwards from `$index`, using 2sub twice (low pair first), then divides `$value` by 10000.
78+
(@4sub $buffer:ident, $index:ident, $value:ident) => {{
79+
let r = $value % 10000;
80+
$value /= 10000;
81+
let r1 = 2 * (r / 100);
82+
let r2 = 2 * (r % 100);
83+
write_n!(@2sub $buffer, $index, r2);
84+
write_n!(@2sub $buffer, $index, r1);
85+
}};
6986
}
7087

7188
// Print the next 2 digits, using `next2`.
@@ -159,6 +176,35 @@ macro_rules! write_digits {
159176
_ = write_n!(@2 $buffer, 6, next2(&mut y) * 2);
160177
write_n!(@2 $buffer, 8, next2(&mut y) * 2)
161178
}};
179+
180+
(@10u64 $buffer:ident, $n:ident) => {{
181+
// Unfortunately, there is no good way to do this without using
182+
// 128 bits, since the smallest interval overflows a 64-bit integer
183+
// at roughly 5.5e9 and above. This requires the value to be in
184+
// `[1e9, 1e10)`, since there's no lower bound for the calculation
185+
// and so it will not work with smaller values.
186+
// D = 32, k = 8, L = 28
187+
// `11529215047 = ceil(2^60 / 10^8)`
188+
let prod = ($n as u128) * 11529215047u128;
189+
let mut y = (prod >> 28) as u64;
190+
_ = write_n!(@2 $buffer, 0, (y >> 32) * 2);
191+
_ = write_n!(@2 $buffer, 2, next2(&mut y) * 2);
192+
_ = write_n!(@2 $buffer, 4, next2(&mut y) * 2);
193+
_ = write_n!(@2 $buffer, 6, next2(&mut y) * 2);
194+
write_n!(@2 $buffer, 8, next2(&mut y) * 2)
195+
}};
196+
197+
(@10alex $buffer:ident, $n:ident, $offset:ident) => {{
198+
// This always writes 10 digits for any value in `[0, 1e10)`,
199+
// but it uses a slower algorithm to do so. Since the digit
200+
// count is fixed, it needs no branching on the magnitude.
201+
let mut value = $n;
202+
let mut index = 10 + $offset;
203+
write_n!(@4sub $buffer, index, value);
204+
write_n!(@4sub $buffer, index, value);
205+
write_n!(@2sub $buffer, index, value * 2);
206+
10 + $offset
207+
}};
162208
}
163209

164210
/// Optimized jeaiii algorithm for u8.
@@ -225,8 +271,51 @@ pub fn from_u32(n: u32, buffer: &mut [u8]) -> usize {
225271
}
226272
}
227273

274+
/// Optimized jeaiii algorithm for u64.
275+
#[inline(always)]
276+
#[allow(clippy::collapsible_else_if)] // reason = "branching is fine-tuned for performance"
277+
pub fn from_u64(n: u64, buffer: &mut [u8]) -> usize {
278+
// NOTE: Like before, this optimizes better for large and small
279+
// values if there's a flat comparison with larger values first.
280+
const FACTOR: u64 = 100_0000_0000;
281+
let buffer = &mut buffer[..20];
282+
if n < 1_0000 {
283+
// 1 to 4 digits
284+
if n >= 100 {
285+
write_digits!(@3-4 buffer, n)
286+
} else if n >= 10 {
287+
write_digits!(@2 buffer, n)
288+
} else {
289+
write_digits!(@1 buffer, n)
290+
}
291+
} else if n < 100_0000_0000 {
292+
// 5 to 10 digits
293+
if n >= 10_0000_0000 {
294+
// NOTE: We DO NOT know if this is > u32::MAX, and
295+
// `write_digits!(@10)` is only accurate if `n <= 5.5e9`,
296+
// which we cannot guarantee, so we use `@10u64` instead.
297+
write_digits!(@10u64 buffer, n)
298+
} else if n >= 1_0000_0000 {
299+
write_digits!(@9 buffer, n)
300+
} else if n >= 100_0000 {
301+
write_digits!(@7-8 buffer, n)
302+
} else {
303+
write_digits!(@5-6 buffer, n)
304+
}
305+
} else {
306+
// 11-20 digits, can do in 2 steps
307+
// NOTE: `hi` has to be in `[1, 2^31)`, while `lo` is in `[0, 10^10)`.
308+
// So, we can use our `from_u32` for `hi`. For our `lo`, we always
309+
// need to write 10 digits. However, the `jeaiii` algorithm is too
310+
// slow, so we use a modified variant of our 2-digit unfolding for
311+
// exactly 10 digits to write our values. We can optimize this in
312+
// 2x 4 digits and 1x 2 digits.
313+
let hi = (n / FACTOR) as u32;
314+
let lo = n % FACTOR;
315+
let offset = from_u32(hi, buffer);
316+
write_digits!(@10alex buffer, lo, offset)
317+
}
318+
}
319+
228320
// TODO: Implement for:
229-
// from_u64
230321
// from_u128
231-
// from_mant32 (23 bits)
232-
// from_mant64 (53 bits)

lexical-write-integer/tests/api_tests.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,12 +1443,6 @@ fn u64_buffer_test() {
14431443
123456i64.to_lexical(&mut buffer);
14441444
}
14451445

1446-
#[test]
1447-
fn u64_buffer_no_panic_test() {
1448-
let mut buffer = [b'\x00'; 6];
1449-
12345i64.to_lexical(&mut buffer);
1450-
}
1451-
14521446
#[test]
14531447
#[should_panic]
14541448
fn u128_buffer_test() {

0 commit comments

Comments
 (0)