@@ -66,6 +66,23 @@ macro_rules! write_n {
6666 $buffer[ index + 1 ] = i!( DIGIT_TO_BASE10_SQUARED [ r + 1 ] ) ;
6767 index + 2
6868 } } ;
69+
70+ // Identical to `@2` except it's writing from the end, not front.
71+ // This is for our Alexandrescu-popularized algorithm.
72+ ( @2 sub $buffer: ident, $index: ident, $r: expr) => { {
73+ $index -= 2 ;
74+ _ = write_n!( @2 $buffer, $index, $r) ;
75+ } } ;
76+
77+ // This writes 4 digits, using 2sub twice after getting the high and low.
78+ ( @4 sub $buffer: ident, $index: ident, $value: ident) => { {
79+ let r = $value % 10000 ;
80+ $value /= 10000 ;
81+ let r1 = 2 * ( r / 100 ) ;
82+ let r2 = 2 * ( r % 100 ) ;
83+ write_n!( @2 sub $buffer, $index, r2) ;
84+ write_n!( @2 sub $buffer, $index, r1) ;
85+ } } ;
6986}
7087
7188// Print the next 2 digits, using `next2`.
@@ -159,6 +176,35 @@ macro_rules! write_digits {
159176 _ = write_n!( @2 $buffer, 6 , next2( & mut y) * 2 ) ;
160177 write_n!( @2 $buffer, 8 , next2( & mut y) * 2 )
161178 } } ;
179+
180+ ( @10u64 $buffer: ident, $n: ident) => { {
181+ // Unfortunately, there is no good way without using 128 bits,
182+ // since the smallest interval overflows a 64-bit integer at
183+ // ~>= 5.5e9. This requires the value to be in `[1e9, 1e10)`,
184+ // since there's no lower bound for the calculation and so it
185+ // will not work with smaller values.
186+ // D = 32, k = 8, L = 28
187+ // `11529215047 = ceil(2^60 / 10^8)`
188+ let prod = ( $n as u128 ) * 11529215047u128 ;
189+ let mut y = ( prod >> 28 ) as u64 ;
190+ _ = write_n!( @2 $buffer, 0 , ( y >> 32 ) * 2 ) ;
191+ _ = write_n!( @2 $buffer, 2 , next2( & mut y) * 2 ) ;
192+ _ = write_n!( @2 $buffer, 4 , next2( & mut y) * 2 ) ;
193+ _ = write_n!( @2 $buffer, 6 , next2( & mut y) * 2 ) ;
194+ write_n!( @2 $buffer, 8 , next2( & mut y) * 2 )
195+ } } ;
196+
197+ ( @10 alex $buffer: ident, $n: ident, $offset: ident) => { {
198+ // This always writes 10 digits for any value `[0, 1e10)`,
199+ // but it uses a slower algorithm to do so. Since we don't
200+ // have to worry about
201+ let mut value = $n;
202+ let mut index = 10 + $offset;
203+ write_n!( @4 sub $buffer, index, value) ;
204+ write_n!( @4 sub $buffer, index, value) ;
205+ write_n!( @2 sub $buffer, index, value * 2 ) ;
206+ 10 + $offset
207+ } } ;
162208}
163209
164210/// Optimized jeaiii algorithm for u8.
@@ -225,8 +271,51 @@ pub fn from_u32(n: u32, buffer: &mut [u8]) -> usize {
225271 }
226272}
227273
274+ /// Optimized jeaiii algorithm for u64.
275+ #[ inline( always) ]
276+ #[ allow( clippy:: collapsible_else_if) ] // reason = "branching is fine-tuned for performance"
277+ pub fn from_u64 ( n : u64 , buffer : & mut [ u8 ] ) -> usize {
278+ // NOTE: Like before, this optimizes better for large and small
279+ // values if there's a flat comparison with larger values first.
280+ const FACTOR : u64 = 100_0000_0000 ;
281+ let buffer = & mut buffer[ ..20 ] ;
282+ if n < 1_0000 {
283+ // 1 to 4 digits
284+ if n >= 100 {
285+ write_digits ! ( @3 -4 buffer, n)
286+ } else if n >= 10 {
287+ write_digits ! ( @2 buffer, n)
288+ } else {
289+ write_digits ! ( @1 buffer, n)
290+ }
291+ } else if n < 100_0000_0000 {
292+ // 5 to 10 digits
293+ if n >= 10_0000_0000 {
294+ // NOTE: We DO NOT know if this is >= u32::MAX,
295+ // and the `write_digits!(@10)` is only accurate
296+ // if `n <= 5.5e9`, which we cannot guarantee.
297+ write_digits ! ( @10u64 buffer, n)
298+ } else if n >= 1_0000_0000 {
299+ write_digits ! ( @9 buffer, n)
300+ } else if n >= 100_0000 {
301+ write_digits ! ( @7 -8 buffer, n)
302+ } else {
303+ write_digits ! ( @5 -6 buffer, n)
304+ }
305+ } else {
306+ // 11-20 digits, can do in 2 steps
307+ // NOTE: `hi` has to be in [0, 2^31], while `lo` is in `[0, 10^11)`
308+ // So, we can use our `from_u64_small` for hi. For our `lo`, we always
309+ // need to write 10 digits. However, the `jeaiii` algorithm is too
310+ // slow, so we use a modified variant of our 2-digit unfolding for
311+ // exactly 10 digits to read our values. We can optimize this in
312+ // 2x 4 digits and 1x 2 digits.
313+ let hi = ( n / FACTOR ) as u32 ;
314+ let lo = n % FACTOR ;
315+ let offset = from_u32 ( hi, buffer) ;
316+ write_digits ! ( @10 alex buffer, lo, offset)
317+ }
318+ }
319+
228320// TODO: Implement for:
229- // from_u64
230321// from_u128
231- // from_mant32 (23 bits)
232- // from_mant64 (53 bits)
0 commit comments