[Scraped from GitHub — file: LlamaTornadoCli.java, executable, 145 lines (127 loc), 4.99 KB; repository forked 33 times.]
[Line-number gutter (1–145) from the GitHub code listing removed; the source itself follows.]
//JAVA 21
//PREVIEW
//DEPS io.github.beehive-lab:gpu-llama3:0.3.2-dev
//DEPS io.github.beehive-lab:tornado-api:2.2.0
//DEPS io.github.beehive-lab:tornado-runtime:2.2.0
//SOURCES TornadoFlags.java
// === Set to not get annoying warnings about annotation processing
//JAVAC_OPTIONS -proc:full
// Compiler options
//JAVAC_OPTIONS --enable-preview
//JAVAC_OPTIONS --add-modules=jdk.incubator.vector
// JVM options for basic setup
//JAVA_OPTIONS --enable-preview
//JAVA_OPTIONS --add-modules=jdk.incubator.vector
package org.beehive.gpullama3.cli;
import static org.beehive.gpullama3.inference.sampler.Sampler.createSampler;
import static org.beehive.gpullama3.model.loader.ModelLoader.loadModel;

import java.io.IOException;
import java.util.Arrays;

import org.beehive.gpullama3.Options;
import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;
/**
* LlamaTornadoCli - Pure Java CLI for running llama-tornado models
*
* This class provides a standalone command-line interface for running LLaMA models
* with TornadoVM acceleration. It can be executed directly with JBang or as a
* compiled Java application.
*
* Usage with JBang:
* jbang LlamaTornadoCli.java --model path/to/model.gguf --prompt "Your prompt here"
*
* Usage as compiled application:
* java --enable-preview --add-modules jdk.incubator.vector \
* -cp target/gpu-llama3-0.3.1.jar \
* org.beehive.gpullama3.cli.LlamaTornadoCli \
* --model path/to/model.gguf --prompt "Your prompt here"
*
* Examples:
* # Interactive chat mode
* jbang LlamaTornadoCli.java -m model.gguf --interactive
*
* # Single instruction mode
* jbang LlamaTornadoCli.java -m model.gguf -p "Explain quantum computing"
*
* # With TornadoVM acceleration
* jbang LlamaTornadoCli.java -m model.gguf -p "Hello" --use-tornadovm true
*
* # Custom temperature and sampling
* jbang LlamaTornadoCli.java -m model.gguf -p "Tell me a story" \
* --temperature 0.7 --top-p 0.9 --max-tokens 512
*/
public class LlamaTornadoCli {

    /**
     * Whether the Java Vector API code path is enabled.
     * Read once from the system property {@code llama.VectorAPI}; defaults to {@code true}.
     */
    public static final boolean USE_VECTOR_API = Boolean.parseBoolean(
            System.getProperty("llama.VectorAPI", "true"));

    /**
     * Whether per-run performance metrics are printed after a response.
     * Read once from the system property {@code llama.ShowPerfInteractive}; defaults to {@code true}.
     */
    public static final boolean SHOW_PERF_INTERACTIVE = Boolean.parseBoolean(
            System.getProperty("llama.ShowPerfInteractive", "true"));

    /**
     * Runs a single instruction against the model and prints the response to stdout.
     * If {@link #SHOW_PERF_INTERACTIVE} is enabled, also prints the metrics of the run.
     *
     * @param model   the loaded model to query
     * @param sampler the token sampler configured from the CLI options
     * @param options parsed command-line options (prompt, limits, etc.)
     */
    private static void runSingleInstruction(Model model, Sampler sampler, Options options) {
        String response = model.runInstructOnce(sampler, options);
        System.out.println(response);
        if (SHOW_PERF_INTERACTIVE) {
            LastRunMetrics.printMetrics();
        }
    }

    /**
     * Main entry point for the CLI application.
     *
     * <p>Prints the banner, shows usage when no arguments or a help flag is given,
     * then loads the model and runs either interactive chat or a single instruction.
     *
     * @param args command-line arguments (see {@code Options.parseOptions} for details)
     * @throws IOException if model loading fails
     */
    public static void main(String[] args) throws IOException {
        // Print banner
        printBanner();

        // Show usage and exit when no arguments or an explicit help flag is given.
        if (args.length == 0 || hasHelpFlag(args)) {
            Options.printUsage(System.out);
            System.exit(0);
        }

        try {
            // Parse options
            Options options = Options.parseOptions(args);

            // Load model
            Model model = loadModel(options);

            // Create sampler
            Sampler sampler = createSampler(model, options);

            // Run in interactive or single-instruction mode
            if (options.interactive()) {
                System.out.println("Starting interactive chat mode...");
                System.out.println("Type your messages below (Ctrl+C to exit):");
                System.out.println();
                model.runInteractive(sampler, options);
            } else {
                runSingleInstruction(model, sampler, options);
            }
        } catch (Exception e) {
            // CLI boundary: report the failure and exit non-zero. The stack trace is
            // kept (in addition to the message) to aid debugging of model/GPU setup.
            System.err.println("Error: " + e.getMessage());
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Checks whether a help flag ({@code --help} or {@code -h}) is present in the arguments.
     *
     * @param args the raw command-line arguments
     * @return {@code true} if any argument is a help flag
     */
    private static boolean hasHelpFlag(String[] args) {
        // Idiomatic replacement for the manual for-loop scan.
        return Arrays.stream(args).anyMatch(arg -> arg.equals("--help") || arg.equals("-h"));
    }

    /**
     * Prints the ASCII startup banner to stdout.
     */
    private static void printBanner() {
        System.out.println("""
            ╔══════════════════════════════════════════════════════════╗
            ║          Llama-Tornado CLI - GPU-Accelerated LLM         ║
            ║              Powered by TornadoVM & Java 21              ║
            ╚══════════════════════════════════════════════════════════╝
            """);
    }
}