[Scraped from GitHub — file: LlamaTornadoCli.java, executable, 145 lines (127 loc), 4.99 KB; repository forked 33 times.]
[Line-number gutter (1–145) from the GitHub code listing removed; the source itself follows.]
//JAVA 21
//PREVIEW
//DEPS io.github.beehive-lab:gpu-llama3:0.3.2-dev
//DEPS io.github.beehive-lab:tornado-api:2.2.0
//DEPS io.github.beehive-lab:tornado-runtime:2.2.0
//SOURCES TornadoFlags.java
// === Set to not get annoying warnings about annotation processing
//JAVAC_OPTIONS -proc:full
// Compiler options
//JAVAC_OPTIONS --enable-preview
//JAVAC_OPTIONS --add-modules=jdk.incubator.vector
// JVM options for basic setup
//JAVA_OPTIONS --enable-preview
//JAVA_OPTIONS --add-modules=jdk.incubator.vector
package org.beehive.gpullama3.cli;
import static org.beehive.gpullama3.inference.sampler.Sampler.createSampler;
import static org.beehive.gpullama3.model.loader.ModelLoader.loadModel;

import java.io.IOException;
import java.util.Arrays;

import org.beehive.gpullama3.Options;
import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;
/**
* LlamaTornadoCli - Pure Java CLI for running llama-tornado models
*
* This class provides a standalone command-line interface for running LLaMA models
* with TornadoVM acceleration. It can be executed directly with JBang or as a
* compiled Java application.
*
* Usage with JBang:
* jbang LlamaTornadoCli.java --model path/to/model.gguf --prompt "Your prompt here"
*
* Usage as compiled application:
* java --enable-preview --add-modules jdk.incubator.vector \
* -cp target/gpu-llama3-0.3.1.jar \
* org.beehive.gpullama3.cli.LlamaTornadoCli \
* --model path/to/model.gguf --prompt "Your prompt here"
*
* Examples:
* # Interactive chat mode
* jbang LlamaTornadoCli.java -m model.gguf --interactive
*
* # Single instruction mode
* jbang LlamaTornadoCli.java -m model.gguf -p "Explain quantum computing"
*
* # With TornadoVM acceleration
* jbang LlamaTornadoCli.java -m model.gguf -p "Hello" --use-tornadovm true
*
* # Custom temperature and sampling
* jbang LlamaTornadoCli.java -m model.gguf -p "Tell me a story" \
* --temperature 0.7 --top-p 0.9 --max-tokens 512
*/
public class LlamaTornadoCli {

    /**
     * Whether the Java Vector API code path is enabled.
     * Read once from the system property {@code llama.VectorAPI}; defaults to {@code true}.
     */
    public static final boolean USE_VECTOR_API = Boolean.parseBoolean(
            System.getProperty("llama.VectorAPI", "true"));

    /**
     * Whether per-run performance metrics are printed after a response.
     * Read once from the system property {@code llama.ShowPerfInteractive}; defaults to {@code true}.
     */
    public static final boolean SHOW_PERF_INTERACTIVE = Boolean.parseBoolean(
            System.getProperty("llama.ShowPerfInteractive", "true"));

    /**
     * Runs a single instruction against the model and prints the response to stdout.
     * If {@link #SHOW_PERF_INTERACTIVE} is enabled, also prints the metrics of the run.
     *
     * @param model   the loaded model to query
     * @param sampler the token sampler configured from the CLI options
     * @param options parsed command-line options (prompt, limits, etc.)
     */
    private static void runSingleInstruction(Model model, Sampler sampler, Options options) {
        String response = model.runInstructOnce(sampler, options);
        System.out.println(response);
        if (SHOW_PERF_INTERACTIVE) {
            LastRunMetrics.printMetrics();
        }
    }

    /**
     * Main entry point for the CLI application.
     *
     * <p>Prints the banner, shows usage when no arguments or a help flag is given,
     * then loads the model and runs either interactive chat or a single instruction.
     *
     * @param args command-line arguments (see {@code Options.parseOptions} for details)
     * @throws IOException if model loading fails
     */
    public static void main(String[] args) throws IOException {
        // Print banner
        printBanner();

        // Show usage and exit when no arguments or an explicit help flag is given.
        if (args.length == 0 || hasHelpFlag(args)) {
            Options.printUsage(System.out);
            System.exit(0);
        }

        try {
            // Parse options
            Options options = Options.parseOptions(args);

            // Load model
            Model model = loadModel(options);

            // Create sampler
            Sampler sampler = createSampler(model, options);

            // Run in interactive or single-instruction mode
            if (options.interactive()) {
                System.out.println("Starting interactive chat mode...");
                System.out.println("Type your messages below (Ctrl+C to exit):");
                System.out.println();
                model.runInteractive(sampler, options);
            } else {
                runSingleInstruction(model, sampler, options);
            }
        } catch (Exception e) {
            // CLI boundary: report the failure and exit non-zero. The stack trace is
            // kept (in addition to the message) to aid debugging of model/GPU setup.
            System.err.println("Error: " + e.getMessage());
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Checks whether a help flag ({@code --help} or {@code -h}) is present in the arguments.
     *
     * @param args the raw command-line arguments
     * @return {@code true} if any argument is a help flag
     */
    private static boolean hasHelpFlag(String[] args) {
        // Idiomatic replacement for the manual for-loop scan.
        return Arrays.stream(args).anyMatch(arg -> arg.equals("--help") || arg.equals("-h"));
    }

    /**
     * Prints the ASCII startup banner to stdout.
     */
    private static void printBanner() {
        System.out.println("""
            ╔══════════════════════════════════════════════════════════╗
            ║          Llama-Tornado CLI - GPU-Accelerated LLM         ║
            ║              Powered by TornadoVM & Java 21              ║
            ╚══════════════════════════════════════════════════════════╝
            """);
    }
}