
Commit 7c9e470

Single-controller LoRA RL fine-tuning with vLLM (#735)
* Working and tested example for GRPO single-controller LoRA using the vLLM backend
* Cleaned up some debug statements
* Updated and tested (performance matches full RL) per the new design
* Removed the old single-controller examples in the lora folder, as they are no longer required
1 parent 9497437 commit 7c9e470

3 files changed: 212 additions & 9 deletions

File tree

* areal/api/io_struct.py
* areal/experimental/trainer/rl.py
* examples/math/gsm8k_grpo_lora.yaml

areal/api/io_struct.py

Lines changed: 1 addition & 1 deletion
@@ -148,7 +148,7 @@ def from_disk(
         use_lora: bool = False,
         clear_checkpoint_after_load: bool = True,
         lora_name: str = "",
-        lora_int_id: int = 0,
+        lora_int_id: int = 1,
         base_model_name: str = "",
     ) -> "WeightUpdateMeta":
         from areal.utils.saver import Saver
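Context for the one-line change above: vLLM assigns adapter ID 0 to the base model and rejects LoRARequest objects whose lora_int_id is not a positive integer, so a default of 0 would fail as soon as the trainer registers updated adapter weights with the vLLM backend. A minimal sketch of the consuming side, with a placeholder adapter path:

from vllm.lora.request import LoRARequest

# Sketch only: the adapter path is a placeholder, not part of this commit.
# vLLM validates lora_int_id > 0, which is why the default moved to 1.
req = LoRARequest(
    lora_name="lora-gsm8k",        # matches gconfig.lora_name in the example config
    lora_int_id=1,                 # any positive integer, unique per adapter
    lora_path="/path/to/adapter",  # checkpoint directory written at weight-update time
)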

areal/experimental/trainer/rl.py

Lines changed: 19 additions & 8 deletions
@@ -132,14 +132,25 @@ def __init__(
 
         # Prepare weight update meta and connect to inference engine
         if self.config.actor.weight_update_mode == "disk":
-            self.weight_update_meta = WeightUpdateMeta.from_disk(
-                experiment_name=config.experiment_name,
-                trial_name=config.trial_name,
-                file_root=config.cluster.fileroot,
-                name="default",
-                use_lora=config.actor.use_lora,
-                clear_checkpoint_after_load=True,
-            )
+            if config.actor.use_lora:
+                self.weight_update_meta = WeightUpdateMeta.from_disk(
+                    experiment_name=config.experiment_name,
+                    trial_name=config.trial_name,
+                    file_root=config.cluster.fileroot,
+                    name="default",
+                    clear_checkpoint_after_load=True,
+                    use_lora=config.actor.use_lora,
+                    lora_name=config.gconfig.lora_name,
+                    base_model_name=config.actor.path,
+                )
+            else:
+                self.weight_update_meta = WeightUpdateMeta.from_disk(
+                    experiment_name=config.experiment_name,
+                    trial_name=config.trial_name,
+                    file_root=config.cluster.fileroot,
+                    name="default",
+                    clear_checkpoint_after_load=True,
+                )
         elif self.config.actor.weight_update_mode == "xccl":
             # NCCL/XCCL weight update
             if self.allocation_mode.train_backend == "megatron":
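The two branches differ only in the three LoRA-specific keyword arguments (use_lora, lora_name, base_model_name). A hypothetical equivalent that folds them into a single call, sketched against the same names used in the diff (not part of the committed code):

# Sketch only: same behavior as the two committed branches, written once.
lora_kwargs = (
    dict(
        use_lora=True,  # config.actor.use_lora is True on this branch
        lora_name=config.gconfig.lora_name,
        base_model_name=config.actor.path,
    )
    if config.actor.use_lora
    else {}
)
self.weight_update_meta = WeightUpdateMeta.from_disk(
    experiment_name=config.experiment_name,
    trial_name=config.trial_name,
    file_root=config.cluster.fileroot,
    name="default",
    clear_checkpoint_after_load=True,
    **lora_kwargs,
)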

examples/math/gsm8k_grpo_lora.yaml

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
experiment_name: gsm8k-grpo
trial_name: trial0

seed: 1
enable_offload: false
total_train_epochs: 3
tokenizer_path: ${actor.path}

cluster:
  n_nodes: 1
  n_gpus_per_node: 16
  fileroot: /tmp/areal/experiments
  name_resolve:
    type: nfs
    nfs_record_root: /tmp/areal/name_resolve

allocation_mode: vllm:d8p1t1+d8p1t1

scheduler:
  type: local

rollout:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  max_concurrent_rollouts: 256
  queue_size: null
  consumer_batch_size: ${train_dataset.batch_size}
  max_head_offpolicyness: 2
  enable_rollout_tracing: false
  use_lora: true
  scheduling_spec: ${actor.scheduling_spec}

gconfig:
  n_samples: 4
  min_new_tokens: 0
  max_new_tokens: 1024
  greedy: false
  temperature: 1.0
  lora_name: "lora-gsm8k"

actor:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  path: Qwen/Qwen3-0.6B
  init_from_scratch: false
  disable_dropout: true
  gradient_checkpointing: true
  dtype: bfloat16
  mb_spec:
    max_tokens_per_mb: 10240
  optimizer:
    type: adam
    lr: 1.70e-4
    weight_decay: 0.017
    beta1: 0.9
    beta2: 0.999
    eps: 1e-8
    lr_scheduler_type: constant
    gradient_clipping: 1.0
    warmup_steps_proportion: 0.001
  group_size: ${gconfig.n_samples}
  eps_clip: 0.4
  temperature: ${gconfig.temperature}
  reward_scaling: 10.0
  reward_bias: -0.5
  kl_ctl: 0.0
  ppo_n_minibatches: 1
  recompute_logprob: true
  use_decoupled_loss: true
  behav_imp_weight_cap: 5.0
  dynamic_sampling: false
  reward_norm:
    mean_level: group
    std_level: group
    group_size: ${gconfig.n_samples}
  adv_norm:
    mean_level: batch
    std_level: batch
  max_new_tokens: ${gconfig.max_new_tokens}
  weight_update_mode: disk
  use_lora: ${rollout.use_lora}
  peft_type: lora
  lora_rank: 16
  lora_alpha: 16
  target_modules: [all-linear]
  scheduling_spec:
    - task_type: worker
      port_count: 2
      gpu: 1
      cpu: 4
      mem: 32
      cmd: python3 -m areal.scheduler.rpc.rpc_server
      env_vars: {}

ref:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  path: ${actor.path}
  init_from_scratch: false
  disable_dropout: true
  dtype: ${actor.dtype}
  mb_spec:
    max_tokens_per_mb: 10240
  optimizer: null
  scheduling_strategy:
    type: colocation
    target: actor
  scheduling_spec: ${actor.scheduling_spec}

# SGLang
sglang:
  model_path: ${actor.path}
  random_seed: ${seed}
  skip_tokenizer_init: true
  dtype: ${actor.dtype}
  max_running_requests: null
  context_length: 32768
  mem_fraction_static: 0.8

# vLLM
vllm:
  model: ${actor.path}
  seed: ${seed}
  skip_tokenizer_init: false
  dtype: ${actor.dtype}
  max_model_len: 32768
  gpu_memory_utilization: 0.8
  enable_lora: ${rollout.use_lora}
  lora_modules: '{"name": "${gconfig.lora_name}", "path": "./model/Qwen3.0.6B-16rank", "base_model_name": "${actor.path}"}'
  enforce_eager: true

# datasets
train_dataset:
  batch_size: 256
  shuffle: true
  pin_memory: true
  num_workers: 4
  path: openai/gsm8k
  type: rl
  max_length: 1024

valid_dataset:
  batch_size: 256
  pin_memory: true
  num_workers: 4
  path: openai/gsm8k
  type: rl

# Utilities
saver:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: 1
  freq_steps: null
  freq_secs: null

recover:
  mode: disabled
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: 1
  freq_steps: null
  freq_secs: 3600

evaluator:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  freq_epochs: null
  freq_steps: null
  freq_secs: null

stats_logger:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  wandb:
    mode: disabled

perf_tracer:
  experiment_name: ${experiment_name}
  trial_name: ${trial_name}
  fileroot: ${cluster.fileroot}
  enabled: false
  session_tracer:
    enabled: false
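A note on vllm.lora_modules above: the value is a JSON object embedded in a YAML string (the name/path/base_model_name form that recent vLLM versions accept for pre-registering LoRA adapters), so after OmegaConf resolves the ${...} interpolations it must survive json.loads, quotes included. A quick sanity check, sketched with the resolved values from this config:

import json

# Mirrors vllm.lora_modules after ${...} interpolation; values taken from this config.
spec = json.loads(
    '{"name": "lora-gsm8k", '
    '"path": "./model/Qwen3.0.6B-16rank", '
    '"base_model_name": "Qwen/Qwen3-0.6B"}'
)
assert {"name", "path", "base_model_name"} <= spec.keys()
print(f"adapter {spec['name']} -> {spec['path']} (base {spec['base_model_name']})")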
