moeru-ai
diff --git a/‎apps/stage-tamagotchi/src/renderer/pages/index.vue‎
Lines changed: 61 additions & 3 deletions b/‎apps/stage-tamagotchi/src/renderer/pages/index.vue‎
Lines changed: 61 additions & 3 deletions
diff --git a/‎apps/stage-web/src/pages/index.vue‎
Lines changed: 46 additions & 4 deletions b/‎apps/stage-web/src/pages/index.vue‎
Lines changed: 46 additions & 4 deletions
diff --git a/‎packages/stage-pages/src/pages/settings/modules/hearing.vue‎
Lines changed: 50 additions & 7 deletions b/‎packages/stage-pages/src/pages/settings/modules/hearing.vue‎
Lines changed: 50 additions & 7 deletions
diff --git a/‎packages/stage-ui/src/composables/audio/device.ts‎
Lines changed: 8 additions & 1 deletion b/‎packages/stage-ui/src/composables/audio/device.ts‎
Lines changed: 8 additions & 1 deletion
@@ -103,11 +103,18 @@ watch([isOutsideFor250Ms, isAroundWindowBorderFor250Ms, isOutsideWindow, isTrans
 const settingsAudioDeviceStore = useSettingsAudioDevice()
 const { stream, enabled } = storeToRefs(settingsAudioDeviceStore)
 const { startRecord, stopRecord, onStopRecord } = useAudioRecorder(stream)
-const { transcribeForRecording } = useHearingSpeechInputPipeline()
+const hearingPipeline = useHearingSpeechInputPipeline()
+const {
+  transcribeForRecording,
+  transcribeForMediaStream,
+  stopStreamingTranscription,
+} = hearingPipeline
+const { supportsStreamInput } = storeToRefs(hearingPipeline)
 const providersStore = useProvidersStore()
 const consciousnessStore = useConsciousnessStore()
 const { activeProvider: activeChatProvider, activeModel: activeChatModel } = storeToRefs(consciousnessStore)
 const chatStore = useChatStore()
+const shouldUseStreamInput = computed(() => supportsStreamInput.value && !!stream.value)
 
 const {
   init: initVAD,
@@ -116,8 +123,12 @@ const {
   loaded: vadLoaded,
 } = useVAD(workletUrl, {
   threshold: ref(0.6),
-  onSpeechStart: () => startRecord(),
-  onSpeechEnd: () => stopRecord(),
+  onSpeechStart: () => {
+    void handleSpeechStart()
+  },
+  onSpeechEnd: () => {
+    void handleSpeechEnd()
+  },
 })
 
 let stopOnStopRecord: (() => void) | undefined
@@ -128,6 +139,49 @@ type CaptionChannelEvent
     | { type: 'caption-assistant', text: string }
 const { post: postCaption } = useBroadcastChannel<CaptionChannelEvent, CaptionChannelEvent>({ name: 'airi-caption-overlay' })
 
+/**
+ * Handles a VAD speech-start event.
+ *
+ * When `shouldUseStreamInput` is true and a microphone stream exists, the
+ * live stream is handed to `transcribeForMediaStream`; otherwise a buffered
+ * recording is started with `startRecord()`.
+ *
+ * The VAD callback discards the returned promise, so a rejection from the
+ * streaming call is caught and logged here instead of becoming an unhandled
+ * promise rejection.
+ */
+async function handleSpeechStart() {
+  if (shouldUseStreamInput.value && stream.value) {
+    try {
+      await transcribeForMediaStream(stream.value, {
+        onSentenceEnd: (delta) => {
+          // Ignore empty or whitespace-only sentences.
+          if (!delta || !delta.trim())
+            return
+
+          postCaption({ type: 'caption-speaker', text: delta })
+
+          // Fire-and-forget: send the sentence to the active chat model without blocking this callback.
+          void (async () => {
+            try {
+              const provider = await providersStore.getProviderInstance(activeChatProvider.value)
+              if (!provider || !activeChatModel.value)
+                return
+
+              await chatStore.send(delta, { model: activeChatModel.value, chatProvider: provider as ChatProvider })
+            }
+            catch (err) {
+              console.error('Failed to send chat from voice:', err)
+            }
+          })()
+        },
+        onSpeechEnd: (text) => {
+          // Post the text reported at speech end as a speaker caption.
+          postCaption({ type: 'caption-speaker', text })
+        },
+      })
+    }
+    catch (err) {
+      console.error('Streaming transcription failed:', err)
+    }
+    return
+  }
+
+  startRecord()
+}
+
+/**
+ * Handles a VAD speech-end event.
+ *
+ * Only the buffered-recording path is stopped here, via `stopRecord()`.
+ */
+async function handleSpeechEnd() {
+  // Keep streaming session alive; idle timer in pipeline will handle teardown.
+  if (!shouldUseStreamInput.value)
+    stopRecord()
+}
+
 async function startAudioInteraction() {
   try {
     await initVAD()
@@ -136,6 +190,9 @@ async function startAudioInteraction() {
 
     // Hook once
     stopOnStopRecord = onStopRecord(async (recording) => {
+      if (shouldUseStreamInput.value)
+        return
+
       const text = await transcribeForRecording(recording)
       if (!text || !text.trim())
         return
@@ -164,6 +221,7 @@ function stopAudioInteraction() {
   try {
     stopOnStopRecord?.()
     stopOnStopRecord = undefined
+    void stopStreamingTranscription(true)
     disposeVAD()
   }
   catch {}
 
@@ -14,7 +14,7 @@ import { useProvidersStore } from '@proj-airi/stage-ui/stores/providers'
 import { useSettingsAudioDevice } from '@proj-airi/stage-ui/stores/settings'
 import { breakpointsTailwind, useBreakpoints, useMouse } from '@vueuse/core'
 import { storeToRefs } from 'pinia'
-import { onMounted, onUnmounted, ref, useTemplateRef, watch } from 'vue'
+import { computed, onMounted, onUnmounted, ref, useTemplateRef, watch } from 'vue'
 
 import Header from '../components/Layouts/Header.vue'
 import InteractiveArea from '../components/Layouts/InteractiveArea.vue'
@@ -47,21 +47,25 @@ onMounted(() => syncBackgroundTheme())
 const settingsAudioDeviceStore = useSettingsAudioDevice()
 const { stream, enabled } = storeToRefs(settingsAudioDeviceStore)
 const { startRecord, stopRecord, onStopRecord } = useAudioRecorder(stream)
-const { transcribeForRecording } = useHearingSpeechInputPipeline()
+const hearingPipeline = useHearingSpeechInputPipeline()
+const { transcribeForRecording, transcribeForMediaStream } = hearingPipeline
+const { supportsStreamInput } = storeToRefs(hearingPipeline)
 const providersStore = useProvidersStore()
 const consciousnessStore = useConsciousnessStore()
 const { activeProvider: activeChatProvider, activeModel: activeChatModel } = storeToRefs(consciousnessStore)
 const chatStore = useChatStore()
 
+const shouldUseStreamInput = computed(() => supportsStreamInput.value && !!stream.value)
+
 const {
   init: initVAD,
   dispose: disposeVAD,
   start: startVAD,
   loaded: vadLoaded,
 } = useVAD(workletUrl, {
   threshold: ref(0.6),
-  onSpeechStart: () => startRecord(),
-  onSpeechEnd: () => stopRecord(),
+  // Both handlers are async; discard their promises explicitly so the callbacks return nothing.
+  onSpeechStart: () => {
+    void handleSpeechStart()
+  },
+  onSpeechEnd: () => {
+    void handleSpeechEnd()
+  },
 })
 
 let stopOnStopRecord: (() => void) | undefined
@@ -95,6 +99,44 @@ async function startAudioInteraction() {
   }
 }
 
+/**
+ * Handles a VAD speech-start event.
+ *
+ * When `shouldUseStreamInput` is true and a microphone stream exists, the
+ * live stream is handed to `transcribeForMediaStream`; otherwise a buffered
+ * recording is started with `startRecord()`.
+ *
+ * Nothing visible at the call site handles a rejection of the returned
+ * promise, so a failure of the streaming call is caught and logged here
+ * instead of becoming an unhandled promise rejection.
+ */
+async function handleSpeechStart() {
+  if (shouldUseStreamInput.value && stream.value) {
+    try {
+      await transcribeForMediaStream(stream.value, {
+        onSentenceEnd: (delta) => {
+          // Ignore empty or whitespace-only sentences.
+          if (!delta || !delta.trim())
+            return
+
+          // Fire-and-forget: send the sentence to the active chat model without blocking this callback.
+          void (async () => {
+            try {
+              const provider = await providersStore.getProviderInstance(activeChatProvider.value)
+              if (!provider || !activeChatModel.value)
+                return
+
+              await chatStore.send(delta, { model: activeChatModel.value, chatProvider: provider as ChatProvider })
+            }
+            catch (err) {
+              console.error('Failed to send chat from voice:', err)
+            }
+          })()
+        },
+      })
+    }
+    catch (err) {
+      console.error('Streaming transcription failed:', err)
+    }
+    return
+  }
+
+  startRecord()
+}
+
+/**
+ * Handles a VAD speech-end event.
+ *
+ * Only the buffered-recording path is stopped here, via `stopRecord()`.
+ */
+async function handleSpeechEnd() {
+  // Keep streaming session alive; idle timer in pipeline will handle teardown.
+  if (!shouldUseStreamInput.value)
+    stopRecord()
+}
+
 function stopAudioInteraction() {
   try {
     stopOnStopRecord?.()
 
@@ -34,7 +34,14 @@ const { audioInputs, selectedAudioInput, stream } = storeToRefs(useSettingsAudio
 const { startRecord, stopRecord, onStopRecord } = useAudioRecorder(stream)
 const { startAnalyzer, stopAnalyzer, onAnalyzerUpdate, volumeLevel } = useAudioAnalyzer()
 const { audioContext } = storeToRefs(useAudioContext())
-const { transcribeForRecording } = useHearingSpeechInputPipeline()
+const {
+  transcribeForRecording,
+  transcribeForMediaStream,
+  stopStreamingTranscription,
+} = useHearingSpeechInputPipeline()
+const {
+  supportsStreamInput,
+} = storeToRefs(useHearingSpeechInputPipeline())
 
 const animationFrame = ref<number>()
 
@@ -54,6 +61,33 @@ const audioURLs = computed(() => {
 
 const useVADThreshold = ref(0.6) // 0.1 - 0.9
 const useVADModel = ref(true) // Toggle between VAD and volume-based detection
+const shouldUseStreamInput = computed(() => supportsStreamInput.value && !!stream.value)
+
+/**
+ * Handles a VAD speech-start event on the hearing settings page.
+ *
+ * When `shouldUseStreamInput` is true and a microphone stream exists, the
+ * live stream is handed to `transcribeForMediaStream`; otherwise a buffered
+ * recording is started with `startRecord()`.
+ *
+ * The VAD callback discards the returned promise, so a rejection from the
+ * streaming call is caught and logged here instead of becoming an unhandled
+ * promise rejection.
+ */
+async function handleSpeechStart() {
+  if (shouldUseStreamInput.value && stream.value) {
+    try {
+      await transcribeForMediaStream(stream.value, {
+        onSentenceEnd: (delta) => {
+          // Append each sentence as its own entry in the transcription list.
+          transcriptions.value.push(delta)
+        },
+        onSpeechEnd: (text) => {
+          // Replace the list with a single entry holding the text reported at speech end.
+          transcriptions.value = [text]
+        },
+      })
+    }
+    catch (err) {
+      console.error('Streaming transcription failed:', err)
+    }
+    return
+  }
+
+  startRecord()
+}
+
+/**
+ * Handles a VAD speech-end event on the hearing settings page.
+ *
+ * Only the buffered-recording path is stopped here, via `stopRecord()`.
+ */
+async function handleSpeechEnd() {
+  // For streaming providers, keep the session alive; idle timer will handle teardown.
+  if (!shouldUseStreamInput.value)
+    stopRecord()
+}
+
 const {
   init: initVAD,
   dispose: disposeVAD,
@@ -66,8 +100,12 @@ const {
   loading: loadingVAD,
 } = useVAD(workletUrl, {
   threshold: useVADThreshold,
-  onSpeechStart: () => startRecord(),
-  onSpeechEnd: () => stopRecord(),
+  onSpeechStart: () => {
+    void handleSpeechStart()
+  },
+  onSpeechEnd: () => {
+    void handleSpeechEnd()
+  },
 })
 
 const isSpeechVolume = ref(false) // Volume-based speaking detection
@@ -122,6 +160,8 @@ async function stopAudioMonitoring() {
     cancelAnimationFrame(animationFrame.value)
     animationFrame.value = undefined
   }
+
+  await stopStreamingTranscription(true, activeTranscriptionProvider.value)
   if (stream.value) { // Stop media stream
     stopStream()
   }
@@ -174,6 +214,9 @@ function updateCustomModelName(value: string) {
 }
 
 onStopRecord(async (recording) => {
+  if (shouldUseStreamInput.value)
+    return
+
   if (recording && recording.size > 0)
     audios.value.push(recording)
 
@@ -365,10 +408,10 @@ onUnmounted(() => {
         </Button>
 
         <div>
-          <div v-for="(audio, index) in audioURLs" :key="index" class="mb-2">
-            <audio :src="audio" controls class="w-full" />
-            <div v-if="transcriptions[index]" class="mt-2 text-sm text-neutral-500 dark:text-neutral-400">
-              {{ transcriptions[index] }}
+          <div v-for="(transcription, index) in transcriptions" :key="index" class="mb-2">
+            <audio v-if="audioURLs[index]" :src="audioURLs[index]" controls class="w-full" />
+            <div v-if="transcription" class="mt-2 text-sm text-neutral-500 dark:text-neutral-400">
+              {{ transcription }}
             </div>
           </div>
         </div>
 
@@ -5,7 +5,14 @@ export function useAudioDevice() {
   const devices = useDevicesList({ constraints: { audio: true }, requestPermissions: true })
   const audioInputs = computed(() => devices.audioInputs.value)
   const selectedAudioInput = ref<string>(devices.audioInputs.value[0]?.deviceId || '')
-  const deviceConstraints = computed<MediaStreamConstraints>(() => ({ audio: { deviceId: { exact: selectedAudioInput.value }, autoGainControl: true, echoCancellation: true, noiseSuppression: true } }))
+  // Constraints passed to useUserMedia; recomputed whenever the selected input changes.
+  const deviceConstraints = computed<MediaStreamConstraints>(() => {
+    const selectedId = selectedAudioInput.value
+    // Pin an exact device only when an id is selected; an empty selection omits the deviceId key entirely.
+    const deviceFilter = selectedId ? { deviceId: { exact: selectedId } } : {}
+
+    return {
+      audio: {
+        ...deviceFilter,
+        autoGainControl: true,
+        echoCancellation: true,
+        noiseSuppression: true,
+      },
+    }
+  })
   const { stream, stop: stopStream, start: startStream } = useUserMedia({ constraints: deviceConstraints, enabled: false, autoSwitch: true })
 
   watch(audioInputs, () => {