Skip to content

Commit 1f4650a

Browse files
committed
fix(stage-ui,stage-tamagotchi,stage-web): incorrect implementation of Aliyun NLS
1 parent bc6f2ac commit 1f4650a

File tree

11 files changed: +576 -60 lines changed

11 files changed: +576 -60 lines changed

apps/stage-tamagotchi/src/renderer/pages/index.vue

Lines changed: 61 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -103,11 +103,18 @@ watch([isOutsideFor250Ms, isAroundWindowBorderFor250Ms, isOutsideWindow, isTrans
103103
const settingsAudioDeviceStore = useSettingsAudioDevice()
104104
const { stream, enabled } = storeToRefs(settingsAudioDeviceStore)
105105
const { startRecord, stopRecord, onStopRecord } = useAudioRecorder(stream)
106-
const { transcribeForRecording } = useHearingSpeechInputPipeline()
106+
const hearingPipeline = useHearingSpeechInputPipeline()
107+
const {
108+
transcribeForRecording,
109+
transcribeForMediaStream,
110+
stopStreamingTranscription,
111+
} = hearingPipeline
112+
const { supportsStreamInput } = storeToRefs(hearingPipeline)
107113
const providersStore = useProvidersStore()
108114
const consciousnessStore = useConsciousnessStore()
109115
const { activeProvider: activeChatProvider, activeModel: activeChatModel } = storeToRefs(consciousnessStore)
110116
const chatStore = useChatStore()
117+
const shouldUseStreamInput = computed(() => supportsStreamInput.value && !!stream.value)
111118
112119
const {
113120
init: initVAD,
@@ -116,8 +123,12 @@ const {
116123
loaded: vadLoaded,
117124
} = useVAD(workletUrl, {
118125
threshold: ref(0.6),
119-
onSpeechStart: () => startRecord(),
120-
onSpeechEnd: () => stopRecord(),
126+
onSpeechStart: () => {
127+
void handleSpeechStart()
128+
},
129+
onSpeechEnd: () => {
130+
void handleSpeechEnd()
131+
},
121132
})
122133
123134
let stopOnStopRecord: (() => void) | undefined
@@ -128,6 +139,49 @@ type CaptionChannelEvent
128139
| { type: 'caption-assistant', text: string }
129140
const { post: postCaption } = useBroadcastChannel<CaptionChannelEvent, CaptionChannelEvent>({ name: 'airi-caption-overlay' })
130141
142+
async function handleSpeechStart() {
143+
if (shouldUseStreamInput.value && stream.value) {
144+
await transcribeForMediaStream(stream.value, {
145+
onSentenceEnd: (delta) => {
146+
const finalText = delta
147+
if (!finalText || !finalText.trim()) {
148+
return
149+
}
150+
151+
postCaption({ type: 'caption-speaker', text: finalText })
152+
153+
void (async () => {
154+
try {
155+
const provider = await providersStore.getProviderInstance(activeChatProvider.value)
156+
if (!provider || !activeChatModel.value)
157+
return
158+
159+
await chatStore.send(finalText, { model: activeChatModel.value, chatProvider: provider as ChatProvider })
160+
}
161+
catch (err) {
162+
console.error('Failed to send chat from voice:', err)
163+
}
164+
})()
165+
},
166+
onSpeechEnd: (text) => {
167+
postCaption({ type: 'caption-speaker', text })
168+
},
169+
})
170+
return
171+
}
172+
173+
startRecord()
174+
}
175+
176+
async function handleSpeechEnd() {
177+
if (shouldUseStreamInput.value) {
178+
// Keep streaming session alive; idle timer in pipeline will handle teardown.
179+
return
180+
}
181+
182+
stopRecord()
183+
}
184+
131185
async function startAudioInteraction() {
132186
try {
133187
await initVAD()
@@ -136,6 +190,9 @@ async function startAudioInteraction() {
136190
137191
// Hook once
138192
stopOnStopRecord = onStopRecord(async (recording) => {
193+
if (shouldUseStreamInput.value)
194+
return
195+
139196
const text = await transcribeForRecording(recording)
140197
if (!text || !text.trim())
141198
return
@@ -164,6 +221,7 @@ function stopAudioInteraction() {
164221
try {
165222
stopOnStopRecord?.()
166223
stopOnStopRecord = undefined
224+
void stopStreamingTranscription(true)
167225
disposeVAD()
168226
}
169227
catch {}

apps/stage-web/src/pages/index.vue

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import { useProvidersStore } from '@proj-airi/stage-ui/stores/providers'
1414
import { useSettingsAudioDevice } from '@proj-airi/stage-ui/stores/settings'
1515
import { breakpointsTailwind, useBreakpoints, useMouse } from '@vueuse/core'
1616
import { storeToRefs } from 'pinia'
17-
import { onMounted, onUnmounted, ref, useTemplateRef, watch } from 'vue'
17+
import { computed, onMounted, onUnmounted, ref, useTemplateRef, watch } from 'vue'
1818
1919
import Header from '../components/Layouts/Header.vue'
2020
import InteractiveArea from '../components/Layouts/InteractiveArea.vue'
@@ -47,21 +47,25 @@ onMounted(() => syncBackgroundTheme())
4747
const settingsAudioDeviceStore = useSettingsAudioDevice()
4848
const { stream, enabled } = storeToRefs(settingsAudioDeviceStore)
4949
const { startRecord, stopRecord, onStopRecord } = useAudioRecorder(stream)
50-
const { transcribeForRecording } = useHearingSpeechInputPipeline()
50+
const hearingPipeline = useHearingSpeechInputPipeline()
51+
const { transcribeForRecording, transcribeForMediaStream } = hearingPipeline
52+
const { supportsStreamInput } = storeToRefs(hearingPipeline)
5153
const providersStore = useProvidersStore()
5254
const consciousnessStore = useConsciousnessStore()
5355
const { activeProvider: activeChatProvider, activeModel: activeChatModel } = storeToRefs(consciousnessStore)
5456
const chatStore = useChatStore()
5557
58+
const shouldUseStreamInput = computed(() => supportsStreamInput.value && !!stream.value)
59+
5660
const {
5761
init: initVAD,
5862
dispose: disposeVAD,
5963
start: startVAD,
6064
loaded: vadLoaded,
6165
} = useVAD(workletUrl, {
6266
threshold: ref(0.6),
63-
onSpeechStart: () => startRecord(),
64-
onSpeechEnd: () => stopRecord(),
67+
onSpeechStart: () => handleSpeechStart(),
68+
onSpeechEnd: () => handleSpeechEnd(),
6569
})
6670
6771
let stopOnStopRecord: (() => void) | undefined
@@ -95,6 +99,44 @@ async function startAudioInteraction() {
9599
}
96100
}
97101
102+
async function handleSpeechStart() {
103+
if (shouldUseStreamInput.value && stream.value) {
104+
await transcribeForMediaStream(stream.value, {
105+
onSentenceEnd: (delta) => {
106+
const finalText = delta
107+
if (!finalText || !finalText.trim()) {
108+
return
109+
}
110+
111+
void (async () => {
112+
try {
113+
const provider = await providersStore.getProviderInstance(activeChatProvider.value)
114+
if (!provider || !activeChatModel.value)
115+
return
116+
117+
await chatStore.send(finalText, { model: activeChatModel.value, chatProvider: provider as ChatProvider })
118+
}
119+
catch (err) {
120+
console.error('Failed to send chat from voice:', err)
121+
}
122+
})()
123+
},
124+
})
125+
return
126+
}
127+
128+
startRecord()
129+
}
130+
131+
async function handleSpeechEnd() {
132+
if (shouldUseStreamInput.value) {
133+
// Keep streaming session alive; idle timer in pipeline will handle teardown.
134+
return
135+
}
136+
137+
stopRecord()
138+
}
139+
98140
function stopAudioInteraction() {
99141
try {
100142
stopOnStopRecord?.()

packages/stage-pages/src/pages/settings/modules/hearing.vue

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,14 @@ const { audioInputs, selectedAudioInput, stream } = storeToRefs(useSettingsAudio
3434
const { startRecord, stopRecord, onStopRecord } = useAudioRecorder(stream)
3535
const { startAnalyzer, stopAnalyzer, onAnalyzerUpdate, volumeLevel } = useAudioAnalyzer()
3636
const { audioContext } = storeToRefs(useAudioContext())
37-
const { transcribeForRecording } = useHearingSpeechInputPipeline()
37+
const {
38+
transcribeForRecording,
39+
transcribeForMediaStream,
40+
stopStreamingTranscription,
41+
} = useHearingSpeechInputPipeline()
42+
const {
43+
supportsStreamInput,
44+
} = storeToRefs(useHearingSpeechInputPipeline())
3845
3946
const animationFrame = ref<number>()
4047
@@ -54,6 +61,33 @@ const audioURLs = computed(() => {
5461
5562
const useVADThreshold = ref(0.6) // 0.1 - 0.9
5663
const useVADModel = ref(true) // Toggle between VAD and volume-based detection
64+
const shouldUseStreamInput = computed(() => supportsStreamInput.value && !!stream.value)
65+
66+
async function handleSpeechStart() {
67+
if (shouldUseStreamInput.value && stream.value) {
68+
await transcribeForMediaStream(stream.value, {
69+
onSentenceEnd: (delta) => {
70+
transcriptions.value.push(delta)
71+
},
72+
onSpeechEnd: (text) => {
73+
transcriptions.value = [text]
74+
},
75+
})
76+
return
77+
}
78+
79+
startRecord()
80+
}
81+
82+
async function handleSpeechEnd() {
83+
if (shouldUseStreamInput.value) {
84+
// For streaming providers, keep the session alive; idle timer will handle teardown.
85+
return
86+
}
87+
88+
stopRecord()
89+
}
90+
5791
const {
5892
init: initVAD,
5993
dispose: disposeVAD,
@@ -66,8 +100,12 @@ const {
66100
loading: loadingVAD,
67101
} = useVAD(workletUrl, {
68102
threshold: useVADThreshold,
69-
onSpeechStart: () => startRecord(),
70-
onSpeechEnd: () => stopRecord(),
103+
onSpeechStart: () => {
104+
void handleSpeechStart()
105+
},
106+
onSpeechEnd: () => {
107+
void handleSpeechEnd()
108+
},
71109
})
72110
73111
const isSpeechVolume = ref(false) // Volume-based speaking detection
@@ -122,6 +160,8 @@ async function stopAudioMonitoring() {
122160
cancelAnimationFrame(animationFrame.value)
123161
animationFrame.value = undefined
124162
}
163+
164+
await stopStreamingTranscription(true, activeTranscriptionProvider.value)
125165
if (stream.value) { // Stop media stream
126166
stopStream()
127167
}
@@ -174,6 +214,9 @@ function updateCustomModelName(value: string) {
174214
}
175215
176216
onStopRecord(async (recording) => {
217+
if (shouldUseStreamInput.value)
218+
return
219+
177220
if (recording && recording.size > 0)
178221
audios.value.push(recording)
179222
@@ -365,10 +408,10 @@ onUnmounted(() => {
365408
</Button>
366409

367410
<div>
368-
<div v-for="(audio, index) in audioURLs" :key="index" class="mb-2">
369-
<audio :src="audio" controls class="w-full" />
370-
<div v-if="transcriptions[index]" class="mt-2 text-sm text-neutral-500 dark:text-neutral-400">
371-
{{ transcriptions[index] }}
411+
<div v-for="(transcription, index) in transcriptions" :key="index" class="mb-2">
412+
<audio v-if="audioURLs[index]" :src="audioURLs[index]" controls class="w-full" />
413+
<div v-if="transcription" class="mt-2 text-sm text-neutral-500 dark:text-neutral-400">
414+
{{ transcription }}
372415
</div>
373416
</div>
374417
</div>

packages/stage-ui/src/composables/audio/device.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,14 @@ export function useAudioDevice() {
55
const devices = useDevicesList({ constraints: { audio: true }, requestPermissions: true })
66
const audioInputs = computed(() => devices.audioInputs.value)
77
const selectedAudioInput = ref<string>(devices.audioInputs.value[0]?.deviceId || '')
8-
const deviceConstraints = computed<MediaStreamConstraints>(() => ({ audio: { deviceId: { exact: selectedAudioInput.value }, autoGainControl: true, echoCancellation: true, noiseSuppression: true } }))
8+
const deviceConstraints = computed<MediaStreamConstraints>(() => ({
9+
audio: {
10+
...(selectedAudioInput.value ? { deviceId: { exact: selectedAudioInput.value } } : {}),
11+
autoGainControl: true,
12+
echoCancellation: true,
13+
noiseSuppression: true,
14+
},
15+
}))
916
const { stream, stop: stopStream, start: startStream } = useUserMedia({ constraints: deviceConstraints, enabled: false, autoSwitch: true })
1017

1118
watch(audioInputs, () => {

0 commit comments

Comments
 (0)