Skip to content

Commit f68d5d0

Browse files
DavertMik and claude committed
feat(mcp): make run_step_by_step actually interactive
Previously run_step_by_step ran the whole test to completion in one call and returned a fat blob of per-step artifacts. That's the aiTrace plugin's job, not an interactive tool's. Now it pauses after every step using the same pauseNow + handler machinery as run_test's pauseAt: the agent calls run_step_by_step, gets back a paused payload after step 1, calls continue to advance to step 2, and so on. At any pause the agent can call run_code / snapshot to inspect state. continue is unified: it races "test paused again" vs "test completed", so the same call works for run_step_by_step (re-pauses each time), pauseAt (runs to end), and explicit pause() in the test (runs to end). Module-level pendingTestFile / pendingStepInfo carry the paused-payload data through repeated continue cycles. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6a9ed9f commit f68d5d0

2 files changed

Lines changed: 150 additions & 150 deletions

File tree

bin/mcp-server.js

Lines changed: 119 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,9 @@ function outputBaseDir() {
244244
let pausedController = null // { resolveContinue, registeredVariables }
245245
let pendingRunPromise = null // run_test's run() promise while paused
246246
let pendingRunResults = null // results array being collected while paused
247-
let pendingRunCleanup = null // cleanup callback to detach test.after listener
247+
let pendingRunCleanup = null // cleanup callback to detach test.after / step.after listeners
248+
let pendingTestFile = null // file path of the test currently running
249+
let pendingStepInfo = null // { index, name, status } of the last step that fired step.after
248250
const pauseEvents = new EventEmitter()
249251

250252
setPauseHandler(({ registeredVariables }) => {
@@ -294,13 +296,28 @@ function collectRunCompletion(errorMessage) {
294296
if (typeof pendingRunCleanup === 'function') pendingRunCleanup()
295297
pendingRunPromise = null
296298
pendingRunResults = null
299+
pendingTestFile = null
300+
pendingStepInfo = null
297301
return {
298302
status: 'completed',
299303
reporterJson: { stats, tests: results },
300304
error: errorMessage,
301305
}
302306
}
303307

308+
function pausedPayload() {
309+
return {
310+
status: 'paused',
311+
file: pendingTestFile,
312+
pausedAfter: pendingStepInfo,
313+
suggestions: [
314+
'Call snapshot to capture URL/HTML/ARIA/screenshot/console/storage at this point',
315+
'Call run_code to inspect or manipulate state (e.g. return await I.grabText("h1"))',
316+
'Call continue to release the pause and let the test run the next step (or finish)',
317+
],
318+
}
319+
}
320+
304321
async function initCodecept(configPath) {
305322
if (containerInitialized) return
306323

@@ -383,7 +400,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => ({
383400
},
384401
{
385402
name: 'run_step_by_step',
386-
description: 'Run a test step by step with pauses between steps.',
403+
description: 'Run a test interactively, pausing after every step. Returns paused payload after the first step (URL/title/contentSize, last step info, suggestions). Call continue to advance one step (and re-pause), or run_code/snapshot to inspect state. The test runs to completion when no more steps remain.',
387404
inputSchema: {
388405
type: 'object',
389406
properties: {
@@ -538,16 +555,33 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
538555
}
539556

540557
case 'continue': {
541-
if (!pausedController) throw new Error('No paused test. Run a test first via run_test; this tool becomes available if the test calls pause().')
558+
if (!pausedController) throw new Error('No paused test. Run a test first via run_test or run_step_by_step; this tool becomes available if the test pauses.')
559+
const { timeout = 60000 } = args || {}
542560
return await withSilencedIO(async () => {
543561
pausedController.resolveContinue()
544562
if (!pendingRunPromise) {
545563
return { content: [{ type: 'text', text: JSON.stringify({ status: 'continued' }, null, 2) }] }
546564
}
565+
566+
// Race: test pauses again (step-by-step or another pause()) vs test finishes.
567+
const pausedAgain = new Promise(resolve => pauseEvents.once('paused', () => resolve('paused')))
568+
const completed = pendingRunPromise.then(() => 'completed', () => 'completed')
569+
const which = await Promise.race([
570+
pausedAgain,
571+
completed,
572+
new Promise((_, reject) => setTimeout(() => reject(new Error(`Timeout after ${timeout}ms`)), timeout)),
573+
])
574+
575+
if (which === 'paused') {
576+
const page = await gatherPageBrief()
577+
return { content: [{ type: 'text', text: JSON.stringify({ ...pausedPayload(), page }, null, 2) }] }
578+
}
579+
547580
let runError = null
548581
try { await pendingRunPromise } catch (err) { runError = err }
582+
const file = pendingTestFile
549583
const final = collectRunCompletion(runError?.message)
550-
return { content: [{ type: 'text', text: JSON.stringify(final, null, 2) }] }
584+
return { content: [{ type: 'text', text: JSON.stringify({ ...final, file }, null, 2) }] }
551585
})
552586
}
553587

@@ -671,8 +705,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
671705
const testFile = testFiles[0]
672706

673707
pendingRunResults = []
708+
pendingTestFile = testFile
709+
pendingStepInfo = null
674710
let stepIndex = 0
675-
let lastStepInfo = null
676711

677712
const onAfter = t => {
678713
pendingRunResults.push({
@@ -686,9 +721,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
686721
const onStepAfter = step => {
687722
stepIndex += 1
688723
try {
689-
lastStepInfo = { index: stepIndex, name: step.toString(), status: step.status }
724+
pendingStepInfo = { index: stepIndex, name: step.toString(), status: step.status }
690725
} catch {
691-
lastStepInfo = { index: stepIndex }
726+
pendingStepInfo = { index: stepIndex }
692727
}
693728
if (typeof pauseAt === 'number' && stepIndex === pauseAt) {
694729
pauseNow()
@@ -728,17 +763,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
728763
return {
729764
content: [{
730765
type: 'text',
731-
text: JSON.stringify({
732-
status: 'paused',
733-
file: testFile,
734-
pausedAfter: lastStepInfo,
735-
page,
736-
suggestions: [
737-
'Call snapshot to capture URL/HTML/ARIA/screenshot/console/storage at this point',
738-
'Call run_code to inspect or manipulate state (e.g. return await I.grabText("h1"))',
739-
'Call continue to release the pause and let the test finish',
740-
],
741-
}, null, 2),
766+
text: JSON.stringify({ ...pausedPayload(), page }, null, 2),
742767
}],
743768
}
744769
}
@@ -750,115 +775,95 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
750775
}
751776

752777
case 'run_step_by_step': {
753-
const { test, timeout = 60000, config: configPath } = args
754-
await initCodecept(configPath)
755-
756-
return await withSilencedIO(async () => {
757-
codecept.loadTests()
758-
759-
let testFiles = codecept.testFiles
760-
if (test) {
761-
const testName = normalizePath(test).toLowerCase()
762-
testFiles = codecept.testFiles.filter(f => {
763-
const filePath = normalizePath(f).toLowerCase()
764-
return filePath.includes(testName) || filePath.endsWith(testName)
765-
})
766-
}
767-
768-
if (!testFiles.length) throw new Error(`No tests found matching: ${test}`)
769-
770-
const results = []
771-
const currentSteps = {}
772-
const traceDirs = {}
773-
let currentTestTitle = null
774-
const testFile = testFiles[0]
775-
776-
const onBefore = (t) => {
777-
const traceDir = traceDirFor(t.file, t.title, outputBaseDir())
778-
currentTestTitle = t.title
779-
currentSteps[t.title] = []
780-
traceDirs[t.title] = traceDir
781-
results.push({
782-
test: t.title,
783-
file: t.file,
784-
status: 'running',
785-
steps: [],
786-
})
778+
return await withLock(async () => {
779+
if (pausedController) {
780+
throw new Error('A previous run is still paused. Call "continue" first.')
787781
}
782+
const { test, timeout = 60000, config: configPath } = args || {}
783+
await initCodecept(configPath)
788784

789-
const onAfter = async (t) => {
790-
const r = results.find(x => x.test === t.title)
791-
if (r) {
792-
r.status = t.err ? 'failed' : 'completed'
793-
if (t.err) r.error = t.err.message
785+
return await withSilencedIO(async () => {
786+
codecept.loadTests()
794787

795-
if (t.artifacts?.aiTrace) {
796-
r.traceFile = pathToFileURL(t.artifacts.aiTrace).href
797-
}
798-
if (t.artifacts?.har) r.har = pathToFileURL(t.artifacts.har).href
799-
if (t.artifacts?.trace) r.trace = pathToFileURL(t.artifacts.trace).href
800-
801-
if (!t.artifacts?.aiTrace) {
802-
try {
803-
const helper = pickActingHelper(container.helpers())
804-
const dir = traceDirs[t.title]
805-
if (helper && dir) {
806-
mkdirp.sync(dir)
807-
const captured = await captureSnapshot(helper, { dir, prefix: 'final' })
808-
r.artifacts = artifactsToFileUrls(captured, dir)
809-
const tracePath = writeTraceMarkdown({
810-
dir,
811-
title: t.title,
812-
file: t.file,
813-
durationMs: 0,
814-
commands: (currentSteps[t.title] || []).map(s => s.step),
815-
captured,
816-
error: r.error,
817-
})
818-
r.traceFile = pathToFileURL(tracePath).href
819-
}
820-
} catch {}
821-
}
788+
let testFiles = codecept.testFiles
789+
if (test) {
790+
const testName = normalizePath(test).toLowerCase()
791+
testFiles = codecept.testFiles.filter(f => {
792+
const filePath = normalizePath(f).toLowerCase()
793+
return filePath.includes(testName) || filePath.endsWith(testName)
794+
})
822795
}
823-
currentTestTitle = null
824-
}
825796

826-
const onStepAfter = (step) => {
827-
if (!currentTestTitle || !currentSteps[currentTestTitle]) return
828-
currentSteps[currentTestTitle].push({
829-
step: step.toString(),
830-
status: step.status,
831-
time: step.endTime - step.startTime,
832-
})
833-
const r = results.find(x => x.test === currentTestTitle)
834-
if (r) r.steps = [...currentSteps[currentTestTitle]]
835-
}
797+
if (!testFiles.length) throw new Error(`No tests found matching: ${test}`)
798+
const testFile = testFiles[0]
836799

837-
event.dispatcher.on(event.test.before, onBefore)
838-
event.dispatcher.on(event.test.after, onAfter)
839-
event.dispatcher.on(event.step.after, onStepAfter)
800+
pendingRunResults = []
801+
pendingTestFile = testFile
802+
pendingStepInfo = null
803+
let stepIndex = 0
840804

841-
try {
842-
await Promise.race([
843-
(async () => {
805+
const onAfter = t => {
806+
pendingRunResults.push({
807+
title: t.title,
808+
file: t.file,
809+
status: t.err ? 'failed' : 'passed',
810+
error: t.err?.message,
811+
duration: t.duration,
812+
})
813+
}
814+
const onStepAfter = step => {
815+
stepIndex += 1
816+
try {
817+
pendingStepInfo = { index: stepIndex, name: step.toString(), status: step.status }
818+
} catch {
819+
pendingStepInfo = { index: stepIndex }
820+
}
821+
// Pause after every step — agent calls continue to advance.
822+
pauseNow()
823+
}
824+
event.dispatcher.on(event.test.after, onAfter)
825+
event.dispatcher.on(event.step.after, onStepAfter)
826+
pendingRunCleanup = () => {
827+
try { event.dispatcher.removeListener(event.test.after, onAfter) } catch {}
828+
try { event.dispatcher.removeListener(event.step.after, onStepAfter) } catch {}
829+
pendingRunCleanup = null
830+
}
831+
832+
let runError = null
833+
const runPromise = (async () => {
834+
try {
844835
await codecept.bootstrap()
845836
await codecept.run(testFile)
846-
})(),
837+
} catch (err) {
838+
runError = err
839+
throw err
840+
}
841+
})()
842+
843+
const pausedPromise = new Promise(resolve => pauseEvents.once('paused', () => resolve('paused')))
844+
const completedPromise = runPromise.then(() => 'completed', () => 'completed')
845+
846+
const which = await Promise.race([
847+
completedPromise,
848+
pausedPromise,
847849
new Promise((_, reject) => setTimeout(() => reject(new Error(`Timeout after ${timeout}ms`)), timeout)),
848850
])
849-
} catch (error) {
850-
const lastRunning = results.filter(r => r.status === 'running').pop()
851-
if (lastRunning) {
852-
lastRunning.status = 'failed'
853-
lastRunning.error = error.message
851+
852+
if (which === 'paused') {
853+
pendingRunPromise = runPromise
854+
const page = await gatherPageBrief()
855+
return {
856+
content: [{
857+
type: 'text',
858+
text: JSON.stringify({ ...pausedPayload(), page }, null, 2),
859+
}],
860+
}
854861
}
855-
} finally {
856-
try { event.dispatcher.removeListener(event.test.before, onBefore) } catch {}
857-
try { event.dispatcher.removeListener(event.test.after, onAfter) } catch {}
858-
try { event.dispatcher.removeListener(event.step.after, onStepAfter) } catch {}
859-
}
860862

861-
return { content: [{ type: 'text', text: JSON.stringify({ results, stepByStep: true }, null, 2) }] }
863+
// Test had zero steps (or finished before first pause) — return completion
864+
const final = collectRunCompletion(runError?.message)
865+
return { content: [{ type: 'text', text: JSON.stringify({ ...final, file: testFile }, null, 2) }] }
866+
})
862867
})
863868
}
864869

0 commit comments

Comments
 (0)