Skip to content

Commit 1b796df

Browse files
authored
chore: Update expected test outputs to reflect upstream config changes (#897)
* Code changes to reflect upstream io.yaml changes
* Fix broken tests and add hints for Claude
* Update expected test outputs to reflect upstream config changes
* More updated expected test outputs
1 parent b92474d commit 1b796df

38 files changed

Lines changed: 99 additions & 23385 deletions

mellea/stdlib/components/intrinsic/rag.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,14 @@ def check_answerability(
3030
for answerability checks.
3131
3232
Returns:
33-
Answerability score as a floating-point value from 0 to 1.
33+
A string value of either "answerable" or "unanswerable"
3434
"""
3535
result_json = call_intrinsic(
3636
"answerability",
3737
context.add(Message("user", question, documents=list(documents))),
3838
backend,
3939
)
40-
return result_json["answerability_likelihood"]
40+
return result_json["answerability"]
4141

4242

4343
def rewrite_question(
@@ -146,7 +146,10 @@ def check_context_relevance(
146146
intrinsic.
147147
148148
Returns:
149-
Context relevance score as a floating-point value from 0 to 1.
149+
Context relevance judgement as one of the following strings:
150+
- "relevant"
151+
- "irrelevant"
152+
- "partially relevant"
150153
"""
151154
result_json = call_intrinsic(
152155
"context_relevance",
@@ -180,7 +183,7 @@ def flag_hallucinated_content(
180183
181184
Returns:
182185
List of records with the following fields: ``response_begin``,
183-
``response_end``, ``response_text``, ``faithfulness_likelihood``,
186+
``response_end``, ``response_text``, ``faithfulness``,
184187
``explanation``.
185188
"""
186189
result_json = call_intrinsic(

test/formatters/granite/test_intrinsics_formatters.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
)
3030
from mellea.formatters.granite.base import util as base_util
3131
from mellea.formatters.granite.intrinsics import json_util, util as intrinsics_util
32-
from test.predicates import require_gpu
3332

3433

3534
def _read_file(name):
@@ -434,6 +433,9 @@ def test_canned_input(yaml_json_combo_no_alora):
434433
assert after_json == expected_json
435434

436435

436+
# PLEASE DO NOT REMOVE THIS DECORATOR.
437+
# This decorator activates the network blocking features of the pytest-recording plugin.
438+
@pytest.mark.block_network
437439
def test_openai_compat(yaml_json_combo_no_alora):
438440
"""
439441
Verify that the dataclasses for intrinsics chat completions can be directly passed
@@ -573,12 +575,12 @@ def _round_floats(json_data, num_digits: int = 2):
573575
return result
574576

575577

578+
# THIS TEST DOES NOT REQUIRE A GPU.
579+
# Please do not mark it with @require_gpu.
580+
# THIS TEST USES gh_run TO SKIP EXPENSIVE WORK WHEN RUNNING ON CI.
581+
# Please do not mark this test with @pytest.mark.skipif(os.environ.get("CICD"))
576582
@pytest.mark.huggingface
577583
@pytest.mark.e2e
578-
@require_gpu(min_vram_gb=12)
579-
@pytest.mark.skipif(
580-
int(os.environ.get("CICD", 0)) == 1, reason="Skipping HuggingFace tests in CI"
581-
)
582584
def test_run_transformers(yaml_json_combo_with_model, gh_run):
583585
"""
584586
Run the target model end-to-end on transformers.
@@ -610,30 +612,30 @@ def test_run_transformers(yaml_json_combo_with_model, gh_run):
610612
pytest.xfail("Downloads fail on CI server because repo is private")
611613

612614
# Load IO config YAML for this model
613-
io_yaml_path = lora_dir / "io.yaml"
614-
if not os.path.exists(io_yaml_path):
615-
# Use local files until proper configs are up on Hugging Face
616-
io_yaml_path = cfg.yaml_file
615+
io_yaml_path = cfg.yaml_file if cfg.yaml_file else lora_dir / "io.yaml"
617616
rewriter = IntrinsicsRewriter(config_file=io_yaml_path)
618617
result_processor = IntrinsicsResultProcessor(config_file=io_yaml_path)
619618

620619
# Prepare inputs for inference
621620
transformed_input = rewriter.transform(model_input, **transform_kwargs)
622621

623-
if gh_run:
622+
if gh_run == 1:
624623
pytest.xfail(
625624
"Skipping end-to-end model evaluation for this test case because it takes "
626625
"more than 5 seconds. "
627-
"Mellea's CI fails the entire run without an error message if all 500+ "
626+
"Mellea's CI fails the entire run without an error message if all 1900+ "
628627
"tests combined take more than 15 minutes to complete. "
629-
"That works out to 1.8 seconds per test. "
628+
"That works out to 0.5 seconds per test. "
630629
"Any test that takes more than 5 seconds needs to disable or shortcut "
631630
"itself during CI, or all of Mellea's development infrastructure will "
632631
"grind to a halt."
633632
)
634633

635634
# Run the model using Hugging Face APIs
636635
model, tokenizer = base_util.load_transformers_lora(lora_dir)
636+
if torch.cuda.is_available(): # Use GPU if available
637+
model.cuda()
638+
637639
generate_input, other_input = (
638640
base_util.chat_completion_request_to_transformers_inputs(
639641
transformed_input.model_dump(), tokenizer, model

test/formatters/granite/testdata/input_yaml/answerability.yaml

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,10 @@ response_format: |
77
"enum": ["answerable", "unanswerable"]
88
}
99
transformations:
10-
# Convert categorical answer to continuous value by decoding logprobs
11-
- type: likelihood
12-
categories_to_values:
13-
"answerable": 1.0
14-
"unanswerable": 0.0
15-
input_path: []
1610
# Convert scalar value to a record for consistency with other intrinsics
1711
- type: nest
1812
input_path: []
19-
field_name: "answerability_likelihood"
13+
field_name: "answerability"
2014
instruction: ~
2115
parameters:
2216
# "unanswerable" can be 6 tokens at high temperatures

test/formatters/granite/testdata/test_canned_input/answerability_answerable.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,5 @@
2626
}
2727
}
2828
},
29-
"max_completion_tokens": 6,
30-
"logprobs": true,
31-
"top_logprobs": 10
29+
"max_completion_tokens": 6
3230
}

test/formatters/granite/testdata/test_canned_input/answerability_extra_params.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,5 @@
1919
},
2020
"frequency_penalty": 0.1,
2121
"n": 5,
22-
"max_completion_tokens": 6,
23-
"logprobs": true,
24-
"top_logprobs": 10
22+
"max_completion_tokens": 6
2523
}

test/formatters/granite/testdata/test_canned_input/answerability_simple.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,5 @@
1616
}
1717
}
1818
},
19-
"max_completion_tokens": 6,
20-
"logprobs": true,
21-
"top_logprobs": 10
19+
"max_completion_tokens": 6
2220
}

test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,5 @@
3030
}
3131
}
3232
},
33-
"max_completion_tokens": 6,
34-
"logprobs": true,
35-
"top_logprobs": 10
33+
"max_completion_tokens": 6
3634
}

test/formatters/granite/testdata/test_canned_input/context_relevance.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,5 @@
3030
]
3131
}
3232
}
33-
},
34-
"logprobs": true,
35-
"top_logprobs": 10
33+
}
3634
}

test/formatters/granite/testdata/test_canned_input/gpt_oss_answerability.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,5 @@
2424
}
2525
}
2626
},
27-
"max_completion_tokens": 25,
28-
"logprobs": true,
29-
"top_logprobs": 10
27+
"max_completion_tokens": 25
3028
}

test/formatters/granite/testdata/test_canned_input/gpt_oss_hallucination_detection.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,5 @@
6464
}
6565
},
6666
"temperature": 0.0,
67-
"max_completion_tokens": 4096,
68-
"logprobs": true,
69-
"top_logprobs": 10
67+
"max_completion_tokens": 4096
7068
}

0 commit comments

Comments
 (0)