Skip to content

Commit 1b796df

Browse files
authored
chore: Update expected test outputs to reflect upstream config changes (#897)
* Code changes to reflect upstream io.yaml changes
* Fix broken tests and add hints for Claude
* Update expected test outputs to reflect upstream config changes
* More updated expected test outputs
1 parent b92474d commit 1b796df

38 files changed

Lines changed: 99 additions & 23385 deletions

mellea/stdlib/components/intrinsic/rag.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,14 @@ def check_answerability(
3030
for answerability checks.
3131
3232
Returns:
33-
Answerability score as a floating-point value from 0 to 1.
33+
A string value of either "answerable" or "unanswerable"
3434
"""
3535
result_json = call_intrinsic(
3636
"answerability",
3737
context.add(Message("user", question, documents=list(documents))),
3838
backend,
3939
)
40-
return result_json["answerability_likelihood"]
40+
return result_json["answerability"]
4141

4242

4343
def rewrite_question(
@@ -146,7 +146,10 @@ def check_context_relevance(
146146
intrinsic.
147147
148148
Returns:
149-
Context relevance score as a floating-point value from 0 to 1.
149+
Context relevance judgement as one of the following strings:
150+
- "relevant"
151+
- "irrelevant"
152+
- "partially relevant"
150153
"""
151154
result_json = call_intrinsic(
152155
"context_relevance",
@@ -180,7 +183,7 @@ def flag_hallucinated_content(
180183
181184
Returns:
182185
List of records with the following fields: ``response_begin``,
183-
``response_end``, ``response_text``, ``faithfulness_likelihood``,
186+
``response_end``, ``response_text``, ``faithfulness``,
184187
``explanation``.
185188
"""
186189
result_json = call_intrinsic(

test/formatters/granite/test_intrinsics_formatters.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
)
3030
from mellea.formatters.granite.base import util as base_util
3131
from mellea.formatters.granite.intrinsics import json_util, util as intrinsics_util
32-
from test.predicates import require_gpu
3332

3433

3534
def _read_file(name):
@@ -434,6 +433,9 @@ def test_canned_input(yaml_json_combo_no_alora):
434433
assert after_json == expected_json
435434

436435

436+
# PLEASE DO NOT REMOVE THIS DECORATOR.
437+
# This decorator activates the network blocking features of the pytest-recording plugin.
438+
@pytest.mark.block_network
437439
def test_openai_compat(yaml_json_combo_no_alora):
438440
"""
439441
Verify that the dataclasses for intrinsics chat completions can be directly passed
@@ -573,12 +575,12 @@ def _round_floats(json_data, num_digits: int = 2):
573575
return result
574576

575577

578+
# THIS TEST DOES NOT REQUIRE A GPU.
579+
# Please do not mark it with @require_gpu.
580+
# THIS TEST USES gh_run TO SKIP EXPENSIVE WORK WHEN RUNNING ON CI.
581+
# Please do not mark this test with @pytest.mark.skipif(os.environ.get("CICD"))
576582
@pytest.mark.huggingface
577583
@pytest.mark.e2e
578-
@require_gpu(min_vram_gb=12)
579-
@pytest.mark.skipif(
580-
int(os.environ.get("CICD", 0)) == 1, reason="Skipping HuggingFace tests in CI"
581-
)
582584
def test_run_transformers(yaml_json_combo_with_model, gh_run):
583585
"""
584586
Run the target model end-to-end on transformers.
@@ -610,30 +612,30 @@ def test_run_transformers(yaml_json_combo_with_model, gh_run):
610612
pytest.xfail("Downloads fail on CI server because repo is private")
611613

612614
# Load IO config YAML for this model
613-
io_yaml_path = lora_dir / "io.yaml"
614-
if not os.path.exists(io_yaml_path):
615-
# Use local files until proper configs are up on Hugging Face
616-
io_yaml_path = cfg.yaml_file
615+
io_yaml_path = cfg.yaml_file if cfg.yaml_file else lora_dir / "io.yaml"
617616
rewriter = IntrinsicsRewriter(config_file=io_yaml_path)
618617
result_processor = IntrinsicsResultProcessor(config_file=io_yaml_path)
619618

620619
# Prepare inputs for inference
621620
transformed_input = rewriter.transform(model_input, **transform_kwargs)
622621

623-
if gh_run:
622+
if gh_run == 1:
624623
pytest.xfail(
625624
"Skipping end-to-end model evaluation for this test case because it takes "
626625
"more than 5 seconds. "
627-
"Mellea's CI fails the entire run without an error message if all 500+ "
626+
"Mellea's CI fails the entire run without an error message if all 1900+ "
628627
"tests combined take more than 15 minutes to complete. "
629-
"That works out to 1.8 seconds per test. "
628+
"That works out to 0.5 seconds per test. "
630629
"Any test that takes more than 5 seconds needs to disable or shortcut "
631630
"itself during CI, or all of Mellea's development infrastructure will "
632631
"grind to a halt."
633632
)
634633

635634
# Run the model using Hugging Face APIs
636635
model, tokenizer = base_util.load_transformers_lora(lora_dir)
636+
if torch.cuda.is_available(): # Use GPU if available
637+
model.cuda()
638+
637639
generate_input, other_input = (
638640
base_util.chat_completion_request_to_transformers_inputs(
639641
transformed_input.model_dump(), tokenizer, model

test/formatters/granite/testdata/input_yaml/answerability.yaml

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,10 @@ response_format: |
77
"enum": ["answerable", "unanswerable"]
88
}
99
transformations:
10-
# Convert categorical answer to continuous value by decoding logprobs
11-
- type: likelihood
12-
categories_to_values:
13-
"answerable": 1.0
14-
"unanswerable": 0.0
15-
input_path: []
1610
# Convert scalar value to a record for consistency with other intrinsics
1711
- type: nest
1812
input_path: []
19-
field_name: "answerability_likelihood"
13+
field_name: "answerability"
2014
instruction: ~
2115
parameters:
2216
# "unanswerable" can be 6 tokens at high temperatures

test/formatters/granite/testdata/test_canned_input/answerability_answerable.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,5 @@
2626
}
2727
}
2828
},
29-
"max_completion_tokens": 6,
30-
"logprobs": true,
31-
"top_logprobs": 10
29+
"max_completion_tokens": 6
3230
}

test/formatters/granite/testdata/test_canned_input/answerability_extra_params.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,5 @@
1919
},
2020
"frequency_penalty": 0.1,
2121
"n": 5,
22-
"max_completion_tokens": 6,
23-
"logprobs": true,
24-
"top_logprobs": 10
22+
"max_completion_tokens": 6
2523
}

test/formatters/granite/testdata/test_canned_input/answerability_simple.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,5 @@
1616
}
1717
}
1818
},
19-
"max_completion_tokens": 6,
20-
"logprobs": true,
21-
"top_logprobs": 10
19+
"max_completion_tokens": 6
2220
}

test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,5 @@
3030
}
3131
}
3232
},
33-
"max_completion_tokens": 6,
34-
"logprobs": true,
35-
"top_logprobs": 10
33+
"max_completion_tokens": 6
3634
}

test/formatters/granite/testdata/test_canned_input/context_relevance.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,5 @@
3030
]
3131
}
3232
}
33-
},
34-
"logprobs": true,
35-
"top_logprobs": 10
33+
}
3634
}

test/formatters/granite/testdata/test_canned_input/gpt_oss_answerability.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,5 @@
2424
}
2525
}
2626
},
27-
"max_completion_tokens": 25,
28-
"logprobs": true,
29-
"top_logprobs": 10
27+
"max_completion_tokens": 25
3028
}

test/formatters/granite/testdata/test_canned_input/gpt_oss_hallucination_detection.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,5 @@
6464
}
6565
},
6666
"temperature": 0.0,
67-
"max_completion_tokens": 4096,
68-
"logprobs": true,
69-
"top_logprobs": 10
67+
"max_completion_tokens": 4096
7068
}

0 commit comments

Comments
 (0)