Skip to content

Commit 2df0c31

Browse files
committed
cuda.core.system: Better checks for when we expect APIs to be unsupported
1 parent ce333b6 commit 2df0c31

8 files changed

Lines changed: 208 additions & 227 deletions

File tree

cuda_bindings/tests/nvml/conftest.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
33

44
from collections import namedtuple
5+
from contextlib import contextmanager
56

67
import pytest
78
from cuda.bindings import _nvml as nvml
@@ -128,3 +129,39 @@ def pci_info(ngpus, handles):
128129
pci_info = [nvml.device_get_pci_info_v3(handles[i]) for i in range(ngpus)]
129130
assert len(pci_info) == ngpus
130131
return pci_info
132+
133+
134+
@contextmanager
135+
def unsupported_before(device: int, expected_device_arch: nvml.DeviceArch | str | None):
136+
device_arch = nvml.device_get_architecture(device)
137+
138+
if isinstance(expected_device_arch, nvml.DeviceArch):
139+
expected_device_arch_int = int(expected_device_arch)
140+
elif expected_device_arch == "FERMI":
141+
expected_device_arch_int = 1
142+
else:
143+
expected_device_arch_int = 0
144+
145+
if expected_device_arch is None or expected_device_arch == "HAS_INFOROM" or device_arch == nvml.DeviceArch.UNKNOWN:
146+
# In this case, we don't /know/ if it will fail, but we are ok if it
147+
# does or does not.
148+
149+
# TODO: There are APIs that are documented as supported only if the
150+
# device has an InfoROM, but I couldn't find a way to detect that. For
151+
# now, they are just handled as "possibly failing".
152+
153+
try:
154+
yield
155+
except nvml.NotSupportedError:
156+
pytest.skip(
157+
f"Unsupported call for device architecture {nvml.DeviceArch(device_arch).name} "
158+
f"on device '{nvml.device_get_name(device)}'"
159+
)
160+
elif int(device_arch) < expected_device_arch_int:
161+
# In this case, we /know/ it will fail, and we want to assert that it does.
162+
with pytest.raises(nvml.NotSupportedError):
163+
yield
164+
pytest.skip(f"Unsupported before {expected_device_arch.name}, got {nvml.device_get_name(device)}")
165+
else:
166+
# In this case, we /know/ it should work, and if it fails, the test should fail.
167+
yield

cuda_bindings/tests/nvml/test_compute_mode.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import pytest
88
from cuda.bindings import _nvml as nvml
99

10+
from .conftest import unsupported_before
11+
1012
COMPUTE_MODES = [
1113
nvml.ComputeMode.COMPUTEMODE_DEFAULT,
1214
nvml.ComputeMode.COMPUTEMODE_PROHIBITED,
@@ -16,18 +18,11 @@
1618

1719
@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
1820
def test_compute_mode_supported_nonroot(all_devices):
19-
skip_reasons = set()
2021
for device in all_devices:
21-
try:
22+
with unsupported_before(device, None):
2223
original_compute_mode = nvml.device_get_compute_mode(device)
23-
except nvml.NotSupportedError:
24-
skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
25-
continue
2624

2725
for cm in COMPUTE_MODES:
2826
with pytest.raises(nvml.NoPermissionError):
2927
nvml.device_set_compute_mode(device, cm)
3028
assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
31-
32-
if skip_reasons:
33-
pytest.skip(" ; ".join(skip_reasons))

cuda_bindings/tests/nvml/test_gpu.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from cuda.bindings import _nvml as nvml
66

77
from . import util
8+
from .conftest import unsupported_before
89

910

1011
def test_gpu_get_module_id(nvml_init):
@@ -23,23 +24,14 @@ def test_gpu_get_module_id(nvml_init):
2324

2425

2526
def test_gpu_get_platform_info(all_devices):
26-
skip_reasons = set()
2727
for device in all_devices:
2828
if util.is_vgpu(device):
29-
skip_reasons.add(f"Not supported on vGPU device {device}")
30-
continue
29+
pytest.skip(f"Not supported on vGPU device {device}")
3130

32-
# TODO
33-
# if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
34-
# test_utils.skip_test("Not supported on chip before Blackwell")
31+
# Documentation says Blackwell or newer only, but this does seem to pass
32+
# on some newer GPUs.
3533

36-
try:
34+
with unsupported_before(device, None):
3735
platform_info = nvml.device_get_platform_info(device)
38-
except nvml.NotSupportedError:
39-
skip_reasons.add(f"Not supported returned, linkely NVLink is disable for {device}")
40-
continue
4136

4237
assert isinstance(platform_info, nvml.PlatformInfo_v2)
43-
44-
if skip_reasons:
45-
pytest.skip(" ; ".join(skip_reasons))

cuda_bindings/tests/nvml/test_pynvml.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from cuda.bindings import _nvml as nvml
1111

1212
from . import util
13+
from .conftest import unsupported_before
1314

1415
XFAIL_LEGACY_NVLINK_MSG = "Legacy NVLink test expected to fail."
1516

@@ -66,7 +67,8 @@ def test_device_get_handle_by_pci_bus_id(ngpus, pci_info):
6667
def test_device_get_memory_affinity(handles, scope):
6768
size = 1024
6869
for handle in handles:
69-
node_set = nvml.device_get_memory_affinity(handle, size, scope)
70+
with unsupported_before(handle, nvml.DeviceArch.KEPLER):
71+
node_set = nvml.device_get_memory_affinity(handle, size, scope)
7072
assert node_set is not None
7173
assert len(node_set) == size
7274

@@ -76,7 +78,8 @@ def test_device_get_memory_affinity(handles, scope):
7678
def test_device_get_cpu_affinity_within_scope(handles, scope):
7779
size = 1024
7880
for handle in handles:
79-
cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
81+
with unsupported_before(handle, nvml.DeviceArch.KEPLER):
82+
cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
8083
assert cpu_set is not None
8184
assert len(cpu_set) == size
8285

@@ -136,22 +139,22 @@ def test_device_get_p2p_status(handles, index):
136139

137140
def test_device_get_power_usage(ngpus, handles):
138141
for i in range(ngpus):
139-
try:
142+
# Note: documentation says this is supported on Fermi or newer,
143+
# but in practice it fails on some later architectures.
144+
with unsupported_before(handles[i], None):
140145
power_mwatts = nvml.device_get_power_usage(handles[i])
141-
except nvml.NotSupportedError:
142-
pytest.skip("device_get_power_usage not supported")
143146
assert power_mwatts >= 0.0
144147

145148

146149
def test_device_get_total_energy_consumption(ngpus, handles):
147150
for i in range(ngpus):
148-
try:
151+
with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
149152
energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
150-
except nvml.NotSupportedError:
151-
pytest.skip("device_get_total_energy_consumption not supported")
153+
152154
for j in range(10): # idle for 150 ms
153155
time.sleep(0.015) # and check for increase every 15 ms
154-
energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
156+
with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
157+
energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
155158
assert energy_mjoules2 >= energy_mjoules1
156159
if energy_mjoules2 > energy_mjoules1:
157160
break
@@ -182,7 +185,8 @@ def test_device_get_memory_info(ngpus, handles):
182185

183186
def test_device_get_utilization_rates(ngpus, handles):
184187
for i in range(ngpus):
185-
urate = nvml.device_get_utilization_rates(handles[i])
188+
with unsupported_before(handles[i], "FERMI"):
189+
urate = nvml.device_get_utilization_rates(handles[i])
186190
assert urate.gpu >= 0
187191
assert urate.memory >= 0
188192

@@ -239,7 +243,8 @@ def test_device_get_utilization_rates(ngpus, handles):
239243

240244
def test_device_get_pcie_throughput(ngpus, handles):
241245
for i in range(ngpus):
242-
tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
246+
with unsupported_before(handles[i], nvml.DeviceArch.MAXWELL):
247+
tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
243248
assert tx_bytes_tp >= 0
244249
rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES)
245250
assert rx_bytes_tp >= 0
@@ -271,10 +276,10 @@ def test_device_get_pcie_throughput(ngpus, handles):
271276
def test_device_get_nvlink_capability(ngpus, handles, cap_type):
272277
for i in range(ngpus):
273278
for j in range(nvml.NVLINK_MAX_LINKS):
274-
try:
279+
# By the documentation, this should be supported on PASCAL or newer,
280+
# but this also seems to fail on newer.
281+
with unsupported_before(handles[i], None):
275282
cap = nvml.device_get_nvlink_capability(handles[i], j, cap_type)
276-
except nvml.NotSupportedError:
277-
pytest.skip("NVLink capability not supported")
278283
assert cap >= 0
279284

280285

cuda_core/cuda/core/system/_device.pyx

Lines changed: 13 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ ClocksEventReasons = nvml.ClocksEventReasons
2121
ClockType = nvml.ClockType
2222
CoolerControl = nvml.CoolerControl
2323
CoolerTarget = nvml.CoolerTarget
24+
DeviceArch = nvml.DeviceArch
2425
EventType = nvml.EventType
2526
FanControlPolicy = nvml.FanControlPolicy
2627
FieldId = nvml.FieldId
@@ -45,41 +46,6 @@ include "_performance.pxi"
4546
include "_temperature.pxi"
4647

4748

48-
class DeviceArchitecture:
49-
"""
50-
Device architecture enumeration.
51-
"""
52-
53-
def __init__(self, architecture: int):
54-
try:
55-
self._architecture = nvml.DeviceArch(architecture)
56-
except ValueError:
57-
self._architecture = None
58-
59-
@property
60-
def id(self) -> int:
61-
"""
62-
The numeric id of the device architecture.
63-
64-
Returns -1 if the device is unknown.
65-
"""
66-
if self._architecture is None:
67-
return -1
68-
return int(self._architecture)
69-
70-
@property
71-
def name(self) -> str:
72-
"""
73-
The name of the device architecture.
74-
75-
Returns "Unlisted" if the device is unknown.
76-
"""
77-
if self._architecture is None:
78-
return "Unlisted"
79-
name = self._architecture.name
80-
return name[name.rfind("_") + 1 :].title()
81-
82-
8349
cdef class MemoryInfo:
8450
"""
8551
Memory allocation information for a device.
@@ -952,16 +918,15 @@ cdef class Device:
952918
return [Pstates(x) for x in nvml.device_get_supported_performance_states(self._handle)]
953919

954920
@property
955-
def architecture(self) -> DeviceArchitecture:
921+
def arch(self) -> DeviceArch:
956922
"""
957-
Device architecture. For example, a Tesla V100 will report
958-
``DeviceArchitecture.name == "Volta"``, and RTX A6000 will report
959-
``DeviceArchitecture.name == "Ampere"``. If the device returns an
960-
architecture that is unknown to NVML then ``DeviceArchitecture.name ==
961-
"Unknown"`` is reported, whereas an architecture that is unknown to
962-
cuda.core.system is reported as ``DeviceArchitecture.name == "Unlisted"``.
923+
Device architecture.
924+
925+
For example, a Tesla V100 will report ``DeviceArch.name ==
926+
"VOLTA"``, and RTX A6000 will report ``DeviceArch.name ==
927+
"AMPERE"``.
963928
"""
964-
return DeviceArchitecture(nvml.device_get_architecture(self._handle))
929+
return DeviceArch(nvml.device_get_architecture(self._handle))
965930

966931
@property
967932
def bar1_memory_info(self) -> BAR1MemoryInfo:
@@ -1027,6 +992,8 @@ cdef class Device:
1027992
"""
1028993
Retrieves the globally unique board serial number associated with this
1029994
device's board.
995+
996+
For all products with an InfoROM.
1030997
"""
1031998
return nvml.device_get_serial(self._handle)
1032999

@@ -1268,6 +1235,8 @@ cdef class Device:
12681235
"""
12691236
Get the addressing mode of the device.
12701237

1238+
For Turing™ or newer fully supported devices.
1239+
12711240
Addressing modes can be one of:
12721241

12731242
- :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_HMM`: System allocated
@@ -1486,7 +1455,7 @@ __all__ = [
14861455
"CoolerInfo",
14871456
"CoolerTarget",
14881457
"Device",
1489-
"DeviceArchitecture",
1458+
"DeviceArch",
14901459
"DeviceAttributes",
14911460
"DeviceEvents",
14921461
"EventData",

cuda_core/docs/source/api.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ CUDA system information and NVIDIA Management Library (NVML)
103103
system.CoolerControl
104104
system.CoolerInfo
105105
system.CoolerTarget
106-
system.DeviceArchitecture
106+
system.DeviceArch
107107
system.DeviceAttributes
108108
system.DeviceEvents
109109
system.EventData

cuda_core/tests/system/conftest.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,48 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55

6+
from contextlib import contextmanager
7+
68
import pytest
79
from cuda.core import system
810

911
skip_if_nvml_unsupported = pytest.mark.skipif(
1012
not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE, reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+"
1113
)
14+
15+
16+
@contextmanager
17+
def unsupported_before(device: system.Device, expected_device_arch: system.DeviceArch | str | None):
18+
device_arch = device.arch
19+
20+
if isinstance(expected_device_arch, system.DeviceArch):
21+
expected_device_arch_int = int(expected_device_arch)
22+
elif expected_device_arch == "FERMI":
23+
expected_device_arch_int = 1
24+
else:
25+
expected_device_arch_int = 0
26+
27+
if (
28+
expected_device_arch is None
29+
or expected_device_arch == "HAS_INFOROM"
30+
or device_arch == system.DeviceArch.UNKNOWN
31+
):
32+
# In this case, we don't /know/ if it will fail, but we are ok if it
33+
# does or does not.
34+
35+
# TODO: There are APIs that are documented as supported only if the
36+
# device has an InfoROM, but I couldn't find a way to detect that. For now, they
37+
# are just handled as "possibly failing".
38+
39+
try:
40+
yield
41+
except system.NotSupportedError:
42+
pytest.skip(f"Unsupported call for device architecture {device_arch.name} on device '{device.name}'")
43+
elif int(device_arch) < expected_device_arch_int:
44+
# In this case, we /know/ it will fail, and we want to assert that it does.
45+
with pytest.raises(system.NotSupportedError):
46+
yield
47+
pytest.skip(f"Unsupported before {expected_device_arch.name}, got {device_arch.name}")
48+
else:
49+
# In this case, we /know/ it should work, and if it fails, the test should fail.
50+
yield

0 commit comments

Comments
 (0)