NVIDIA · leofang · Feb 4, 2025 · Dec 19, 2024 · Dec 20, 2024 · Jan 20, 2025
diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
@@ -3,11 +3,11 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import ctypes
-import warnings
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import List, Optional
+from warnings import warn
 
 from cuda.core.experimental._device import Device
 from cuda.core.experimental._module import ObjectCode
@@ -23,11 +23,11 @@
 
 
 # Note: this function is reused in the tests
-def _decide_nvjitlink_or_driver():
+def _decide_nvjitlink_or_driver() -> bool:
     """Returns True if falling back to the cuLink* driver APIs."""
     global _driver_ver, _driver, _nvjitlink
     if _driver or _nvjitlink:
-        return
+        return _driver is not None
 
     _driver_ver = handle_return(driver.cuDriverGetVersion())
     _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
@@ -43,7 +43,7 @@ def _decide_nvjitlink_or_driver():
             _nvjitlink = None
 
     if _nvjitlink is None:
-        warnings.warn(
+        warn(
             "nvJitLink is not installed or too old (<12.3). Therefore it is not usable "
             "and the culink APIs will be used instead.",
             stacklevel=3,
@@ -98,78 +98,59 @@ class LinkerOptions:
         will be used.
     max_register_count : int, optional
         Maximum register count.
-        Maps to: ``-maxrregcount=<N>``.
     time : bool, optional
         Print timing information to the info log.
-        Maps to ``-time``.
         Default: False.
     verbose : bool, optional
         Print verbose messages to the info log.
-        Maps to ``-verbose``.
         Default: False.
     link_time_optimization : bool, optional
         Perform link time optimization.
-        Maps to: ``-lto``.
         Default: False.
     ptx : bool, optional
-        Emit PTX after linking instead of CUBIN; only supported with ``-lto``.
-        Maps to ``-ptx``.
+        Emit PTX after linking instead of CUBIN; only supported with ``link_time_optimization=True``.
         Default: False.
     optimization_level : int, optional
         Set optimization level. Only 0 and 3 are accepted.
-        Maps to ``-O<N>``.
     debug : bool, optional
         Generate debug information.
-        Maps to ``-g``
         Default: False.
     lineinfo : bool, optional
         Generate line information.
-        Maps to ``-lineinfo``.
         Default: False.
     ftz : bool, optional
         Flush denormal values to zero.
-        Maps to ``-ftz=<n>``.
         Default: False.
     prec_div : bool, optional
         Use precise division.
-        Maps to ``-prec-div=<n>``.
         Default: True.
     prec_sqrt : bool, optional
         Use precise square root.
-        Maps to ``-prec-sqrt=<n>``.
         Default: True.
     fma : bool, optional
         Use fast multiply-add.
-        Maps to ``-fma=<n>``.
         Default: True.
     kernels_used : List[str], optional
         Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple
         times.
-        Maps to ``-kernels-used=<name>``.
     variables_used : List[str], optional
         Pass a list of variables that are used; any not in the list can be removed.
-        Maps to ``-variables-used=<name>``
     optimize_unused_variables : bool, optional
         Assume that if a variable is not referenced in device code, it can be removed.
-        Maps to: ``-optimize-unused-variables``
         Default: False.
     xptxas : List[str], optional
         Pass options to PTXAS.
-        Maps to: ``-Xptxas=<opt>``.
     split_compile : int, optional
         Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split
         compilation (default).
-        Maps to ``-split-compile=<N>``.
         Default: 1.
     split_compile_extended : int, optional
         A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value.
         Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This
         option can potentially impact performance of the compiled binary.
-        Maps to ``-split-compile-extended=<N>``.
         Default: 1.
     no_cache : bool, optional
         Do not cache the intermediate steps of nvJitLink.
-        Maps to ``-no-cache``.
         Default: False.
     """
 
@@ -290,19 +271,19 @@ def _init_driver(self):
             self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
         if self.ftz is not None:
-            raise ValueError("ftz option is deprecated in the driver API")
+            warn("ftz option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.prec_div is not None:
-            raise ValueError("prec_div option is deprecated in the driver API")
+            warn("prec_div option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.prec_sqrt is not None:
-            raise ValueError("prec_sqrt option is deprecated in the driver API")
+            warn("prec_sqrt option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.fma is not None:
-            raise ValueError("fma options is deprecated in the driver API")
+            warn("fma options is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.kernels_used is not None:
-            raise ValueError("kernels_used is deprecated in the driver API")
+            warn("kernels_used is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.variables_used is not None:
-            raise ValueError("variables_used is deprecated in the driver API")
+            warn("variables_used is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.optimize_unused_variables is not None:
-            raise ValueError("optimize_unused_variables is deprecated in the driver API")
+            warn("optimize_unused_variables is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.xptxas is not None:
             raise ValueError("xptxas option is not supported by the driver API")
         if self.split_compile is not None:

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
@@ -7,6 +7,7 @@
 from typing import List, Optional, Tuple, Union
 
 from cuda.core.experimental._device import Device
+from cuda.core.experimental._linker import Linker, LinkerOptions
 from cuda.core.experimental._module import ObjectCode
 from cuda.core.experimental._utils import (
     _handle_boolean_option,
@@ -31,157 +32,120 @@ class ProgramOptions:
     relocatable_device_code : bool, optional
         Enable (disable) the generation of relocatable device code.
         Default: False
-        Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``)
     extensible_whole_program : bool, optional
         Do extensible whole program compilation of device code.
         Default: False
-        Maps to: ``--extensible-whole-program`` (``-ewp``)
     debug : bool, optional
         Generate debug information. If --dopt is not specified, then turns off all optimizations.
         Default: False
-        Maps to: ``--device-debug`` (``-G``)
     lineinfo: bool, optional
         Generate line-number information.
         Default: False
-        Maps to: ``--generate-line-info`` (``-lineinfo``)
     device_code_optimize : bool, optional
         Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation
         for optimized device code.
         Default: None
-        Maps to: ``--dopt on`` (``-dopt``)
     ptxas_options : Union[str, List[str]], optional
         Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
         For example ["-v", "-O2"].
         Default: None
-        Maps to: ``--ptxas-options <options>`` (``-Xptxas``)
     max_register_count : int, optional
         Specify the maximum amount of registers that GPU functions can use.
         Default: None
-        Maps to: ``--maxrregcount=<N>`` (``-maxrregcount``)
     ftz : bool, optional
         When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
         values.
         Default: False
-        Maps to: ``--ftz={true|false}`` (``-ftz``)
     prec_sqrt : bool, optional
         For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
         Default: True
-        Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``)
     prec_div : bool, optional
         For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
         approximation.
         Default: True
-        Maps to: ``--prec-div={true|false}`` (``-prec-div``)
     fma : bool, optional
         Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
         multiply-add operations.
         Default: True
-        Maps to: ``--fmad={true|false}`` (``-fmad``)
     use_fast_math : bool, optional
         Make use of fast math operations.
         Default: False
-        Maps to: ``--use_fast_math`` (``-use_fast_math``)
     extra_device_vectorization : bool, optional
         Enables more aggressive device code vectorization in the NVVM optimizer.
         Default: False
-        Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``)
     link_time_optimization : bool, optional
         Generate intermediate code for later link-time optimization.
         Default: False
-        Maps to: ``--dlink-time-opt`` (``-dlto``)
     gen_opt_lto : bool, optional
         Run the optimizer passes before generating the LTO IR.
         Default: False
-        Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``)
     define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional
         Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of
         strings, in which case the first element is defined as the second, or a list of strings or tuples.
         Default: None
-        Maps to: ``--define-macro=<def>`` (``-D``)
     undefine_macro : Union[str, List[str]], optional
         Cancel any previous definition of a macro, or list of macros.
         Default: None
-        Maps to: ``--undefine-macro=<def>`` (``-U``)
     include_path : Union[str, List[str]], optional
         Add the directory or directories to the list of directories to be searched for headers.
         Default: None
-        Maps to: ``--include-path=<dir>`` (``-I``)
     pre_include : Union[str, List[str]], optional
         Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
         Default: None
-        Maps to: ``--pre-include=<header>`` (``-include``)
     no_source_include : bool, optional
         Disable the default behavior of adding the directory of each input source to the include path.
         Default: False
-        Maps to: ``--no-source-include`` (``-no-source-include``)
     std : str, optional
         Set language dialect to C++03, C++11, C++14, C++17 or C++20.
         Default: c++17
-        Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``)
     builtin_move_forward : bool, optional
         Provide builtin definitions of std::move and std::forward.
         Default: True
-        Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``)
     builtin_initializer_list : bool, optional
         Provide builtin definitions of std::initializer_list class and member functions.
         Default: True
-        Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``)
     disable_warnings : bool, optional
         Inhibit all warning messages.
         Default: False
-        Maps to: ``--disable-warnings`` (``-w``)
     restrict : bool, optional
         Programmer assertion that all kernel pointer parameters are restrict pointers.
         Default: False
-        Maps to: ``--restrict`` (``-restrict``)
     device_as_default_execution_space : bool, optional
         Treat entities with no execution space annotation as __device__ entities.
         Default: False
-        Maps to: ``--device-as-default-execution-space`` (``-default-device``)
     device_int128 : bool, optional
         Allow the __int128 type in device code.
         Default: False
-        Maps to: ``--device-int128`` (``-device-int128``)
     optimization_info : str, optional
         Provide optimization reports for the specified kind of optimization.
         Default: None
-        Maps to: ``--optimization-info=<kind>`` (``-opt-info``)
     no_display_error_number : bool, optional
         Disable the display of a diagnostic number for warning messages.
         Default: False
-        Maps to: ``--no-display-error-number`` (``-no-err-no``)
     diag_error : Union[int, List[int]], optional
         Emit error for a specified diagnostic message number or comma separated list of numbers.
         Default: None
-        Maps to: ``--diag-error=<error-number>, ...`` (``-diag-error``)
     diag_suppress : Union[int, List[int]], optional
         Suppress a specified diagnostic message number or comma separated list of numbers.
         Default: None
-        Maps to: ``--diag-suppress=<error-number>,…`` (``-diag-suppress``)
     diag_warn : Union[int, List[int]], optional
         Emit warning for a specified diagnostic message number or comma separated lis of numbers.
         Default: None
-        Maps to: ``--diag-warn=<error-number>,…`` (``-diag-warn``)
     brief_diagnostics : bool, optional
         Disable or enable showing source line and column info in a diagnostic.
         Default: False
-        Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``)
     time : str, optional
         Generate a CSV table with the time taken by each compilation phase.
         Default: None
-        Maps to: ``--time=<file-name>`` (``-time``)
     split_compile : int, optional
         Perform compiler optimizations in parallel.
         Default: 1
-        Maps to: ``--split-compile= <number of threads>`` (``-split-compile``)
     fdevice_syntax_only : bool, optional
         Ends device compilation after front-end syntax checking.
         Default: False
-        Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``)
     minimal : bool, optional
         Omit certain language features to reduce compile time for small programs.
         Default: False
-        Maps to: ``--minimal`` (``-minimal``)
     """
 
     arch: Optional[str] = None
@@ -385,7 +349,7 @@ class Program:
     """
 
     class _MembersNeededForFinalize:
-        __slots__ = ("handle",)
+        __slots__ = "handle"
 
         def __init__(self, program_obj, handle):
             self.handle = handle
@@ -396,28 +360,52 @@ def close(self):
                 handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
                 self.handle = None
 
-    __slots__ = ("__weakref__", "_mnff", "_backend", "_options")
-    _supported_code_type = ("c++",)
+    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
+    _supported_code_type = ("c++", "ptx")
     _supported_target_type = ("ptx", "cubin", "ltoir")
 
     def __init__(self, code, code_type, options: ProgramOptions = None):
         self._mnff = Program._MembersNeededForFinalize(self, None)
 
         self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
+        code_type = code_type.lower()
 
         if code_type not in self._supported_code_type:
             raise NotImplementedError
 
-        if code_type.lower() == "c++":
+        if code_type == "c++":
             if not isinstance(code, str):
                 raise TypeError
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
             self._backend = "nvrtc"
+
+        elif code_type == "ptx":
+            if not isinstance(code, str):
+                raise TypeError
+            self._linker = Linker(
+                ObjectCode(code.encode(), code_type), options=self._translate_program_options(options)
+            )
+            self._backend = "linker"
         else:
             raise NotImplementedError
 
+    def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
+        return LinkerOptions(
+            arch=options.arch,
+            max_register_count=options.max_register_count,
+            time=options.time,
+            debug=options.debug,
+            lineinfo=options.lineinfo,
+            ftz=options.ftz,
+            prec_div=options.prec_div,
+            prec_sqrt=options.prec_sqrt,
+            fma=options.fma,
+            link_time_optimization=options.link_time_optimization,
+            split_compile=options.split_compile,
+        )
+
     def close(self):
         """Destroy this program."""
         self._mnff.close()
-        self._mnff.close()
+        self._mnff.close()
+        if self._linker:
+            self._linker.close()
-        self._mnff.close()
+        self._mnff.close()
+        if self._linker:
+            self._linker.close()
@@ -481,6 +469,9 @@ def compile(self, target_type, name_expressions=(), logs=None):
 
             return ObjectCode(data, target_type, symbol_mapping=symbol_mapping)
 
+        if self._backend == "linker":
+            return self._linker.link(target_type)
+
     @property
     def backend(self):
         """Return the backend type string associated with this program."""