Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 12 additions & 31 deletions cuda_core/cuda/core/experimental/_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import ctypes
import warnings
import weakref
from contextlib import contextmanager
from dataclasses import dataclass
from typing import List, Optional
from warnings import warn

from cuda.core.experimental._device import Device
from cuda.core.experimental._module import ObjectCode
Expand All @@ -23,11 +23,11 @@


# Note: this function is reused in the tests
def _decide_nvjitlink_or_driver():
def _decide_nvjitlink_or_driver() -> bool:
"""Returns True if falling back to the cuLink* driver APIs."""
global _driver_ver, _driver, _nvjitlink
if _driver or _nvjitlink:
return
return _driver is not None

_driver_ver = handle_return(driver.cuDriverGetVersion())
_driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
Expand All @@ -43,7 +43,7 @@ def _decide_nvjitlink_or_driver():
_nvjitlink = None

if _nvjitlink is None:
warnings.warn(
warn(
"nvJitLink is not installed or too old (<12.3). Therefore it is not usable "
"and the culink APIs will be used instead.",
stacklevel=3,
Expand Down Expand Up @@ -98,78 +98,59 @@ class LinkerOptions:
will be used.
max_register_count : int, optional
Maximum register count.
Maps to: ``-maxrregcount=<N>``.
time : bool, optional
Print timing information to the info log.
Maps to ``-time``.
Default: False.
verbose : bool, optional
Print verbose messages to the info log.
Maps to ``-verbose``.
Default: False.
link_time_optimization : bool, optional
Perform link time optimization.
Maps to: ``-lto``.
Default: False.
ptx : bool, optional
Emit PTX after linking instead of CUBIN; only supported with ``-lto``.
Maps to ``-ptx``.
Emit PTX after linking instead of CUBIN; only supported with ``link_time_optimization=True``.
Default: False.
optimization_level : int, optional
Set optimization level. Only 0 and 3 are accepted.
Maps to ``-O<N>``.
debug : bool, optional
Generate debug information.
Maps to ``-g``
Default: False.
lineinfo : bool, optional
Generate line information.
Maps to ``-lineinfo``.
Default: False.
ftz : bool, optional
Flush denormal values to zero.
Maps to ``-ftz=<n>``.
Default: False.
prec_div : bool, optional
Use precise division.
Maps to ``-prec-div=<n>``.
Default: True.
prec_sqrt : bool, optional
Use precise square root.
Maps to ``-prec-sqrt=<n>``.
Default: True.
fma : bool, optional
Use fast multiply-add.
Maps to ``-fma=<n>``.
Default: True.
kernels_used : List[str], optional
Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple
times.
Maps to ``-kernels-used=<name>``.
variables_used : List[str], optional
Pass a list of variables that are used; any not in the list can be removed.
Maps to ``-variables-used=<name>``
optimize_unused_variables : bool, optional
Assume that if a variable is not referenced in device code, it can be removed.
Maps to: ``-optimize-unused-variables``
Default: False.
xptxas : List[str], optional
Pass options to PTXAS.
Maps to: ``-Xptxas=<opt>``.
split_compile : int, optional
Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split
compilation (default).
Maps to ``-split-compile=<N>``.
Default: 1.
split_compile_extended : int, optional
A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value.
Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This
option can potentially impact performance of the compiled binary.
Maps to ``-split-compile-extended=<N>``.
Default: 1.
no_cache : bool, optional
Do not cache the intermediate steps of nvJitLink.
Maps to ``-no-cache``.
Default: False.
"""

Expand Down Expand Up @@ -290,19 +271,19 @@ def _init_driver(self):
self.formatted_options.append(1)
self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
if self.ftz is not None:
raise ValueError("ftz option is deprecated in the driver API")
warn("ftz option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
if self.prec_div is not None:
raise ValueError("prec_div option is deprecated in the driver API")
warn("prec_div option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
if self.prec_sqrt is not None:
raise ValueError("prec_sqrt option is deprecated in the driver API")
warn("prec_sqrt option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
if self.fma is not None:
raise ValueError("fma options is deprecated in the driver API")
warn("fma options is deprecated in the driver API", DeprecationWarning, stacklevel=3)
if self.kernels_used is not None:
raise ValueError("kernels_used is deprecated in the driver API")
warn("kernels_used is deprecated in the driver API", DeprecationWarning, stacklevel=3)
if self.variables_used is not None:
raise ValueError("variables_used is deprecated in the driver API")
warn("variables_used is deprecated in the driver API", DeprecationWarning, stacklevel=3)
if self.optimize_unused_variables is not None:
raise ValueError("optimize_unused_variables is deprecated in the driver API")
warn("optimize_unused_variables is deprecated in the driver API", DeprecationWarning, stacklevel=3)
if self.xptxas is not None:
raise ValueError("xptxas option is not supported by the driver API")
if self.split_compile is not None:
Expand Down
73 changes: 32 additions & 41 deletions cuda_core/cuda/core/experimental/_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import List, Optional, Tuple, Union

from cuda.core.experimental._device import Device
from cuda.core.experimental._linker import Linker, LinkerOptions
from cuda.core.experimental._module import ObjectCode
from cuda.core.experimental._utils import (
_handle_boolean_option,
Expand All @@ -31,157 +32,120 @@ class ProgramOptions:
relocatable_device_code : bool, optional
Enable (disable) the generation of relocatable device code.
Default: False
Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``)
extensible_whole_program : bool, optional
Do extensible whole program compilation of device code.
Default: False
Maps to: ``--extensible-whole-program`` (``-ewp``)
debug : bool, optional
Generate debug information. If --dopt is not specified, then turns off all optimizations.
Default: False
Maps to: ``--device-debug`` (``-G``)
lineinfo: bool, optional
Generate line-number information.
Default: False
Maps to: ``--generate-line-info`` (``-lineinfo``)
device_code_optimize : bool, optional
Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation
for optimized device code.
Default: None
Maps to: ``--dopt on`` (``-dopt``)
ptxas_options : Union[str, List[str]], optional
Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
For example ["-v", "-O2"].
Default: None
Maps to: ``--ptxas-options <options>`` (``-Xptxas``)
max_register_count : int, optional
Specify the maximum amount of registers that GPU functions can use.
Default: None
Maps to: ``--maxrregcount=<N>`` (``-maxrregcount``)
ftz : bool, optional
When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
values.
Default: False
Maps to: ``--ftz={true|false}`` (``-ftz``)
prec_sqrt : bool, optional
For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
Default: True
Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``)
prec_div : bool, optional
For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
approximation.
Default: True
Maps to: ``--prec-div={true|false}`` (``-prec-div``)
fma : bool, optional
Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
multiply-add operations.
Default: True
Maps to: ``--fmad={true|false}`` (``-fmad``)
use_fast_math : bool, optional
Make use of fast math operations.
Default: False
Maps to: ``--use_fast_math`` (``-use_fast_math``)
extra_device_vectorization : bool, optional
Enables more aggressive device code vectorization in the NVVM optimizer.
Default: False
Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``)
link_time_optimization : bool, optional
Generate intermediate code for later link-time optimization.
Default: False
Maps to: ``--dlink-time-opt`` (``-dlto``)
gen_opt_lto : bool, optional
Run the optimizer passes before generating the LTO IR.
Default: False
Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``)
define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional
Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of
strings, in which case the first element is defined as the second, or a list of strings or tuples.
Default: None
Maps to: ``--define-macro=<def>`` (``-D``)
undefine_macro : Union[str, List[str]], optional
Cancel any previous definition of a macro, or list of macros.
Default: None
Maps to: ``--undefine-macro=<def>`` (``-U``)
include_path : Union[str, List[str]], optional
Add the directory or directories to the list of directories to be searched for headers.
Default: None
Maps to: ``--include-path=<dir>`` (``-I``)
pre_include : Union[str, List[str]], optional
Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
Default: None
Maps to: ``--pre-include=<header>`` (``-include``)
no_source_include : bool, optional
Disable the default behavior of adding the directory of each input source to the include path.
Default: False
Maps to: ``--no-source-include`` (``-no-source-include``)
std : str, optional
Set language dialect to C++03, C++11, C++14, C++17 or C++20.
Default: c++17
Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``)
builtin_move_forward : bool, optional
Provide builtin definitions of std::move and std::forward.
Default: True
Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``)
builtin_initializer_list : bool, optional
Provide builtin definitions of std::initializer_list class and member functions.
Default: True
Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``)
disable_warnings : bool, optional
Inhibit all warning messages.
Default: False
Maps to: ``--disable-warnings`` (``-w``)
restrict : bool, optional
Programmer assertion that all kernel pointer parameters are restrict pointers.
Default: False
Maps to: ``--restrict`` (``-restrict``)
device_as_default_execution_space : bool, optional
Treat entities with no execution space annotation as __device__ entities.
Default: False
Maps to: ``--device-as-default-execution-space`` (``-default-device``)
device_int128 : bool, optional
Allow the __int128 type in device code.
Default: False
Maps to: ``--device-int128`` (``-device-int128``)
optimization_info : str, optional
Provide optimization reports for the specified kind of optimization.
Default: None
Maps to: ``--optimization-info=<kind>`` (``-opt-info``)
no_display_error_number : bool, optional
Disable the display of a diagnostic number for warning messages.
Default: False
Maps to: ``--no-display-error-number`` (``-no-err-no``)
diag_error : Union[int, List[int]], optional
Emit error for a specified diagnostic message number or comma separated list of numbers.
Default: None
Maps to: ``--diag-error=<error-number>, ...`` (``-diag-error``)
diag_suppress : Union[int, List[int]], optional
Suppress a specified diagnostic message number or comma separated list of numbers.
Default: None
Maps to: ``--diag-suppress=<error-number>,…`` (``-diag-suppress``)
diag_warn : Union[int, List[int]], optional
Emit warning for a specified diagnostic message number or comma separated lis of numbers.
Default: None
Maps to: ``--diag-warn=<error-number>,…`` (``-diag-warn``)
brief_diagnostics : bool, optional
Disable or enable showing source line and column info in a diagnostic.
Default: False
Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``)
time : str, optional
Generate a CSV table with the time taken by each compilation phase.
Default: None
Maps to: ``--time=<file-name>`` (``-time``)
split_compile : int, optional
Perform compiler optimizations in parallel.
Default: 1
Maps to: ``--split-compile= <number of threads>`` (``-split-compile``)
fdevice_syntax_only : bool, optional
Ends device compilation after front-end syntax checking.
Default: False
Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``)
minimal : bool, optional
Omit certain language features to reduce compile time for small programs.
Default: False
Maps to: ``--minimal`` (``-minimal``)
"""

arch: Optional[str] = None
Expand Down Expand Up @@ -385,7 +349,7 @@ class Program:
"""

class _MembersNeededForFinalize:
__slots__ = ("handle",)
__slots__ = "handle"

def __init__(self, program_obj, handle):
self.handle = handle
Expand All @@ -396,28 +360,52 @@ def close(self):
handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
self.handle = None

__slots__ = ("__weakref__", "_mnff", "_backend", "_options")
_supported_code_type = ("c++",)
__slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
_supported_code_type = ("c++", "ptx")
Comment thread
keenan-simpson marked this conversation as resolved.
_supported_target_type = ("ptx", "cubin", "ltoir")

def __init__(self, code, code_type, options: ProgramOptions = None):
self._mnff = Program._MembersNeededForFinalize(self, None)

self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
code_type = code_type.lower()

if code_type not in self._supported_code_type:
raise NotImplementedError

if code_type.lower() == "c++":
if code_type == "c++":
if not isinstance(code, str):
raise TypeError
# TODO: support pre-loaded headers & include names
# TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
self._backend = "nvrtc"

Comment thread
leofang marked this conversation as resolved.
elif code_type == "ptx":
if not isinstance(code, str):
raise TypeError
Comment thread
keenan-simpson marked this conversation as resolved.
Outdated
self._linker = Linker(
ObjectCode(code.encode(), code_type), options=self._translate_program_options(options)
)
self._backend = "linker"
else:
raise NotImplementedError

def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
return LinkerOptions(
Comment thread
keenan-simpson marked this conversation as resolved.
arch=options.arch,
max_register_count=options.max_register_count,
time=options.time,
debug=options.debug,
lineinfo=options.lineinfo,
ftz=options.ftz,
prec_div=options.prec_div,
prec_sqrt=options.prec_sqrt,
fma=options.fma,
link_time_optimization=options.link_time_optimization,
Comment thread
keenan-simpson marked this conversation as resolved.
split_compile=options.split_compile,
)

def close(self):
"""Destroy this program."""
self._mnff.close()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self._mnff.close()
self._mnff.close()
if self._linker:
self._linker.close()

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there an advantage to explicitly closing the Linker instance from the Program finalizer rather than letting Program decrement the Linker instance refcount and relying on the established logic in Linker? My instincts tell me to rely on the python runtime garbage collection scheduling for PyObjects.

I've changed the mnff handle attribute name to nvrtc_handle to be more explicit, since before it felt implied that you were referencing a handle that was inherent to the Program instance, rather than a backend dependent handle. Let me know what you think. Functionally there is no difference

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The semantics of close() is: Destruct every resource that this object owns immediately, and consider this object no longer usable after close() returns. It does not rely on the gc behavior, which is the last resort (gc can defer the destruction at an arbitrarily later time). We need this guarantee for certain use cases.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a good catch about Program.handle. I think it should return either nvrtc_handle or linker.handle (which is either nvjitlink handle or cuLink handle). For clarity on the semantics of the handle, we should make Linker.backend queryable (it is not today), so that Program.backend returns Linker.backend if it's using the linker under the hood. Then we teach users about "check which backend is in use to determine how to interpret the handle". WDYT?

We should fix this asap but perhaps in a separate PR. Would you mind creating a new issue to track this?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I follow your point for the close() method. I will implement it as you suggest.

For the handles, I'm a bit torn. It makes sense that we would return a handle / backend of equivalent level of abstraction to that of nvrtc ( return a handle, or the binding level backend ), however it also feels like we are breaking the Linker's abstraction by doing so. An nvrtc backed Program's handle is an ~nvrtcProgramHandle, while a linker backed Program doesn't really have a handle. It has a linker backend which has a culinkHandle or nvjitLinkHandle which is a handle to the linker rather than a program. Tradeoff between handle semantic consistency and abstraction consistency.

The same tradeoff feels relevant to the return value of Program.backend (either Linker, or Linker.backend).

The only thing I feel sure of, is that we should put this in a new issue and discuss it on a call. In the meantime I will revert the handle name change I made so this review as isolated as possible from that discussion. #433 is the issue

Expand Down Expand Up @@ -481,6 +469,9 @@ def compile(self, target_type, name_expressions=(), logs=None):

return ObjectCode(data, target_type, symbol_mapping=symbol_mapping)

if self._backend == "linker":
return self._linker.link(target_type)

@property
def backend(self):
"""Return the backend type string associated with this program."""
Expand Down
Loading