Skip to content

Commit 2d0687a

Browse files
author
Henry Linjamäki
committed
* Add command buffer counterparts for tensor translation commands
* Add error codes for tensor translation commands. * Tweaked mem_pitch semantics.
1 parent 274e76e commit 2d0687a

2 files changed

Lines changed: 319 additions & 35 deletions

File tree

ext/cl_exp_tensor.asciidoc

Lines changed: 132 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,9 @@ cl_int clEnqueueTranslateToTensor(
347347

348348
* _mem_pitch_ defines the length of each dimension in elements to be
349349
used for the memory region of _buffer_ or _host_ptr_. The length of
350-
the array must be at least the rank of _tensor_ minus one.
350+
the array must be at least the rank of _tensor_ minus one. if
351+
_mem_pitch_ is NULL or _mem_pitch_[i] is zero, _mem_pitch_[i] is
352+
computed as _region_[i + 1].
351353

352354
* _buffer_ and _host_ptr_ refer to a valid buffer object / host
353355
allocation where data is to be read into or to be written from.
@@ -408,7 +410,8 @@ follows in pseudo C code:
408410
size_t pitch(size_t dim) {
409411
size_t pitch = 1;
410412
for (size_t i = dim; i < tensor_rank - 1; i++)
411-
pitch *= mem_pitch != NULL ? mem_pitch[i] : region[i + 1];
413+
pitch *=
414+
(mem_pitch != NULL || mem_pitch[i] == 0) ? mem_pitch[i] : region[i + 1];
412415
return pitch;
413416
}
414417
----
@@ -418,11 +421,131 @@ an abstract function that accesses a tensor element in its storage at
418421
given coordinate. The method how the coordinates translate to tensor
419422
storage addresses is unspecified.
420423

424+
*clEnqueueTranslateFsomTensor* and *clEnqueueTranslateToTensor*
425+
returns CL_SUCCESS if the function is executed
426+
successfully. Otherwise, it returns one of the following errors:
427+
428+
* CL_INVALID_COMMAND_QUEUE if _command_queue_ is not a valid host
429+
command-queue.
430+
431+
* CL_INVALID_CONTEXT if the context associated with _command_queue_
432+
and buffer are not the same or if the context associated with
433+
_command_queue_ and events in _event_wait_list_ are not the same.
434+
435+
* CL_INVALID_MEM_OBJECT if _buffer_ is not a valid buffer object.
436+
437+
* CL_INVALID_VALUE if _tensor_origin_ or _mem_origin_ is NULL.
438+
439+
* CL_INVALID_VALUE if the region being read or written specified by
440+
(_mem_origin_, _region_, _mem_pitch_) is out of bounds.
441+
442+
* CL_INVALID_VALUE if any _region_ array element is 0.
443+
444+
* CL_INVALID_VALUE if _mem_pitch_ is not NULL and _mem_pitch_[i] is
445+
not 0 and _mem_pitch_[i] is less than _region_[i].
446+
447+
* CL_INVALID_VALUE if _buffer_ and _host_ptr_ both are NULL or non-NULL.
448+
449+
* CL_INVALID_EVENT_WAIT_LIST if _event_wait_list_ is NULL and
450+
_num_events_in_wait_list_ > 0, or _event_wait_list_ is not NULL and
451+
_num_events_in_wait_list_ is 0, or if event objects in
452+
_event_wait_list_ are not valid events.
453+
454+
* CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST if the read and write
455+
operations are blocking and the execution status of any of the
456+
events in _event_wait_list_ is a negative integer value.
457+
458+
* CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate
459+
memory for data store associated with memory object the _tensor_ is
460+
bound to.
461+
462+
* CL_OUT_OF_RESOURCES if there is a failure to allocate resources
463+
required by the OpenCL implementation on the device.
464+
465+
* CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources
466+
required by the OpenCL implementation on the host.
467+
421468
// TODO: add clEnqueueCopyTensor
422469

423470
// TODO: add clEnqueueFillTensor?
424471

425-
TODO: add command buffer variants for clEnqueue*Tensor.
472+
If *cl_khr_command_buffer* is is supported, then the following command
473+
buffer counterparts of the *clEnqueueTranslateFromTensor* and
474+
*clEnqueueTranslateToTensor* commands are available.
475+
476+
[source,c]
477+
----
478+
cl_int clCommandTranslateFromTensorKHR(
479+
cl_command_buffer_khr command_buffer,
480+
cl_command_queue command_queue,
481+
cl_tensor tensor,
482+
const size_t* tensor_origin,
483+
const size_t* mem_origin,
484+
const size_t* region,
485+
const size_t* mem_pitch,
486+
cl_mem buffer,
487+
void* host_ptr,
488+
cl_uint num_sync_points_in_wait_list,
489+
const cl_sync_point_khr* sync_point_wait_list,
490+
cl_sync_point_khr* sync_point,
491+
cl_mutable_command_khr* mutable_handle);
492+
----
493+
494+
[source,c]
495+
----
496+
cl_int clCommandTranslateToTensorKHR(
497+
cl_command_buffer_khr command_buffer,
498+
cl_command_queue command_queue,
499+
cl_tensor tensor,
500+
const size_t* tensor_origin,
501+
const size_t* mem_origin,
502+
const size_t* region,
503+
const size_t* mem_pitch,
504+
cl_mem buffer,
505+
const void* host_ptr,
506+
cl_uint num_sync_points_in_wait_list,
507+
const cl_sync_point_khr* sync_point_wait_list,
508+
cl_sync_point_khr* sync_point,
509+
cl_mutable_command_khr* mutable_handle);
510+
----
511+
512+
* _command_buffer_ refers to valid command-buffer object.
513+
514+
* For _command_queue_, _tensor_, _tensor_origin_, _mem_origin_,
515+
_region_, _mem_pitch_, _buffer_ and _host_ptr_ parameters refer to
516+
*clEnqueueTranslateFromTensor*.
517+
518+
* For _num_sync_points_in_wait_list_, _sync_point_wait_list_,
519+
_sync_point_, _mutable_handle_ parameters refer to
520+
*clCommandCopyBufferKHR*.
521+
522+
*clCommandTranslateFromTensorKHR* and *clCommandTranslateFromTensorKHR*
523+
returns CL_SUCCESS if the function is executed
524+
successfully. Otherwise, it returns one of the following errors:
525+
526+
* CL_INVALID_COMMAND_QUEUE if _command_queue_ is not NULL.
527+
528+
* CL_INVALID_COMMAND_BUFFER_KHR if _command_buffer_ is not a valid
529+
command-buffer.
530+
531+
* CL_INVALID_CONTEXT if the context associated with _command_queue_
532+
and _command_buffer_ is not the same.
533+
534+
* CL_INVALID_OPERATION if _command_buffer_ has been finalized.
535+
536+
* CL_INVALID_VALUE if _mutable_handle_ is not NULL.
537+
538+
* CL_INVALID_SYNC_POINT_WAIT_LIST_KHR if _sync_point_wait_list_ is
539+
NULL and _num_sync_points_in_wait_list_ is > 0, or
540+
_sync_point_wait_list_ is not NULL and _num_sync_points_in_wait_list_ is
541+
0, or if synchronization-point objects in _sync_point_wait_list_ are
542+
not valid synchronization-points.
543+
544+
* CL_OUT_OF_RESOURCES if there is a failure to allocate resources
545+
required by the OpenCL implementation on the device.
546+
547+
* CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources
548+
required by the OpenCL implementation on the host.
426549

427550
==== Add New Buffer Property in Section 5.2.1
428551

@@ -580,17 +703,17 @@ std::vector<float> out_data(b * m * n);
580703
// optimal data layout.
581704
clEnqueueTranslateToTensor(
582705
cmd_q, in0, false, {0, 0, 0}, {0, 0, 0}, {b, m, k},
583-
nullptr, nullptr, nullptr, in0_data.data(), 0, nullptr, nullptr);
706+
nullptr, nullptr, in0_data.data(), 0, nullptr, nullptr);
584707
clEnqueueTranslateToTensor(
585708
cmd_q, in1, false, {0, 0, 0}, {0, 0, 0}, {b, k, n},
586-
nullptr, nullptr, nullptr, in1_data.data(), 0, nullptr, nullptr);
709+
nullptr, nullptr, in1_data.data(), 0, nullptr, nullptr);
587710
clEnqueueNDRangeKernel(
588711
cmd_q, matmul_kernel, 3, matmul_grid, nullptr, nullptr, 0, nullptr, nullptr);
589712
clEnqueueNDRangeKernel(
590713
cmd_q, add_kernel, 3, add_grid, nullptr, nullptr, 0, nullptr, nullptr);
591714
clEnqueueTranslateFromTensor(
592715
cmd_q, out, false, {0, 0, 0}, {0, 0, 0}, {b, m, n},
593-
nullptr, nullptr, nullptr, out_data.data(), 0, nullptr, nullptr);
716+
nullptr, nullptr, out_data.data(), 0, nullptr, nullptr);
594717
----
595718

596719
An example use of tensors in a command buffer when cl_khr_command_buffer
@@ -642,10 +765,10 @@ cl_command_buffer_khr cb =
642765
cl_sync_point_khr in0_syncp, in1_syncp, matmul_syncp, add_syncp;
643766
clCommandTranslateToTensorKHR(
644767
cmd_b, cmd_q, in0, {0, 0, 0}, {0, 0, 0}, {b, m, k},
645-
nullptr, nullptr, nullptr, in0_data.data(), 0, nullptr, &in0_syncp);
768+
nullptr, nullptr, in0_data.data(), 0, nullptr, &in0_syncp);
646769
clCommandTranslateToTensorKHR(
647770
cmd_b, cmd_q, in1, {0, 0, 0}, {0, 0, 0}, {b, k, m},
648-
nullptr, nullptr, nullptr, in1_data.data(), 0, nullptr, &in1_syncp);
771+
nullptr, nullptr, in1_data.data(), 0, nullptr, &in1_syncp);
649772
clCommandNDRangeKernelKHR(
650773
cmd_b, cmd_q, nullptr, matmul_kernel, 3, matmul_grid, nullptr, nullptr,
651774
2, {in0_syncp, in2_syncp}, &matmul_syncp, nullptr);
@@ -654,7 +777,7 @@ clCommandNDRangeKernelKHR(
654777
1, {matmul_syncp}, &add_syncp, nullptr);
655778
clCommandTranslateFromTensorKHR(
656779
cmd_b, cmd_q, out, {0, 0, 0}, {0, 0, 0}, {b, k, m},
657-
nullptr, nullptr, nullptr, out_data.data(), 1, {add_syncp}, nullptr);
780+
nullptr, nullptr, out_data.data(), 1, {add_syncp}, nullptr);
658781
659782
// Finalize the command buffer. At this point the OpenCL
660783
// implementation may reserve enough storage for all the tensor

0 commit comments

Comments
 (0)