Skip to content

Commit 34bc564

Browse files
authored
Document img 1x2_2x2 matmul functions. (#1283)
* document img 1x2_2x2 matmul functions. * Correct the description of matmul with saturation. * Address review comments.
1 parent 91c4a0d commit 34bc564

1 file changed

Lines changed: 227 additions & 25 deletions

File tree

extensions/cl_img_matrix_multiply.asciidoc

Lines changed: 227 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ Tomasz Platek, Imagination Technologies (Tomasz.Platek 'at' imgtec.com)
2020

2121
CY Cheng, Imagination Technologies. +
2222
Joe Molleson, Imagination Technologies. +
23-
Tomasz Platek, Imagination Technologies.
23+
Tomasz Platek, Imagination Technologies. +
24+
Szabolcs Csefalvay, Imagination Technologies. +
25+
David Welch, Imagination Technologies.
2426

2527
== Notice
2628

@@ -33,7 +35,7 @@ Final Draft
3335
== Version
3436

3537
Built On: {docdate} +
36-
Version: 1.0.0
38+
Version: 1.1.0
3739

3840
== Dependencies
3941

@@ -50,6 +52,7 @@ This extension adds built-in functions that exercise hardware capabilities of Im
5052
[source,c]
5153
----
5254
__opencl_img_dot_interleaved
55+
__opencl_img_matmul_1x2_2x2
5356
__opencl_img_matmul_2x4_4x4
5457
----
5558

@@ -69,7 +72,24 @@ float2 img_dot_interleaved_acc(float4 a,__local float8 * b, float2 acc);
6972
float2 img_dot_interleaved_acc(float8 a,__local float16 * b, float2 acc);
7073
----
7174

72-
Perform the matrix multiplication operation:
75+
Perform the matrix multiplication of a 1x2 matrix `a` with a 2x2 matrix `b`, adding the result to a 1x2 matrix `acc`:
76+
77+
[source,c]
78+
----
79+
float2 img_matmul_float_acc_1x2_2x2(float2 a, __local float4 * b, float2 acc);
80+
float2 img_matmul_half2_acc_1x2_2x2f(half4 a, __local half8 * b, float2 acc);
81+
half2 img_matmul_half2_acc_1x2_2x2h(half4 a, __local half8 * b, half2 acc);
82+
uint2 img_matmul_uchar4_acc_1x2_2x2(uchar8 a, __local uchar16 * b, uint2 acc);
83+
int2 img_matmul_char4_acc_1x2_2x2(char8 a, __local char16 * b, int2 acc);
84+
int2 img_matmul_char4_acc_1x2_2x2(uchar8 a, __local char16 * b, int2 acc);
85+
int2 img_matmul_char4_acc_1x2_2x2(char8 a, __local uchar16 * b, int2 acc);
86+
uint2 img_matmul_uchar4_acc_1x2_2x2_sat(uchar8 a, __local uchar16 * b, uint2 acc);
87+
int2 img_matmul_char4_acc_1x2_2x2_sat(char8 a, __local char16 * b, int2 acc);
88+
int2 img_matmul_char4_acc_1x2_2x2_sat(uchar8 a, __local char16 * b, int2 acc);
89+
int2 img_matmul_char4_acc_1x2_2x2_sat(char8 a, __local uchar16 * b, int2 acc);
90+
----
91+
92+
Perform the matrix multiplication of a 2x4 matrix `a` with a 4x4 matrix `b`, adding the result to a 2x4 matrix `acc`:
7393

7494
[source,c]
7595
----
@@ -95,12 +115,12 @@ half8 img_matmul_acc_2x4_4x4transposedh(half4 a0, half4 a1,__local half16 * b, h
95115
float2 *img_dot_interleaved*(float2 _a_,pass:[__local] float4 * _b_) +
96116
float2 *img_dot_interleaved*(float4 _a_,pass:[__local] float8 * _b_) +
97117
float2 *img_dot_interleaved*(float8 _a_,pass:[__local] float16 * _b_)
98-
a| `img_dot_interleaved` performs the dual dot product operation.
118+
a| `img_dot_interleaved` performs the dual dot product operation.
99119
The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`. The result is stored into the first element of the output vector.
100120
The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector.
101-
121+
102122
For example, given:
103-
123+
104124
----
105125
a = [a0 a1]
106126
b = [b0 b1 b2 b3]
@@ -111,14 +131,17 @@ the output vector is:
111131
----
112132
[res0 res1] = [a0 a1] x [b0 b1]
113133
[b2 b3]
134+
135+
res0 = a0b0 + a1b2
136+
res1 = a0b1 + a1b3
114137
----
115138

116139
Requires that the `__opencl_img_dot_interleaved` feature macro is defined.
117140
| float2 *img_dot_interleaved_acc*(float _a_,pass:[__local] float2 * _b_, float2 _acc_) +
118141
float2 *img_dot_interleaved_acc*(float2 _a_,pass:[__local] float4 * _b_, float2 _acc_) +
119142
float2 *img_dot_interleaved_acc*(float4 _a_,pass:[__local] float8 * _b_, float2 _acc_) +
120143
float2 *img_dot_interleaved_acc*(float8 _a_,pass:[__local] float16 * _b_, float2 _acc_)
121-
a| `img_dot_interleaved_acc` performs the dual dot product operation with the accumulator `acc`.
144+
a| `img_dot_interleaved_acc` performs the dual dot product operation with the accumulator `acc`.
122145
The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`. The result is stored into the first element of the output vector.
123146
The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector.
124147

@@ -135,9 +158,129 @@ the output vector is:
135158
----
136159
[res0 res1] = [a0 a1] x [b0 b1] + [acc0 acc1]
137160
[b2 b3]
161+
162+
res0 = a0b0 + a1b2 + acc0
163+
res1 = a0b1 + a1b3 + acc1
138164
----
139165

140166
Requires that the `__opencl_img_dot_interleaved` feature macro is defined.
167+
| float2 *img_matmul_float_acc_1x2_2x2*(float2 _a_, pass:[__local] float4 * _b_, float2 _acc_)
168+
a| `img_matmul_float_acc_1x2_2x2` performs the dual dot product operation with the accumulator `acc`.
169+
The input vectors of the first dot product are `a` and the vector containing the even-indexed elements of `b`. The result is stored into the first element of the output vector.
170+
The input vectors of the second dot product are `a` and the vector containing the odd-indexed elements of `b`. The result is stored into the second element of the output vector.
171+
172+
For example, given:
173+
----
174+
a = [a0 a1]
175+
b = [b0 b1 b2 b3]
176+
acc = [acc0 acc1]
177+
----
178+
179+
the output vector is:
180+
181+
----
182+
[res0 res1] = [a0 a1] x [b0 b1] + [acc0 acc1]
183+
[b2 b3]
184+
185+
res0 = a0b0 + a1b2 + acc0
186+
res1 = a0b1 + a1b3 + acc1
187+
----
188+
189+
Requires that the `__opencl_img_matmul_1x2_2x2` feature macro is defined.
190+
| float2 *img_matmul_half2_acc_1x2_2x2f*(half4 _a_, pass:[__local] half8 * _b_, float2 _acc_) +
191+
half2 *img_matmul_half2_acc_1x2_2x2h*(half4 _a_, pass:[__local] half8 * _b_, half2 _acc_)
192+
a| `img_matmul_half2_acc_1x2_2x2f` and `img_matmul_half2_acc_1x2_2x2h` perform the dual dot product operation with the accumulator `acc`.
193+
The input vectors of the first dot product are `a` and the vector containing the even-indexed *32-bit elements* of `b`. The result is stored into the first element of the output vector.
194+
The input vectors of the second dot product are `a` and the vector containing the odd-indexed *32-bit elements* of `b`. The result is stored into the second element of the output vector.
195+
196+
For example, given:
197+
----
198+
a = [a0 a1, a2 a3]
199+
b = [b0 b1, b2 b3]
200+
[b4 b5, b6 b7]
201+
acc = [acc0 acc1]
202+
203+
a's memory layout = LSB [a0 a1 a2 a3]
204+
b's memory layout = LSB [b0 b1 b2 b3 b4 b5 b6 b7]
205+
----
206+
207+
the output vector is:
208+
209+
----
210+
[res0 res1] = [a0 a1, a2 a3] x [b0 b1, b2 b3] + [acc0 acc1]
211+
[b4 b5, b6 b7]
212+
213+
res0 = (a0b0 + a1b1) + (a2b4 + a3b5) + acc0
214+
res1 = (a0b2 + a1b3) + (a2b6 + a3b7) + acc1
215+
216+
Note: The parentheses are only used to help the reader see that the dot computation is a [1x2] x [2x2] with half2 elements; they do not indicate the accumulation order.
217+
----
218+
219+
Requires that the `__opencl_img_matmul_1x2_2x2` feature macro is defined.
220+
| uint2 *img_matmul_uchar4_acc_1x2_2x2*(uchar8 _a_, pass:[__local] uchar16 * _b_, uint2 _acc_) +
221+
int2 *img_matmul_char4_acc_1x2_2x2*(char8 _a_, pass:[__local] char16 * _b_, int2 _acc_) +
222+
int2 *img_matmul_char4_acc_1x2_2x2*(uchar8 _a_, pass:[__local] char16 * _b_, int2 _acc_) +
223+
int2 *img_matmul_char4_acc_1x2_2x2*(char8 _a_, pass:[__local] uchar16 * _b_, int2 _acc_)
224+
a| `img_matmul_uchar4_acc_1x2_2x2` and `img_matmul_char4_acc_1x2_2x2` perform the dual dot product operation with the accumulator `acc`.
225+
The input vectors of the first dot product are `a` and the vector containing the even-indexed *32-bit elements* of `b`. The result is stored into the first element of the output vector.
226+
The input vectors of the second dot product are `a` and the vector containing the odd-indexed *32-bit elements* of `b`. The result is stored into the second element of the output vector.
227+
228+
For example, given:
229+
----
230+
a = [a0 a1 a2 a3, a4 a5 a6 a7]
231+
b = [b0 b1 b2 b3, b4 b5 b6 b7]
232+
[b8 b9 b10 b11, b12 b13 b14 b15]
233+
acc = [acc0 acc1]
234+
235+
a's memory layout = LSB [a0 a1 a2 a3 a4 a5 a6 a7]
236+
b's memory layout = LSB [b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15]
237+
----
238+
239+
the output vector is:
240+
241+
----
242+
[res0 res1] = [a0 a1 a2 a3, a4 a5 a6 a7] x [b0 b1 b2 b3, b4 b5 b6 b7] + [acc0 acc1]
243+
[b8 b9 b10 b11, b12 b13 b14 b15]
244+
res0 = (a0b0 + a1b1 + a2b2 + a3b3) + ( a4b8 + a5b9 + a6b10 + a7b11) + acc0
245+
res1 = (a0b4 + a1b5 + a2b6 + a3b7) + (a4b12 + a5b13 + a6b14 + a7b15) + acc1
246+
247+
Note: The parentheses are only used to help the reader see that the dot computation is a [1x2] x [2x2] with char4/uchar4 elements; they do not indicate the accumulation order.
248+
----
249+
250+
Requires that the `__opencl_img_matmul_1x2_2x2` feature macro is defined.
251+
| uint2 *img_matmul_uchar4_acc_1x2_2x2_sat*(uchar8 _a_, pass:[__local] uchar16 * _b_, uint2 _acc_) +
252+
int2 *img_matmul_char4_acc_1x2_2x2_sat*(char8 _a_, pass:[__local] char16 * _b_, int2 _acc_) +
253+
int2 *img_matmul_char4_acc_1x2_2x2_sat*(uchar8 _a_, pass:[__local] char16 * _b_, int2 _acc_) +
254+
int2 *img_matmul_char4_acc_1x2_2x2_sat*(char8 _a_, pass:[__local] uchar16 * _b_, int2 _acc_)
255+
a| `img_matmul_uchar4_acc_1x2_2x2_sat` and `img_matmul_char4_acc_1x2_2x2_sat` perform the dual dot product operation, add the accumulator `acc`, and saturate the result.
256+
The input vectors of the first dot product are `a` and the vector containing the even-indexed *32-bit elements* of `b`. The result is saturated and stored into the first element of the output vector.
257+
The input vectors of the second dot product are `a` and the vector containing the odd-indexed *32-bit elements* of `b`. The result is saturated and stored into the second element of the output vector.
258+
259+
For example, given:
260+
----
261+
a = [a0 a1 a2 a3, a4 a5 a6 a7]
262+
b = [b0 b1 b2 b3, b4 b5 b6 b7]
263+
[b8 b9 b10 b11, b12 b13 b14 b15]
264+
acc = [acc0 acc1]
265+
266+
a's memory layout = LSB [a0 a1 a2 a3 a4 a5 a6 a7]
267+
b's memory layout = LSB [b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15]
268+
----
269+
270+
the output vector is:
271+
----
272+
[res0 res1] = [a0 a1 a2 a3, a4 a5 a6 a7] x [b0 b1 b2 b3, b4 b5 b6 b7] + [acc0 acc1]
273+
[b8 b9 b10 b11, b12 b13 b14 b15]
274+
product0 = (a0b0 + a1b1 + a2b2 + a3b3) + ( a4b8 + a5b9 + a6b10 + a7b11)
275+
res0 = add_sat(product0, acc0)
276+
277+
product1 = (a0b4 + a1b5 + a2b6 + a3b7) + (a4b12 + a5b13 + a6b14 + a7b15)
278+
res1 = add_sat(product1, acc1)
279+
280+
Note: The parentheses are only used to help the reader see that the dot computation is a [1x2] x [2x2] with char4/uchar4 elements; they do not indicate the accumulation order.
281+
----
282+
283+
Requires that the `__opencl_img_matmul_1x2_2x2` feature macro is defined.
141284
| float8 *img_matmul_2x4_4x4f*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_) +
142285
half8 *img_matmul_2x4_4x4h*(half4 _a0_, half4 _a1_,pass:[__local] half16 * _b_)
143286
a| `img_matmul_2x4_4x4f` and `img_matmul_2x4_4x4h` perform the matrix multiplication operation of matrices A and B of dimensions 2x4 and 4x4, where `a0` is the first row and `a1` is the second row of the matrix A.
@@ -158,7 +301,7 @@ the output vector is:
158301

159302
----
160303
[res0 res1 res2 res3] = A x B
161-
[res4 res5 res6 res7]
304+
[res4 res5 res6 res7]
162305
----
163306

164307
Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
@@ -184,7 +327,7 @@ the output vector is:
184327

185328
----
186329
[res0 res1 res2 res3] = A x B + C
187-
[res4 res5 res6 res7]
330+
[res4 res5 res6 res7]
188331
----
189332

190333
Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
@@ -209,7 +352,7 @@ the output vector is:
209352

210353
----
211354
[res0 res1 res2 res3] = A x BT
212-
[res4 res5 res6 res7]
355+
[res4 res5 res6 res7]
213356
----
214357

215358
Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
@@ -228,14 +371,14 @@ BT = [b0 b4 b8 b12]
228371
[b2 b6 b10 b14]
229372
[b3 b7 b11 b15]
230373
C = [acc00 acc01 acc02 acc03]
231-
[acc10 acc11 acc12 acc13]
374+
[acc10 acc11 acc12 acc13]
232375
----
233376

234377
the output vector is:
235378

236379
----
237380
[res0 res1 res2 res3] = A x BT + C
238-
[res4 res5 res6 res7]
381+
[res4 res5 res6 res7]
239382
----
240383

241384
Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
@@ -245,7 +388,7 @@ Requires that the `__opencl_img_matmul_2x4_4x4` feature macro is defined.
245388
== Coding Sample
246389

247390
This coding sample shows how to initialize the input vectors, use the *img_dot_interleaved_acc* function, and access the output vector:
248-
[source]
391+
[source,c]
249392
----
250393
float4 a = (float4) (1.0f, 1.0f, 1.0f, 1.0f);
251394
__local float8 b;
@@ -257,14 +400,80 @@ float2 res = img_dot_interleaved_acc(a, &b, acc);
257400
printf("res = [ %f %f ]\n", res.s0, res.s1);
258401
----
259402

260-
Executing a work-item containing this code gives the following result:
261-
[source]
403+
This coding sample shows how to use the *img_matmul_float_acc_1x2_2x2* function:
404+
[source,c]
405+
----
406+
__attribute__((reqd_work_group_size(128, 1, 1)))
407+
void __kernel f32Matmul(__global float2 *a, __global float4 *b, __global float2 *acc, int step) {
408+
__local float4 cachedB[..];
409+
int id = ..;
410+
// load data from the matrix b which is shared in a workgroup.
411+
// We can let each thread copy the data, or use async_work_group_copy:
412+
// cachedB[id] = ..;
413+
//
414+
// event_t e = async_work_group_copy(cachedB, &b[group_id], .. /* num elements */, 0 /* event */);
415+
// wait_group_events(1, &e);
416+
417+
float2 results = acc[id];
418+
for (int i = 0; i < step; ++i)
419+
results = img_matmul_float_acc_1x2_2x2(a[id + i], &cachedB[i], results);
420+
421+
acc[id] = results;
422+
}
423+
424+
// Note: It is preferable to use a workgroup size of 128 for optimal performance.
425+
----
426+
427+
This coding sample shows how to use the *img_matmul_half2_acc_1x2_2x2h* function:
428+
[source,c]
262429
----
263-
res = [ 1.000000 5.000000 ]
430+
__attribute__((reqd_work_group_size(128, 1, 1)))
431+
void __kernel f16Matmul(__global half4 *a, __global half8 *b, __global half2 *acc, int step) {
432+
__local half8 cachedB[..];
433+
int id = ..;
434+
// load data from the matrix b which is shared in a workgroup.
435+
// We can let each thread copy the data, or use async_work_group_copy:
436+
// cachedB[id] = ..;
437+
//
438+
// event_t e = async_work_group_copy(cachedB, &b[group_id], .. /* num elements */, 0 /* event */);
439+
// wait_group_events(1, &e);
440+
441+
half2 results = acc[id];
442+
for (int i = 0; i < step; ++i)
443+
results = img_matmul_half2_acc_1x2_2x2h(a[id + i], &cachedB[i], results);
444+
445+
acc[id] = results;
446+
}
447+
448+
// Note: It is preferable to use a workgroup size of 128 for optimal performance.
449+
----
450+
451+
This coding sample shows how to use the *img_matmul_char4_acc_1x2_2x2_sat* function:
452+
[source,c]
453+
----
454+
__attribute__((reqd_work_group_size(128, 1, 1)))
455+
void __kernel char4Matmul(__global char8 *a, __global char16 *b, __global int2 *acc, int step) {
456+
__local char16 cachedB[..];
457+
int id = ..;
458+
// load data from the matrix b which is shared in a workgroup.
459+
// We can let each thread copy the data, or use async_work_group_copy:
460+
// cachedB[id] = ..;
461+
//
462+
// event_t e = async_work_group_copy(cachedB, &b[group_id], .. /* num elements */, 0 /* event */);
463+
// wait_group_events(1, &e);
464+
465+
int2 results = acc[id];
466+
for (int i = 0; i < step; ++i)
467+
results = img_matmul_char4_acc_1x2_2x2_sat(a[id + i], &cachedB[i], results);
468+
469+
acc[id] = results;
470+
}
471+
472+
// Note: It is preferable to use a workgroup size of 128 for optimal performance.
264473
----
265474

266475
This coding sample shows how to initialize the input vectors, use the *img_matmul_acc_2x4_4x4f* function, and access the output vector:
267-
[source]
476+
[source,c]
268477
----
269478
half4 a0 = (half4) (1.0h, 0.0h, 0.0h, 0.0h);
270479
half4 a1 = (half4) (0.0h, 1.0h, 0.0h, 0.0h);
@@ -284,13 +493,6 @@ printf("res = [ %f %f %f %f ]\n", res.s0, res.s1, res.s2, res.s3);
284493
printf(" [ %f %f %f %f ]\n", res.s4, res.s5, res.s6, res.s7);
285494
----
286495

287-
Executing a work-item containing this code gives the following result:
288-
[source]
289-
----
290-
res = [ 1.000000 2.000000 3.000000 4.000000 ]
291-
[ 5.000000 6.000000 7.000000 8.000000 ]
292-
----
293-
294496
== Version History
295497

296498
[cols="5,15,15,70"]
@@ -299,5 +501,5 @@ res = [ 1.000000 2.000000 3.000000 4.000000 ]
299501
|====
300502
| Version | Date | Author | Changes
301503
| 1.0.0 | 2024-06-07 | Tomasz Platek | *Initial revision*
504+
| 1.1.0 | 2024-11-11 | CY Cheng | Document 1x2_2x2 matrix functions
302505
|====
303-

0 commit comments

Comments
 (0)