forked from AnswerDotAI/gpu.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun.cpp
More file actions
446 lines (426 loc) · 18 KB
/
run.cpp
File metadata and controls
446 lines (426 loc) · 18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
#include <array>
#include <cassert>  // assert() on adapter/device request completion
#include <chrono>   // std::chrono::seconds in the polling wait loops
#include <cstdlib>  // exit() in check()
#include <cstring>  // memcpy() in the readback map callback
#include <future>
#include <string>   // std::string in show()

#include "spdlog/spdlog.h"
#include "webgpu/webgpu.h"
/*
 * Approximate GELU kernel definition, implemented as WGSL.
 * In general GPU device code for WEBGPU is written in the WGSL domain specific
 * language.
 *
 * Here inp and out correspond to bindings 0 and 1 respectively. In the main
 * code, we create buffers for these bindings and populate them with data.
 *
 * NOTE(review): the bounds check `i < 3072` is hard-coded and must stay in
 * sync with `N` in main() — if N changes, this literal must change too.
 * The workgroup size (256) must likewise match kWorkgroupSize in main().
 */
const char *kShaderGELU = R"(
const GELU_SCALING_FACTOR: f32 = 0.7978845608028654; // sqrt(2.0 / PI)
@group(0) @binding(0) var<storage, read_write> inp: array<f32>;
@group(0) @binding(1) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(
@builtin(global_invocation_id) GlobalInvocationID: vec3<u32>) {
let i: u32 = GlobalInvocationID.x;
// Ensure we do not access out of bounds
if (i < 3072) {
let x: f32 = inp[i];
let cube: f32 = 0.044715 * x * x * x;
out[i] = 0.5 * x * (1.0 + tanh(GELU_SCALING_FACTOR * (x + cube)));
}
}
)";
/*
 * Convenience function to check if a condition is true; if not, log an error
 * message and terminate the process with exit status 1. On success a trace
 * message is logged instead.
 *
 * @param condition: The condition to check.
 * @param message: The error message to log if the condition is false.
 * @param file: The file where the check occurs (defaults to "unknown").
 * @param line: The line where the check occurs (-1 when not supplied).
 */
inline void check(bool condition, const char *message,
                  const char *file = "unknown", int line = -1) {
  if (!condition) {
    spdlog::error("Error in file {} line {}:\n{}", file, line, message);
    // exit() skips destructors of live automatic objects; acceptable in this
    // example, where failure is unrecoverable anyway.
    exit(1);
  } else {
    spdlog::trace("Success in file {} line {}:\n{}", file, line, message);
  }
}
/*
 * Convenience function to display the first few elements of an array. A more
 * robust/extensive version of this is in array_utils.h; this is minimal to
 * keep this example self-contained.
 *
 * The array and name are taken by const reference — the arrays in this
 * example are 3072 floats (12 KB), which the original signature copied on
 * every call.
 *
 * @param a: The array to show.
 * @param name: The name of the array (header line is omitted when empty).
 * @return: A string representation of the array: a blank-line header,
 *          optionally "name (N) :", then one element per line. Arrays longer
 *          than 12 elements are truncated with a trailing "..." line.
 */
template <typename numtype, size_t N>
std::string show(const std::array<numtype, N> &a, const std::string &name) {
  std::string output = "\n\n";
  if (!name.empty()) {
    output += name + " (" + std::to_string(N) + ") : \n";
  }
  for (size_t i = 0; i < N; i++) {
    output += std::to_string(a[i]) + "\n";
    // Truncate long arrays: indices 0..11 are printed, then an ellipsis.
    if (i > 10) {
      output += "...\n";
      break;
    }
  }
  return output;
}
int main() {
  // Number of elements processed. NOTE(review): must match the hard-coded
  // bounds check `i < 3072` inside kShaderGELU — keep the two in sync.
  static constexpr size_t N = 3072;
  // Host data - input and output arrays on the CPU
  std::array<float, N> inputArr;
  std::array<float, N> outputArr;
  for (size_t i = 0; i < N; i++) {
    // Populate input array with a range of dummy values
    inputArr[i] = static_cast<float>(i);
  }
  // API representations for interfacing with the GPU
  WGPUInstance instance; // The instance is the top-level context object for
                         // WebGPU. It is used to create adapters.
  WGPUAdapter adapter;   // The adapter is the physical device that WebGPU uses
                         // to interface with the GPU.
  WGPUDevice device;     // The device is the logical device that WebGPU uses to
                         // interface with the adapter.
  WGPUQueue queue;       // The queue is used to submit work to the GPU.
  // Buffers - buffers are used to store data on the GPU.
  WGPUBuffer inputBuffer;  // The input buffer is used to store the input data.
  WGPUBuffer outputBuffer; // The output buffer is used to store the output
                           // data.
  WGPUBuffer readbackBuffer; // The readback buffer is used to copy the output
                             // data from the GPU back to the CPU.
  WGPUCommandBuffer commandBuffer; // The command buffer is used to store the
                                   // sequence of operations to be executed on
                                   // the GPU.
  // Async management - polling the GPU is asynchronous, so we need to manage
  // the async work.
  std::promise<void> promise; // used to signal when the work is done.
  std::future<void> future;   // used to wait for the work to be done.
  // Here we initialize the instance, adapter, device, and queue.
  spdlog::info("Setting up GPU Context");
  {
    const WGPUInstanceDescriptor desc = {};
    WGPURequestAdapterOptions adapterOpts = {};
    WGPUDeviceDescriptor devDescriptor = {};
    spdlog::info("Creating instance");
    {
      instance = wgpuCreateInstance(&desc);
      check(instance, "Initialize WebGPU", __FILE__, __LINE__);
    }
    spdlog::info("Requesting adapter");
    {
      // Userdata bundle so the capture-less C-style callback can hand the
      // adapter back to this scope.
      struct AdapterData {
        WGPUAdapter adapter = nullptr;
        bool requestEnded = false;
      };
      AdapterData adapterData;
      auto onAdapterRequestEnded = [](WGPURequestAdapterStatus status,
                                      WGPUAdapter adapter, char const *message,
                                      void *pUserData) {
        AdapterData &adapterData = *reinterpret_cast<AdapterData *>(pUserData);
        check(status == WGPURequestAdapterStatus_Success,
              "Request WebGPU adapter", __FILE__, __LINE__);
        adapterData.adapter = adapter;
        adapterData.requestEnded = true;
      };
      wgpuInstanceRequestAdapter(instance, &adapterOpts, onAdapterRequestEnded,
                                 (void *)&adapterData);
      // NOTE(review): assumes the request callback fires synchronously before
      // wgpuInstanceRequestAdapter returns — confirm this holds for the
      // WebGPU backend in use; no event processing happens before the assert.
      assert(adapterData.requestEnded);
      adapter = adapterData.adapter;
      check(adapter, "Get WebGPU adapter", __FILE__, __LINE__);
    }
    spdlog::info("Requesting device");
    {
      // Same userdata pattern as the adapter request above.
      struct DeviceData {
        WGPUDevice device = nullptr;
        bool requestEnded = false;
      };
      DeviceData devData;
      auto onDeviceRequestEnded = [](WGPURequestDeviceStatus status,
                                     WGPUDevice device, char const *message,
                                     void *pUserData) {
        DeviceData &devData = *reinterpret_cast<DeviceData *>(pUserData);
        check(status == WGPURequestDeviceStatus_Success,
              "Could not get WebGPU device.", __FILE__, __LINE__);
        spdlog::info("Device Request succeeded {}",
                     static_cast<void *>(device));
        devData.device = device;
        devData.requestEnded = true;
      };
      // Log device loss (e.g. driver reset) rather than failing silently.
      devDescriptor.deviceLostCallback =
          [](WGPUDeviceLostReason reason, char const *message, void *userdata) {
            spdlog::error("Device lost:\n{}", message);
          };
      wgpuAdapterRequestDevice(adapter, &devDescriptor, onDeviceRequestEnded,
                               (void *)&devData);
      // NOTE(review): same synchronous-callback assumption as above.
      assert(devData.requestEnded);
      device = devData.device;
      spdlog::info("Setting error callback");
      wgpuDeviceSetUncapturedErrorCallback(
          device,
          [](WGPUErrorType type, char const *message, void *devData) {
            spdlog::error("Device uncaptured error: {}", message);
          },
          nullptr);
      wgpuDeviceSetLoggingCallback(
          device,
          [](WGPULoggingType level, const char *message, void *userdata) {
            spdlog::info("WebGPU Validation: {}", message);
          },
          NULL);
    }
    // Queue
    spdlog::info("Instantiating device queue");
    queue = wgpuDeviceGetQueue(device);
  }
  // Here we setup the binding group layout. The binding group layout is used to
  // define the layout of the bind group - e.g. how many buffers are going to be
  // used and what their sizes are.
  //
  // The general pattern of using the WebGPU API is to populate a configuration
  // using a descriptor type (*Descriptor), and then pass the descriptor to a
  // factory function (*Create*) operation which returns a handle to the
  // object. Sometimes the descriptors can be hierarchical and nested, but
  // ultimately they are still just an elaborate set of configuration
  // parameters.
  //
  // For example, here we populate a WGPUBindGroupLayoutDescriptor and then
  // pass that to the wgpuDeviceCreateBindGroupLayout() function to get back a
  // WGPUBindGroupLayout.
  spdlog::info("Setting up binding group layout");
  WGPUBindGroupLayout bgLayout;
  // constexpr so the capture-less lambdas further down may use it without
  // capturing it.
  static constexpr uint32_t bufferSize =
      static_cast<uint32_t>(sizeof(float) * N);
  spdlog::info("Buffer size: {}, number of elements {}", bufferSize, N);
  {
    // Two storage-buffer entries, matching @binding(0)/@binding(1) in the
    // WGSL shader.
    WGPUBindGroupLayoutEntry bgLayoutEntries[2];
    bgLayoutEntries[0] = (WGPUBindGroupLayoutEntry){
        .binding = 0,
        .visibility = WGPUShaderStage_Compute,
        .buffer =
            (WGPUBufferBindingLayout){
                .type = WGPUBufferBindingType_Storage,
                .minBindingSize = bufferSize,
            },
    };
    bgLayoutEntries[1] = (WGPUBindGroupLayoutEntry){
        .binding = 1,
        .visibility = WGPUShaderStage_Compute,
        .buffer =
            (WGPUBufferBindingLayout){
                .type = WGPUBufferBindingType_Storage,
                .minBindingSize = bufferSize,
            },
    };
    spdlog::info("Creating Binding Group Layout Description");
    WGPUBindGroupLayoutDescriptor bgLayoutDesc = {
        .entryCount = std::size(bgLayoutEntries),
        .entries = bgLayoutEntries,
    };
    bgLayout = wgpuDeviceCreateBindGroupLayout(device, &bgLayoutDesc);
  }
  // After setting up the binding group layout we initialize the buffers by
  // interacting with the device.
  spdlog::info("Create buffers: input, output, and readback");
  {
    // Input: written from the CPU (CopyDst), read by the shader (Storage).
    WGPUBufferDescriptor inputBufferDesc = {
        .usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst,
        .size = bufferSize,
    };
    inputBuffer = wgpuDeviceCreateBuffer(device, &inputBufferDesc);
    // Output: written by the shader, then copied out (CopySrc).
    WGPUBufferDescriptor outputBufferDesc = {
        .usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
                 WGPUBufferUsage_CopySrc,
        .size = bufferSize,
    };
    outputBuffer = wgpuDeviceCreateBuffer(device, &outputBufferDesc);
    // Readback: copy target that the CPU can map (MapRead).
    WGPUBufferDescriptor readbackBufferDescriptor = {
        .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
        .size = bufferSize,
    };
    readbackBuffer = wgpuDeviceCreateBuffer(device, &readbackBufferDescriptor);
    check(inputBuffer, "Create input buffer", __FILE__, __LINE__);
    check(outputBuffer, "Create output buffer", __FILE__, __LINE__);
    check(readbackBuffer, "Create readback buffer", __FILE__, __LINE__);
  }
  // We create the bind group with references to the buffers and initialize the
  // binding group. Does this seem redundant with the binding group layout?
  // Probably.
  // The bind group is used to bind the buffers to the compute pipeline.
  // The bind group layout is used to define the layout of the bind group.
  spdlog::info("Create the bind group");
  WGPUBindGroup bindGroup;
  {
    WGPUBindGroupEntry bindGroupEntries[2];
    bindGroupEntries[0] = (WGPUBindGroupEntry){
        .binding = 0,
        .buffer = inputBuffer,
        .offset = 0,
        .size = bufferSize,
    };
    bindGroupEntries[1] = (WGPUBindGroupEntry){
        .binding = 1,
        .buffer = outputBuffer,
        .offset = 0,
        .size = bufferSize,
    };
    WGPUBindGroupDescriptor bindGroupDesc = {
        .layout = bgLayout,
        .entryCount = std::size(bindGroupEntries),
        .entries = bindGroupEntries,
    };
    bindGroup = wgpuDeviceCreateBindGroup(device, &bindGroupDesc);
  }
  // We create the compute pipeline with the shader module and pipeline layout.
  // The compute pipeline is used to run the compute shader.
  spdlog::info("Creating the compute pipeline");
  WGPUComputePipeline computePipeline;
  {
    WGPUPipelineLayout pipelineLayout;
    WGPUPipelineLayoutDescriptor pipelineLayoutDesc = {
        .bindGroupLayoutCount = 1,
        .bindGroupLayouts = &bgLayout,
    };
    pipelineLayout =
        wgpuDeviceCreatePipelineLayout(device, &pipelineLayoutDesc);
    // WGSL source is attached via the chained-struct extension mechanism:
    // the WGSL descriptor hangs off shaderModuleDesc.nextInChain.
    WGPUShaderModuleWGSLDescriptor wgslDesc = {
        .code = kShaderGELU,
    };
    wgslDesc.chain.sType = WGPUSType_ShaderModuleWGSLDescriptor;
    WGPUShaderModuleDescriptor shaderModuleDesc = {};
    shaderModuleDesc.nextInChain = &wgslDesc.chain;
    shaderModuleDesc.label = "shader";
    WGPUComputePipelineDescriptor computePipelineDesc = {};
    computePipelineDesc.layout = pipelineLayout;
    computePipelineDesc.compute.module =
        wgpuDeviceCreateShaderModule(device, &shaderModuleDesc);
    // Entry point name must match `fn main` in the WGSL source.
    computePipelineDesc.compute.entryPoint = "main";
    computePipeline =
        wgpuDeviceCreateComputePipeline(device, &computePipelineDesc);
    check(computePipeline, "Create compute pipeline", __FILE__, __LINE__);
  }
  // We create the command encoder and the compute pass encoder. The command
  // encoder is used to encode commands for the GPU. The compute pass encoder is
  // used to encode commands for the compute pipeline.
  spdlog::info("Create the command encoder");
  {
    static constexpr uint32_t kWorkgroupSize = 256; // This needs to match the
                                                    // workgroup size in the
                                                    // shader.
    WGPUCommandEncoder commandEncoder;
    WGPUComputePassEncoder computePassEncoder;
    commandEncoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
    computePassEncoder =
        wgpuCommandEncoderBeginComputePass(commandEncoder, nullptr);
    wgpuComputePassEncoderSetPipeline(computePassEncoder, computePipeline);
    wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, bindGroup, 0,
                                       nullptr);
    // Ceiling division so every element is covered by some workgroup.
    wgpuComputePassEncoderDispatchWorkgroups(
        computePassEncoder, (N + (kWorkgroupSize - 1)) / kWorkgroupSize, 1, 1);
    wgpuComputePassEncoderEnd(computePassEncoder);
    // NOTE(review): this copy to the readback buffer is encoded again in the
    // separate readback command buffer below — the second copy appears
    // redundant (though harmless).
    wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, outputBuffer, 0,
                                         readbackBuffer, 0, bufferSize);
    commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
    check(commandBuffer, "Create command buffer", __FILE__, __LINE__);
  }
  spdlog::info("Initializing promise and future");
  promise = std::promise<void>();
  future = promise.get_future();
  spdlog::info("Copying input data to GPU");
  wgpuQueueWriteBuffer(queue, inputBuffer, 0, inputArr.data(), bufferSize);
  // Submit the command buffer and launch the kernel. The command buffer is
  // submitted to the queue and a callback is set up to handle the completion of
  // the job which updates the promise. A while loop is used to wait for the
  // promise to be set.
  spdlog::info("Submit the command buffer and launching the kernel");
  // Userdata bundle passed (as void*) to the C-style completion callbacks.
  struct CallbackData {
    WGPUBuffer buffer;            // buffer to map for readback
    size_t bufferSize;            // number of bytes to copy out
    float *output;                // CPU destination (may be nullptr if unused)
    std::promise<void> *promise;  // signalled when the callback completes
  };
  {
    // Submit the command buffer
    wgpuQueueSubmit(queue, 1, &commandBuffer);
    // Only the promise field is used by this first callback; buffer/output
    // are placeholders here. The address of this stack variable escapes into
    // the callback — safe only because the wait loop below keeps this scope
    // alive until the callback has fired.
    CallbackData callbackData =
        CallbackData{readbackBuffer, sizeof(outputArr), nullptr, &promise};
    // Set up the callback for when the work is done
    wgpuQueueOnSubmittedWorkDone(
        queue,
        [](WGPUQueueWorkDoneStatus status, void *callbackData) {
          spdlog::info("QueueOnSubmittedWorkDone status: {}",
                       WGPUQueueWorkDoneStatus_Success == status);
          check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
                __FILE__, __LINE__);
          const auto *data = static_cast<CallbackData *>(callbackData);
          data->promise->set_value();
        },
        &callbackData);
    // Wait for the promise to be set, pumping the event loop so callbacks run.
    while (future.wait_for(std::chrono::seconds(0)) !=
           std::future_status::ready) {
      wgpuInstanceProcessEvents(instance);
    }
  }
  // Copy the output data back to the CPU. This requires its own command encoder
  // and command buffer. As with the computation a job is asynchronously
  // submitted to the queue and a callback is set up to handle the completion
  // of the job which updates the promise.
  //
  // The execution blocks on the future until the promise is set, after which
  // the result of the computation is copied to the outputArr array and is
  // printed.
  spdlog::info("Copying output to the CPU");
  {
    // reset the promise and future
    promise = std::promise<void>();
    future = promise.get_future();
    spdlog::info("Setting up command encoder and command buffer for copying "
                 "output to the CPU");
    {
      WGPUCommandEncoder commandEncoder;
      // NOTE(review): computePassEncoder is declared but never used in this
      // copy-only pass.
      WGPUComputePassEncoder computePassEncoder;
      commandEncoder = wgpuDeviceCreateCommandEncoder(device, nullptr);
      wgpuCommandEncoderCopyBufferToBuffer(commandEncoder, outputBuffer, 0,
                                           readbackBuffer, 0, bufferSize);
      commandBuffer = wgpuCommandEncoderFinish(commandEncoder, nullptr);
      check(commandBuffer, "Create command buffer", __FILE__, __LINE__);
    }
    wgpuQueueSubmit(queue, 1, &commandBuffer);
    // Same stack-escape pattern as above; kept alive by the wait loop below.
    CallbackData callbackData = {readbackBuffer, bufferSize, outputArr.data(),
                                 &promise};
    wgpuQueueOnSubmittedWorkDone(
        queue,
        [](WGPUQueueWorkDoneStatus status, void *callbackData) {
          spdlog::info("QueueOnSubmittedWorkDone status: {}",
                       WGPUQueueWorkDoneStatus_Success == status);
          check(status == WGPUQueueWorkDoneStatus_Success, "Queue work done",
                __FILE__, __LINE__);
          const auto *data = static_cast<CallbackData *>(callbackData);
          // Once the copy has completed, map the readback buffer and copy its
          // contents to the CPU. bufferSize is usable here without capture
          // because it is constexpr.
          wgpuBufferMapAsync(
              data->buffer, WGPUMapMode_Read, 0, bufferSize,
              [](WGPUBufferMapAsyncStatus status, void *captureData) {
                const auto *data = static_cast<CallbackData *>(captureData);
                check(status == WGPUBufferMapAsyncStatus_Success,
                      "Map readbackBuffer", __FILE__, __LINE__);
                const void *mappedData = wgpuBufferGetConstMappedRange(
                    data->buffer, /*offset=*/0, data->bufferSize);
                check(mappedData, "Get mapped range", __FILE__, __LINE__);
                memcpy(data->output, mappedData, data->bufferSize);
                wgpuBufferUnmap(data->buffer);
                // Signal only after the data is safely on the CPU side.
                data->promise->set_value();
              },
              callbackData);
        },
        &callbackData);
    while (future.wait_for(std::chrono::seconds(0)) !=
           std::future_status::ready) {
      wgpuInstanceProcessEvents(instance);
    }
  }
  spdlog::info("{}", show<float, N>(inputArr, "GELU Input"));
  spdlog::info("{}", show<float, N>(outputArr, "GELU Output"));
  // NOTE(review): no wgpu*Release calls are made; all WebGPU handles leak at
  // process exit. Acceptable for a one-shot example, not for library code.
  spdlog::info("Done with GELU kernel");
}