Naive Matmul Implementation¶
This demonstrates a basic matmul implementation using Python and vulkan-kompute. Many thanks to the authors of the very helpful SGEMM in WebGL2-compute article on the public library ibiblio.org.
To test the implementation, simply run the matmul.py script:
python matmul.py
Implementation Overview¶
The benchmark can be found in the benchmark.py file in the repo, which is outlined below. This file runs each of the three matrix multiplication implementations (naive, tiled, and better tiling) and evaluates the performance of each.
import time
import kp
import numpy as np
from imp1_naive import MatMulOp as MatMulOp1
from imp2_tiled import MatMulOp as MatMulOp2
from imp3_better_tiling import MatMulOp as MatMulOp3
def main():
    """Benchmark the three matmul implementations on square matrices.

    For each (matrix size, repetition count) pair this builds two
    upper-triangular matrices of ones, computes a reference product, then
    times every ``MatMulOp`` implementation over the requested number of
    repetitions. It prints the measured throughput when the output matches
    the reference, and a failure message with the wrong output otherwise.
    """
    mgr = kp.Manager()

    for tensor_size, experiment_count in [(512, 1000), (4096, 5)]:
        tensor_shape = [tensor_size, tensor_size]
        mat_1 = np.triu(np.ones(tensor_shape))
        mat_2 = np.triu(np.ones(tensor_shape))
        tensor_in_1 = mgr.tensor(mat_1)
        tensor_in_2 = mgr.tensor(mat_2)
        tensor_out = mgr.tensor(np.zeros(tensor_shape))

        if tensor_size <= 512:
            mat_result = mat_1 @ mat_2
        else:
            # CPU is too slow for big sizes, so the naive GPU implementation
            # provides the reference result instead.
            MatMulOp1(mgr)(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
            # Copy the result: tensor_out.data() is overwritten in place
            # before every benchmark below, and reshape() presumably returns
            # a view of that same buffer. Without the copy the reference
            # would be zeroed and then always compare equal to the output.
            mat_result = tensor_out.data().reshape(tensor_shape).copy()

        print(f'{tensor_shape} input tensors:\n'
              f'{mat_1}\n'
              f'{mat_2}\n')
        print(f'Output :\n{mat_result}')

        for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
            # Reset the output on the host and push it to the device so a
            # previous implementation's result cannot make this one pass.
            # The sequence must be evaluated; recording alone runs nothing.
            tensor_out.data()[:] = 0
            mgr.sequence().record(kp.OpTensorSyncDevice([tensor_out])).eval()

            matmul_op = MatMulOp(mgr)
            # One untimed call before the measurement (presumably to keep
            # one-time setup cost out of the timing -- confirm).
            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)

            # perf_counter() is monotonic and high resolution, unlike
            # time.time(), which can jump when the wall clock is adjusted.
            start_time = time.perf_counter()
            for _ in range(experiment_count):
                matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
            end_time = time.perf_counter()
            experiment_time = end_time - start_time

            # Each output element needs k multiplications and k - 1
            # additions, i.e. 2k - 1 floating point operations.
            op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)

            if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
                print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
                      f'{experiment_time * 1000:0.2f}ms => '
                      f'{experiment_count / experiment_time:0.2f}op/s or '
                      f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
            else:
                print(f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}')
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()