Naive Matmul Implementation

This demonstrates a basic matmul implementation using Python and vulkan-kompute. Many thanks to the authors of the very helpful "SGEMM in WebGL2-compute" article hosted on the public library ibiblio.org.

To test the implementation, simply run the matmul.py script:

python matmul.py

Implementation Overview

The benchmark can be found in the benchmark.py file in the repo, which is outlined below. This file runs each of the three matrix multiplication implementations (naive, tiled, and better tiling) on the same inputs to evaluate the performance of each.

import time

import kp
import numpy as np
from imp1_naive import MatMulOp as MatMulOp1
from imp2_tiled import MatMulOp as MatMulOp2
from imp3_better_tiling import MatMulOp as MatMulOp3


def main():
    """Benchmark the three matmul implementations on square matrices.

    For each (matrix size, repetition count) pair, two upper-triangular
    matrices of ones are multiplied by every ``MatMulOp`` implementation.
    Each implementation is run once untimed, then ``experiment_count`` times
    under the timer, and its output is compared with a reference result.
    The timing, throughput and GFLOPS (or a failure message) are printed.

    The reference result comes from numpy for sizes up to 512; for larger
    sizes it is produced by ``MatMulOp1``, so that implementation is then
    effectively compared against itself.
    """
    mgr = kp.Manager()
    for tensor_size, experiment_count in [(512, 1000), (4096, 5)]:
        tensor_shape = [tensor_size, tensor_size]
        mat_1 = np.triu(np.ones(tensor_shape))
        mat_2 = np.triu(np.ones(tensor_shape))

        tensor_in_1 = mgr.tensor(mat_1)
        tensor_in_2 = mgr.tensor(mat_2)
        tensor_out = mgr.tensor(np.zeros(tensor_shape))
        if tensor_size <= 512:
            mat_result = mat_1 @ mat_2
        else:
            # CPU is too slow for big sizes, so the naive GPU implementation
            # provides the reference result instead.
            MatMulOp1(mgr)(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
            # Copy the result: if data() exposes the tensor's own buffer
            # (NOTE(review): presumably it does - confirm), zeroing tensor_out
            # below would otherwise overwrite the reference as well.
            mat_result = tensor_out.data().reshape(tensor_shape).copy()

        print(f'{tensor_shape} input tensors:\n'
              f'{mat_1}\n'
              f'{mat_2}\n')
        print(f'Output :\n{mat_result}')

        # Each output element needs tensor_shape[1] multiplications and
        # tensor_shape[1] - 1 additions; the count is the same for every
        # implementation, so compute it once per size.
        op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)

        for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
            # Clear the previous implementation's output. A recorded sequence
            # does nothing until it is evaluated, so eval() is required for
            # the zeroed buffer to be synced to the device.
            tensor_out.data()[:] = 0
            mgr.sequence().record(kp.OpTensorSyncDevice([tensor_out])).eval()
            matmul_op = MatMulOp(mgr)
            # One untimed run before the measurement starts.
            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)

            # perf_counter is monotonic and high-resolution, unlike time.time,
            # which can jump when the system clock is adjusted.
            start_time = time.perf_counter()
            for _ in range(experiment_count):
                matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
            end_time = time.perf_counter()
            experiment_time = end_time - start_time

            output = tensor_out.data().reshape(tensor_shape)
            if (output == mat_result).all():
                print(f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
                      f'{experiment_time * 1000:0.2f}ms => '
                      f'{experiment_count / experiment_time:0.2f}op/s or '
                      f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
            else:
                print(f'Test failed => output tensor is wrong :\n{output}')


# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()