Python Examples

This section contains simple and advanced examples using the Python Kompute class. For an overview of the module, see the Python Package Overview; for a deep dive into the available functions, see the Python Class Reference section.

You can run the examples below after installing the dependencies listed in python/test/requirements-dev.txt (for example, with pip install -r python/test/requirements-dev.txt).

Python Example (Simple)

Once the dependencies are installed you can interact with Kompute from your Python interpreter. Below is the same example as the "Your First Kompute (Simple Version)" section above, but in Python:

from kp import Manager, OpTensorSyncDevice, OpTensorSyncLocal, OpAlgoDispatch
from pyshader import python2shader, ivec3, f32, Array

mgr = Manager()

# Tensors can be initialized from a Python list or a numpy array
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
tensor_out = mgr.tensor([0, 0, 0])

sq = mgr.sequence()

sq.eval(OpTensorSyncDevice([tensor_in_a, tensor_in_b, tensor_out]))

# Define the shader via pyshader, or provide it directly as SPIR-V bytes
@python2shader
def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
                            data1=("buffer", 0, Array(f32)),
                            data2=("buffer", 1, Array(f32)),
                            data3=("buffer", 2, Array(f32))):
    i = index.x
    data3[i] = data1[i] * data2[i]

algo = mgr.algorithm([tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())

# Run shader operation synchronously
sq.eval(OpAlgoDispatch(algo))
sq.eval(OpTensorSyncLocal([tensor_out]))

assert tensor_out.data().tolist() == [2.0, 4.0, 6.0]
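
As noted in the comment above, the shader can also be provided directly as SPIR-V bytes rather than via pyshader. Below is a minimal sketch of this alternative, assuming a precompiled shader file is available ("shader.comp.spv" is a hypothetical path used only for illustration):

# Hedged sketch: "shader.comp.spv" is a hypothetical precompiled SPIR-V file
with open("shader.comp.spv", "rb") as f:
    spirv_bytes = f.read()

# The raw bytes can be passed in place of the pyshader to_spirv() output
algo_alt = mgr.algorithm([tensor_in_a, tensor_in_b, tensor_out], spirv_bytes)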

Python Example (Extended)

Similarly, below is the Python version of the extended example shown above:

from kp import Manager
import kp
from pyshader import python2shader, ivec3, f32, Array

# Initialise the Manager on device 0 with queue family index 2
mgr = Manager(0, [2])

# Tensors can be initialized from a Python list or a numpy array
tensor_in_a = mgr.tensor([2, 2, 2])
tensor_in_b = mgr.tensor([1, 2, 3])
tensor_out = mgr.tensor([0, 0, 0])

seq = mgr.sequence()
seq.eval(kp.OpTensorSyncDevice([tensor_in_a, tensor_in_b, tensor_out]))

# Define the shader via pyshader, or provide it directly as SPIR-V bytes
@python2shader
def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3),
                            data1=("buffer", 0, Array(f32)),
                            data2=("buffer", 1, Array(f32)),
                            data3=("buffer", 2, Array(f32))):
    i = index.x
    data3[i] = data1[i] * data2[i]

algo = mgr.algorithm([tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv())

# Run the shader operation asynchronously and then await completion
seq.eval_async(kp.OpAlgoDispatch(algo))
seq.eval_await()

seq.record(kp.OpTensorSyncLocal([tensor_in_a]))
seq.record(kp.OpTensorSyncLocal([tensor_in_b]))
seq.record(kp.OpTensorSyncLocal([tensor_out]))

seq.eval()

assert tensor_out.data().tolist() == [2.0, 4.0, 6.0]
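
Once operations have been recorded into a sequence, calling eval() re-submits them without re-recording the command buffer. As a minimal sketch, the three sync operations recorded above can be re-evaluated repeatedly:

# Minimal sketch: each eval() re-submits the previously recorded operations,
# avoiding the cost of re-recording the command buffer
for _ in range(3):
    seq.eval()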

Kompute Operation Capabilities

Multiple processing capabilities can be handled by loading compute shaders into separate sequences, so that each workload can be recorded once and evaluated independently. The example below shows how this can be done:

from kp import Manager
import kp

# We'll assume we have the shader data available
from my_spv_shader_data import mult_shader, sum_shader

mgr = Manager()

t1 = mgr.tensor([2, 2, 2])
t2 = mgr.tensor([1, 2, 3])
t3 = mgr.tensor([1, 2, 3])

# Sync all input tensors to the GPU before dispatching
mgr.sequence().eval(kp.OpTensorSyncDevice([t1, t2, t3]))

# Create multiple separate sequences
sq_mult = mgr.sequence()
sq_sum = mgr.sequence()
sq_sync = mgr.sequence()

sq_mult.record(kp.OpAlgoDispatch(mgr.algorithm([t1, t2, t3], mult_shader)))

sq_sum.record(kp.OpAlgoDispatch(mgr.algorithm([t3, t2, t1], sum_shader)))

sq_sync.record(kp.OpTensorSyncLocal([t1, t3]))

# Run multiple iterations
for i in range(10):
    sq_mult.eval()
    sq_sum.eval()

sq_sync.eval()

print(t1.data(), t2.data(), t3.data())
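
Sequences can also be created against different device queues when the Manager is initialised with more than one queue family index, which is what enables genuinely concurrent processing. A hedged sketch follows; the queue family indices (0 and 2) and the names mgr_mq, sq_a, sq_b are illustrative and device-dependent:

# Hedged sketch: queue family indices depend on the underlying device
mgr_mq = Manager(0, [0, 2])

sq_a = mgr_mq.sequence(0)  # runs on the first queue passed to the Manager
sq_b = mgr_mq.sequence(1)  # runs on the second queue passed to the Manager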

Machine Learning Logistic Regression Implementation

Mirroring the logistic regression implementation in the C++ examples section, below is the Python implementation of the logistic regression algorithm.
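
For reference, each shader invocation computes the per-sample sigmoid prediction and the gradients of the binary cross-entropy loss over the m samples (a standard derivation, stated here for clarity):

\hat{y} = \sigma(w \cdot x + b) = \frac{1}{1 + e^{-(w \cdot x + b)}}, \qquad
L = -\left( y \log \hat{y} + (1 - y) \log(1 - \hat{y}) \right)

\frac{\partial L}{\partial w} = \frac{1}{m}\, x \, (\hat{y} - y), \qquad
\frac{\partial L}{\partial b} = \frac{1}{m}\, (\hat{y} - y)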

from kp import Manager
import kp
from pyshader import python2shader, ivec3, vec2, f32, Array
from pyshader.stdlib import exp, log

@python2shader
def compute_shader(
        index   = ("input", "GlobalInvocationId", ivec3),
        x_i     = ("buffer", 0, Array(f32)),
        x_j     = ("buffer", 1, Array(f32)),
        y       = ("buffer", 2, Array(f32)),
        w_in    = ("buffer", 3, Array(f32)),
        w_out_i = ("buffer", 4, Array(f32)),
        w_out_j = ("buffer", 5, Array(f32)),
        b_in    = ("buffer", 6, Array(f32)),
        b_out   = ("buffer", 7, Array(f32)),
        l_out   = ("buffer", 8, Array(f32)),
        M       = ("buffer", 9, Array(f32))):

    i = index.x

    m = M[0]

    w_curr = vec2(w_in[0], w_in[1])
    b_curr = b_in[0]

    x_curr = vec2(x_i[i], x_j[i])
    y_curr = y[i]

    z_dot = w_curr @ x_curr
    z = z_dot + b_curr
    y_hat = 1.0 / (1.0 + exp(-z))

    d_z = y_hat - y_curr
    d_w = (1.0 / m) * x_curr * d_z
    d_b = (1.0 / m) * d_z

    # Binary cross-entropy loss (note the (1.0 - y_curr) term)
    loss = -((y_curr * log(y_hat)) + ((1.0 - y_curr) * log(1.0 - y_hat)))

    w_out_i[i] = d_w.x
    w_out_j[i] = d_w.y
    b_out[i] = d_b
    l_out[i] = loss


mgr = Manager()

# First we create input and output tensors for the shader
tensor_x_i = mgr.tensor([0.0, 1.0, 1.0, 1.0, 1.0])
tensor_x_j = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0])

tensor_y = mgr.tensor([0.0, 0.0, 0.0, 1.0, 1.0])

tensor_w_in = mgr.tensor([0.001, 0.001])
tensor_w_out_i = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])
tensor_w_out_j = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])

tensor_b_in = mgr.tensor([0.0])
tensor_b_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])

tensor_l_out = mgr.tensor([0.0, 0.0, 0.0, 0.0, 0.0])

tensor_m = mgr.tensor([5.0])

# We store them in an array for easier interaction
params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
    tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m]

mgr.sequence().eval(kp.OpTensorSyncDevice(params))

# Record commands for efficient evaluation
sq = mgr.sequence()

sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in]))
sq.record(kp.OpAlgoDispatch(mgr.algorithm(params, compute_shader.to_spirv())))
sq.record(kp.OpTensorSyncLocal([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]))

ITERATIONS = 100
learning_rate = 0.1

# Perform machine learning training and inference across all input X and Y
for i_iter in range(ITERATIONS):
    sq.eval()

    # Calculate the parameters based on the respective derivatives calculated
    w_in_i_val = tensor_w_in.data()[0]
    w_in_j_val = tensor_w_in.data()[1]
    b_in_val = tensor_b_in.data()[0]

    for j_iter in range(tensor_b_out.size()):
        w_in_i_val -= learning_rate * tensor_w_out_i.data()[j_iter]
        w_in_j_val -= learning_rate * tensor_w_out_j.data()[j_iter]
        b_in_val -= learning_rate * tensor_b_out.data()[j_iter]

    # Update the parameters to process inference again
    tensor_w_in.set_data([w_in_i_val, w_in_j_val])
    tensor_b_in.set_data([b_in_val])

assert tensor_w_in.data()[0] < 0.01
assert tensor_w_in.data()[0] > 0.0
assert tensor_w_in.data()[1] > 1.5
assert tensor_b_in.data()[0] < 0.7

# Print outputs
print(tensor_w_in.data())
print(tensor_b_in.data())
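
As a follow-up, the learned parameters can be sanity-checked on the CPU by reproducing the shader's forward pass with NumPy. This is a minimal sketch; the 0.5 decision threshold is an assumption, not part of the original example:

import numpy as np

# Minimal sketch: reproduce the shader's forward pass (sigmoid of w.x + b)
# on the CPU using the trained parameters
w = tensor_w_in.data()                                 # shape (2,)
b = tensor_b_in.data()[0]
x = np.stack([tensor_x_i.data(), tensor_x_j.data()])   # shape (2, 5)

y_hat = 1.0 / (1.0 + np.exp(-(w @ x + b)))
print((y_hat > 0.5).astype(np.float32))  # expected to approximate tensor_y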