First example: arrays on GPGPU

Let's create first program which operates with arrays using GPGPU.

Initialization

Suppose, you have .NET runtime, F# development environment, and OpenCL-compatible device. Now you can create new F# project and add Brahma.FSharp packet to it.

OpenCL device

Let's check that Brahma.FSharp can detect at least one appropriate device.

1: 
2: 
3: 
4: 
open Brahma.FSharp

let device = ClDevice.GetFirstAppropriateDevice()
printfn "Device: %A" device.Name

As a result of execution you should get a name of detected device. For example, something like this: Device: "Intel(R) UHD Graphics 620 [0x5917]"

If you have truobles with device detection, you can:

Kernel creation

Let's create map2-like kernel with following features.

  • Kernel should be generic. We want to create a single function for all possible types of arrays.
  • Kernel should be parameterized by operation, like a classic map.

It can be done with the following code.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
open Brahma.FSharp

let arrayMap2 operation (clContext: ClContext) workGroupSize =
    // The first argument of kernel function is a virtual grid. It is by contract.
    let kernel =
        <@
            fun (range: Range1D) arrLength (array1: ClArray<_>) (array2: ClArray<_>) (result: ClArray<_>) ->
                let i = range.GlobalID0

                if i < arrLength then
                    result.[i] <- (%operation) array1.[i] array2.[i]
        @>

    let kernel = clContext.Compile kernel

    // You can inspect generated OpenCL code.
    printfn "Code: %A" kernel.Code

    // Compilation can be done once.
    // Compiled kernel can be executed many times on different queues with different arguments.
    fun (commandQueue: MailboxProcessor<_>) (inputArray1: ClArray<_>) (inputArray2: ClArray<_>) ->
        let ndRange = Range1D.CreateValid(inputArray1.Length, workGroupSize)

        // We use default allocation mode, but it can be tuned for specific cases.
        let outputArray = clContext.CreateClArray(inputArray1.Length, allocationMode = AllocationMode.Default)

        // We should use new executable object for safety.
        let kernel = kernel.GetKernel()

        commandQueue.Post(
            Msg.MsgSetArguments
                (fun () -> kernel.KernelFunc ndRange inputArray1.Length inputArray1 inputArray2 outputArray)
        )

        commandQueue.Post(Msg.CreateRunMsg<_, _> kernel)

        outputArray

Note, that our function operates over ClArray<_>. It allows one to provide fine-grained memory management.

Kernel execution

To execute or kernel we should do following steps.

  • Create OpenCL-compatible device.
  • Create execution context. Multiple contexts can be created on the top of single device. Moreover, one can use translators with different options in different contexts.
  • Get at least one queue. Queue is a way to communicate with device. Moreover, it is a synchronization primitive: commands in queue will be executed in order.
  • Allocate input data.
  • Create instances of kernel.
  • Run instances.
  • Get result back to the host.

It can be done using the following code.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
49: 
50: 
51: 
52: 
53: 
54: 
55: 
let main (argv: string[]) =
    // Size of arrays
    let n =
        if argv.Length > 0 then
            int argv.[0]
        else
            10

    let device = ClDevice.GetFirstAppropriateDevice()
    printfn "Device: %A" device.Name

    let context = ClContext(device)
    let mainQueue = context.QueueProvider.CreateQueue()

    // Specific instances of kernel.
    let intArraySum = arrayMap2 <@ (+) @> context 32
    let boolArraySum = arrayMap2 <@ (&&) @> context 32
    let arrayMask = arrayMap2 <@ fun x y -> if y then x else 0 @> context 32

    // Helpers for random data generation.
    let rnd = System.Random()
    let randomIntArray () = Array.init n (fun _ -> rnd.Next() / 10000)
    let randomBoolArray () = Array.init n (fun _ -> rnd.Next() % 2 = 1)

    let intA1 = randomIntArray ()
    let intA2 = randomIntArray ()

    let boolA1 = randomBoolArray ()
    let boolA2 = randomBoolArray ()
    let boolA3 = randomBoolArray ()

    // Allocation of input data.
    // Data will be copied to device automatically.
    let clIntA1 = context.CreateClArray<_>(intA1)
    let clIntA2 = context.CreateClArray<_>(intA2)
    let clBoolA1 = context.CreateClArray<_>(boolA1)
    let clBoolA2 = context.CreateClArray<_>(boolA2)
    let clBoolA3 = context.CreateClArray<_>(boolA3)

    // Evaluation.
    let intRes = intArraySum mainQueue clIntA1 clIntA2

    // Result of first call will be passed as to the next call without copying.
    let boolRes =
        boolArraySum mainQueue clBoolA1 clBoolA2
        |> boolArraySum mainQueue clBoolA3

    let res = arrayMask mainQueue intRes boolRes

    // Getting result as a F# array.
    // We use PostAdReply for synchronization.
    let resOnHost = Array.zeroCreate n
    let res = mainQueue.PostAndReply(fun ch -> Msg.CreateToHostMsg(res, resOnHost, ch))

    0

Final version

Finally, full code looks like follows.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
49: 
50: 
51: 
52: 
53: 
54: 
55: 
56: 
57: 
58: 
59: 
60: 
61: 
62: 
63: 
64: 
65: 
66: 
67: 
68: 
69: 
70: 
71: 
72: 
73: 
74: 
75: 
76: 
77: 
78: 
79: 
80: 
81: 
82: 
83: 
84: 
85: 
86: 
module FinalVersion =
    open Brahma.FSharp
    open Brahma.FSharp.OpenCL.Translator

    let arrayMap2 operation (clContext: ClContext) workGroupSize =
        let kernel =
            <@
                fun (range: Range1D) arrLength (array1: ClArray<_>) (array2: ClArray<_>) (result: ClArray<_>) ->
                    let i = range.GlobalID0

                    if i < arrLength then
                        result.[i] <- (%operation) array1.[i] array2.[i]
            @>

        let kernel = clContext.Compile kernel

        fun (commandQueue: MailboxProcessor<_>) (inputArray1: ClArray<_>) (inputArray2: ClArray<_>) ->
            let ndRange = Range1D.CreateValid(inputArray1.Length, workGroupSize)
            let outputArray = clContext.CreateClArray(inputArray1.Length, allocationMode = AllocationMode.Default)

            let kernel = kernel.GetKernel()

            commandQueue.Post(
                Msg.MsgSetArguments
                    (fun () -> kernel.KernelFunc ndRange inputArray1.Length inputArray1 inputArray2 outputArray)
            )

            commandQueue.Post(Msg.CreateRunMsg<_, _> kernel)

            outputArray

    [<EntryPoint>]
    let main argv =
        let n =
            if argv.Length > 0 then
                int argv.[0]
            else
                10

        let device = ClDevice.GetFirstAppropriateDevice()

        let context = ClContext(device)
        let mainQueue = context.QueueProvider.CreateQueue()

        let intArraySum = arrayMap2 <@ (+) @> context 32
        let boolArraySum = arrayMap2 <@ (&&) @> context 32
        let arrayMask = arrayMap2 <@ fun x y -> if y then x else 0 @> context 32

        let rnd = System.Random()
        let randomIntArray () = Array.init n (fun _ -> rnd.Next() / 10000)
        let randomBoolArray () = Array.init n (fun _ -> rnd.Next() % 2 = 1)

        let intA1 = randomIntArray ()
        let intA2 = randomIntArray ()

        let boolA1 = randomBoolArray ()
        let boolA2 = randomBoolArray ()
        let boolA3 = randomBoolArray ()

        let clIntA1 = context.CreateClArray<_>(intA1)
        let clIntA2 = context.CreateClArray<_>(intA2)
        let clBoolA1 = context.CreateClArray<_>(boolA1)
        let clBoolA2 = context.CreateClArray<_>(boolA2)
        let clBoolA3 = context.CreateClArray<_>(boolA3)

        let intRes = intArraySum mainQueue clIntA1 clIntA2

        let boolRes =
            boolArraySum mainQueue clBoolA1 clBoolA2
            |> boolArraySum mainQueue clBoolA3

        let res = arrayMask mainQueue intRes boolRes

        let resOnHost = Array.zeroCreate n
        let res = mainQueue.PostAndReply(fun ch -> Msg.CreateToHostMsg(res, resOnHost, ch))

        printfn "First int array:  %A" intA1
        printfn "Second int array: %A" intA2

        printfn "First bool array:  %A" boolA1
        printfn "Second bool array: %A" boolA2
        printfn "Third bool array:  %A" boolA3

        printfn "Result: %A" res

        0
namespace Brahma
namespace Brahma.FSharp
val device : ClDevice
Multiple items
type ClDevice =
  interface IDevice
  new : device:Device -> ClDevice
  override ToString : unit -> string
  member Device : Device
  member DeviceExtensions : string
  member DeviceType : DeviceType
  member MaxWorkGroupSize : int
  member MaxWorkItemDimensions : int
  member MaxWorkItemSizes : int []
  member Name : string
  ...

--------------------
new : device:OpenCL.Net.Device -> ClDevice
static member ClDevice.GetFirstAppropriateDevice : ?platform:Platform * ?deviceType:DeviceType -> ClDevice
val printfn : format:Printf.TextWriterFormat<'T> -> 'T
property ClDevice.Name: string with get
val arrayMap2 : operation:Quotations.Expr<('a -> 'b -> 'c)> -> clContext:ClContext -> workGroupSize:int -> (MailboxProcessor<Msg> -> ClArray<'a> -> ClArray<'b> -> ClArray<'c>)
val operation : Quotations.Expr<('a -> 'b -> 'c)>
val clContext : ClContext
Multiple items
type ClContext =
  new : clDevice:ClDevice * ?translator:FSQuotationToOpenCLTranslator * ?compilerOptions:string -> ClContext
  override ToString : unit -> string
  member ClDevice : ClDevice
  member CompilerOptions : string option
  member Context : Context
  member QueueProvider : CommandQueueProvider
  member Translator : FSQuotationToOpenCLTranslator

--------------------
new : clDevice:ClDevice * ?translator:OpenCL.Translator.FSQuotationToOpenCLTranslator * ?compilerOptions:string -> ClContext
val workGroupSize : int
val kernel : Quotations.Expr<(Range1D -> int -> ClArray<'a> -> ClArray<'b> -> ClArray<'c> -> unit)>
val range : Range1D
Multiple items
type Range1D =
  interface INDRange
  new : globalWorkSize:int -> Range1D
  new : globalWorkSize:int * localWorkSize:int -> Range1D
  private new : globalWorkSize:int * localWorkSize:int * __:unit -> Range1D
  member GlobalID0 : int
  member GlobalWorkSize : int
  member LocalID0 : int
  member LocalWorkSize : int
  static member CreateValid : neededSize:int * localWorkSize:int -> Range1D

--------------------
new : globalWorkSize:int -> Range1D
new : globalWorkSize:int * localWorkSize:int -> Range1D
val arrLength : int
val array1 : ClArray<'a>
Multiple items
module ClArray

from Brahma.FSharp

--------------------
type ClArray<'a> =
  interface IBuffer<'a>
  interface IClMem
  interface IDisposable
  private new : buffer:ClBuffer<'a> -> ClArray<'a>
  member Dispose : unit -> unit
  override ToString : unit -> string
  member private Buffer : ClBuffer<'a>
  member Item : idx:int -> 'a with get
  member Length : int
  member Item : idx:int -> 'a with set
val array2 : ClArray<'b>
val result : ClArray<'c>
val i : int
property Range1D.GlobalID0: int with get
val kernel : ClProgram<Range1D,(int -> ClArray<'a> -> ClArray<'b> -> ClArray<'c> -> unit)>
member ClContext.Compile : srcLambda:Quotations.Expr<('TRange -> 'a)> -> ClProgram<'TRange,'a> (requires 'TRange :> INDRange)
property ClProgram.Code: string with get
val commandQueue : MailboxProcessor<Msg>
Multiple items
type MailboxProcessor<'Msg> =
  interface IDisposable
  new : body:(MailboxProcessor<'Msg> -> Async<unit>) * ?cancellationToken:CancellationToken -> MailboxProcessor<'Msg>
  member Post : message:'Msg -> unit
  member PostAndAsyncReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> Async<'Reply>
  member PostAndReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> 'Reply
  member PostAndTryAsyncReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> Async<'Reply option>
  member Receive : ?timeout:int -> Async<'Msg>
  member Scan : scanner:('Msg -> Async<'T> option) * ?timeout:int -> Async<'T>
  member Start : unit -> unit
  member TryPostAndReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> 'Reply option
  ...

--------------------
new : body:(MailboxProcessor<'Msg> -> Async<unit>) * ?cancellationToken:System.Threading.CancellationToken -> MailboxProcessor<'Msg>
val inputArray1 : ClArray<'a>
val inputArray2 : ClArray<'b>
val ndRange : Range1D
static member Range1D.CreateValid : neededSize:int * localWorkSize:int -> Range1D
property ClArray.Length: int with get
val outputArray : ClArray<'c>
member ClContext.CreateClArray : data:'a [] * ?hostAccessMode:HostAccessMode * ?deviceAccessMode:DeviceAccessMode * ?allocationMode:AllocationMode -> ClArray<'a>
member ClContext.CreateClArray : size:int * ?hostAccessMode:HostAccessMode * ?deviceAccessMode:DeviceAccessMode * ?allocationMode:AllocationMode -> ClArray<'a>
type AllocationMode =
  | UseHostPtr
  | AllocHostPtr
  | CopyHostPtr
  | AllocAndCopyHostPtr
  | Default
union case AllocationMode.Default: AllocationMode
val kernel : ClKernel<Range1D,(int -> ClArray<'a> -> ClArray<'b> -> ClArray<'c> -> unit)>
member MailboxProcessor.Post : message:'Msg -> unit
type Msg =
  | MsgToHost of IToHostCrate
  | MsgToGPU of IToGPUCrate
  | MsgRun of IRunCrate
  | MsgFree of IFreeCrate
  | MsgSetArguments of (unit -> unit)
  | MsgNotifyMe of AsyncReplyChannel<unit>
  | MsgBarrier of SyncObject
    static member CreateBarrierMessages : numOfQueuesOnBarrier:int -> Msg []
    static member CreateFreeMsg : src:IDisposable -> Msg
    static member CreateRunMsg : kernel:IKernel -> Msg
    static member CreateToGPUMsg : src:'a [] * dst:IBuffer<'a> -> Msg
    static member CreateToHostMsg : src:IBuffer<'a> * dst:'a [] * ?ch:AsyncReplyChannel<'a []> -> Msg
union case Msg.MsgSetArguments: (unit -> unit) -> Msg
property ClKernel.KernelFunc: Range1D -> int -> ClArray<'a> -> ClArray<'b> -> ClArray<'c> -> unit with get
static member Msg.CreateRunMsg : kernel:IKernel -> Msg
val main : argv:string [] -> int
val argv : string []
Multiple items
val string : value:'T -> string

--------------------
type string = System.String
val n : int
property System.Array.Length: int with get
Multiple items
val int : value:'T -> int (requires member op_Explicit)

--------------------
type int = int32

--------------------
type int<'Measure> = int
val context : ClContext
val mainQueue : MailboxProcessor<Msg>
property ClContext.QueueProvider: CommandQueueProvider with get
member CommandQueueProvider.CreateQueue : unit -> MailboxProcessor<Msg>
val intArraySum : (MailboxProcessor<Msg> -> ClArray<int> -> ClArray<int> -> ClArray<int>)
val boolArraySum : (MailboxProcessor<Msg> -> ClArray<bool> -> ClArray<bool> -> ClArray<bool>)
val arrayMask : (MailboxProcessor<Msg> -> ClArray<int> -> ClArray<bool> -> ClArray<int>)
val x : int
val y : bool
val rnd : System.Random
namespace System
Multiple items
type Random =
  new : unit -> Random + 1 overload
  member Next : unit -> int + 2 overloads
  member NextBytes : buffer:byte[] -> unit + 1 overload
  member NextDouble : unit -> float

--------------------
System.Random() : System.Random
System.Random(Seed: int) : System.Random
val randomIntArray : (unit -> int [])
module Array

from Microsoft.FSharp.Collections
val init : count:int -> initializer:(int -> 'T) -> 'T []
System.Random.Next() : int
System.Random.Next(maxValue: int) : int
System.Random.Next(minValue: int, maxValue: int) : int
val randomBoolArray : (unit -> bool [])
val intA1 : int []
val intA2 : int []
val boolA1 : bool []
val boolA2 : bool []
val boolA3 : bool []
val clIntA1 : ClArray<int>
val clIntA2 : ClArray<int>
val clBoolA1 : ClArray<bool>
val clBoolA2 : ClArray<bool>
val clBoolA3 : ClArray<bool>
val intRes : ClArray<int>
val boolRes : ClArray<bool>
val res : ClArray<int>
val resOnHost : int []
val zeroCreate : count:int -> 'T []
val res : int []
member MailboxProcessor.PostAndReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> 'Reply
val ch : AsyncReplyChannel<int []>
static member Msg.CreateToHostMsg : src:OpenCL.Shared.IBuffer<'a> * dst:'a [] * ?ch:AsyncReplyChannel<'a []> -> Msg
module FinalVersion

from Basic_Example
namespace Brahma.FSharp.OpenCL
namespace Brahma.FSharp.OpenCL.Translator
Multiple items
type ClContext =
  new : clDevice:ClDevice * ?translator:FSQuotationToOpenCLTranslator * ?compilerOptions:string -> ClContext
  override ToString : unit -> string
  member ClDevice : ClDevice
  member CompilerOptions : string option
  member Context : Context
  member QueueProvider : CommandQueueProvider
  member Translator : FSQuotationToOpenCLTranslator

--------------------
new : clDevice:ClDevice * ?translator:FSQuotationToOpenCLTranslator * ?compilerOptions:string -> ClContext
Multiple items
type EntryPointAttribute =
  inherit Attribute
  new : unit -> EntryPointAttribute

--------------------
new : unit -> EntryPointAttribute