using CUDAdrv; CUDAdrv.name(CuDevice(0))

using CuArrays, FileIO, Colors, GPUArrays, BenchmarkTools, ImageShow
using CuArrays: CuArray

"""
The function calculating the Julia set
"""
function juliaset(z0, maxiter)
    c = ComplexF32(-0.5, 0.75)
    z = z0
    for i in 1:maxiter
        abs2(z) > 4f0 && return (i - 1) % UInt8
        z = z * z + c
    end
    return maxiter % UInt8 # % is used to convert without overflow check
end

range = 100:50:2^12
cutimes, jltimes = Float64[], Float64[]
function run_bench(in, out)
    # use dot syntax to apply `juliaset` to each element of q_converted
    # and write the output to result
    out .= juliaset.(in, 16)
    # all calls to the GPU are scheduled asynchronously,
    # so we need to synchronize
    GPUArrays.synchronize(out)
end

# store a reference to the last results for plotting
last_jl, last_cu = nothing, nothing
for N in range
    w, h = N, N
    q = [ComplexF32(r, i) for i = 1:-(2.0/w):-1, r = -1.5:(3.0/h):1.5]
    for (times, Typ) in ((cutimes, CuArray), (jltimes, Array))
        # convert to Array or CuArray - moving the calculation to CPU/GPU
        q_converted = Typ(q)
        result = Typ(zeros(UInt8, size(q)))
        for i in 1:10 # 10 samples per size
            # time one run (Base.@elapsed returns the elapsed seconds)
            t = Base.@elapsed begin
                run_bench(q_converted, result)
            end
            global last_jl, last_cu # we're in local scope
            if result isa CuArray
                last_cu = result
            else
                last_jl = result
            end
            push!(times, t)
        end
    end
end

cu_jl = hcat(Array(last_cu), last_jl)
cmap = colormap("Blues", 16 + 1)
color_lookup(val, cmap) = cmap[val + 1]
color_lookup.(cu_jl, (cmap,))

using Plots; plotly()
x = repeat(range, inner = 10)
speedup = jltimes ./ cutimes
Plots.scatter(
    log2.(x), [speedup, fill(1.0, length(speedup))],
    label = ["cuda" "cpu"], markersize = 2,
    markerstrokewidth = 0, legend = :right,
    xlabel = "2^N", ylabel = "speedup"
)

struct Test # an immutable struct
    # that only contains other immutables, which makes
    # isbitstype(Test) == true
    x::Float32
end

# the isbits property is important, since those types can be used
# without constraints on the GPU!
@assert isbitstype(Test) == true
x = (2, 2)
isa(x, Tuple{Int, Int}) # tuples are also immutable

mutable struct Test2 # -> mutable, isbitstype(Test2) == false
    x::Float32
end

struct Test3
    # contains a heap allocation / reference, so it's not isbits
    x::Vector{Float32}
    y::Test2 # Test2 is mutable and also heap allocated / a reference
end

Vector{Test}  # <- An Array with isbits elements is contiguous in memory
Vector{Test2} # <- An Array with mutable elements is basically an array of
              #    heap pointers. Since it just contains CPU heap pointers,
              #    it won't work on the GPU.
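# A minimal sketch (added for illustration, not from the original text) that
# makes the isbits rule above explicit: only arrays with isbits element types
# can be moved to the GPU, because every element must be stored inline,
# without CPU heap pointers.
@assert isbitstype(Test2) == false  # mutable -> elements become heap references
@assert isbitstype(Test3) == false  # holds a Vector and a mutable field
gpu_ok = CuArray(Test[Test(1f0), Test(2f0)]) # works: contiguous isbits elements
# CuArray(Test2[Test2(1f0)]) would fail, since its elements are CPU heap pointers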
using CuArrays, LinearAlgebra
using CuArrays.CURAND

# GPU Arrays can be constructed from all Julia arrays containing isbits types!
A1D = cu([1, 2, 3]) # cl for CLArrays
A1D = CuArrays.fill(0, 100) # CLArray for CLArrays
# Float32 array - Float32 is usually preferred and can be up to 30x faster
# on most GPUs than Float64
diagonal_matrix = CuArray{Float32}(I, 100, 100)
filled = CuArrays.fill(77f0, (4, 4, 4)) # 3D array filled with Float32 77
randy = CuArrays.rand(42, 42) # random numbers generated on the GPU
# The array constructor also accepts isbits iterators with a known size.
# Note that since you can also pass isbits types to a GPU kernel directly,
# in most cases you won't need to materialize them as a GPU array.
from_iter = CuArray(1:10)

# let's create a point type to further illustrate what can be done:
struct Point
    x::Float32
    y::Float32
end
Base.convert(::Type{Point}, x::NTuple{2, Any}) = Point(x[1], x[2])
# because we defined the above convert from a tuple to a point,
# [Point(2, 2)] can be written as Point[(2, 2)] since all array
# elements will get converted to Point
custom_types = cu(Point[(1, 2), (4, 3), (2, 2)])
typeof(custom_types)

x = zeros(4, 4) # 4x4 array of zeros
y = zeros(4)    # 4 element array
z = 2           # a scalar
# y's 1st dimension gets repeated for the 2nd dimension in x,
# and the scalar z gets repeated for all dimensions
# the below is equal to `broadcast(+, broadcast(+, x, y), z)`
x .+ y .+ z

using CuArrays
A = cu([1, 2, 3])
B = cu([1, 2, 3])
C = CuArrays.rand(3)
result = A .+ B .- C
test(a::T) where T = a * convert(T, 2) # convert to the same type as `a`

# in-place broadcast, writes directly into `result`
result .= test.(A) # custom functions work

# The cool thing is that this composes well with custom types and custom functions.
# Let's go back to our Point type and define addition for it
Base.:(+)(p1::Point, p2::Point) = Point(p1.x + p2.x, p1.y + p2.y)

# now this works:
custom_types = cu(Point[(1, 2), (4, 3), (2, 2)])

# This particular example also shows the power of broadcasting:
# non-array types are broadcasted and repeated for the whole length
result = custom_types .+ Ref(Point(2, 2))

# So the above is equal to (minus all the allocations):
# this allocates a new array on the gpu, which we can avoid with the above broadcast
broadcasted = fill(CuArray, Point(2, 2), (3,))
result == custom_types .+ broadcasted

using Flux, Flux.Data.MNIST, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle, params
using Base.Iterators: repeated, partition
using CuArrays

# Classify MNIST digits with a convolutional network
imgs = MNIST.images()
labels = onehotbatch(MNIST.labels(), 0:9)

# Partition into batches of size 1,000
train = [(cat(float.(imgs[i])..., dims = 4), labels[:, i])
         for i in partition(1:60_000, 1000)]

use_gpu = true # helper to easily switch between gpu/cpu
todevice(x) = use_gpu ? gpu(x) : x
train = todevice.(train)

# Prepare test set (first 1,000 images)
tX = cat(float.(MNIST.images(:test)[1:1000])..., dims = 4) |> todevice
tY = onehotbatch(MNIST.labels(:test)[1:1000], 0:9) |> todevice

m = todevice(Chain(
    Conv((2, 2), 1 => 16, relu),
    MaxPool((2, 2)),
    Conv((2, 2), 16 => 8, relu),
    MaxPool((2, 2)),
    x -> reshape(x, :, size(x, 4)),
    Dense(288, 10),
    softmax
))

loss(x, y) = crossentropy(m(x), y)

# train
Flux.train!(loss, params(m), train, ADAM())

using Colors, FileIO, ImageShow
N = 7
img = tX[:, :, 1:1, N:N]
println("Predicted: ", Flux.onecold(m(gpu(img))) .- 1)
save("results/test.png", collect(tX[:, :, 1, N]))
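# A small hedged addition (not part of the original): with the test set already
# on the device, a Flux model-zoo style accuracy check can be sketched like this,
# using only the names defined or imported above (`m`, `tX`, `tY`, `onecold`, `mean`).
accuracy(x, y) = mean(onecold(m(x)) .== onecold(y))
@show accuracy(tX, tY)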
using GPUArrays, CuArrays

# Overloading the Julia Base map! function for GPUArrays
function Base.map!(f::Function, A::GPUArray, B::GPUArray)
    # our function that will run on the gpu
    function kernel(state, f, A, B)
        # If launch parameters aren't specified, linear_index gets the index
        # into the Array passed as second argument to gpu_call (`A`)
        i = linear_index(state)
        if i <= length(A)
            @inbounds A[i] = f(B[i])
        end
        return
    end
    # call kernel on the gpu
    gpu_call(kernel, A, (f, A, B))
end

using BenchmarkTools
function threaded_map!(f::Function, A::Array, B::Array)
    Threads.@threads for i in 1:length(A)
        A[i] = f(B[i])
    end
    A
end

x, y = rand(10^7), rand(10^7)
kernel(y) = (y / 33f0) * (732f0 / y)
# on the CPU without threads:
single_t = @belapsed map!($kernel, $x, $y)
# on the CPU with 4 threads (2 real cores):
thread_t = @belapsed threaded_map!($kernel, $x, $y)
# on the GPU:
xgpu, ygpu = cu(x), cu(y)
gpu_t = @belapsed begin
    map!($kernel, $xgpu, $ygpu)
    GPUArrays.synchronize($xgpu)
end
times = [single_t, thread_t, gpu_t]
speedup = maximum(times) ./ times
println("speedup: $speedup")
bar(["1 core", "2 cores", "gpu"], speedup,
    legend = false, fillcolor = :grey, ylabel = "speedup")

using CuArrays

threads = (2, 2)
blocks = (2, 2)
T = fill(CuArray, (0, 0), (4, 4))
B = fill(CuArray, (0, 0), (4, 4))
gpu_call(T, (B, T), (blocks, threads)) do state, A, B
    # those names pretty much refer to the CUDA names
    b = (blockidx_x(state), blockidx_y(state))
    bdim = (blockdim_x(state), blockdim_y(state))
    t = (threadidx_x(state), threadidx_y(state))
    idx = (bdim .* (b .- 1)) .+ t
    A[idx...] = b
    B[idx...] = t
    return
end
println("Threads index: \n", T)
println("Block index: \n", B)
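# A hedged sketch (my addition, reusing only the gpu_call/linear_index API shown
# above): the same do-block form, but with the default launch configuration,
# computing an elementwise a*X + Y. The names X, Y, out and a are illustrative only.
X = cu(rand(Float32, 1024))
Y = cu(rand(Float32, 1024))
out = similar(X)
a = 2f0
gpu_call(out, (out, X, Y, a)) do state, out, X, Y, a
    # without explicit (blocks, threads), linear_index walks `out` linearly
    i = linear_index(state)
    if i <= length(out)
        @inbounds out[i] = a * X[i] + Y[i]
    end
    return
end
@assert Array(out) ≈ a .* Array(X) .+ Array(Y)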