using CUDAdrv; CUDAdrv.name(CuDevice(0))

using CuArrays, FileIO, Colors, GPUArrays, BenchmarkTools, ImageShow
using CuArrays: CuArray

"""
The function calculating the Julia set
"""
function juliaset(z0, maxiter)
    c = ComplexF32(-0.5, 0.75)
    z = z0
    for i in 1:maxiter
        abs2(z) > 4f0 && return (i - 1) % UInt8
        z = z * z + c
    end
    return maxiter % UInt8 # % is used to convert without overflow check
end

range = 100:50:2^12
cutimes, jltimes = Float64[], Float64[]
function run_bench(in, out)
    # use dot syntax to apply `juliaset` to each element of q_converted
    # and write the output to result
    out .= juliaset.(in, 16)
    # all calls to the GPU are scheduled asynchronously,
    # so we need to synchronize
    GPUArrays.synchronize(out)
end

# store a reference to the last results for plotting
last_jl, last_cu = nothing, nothing
for N in range
    w, h = N, N
    q = [ComplexF32(r, i) for i = 1:-(2.0/w):-1, r = -1.5:(3.0/h):1.5]
    for (times, Typ) in ((cutimes, CuArray), (jltimes, Array))
        # convert to Array or CuArray - moving the calculation to CPU/GPU
        q_converted = Typ(q)
        result = Typ(zeros(UInt8, size(q)))
        for i in 1:10 # 10 samples per size
            # time one run (Base.@elapsed returns the elapsed seconds)
            t = Base.@elapsed begin
                run_bench(q_converted, result)
            end
            global last_jl, last_cu # we're in local scope
            if result isa CuArray
                last_cu = result
            else
                last_jl = result
            end
            push!(times, t)
        end
    end
end

cu_jl = hcat(Array(last_cu), last_jl)
cmap = colormap("Blues", 16 + 1)
color_lookup(val, cmap) = cmap[val + 1]
color_lookup.(cu_jl, (cmap,))

using Plots; plotly()
x = repeat(range, inner = 10)
speedup = jltimes ./ cutimes
Plots.scatter(
    log2.(x), [speedup, fill(1.0, length(speedup))],
    label = ["cuda" "cpu"], markersize = 2,
    markerstrokewidth = 0, legend = :right,
    xlabel = "2^N", ylabel = "speedup"
)

struct Test # an immutable struct
    # that only contains other immutables, which makes
    # isbitstype(Test) == true
    x::Float32
end

# the isbits property is important, since those types can be used
# without constraints on the GPU!
@assert isbitstype(Test) == true
x = (2, 2)
isa(x, Tuple{Int, Int}) # tuples are also immutable

mutable struct Test2 # -> mutable, isbitstype(Test2) == false
    x::Float32
end

struct Test3
    # contains a heap allocation / reference, so it's not isbits
    x::Vector{Float32}
    y::Test2 # Test2 is mutable and also heap allocated / a reference
end

Vector{Test}  # <- An Array with isbits elements is contiguous in memory
Vector{Test2} # <- An Array with mutable elements is basically an array of
              #    heap pointers. Since it just contains CPU heap pointers,
              #    it won't work on the GPU.
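# A minimal sketch (added for illustration, not from the original text) that
# makes the isbits rule above explicit: only arrays with isbits element types
# can be moved to the GPU, because every element must be stored inline,
# without CPU heap pointers.
@assert isbitstype(Test2) == false  # mutable -> elements become heap references
@assert isbitstype(Test3) == false  # holds a Vector and a mutable field
gpu_ok = CuArray(Test[Test(1f0), Test(2f0)]) # works: contiguous isbits elements
# CuArray(Test2[Test2(1f0)]) would fail, since its elements are CPU heap pointers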
using CuArrays, LinearAlgebra
using CuArrays.CURAND

# GPU Arrays can be constructed from all Julia arrays containing isbits types!
A1D = cu([1, 2, 3]) # cl for CLArrays
A1D = CuArrays.fill(0, 100) # CLArray for CLArrays
# Float32 array - Float32 is usually preferred and can be up to 30x faster
# on most GPUs than Float64
diagonal_matrix = CuArray{Float32}(I, 100, 100)
filled = CuArrays.fill(77f0, (4, 4, 4)) # 3D array filled with Float32 77
randy = CuArrays.rand(42, 42) # random numbers generated on the GPU
# The array constructor also accepts isbits iterators with a known size.
# Note that since you can also pass isbits types to a GPU kernel directly,
# in most cases you won't need to materialize them as a GPU array.
from_iter = CuArray(1:10)

# let's create a point type to further illustrate what can be done:
struct Point
    x::Float32
    y::Float32
end
Base.convert(::Type{Point}, x::NTuple{2, Any}) = Point(x[1], x[2])
# because we defined the above convert from a tuple to a point,
# [Point(2, 2)] can be written as Point[(2, 2)] since all array
# elements will get converted to Point
custom_types = cu(Point[(1, 2), (4, 3), (2, 2)])
typeof(custom_types)

x = zeros(4, 4) # 4x4 array of zeros
y = zeros(4)    # 4 element array
z = 2           # a scalar
# y's 1st dimension gets repeated for the 2nd dimension in x,
# and the scalar z gets repeated for all dimensions
# the below is equal to `broadcast(+, broadcast(+, x, y), z)`
x .+ y .+ z

using CuArrays
A = cu([1, 2, 3])
B = cu([1, 2, 3])
C = CuArrays.rand(3)
result = A .+ B .- C
test(a::T) where T = a * convert(T, 2) # convert to the same type as `a`

# in-place broadcast, writes directly into `result`
result .= test.(A) # custom functions work

# The cool thing is that this composes well with custom types and custom functions.
# Let's go back to our Point type and define addition for it
Base.:(+)(p1::Point, p2::Point) = Point(p1.x + p2.x, p1.y + p2.y)

# now this works:
custom_types = cu(Point[(1, 2), (4, 3), (2, 2)])

# This particular example also shows the power of broadcasting:
# non-array types are broadcasted and repeated for the whole length
result = custom_types .+ Ref(Point(2, 2))

# So the above is equal to (minus all the allocations):
# this allocates a new array on the gpu, which we can avoid with the above broadcast
broadcasted = fill(CuArray, Point(2, 2), (3,))
result == custom_types .+ broadcasted

using Flux, Flux.Data.MNIST, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle, params
using Base.Iterators: repeated, partition
using CuArrays

# Classify MNIST digits with a convolutional network
imgs = MNIST.images()
labels = onehotbatch(MNIST.labels(), 0:9)

# Partition into batches of size 1,000
train = [(cat(float.(imgs[i])..., dims = 4), labels[:, i])
         for i in partition(1:60_000, 1000)]

use_gpu = true # helper to easily switch between gpu/cpu
todevice(x) = use_gpu ? gpu(x) : x
train = todevice.(train)

# Prepare test set (first 1,000 images)
tX = cat(float.(MNIST.images(:test)[1:1000])..., dims = 4) |> todevice
tY = onehotbatch(MNIST.labels(:test)[1:1000], 0:9) |> todevice

m = todevice(Chain(
    Conv((2, 2), 1 => 16, relu),
    MaxPool((2, 2)),
    Conv((2, 2), 16 => 8, relu),
    MaxPool((2, 2)),
    x -> reshape(x, :, size(x, 4)),
    Dense(288, 10),
    softmax
))

loss(x, y) = crossentropy(m(x), y)

# train
Flux.train!(loss, params(m), train, ADAM())

using Colors, FileIO, ImageShow
N = 7
img = tX[:, :, 1:1, N:N]
println("Predicted: ", Flux.onecold(m(gpu(img))) .- 1)
save("results/test.png", collect(tX[:, :, 1, N]))
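# A small hedged addition (not part of the original): with the test set already
# on the device, a Flux model-zoo style accuracy check can be sketched like this,
# using only the names defined or imported above (`m`, `tX`, `tY`, `onecold`, `mean`).
accuracy(x, y) = mean(onecold(m(x)) .== onecold(y))
@show accuracy(tX, tY)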
using GPUArrays, CuArrays

# Overloading the Julia Base map! function for GPUArrays
function Base.map!(f::Function, A::GPUArray, B::GPUArray)
    # our function that will run on the gpu
    function kernel(state, f, A, B)
        # If launch parameters aren't specified, linear_index gets the index
        # into the Array passed as second argument to gpu_call (`A`)
        i = linear_index(state)
        if i <= length(A)
            @inbounds A[i] = f(B[i])
        end
        return
    end
    # call kernel on the gpu
    gpu_call(kernel, A, (f, A, B))
end

using BenchmarkTools
function threaded_map!(f::Function, A::Array, B::Array)
    Threads.@threads for i in 1:length(A)
        A[i] = f(B[i])
    end
    A
end

x, y = rand(10^7), rand(10^7)
kernel(y) = (y / 33f0) * (732f0 / y)
# on the CPU without threads:
single_t = @belapsed map!($kernel, $x, $y)
# on the CPU with 4 threads (2 real cores):
thread_t = @belapsed threaded_map!($kernel, $x, $y)
# on the GPU:
xgpu, ygpu = cu(x), cu(y)
gpu_t = @belapsed begin
    map!($kernel, $xgpu, $ygpu)
    GPUArrays.synchronize($xgpu)
end
times = [single_t, thread_t, gpu_t]
speedup = maximum(times) ./ times
println("speedup: $speedup")
bar(["1 core", "2 cores", "gpu"], speedup,
    legend = false, fillcolor = :grey, ylabel = "speedup")

using CuArrays

threads = (2, 2)
blocks = (2, 2)
T = fill(CuArray, (0, 0), (4, 4))
B = fill(CuArray, (0, 0), (4, 4))
gpu_call(T, (B, T), (blocks, threads)) do state, A, B
    # those names pretty much refer to the CUDA names
    b = (blockidx_x(state), blockidx_y(state))
    bdim = (blockdim_x(state), blockdim_y(state))
    t = (threadidx_x(state), threadidx_y(state))
    idx = (bdim .* (b .- 1)) .+ t
    A[idx...] = b
    B[idx...] = t
    return
end
println("Threads index: \n", T)
println("Block index: \n", B)
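# A hedged sketch (my addition, reusing only the gpu_call/linear_index API shown
# above): the same do-block form, but with the default launch configuration,
# computing an elementwise a*X + Y. The names X, Y, out and a are illustrative only.
X = cu(rand(Float32, 1024))
Y = cu(rand(Float32, 1024))
out = similar(X)
a = 2f0
gpu_call(out, (out, X, Y, a)) do state, out, X, Y, a
    # without explicit (blocks, threads), linear_index walks `out` linearly
    i = linear_index(state)
    if i <= length(out)
        @inbounds out[i] = a * X[i] + Y[i]
    end
    return
end
@assert Array(out) ≈ a .* Array(X) .+ Array(Y)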