[CUDA]カスタムatomicMinで最小値とそのときのindexとを同時に取得する

値や構造体の配列を入力として、それぞれの入力値に対して何らかの計算式などで何かしらの値を求め、その値の最小値（または最大値）と、そのときの配列のindexとをセットで欲しいときがあります。
例えば、
std::vector values = { 3, 5, 2, 7, 1, 9 };
が入力値で、この入力値の最小値とそのときのindexが欲しいとき、
min_value = 1; index = 4;
です。

これをCUDA上でやりたいときがあります。もちろん、入力配列の最小値を求めるだけであればCPUでやったほうが普通は速いため、実際には、入力値や入力構造体値からCUDAカーネル上で何かしらの計算を行い、有効な最小値の候補が見つかったスレッドに対して最小値かどうかを判定する、という使い方になるかと思いますが、単純化のためここでは単に入力値が最小値候補だと仮定して話を進めます。

CUDAでは複数スレッドが同一メモリに同時に書き込むことを防ぐためのatomic関数が用意されており、最小値を求めるためのatomicMinも用意されているのですが、複数の値を同時にthread safeに書き換えることは出来ず、16bit, 32bit, あるいは64bitの単一の変数を書き換えることしかできません。
複数行に渡ってmutex的なことを行ったり、複数の変数を持つ構造体に対してthread safeに値を書き換えることは基本的には出来ません（…たぶん）。

ですが、最小値（または最大値）とそのときのindexがどちらも32bit以下の場合には、共用体（union）を使って無理やり64bit変数として解釈することで、2つの値をthread safeに書き換えられるというstack overflowの記事を見つけました。

cuda – How can I implement a custom atomic function involving several variables? – Stack Overflow

いままで共用体なんて使ったことがなく、というか使いどころを知らなかったのですが、まさかこんなところで使えるなんて驚きです。天才的な発想ではないかと思いました。
リンク先のコードをC++とCUDA thrustを使い、その他若干のrefactoringを行ったコードがこちらです。

/* Copyright Hirofumi Seo, M.D. */

#include <random>
#include <limits>
#include <iostream>
#include <chrono>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thrust/host_vector.h"
#include "thrust/device_vector.h"

constexpr int kNumTotal = 50000;
constexpr int kBlockSize = 256;
constexpr int kNumBlocks = (kNumTotal + kBlockSize - 1) / kBlockSize;

union ValueAndIndex{
public:
  unsigned long long int ulong;    // for atomic update
 
  __host__ __device__ ValueAndIndex() {}
  __host__ __device__ ValueAndIndex(const float value, const int index) {
    Set(value, index);
  }

  __host__ __device__ void Set(const float value, const int index) {
    SetValue(value);
    SetIndex(index);
  }

  __host__ __device__ float GetValue() const {
    return floats[0];
  }
  __host__ __device__ void SetValue(const float value) {
    floats[0] = value;
  }

  __host__ __device__ int GetIndex() const {
    return ints[1];
  }
  __host__ __device__ void SetIndex(const int index) {
    ints[1] = index;
  }

private:
  float floats[2];                 // floats[0] = lowest value
  int ints[2];                     // ints[1] = the index of the lowest value
};

/* ------------original version------------ */
// https://stackoverflow.com/questions/17411493/how-can-i-implement-a-custom-atomic-function-involving-several-variables/17414007#17414007
__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {
  ValueAndIndex old = *min_value_and_index;
  while (value_and_index.GetValue() < old.GetValue()) {
    old.ulong = atomicCAS(&(min_value_and_index->ulong), old.ulong, value_and_index.ulong);
  }
  return old;
}

__global__ void Kernel_GetMinValueAndIndexOriginal(const float* values, ValueAndIndex* min_value_and_index) {
  const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  for (int index = initial_index; index < kNumTotal; index += stride) {
    AtomicMinValueAndIndexOriginal(ValueAndIndex(values[index], index), min_value_and_index);
  }
}
/* ---------------------------------------- */

int main() {

  thrust::host_vector<float> h_values(kNumTotal);
  thrust::device_vector<float> d_values(kNumTotal);

  // create random floats between 0 and 1
  std::random_device rnd;
  std::mt19937 mt(rnd());
  std::uniform_int_distribution<int> rand0_to_10000(0, 10000);
  for (int i = 0; i < kNumTotal; ++i) {
    h_values[i] = (float)rand0_to_10000(mt);
  }
  d_values = h_values;

  ValueAndIndex h_min_value_and_index;
  ValueAndIndex* d_min_value_and_index;
  cudaMalloc((void**)&d_min_value_and_index, sizeof(ValueAndIndex));

  auto time_start = std::chrono::system_clock::now();
  auto time_end = std::chrono::system_clock::now();
  auto time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

  /* ------------original version------------ */
  time_start = std::chrono::system_clock::now();
  
  cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);
  Kernel_GetMinValueAndIndexOriginal<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);
  cudaDeviceSynchronize();
  cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);
  
  time_end = std::chrono::system_clock::now();
  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

  std::cout << "GPU Original: (min_value, index) = (" << h_min_value_and_index.GetValue() << ", " << h_min_value_and_index.GetIndex() << "), " << time_used << "msec." << std::endl;
  /* ---------------------------------------- */


  /* --------------cpu version--------------- */
  time_start = std::chrono::system_clock::now();

  ValueAndIndex min_value_and_index_by_cpu(std::numeric_limits<float>::max(), -1);
  for (int index = 0; index < kNumTotal; ++index) {
    if (h_values[index] < min_value_and_index_by_cpu.GetValue()) {
      min_value_and_index_by_cpu.Set(h_values[index], index);
    }
  }

  time_end = std::chrono::system_clock::now();
  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

  std::cout << "CPU: (min_value, index) = (" << min_value_and_index_by_cpu.GetValue() << ", " << min_value_and_index_by_cpu.GetIndex() << "), " << time_used << "msec." << std::endl;
  /* ---------------------------------------- */

  cudaFree(d_min_value_and_index);
  return 0;
}

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

/* Copyright Hirofumi Seo, M.D. */

#include <random>

#include <limits>

#include <iostream>

#include <chrono>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include "thrust/host_vector.h"

#include "thrust/device_vector.h"

constexpr int kNumTotal = 50000;

constexpr int kBlockSize = 256;

constexpr int kNumBlocks = (kNumTotal + kBlockSize - 1) / kBlockSize;

union ValueAndIndex{

public:

unsigned long long int ulong; // for atomic update

__host__ __device__ ValueAndIndex() {}

__host__ __device__ ValueAndIndex(const float value, const int index) {

Set(value, index);

}

__host__ __device__ void Set(const float value, const int index) {

SetValue(value);

SetIndex(index);

}

__host__ __device__ float GetValue() const {

return floats[0];

}

__host__ __device__ void SetValue(const float value) {

floats[0] = value;

}

__host__ __device__ int GetIndex() const {

return ints[1];

}

__host__ __device__ void SetIndex(const int index) {

ints[1] = index;

}

private:

float floats[2]; // floats[0] = lowest value

int ints[2]; // ints[1] = the index of the lowest value

};

/* ------------original version------------ */

// https://stackoverflow.com/questions/17411493/how-can-i-implement-a-custom-atomic-function-involving-several-variables/17414007#17414007

__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {

ValueAndIndex old = *min_value_and_index;

while (value_and_index.GetValue() < old.GetValue()) {

old.ulong = atomicCAS(&(min_value_and_index->ulong), old.ulong, value_and_index.ulong);

}

return old;

}

__global__ void Kernel_GetMinValueAndIndexOriginal(const float* values, ValueAndIndex* min_value_and_index) {

const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;

const int stride = blockDim.x * gridDim.x;

for (int index = initial_index; index < kNumTotal; index += stride) {

AtomicMinValueAndIndexOriginal(ValueAndIndex(values[index], index), min_value_and_index);

}

/* ---------------------------------------- */

int main() {

thrust::host_vector<float> h_values(kNumTotal);

thrust::device_vector<float> d_values(kNumTotal);

// create random floats between 0 and 1

std::random_device rnd;

std::mt19937 mt(rnd());

std::uniform_int_distribution<int> rand0_to_10000(0, 10000);

for (int i = 0; i < kNumTotal; ++i) {

h_values[i] = (float)rand0_to_10000(mt);

}

d_values = h_values;

ValueAndIndex h_min_value_and_index;

ValueAndIndex* d_min_value_and_index;

cudaMalloc((void**)&d_min_value_and_index, sizeof(ValueAndIndex));

auto time_start = std::chrono::system_clock::now();

auto time_end = std::chrono::system_clock::now();

auto time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

/* ------------original version------------ */

time_start = std::chrono::system_clock::now();

cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);

Kernel_GetMinValueAndIndexOriginal<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);

cudaDeviceSynchronize();

cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);

time_end = std::chrono::system_clock::now();

time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

std::cout << "GPU Original: (min_value, index) = (" << h_min_value_and_index.GetValue() << ", " << h_min_value_and_index.GetIndex() << "), " << time_used << "msec." << std::endl;

/* ---------------------------------------- */

/* --------------cpu version--------------- */

time_start = std::chrono::system_clock::now();

ValueAndIndex min_value_and_index_by_cpu(std::numeric_limits<float>::max(), -1);

for (int index = 0; index < kNumTotal; ++index) {

if (h_values[index] < min_value_and_index_by_cpu.GetValue()) {

min_value_and_index_by_cpu.Set(h_values[index], index);

}

time_end = std::chrono::system_clock::now();

time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

std::cout << "CPU: (min_value, index) = (" << min_value_and_index_by_cpu.GetValue() << ", " << min_value_and_index_by_cpu.GetIndex() << "), " << time_used << "msec." << std::endl;

/* ---------------------------------------- */

cudaFree(d_min_value_and_index);

return 0;

}

共用体ValueAndIndexの前半32bitに最小値を、後半32bitにindexを格納するようにし、atomicMinを用いるときはunsigned long long int値として解釈するというテクニックです。
かなり天才的な発想ではないかと個人的には思うのと、stack overflowでは模範解答的に扱われているのですが、atomic関数について勉強すると、実は上記の方法はもう少し改良の余地があることがわかります。

ちなみに上記コードを走らせると、
GPU Original: (min_value, index) = (0, 11101), 0.191msec. CPU: (min_value, index) = (0, 11101), 0.036msec.
というような結果になり、当たり前ですがこの程度であればCPU計算のほうが速いです。

さて、上記で最も大事なカーネル関数を注意深く見てみましょう。

__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {
  ValueAndIndex current = *min_value_and_index;
  while (value_and_index.GetValue() < current.GetValue()) {
    current.ulong = atomicCAS(&(min_value_and_index->ulong), current.ulong, value_and_index.ulong);
  }
  return current;
}

__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {

ValueAndIndex current = *min_value_and_index;

while (value_and_index.GetValue() < current.GetValue()) {

current.ulong = atomicCAS(&(min_value_and_index->ulong), current.ulong, value_and_index.ulong);

}

return current;

}

これ、最低でもwhile文が2回まわってしまいます。他のスレッドからのmin_value_and_indexへの操作がなく、1回目のwhile文内でatomicCASにおいて
min_value_and_index->ulong == current.ulong
だったとしても
current.ulongには古い値が代入されます（atnomicCASの仕様です）。つまり、min_value_and_indexにはvalue_and_indexが正しく代入されているのにcurrentには代入前の古い値が代入されるため、新しい値が代入されても
value_and_index.GetValue() < current.GetValue()
を満たしてしまい、2回目のwhile文が走ります。
2回目のwhile文内では
min_value_and_index->ulong != current.ulong
なので、atomicCASによって値が変わることは無く、一方でatomicCASによって
current.ulong == value_and_index.ulong
となるため、これでようやく
value_and_index.GetValue() == current.GetValue()
となりwhile文を抜けます。

atomic関数についてはNVIDIAが2013年のGPU Technology Conferenceでatomic関数に特化した講演（？）をしているらしく、貴重なスライドを見ることが出来ます。

Understanding and Using Atomic Memory Operations
Lars Nyland & Stephen Jones, NVIDIA GTC 2013

このスライドで解説されているLock-Free Data Updatesの考え方で、先ほどのカーネルを改良すると以下のようになります。

__device__ ValueAndIndex AtomicMinValueAndIndexModified(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {
  ValueAndIndex old = *min_value_and_index, assumed;
  do {
    if (value_and_index.GetValue() >= old.GetValue()) {
      break;
    }
    assumed = old;
    old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);
  } while (assumed.ulong != old.ulong);
  return old;
}

__device__ ValueAndIndex AtomicMinValueAndIndexModified(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {

ValueAndIndex old = *min_value_and_index, assumed;

do {

if (value_and_index.GetValue() >= old.GetValue()) {

break;

}

assumed = old;

old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);

} while (assumed.ulong != old.ulong);

return old;

}

Programming Guide :: CUDA Toolkit Documentation
8.12. Atomic Functions

に記載されている、double値でのatomicAdd演算のサンプルコードと同じ変数名を用いました。上記のようにすることで、中間変数assumedが1つ増えてしまいますが、atomicCASは最低1回で済みます。
ただし、オリジナルのものと比較して速度が改善されるかというとたいして変わりません。むしろ改良したもののほうが時々遅くなることもあります。このあたりは実行時のGPUの状態（？）次第でまちまちです。
試しに実行してみると以下のようになりました。

GPU Original: (min_value, index) = (0, 16912), 0.193msec. GPU Modified: (min_value, index) = (0, 34187), 0.183msec. CPU: (min_value, index) = (0, 15386), 0.038msec.

ん、何やら別の問題が発生しているようです…。indexの値が異なっています…！
実は最初に紹介したstack overflowのサンプルコードでは入力要素数が5000しかなかったために気付かれにくかったのですが、min_valueを持つ要素が複数あるとき、CPUでは必ず最小のindex値が選ばれますが、GPUではどのthreadが最速で終わるかはその時々で異なるため、どのindexが選ばれるかは全くわかりません。
上記の場合、50000個の入力値に対して入力値が最小値である0となっていたものが複数あり、indexを書き出してみると
The indices of the min_value are… 15386, 16912, 17974, 28312, 34187,
となっていました。つまり、この5つのindexのどれであっても最小値は0なので、どのindexも正しいです。
最小値が複数ある場合に、どの場所の要素を取ってきても構わないのであればこれで問題ありませんが、CPUでの処理とGPUでの処理とで結果が異なることがあるのは気持ち悪いです。
ので、上記改良したカーネルにさらに少しだけ手を加えて、CPUとGPUとで必ず結果が同じになるようにしましょう。こうすれば良いです。

__device__ ValueAndIndex AtomicMinValueAndIndexModifiedNew(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {
  ValueAndIndex old = *min_value_and_index, assumed;
  do {
    if (value_and_index.GetValue() > old.GetValue()) {
      break;
    }
    if ((value_and_index.GetValue() == old.GetValue()) && (value_and_index.GetIndex() > old.GetIndex())) {
      break;
    }
    assumed = old;
    old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);
  } while (assumed.ulong != old.ulong);
  return old;
}

__device__ ValueAndIndex AtomicMinValueAndIndexModifiedNew(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {

ValueAndIndex old = *min_value_and_index, assumed;

do {

if (value_and_index.GetValue() > old.GetValue()) {

break;

}

if ((value_and_index.GetValue() == old.GetValue()) && (value_and_index.GetIndex() > old.GetIndex())) {

break;

}

assumed = old;

old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);

} while (assumed.ulong != old.ulong);

return old;

}

これで、常に最小のindex値を得られるようになりました。7行目の不等号を反対向きにすれば最大のindex値を得ることも出来ます。
4行目の不等号を入れ替えればatomicMaxにもなります。
実行結果は例えば以下のようになります。

GPU Original: (min_value, index) = (0, 43739), 0.195msec. GPU Modified: (min_value, index) = (0, 33730), 0.268msec. GPU Modified New: (min_value, index) = (0, 7930), 0.239msec. CPU: (min_value, index) = (0, 7930), 0.036msec. The indices of the min_value are… 7930, 18153, 33730, 43739, 46528, 48255,

CPUでの結果と必ず一致すると安心出来ますね。
もちろん、オリジナルのカーネルでもwhileの条件式を書き足せば、CPUと同じ結果にすることが出来ます。

以上、atmoic関数に関する非常にマニアックなネタでした。

最後に、ソースコード全体を載せておきますね。

/* Copyright Hirofumi Seo, M.D. */

#include <random>
#include <limits>
#include <iostream>
#include <chrono>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thrust/host_vector.h"
#include "thrust/device_vector.h"

constexpr int kNumTotal = 50000;
constexpr int kBlockSize = 256;
constexpr int kNumBlocks = (kNumTotal + kBlockSize - 1) / kBlockSize;

union ValueAndIndex{
public:
  unsigned long long int ulong;    // for atomic update
 
  __host__ __device__ ValueAndIndex() {}
  __host__ __device__ ValueAndIndex(const float value, const int index) {
    Set(value, index);
  }

  __host__ __device__ void Set(const float value, const int index) {
    SetValue(value);
    SetIndex(index);
  }

  __host__ __device__ float GetValue() const {
    return floats[0];
  }
  __host__ __device__ void SetValue(const float value) {
    floats[0] = value;
  }

  __host__ __device__ int GetIndex() const {
    return ints[1];
  }
  __host__ __device__ void SetIndex(const int index) {
    ints[1] = index;
  }

private:
  float floats[2];                 // floats[0] = lowest value
  int ints[2];                     // ints[1] = the index of the lowest value
};

/* ------------original version------------ */
// https://stackoverflow.com/questions/17411493/how-can-i-implement-a-custom-atomic-function-involving-several-variables/17414007#17414007
__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {
  ValueAndIndex old = *min_value_and_index;
  while (value_and_index.GetValue() < old.GetValue()) {
    old.ulong = atomicCAS(&(min_value_and_index->ulong), old.ulong, value_and_index.ulong);
  }
  return old;
}

__global__ void Kernel_GetMinValueAndIndexOriginal(const float* values, ValueAndIndex* min_value_and_index) {
  const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  for (int index = initial_index; index < kNumTotal; index += stride) {
    AtomicMinValueAndIndexOriginal(ValueAndIndex(values[index], index), min_value_and_index);
  }
}
/* ---------------------------------------- */

/* -----Lock-Free Data Updates version----- */
__device__ ValueAndIndex AtomicMinValueAndIndexModified(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {
  ValueAndIndex old = *min_value_and_index, assumed;
  do {
    if (value_and_index.GetValue() >= old.GetValue()) {
      break;
    }
    assumed = old;
    old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);
  } while (assumed.ulong != old.ulong);
  return old;
}

__global__ void Kernel_GetMinValueAndIndexModified(const float* values, ValueAndIndex* min_value_and_index) {
  const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  for (int index = initial_index; index < kNumTotal; index += stride) {
    AtomicMinValueAndIndexModified(ValueAndIndex(values[index], index), min_value_and_index);
  }
}
/* ---------------------------------------- */

/* ---Lock-Free Data Updates new version--- */
__device__ ValueAndIndex AtomicMinValueAndIndexModifiedNew(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {
  ValueAndIndex old = *min_value_and_index, assumed;
  do {
    if (value_and_index.GetValue() > old.GetValue()) {
      break;
    }
    if ((value_and_index.GetValue() == old.GetValue()) && (value_and_index.GetIndex() > old.GetIndex())) {
      break;
    }
    assumed = old;
    old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);
  } while (assumed.ulong != old.ulong);
  return old;
}

__global__ void Kernel_GetMinValueAndIndexModifiedNew(const float* values, ValueAndIndex* min_value_and_index) {
  const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = blockDim.x * gridDim.x;
  for (int index = initial_index; index < kNumTotal; index += stride) {
    AtomicMinValueAndIndexModifiedNew(ValueAndIndex(values[index], index), min_value_and_index);
  }
}
/* ---------------------------------------- */

int main() {

  thrust::host_vector<float> h_values(kNumTotal);
  thrust::device_vector<float> d_values(kNumTotal);

  // create random floats between 0 and 1
  std::random_device rnd;
  std::mt19937 mt(rnd());
  std::uniform_int_distribution<int> rand0_to_10000(0, 10000);
  for (int i = 0; i < kNumTotal; ++i) {
    h_values[i] = (float)rand0_to_10000(mt);
  }
  d_values = h_values;

  ValueAndIndex h_min_value_and_index;
  ValueAndIndex* d_min_value_and_index;
  cudaMalloc((void**)&d_min_value_and_index, sizeof(ValueAndIndex));

  auto time_start = std::chrono::system_clock::now();
  auto time_end = std::chrono::system_clock::now();
  auto time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

  /* ------------original version------------ */
  time_start = std::chrono::system_clock::now();
  
  cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);
  Kernel_GetMinValueAndIndexOriginal<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);
  cudaDeviceSynchronize();
  cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);
  
  time_end = std::chrono::system_clock::now();
  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

  std::cout << "GPU Original: (min_value, index) = (" << h_min_value_and_index.GetValue() << ", " << h_min_value_and_index.GetIndex() << "), " << time_used << "msec." << std::endl;
  /* ---------------------------------------- */


  /* -----Lock-Free Data Updates version----- */
  time_start = std::chrono::system_clock::now();
  
  cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);
  Kernel_GetMinValueAndIndexModified<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);
  cudaDeviceSynchronize();
  cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);
  
  time_end = std::chrono::system_clock::now();
  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

  std::cout << "GPU Modified: (min_value, index) = (" << h_min_value_and_index.GetValue() << ", " << h_min_value_and_index.GetIndex() << "), " << time_used << "msec." << std::endl;
  /* ---------------------------------------- */


/* ---Lock-Free Data Updates new version--- */
  time_start = std::chrono::system_clock::now();

  cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);
  Kernel_GetMinValueAndIndexModifiedNew<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);
  cudaDeviceSynchronize();
  cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);

  time_end = std::chrono::system_clock::now();
  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

  std::cout << "GPU Modified New: (min_value, index) = (" << h_min_value_and_index.GetValue() << ", " << h_min_value_and_index.GetIndex() << "), " << time_used << "msec." << std::endl;
  /* ---------------------------------------- */


  /* --------------cpu version--------------- */
  time_start = std::chrono::system_clock::now();

  ValueAndIndex min_value_and_index_by_cpu(std::numeric_limits<float>::max(), -1);
  for (int index = 0; index < kNumTotal; ++index) {
    if (h_values[index] < min_value_and_index_by_cpu.GetValue()) {
      min_value_and_index_by_cpu.Set(h_values[index], index);
    }
  }

  time_end = std::chrono::system_clock::now();
  time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

  std::cout << "CPU: (min_value, index) = (" << min_value_and_index_by_cpu.GetValue() << ", " << min_value_and_index_by_cpu.GetIndex() << "), " << time_used << "msec." << std::endl;
  /* ---------------------------------------- */

  std::cout << "The indices of the min_value are... ";
  for (int index = 0; index < kNumTotal; ++index) {
    if (h_values[index] == min_value_and_index_by_cpu.GetValue()) {
      std::cout << index << ", ";
    }
  }
  std::cout << std::endl;

  cudaFree(d_min_value_and_index);
  return 0;
}

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

/* Copyright Hirofumi Seo, M.D. */

#include <random>

#include <limits>

#include <iostream>

#include <chrono>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include "thrust/host_vector.h"

#include "thrust/device_vector.h"

constexpr int kNumTotal = 50000;

constexpr int kBlockSize = 256;

constexpr int kNumBlocks = (kNumTotal + kBlockSize - 1) / kBlockSize;

union ValueAndIndex{

public:

unsigned long long int ulong; // for atomic update

__host__ __device__ ValueAndIndex() {}

__host__ __device__ ValueAndIndex(const float value, const int index) {

Set(value, index);

}

__host__ __device__ void Set(const float value, const int index) {

SetValue(value);

SetIndex(index);

}

__host__ __device__ float GetValue() const {

return floats[0];

}

__host__ __device__ void SetValue(const float value) {

floats[0] = value;

}

__host__ __device__ int GetIndex() const {

return ints[1];

}

__host__ __device__ void SetIndex(const int index) {

ints[1] = index;

}

private:

float floats[2]; // floats[0] = lowest value

int ints[2]; // ints[1] = the index of the lowest value

};

/* ------------original version------------ */

// https://stackoverflow.com/questions/17411493/how-can-i-implement-a-custom-atomic-function-involving-several-variables/17414007#17414007

__device__ ValueAndIndex AtomicMinValueAndIndexOriginal(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {

ValueAndIndex old = *min_value_and_index;

while (value_and_index.GetValue() < old.GetValue()) {

old.ulong = atomicCAS(&(min_value_and_index->ulong), old.ulong, value_and_index.ulong);

}

return old;

}

__global__ void Kernel_GetMinValueAndIndexOriginal(const float* values, ValueAndIndex* min_value_and_index) {

const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;

const int stride = blockDim.x * gridDim.x;

for (int index = initial_index; index < kNumTotal; index += stride) {

AtomicMinValueAndIndexOriginal(ValueAndIndex(values[index], index), min_value_and_index);

}

/* ---------------------------------------- */

/* -----Lock-Free Data Updates version----- */

__device__ ValueAndIndex AtomicMinValueAndIndexModified(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {

ValueAndIndex old = *min_value_and_index, assumed;

do {

if (value_and_index.GetValue() >= old.GetValue()) {

break;

}

assumed = old;

old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);

} while (assumed.ulong != old.ulong);

return old;

}

__global__ void Kernel_GetMinValueAndIndexModified(const float* values, ValueAndIndex* min_value_and_index) {

const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;

const int stride = blockDim.x * gridDim.x;

for (int index = initial_index; index < kNumTotal; index += stride) {

AtomicMinValueAndIndexModified(ValueAndIndex(values[index], index), min_value_and_index);

}

/* ---------------------------------------- */

/* ---Lock-Free Data Updates new version--- */

__device__ ValueAndIndex AtomicMinValueAndIndexModifiedNew(const ValueAndIndex& value_and_index, ValueAndIndex* min_value_and_index) {

ValueAndIndex old = *min_value_and_index, assumed;

do {

if (value_and_index.GetValue() > old.GetValue()) {

break;

}

if ((value_and_index.GetValue() == old.GetValue()) && (value_and_index.GetIndex() > old.GetIndex())) {

break;

}

assumed = old;

old.ulong = atomicCAS(&(min_value_and_index->ulong), assumed.ulong, value_and_index.ulong);

} while (assumed.ulong != old.ulong);

return old;

}

__global__ void Kernel_GetMinValueAndIndexModifiedNew(const float* values, ValueAndIndex* min_value_and_index) {

const int initial_index = blockIdx.x * blockDim.x + threadIdx.x;

const int stride = blockDim.x * gridDim.x;

for (int index = initial_index; index < kNumTotal; index += stride) {

AtomicMinValueAndIndexModifiedNew(ValueAndIndex(values[index], index), min_value_and_index);

}

/* ---------------------------------------- */

int main() {

thrust::host_vector<float> h_values(kNumTotal);

thrust::device_vector<float> d_values(kNumTotal);

// create random floats between 0 and 1

std::random_device rnd;

std::mt19937 mt(rnd());

std::uniform_int_distribution<int> rand0_to_10000(0, 10000);

for (int i = 0; i < kNumTotal; ++i) {

h_values[i] = (float)rand0_to_10000(mt);

}

d_values = h_values;

ValueAndIndex h_min_value_and_index;

ValueAndIndex* d_min_value_and_index;

cudaMalloc((void**)&d_min_value_and_index, sizeof(ValueAndIndex));

auto time_start = std::chrono::system_clock::now();

auto time_end = std::chrono::system_clock::now();

auto time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

/* ------------original version------------ */

time_start = std::chrono::system_clock::now();

cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);

Kernel_GetMinValueAndIndexOriginal<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);

cudaDeviceSynchronize();

cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);

time_end = std::chrono::system_clock::now();

time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

std::cout << "GPU Original: (min_value, index) = (" << h_min_value_and_index.GetValue() << ", " << h_min_value_and_index.GetIndex() << "), " << time_used << "msec." << std::endl;

/* ---------------------------------------- */

/* -----Lock-Free Data Updates version----- */

time_start = std::chrono::system_clock::now();

cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);

Kernel_GetMinValueAndIndexModified<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);

cudaDeviceSynchronize();

cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);

time_end = std::chrono::system_clock::now();

time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

std::cout << "GPU Modified: (min_value, index) = (" << h_min_value_and_index.GetValue() << ", " << h_min_value_and_index.GetIndex() << "), " << time_used << "msec." << std::endl;

/* ---------------------------------------- */

/* ---Lock-Free Data Updates new version--- */

time_start = std::chrono::system_clock::now();

cudaMemcpy(d_min_value_and_index, &ValueAndIndex(std::numeric_limits<float>::max(), -1), sizeof(ValueAndIndex), cudaMemcpyHostToDevice);

Kernel_GetMinValueAndIndexModifiedNew<<<kNumBlocks, kBlockSize>>>(thrust::raw_pointer_cast(d_values.data()), d_min_value_and_index);

cudaDeviceSynchronize();

cudaMemcpy(&h_min_value_and_index, d_min_value_and_index, sizeof(ValueAndIndex), cudaMemcpyDeviceToHost);

time_end = std::chrono::system_clock::now();

time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

std::cout << "GPU Modified New: (min_value, index) = (" << h_min_value_and_index.GetValue() << ", " << h_min_value_and_index.GetIndex() << "), " << time_used << "msec." << std::endl;

/* ---------------------------------------- */

/* --------------cpu version--------------- */

time_start = std::chrono::system_clock::now();

ValueAndIndex min_value_and_index_by_cpu(std::numeric_limits<float>::max(), -1);

for (int index = 0; index < kNumTotal; ++index) {

if (h_values[index] < min_value_and_index_by_cpu.GetValue()) {

min_value_and_index_by_cpu.Set(h_values[index], index);

}

time_end = std::chrono::system_clock::now();

time_used = std::chrono::duration_cast<std::chrono::microseconds>(time_end - time_start).count() / 1000.0f;

std::cout << "CPU: (min_value, index) = (" << min_value_and_index_by_cpu.GetValue() << ", " << min_value_and_index_by_cpu.GetIndex() << "), " << time_used << "msec." << std::endl;

/* ---------------------------------------- */

std::cout << "The indices of the min_value are... ";

for (int index = 0; index < kNumTotal; ++index) {

if (h_values[index] == min_value_and_index_by_cpu.GetValue()) {

std::cout << index << ", ";

}

std::cout << std::endl;

cudaFree(d_min_value_and_index);

return 0;

}

Post Views: 6,090

サイアメント技術メモ

[CUDA]カスタムatomicMinで最小値とそのときのindexとを同時に取得する

コメントを残すコメントをキャンセル

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル