diff --git a/homework_3/Makefile b/homework_3/Makefile
index dae2b51..75914d9 100644
--- a/homework_3/Makefile
+++ b/homework_3/Makefile
@@ -22,7 +22,7 @@ PROJECT := PDS_homework_3
 # Excecutable's name
-TARGET := bitonic
+TARGET := bitonicCUDA
 
 # Source directories list(space seperated). Makefile-relative path, UNDER current directory.
 SRC_DIR_LIST := src #test test/gtest
 
@@ -52,7 +52,7 @@ REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
 # Pre-defines
 # PRE_DEFS := MYCAB=1729 SUPER_MODE
-PRE_DEFS :=
+PRE_DEFS := TARGET=$(TARGET)
 
 # ============== Linker settings ==============
 # Linker flags (example: -pthread -lm)
@@ -83,17 +83,14 @@ DOCKER :=
 CSIZE := size
 
 CFLAGS := $(DEB_CFLAGS)
 CXXFLAGS := $(DEB_CXXFLAGS)
-CXX := g++ #mpic++
-CC := gcc #mpicc
+CXX := g++
+CC := gcc
 LINKER := g++
 
 #
 # =========== Main body and Patterns ===========
 #
-#ifeq ($(OS), Windows_NT)
-# TARGET := $(TARGET).exe
-#endif
 
 INC := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
 DEF := $(foreach def,$(PRE_DEFS),-D$(def))
 EXC := $(foreach fil,$(EXC_FILE_LIST), \
@@ -111,38 +108,16 @@ OBJ := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
 DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
 
-# Make Dependencies pattern.
-# This little trick enables recompilation only when dependencies change
-# and it does so for changes both in source AND header files ;)
-#
-# It is based on Tom Tromey's method.
-#
-# Invoke cpp to create makefile rules with dependencies for each source file
-#$(DEP_DIR)/%.d: %.c
-#	@mkdir -p $(@D)
-#	@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<
-
 # c file objects depent on .c AND dependency files, which have an empty recipe
 $(OBJ_DIR)/%.o: %.c
 	@mkdir -p $(@D)
 	$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
 
-#$(DEP_DIR)/%.d: %.cpp
-#	@mkdir -p $(@D)
-#	@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<
-
 # cpp file objects depend on .cpp AND dependency files, which have an empty recipe
 $(OBJ_DIR)/%.o: %.cpp
 	@mkdir -p $(@D)
 	$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
 
-# empty recipe for dependency files. This prevents make errors
-#$(DEP):
-
-# now include all dependencies
-# After all they are makefile dependency rules ;)
-#include $(wildcard $(DEP))
-
 # main target rule
 $(BUILD_DIR)/$(TARGET): $(OBJ)
 	@mkdir -p $(@D)
@@ -179,6 +154,39 @@ release: $(BUILD_DIR)/$(TARGET)
 # ================ Build rules =================
 #
+bitonic_v0deb: CC := nvcc -G -g -x cu
+bitonic_v0deb: CXX := nvcc -G -g -x cu
+bitonic_v0deb: LINKER := nvcc
+bitonic_v0deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V0
+bitonic_v0deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V0
+bitonic_v0deb: OUTPUT_DIR := $(OUTPUT_DIR)/v0
+bitonic_v0deb: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
+bitonic_v1deb: CC := nvcc -G -g -x cu
+bitonic_v1deb: CXX := nvcc -G -g -x cu
+bitonic_v1deb: LINKER := nvcc
+bitonic_v1deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V1
+bitonic_v1deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V1
+bitonic_v1deb: OUTPUT_DIR := $(OUTPUT_DIR)/v1
+bitonic_v1deb: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
+bitonic_v2deb: CC := nvcc -G -g -x cu
+bitonic_v2deb: CXX := nvcc -G -g -x cu
+bitonic_v2deb: LINKER := nvcc
+bitonic_v2deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
+bitonic_v2deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
+bitonic_v2deb: OUTPUT_DIR := $(OUTPUT_DIR)/v2
+bitonic_v2deb: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
 bitonic_v0: CC := nvcc -x cu
 bitonic_v0: CXX := nvcc -x cu
@@ -201,7 +209,6 @@ bitonic_v1: $(BUILD_DIR)/$(TARGET)
 	@mkdir -p $(OUTPUT_DIR)
 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
-
 bitonic_v2: CC := nvcc -x cu
 bitonic_v2: CXX := nvcc -x cu
 bitonic_v2: LINKER := nvcc
diff --git a/homework_3/src/bitonicsort.hpp b/homework_3/src/bitonicsort.hpp
index e20031e..fbcf924 100644
--- a/homework_3/src/bitonicsort.hpp
+++ b/homework_3/src/bitonicsort.hpp
@@ -21,7 +21,7 @@
 /*
  * Exported timers
  */
-extern Timing Timer_total;
+extern Timing Timer_total, Timer_memory, Timer_sorting;
 
 using threadId_t = size_t;
 
@@ -74,9 +74,41 @@ __device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stag
  * ============================== Sort algorithms ==============================
  */
 
+/*!
+ * Each thread can handle 2 points in the array. For each of these 2 points it may
+ *  - compare and exchange if needed
+ *  - copy data to local and back if needed
+ */
+static constexpr size_t SizeToThreadsRatio = 2;
+
+/*!
+ * Calculates the blocks needed for the entire sorting process
+ *
+ * @note
+ * This "redundant" little trick makes sure blocks are allocated for arraySizes that are not exact
+ * multiples of config.blockSize.
+ * Even if we don't need it, we keep it in case we experiment with weird sizes in the future!
+ *
+ * @param arraySize [ArraySize_t] The size of the entire array (in points)
+ * @return [size_t] The number of blocks
+ */
+inline size_t NBlocks(ArraySize_t arraySize) {
+    return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio);
+}
+
+
+/*!
+ * Exchange utility
+ *
+ * @tparam ValueT The underlying data type of the array items
+ *
+ * @param data [ValueT*] Pointer to data array
+ * @param tid [threadId_t] Current thread's index to data
+ * @param partner [threadId_t] Partner's index to data
+ * @param keepSmall [bool] Flag to indicate whether the current thread keeps the smaller value
+ */
 template <typename ValueT>
-__device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
+__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
     if (( keepSmall && (data[tid] > data[partner])) ||
         (!keepSmall && (data[tid] < data[partner])) ) {
         ValueT temp = data[tid];
@@ -86,13 +118,24 @@ __device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
 }
 
 #if CODE_VERSION == V0
+
+/*!
+ * This is the body of each thread. This function compares and exchanges data
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Compute global thread ID
+    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Keep contiguous addressing to the first half of the array
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += n / SizeToThreadsRatio;
+        pid += n / SizeToThreadsRatio;
     }
     if ((tid < n) && (pid < n)) {      // Boundary check
         bool keep = keepSmall(tid, pid, stage);
@@ -102,18 +145,11 @@ __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type that holds the data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
 
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
+    Timer_memory.stop();
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
 
     size_t Stages = static_cast<size_t>(log2(size));
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         for (size_t step = stage; step > 0; ) {
             --step;
-            bitonicStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+            bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
     cudaFree(dev_data);
+    Timer_memory.stop();
 }
 
 #elif CODE_VERSION == V1
 
+/*!
+ * This is the body of each thread. This function compares and exchanges data
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Compute global thread ID
+    /*
+     * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
+     * and we use the neighbor block address indices for the other half of the threads
+     */
+    threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += blockDim.x;
+        pid += blockDim.x;
     }
     if ((tid < n) && (pid < n)) {      // Boundary check
         bool keep = keepSmall(tid, pid, stage);
@@ -156,12 +215,29 @@ __device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stag
     }
 }
 
+/*!
+ * This is the version of the body that is called outside of the unrolled loop
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
     interBlockStep_(data, n, step, stage);
 }
-
+/*!
+ * This is the unrolled part of the bitonic double loop.
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param innerSteps [size_t] The number of inner steps of the current stage to unroll
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
     for (size_t step = innerSteps + 1; step > 0; ) {
@@ -172,18 +248,11 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
 }
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type that holds the data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
 
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
+    Timer_memory.stop();
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
 
-    auto Stages = static_cast<size_t>(log2(size));
-    auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+    auto Stages = static_cast<size_t>(log2(size));
+    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         size_t step = stage - 1;
         for ( ; step > InnerBlockSteps; --step) {
-            interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
-        inBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+        inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
         cudaDeviceSynchronize();
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
     cudaFree(dev_data);
+    Timer_memory.stop();
 }
+
 
 #elif CODE_VERSION == V2
 
+/*!
+ * @return The memory (in data points) that each block's local threads can affect.
+ *
+ * @note
+ * Each block's threads can exchange twice as many data points as the block has threads.
+ */
+inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
+
+
+
+/*!
+ * Converts the global address of the data to the local shared memory array which is used
+ * as cache memory for the unrolled part of the bitonic sort loop.
+ *
+ * @note
+ * Each block's threads can exchange twice as many data points as the block has threads.
+ * These points get copied (cached) in the shared memory location. We use contiguous blocks
+ * both in global data memory and the shared memory buffer.
+ *
+ * @param gIndex The global array index
+ * @param blockDim The block size (threads per block)
+ * @return The equivalent local address in the shared memory
+ */
+__device__ inline size_t toLocal(size_t gIndex, size_t blockDim) {
+    return gIndex % (SizeToThreadsRatio * blockDim);
+}
+
+
+/*!
+ * This is the version of the body that is called outside of the unrolled loop
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Compute global thread ID
+    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Keep contiguous addressing to the first half of the array
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += n / SizeToThreadsRatio;
+        pid += n / SizeToThreadsRatio;
     }
     if ((tid < n) && (pid < n)) {      // Boundary check
         bool keep = keepSmall(tid, pid, stage);
@@ -230,57 +346,72 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage
     }
 }
 
-
+/*!
+ * This is the unrolled part of the bitonic double loop.
+ *
+ * First each thread caches its corresponding data point from the current and the following data block.
+ * After that we execute the loop unrolling on the local data and then we write back to global memory.
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param innerSteps [size_t] The number of inner steps of the current stage to unroll
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
-__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage, int *mutex) {
+__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
     extern __shared__ ValueT shared_data[];
 
+    /*
+     * Global and local (shared) memory indices (calculated once)
+     * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
+     * and we cache the neighbor block address indices in local (shared) memory
+     */
+    threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
+    threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
+
+    if (gIdx0 + blockDim.x >= n)    // Boundary check
+        return;
+
+    // Fetch to local memory the entire effective block size (2 positions for each thread)
+    shared_data[lIdx0] = data[gIdx0];
+    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
+    __syncthreads();
+
     for (size_t step = innerSteps + 1; step > 0; ) {
         --step;
-        // Global memory thread and partner ids
-        threadId_t Tid = threadIdx.x + blockIdx.x * blockDim.x;
-        threadId_t Pid = partner(Tid, step);
-        if (Tid > Pid) {
-            Tid += n >> 1;
-            Pid += n >> 1;
+        // Init thread global and local indices
+        threadId_t gIdx = gIdx0;
+        threadId_t lIdx = lIdx0;
+        // Find partner and keep-small configuration based on the global data positions
+        threadId_t pIdx = partner(gIdx, step);
+        if (gIdx > pIdx) {
+            // Shift inside effective block
+            gIdx += blockDim.x;     // global
+            pIdx += blockDim.x;
+            lIdx += blockDim.x;     // local
         }
+        bool keep = keepSmall(gIdx, pIdx, stage);
 
-        if ((Tid < n) && (Pid < n)) {      // Boundary check
-            // Global to local index resolution
-            threadId_t tid = (Tid> 1))*nthreads)%(2*nthreads));
-            threadId_t pid = tid + 1;
-            // Fetch to local memory
-            shared_data[tid] = data[Tid];
-            shared_data[pid] = data[Pid];
-            __syncthreads();
-
-            bool keep = keepSmall(Tid, Pid, stage);
-            exchange(shared_data, tid, pid, keep);
-            __syncthreads();
-
-            // Write back to global memory
-            data[Tid] = shared_data[tid];
-            data[Pid] = shared_data[pid];
-            __syncthreads();
-        }
+        // Exchange data on the local (shared) copy
+        threadId_t lpIdx = toLocal(pIdx, blockDim.x);
+        exchange(shared_data, lIdx, lpIdx, keep);
+        __syncthreads();
     }
+    // Write back to global memory
+    data[gIdx0] = shared_data[lIdx0];
+    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
+    __syncthreads();
 }
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam dDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type that holds the data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
 
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
+    Timer_memory.stop();
 
-    int* d_mutex;
-    cudaMalloc(&d_mutex, sizeof(int));
-    cudaMemset(d_mutex, 0, sizeof(int)); // init mutex
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
+    size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
-
-    auto Stages = static_cast<size_t>(log2(size));
-    auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+    auto Stages = static_cast<size_t>(log2(size));
+    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         size_t step = stage - 1;
         for ( ; step > InnerBlockSteps; --step) {
-            interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
+            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
-        inBlockStep<<>>(dev_data, size, Nthreads, step, stage, d_mutex);
+        inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage);
         cudaDeviceSynchronize();
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
     cudaFree(dev_data);
+    Timer_memory.stop();
 }
 
 #endif
diff --git a/homework_3/src/config.h b/homework_3/src/config.h
index dec8b68..5423f1a 100644
--- a/homework_3/src/config.h
+++ b/homework_3/src/config.h
@@ -1,6 +1,6 @@
 /*!
  * \file
- * \brief Build configuration file.
+ * \brief Build and runtime configuration file.
  *
  * \author
  *    Christos Choutouridis AEM:8997
@@ -11,19 +11,20 @@
 #define CONFIG_H_
 
 #include
+#include
 
 /*
  * Versioning:
- *  - RC1:
+ *  - RC1: First version to test on HPC
 */
-static constexpr char version[] = "0.0";
+static constexpr char version[] = "0.1";
 
 /*
  * Defines for different version of the exercise
 */
-#define V0 (0)
-#define V1 (1)
-#define V2 (2)
+#define V0 0
+#define V1 1
+#define V2 2
 
 // Fail-safe version selection
 #if !defined CODE_VERSION
@@ -33,8 +34,9 @@ static constexpr char version[] = "0.0";
 // Default Data size (in case -q is not present)
 static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
 
+// Placeholder default (actual default comes from device properties read at initialization)
 static constexpr size_t THREADS_PER_BLOCK = 1024;
-static constexpr size_t IN_BLOCK_THRESHOLD = 512;
+
 
 /*!
  * Value and Buffer type selection
 *
@@ -51,6 +53,11 @@
 using Value_t = uint32_t;
 using Data_t = std::vector<Value_t>;
 
+/*!
+ * In theory we can support large arrays ;)
+ */
+using ArraySize_t = uint64_t;
+
 /*!
  * Session option for each invocation of the executable.
 *
@@ -58,20 +65,18 @@ using Data_t = std::vector<Value_t>;
 *
 * @note
 * The values of the members are set from the command line.
 */
struct config_t {
-    size_t arraySize{DEFAULT_DATA_SIZE};    //!< The array size of the local data to sort.
-    bool exchangeOpt{false};                //!< Flag to request the exchange optimization
-    size_t pipeline{1UL};                   //!< Pipeline stages (1 to disable)
-    bool validation{false};                 //!< Request a full validation at the end, performed by process rank 0.
-    bool ndebug{false};                     //!< Skips debug trap on DEBUG builds.
-    size_t perf{1};                         //!< Enable performance timing measurements and prints and repeat
-                                            //!< the sorting times.
-    bool verbose{false};                    //!< Flag to enable verbose output to stdout.
+    ArraySize_t arraySize{DEFAULT_DATA_SIZE};   //!< The size of the array to sort.
+    size_t blockSize{THREADS_PER_BLOCK};        //!< The block size (threads per block) for the session.
+    bool validation{false};                     //!< Request a full validation at the end.
+    size_t perf{1};                             //!< Enable performance timing measurements and prints. Repeats
+                                                //!< the sorting that many times.
+    bool verbose{false};                        //!< Flag to enable verbose output to stdout.
 };
 
/*
 * Exported data types
 */
-extern config_t config;
-
+extern config_t config;
+extern cudaDeviceProp device;
 
#endif /* CONFIG_H_ */
diff --git a/homework_3/src/main.cpp b/homework_3/src/main.cpp
index 961d275..c93490f 100644
--- a/homework_3/src/main.cpp
+++ b/homework_3/src/main.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 
 #include "utils.hpp"
 #include "config.h"
@@ -18,22 +19,25 @@
 
 // Global session data
-Data_t Data = {3, 5, 1, 2, 4, 7, 8, 6};
+Data_t Data;
 config_t config;
 Log logger;
+cudaDeviceProp device;
 
 // Mersenne seeded from hw if possible. range: [type_min, type_max]
 std::random_device rd;
 std::mt19937 gen(rd());
 
 //! Performance timers for each one of the "costly" functions
-Timing Timer_total;
+Timing Timer_total, Timer_memory, Timer_sorting;
 
 //! Init timing objects for extra rounds
 void measurements_init() {
     if (config.perf > 1) {
         Timer_total.init(config.perf);
+        Timer_memory.init(config.perf);
+        Timer_sorting.init(config.perf);
     }
 }
 
@@ -41,6 +45,8 @@ void measurements_init() {
 void measurements_next() {
     if (config.perf > 1) {
         Timer_total.next();
+        Timer_memory.next();
+        Timer_sorting.next();
     }
 }
 
@@ -57,7 +63,15 @@ bool get_options(int argc, char* argv[]){
 
         if (arg == "-q" || arg == "--array-size") {
             if (i+1 < argc) {
-                config.arraySize = 1 << atoi(argv[++i]);
+                config.arraySize = (ArraySize_t)1 << atoi(argv[++i]);
             }
             else {
                 status = false;
             }
         }
+        else if (arg == "-b" || arg == "--block-size") {
+            if (i+1 < argc) {
+                config.blockSize = atoi(argv[++i]);
+            }
+            else {
+                status = false;
+            }
+        }
@@ -74,32 +88,34 @@ bool get_options(int argc, char* argv[]){
                 status = false;
             }
         }
-        else if (arg == "--ndebug") {
-            config.ndebug = true;
-        }
         else if (arg == "-v" || arg == "--verbose") {
             config.verbose = true;
         }
         else if (arg == "--version") {
-            std::cout << "bitonic - A GPU accelerated sort utility\n";
+            std::cout << STR(TARGET) << " - A GPU-accelerated bitonic sort utility (V" << STR(CODE_VERSION) << ")\n";
             std::cout << "version: " << version << "\n\n";
             exit(0);
         }
         else if (arg == "-h" || arg == "--help") {
-            std::cout << "distbitonic - A distributed sort utility\n\n";
-            std::cout << " distbitonic -q <N> [--validation] [--perf <N>] [--ndebug] [-v]\n";
-            std::cout << " distbitonic -h\n";
+            std::cout << STR(TARGET) << " - A GPU-accelerated bitonic sort utility (V" << STR(CODE_VERSION) << ")\n\n";
+            std::cout << " " << STR(TARGET) << " -q <N> -b <N> [--validation] [--perf <N>] [-v]\n";
+            std::cout << " " << STR(TARGET) << " -h\n";
+            std::cout << " " << STR(TARGET) << " --version\n";
             std::cout << '\n';
             std::cout << "Options:\n\n";
             std::cout << "  -q | --array-size <N>\n";
-            std::cout << "      Selects the array size according to size = 2^N\n\n";
+            std::cout << "      Selects the array size according to size = 2^N\n";
+            std::cout << "      [Size must be larger than 2 * blockSize]\n";
+            std::cout << "      [Default is 2^16]\n\n";
+            std::cout << "  -b | --block-size <N>\n";
+            std::cout << "      Selects the number of CUDA threads per block\n";
+            std::cout << "      [Size has to be a multiple of the device's warp size (usually 32)]\n";
+            std::cout << "      [Default is the maximum number supported by the device, e.g. block-size=1024 for a GTX 1650]\n\n";
             std::cout << "  --validation\n";
-            std::cout << "      Request a full validation at the end, performed by process rank 0\n\n";
+            std::cout << "      Request a full validation at the end\n\n";
             std::cout << "  --perf <N>\n";
             std::cout << "      Enable performance timing measurements and prints, and repeat\n";
             std::cout << "      the sorting times.\n\n";
-            std::cout << "  --ndebug\n";
-            std::cout << "      Skip debug breakpoint when on debug build.\n\n";
             std::cout << "  -v | --verbose\n";
             std::cout << "      Request a more verbose output to stdout.\n\n";
             std::cout << "  -h | --help\n";
@@ -107,8 +123,12 @@ bool get_options(int argc, char* argv[]){
             std::cout << "  --version\n";
             std::cout << "      Prints version and exit.\n\n";
             std::cout << "Examples:\n\n";
-            std::cout << "  bitonic -q 24\n";
-            std::cout << "      Runs bitonic with GPU acceleration with 2^24 array points\n\n";
+            std::cout << "  " << STR(TARGET) << " -q 24\n";
+            std::cout << "      Runs bitonic sort on a 2^24-point array, using GPU acceleration\n\n";
+            std::cout << "  " << STR(TARGET) << " --validation --perf 5 -b 512 -q 26\n";
+            std::cout << "      Runs bitonic sort on a 2^26-point array 5 times, using GPU acceleration with\n";
+            std::cout << "      512 threads per block, performs a validation check at the end and prints the\n";
+            std::cout << "      median time.\n\n";
 
             exit(0);
         }
@@ -118,6 +138,17 @@ bool get_options(int argc, char* argv[]){
         }
     }
 
+    // Check configuration requirements
+    if (config.blockSize % device.warpSize)
+        throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n");
+    if (config.arraySize < 2*config.blockSize)
+        throw std::runtime_error("[Config] - Unsupported array size (smaller than "
+                + std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
+    if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
+        throw std::runtime_error("[CUDA] - Unsupported array size: "
+                + std::to_string(config.arraySize * sizeof(Value_t))
+                + " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");
+
     return status;
 }
 
@@ -141,6 +172,13 @@ bool validator(DataT& data) {
 * @param argv [char***] POINTER to main's argv argument
 */
void init(int* argc, char*** argv) {
+
+    // Get device configuration
+    if (cudaGetDeviceProperties(&device, 0) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot read GPU");
+
+    config.blockSize = static_cast<size_t>(device.maxThreadsPerBlock);
+
    // try to read command line
    if (!get_options(*argc, *argv))
        exit(1);
@@ -159,16 +197,23 @@ int main(int argc, char* argv[]) try {
    // Init everything
    init(&argc, &argv);
 
+    logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize)) << ")" << logger.endl;
+    logger << "Repeated sorts: " << config.perf << logger.endl;
+    logger << "GPU: " << device.name << logger.endl;
+    logger << "Block size: " << config.blockSize << logger.endl;
+
    for (size_t it = 0 ; it < config.perf ; ++it) {
        // Initialize local data
-        logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
+        logger << "Initialize array ... ";
        std::uniform_int_distribution<Value_t> dis(
            std::numeric_limits<Value_t>::min(),
            std::numeric_limits<Value_t>::max()
        );
        std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
+        logger << " Done." << logger.endl;
+
        // Run distributed sort
-        logger << "Starting distributed sorting ... ";
+        logger << "Start sorting ... ";
        Timer_total.start();
        bitonicSort(Data);
        Timer_total.stop();
@@ -178,20 +223,15 @@ int main(int argc, char* argv[]) try {
 
    // Print-outs and validation
    if (config.perf > 1) {
-        Timing::print_duration(Timer_total.median(), "Total");
+        Timing::print_duration(Timer_total.median(), "Total ");
+        Timing::print_duration(Timer_memory.median(), "Mem-xch ");
+        Timing::print_duration(Timer_sorting.median(), "Sorting ");
    }
    if (config.validation) {
        // If requested, we have the chance to fail!
        std::cout << "[Validation] Results validation ...";
        bool val = validator(Data);
-        std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
-        if (Data.size() < 128) {
-            std::cout << "Data: ";
-            for (auto& v : Data) {
-                std::cout << (int)v << ", ";
-            }
-            std::cout << '\n';
-        }
+        std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[31m [FAILED] \x1B[0m\n");
    }
    return 0;
}
diff --git a/homework_3/src/utils.hpp b/homework_3/src/utils.hpp
index bf06ef0..520cd5c 100644
--- a/homework_3/src/utils.hpp
+++ b/homework_3/src/utils.hpp
@@ -17,6 +17,11 @@
 #include "config.h"
 
+/*!
+ * Stringify preprocessor util
+ */
+#define STR(s) STR_(s)
+#define STR_(s) #s
 
 /*!
  * A Logger for entire program.