Compare commits

8 commits: e7be3a10dd ... 7a6f7f53b5
| Author | SHA1 | Date |
|---|---|---|
|  | 7a6f7f53b5 |  |
|  | e165b75f92 |  |
|  | 6db2a814d2 |  |
|  | b31ca23757 |  |
|  | 2a2c7fec38 |  |
|  | 1fe5ab4da7 |  |
|  | 146e975ac1 |  |
|  | 2ff6ae171a |  |
							
								
								
									
homework_3/.gitignore (new file, vendored, 23 lines)

@@ -0,0 +1,23 @@
| # project | ||||
| bin/ | ||||
| out/ | ||||
| mat/ | ||||
| mtx/ | ||||
| .unused/ | ||||
| various/ | ||||
| 
 | ||||
| # hpc | ||||
| 
 | ||||
| # IDEs | ||||
| .idea/ | ||||
| .clangd | ||||
| 
 | ||||
| # eclipse | ||||
| .project | ||||
| .cproject | ||||
| .settings/ | ||||
| 
 | ||||
| .vs/ | ||||
| .vscode/ | ||||
| 
 | ||||
| 
 | ||||
							
								
								
									
homework_3/Makefile (new file, 236 lines)

@@ -0,0 +1,236 @@
| #
 | ||||
| # PDS HW3 Makefile
 | ||||
| #
 | ||||
| # Copyright (C) 2025 Christos Choutouridis <christos@choutouridis.net>
 | ||||
| #
 | ||||
| # This program is free software: you can redistribute it and/or modify
 | ||||
| # it under the terms of the GNU Lesser General Public License as
 | ||||
| # published by the Free Software Foundation, either version 3
 | ||||
| # of the License, or (at your option) any later version.
 | ||||
| #
 | ||||
| # This program is distributed in the hope that it will be useful,
 | ||||
| # but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | ||||
| # GNU Lesser General Public License for more details.
 | ||||
| #
 | ||||
| # You should have received a copy of the GNU Lesser General Public License
 | ||||
| # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 | ||||
| #
 | ||||
| 
 | ||||
| # ============== Project settings ==============
 | ||||
| # Project's name
 | ||||
| PROJECT         := PDS_homework_3 | ||||
| 
 | ||||
| # Executable's name
 | ||||
| TARGET          := bitonicCUDA | ||||
| 
 | ||||
| # Source directories list (space separated). Makefile-relative paths, under the current directory.
 | ||||
| SRC_DIR_LIST    := src #test test/gtest | ||||
| 
 | ||||
| # Include directories list (space separated). Makefile-relative paths.
 | ||||
| INC_DIR_LIST    := src | ||||
| #                   test \
 | ||||
| #                   test/gtest/ \
 | ||||
| 
 | ||||
| 
 | ||||
| # Exclude files list (space separated). Filenames only.
 | ||||
| # EXC_FILE_LIST := bad.cpp old.cpp
 | ||||
| 
 | ||||
| # Build directories
 | ||||
| BUILD_DIR       := bin | ||||
| OBJ_DIR         := $(BUILD_DIR)/obj | ||||
| DEP_DIR         := $(BUILD_DIR)/.dep | ||||
| 
 | ||||
| OUTPUT_DIR      := out | ||||
| 
 | ||||
| # ========== Compiler settings ==========
 | ||||
| # Compiler flags for debug and release
 | ||||
| DEB_CFLAGS      := -DDEBUG -std=c11 -Xcompiler "-Wall -Wextra -g -DDEBUG" | ||||
| REL_CFLAGS      := -O3 -std=c11 -Xcompiler "-Wall -Wextra" | ||||
| DEB_CXXFLAGS    := -DDEBUG -std=c++17 -Xcompiler "-Wall -Wextra -g -DDEBUG"  | ||||
| REL_CXXFLAGS    := -O3 -std=c++17 -Xcompiler "-Wall -Wextra" | ||||
| 
 | ||||
| # Pre-defines
 | ||||
| # PRE_DEFS := MYCAB=1729 SUPER_MODE
 | ||||
| PRE_DEFS        := TARGET=$(TARGET) | ||||
| 
 | ||||
| # ============== Linker settings ==============
 | ||||
| # Linker flags (example: -pthread -lm)
 | ||||
| LDFLAGS         := | ||||
| 
 | ||||
| # Map output file
 | ||||
| MAP_FILE        := # output.map | ||||
| MAP_FLAG        := # -Xlinker -Map=$(BUILD_DIR)/$(MAP_FILE) | ||||
| 
 | ||||
| # ============== Docker settings ==============
 | ||||
| # We need:
 | ||||
| #  - Bind the entire project directory (the dir that includes all the code) as a volume.
 | ||||
| #  - In the docker instance, change to the working directory (where the makefile is).
 | ||||
| DOCKER_VOL_DIR  := $(shell pwd) | ||||
| DOCKER_WRK_DIR  := | ||||
| DOCKER_RUN      := docker run --rm | ||||
| DOCKER_FLAGS    := -v $(DOCKER_VOL_DIR):/usr/src/$(PROJECT) -w /usr/src/$(PROJECT)/$(DOCKER_WRK_DIR) | ||||
| 
 | ||||
| # docker invoke mechanism (edit with care)
 | ||||
| #   note:
 | ||||
| #   Here, `DOCKER` variable is empty. Rules can assign `DOCKER := DOCKER_CMD` when docker
 | ||||
| #   functionality is needed.
 | ||||
| DOCKER_CMD      = $(DOCKER_RUN) $(DOCKER_FLAGS) $(IMAGE) | ||||
| DOCKER          := | ||||
| 
 | ||||
| # ============== Tool selection ==============
 | ||||
| # compiler and compiler flags.
 | ||||
| CSIZE           := size | ||||
| CFLAGS          := $(DEB_CFLAGS) | ||||
| CXXFLAGS        := $(DEB_CXXFLAGS) | ||||
| CXX             := g++ | ||||
| CC              := gcc | ||||
| LINKER          := g++ | ||||
| 
 | ||||
| #
 | ||||
| # =========== Main body and Patterns ===========
 | ||||
| #
 | ||||
| 
 | ||||
| INC     := $(foreach dir,$(INC_DIR_LIST),-I$(dir)) | ||||
| DEF     := $(foreach def,$(PRE_DEFS),-D$(def)) | ||||
| EXC     := $(foreach fil,$(EXC_FILE_LIST),                              \
 | ||||
|                $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/$(fil))) \
 | ||||
|            ) | ||||
| # Source files, object and dependency lists
 | ||||
| # recursive search into current and source directories
 | ||||
| SRC     := $(wildcard *.cpp) | ||||
| SRC     += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/*.cpp)) | ||||
| SRC     += $(foreach dir,$(SRC_DIR_LIST),$(wildcard $(dir)/**/*.cpp)) | ||||
| SRC     := $(filter-out $(EXC),$(SRC)) | ||||
| #SRC     := $(abspath $(SRC))
 | ||||
| 
 | ||||
| OBJ     := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file)) | ||||
| DEP     := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file)) | ||||
| 
 | ||||
| 
 | ||||
| # C file objects depend on .c AND dependency files, which have an empty recipe
 | ||||
| $(OBJ_DIR)/%.o: %.c | ||||
| 	@mkdir -p $(@D) | ||||
| 	$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $< | ||||
| 
 | ||||
| # cpp file objects depend on .cpp AND dependency files, which have an empty recipe
 | ||||
| $(OBJ_DIR)/%.o: %.cpp | ||||
| 	@mkdir -p $(@D) | ||||
| 	$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $< | ||||
| 
 | ||||
| # main target rule
 | ||||
| $(BUILD_DIR)/$(TARGET): $(OBJ) | ||||
| 	@mkdir -p $(@D) | ||||
| 	@echo Linking to target: $(TARGET) | ||||
| 	@echo $(DOCKER) $(LINKER) '$$(OBJ)' $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | ||||
| 	@$(DOCKER) $(LINKER) $(OBJ) $(LDFLAGS) $(MAP_FLAG) -o $(@D)/$(TARGET) | ||||
| 	@echo | ||||
| 	@echo Print size information | ||||
| 	@$(CSIZE) $(@D)/$(TARGET) | ||||
| 	@echo Done | ||||
| 
 | ||||
| 
 | ||||
| #
 | ||||
| # ================ Default local build rules =================
 | ||||
| # example:
 | ||||
| # make debug
 | ||||
| 
 | ||||
| .DEFAULT_GOAL := all | ||||
| 
 | ||||
| .PHONY: clean | ||||
| clean: | ||||
| 	@echo Cleaning build directories | ||||
| 	@rm -rf $(OBJ_DIR) | ||||
| 	@rm -rf $(DEP_DIR) | ||||
| 	@rm -rf $(BUILD_DIR) | ||||
| 
 | ||||
| debug: CFLAGS := $(DEB_CFLAGS) | ||||
| debug: $(BUILD_DIR)/$(TARGET) | ||||
| 
 | ||||
| release: CFLAGS := $(REL_CFLAGS) | ||||
| release: $(BUILD_DIR)/$(TARGET) | ||||
| 
 | ||||
| #
 | ||||
| # ================ Build rules =================
 | ||||
| #
 | ||||
| 
 | ||||
| bitonic_v0deb: CC := nvcc -G -g -x cu | ||||
| bitonic_v0deb: CXX := nvcc -G -g -x cu | ||||
| bitonic_v0deb: LINKER := nvcc | ||||
| bitonic_v0deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V0 | ||||
| bitonic_v0deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V0 | ||||
| bitonic_v0deb: OUTPUT_DIR := $(OUTPUT_DIR)/v0 | ||||
| bitonic_v0deb: $(BUILD_DIR)/$(TARGET) | ||||
| 	@mkdir -p $(OUTPUT_DIR) | ||||
| 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET) | ||||
| 
 | ||||
| 
 | ||||
| bitonic_v1deb: CC := nvcc -G -g -x cu | ||||
| bitonic_v1deb: CXX := nvcc -G -g -x cu | ||||
| bitonic_v1deb: LINKER := nvcc | ||||
| bitonic_v1deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V1 | ||||
| bitonic_v1deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V1 | ||||
| bitonic_v1deb: OUTPUT_DIR := $(OUTPUT_DIR)/v1 | ||||
| bitonic_v1deb: $(BUILD_DIR)/$(TARGET) | ||||
| 	@mkdir -p $(OUTPUT_DIR) | ||||
| 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET) | ||||
| 
 | ||||
| 
 | ||||
| bitonic_v2deb: CC := nvcc -G -g -x cu | ||||
| bitonic_v2deb: CXX := nvcc -G -g -x cu | ||||
| bitonic_v2deb: LINKER := nvcc | ||||
| bitonic_v2deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2 | ||||
| bitonic_v2deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2 | ||||
| bitonic_v2deb: OUTPUT_DIR := $(OUTPUT_DIR)/v2 | ||||
| bitonic_v2deb: $(BUILD_DIR)/$(TARGET) | ||||
| 	@mkdir -p $(OUTPUT_DIR) | ||||
| 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| bitonic_v0: CC := nvcc -x cu | ||||
| bitonic_v0: CXX := nvcc -x cu | ||||
| bitonic_v0: LINKER := nvcc | ||||
| bitonic_v0: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V0 | ||||
| bitonic_v0: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V0 | ||||
| bitonic_v0: OUTPUT_DIR := $(OUTPUT_DIR)/v0 | ||||
| bitonic_v0: $(BUILD_DIR)/$(TARGET) | ||||
| 	@mkdir -p $(OUTPUT_DIR) | ||||
| 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET) | ||||
| 
 | ||||
| 
 | ||||
| bitonic_v1: CC := nvcc -x cu | ||||
| bitonic_v1: CXX := nvcc -x cu | ||||
| bitonic_v1: LINKER := nvcc | ||||
| bitonic_v1: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V1 | ||||
| bitonic_v1: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V1 | ||||
| bitonic_v1: OUTPUT_DIR := $(OUTPUT_DIR)/v1 | ||||
| bitonic_v1: $(BUILD_DIR)/$(TARGET) | ||||
| 	@mkdir -p $(OUTPUT_DIR) | ||||
| 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET) | ||||
| 
 | ||||
| bitonic_v2: CC := nvcc -x cu | ||||
| bitonic_v2: CXX := nvcc -x cu | ||||
| bitonic_v2: LINKER := nvcc | ||||
| bitonic_v2: CFLAGS := $(REL_CFLAGS) -DCODE_VERSION=V2 | ||||
| bitonic_v2: CXXFLAGS := $(REL_CXXFLAGS) -DCODE_VERSION=V2 | ||||
| bitonic_v2: OUTPUT_DIR := $(OUTPUT_DIR)/v2 | ||||
| bitonic_v2: $(BUILD_DIR)/$(TARGET) | ||||
| 	@mkdir -p $(OUTPUT_DIR) | ||||
| 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET) | ||||
| 
 | ||||
| 
 | ||||
| hpc-build: | ||||
| 	make clean | ||||
| 	make bitonic_v0 | ||||
| 	make clean | ||||
| 	make bitonic_v1 | ||||
| 	make clean | ||||
| 	make bitonic_v2 | ||||
| 
 | ||||
| 
 | ||||
| all: debug bitonic_v0 | ||||
| # Note:
 | ||||
| #	Add a gcc based make rule here in order for clangd to successfully scan the project files.
 | ||||
| #	Otherwise we do not need the gcc build.
 | ||||
| 
 | ||||
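Note on the build rules above: the `bitonic_v*` targets switch the toolchain to nvcc (with `-x cu`) and select the kernel variant purely through the `-DCODE_VERSION=V0|V1|V2` and `-DTARGET=$(TARGET)` pre-defines. The condensed illustration below shows how such defines are consumed on the C++ side; it mirrors parts of src/config.h and src/utils.hpp that appear later in this diff and is not an additional source file.

// Condensed illustration of how the Makefile's -D pre-defines are consumed
// (the real code is in src/config.h and src/utils.hpp later in this diff).
#include <cstdio>

#define V0 0
#define V1 1
#define V2 2

#if !defined CODE_VERSION        // fail-safe when -DCODE_VERSION=... is missing
#define CODE_VERSION V2
#endif

#if !defined TARGET              // fail-safe when -DTARGET=... is missing
#define TARGET bitonicCUDA
#endif

#define STR_(s)  #s
#define STR(s)   STR_(s)         // STR(TARGET) turns -DTARGET=bitonicCUDA into "bitonicCUDA"

int main() {
    // CODE_VERSION selects which kernel family gets compiled in bitonicsort.hpp.
    std::printf("%s built as code version %d (V0=%d, V1=%d, V2=%d)\n",
                STR(TARGET), CODE_VERSION, V0, V1, V2);
    return 0;
}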
							
								
								
									
homework_3/exersize.md (new file, 33 lines)

@@ -0,0 +1,33 @@
| Parallel & Distributed Computer Systems HW3 | ||||
| 
 | ||||
| January, 2025 | ||||
| 
 | ||||
| Write a program that sorts $N$ integers in ascending order, using CUDA. | ||||
| 
 | ||||
| The program must perform the following tasks: | ||||
| 
 | ||||
| - The user specifies a positive integer $q$. | ||||
| 
 | ||||
| - Start a process with an array of $N = 2^q$ random integers. | ||||
| 
 | ||||
| - Sort all $N$ elements in ascending order. | ||||
| 
 | ||||
| - Check the correctness of the final result. | ||||
| 
 | ||||
| Your implementation should be based on the following steps: | ||||
| 
 | ||||
| V0. A kernel where each thread only compares and exchanges. This "eliminates" the 1:n innermost loop. Easy to write, but too many function calls and global synchronizations. | ||||
| 
 | ||||
| V1. Include the k inner loop in the kernel function. How do we handle the synchronization? Fewer calls, fewer global synchronizations. Faster than V0! | ||||
| 
 | ||||
| V2. Modify the kernel of V1 to work with local memory instead of global. | ||||
| 
 | ||||
| You must deliver: | ||||
| - A report (about $3-4$ pages) that describes your parallel algorithm and implementation. | ||||
| 
 | ||||
| - Your comments on the speed of your parallel program compared to the serial sort, after trying your program on aristotelis for $q = [20:27]$. | ||||
| 
 | ||||
| - The source code of your program uploaded online. | ||||
| 
 | ||||
| Ethics: If you use code found on the web or generated by an LLM, you should mention your source and the changes you made. You may work in pairs; the pair submits a single report with both names. | ||||
| Deadline: 2 February, $2025$. | ||||
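The V0 scheme described above reduces to a host-side double loop over (stage, step) pairs, launching one compare-and-exchange kernel per pair with a global synchronization in between. The sketch below is a simplified, stand-alone illustration of that idea, compiled as a .cu file with nvcc: it launches one thread per element and lets the lower index of each pair do the swap, unlike the half-thread indexing used in src/bitonicsort.hpp further down; the names and the plain `int` payload are illustrative only.

// Minimal stand-alone sketch of the V0 idea: one kernel launch per (stage, step).
#include <cuda_runtime.h>
#include <cmath>
#include <cstddef>

__global__ void compareExchange(int* data, size_t n, size_t step, size_t stage) {
    size_t tid = threadIdx.x + blockIdx.x * (size_t)blockDim.x;
    size_t pid = tid ^ ((size_t)1 << step);               // partner index in the network
    if (tid < pid && pid < n) {                           // the lower index of the pair does the work
        bool keepSmall = !(tid & ((size_t)1 << stage));   // ascending region keeps the small value
        if (( keepSmall && data[tid] > data[pid]) ||
            (!keepSmall && data[tid] < data[pid])) {
            int tmp = data[tid]; data[tid] = data[pid]; data[pid] = tmp;
        }
    }
}

// dev_data must hold n elements (n a power of two), already resident on the device.
void bitonicSortV0(int* dev_data, size_t n, size_t threadsPerBlock) {
    size_t blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    size_t stages = static_cast<size_t>(std::log2(n));
    for (size_t stage = 1; stage <= stages; ++stage)
        for (size_t step = stage; step-- > 0; ) {
            compareExchange<<<blocks, threadsPerBlock>>>(dev_data, n, step, stage);
            cudaDeviceSynchronize();                      // the global synchronization the text mentions
        }
}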
							
								
								
									
homework_3/hpc/makeSlurmScripts.sh (new executable file, 26 lines)

@@ -0,0 +1,26 @@
| #!/usr/bin/env bash | ||||
| 
 | ||||
| # Parameters | ||||
| versions=("v0" "v1" "v2") | ||||
| q_values=(20 21 22 23 24 25 26 27 28 29 30) | ||||
| 
 | ||||
| # Make scripts | ||||
| for version in "${versions[@]}"; do | ||||
|   for q in "${q_values[@]}"; do | ||||
|     filename="Bitnc${version^^}Q${q}.sh"  # Convert v0 -> V0 etc... | ||||
|     cat > "$filename" <<EOL | ||||
| #! /usr/bin/env bash | ||||
| 
 | ||||
| #SBATCH --job-name=Bitnc${version^^}Q${q} | ||||
| #SBATCH --nodes=1 | ||||
| #SBATCH --gres=gpu:1 | ||||
| #SBATCH --time=10:00 | ||||
| 
 | ||||
| module load gcc/9.2.0 cuda/11.1.0 | ||||
| 
 | ||||
| ./out/${version}/bitonicCUDA -v --validation --perf 7 -b 512 -q ${q} | ||||
| 
 | ||||
| EOL | ||||
|     echo "Create: $filename" | ||||
|   done | ||||
| done | ||||
							
								
								
									
homework_3/hpc/submitJobs.sh (new executable file, 26 lines)

@@ -0,0 +1,26 @@
| #!/usr/bin/env bash | ||||
| 
 | ||||
| # Submission parameters | ||||
| QOS="small" | ||||
| PARTITION="ampere" | ||||
| SCRIPT_DIR="hpc"  # Directory containing the job scripts | ||||
| 
 | ||||
| # Range of values for the -q parameter | ||||
| VERSIONS=("V0" "V1" "V2") | ||||
| Q_START=20 | ||||
| Q_END=30 | ||||
| 
 | ||||
| # Submitting the jobs | ||||
| for version in "${VERSIONS[@]}"; do | ||||
|   for ((q = Q_START; q <= Q_END; q++)); do | ||||
|     script_name="Bitnc${version}Q${q}.sh" | ||||
|     script_path="${SCRIPT_DIR}/${script_name}" | ||||
| 
 | ||||
|     if [[ -f "$script_path" ]]; then | ||||
|       sbatch --qos="$QOS" -p "$PARTITION" "$script_path" | ||||
|       echo "Submitted: $script_path" | ||||
|     else | ||||
|       echo "Warning: File not found - $script_path" | ||||
|     fi | ||||
|   done | ||||
| done | ||||
							
								
								
									
homework_3/src/bitonicsort.hpp (new file, 456 lines)

@@ -0,0 +1,456 @@
| /*!
 | ||||
|  * \file | ||||
|  * \brief   Bitonic sort CUDA implementation header | ||||
|  * | ||||
|  * \author | ||||
|  *    Christos Choutouridis AEM:8997 | ||||
|  *    <cchoutou@ece.auth.gr> | ||||
|  */ | ||||
| 
 | ||||
| #ifndef BITONICSORTCUDA_H_ | ||||
| #define BITONICSORTCUDA_H_ | ||||
| 
 | ||||
| #include <cuda_runtime.h> | ||||
| #include <vector> | ||||
| #include <cmath> | ||||
| #include <cstdint> | ||||
| #include <utility> | ||||
| 
 | ||||
| #include "utils.hpp" | ||||
| 
 | ||||
| /*
 | ||||
|  * Exported timers | ||||
|  */ | ||||
| extern Timing Timer_total, Timer_memory, Timer_sorting; | ||||
| 
 | ||||
| using threadId_t = size_t; | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
|  * ============================== Sort utilities ============================== | ||||
|  */ | ||||
| 
 | ||||
| /*!
 | ||||
|  * Returns the ascending or descending configuration (up/down phase) of the thread id | ||||
|  * depending on the current stage | ||||
|  * | ||||
|  * @param tid   [threadId_t] The current thread | ||||
|  * @param stage [size_t]     The current stage of the sorting network (same for each step) | ||||
|  * @return      [bool]       True if we need ascending configuration, false otherwise | ||||
|  */ | ||||
| __device__ inline bool ascending(threadId_t tid, size_t stage) noexcept { | ||||
|     return !(tid & (1 << stage)); | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * Returns the thread's partner for data exchange during the sorting network iterations | ||||
|  * of Bitonic | ||||
|  * | ||||
|  * @param tid   [threadId_t] The current thread | ||||
|  * @param step  [size_t]     The step of the sorting network | ||||
|  * @return      [threadId_t] The thread id of the partner for data exchange | ||||
|  */ | ||||
| __device__ inline threadId_t partner(threadId_t tid, size_t step) noexcept { | ||||
|     return (tid ^ (1 << step)); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /*!
 | ||||
|  * Predicate to check if a thread keeps the small values during the bitonic sort network exchange. | ||||
|  * | ||||
|  * @param tid       [threadId_t] The thread for which we check | ||||
|  * @param partner   [threadId_t] The partner of the data exchange | ||||
|  * @param stage     [size_t]     The current stage of the sorting network (same for each step) | ||||
|  * @return          [bool]       True if the thread should keep the small values, false otherwise | ||||
|  */ | ||||
| 
 | ||||
| __device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stage) { | ||||
|     return ascending(tid, stage) == (tid < partner); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
|  * ============================== Sort algorithms ============================== | ||||
|  */ | ||||
| 
 | ||||
| /*!
 | ||||
|  * Each thread can handle 2 points in the array. For each of these 2 points it may | ||||
|  * - compare and exchange if needed | ||||
|  * - copy data to local and back if needed | ||||
|  */ | ||||
| static constexpr size_t SizeToThreadsRatio = 2; | ||||
| 
 | ||||
| /*!
 | ||||
|  * Calculates the blocks needed for the entire sorting process | ||||
|  * | ||||
|  * @note | ||||
|  *  This "redundant" little trick makes sure blocks are allocated for arraySizes that are not exact | ||||
|  *  multiples of config.blockSize. | ||||
|  *  Even if we don't need it, we keep it in case we experiment with weird sizes in the future! | ||||
|  * | ||||
|  * @param arraySize [ArraySize_t]   The size of the entire array (in points) | ||||
|  * @return          [size_t]        The number of blocks | ||||
|  */ | ||||
| inline size_t NBlocks(ArraySize_t arraySize) { | ||||
| 	return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /*!
 | ||||
|  * Exchange utility | ||||
|  * | ||||
|  * @tparam  ValueT  The underlying data type of the array items | ||||
|  * | ||||
|  * @param data      [ValueT*]    Pointer to data array | ||||
|  * @param tid       [threadId_t] Current thread's index to data | ||||
|  * @param partner   [threadId_t] Partner's index to data | ||||
|  * @param keepSmall [bool]       Flag to indicate if the current thread keeps the small value | ||||
|  */ | ||||
| template <typename ValueT> | ||||
| __device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) { | ||||
|     if (( keepSmall && (data[tid] > data[partner])) || | ||||
|         (!keepSmall && (data[tid] < data[partner])) ) { | ||||
|         ValueT temp = data[tid]; | ||||
|         data[tid] = data[partner]; | ||||
|         data[partner] = temp; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| #if CODE_VERSION == V0 | ||||
| 
 | ||||
| /*!
 | ||||
|  * This is the body of each thread. It compares and exchanges data. | ||||
|  * | ||||
|  * @tparam ValueT   The underlying data type of the array items | ||||
|  * @param data      [ValueT*]   Pointer to data array | ||||
|  * @param n         [size_t]    The total size of the array | ||||
|  * @param step      [size_t]    The current step of the current stage of bitonic sort | ||||
|  * @param stage     [size_t]    The current stage of bitonic sort | ||||
|  */ | ||||
| template <typename ValueT> | ||||
| __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) { | ||||
|     threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x; // Keep contiguous addressing to the first half of the array
 | ||||
|     threadId_t pid = partner(tid, step); | ||||
|     if (tid > pid) { | ||||
|         // Shift to the other half of the array for global data
 | ||||
|         tid += n / SizeToThreadsRatio; | ||||
|         pid += n / SizeToThreadsRatio; | ||||
|     } | ||||
|     if ((tid < n) && (pid < n)) {       // Boundary check
 | ||||
|         bool keep = keepSmall(tid, pid, stage); | ||||
|         exchange(data, tid, pid, keep); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /*!
 | ||||
|  * A CUDA version of the Bitonic sort algorithm. | ||||
|  * | ||||
|  * @tparam DataT    A container type to hold data array. Should have .data() and .size() methods | ||||
|  * @param data      [DataT&]    Reference to the container to sort | ||||
|  */ | ||||
| template <typename DataT> | ||||
| void bitonicSort(DataT& data) { | ||||
|     using value_t = typename DataT::value_type; | ||||
| 
 | ||||
|     value_t* dev_data; | ||||
|     auto size = data.size(); | ||||
| 
 | ||||
|     Timer_memory.start(); | ||||
|     if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not allocate memory\n"); | ||||
|     if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); | ||||
|     Timer_memory.stop(); | ||||
| 
 | ||||
|     size_t Nth = config.blockSize; | ||||
|     size_t Nbl = NBlocks(size); | ||||
| 
 | ||||
|     size_t Stages = static_cast<size_t>(log2(size)); | ||||
|     Timer_sorting.start(); | ||||
|     for (size_t stage = 1; stage <= Stages; ++stage) { | ||||
|         for (size_t step = stage; step > 0; ) { | ||||
|             --step; | ||||
|             bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage); | ||||
|             cudaDeviceSynchronize(); | ||||
|         } | ||||
|     } | ||||
|     Timer_sorting.stop(); | ||||
| 
 | ||||
|     Timer_memory.start(); | ||||
|     if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); | ||||
|     cudaFree(dev_data); | ||||
|     Timer_memory.stop(); | ||||
| } | ||||
| 
 | ||||
| #elif CODE_VERSION == V1 | ||||
| 
 | ||||
| /*!
 | ||||
|  * This is the body of each thread. It compares and exchanges data. | ||||
|  * | ||||
|  * @tparam ValueT   The underlying data type of the array items | ||||
|  * @param data      [ValueT*]   Pointer to data array | ||||
|  * @param n         [size_t]    The total size of the array | ||||
|  * @param step      [size_t]    The current step of the current stage of bitonic sort | ||||
|  * @param stage     [size_t]    The current stage of bitonic sort | ||||
|  */ | ||||
| template <typename ValueT> | ||||
| __device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) { | ||||
|     /*
 | ||||
|      * Here we skip blocks every time (one for SizeToThreadsRatio = 2) | ||||
|      * And we use the neighbor block address indices for the other half of the threads | ||||
|      */ | ||||
|     threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; | ||||
|     threadId_t pid = partner(tid, step); | ||||
|     if (tid > pid) { | ||||
|         // Shift to the other half of the array for global data
 | ||||
|         tid += blockDim.x; | ||||
|         pid += blockDim.x; | ||||
|     } | ||||
|     if ((tid < n) && (pid < n)) {       // Boundary check
 | ||||
|         bool keep = keepSmall(tid, pid, stage); | ||||
|         exchange(data, tid, pid, keep); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * This is the version of the body that is launched for the steps outside the unrolled part of the loop | ||||
|  * | ||||
|  * @tparam ValueT   The underlying data type of the array items | ||||
|  * @param data      [ValueT*]   Pointer to data array | ||||
|  * @param n         [size_t]    The total size of the array | ||||
|  * @param step      [size_t]    The current step of the current stage of bitonic sort | ||||
|  * @param stage     [size_t]    The current stage of bitonic sort | ||||
|  */ | ||||
| template <typename ValueT> | ||||
| __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) { | ||||
|     interBlockStep_(data, n, step, stage); | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * This is the unrolled part of the bitonic double loop. | ||||
|  * | ||||
|  * @tparam ValueT   The underlying data type of the array items | ||||
|  * @param data      [ValueT*]   Pointer to data array | ||||
|  * @param n         [size_t]    The total size of the array | ||||
|  * @param innerSteps [size_t]  The highest step index handled inside the kernel (steps innerSteps..0) | ||||
|  * @param stage     [size_t]    The current stage of bitonic sort | ||||
|  */ | ||||
| template <typename ValueT> | ||||
| __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) { | ||||
|     for (size_t step = innerSteps + 1; step > 0; ) { | ||||
|         --step; | ||||
|         interBlockStep_(data, n, step, stage); | ||||
|         __syncthreads(); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * A CUDA version of the Bitonic sort algorithm. | ||||
|  * | ||||
|  * @tparam DataT    A container type to hold data array. Should have .data() and .size() methods | ||||
|  * @param data      [DataT&]    Reference to the container to sort | ||||
|  */ | ||||
| template <typename DataT> | ||||
| void bitonicSort(DataT& data) { | ||||
|     using value_t = typename DataT::value_type; | ||||
| 
 | ||||
|     value_t* dev_data; | ||||
|     auto size = data.size(); | ||||
| 
 | ||||
|     Timer_memory.start(); | ||||
|     if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not allocate memory\n"); | ||||
|     if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); | ||||
|     Timer_memory.stop(); | ||||
| 
 | ||||
|     size_t Nth = config.blockSize; | ||||
|     size_t Nbl = NBlocks(size); | ||||
| 
 | ||||
|     auto Stages          = static_cast<size_t>(log2(size)); | ||||
|     auto InnerBlockSteps = static_cast<size_t>(log2(Nth));    //
 | ||||
|     Timer_sorting.start(); | ||||
|     for (size_t stage = 1; stage <= Stages; ++stage) { | ||||
|         size_t step = stage - 1; | ||||
|         for ( ; step > InnerBlockSteps; --step) { | ||||
|             interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); | ||||
|             cudaDeviceSynchronize(); | ||||
|         } | ||||
|         inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); | ||||
|         cudaDeviceSynchronize(); | ||||
|     } | ||||
|     Timer_sorting.stop(); | ||||
| 
 | ||||
|     Timer_memory.start(); | ||||
|     if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); | ||||
|     cudaFree(dev_data); | ||||
|     Timer_memory.stop(); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #elif CODE_VERSION == V2 | ||||
| 
 | ||||
| /*!
 | ||||
|  * @return The number of data points that the threads of one block can affect. | ||||
|  * | ||||
|  * @note | ||||
|  *  Each block's thread collection can exchange twice as many data points as it has threads. | ||||
|  */ | ||||
| inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| /*!
 | ||||
|  * Converts a global data index to the corresponding index in the local shared memory array, which | ||||
|  * is used as a cache by the unrolled part of the bitonic sort loop. | ||||
|  * | ||||
|  * @note | ||||
|  *  Each block's thread collection can exchange twice as many data points as it has threads. | ||||
|  *  These points get copied (cached) in the shared memory location. We use contiguous blocks | ||||
|  *  both in global data memory and the shared memory buffer. | ||||
|  * | ||||
|  * @param gIndex    The global array index | ||||
|  * @param blockDim  The block size (threads per block) | ||||
|  * @return          The equivalent local address of the shared memory | ||||
|  */ | ||||
| __device__ inline size_t toLocal(size_t gIndex, size_t blockDim) { | ||||
|     return gIndex % (SizeToThreadsRatio * blockDim); | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * This is the version of the body that is launched for the steps outside the unrolled part of the loop | ||||
|  * | ||||
|  * @tparam ValueT   The underlying data type of the array items | ||||
|  * @param data      [ValueT*]   Pointer to data array | ||||
|  * @param n         [size_t]    The total size of the array | ||||
|  * @param step      [size_t]    The current step of the current stage of bitonic sort | ||||
|  * @param stage     [size_t]    The current stage of bitonic sort | ||||
|  */ | ||||
| template <typename ValueT> | ||||
| __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) { | ||||
|     threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;     // Keep contiguous addressing to the first half of the array
 | ||||
|     threadId_t pid = partner(tid, step); | ||||
|     if (tid > pid) { | ||||
|         // Shift to the other half of the array for global data
 | ||||
|         tid += n / SizeToThreadsRatio; | ||||
|         pid += n / SizeToThreadsRatio; | ||||
|     } | ||||
|     if ((tid < n) && (pid < n)) {       // Boundary check
 | ||||
|         bool keep = keepSmall(tid, pid, stage); | ||||
|         exchange(data, tid, pid, keep); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * This is the unrolled part of the bitonic double loop. | ||||
|  * | ||||
|  * First each thread caches its corresponding data point from the current and the following data block. | ||||
|  * After that we execute the loop unrolling on the local data and then we write back to global memory. | ||||
|  * | ||||
|  * @tparam ValueT   The underlying data type of the array items | ||||
|  * @param data      [ValueT*]   Pointer to data array | ||||
|  * @param n         [size_t]    The total size of the array | ||||
|  * @param innerSteps [size_t]  The highest step index handled inside the kernel (steps innerSteps..0) | ||||
|  * @param stage     [size_t]    The current stage of bitonic sort | ||||
|  */ | ||||
| template <typename ValueT> | ||||
| __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) { | ||||
|     extern __shared__ ValueT shared_data[]; | ||||
| 
 | ||||
|     /*
 | ||||
|      * Global and local(shared) memory indices (calculated once) | ||||
|      * Here we skip blocks every time (one for SizeToThreadsRatio = 2) | ||||
|      * And we cache the neighbor block address indexes in local (shared) memory | ||||
|      */ | ||||
|     threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x; | ||||
|     threadId_t lIdx0 = toLocal(gIdx0, blockDim.x); | ||||
| 
 | ||||
|     if (gIdx0 + blockDim.x >= n)   // Boundary check
 | ||||
|         return; | ||||
| 
 | ||||
|     // Fetch to local memory the entire effective block size (2 positions for each thread)
 | ||||
|     shared_data[lIdx0]              = data[gIdx0]; | ||||
|     shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x]; | ||||
|     __syncthreads(); | ||||
| 
 | ||||
|     for (size_t step = innerSteps + 1; step > 0; ) { | ||||
|         --step; | ||||
| 
 | ||||
|         // Init thread global and local indices
 | ||||
|         threadId_t gIdx = gIdx0; | ||||
|         threadId_t lIdx = lIdx0; | ||||
|         // Find partner and keep-small configuration based on the global data positions
 | ||||
|         threadId_t pIdx  = partner(gIdx, step); | ||||
|         if (gIdx > pIdx) { | ||||
|             // Shift inside effective block
 | ||||
|             gIdx += blockDim.x; // global
 | ||||
|             pIdx += blockDim.x; | ||||
|             lIdx += blockDim.x;	// local
 | ||||
|         } | ||||
|         bool keep = keepSmall(gIdx, pIdx, stage); | ||||
| 
 | ||||
|         // Exchange data on local(shared) copy
 | ||||
|         threadId_t lpIdx = toLocal(pIdx, blockDim.x); | ||||
|         exchange(shared_data, lIdx, lpIdx, keep); | ||||
|         __syncthreads(); | ||||
|     } | ||||
| 
 | ||||
|     // Write back to global memory
 | ||||
|     data[gIdx0]              = shared_data[lIdx0]; | ||||
|     data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x]; | ||||
|     __syncthreads(); | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * A CUDA version of the Bitonic sort algorithm. | ||||
|  * | ||||
|  * @tparam DataT    A container type to hold data array. Should have .data() and .size() methods | ||||
|  * @param data      [DataT&]    Reference to the container to sort | ||||
|  */ | ||||
| template <typename DataT> | ||||
| void bitonicSort(DataT& data) { | ||||
|     using value_t = typename DataT::value_type; | ||||
| 
 | ||||
|     value_t* dev_data; | ||||
|     auto size = data.size(); | ||||
| 
 | ||||
|     Timer_memory.start(); | ||||
|     if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not allocate memory\n"); | ||||
|     if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not copy memory to device\n"); | ||||
|     Timer_memory.stop(); | ||||
| 
 | ||||
|     size_t Nth = config.blockSize; | ||||
|     size_t Nbl = NBlocks(size); | ||||
|     size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t); | ||||
| 
 | ||||
|     auto Stages          = static_cast<size_t>(log2(size)); | ||||
|     auto InnerBlockSteps = static_cast<size_t>(log2(Nth)); | ||||
|     Timer_sorting.start(); | ||||
|     for (size_t stage = 1; stage <= Stages; ++stage) { | ||||
|         size_t step = stage - 1; | ||||
|         for ( ; step > InnerBlockSteps; --step) { | ||||
|             interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage); | ||||
|             cudaDeviceSynchronize(); | ||||
|         } | ||||
|         inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage); | ||||
|         cudaDeviceSynchronize(); | ||||
|     } | ||||
|     Timer_sorting.stop(); | ||||
| 
 | ||||
|     Timer_memory.start(); | ||||
|     if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not copy memory from device\n"); | ||||
|     cudaFree(dev_data); | ||||
|     Timer_memory.stop(); | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #endif //BITONICSORTCUDA_H_
 | ||||
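The whole sorting network above is driven by the three index helpers `ascending`, `partner` and `keepSmall`. A quick way to check the index math is to replicate them on the host and print the compare-exchange pairs for a tiny input; the stand-alone snippet below does this for n = 8 and is a debugging aid only, not part of the header.

// Host-side replica of the index helpers from bitonicsort.hpp, printing the
// compare-exchange pairs of the network for n = 8 (illustration / sanity check only).
#include <cstdio>
#include <cstddef>

static bool ascending(std::size_t tid, std::size_t stage) { return !(tid & (std::size_t(1) << stage)); }
static std::size_t partner(std::size_t tid, std::size_t step) { return tid ^ (std::size_t(1) << step); }
static bool keepSmall(std::size_t tid, std::size_t p, std::size_t stage) { return ascending(tid, stage) == (tid < p); }

int main() {
    const std::size_t n = 8, stages = 3;                 // n = 2^3
    for (std::size_t stage = 1; stage <= stages; ++stage)
        for (std::size_t step = stage; step-- > 0; )
            for (std::size_t tid = 0; tid < n; ++tid) {
                std::size_t p = partner(tid, step);
                if (tid < p)                             // print every pair once
                    std::printf("stage %zu step %zu: pair (%zu,%zu), small kept at %zu\n",
                                stage, step, tid, p, keepSmall(tid, p, stage) ? tid : p);
            }
    return 0;
}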
							
								
								
									
homework_3/src/config.h (new file, 82 lines)

@@ -0,0 +1,82 @@
| /*!
 | ||||
|  * \file | ||||
|  * \brief   Build and runtime configuration file. | ||||
|  * | ||||
|  * \author | ||||
|  *    Christos Choutouridis AEM:8997 | ||||
|  *    <cchoutou@ece.auth.gr> | ||||
|  */ | ||||
| 
 | ||||
| #ifndef CONFIG_H_ | ||||
| #define CONFIG_H_ | ||||
| 
 | ||||
| #include <cstdint> | ||||
| #include <vector> | ||||
| #include <cuda_runtime.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * Versioning: | ||||
|  * - RC1: First version to test on HPC | ||||
|  */ | ||||
| static constexpr char version[] = "0.1"; | ||||
| 
 | ||||
| /*
 | ||||
|  * Defines for different version of the exercise | ||||
|  */ | ||||
| #define  V0        0 | ||||
| #define  V1        1 | ||||
| #define  V2        2 | ||||
| 
 | ||||
| // Fail-safe version selection
 | ||||
| #if !defined CODE_VERSION | ||||
| #define CODE_VERSION   V2 | ||||
| #endif | ||||
| 
 | ||||
| // Default Data size (in case -q <N> is not present)
 | ||||
| static constexpr size_t DEFAULT_DATA_SIZE   = 1 << 16; | ||||
| 
 | ||||
| // Placeholder default (actual default comes from device properties read at initialization)
 | ||||
| static constexpr size_t THREADS_PER_BLOCK   = 1024; | ||||
| 
 | ||||
| 
 | ||||
| /*!
 | ||||
|  * Value and Buffer type selection | ||||
|  * | ||||
|  * We support the following compiler types or the <cstdint> that translate to them: | ||||
|  *  char       -  unsigned char | ||||
|  *  short      -  unsigned short | ||||
|  *  int        -  unsigned int | ||||
|  *  long       -  unsigned long | ||||
|  *  long long  -  unsigned long long | ||||
|  *  float | ||||
|  *  double | ||||
|  */ | ||||
| using Value_t = uint32_t; | ||||
| using Data_t  = std::vector<Value_t>; | ||||
| 
 | ||||
| /*!
 | ||||
|  * In theory we can support large arrays ;) | ||||
|  */ | ||||
| using ArraySize_t = uint64_t; | ||||
| 
 | ||||
| /*!
 | ||||
|  * Session option for each invocation of the executable. | ||||
|  * | ||||
|  * @note | ||||
|  *  The values of the members are set from the command line. | ||||
|  */ | ||||
| struct config_t { | ||||
| 	ArraySize_t	arraySize{DEFAULT_DATA_SIZE};   //!< The array size of the local data to sort.
 | ||||
| 	size_t      blockSize{THREADS_PER_BLOCK};	//!< The block size (threads per block) for the session.
 | ||||
|     bool        validation{false};              //!< Request a full validation at the end.
 | ||||
|     size_t      perf{1};                        //!< Enable performance timing measurements and prints. Repeat
 | ||||
|                                                 //!< the sorting <perf> times to do so.
 | ||||
|     bool        verbose{false};                 //!< Flag to enable verbose output to stdout.
 | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * Exported global objects | ||||
|  */ | ||||
| extern config_t 		config; | ||||
| extern cudaDeviceProp   device; | ||||
| 
 | ||||
| #endif /* CONFIG_H_ */ | ||||
							
								
								
									
homework_3/src/main.cpp (new file, 260 lines)

@@ -0,0 +1,260 @@
| /*!
 | ||||
|  * \file | ||||
|  * \brief   Main application file for PDS HW3 (CUDA) | ||||
|  * | ||||
|  * \author | ||||
|  *    Christos Choutouridis AEM:8997 | ||||
|  *    <cchoutou@ece.auth.gr> | ||||
|  */ | ||||
| 
 | ||||
| #include <exception> | ||||
| #include <iostream> | ||||
| #include <algorithm> | ||||
| #include <random> | ||||
| #include <cuda_runtime.h> | ||||
| 
 | ||||
| #include "utils.hpp" | ||||
| #include "config.h" | ||||
| #include "bitonicsort.hpp" | ||||
| 
 | ||||
| 
 | ||||
| // Global session data
 | ||||
| Data_t          Data; | ||||
| config_t        config; | ||||
| Log             logger; | ||||
| cudaDeviceProp 	device; | ||||
| 
 | ||||
| // Mersenne Twister, seeded from hardware if possible. Range: [type_min, type_max]
 | ||||
| std::random_device  rd; | ||||
| std::mt19937        gen(rd()); | ||||
| 
 | ||||
| //! Performance timers for each one of the "costly" functions
 | ||||
| Timing Timer_total, Timer_memory, Timer_sorting; | ||||
| 
 | ||||
| 
 | ||||
| //! Init timing objects for extra rounds
 | ||||
| void measurements_init() { | ||||
|     if (config.perf > 1) { | ||||
|         Timer_total.init(config.perf); | ||||
|         Timer_memory.init(config.perf); | ||||
|         Timer_sorting.init(config.perf); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| //! Iterate to the next round of measurements for all measurement objects
 | ||||
| void measurements_next() { | ||||
|     if (config.perf > 1) { | ||||
|         Timer_total.next(); | ||||
|         Timer_memory.next(); | ||||
|         Timer_sorting.next(); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * A small command line argument parser | ||||
|  * \return  The status of the operation | ||||
|  */ | ||||
| bool get_options(int argc, char* argv[]){ | ||||
|     bool status =true; | ||||
| 
 | ||||
|     // iterate over the passed arguments
 | ||||
|     for (int i=1 ; i<argc ; ++i) { | ||||
|         std::string arg(argv[i]);     // get current argument
 | ||||
| 
 | ||||
|         if (arg == "-q" || arg == "--array-size") { | ||||
|             if (i+1 < argc) { | ||||
|                 config.arraySize = (ArraySize_t)1 << atoi(argv[++i]); | ||||
|             } | ||||
|             else { | ||||
|                 status = false; | ||||
|             } | ||||
|         } | ||||
|         else if (arg == "-b" || arg == "--block-size") { | ||||
|             if (i+1 < argc) { | ||||
|                 config.blockSize = atoi(argv[++i]); | ||||
|             } | ||||
|             else { | ||||
|                 status = false; | ||||
|             } | ||||
|         } | ||||
|         else if (arg == "--validation") { | ||||
|             config.validation = true; | ||||
|         } | ||||
|         else if (arg == "--perf") { | ||||
|             if (i+1 < argc) { | ||||
|                 config.perf = atoi(argv[++i]); | ||||
|             } | ||||
|             else { | ||||
|                 status = false; | ||||
|             } | ||||
|         } | ||||
|         else if (arg == "-v" || arg == "--verbose") { | ||||
|             config.verbose = true; | ||||
|         } | ||||
|         else if (arg == "--version") { | ||||
|             std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n"; | ||||
|             std::cout << "version: " << version << "\n\n"; | ||||
|             exit(0); | ||||
|         } | ||||
|         else if (arg == "-h" || arg == "--help") { | ||||
|             std::cout << STR(TARGET) << " - A GPU accelerated bitonic sort utility (V" << STR(CODE_VERSION)<< ") \n\n"; | ||||
|             std::cout << "  " << STR(TARGET) << " -q <N> -b <N> [--validation] [--perf <N>] [-v]\n"; | ||||
|             std::cout << "  " << STR(TARGET) << " -h\n"; | ||||
|             std::cout << "  " << STR(TARGET) << " --version\n"; | ||||
|             std::cout << '\n'; | ||||
|             std::cout << "Options:\n\n"; | ||||
|             std::cout << "   -q | --array-size <N>\n"; | ||||
|             std::cout << "      Selects the array size according to size = 2^N\n"; | ||||
|             std::cout << "      [Size must be larger than 2 * blockSize]\n"; | ||||
|             std::cout << "      [Default is 2^16]\n\n"; | ||||
|             std::cout << "   -b | --block-size <N>\n"; | ||||
|             std::cout << "      Selects the number of CUDA threads per block\n"; | ||||
|             std::cout << "      [Size has to be a multiple of the device's warp size (usually 32)]\n"; | ||||
|             std::cout << "      [Default is the maximum device supported number. For ex: (GTX 1650) block-size=1024]\n\n"; | ||||
|             std::cout << "   --validation\n"; | ||||
|             std::cout << "      Request a full validation at the end\n\n"; | ||||
|             std::cout << "   --perf <N> \n"; | ||||
|             std::cout << "      Enable performance timing measurements and prints, and repeat\n"; | ||||
|             std::cout << "      the sorting <N> times.\n\n"; | ||||
|             std::cout << "   -v | --verbose\n"; | ||||
|             std::cout << "      Request a more verbose output to stdout.\n\n"; | ||||
|             std::cout << "   -h | --help\n"; | ||||
|             std::cout << "      Prints this and exit.\n\n"; | ||||
|             std::cout << "   --version\n"; | ||||
|             std::cout << "      Prints version and exit.\n\n"; | ||||
|             std::cout << "Examples:\n\n"; | ||||
|             std::cout << "  " << STR(TARGET) << " -q 24\n"; | ||||
|             std::cout << "      Runs bitonic sort on a 2^24-point array, using GPU acceleration\n\n"; | ||||
|             std::cout << "  " << STR(TARGET) << " --validation --perf 5 -b 512 -q 26\n"; | ||||
|             std::cout << "      Runs bitonic sort on a 2^26-point array 5 times, using GPU acceleration with\n"; | ||||
|             std::cout << "      512 threads per block, performs a validation check at the end and prints the time\n"; | ||||
|             std::cout << "      of the median.\n\n"; | ||||
| 
 | ||||
|             exit(0); | ||||
|         } | ||||
|         else {   // parse error
 | ||||
|             std::cout << "Invocation error. Try -h for details.\n"; | ||||
|             status = false; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     // Check configuration requirements
 | ||||
|     if (config.blockSize % device.warpSize) | ||||
|     	throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n"); | ||||
|     if (config.arraySize < 2*config.blockSize) | ||||
|     	throw std::runtime_error("[Config] - Unsupported array size (smaller than " | ||||
|                                  + std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n"); | ||||
|     if (device.totalGlobalMem < config.arraySize * sizeof(Value_t)) | ||||
|     	throw std::runtime_error("[CUDA] - Unsupported array size: " | ||||
|                                  + std::to_string(config.arraySize * sizeof(Value_t)) | ||||
|                                  + " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n"); | ||||
| 
 | ||||
|     return status; | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * A simple validator for the sorted result | ||||
|  * | ||||
|  * @tparam DataT    A buffer type with random access iterator. | ||||
|  * | ||||
|  * @param data      [DataT] The data | ||||
|  * @return          [bool]  True if sorted in ascending order | ||||
|  */ | ||||
| template<typename DataT> | ||||
| bool validator(DataT& data) { | ||||
|     return std::is_sorted(data.begin(), data.end()); | ||||
| } | ||||
| 
 | ||||
| /*!
 | ||||
|  * Initializes the environment, must be called once at program start | ||||
|  * | ||||
|  * @param argc  [int*]      POINTER to main's argc argument | ||||
|  * @param argv  [char***]   POINTER to main's argv argument | ||||
|  */ | ||||
| void init(int* argc, char*** argv) { | ||||
| 
 | ||||
|     // Get device configuration
 | ||||
|     if (cudaGetDeviceProperties(&device, 0) != cudaSuccess) | ||||
|         throw std::runtime_error("[CUDA] - Can not read GPU"); | ||||
| 
 | ||||
|     config.blockSize = static_cast<size_t>(device.maxThreadsPerBlock); | ||||
| 
 | ||||
|     // try to read command line
 | ||||
|     if (!get_options(*argc, *argv)) | ||||
|         exit(1); | ||||
| 
 | ||||
|     // Prepare vector and timing data
 | ||||
|     Data.resize(config.arraySize); | ||||
|     measurements_init(); | ||||
| } | ||||
| 
 | ||||
| #if !defined TESTING | ||||
| /*!
 | ||||
|  * @return Returns 0, but.... we may throw or exit(0) / exit(1) | ||||
|  */ | ||||
| int main(int argc, char* argv[]) try { | ||||
| 
 | ||||
|     // Init everything
 | ||||
|     init(&argc, &argv); | ||||
| 
 | ||||
|     logger << "Array size:     " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize))<< ")" << logger.endl; | ||||
|     logger << "Repeated sorts: " << config.perf << logger.endl; | ||||
|     logger << "GPU:            " << device.name << logger.endl; | ||||
|     logger << "Block size:     " << config.blockSize << logger.endl; | ||||
| 
 | ||||
|     for (size_t it = 0 ; it < config.perf ; ++it) { | ||||
|         // Initialize local data
 | ||||
|         logger << "Initialize array ... "; | ||||
|         std::uniform_int_distribution<Value_t > dis( | ||||
|                 std::numeric_limits<Value_t>::min(), | ||||
|                 std::numeric_limits<Value_t>::max() | ||||
|         ); | ||||
|         std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); }); | ||||
|         logger << " Done." << logger.endl; | ||||
| 
 | ||||
|         // Run the sort on the GPU
 | ||||
|         logger << "Start sorting ...    "; | ||||
|         Timer_total.start(); | ||||
|         bitonicSort(Data); | ||||
|         Timer_total.stop(); | ||||
|         measurements_next(); | ||||
|         logger << " Done." << logger.endl; | ||||
|     } | ||||
| 
 | ||||
|     // Print-outs and validation
 | ||||
|     if (config.perf > 1) { | ||||
|         Timing::print_duration(Timer_total.median(),  "Total   "); | ||||
|         Timing::print_duration(Timer_memory.median(), "Mem-xch "); | ||||
|         Timing::print_duration(Timer_sorting.median(),"Sorting "); | ||||
|     } | ||||
|     if (config.validation) { | ||||
|         // If requested, we have the chance to fail!
 | ||||
|         std::cout << "[Validation] Results validation ..."; | ||||
|         bool val = validator(Data); | ||||
|         std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[31m [FAILED] \x1B[0m\n"); | ||||
|     } | ||||
|     return 0; | ||||
| } | ||||
| catch (std::exception& e) { | ||||
|     //we probably pollute the user's screen. Comment `cerr << ...` if you don't like it.
 | ||||
|     std::cerr << "Error: " << e.what() << '\n'; | ||||
|     exit(1); | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| #include <gtest/gtest.h> | ||||
| #include <exception> | ||||
| 
 | ||||
| /*!
 | ||||
|  * The testing version of our program | ||||
|  */ | ||||
| GTEST_API_ int main(int argc, char **argv) try { | ||||
|    testing::InitGoogleTest(&argc, argv); | ||||
|    return RUN_ALL_TESTS(); | ||||
| } | ||||
| catch (std::exception& e) { | ||||
|     std::cout << "Exception: " << e.what() << '\n'; | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
							
								
								
									
homework_3/src/utils.hpp (new file, 157 lines)

@@ -0,0 +1,157 @@
| /**
 | ||||
|  * \file | ||||
|  * \brief   Utilities header | ||||
|  * | ||||
|  * \author | ||||
|  *    Christos Choutouridis AEM:8997 | ||||
|  *    <cchoutou@ece.auth.gr> | ||||
|  */ | ||||
| #ifndef UTILS_HPP_ | ||||
| #define UTILS_HPP_ | ||||
| 
 | ||||
| #include <vector> | ||||
| #include <iostream> | ||||
| #include <chrono> | ||||
| #include <unistd.h> | ||||
| #include <algorithm> | ||||
| #include <cstdio> | ||||
| 
 | ||||
| #include "config.h" | ||||
| 
 | ||||
| /*!
 | ||||
|  * Stringify preprocessor util | ||||
|  */ | ||||
| #define STR(s)		STR_(s) | ||||
| #define STR_(s)		#s | ||||
| 
 | ||||
| /*!
 | ||||
|  * A Logger for entire program. | ||||
|  */ | ||||
| struct Log { | ||||
|     struct Endl {} endl;    //!< A tag object used to request a new line.
 | ||||
| 
 | ||||
|     //! We provide logging via << operator
 | ||||
|     template<typename T> | ||||
|     Log &operator<<(T &&t) { | ||||
|         if (config.verbose) { | ||||
|             if (line_) { | ||||
|                 std::cout << "[Log]: " << t; | ||||
|                 line_ = false; | ||||
|             } else | ||||
|                 std::cout << t; | ||||
|         } | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     // overload for special end line handling
 | ||||
|     Log &operator<<(Endl e) { | ||||
|         (void) e; | ||||
|         if (config.verbose) { | ||||
|             std::cout << '\n'; | ||||
|             line_ = true; | ||||
|         } | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
| private: | ||||
|     bool line_{true}; | ||||
| }; | ||||
| 
 | ||||
| extern Log logger; | ||||
| 
 | ||||
| /*!
 | ||||
|  * A small timing utility based on chrono that supports timing rounds | ||||
|  * and reporting their median. Time accumulates into the measurement | ||||
|  * of the current round. | ||||
|  */ | ||||
| struct Timing { | ||||
|     using Tpoint = std::chrono::steady_clock::time_point; | ||||
|     using Tduration = std::chrono::microseconds; | ||||
|     using microseconds = std::chrono::microseconds; | ||||
|     using milliseconds = std::chrono::milliseconds; | ||||
|     using seconds = std::chrono::seconds; | ||||
| 
 | ||||
|     //! Setup measurement rounds
 | ||||
|     void init(size_t rounds) { | ||||
|         duration_.resize(rounds); | ||||
|         for (auto& d : duration_) | ||||
|             d = Tduration::zero(); | ||||
|     } | ||||
| 
 | ||||
|     //! tool to mark the starting point
 | ||||
|     Tpoint start() noexcept { return mark_ = std::chrono::steady_clock::now(); } | ||||
| 
 | ||||
|     //! tool to mark the ending point
 | ||||
|     Tpoint stop() noexcept { | ||||
|         Tpoint now = std::chrono::steady_clock::now(); | ||||
|         duration_[current_] += dt(now, mark_); | ||||
|         return now; | ||||
|     } | ||||
| 
 | ||||
|     //! Switch timing slot
 | ||||
|     void next() noexcept { | ||||
|         ++current_; | ||||
|         current_ %= duration_.size(); | ||||
|     } | ||||
| 
 | ||||
|     Tduration& median() noexcept { | ||||
|         std::sort(duration_.begin(), duration_.end()); | ||||
|         return duration_[duration_.size()/2]; | ||||
|     } | ||||
| 
 | ||||
|     //! A duration calculation utility
 | ||||
|     static Tduration dt(Tpoint t2, Tpoint t1) noexcept { | ||||
|         return std::chrono::duration_cast<Tduration>(t2 - t1); | ||||
|     } | ||||
| 
 | ||||
|     //! Tool to print the time interval
 | ||||
|     static void print_duration(const Tduration& duration, const char *what) noexcept { | ||||
|         if (std::chrono::duration_cast<microseconds>(duration).count() < 10000) | ||||
|             std::cout << "[Timing] " << what << ": " | ||||
|                       << std::to_string(std::chrono::duration_cast<microseconds>(duration).count()) << " [usec]\n"; | ||||
|         else if (std::chrono::duration_cast<milliseconds>(duration).count() < 10000) | ||||
|             std::cout << "[Timing] " << what << ": " | ||||
|                       << std::to_string(std::chrono::duration_cast<milliseconds>(duration).count()) << " [msec]\n"; | ||||
|         else { | ||||
|             char stime[26]; // fit ulong
 | ||||
|             auto sec  = std::chrono::duration_cast<seconds>(duration).count(); | ||||
|             auto msec = (std::chrono::duration_cast<milliseconds>(duration).count() % 1000) / 10;  // keep 2 digits
 | ||||
|             std::sprintf(stime, "%ld.%1ld", sec, msec); | ||||
|             std::cout << "[Timing] " << what << ": " << stime << " [sec]\n"; | ||||
|         } | ||||
| 
 | ||||
|     } | ||||
| 
 | ||||
| private: | ||||
|     size_t current_{0}; | ||||
|     Tpoint mark_{}; | ||||
|     std::vector<Tduration> duration_{1}; | ||||
| }; | ||||
| 
 | ||||
| /*!
 | ||||
|  * A "high level function"-like utility macro to forward a function call | ||||
|  * and accumulate the execution time to the corresponding timing object. | ||||
|  * | ||||
|  * @param   Tim     The Timing object [Needs to have methods start() and stop()] | ||||
|  * @param   Func    The function name | ||||
|  * @param   ...     The arguments to pass to function (the preprocessor way) | ||||
|  */ | ||||
| #define timeCall(Tim, Func, ...)    \ | ||||
|     Tim.start();                    \ | ||||
|     Func(__VA_ARGS__);              \ | ||||
|     Tim.stop();                     \ | ||||
| 
 | ||||
| 
 | ||||
| /*!
 | ||||
|  * A utility to check if a number is power of two | ||||
|  * | ||||
|  * @tparam Integral     The integral type of the number to check | ||||
|  * @param x             The number to check | ||||
|  * @return              True if it is power of 2, false otherwise | ||||
|  */ | ||||
| template <typename Integral> | ||||
| constexpr inline bool isPowerOfTwo(Integral x) noexcept { | ||||
|     return (!(x & (x - 1)) && x); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| #endif /* UTILS_HPP_ */ | ||||
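The `Timing` class above accumulates a duration per measurement round and reports the median round, which is how main.cpp implements `--perf <N>`. Below is a minimal usage sketch; it assumes compilation with nvcc (utils.hpp includes config.h, which pulls in <cuda_runtime.h>) and defines the globals that are normally provided by main.cpp.

// Minimal usage sketch for the Timing utility above; the timed work is faked with a sleep.
#include <thread>
#include "utils.hpp"

config_t config;          // globals expected by config.h / utils.hpp (normally defined in main.cpp)
Log      logger;

int main() {
    Timing timer;
    timer.init(5);                                   // 5 measurement rounds, as with --perf 5
    for (int round = 0; round < 5; ++round) {
        timer.start();
        std::this_thread::sleep_for(std::chrono::milliseconds(2));   // stand-in for the timed work
        timer.stop();                                // accumulate into the current round
        timer.next();                                // move on to the next round's slot
    }
    Timing::print_duration(timer.median(), "Example ");              // report the median round
    return 0;
}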
							
								
								
									
homework_3/test/gtest/gtest/gtest-all.cpp (new file, 11673 lines)

File diff suppressed because it is too large.
							
								
								
									
homework_3/test/gtest/gtest/gtest.h (new file, 17103 lines)

File diff suppressed because it is too large.
							
								
								
									
homework_3/test/tests.cpp (new file, 33 lines)

@@ -0,0 +1,33 @@
| /**
 | ||||
|  * \file | ||||
|  * \brief   PDS HW3 tests | ||||
|  * | ||||
|  * To run these tests, execute: | ||||
|  *  ... | ||||
|  * | ||||
|  * \author | ||||
|  *    Christos Choutouridis AEM:8997 | ||||
|  *    <cchoutou@ece.auth.gr> | ||||
|  */ | ||||
| 
 | ||||
| #include <gtest/gtest.h> | ||||
| 
 | ||||
| /*
 | ||||
|  * Global fixtures | ||||
|  */ | ||||
| 
 | ||||
| class TCUDAbitonic : public ::testing::Test { | ||||
| protected: | ||||
|     static void SetUpTestSuite() { } | ||||
| 
 | ||||
|     static void TearDownTestSuite() { } | ||||
| }; | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
|  * | ||||
|  */ | ||||
| TEST_F(TCUDAbitonic, test1) { | ||||
| 
 | ||||
|     EXPECT_EQ(true, true); | ||||
| } | ||||
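tests.cpp above ships only a placeholder assertion. A possible follow-up test is sketched below; it assumes a TESTING build that compiles the test sources with nvcc together with bitonicsort.hpp and provides `config`, `device`, the timers and `logger` the same way main.cpp does, and it is not part of the committed file.

// Hypothetical additional test for tests.cpp (not part of the diff above).
#include <gtest/gtest.h>
#include <algorithm>
#include <random>
#include "bitonicsort.hpp"

TEST_F(TCUDAbitonic, SortsRandomArrayAscending) {
    config.arraySize = 1 << 12;                 // 4096 elements, comfortably > 2 * blockSize
    config.blockSize = 256;

    Data_t data(config.arraySize);
    std::mt19937 gen(42);                       // fixed seed for reproducibility
    std::uniform_int_distribution<Value_t> dis(0, Value_t(1) << 30);
    std::generate(data.begin(), data.end(), [&]() { return dis(gen); });

    bitonicSort(data);                          // device sort under test
    EXPECT_TRUE(std::is_sorted(data.begin(), data.end()));
}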