diff --git a/homework_3/Makefile b/homework_3/Makefile
index dae2b51..75914d9 100644
--- a/homework_3/Makefile
+++ b/homework_3/Makefile
@@ -22,7 +22,7 @@ PROJECT := PDS_homework_3
 # Excecutable's name
-TARGET := bitonic
+TARGET := bitonicCUDA
 
 # Source directories list(space seperated). Makefile-relative path, UNDER current directory.
 SRC_DIR_LIST := src #test test/gtest
 
@@ -52,7 +52,7 @@ REL_CXXFLAGS := -O3 -std=c++17 -Xcompiler "-Wall -Wextra"
 # Pre-defines
 # PRE_DEFS := MYCAB=1729 SUPER_MODE
-PRE_DEFS :=
+PRE_DEFS := TARGET=$(TARGET)
 
 # ============== Linker settings ==============
 # Linker flags (example: -pthread -lm)
@@ -83,17 +83,14 @@ DOCKER :=
 CSIZE := size
 
 CFLAGS := $(DEB_CFLAGS)
 CXXFLAGS := $(DEB_CXXFLAGS)
-CXX := g++ #mpic++
-CC := gcc #mpicc
+CXX := g++
+CC := gcc
 LINKER := g++
 
 #
 # =========== Main body and Patterns ===========
 #
-#ifeq ($(OS), Windows_NT)
-# TARGET := $(TARGET).exe
-#endif
 
 INC := $(foreach dir,$(INC_DIR_LIST),-I$(dir))
 DEF := $(foreach def,$(PRE_DEFS),-D$(def))
 EXC := $(foreach fil,$(EXC_FILE_LIST), \
@@ -111,38 +108,16 @@ OBJ := $(foreach file,$(SRC:%.cpp=%.o),$(OBJ_DIR)/$(file))
 DEP := $(foreach file,$(SRC:%.cpp=%.d),$(DEP_DIR)/$(file))
 
-# Make Dependencies pattern.
-# This little trick enables recompilation only when dependencies change
-# and it does so for changes both in source AND header files ;)
-#
-# It is based on Tom Tromey's method.
-#
-# Invoke cpp to create makefile rules with dependencies for each source file
-#$(DEP_DIR)/%.d: %.c
-#	@mkdir -p $(@D)
-#	@$(DOCKER) $(CC) -E $(CFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.c=.o) -MF $@ $<
-
 # c file objects depent on .c AND dependency files, which have an empty recipe
 $(OBJ_DIR)/%.o: %.c
 	@mkdir -p $(@D)
 	$(DOCKER) $(CC) -c $(CFLAGS) $(INC) $(DEF) -o $@ $<
 
-#$(DEP_DIR)/%.d: %.cpp
-#	@mkdir -p $(@D)
-#	@$(DOCKER) $(CXX) -E $(CXXFLAGS) $(INC) $(DEF) -MM -MT $(OBJ_DIR)/$(<:.cpp=.o) -MF $@ $<
-
 # cpp file objects depend on .cpp AND dependency files, which have an empty recipe
 $(OBJ_DIR)/%.o: %.cpp
 	@mkdir -p $(@D)
 	$(DOCKER) $(CXX) -c $(CXXFLAGS) $(INC) $(DEF) -o $@ $<
 
-# empty recipe for dependency files. This prevents make errors
-#$(DEP):
-
-# now include all dependencies
-# After all they are makefile dependency rules ;)
-#include $(wildcard $(DEP))
-
 # main target rule
 $(BUILD_DIR)/$(TARGET): $(OBJ)
 	@mkdir -p $(@D)
@@ -179,6 +154,39 @@ release: $(BUILD_DIR)/$(TARGET)
 # ================ Build rules =================
 #
+bitonic_v0deb: CC := nvcc -G -g -x cu
+bitonic_v0deb: CXX := nvcc -G -g -x cu
+bitonic_v0deb: LINKER := nvcc
+bitonic_v0deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V0
+bitonic_v0deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V0
+bitonic_v0deb: OUTPUT_DIR := $(OUTPUT_DIR)/v0
+bitonic_v0deb: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
+bitonic_v1deb: CC := nvcc -G -g -x cu
+bitonic_v1deb: CXX := nvcc -G -g -x cu
+bitonic_v1deb: LINKER := nvcc
+bitonic_v1deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V1
+bitonic_v1deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V1
+bitonic_v1deb: OUTPUT_DIR := $(OUTPUT_DIR)/v1
+bitonic_v1deb: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
+bitonic_v2deb: CC := nvcc -G -g -x cu
+bitonic_v2deb: CXX := nvcc -G -g -x cu
+bitonic_v2deb: LINKER := nvcc
+bitonic_v2deb: CFLAGS := $(DEB_CFLAGS) -DCODE_VERSION=V2
+bitonic_v2deb: CXXFLAGS := $(DEB_CXXFLAGS) -DCODE_VERSION=V2
+bitonic_v2deb: OUTPUT_DIR := $(OUTPUT_DIR)/v2
+bitonic_v2deb: $(BUILD_DIR)/$(TARGET)
+	@mkdir -p $(OUTPUT_DIR)
+	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
+
+
 bitonic_v0: CC := nvcc -x cu
 bitonic_v0: CXX := nvcc -x cu
@@ -201,7 +209,6 @@ bitonic_v1: $(BUILD_DIR)/$(TARGET)
 	@mkdir -p $(OUTPUT_DIR)
 	cp $(BUILD_DIR)/$(TARGET) $(OUTPUT_DIR)/$(TARGET)
-
 bitonic_v2: CC := nvcc -x cu
 bitonic_v2: CXX := nvcc -x cu
 bitonic_v2: LINKER := nvcc
diff --git a/homework_3/src/bitonicsort.hpp b/homework_3/src/bitonicsort.hpp
index e20031e..fbcf924 100644
--- a/homework_3/src/bitonicsort.hpp
+++ b/homework_3/src/bitonicsort.hpp
@@ -21,7 +21,7 @@
 /*
  * Exported timers
  */
-extern Timing Timer_total;
+extern Timing Timer_total, Timer_memory, Timer_sorting;
 
 using threadId_t = size_t;
 
@@ -74,9 +74,41 @@ __device__ inline bool keepSmall(threadId_t tid, threadId_t partner, size_t stag
  * ============================== Sort algorithms ==============================
  */
 
+/*!
+ * Each thread can handle 2 points in the array. For each of these 2 points it may
+ *  - compare and exchange if needed
+ *  - copy data to local and back if needed
+ */
+static constexpr size_t SizeToThreadsRatio = 2;
+
+/*!
+ * Calculates the blocks needed for the entire sorting process
+ *
+ * @note
+ * This "redundant" little trick makes sure blocks are allocated for arraySizes that are not exact
+ * multiples of config.blockSize.
+ * Even if we don't need it, we keep it in case we experiment with weird sizes in the future!
+ *
+ * @param arraySize [ArraySize_t] The size of the entire array (in points)
+ * @return [size_t] The number of blocks
+ */
+inline size_t NBlocks(ArraySize_t arraySize) {
+    return (((arraySize + config.blockSize - 1) / config.blockSize) / SizeToThreadsRatio);
+}
+
+
+/*!
+ * Exchange utility
+ *
+ * @tparam ValueT The underlying data type of the array items
+ *
+ * @param data [ValueT*] Pointer to data array
+ * @param tid [threadId_t] Current thread's index to data
+ * @param partner [threadId_t] Partner's index to data
+ * @param keepSmall [bool] Flag to indicate whether the current thread keeps the smaller value
+ */
 template <typename ValueT>
-__device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
+__device__ void exchange(ValueT* data, threadId_t tid, threadId_t partner, bool keepSmall) {
     if (( keepSmall && (data[tid] > data[partner])) ||
         (!keepSmall && (data[tid] < data[partner])) ) {
         ValueT temp = data[tid];
@@ -86,13 +118,24 @@ __device__ void exchange(ValueT* data, int tid, int partner, bool keepSmall) {
 }
 
 #if CODE_VERSION == V0
+
+/*!
+ * This is the body of each thread. This function compares and exchanges data
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Compute global thread ID
+    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Keep contiguous addressing to the first half of the array
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += n / SizeToThreadsRatio;
+        pid += n / SizeToThreadsRatio;
     }
     if ((tid < n) && (pid < n)) {      // Boundary check
         bool keep = keepSmall(tid, pid, stage);
@@ -102,18 +145,11 @@ __global__ void bitonicStep(ValueT* data, size_t n, size_t step, size_t stage) {
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type that holds the data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
 
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
+    Timer_memory.stop();
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
 
     size_t Stages = static_cast<size_t>(log2(size));
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         for (size_t step = stage; step > 0; ) {
             --step;
-            bitonicStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+            bitonicStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
     cudaFree(dev_data);
+    Timer_memory.stop();
 }
 
 #elif CODE_VERSION == V1
 
+/*!
+ * This is the body of each thread. This function compares and exchanges data
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Compute global thread ID
+    /*
+     * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
+     * and we use the neighbor block address indices for the other half of the threads
+     */
+    threadId_t tid = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += blockDim.x;
+        pid += blockDim.x;
     }
     if ((tid < n) && (pid < n)) {      // Boundary check
         bool keep = keepSmall(tid, pid, stage);
@@ -156,12 +215,29 @@ __device__ void interBlockStep_(ValueT* data, size_t n, size_t step, size_t stag
     }
 }
 
+/*!
+ * This is the version of the body that is called outside of the unrolled loop
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
     interBlockStep_(data, n, step, stage);
 }
-
+/*!
+ * This is the unrolled part of the bitonic double loop.
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param innerSteps [size_t] The number of inner steps of the current stage to unroll
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
     for (size_t step = innerSteps + 1; step > 0; ) {
@@ -172,18 +248,11 @@ __global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t st
 }
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam ShadowedDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type that holds the data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
 
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
+    Timer_memory.stop();
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int HalfNblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
 
-    auto Stages = static_cast<size_t>(log2(size));
-    auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+    auto Stages = static_cast<size_t>(log2(size));
+    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         size_t step = stage - 1;
         for ( ; step > InnerBlockSteps; --step) {
-            interBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
-        inBlockStep<<<HalfNblocks, Nthreads>>>(dev_data, size, step, stage);
+        inBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
         cudaDeviceSynchronize();
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
     cudaFree(dev_data);
+    Timer_memory.stop();
 }
+
 
 #elif CODE_VERSION == V2
 
+/*!
+ * @return The memory (in data points) that each block's local threads can affect.
+ *
+ * @note
+ * Each block's threads can exchange twice as many data points as the block has threads.
+ */
+inline size_t effectiveBlockSize() { return SizeToThreadsRatio * config.blockSize; }
+
+
+
+/*!
+ * Converts the global address of the data to the local shared memory array which is used
+ * as cache memory for the unrolled part of the bitonic sort loop.
+ *
+ * @note
+ * Each block's threads can exchange twice as many data points as the block has threads.
+ * These points get copied (cached) in the shared memory location. We use contiguous blocks
+ * both in global data memory and the shared memory buffer.
+ *
+ * @param gIndex The global array index
+ * @param blockDim The block size (threads per block)
+ * @return The equivalent local address in the shared memory
+ */
+__device__ inline size_t toLocal(size_t gIndex, size_t blockDim) {
+    return gIndex % (SizeToThreadsRatio * blockDim);
+}
+
+
+/*!
+ * This is the version of the body that is called outside of the unrolled loop
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param step [size_t] The current step of the current stage of bitonic sort
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
 __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage) {
-    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Compute global thread ID
+    threadId_t tid = threadIdx.x + blockIdx.x * blockDim.x;    // Keep contiguous addressing to the first half of the array
     threadId_t pid = partner(tid, step);
     if (tid > pid) {
-        tid += n >> 1;
-        pid += n >> 1;
+        // Shift to the other half of the array for global data
+        tid += n / SizeToThreadsRatio;
+        pid += n / SizeToThreadsRatio;
     }
     if ((tid < n) && (pid < n)) {      // Boundary check
         bool keep = keepSmall(tid, pid, stage);
@@ -230,57 +346,72 @@ __global__ void interBlockStep(ValueT* data, size_t n, size_t step, size_t stage
     }
 }
 
-
+/*!
+ * This is the unrolled part of the bitonic double loop.
+ *
+ * First each thread caches its corresponding data point from the current and the following data block.
+ * After that we execute the loop unrolling on the local data and then we write back to global memory.
+ *
+ * @tparam ValueT The underlying data type of the array items
+ * @param data [ValueT*] Pointer to data array
+ * @param n [size_t] The total size of the array
+ * @param innerSteps [size_t] The number of inner steps of the current stage to unroll
+ * @param stage [size_t] The current stage of bitonic sort
+ */
 template <typename ValueT>
-__global__ void inBlockStep(ValueT* data, size_t n, size_t nthreads, size_t innerSteps, size_t stage, int *mutex) {
+__global__ void inBlockStep(ValueT* data, size_t n, size_t innerSteps, size_t stage) {
     extern __shared__ ValueT shared_data[];
 
+    /*
+     * Global and local (shared) memory indices (calculated once)
+     * Here we skip blocks every time (one for SizeToThreadsRatio = 2)
+     * and we cache the neighbor block address indices in local (shared) memory
+     */
+    threadId_t gIdx0 = threadIdx.x + SizeToThreadsRatio * blockIdx.x * blockDim.x;
+    threadId_t lIdx0 = toLocal(gIdx0, blockDim.x);
+
+    if (gIdx0 + blockDim.x >= n)    // Boundary check
+        return;
+
+    // Fetch to local memory the entire effective block size (2 positions for each thread)
+    shared_data[lIdx0] = data[gIdx0];
+    shared_data[lIdx0 + blockDim.x] = data[gIdx0 + blockDim.x];
+    __syncthreads();
+
     for (size_t step = innerSteps + 1; step > 0; ) {
         --step;
-        // Global memory thread and partner ids
-        threadId_t Tid = threadIdx.x + blockIdx.x * blockDim.x;
-        threadId_t Pid = partner(Tid, step);
-        if (Tid > Pid) {
-            Tid += n >> 1;
-            Pid += n >> 1;
+        // Init thread global and local indices
+        threadId_t gIdx = gIdx0;
+        threadId_t lIdx = lIdx0;
+        // Find partner and keep-small configuration based on the global data positions
+        threadId_t pIdx = partner(gIdx, step);
+        if (gIdx > pIdx) {
+            // Shift inside effective block
+            gIdx += blockDim.x;     // global
+            pIdx += blockDim.x;
+            lIdx += blockDim.x;     // local
         }
+        bool keep = keepSmall(gIdx, pIdx, stage);
 
-        if ((Tid < n) && (Pid < n)) {      // Boundary check
-            // Global to local index resolution
-            threadId_t tid = (Tid> 1))*nthreads)%(2*nthreads));
-            threadId_t pid = tid + 1;
-            // Fetch to local memory
-            shared_data[tid] = data[Tid];
-            shared_data[pid] = data[Pid];
-            __syncthreads();
-
-            bool keep = keepSmall(Tid, Pid, stage);
-            exchange(shared_data, tid, pid, keep);
-            __syncthreads();
-
-            // Write back to global memory
-            data[Tid] = shared_data[tid];
-            data[Pid] = shared_data[pid];
-            __syncthreads();
-        }
+        // Exchange data on the local (shared) copy
+        threadId_t lpIdx = toLocal(pIdx, blockDim.x);
+        exchange(shared_data, lIdx, lpIdx, keep);
+        __syncthreads();
     }
+    // Write back to global memory
+    data[gIdx0] = shared_data[lIdx0];
+    data[gIdx0 + blockDim.x] = shared_data[lIdx0 + blockDim.x];
+    __syncthreads();
 }
 
 /*!
- * A distributed version of the Bitonic sort algorithm.
+ * A CUDA version of the Bitonic sort algorithm.
  *
- * @note
- * Each MPI process should run an instance of this function.
- *
- * @tparam dDataT A Shadowed buffer type with random access iterator.
- *
- * @param data [ShadowedDataT] The local to MPI process data to sort
- * @param Processes [mpi_id_t] The total number of MPI processes
- * @param rank [mpi_id_t] The current process id
+ * @tparam DataT A container type that holds the data array. Should have .data() and .size() methods
+ * @param data [DataT&] Reference to the container to sort
  */
-
 template <typename DataT>
 void bitonicSort(DataT& data) {
     using value_t = typename DataT::value_type;
 
     value_t* dev_data;
     auto size = data.size();
 
-    cudaMalloc(&dev_data, size * sizeof(value_t));
-    cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice);
+    Timer_memory.start();
+    if (cudaMalloc(&dev_data, size * sizeof(value_t)) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot allocate memory\n");
+    if (cudaMemcpy(dev_data, data.data(), size * sizeof(value_t), cudaMemcpyHostToDevice) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory to device\n");
+    Timer_memory.stop();
 
-    int* d_mutex;
-    cudaMalloc(&d_mutex, sizeof(int));
-    cudaMemset(d_mutex, 0, sizeof(int)); // init mutex
+    size_t Nth = config.blockSize;
+    size_t Nbl = NBlocks(size);
+    size_t kernelMemSize = effectiveBlockSize() * sizeof(value_t);
 
-    int Nthreads = THREADS_PER_BLOCK;
-    int Nblocks = ((size + Nthreads - 1) / Nthreads) >> 1;
-
-    auto Stages = static_cast<size_t>(log2(size));
-    auto InnerBlockSteps = static_cast<size_t>(log2(IN_BLOCK_THRESHOLD));
+    auto Stages = static_cast<size_t>(log2(size));
+    auto InnerBlockSteps = static_cast<size_t>(log2(Nth));
+    Timer_sorting.start();
     for (size_t stage = 1; stage <= Stages; ++stage) {
         size_t step = stage - 1;
         for ( ; step > InnerBlockSteps; --step) {
-            interBlockStep<<<Nblocks, Nthreads>>>(dev_data, size, step, stage);
+            interBlockStep<<<Nbl, Nth>>>(dev_data, size, step, stage);
             cudaDeviceSynchronize();
         }
-        inBlockStep<<>>(dev_data, size, Nthreads, step, stage, d_mutex);
+        inBlockStep<<<Nbl, Nth, kernelMemSize>>>(dev_data, size, step, stage);
         cudaDeviceSynchronize();
     }
+    Timer_sorting.stop();
 
-    cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost);
+    Timer_memory.start();
+    if (cudaMemcpy(data.data(), dev_data, size * sizeof(value_t), cudaMemcpyDeviceToHost) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot copy memory from device\n");
     cudaFree(dev_data);
+    Timer_memory.stop();
 }
 
 #endif
diff --git a/homework_3/src/config.h b/homework_3/src/config.h
index dec8b68..5423f1a 100644
--- a/homework_3/src/config.h
+++ b/homework_3/src/config.h
@@ -1,6 +1,6 @@
 /*!
  * \file
- * \brief Build configuration file.
+ * \brief Build and runtime configuration file.
  *
  * \author
  *    Christos Choutouridis AEM:8997
@@ -11,19 +11,20 @@
 #define CONFIG_H_
 
 #include
+#include
 
 /*
  * Versioning:
- *  - RC1:
+ *  - RC1: First version to test on HPC
 */
-static constexpr char version[] = "0.0";
+static constexpr char version[] = "0.1";
 
 /*
  * Defines for different version of the exercise
 */
-#define V0 (0)
-#define V1 (1)
-#define V2 (2)
+#define V0 0
+#define V1 1
+#define V2 2
 
 // Fail-safe version selection
 #if !defined CODE_VERSION
@@ -33,8 +34,9 @@ static constexpr char version[] = "0.0";
 // Default Data size (in case -q is not present)
 static constexpr size_t DEFAULT_DATA_SIZE = 1 << 16;
 
+// Placeholder default (actual default comes from device properties read at initialization)
 static constexpr size_t THREADS_PER_BLOCK = 1024;
-static constexpr size_t IN_BLOCK_THRESHOLD = 512;
+
 
 /*!
  * Value and Buffer type selection
 *
@@ -51,6 +53,11 @@
 using Value_t = uint32_t;
 using Data_t = std::vector<Value_t>;
 
+/*!
+ * In theory we can support large arrays ;)
+ */
+using ArraySize_t = uint64_t;
+
 /*!
  * Session option for each invocation of the executable.
 *
@@ -58,20 +65,18 @@ using Data_t = std::vector<Value_t>;
 *
 * @note
 * The values of the members are set from the command line.
 */
struct config_t {
-    size_t arraySize{DEFAULT_DATA_SIZE};    //!< The array size of the local data to sort.
-    bool exchangeOpt{false};                //!< Flag to request the exchange optimization
-    size_t pipeline{1UL};                   //!< Pipeline stages (1 to disable)
-    bool validation{false};                 //!< Request a full validation at the end, performed by process rank 0.
-    bool ndebug{false};                     //!< Skips debug trap on DEBUG builds.
-    size_t perf{1};                         //!< Enable performance timing measurements and prints and repeat
-                                            //!< the sorting times.
-    bool verbose{false};                    //!< Flag to enable verbose output to stdout.
+    ArraySize_t arraySize{DEFAULT_DATA_SIZE};   //!< The size of the array to sort.
+    size_t blockSize{THREADS_PER_BLOCK};        //!< The block size (threads per block) for the session.
+    bool validation{false};                     //!< Request a full validation at the end.
+    size_t perf{1};                             //!< Enable performance timing measurements and prints. Repeats
+                                                //!< the sorting that many times.
+    bool verbose{false};                        //!< Flag to enable verbose output to stdout.
 };
 
/*
 * Exported data types
 */
-extern config_t config;
-
+extern config_t config;
+extern cudaDeviceProp device;
 
#endif /* CONFIG_H_ */
diff --git a/homework_3/src/main.cpp b/homework_3/src/main.cpp
index 961d275..c93490f 100644
--- a/homework_3/src/main.cpp
+++ b/homework_3/src/main.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 
 #include "utils.hpp"
 #include "config.h"
@@ -18,22 +19,25 @@
 
 // Global session data
-Data_t Data = {3, 5, 1, 2, 4, 7, 8, 6};
+Data_t Data;
 config_t config;
 Log logger;
+cudaDeviceProp device;
 
 // Mersenne seeded from hw if possible. range: [type_min, type_max]
 std::random_device rd;
 std::mt19937 gen(rd());
 
 //! Performance timers for each one of the "costly" functions
-Timing Timer_total;
+Timing Timer_total, Timer_memory, Timer_sorting;
 
 //! Init timing objects for extra rounds
 void measurements_init() {
     if (config.perf > 1) {
         Timer_total.init(config.perf);
+        Timer_memory.init(config.perf);
+        Timer_sorting.init(config.perf);
     }
 }
 
@@ -41,6 +45,8 @@ void measurements_init() {
 void measurements_next() {
     if (config.perf > 1) {
         Timer_total.next();
+        Timer_memory.next();
+        Timer_sorting.next();
     }
 }
 
@@ -57,7 +63,15 @@ bool get_options(int argc, char* argv[]){
 
         if (arg == "-q" || arg == "--array-size") {
             if (i+1 < argc) {
-                config.arraySize = 1 << atoi(argv[++i]);
+                config.arraySize = (ArraySize_t)1 << atoi(argv[++i]);
             }
             else {
                 status = false;
             }
         }
+        else if (arg == "-b" || arg == "--block-size") {
+            if (i+1 < argc) {
+                config.blockSize = atoi(argv[++i]);
+            }
+            else {
+                status = false;
+            }
+        }
@@ -74,32 +88,34 @@ bool get_options(int argc, char* argv[]){
                 status = false;
             }
         }
-        else if (arg == "--ndebug") {
-            config.ndebug = true;
-        }
         else if (arg == "-v" || arg == "--verbose") {
             config.verbose = true;
         }
         else if (arg == "--version") {
-            std::cout << "bitonic - A GPU accelerated sort utility\n";
+            std::cout << STR(TARGET) << " - A GPU-accelerated bitonic sort utility (V" << STR(CODE_VERSION) << ")\n";
             std::cout << "version: " << version << "\n\n";
             exit(0);
         }
         else if (arg == "-h" || arg == "--help") {
-            std::cout << "distbitonic - A distributed sort utility\n\n";
-            std::cout << " distbitonic -q <N> [--validation] [--perf <N>] [--ndebug] [-v]\n";
-            std::cout << " distbitonic -h\n";
+            std::cout << STR(TARGET) << " - A GPU-accelerated bitonic sort utility (V" << STR(CODE_VERSION) << ")\n\n";
+            std::cout << " " << STR(TARGET) << " -q <N> -b <N> [--validation] [--perf <N>] [-v]\n";
+            std::cout << " " << STR(TARGET) << " -h\n";
+            std::cout << " " << STR(TARGET) << " --version\n";
             std::cout << '\n';
             std::cout << "Options:\n\n";
             std::cout << "  -q | --array-size <N>\n";
-            std::cout << "      Selects the array size according to size = 2^N\n\n";
+            std::cout << "      Selects the array size according to size = 2^N\n";
+            std::cout << "      [Size must be larger than 2 * blockSize]\n";
+            std::cout << "      [Default is 2^16]\n\n";
+            std::cout << "  -b | --block-size <N>\n";
+            std::cout << "      Selects the number of CUDA threads per block\n";
+            std::cout << "      [Size has to be a multiple of the device's warp size (usually 32)]\n";
+            std::cout << "      [Default is the maximum number supported by the device, e.g. block-size=1024 for a GTX 1650]\n\n";
             std::cout << "  --validation\n";
-            std::cout << "      Request a full validation at the end, performed by process rank 0\n\n";
+            std::cout << "      Request a full validation at the end\n\n";
             std::cout << "  --perf <N>\n";
             std::cout << "      Enable performance timing measurements and prints, and repeat\n";
             std::cout << "      the sorting times.\n\n";
-            std::cout << "  --ndebug\n";
-            std::cout << "      Skip debug breakpoint when on debug build.\n\n";
             std::cout << "  -v | --verbose\n";
             std::cout << "      Request a more verbose output to stdout.\n\n";
             std::cout << "  -h | --help\n";
@@ -107,8 +123,12 @@ bool get_options(int argc, char* argv[]){
             std::cout << "  --version\n";
             std::cout << "      Prints version and exit.\n\n";
             std::cout << "Examples:\n\n";
-            std::cout << "  bitonic -q 24\n";
-            std::cout << "      Runs bitonic with GPU acceleration with 2^24 array points\n\n";
+            std::cout << "  " << STR(TARGET) << " -q 24\n";
+            std::cout << "      Runs bitonic sort on a 2^24-point array, using GPU acceleration\n\n";
+            std::cout << "  " << STR(TARGET) << " --validation --perf 5 -b 512 -q 26\n";
+            std::cout << "      Runs bitonic sort on a 2^26-point array 5 times, using GPU acceleration with\n";
+            std::cout << "      512 threads per block, performs a validation check at the end and prints the\n";
+            std::cout << "      median time.\n\n";
 
             exit(0);
         }
@@ -118,6 +138,17 @@ bool get_options(int argc, char* argv[]){
         }
     }
 
+    // Check configuration requirements
+    if (config.blockSize % device.warpSize)
+        throw std::runtime_error("[Config] - Number of threads per block is not an exact multiple of warp size\n");
+    if (config.arraySize < 2*config.blockSize)
+        throw std::runtime_error("[Config] - Unsupported array size (smaller than "
+                + std::to_string(SizeToThreadsRatio*config.blockSize) + ")\n");
+    if (device.totalGlobalMem < config.arraySize * sizeof(Value_t))
+        throw std::runtime_error("[CUDA] - Unsupported array size: "
+                + std::to_string(config.arraySize * sizeof(Value_t))
+                + " (larger than GPU's: " + std::to_string(device.totalGlobalMem) + ")\n");
+
     return status;
 }
 
@@ -141,6 +172,13 @@ bool validator(DataT& data) {
 * @param argv [char***] POINTER to main's argv argument
 */
void init(int* argc, char*** argv) {
+
+    // Get device configuration
+    if (cudaGetDeviceProperties(&device, 0) != cudaSuccess)
+        throw std::runtime_error("[CUDA] - Cannot read GPU");
+
+    config.blockSize = static_cast<size_t>(device.maxThreadsPerBlock);
+
    // try to read command line
    if (!get_options(*argc, *argv))
        exit(1);
@@ -159,16 +197,23 @@ int main(int argc, char* argv[]) try {
    // Init everything
    init(&argc, &argv);
 
+    logger << "Array size: " << config.arraySize << " (Q=" << static_cast<size_t>(log2(config.arraySize)) << ")" << logger.endl;
+    logger << "Repeated sorts: " << config.perf << logger.endl;
+    logger << "GPU: " << device.name << logger.endl;
+    logger << "Block size: " << config.blockSize << logger.endl;
+
    for (size_t it = 0 ; it < config.perf ; ++it) {
        // Initialize local data
-        logger << "Initialize local array of " << config.arraySize << " elements" << logger.endl;
+        logger << "Initialize array ... ";
        std::uniform_int_distribution<Value_t> dis(
            std::numeric_limits<Value_t>::min(),
            std::numeric_limits<Value_t>::max()
        );
        std::generate(Data.begin(), Data.end(), [&]() { return dis(gen); });
+        logger << " Done." << logger.endl;
+
        // Run distributed sort
-        logger << "Starting distributed sorting ... ";
+        logger << "Start sorting ... ";
        Timer_total.start();
        bitonicSort(Data);
        Timer_total.stop();
@@ -178,20 +223,15 @@ int main(int argc, char* argv[]) try {
 
    // Print-outs and validation
    if (config.perf > 1) {
-        Timing::print_duration(Timer_total.median(), "Total");
+        Timing::print_duration(Timer_total.median(), "Total ");
+        Timing::print_duration(Timer_memory.median(), "Mem-xch ");
+        Timing::print_duration(Timer_sorting.median(), "Sorting ");
    }
    if (config.validation) {
        // If requested, we have the chance to fail!
        std::cout << "[Validation] Results validation ...";
        bool val = validator(Data);
-        std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[32m [FAILED] \x1B[0m\n");
-        if (Data.size() < 128) {
-            std::cout << "Data: ";
-            for (auto& v : Data) {
-                std::cout << (int)v << ", ";
-            }
-            std::cout << '\n';
-        }
+        std::cout << ((val) ? "\x1B[32m [PASSED] \x1B[0m\n" : " \x1B[31m [FAILED] \x1B[0m\n");
    }
    return 0;
}
diff --git a/homework_3/src/utils.hpp b/homework_3/src/utils.hpp
index bf06ef0..520cd5c 100644
--- a/homework_3/src/utils.hpp
+++ b/homework_3/src/utils.hpp
@@ -17,6 +17,11 @@
 #include "config.h"
 
+/*!
+ * Stringify preprocessor util
+ */
+#define STR(s) STR_(s)
+#define STR_(s) #s
 
 /*!
  * A Logger for entire program.