diff --git a/A2/Detection_cercle/A Circle Hough Transform Implementation Using High-Level Synthesi.pdf b/A2/Detection_cercle/A Circle Hough Transform Implementation Using High-Level Synthesi.pdf
new file mode 100644
index 0000000..3fce4ab
Binary files /dev/null and b/A2/Detection_cercle/A Circle Hough Transform Implementation Using High-Level Synthesi.pdf differ
diff --git a/A2/Detection_cercle/car.bmp b/A2/Detection_cercle/car.bmp
new file mode 100644
index 0000000..3f124c7
Binary files /dev/null and b/A2/Detection_cercle/car.bmp differ
diff --git a/A2/Detection_cercle/car.jpg b/A2/Detection_cercle/car.jpg
new file mode 100644
index 0000000..dc49b08
Binary files /dev/null and b/A2/Detection_cercle/car.jpg differ
diff --git a/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/.suo b/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/.suo
new file mode 100644
index 0000000..9bc5f6e
Binary files /dev/null and b/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/.suo differ
diff --git a/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/Browse.VC.db b/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/Browse.VC.db
new file mode 100644
index 0000000..5049ffd
Binary files /dev/null and b/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/Browse.VC.db differ
diff --git a/A4/TP_GPU-master/TP2_reduction/windows/Reduce.vcxproj.user b/A4/TP_GPU-master/TP2_reduction/windows/Reduce.vcxproj.user
new file mode 100644
index 0000000..88a5509
--- /dev/null
+++ b/A4/TP_GPU-master/TP2_reduction/windows/Reduce.vcxproj.user
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup />
+</Project>
\ No newline at end of file
diff --git a/A4/TP_GPU-master/TP2_reduction/windows/x64/Release/Reduce.log b/A4/TP_GPU-master/TP2_reduction/windows/x64/Release/Reduce.log
new file mode 100644
index 0000000..2e64220
--- /dev/null
+++ b/A4/TP_GPU-master/TP2_reduction/windows/x64/Release/Reduce.log
@@ -0,0 +1 @@
+C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_GPU-master\TP2_reduction\windows\Reduce.vcxproj(55,5): error MSB4019: The imported project "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 4.2.props" was not found. Confirm that the expression in the Import declaration "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\\BuildCustomizations\CUDA 4.2.props" is correct, and that the file exists on disk.
diff --git a/A4/TP_OMP_GPU/Reduce_solution.cu b/A4/TP_OMP_GPU/Reduce_solution.cu
new file mode 100644
index 0000000..f5554bf
--- /dev/null
+++ b/A4/TP_OMP_GPU/Reduce_solution.cu
@@ -0,0 +1,577 @@
+/*
+# Copyright (c) 2011-2012 NVIDIA CORPORATION. All Rights Reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+*/
+
+#include <cuda_runtime.h>
+#include <iostream>
+#include <algorithm>
+#include <new>
+#include <omp.h>
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+#include "GpuTimer.h"
+
+#define CUDA_SAFE_CALL(call) \
+    { \
+        cudaError_t err_code = call; \
+        if( err_code != cudaSuccess ) { std::cerr << "Error (" << __FILE__ << ":" << __LINE__ << "): " << cudaGetErrorString(err_code) << std::endl; return 1; } \
+    }
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// G P U   R E D U C T I O N
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void reduce_kernel( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
+{
+    // Allocate shared memory inside the block.
+    extern __shared__ int s_mem[];
+
+    // The range of data to work with.
+    int2 range = block_ranges[blockIdx.x];
+
+    // Compute the sum of my elements.
+    int my_sum = 0;
+    for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += blockDim.x )
+        my_sum += in_buffer[idx];
+
+    // Copy my sum in shared memory.
+    s_mem[threadIdx.x] = my_sum;
+
+    // Make sure all the threads have copied their value in shared memory.
+    __syncthreads();
+
+    int offset;
+    // Compute the sum inside the block.
+    for( offset = blockDim.x / 2 ; offset > 16 ; offset /= 2 )
+    {
+        if( threadIdx.x < offset )
+            s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
+        __syncthreads();
+    }
+
+    // Inside warp 0, an explicit __syncthreads() is not necessary (warp-synchronous step).
+    for( ; offset > 0 ; offset /= 2 )
+    {
+        if( threadIdx.x < offset )
+            s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
+        //__syncthreads();
+    }
+
+    // The first thread of the block stores its result.
+    if( threadIdx.x == 0 )
+        out_buffer[blockIdx.x] = s_mem[0];
+}
+
+int reduce_on_gpu( int n, const int *a_device )
+{
+    // Compute the size of the grid.
+    const int BLOCK_DIM = 256;
+    const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
+    const int num_threads = BLOCK_DIM * grid_dim;
+
+    // Compute the number of elements per block.
+    const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
+
+    // Allocate memory for temporary buffers.
+    int  *partial_sums = NULL;
+    int2 *block_ranges = NULL;
+
+    CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
+    CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
+
+    // Compute the ranges for the blocks.
+    int sum = 0;
+    int2 *block_ranges_on_host = new int2[grid_dim];
+    for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
+    {
+        block_ranges_on_host[block_idx].x = sum;
+        block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
+    }
+    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
+    delete[] block_ranges_on_host;
+
+    // First round: Compute a partial sum for all blocks.
+    reduce_kernel<<<grid_dim, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( n, a_device, partial_sums, block_ranges );
+    CUDA_SAFE_CALL( cudaGetLastError() );
+
+    // Set the ranges for the second kernel call.
+    int2 block_range = make_int2( 0, grid_dim );
+    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
+
+    // Second round: Compute the final sum by summing the partial results of all blocks.
+    reduce_kernel<<<1, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( grid_dim, partial_sums, partial_sums, block_ranges );
+    CUDA_SAFE_CALL( cudaGetLastError() );
+
+    // Read the result from device memory.
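+    // Note: a device-to-host cudaMemcpy on the default stream waits for the
+    // preceding kernel launches to complete, so no explicit
+    // cudaDeviceSynchronize() is needed before reading the result.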
+    int result;
+    CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
+
+    // Free temporary memory.
+    CUDA_SAFE_CALL( cudaFree( block_ranges ) );
+    CUDA_SAFE_CALL( cudaFree( partial_sums ) );
+
+    return result;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// G P U   R E D U C T I O N :   O P T I M I Z E D   V E R S I O N
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define WARP_SIZE 32
+
+template< int BLOCK_DIM >
+__global__ void reduce_kernel_optimized( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
+{
+    // The number of warps in the block.
+    const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
+
+    // Allocate shared memory inside the block.
+    __shared__ volatile int s_mem[BLOCK_DIM];
+
+    // The range of data to work with.
+    int2 range = block_ranges[blockIdx.x];
+
+    // Warp/lane IDs.
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    // Compute the sum of my elements.
+    int my_sum = 0;
+    for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += BLOCK_DIM )
+        my_sum += in_buffer[idx];
+
+    // Copy my sum in shared memory.
+    s_mem[threadIdx.x] = my_sum;
+
+    // Compute the sum inside each warp.
+    #pragma unroll
+    for( int offset = 16 ; offset > 1 ; offset >>= 1 )
+        if( lane_id < offset )
+            s_mem[threadIdx.x] = my_sum += s_mem[threadIdx.x + offset];
+
+    __syncthreads();
+
+    // Each warp leader stores the result for the warp.
+    if( lane_id == 0 )
+        s_mem[warp_id] = my_sum += s_mem[threadIdx.x+1];
+
+    __syncthreads();
+
+    if( warp_id == 0 )
+    {
+        // Read my value from shared memory and store it in a register.
+        my_sum = s_mem[lane_id];
+
+        // Sum the results of the warps.
+        #pragma unroll
+        for( int offset = NUM_WARPS / 2 ; offset > 1 ; offset >>= 1 )
+            if( threadIdx.x < offset )
+                s_mem[threadIdx.x] = my_sum += s_mem[threadIdx.x + offset];
+    }
+
+    // The 1st thread stores the result of the block.
+    if( threadIdx.x == 0 )
+        out_buffer[blockIdx.x] = my_sum += s_mem[1];
+}
+
+template< int BLOCK_DIM >
+int reduce_on_gpu_optimized( int n, const int *a_device )
+{
+    // Compute the size of the grid.
+    const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
+    const int num_threads = BLOCK_DIM * grid_dim;
+
+    // Compute the number of elements per block.
+    const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
+
+    // Allocate memory for temporary buffers.
+    int  *partial_sums = NULL;
+    int2 *block_ranges = NULL;
+
+    CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
+    CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
+
+    // Compute the ranges for the blocks.
+    int sum = 0;
+    int2 *block_ranges_on_host = new int2[grid_dim];
+    for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
+    {
+        block_ranges_on_host[block_idx].x = sum;
+        block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
+    }
+    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
+    delete[] block_ranges_on_host;
+
+    // First round: Compute a partial sum for all blocks.
+    reduce_kernel_optimized<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
+    CUDA_SAFE_CALL( cudaGetLastError() );
+
+    // Set the ranges for the second kernel call.
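+    // Note: the first pass left grid_dim partial sums in partial_sums; the
+    // second pass reduces them with a single block, so its only range covers
+    // [0, grid_dim).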
+    int2 block_range = make_int2( 0, grid_dim );
+    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
+
+    // Second round: Compute the final sum by summing the partial results of all blocks.
+    reduce_kernel_optimized<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
+    CUDA_SAFE_CALL( cudaGetLastError() );
+
+    // Read the result from device memory.
+    int result;
+    CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
+
+    // Free temporary memory.
+    CUDA_SAFE_CALL( cudaFree( block_ranges ) );
+    CUDA_SAFE_CALL( cudaFree( partial_sums ) );
+
+    return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// G P U   R E D U C T I O N :   O P T I M I Z E D   W I T H O U T   M Y S U M + =   V E R S I O N
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+template< int BLOCK_DIM >
+__global__ void reduce_kernel_optimized_wo_mysum( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
+{
+    // The number of warps in the block.
+    const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
+
+    // Allocate shared memory inside the block.
+    __shared__ volatile int s_mem[BLOCK_DIM];
+
+    // The range of data to work with.
+    int2 range = block_ranges[blockIdx.x];
+
+    // Warp/lane IDs.
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    // Compute the sum of my elements.
+    int my_sum = 0;
+    for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += BLOCK_DIM )
+        my_sum += in_buffer[idx];
+
+    // Copy my sum in shared memory.
+    s_mem[threadIdx.x] = my_sum;
+
+    // Compute the sum inside each warp.
+    #pragma unroll
+    for( int offset = 16 ; offset > 0 ; offset >>= 1 )
+        if( lane_id < offset )
+            s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
+
+    __syncthreads();
+
+    // Each warp leader stores the result for the warp.
+    if( lane_id == 0 )
+        s_mem[warp_id] = s_mem[threadIdx.x];
+
+    __syncthreads();
+
+    if( warp_id == 0 )
+    {
+        // Sum the results of the warps.
+        #pragma unroll
+        for( int offset = NUM_WARPS / 2 ; offset > 0 ; offset >>= 1 )
+            if( threadIdx.x < offset )
+                s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
+    }
+
+    // The 1st thread stores the result of the block.
+    if( threadIdx.x == 0 )
+        out_buffer[blockIdx.x] = s_mem[0];
+}
+
+template< int BLOCK_DIM >
+int reduce_on_gpu_optimized_wo_mysum( int n, const int *a_device )
+{
+    // Compute the size of the grid.
+    const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
+    const int num_threads = BLOCK_DIM * grid_dim;
+
+    // Compute the number of elements per block.
+    const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
+
+    // Allocate memory for temporary buffers.
+    int  *partial_sums = NULL;
+    int2 *block_ranges = NULL;
+
+    CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
+    CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
+
+    // Compute the ranges for the blocks.
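+    // Note: each block is assigned a contiguous half-open slice [x, y) of the
+    // input; the last slice is clamped to n when the division is not exact.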
+    int sum = 0;
+    int2 *block_ranges_on_host = new int2[grid_dim];
+    for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
+    {
+        block_ranges_on_host[block_idx].x = sum;
+        block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
+    }
+    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
+    delete[] block_ranges_on_host;
+
+    // First round: Compute a partial sum for all blocks.
+    reduce_kernel_optimized_wo_mysum<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
+    CUDA_SAFE_CALL( cudaGetLastError() );
+
+    // Set the ranges for the second kernel call.
+    int2 block_range = make_int2( 0, grid_dim );
+    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
+
+    // Second round: Compute the final sum by summing the partial results of all blocks.
+    reduce_kernel_optimized_wo_mysum<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
+    CUDA_SAFE_CALL( cudaGetLastError() );
+
+    // Read the result from device memory.
+    int result;
+    CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
+
+    // Free temporary memory.
+    CUDA_SAFE_CALL( cudaFree( block_ranges ) );
+    CUDA_SAFE_CALL( cudaFree( partial_sums ) );
+
+    return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// M A I N
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main( int, char ** )
+{
+    const int NUM_TESTS = 10;
+
+    // The number of elements in the problem.
+    const int N = 256*256*1024;
+    //const int N = 256*256*8;
+
+    std::cout << "Computing a reduction on " << N << " elements" << std::endl;
+
+    // The input array on the host (CPU).
+    int *a_host = new (std::nothrow) int[N];
+
+    // Make sure the memory got allocated.
+    if( a_host == NULL )
+    {
+        std::cerr << "ERROR: Couldn't allocate a_host" << std::endl;
+        return 1;
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Generate data
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << "Filling with 1s" << std::endl;
+
+    // Fill the array with ones, so the expected sum is N.
+    for( int i = 0 ; i < N ; ++i )
+        a_host[i] = 1;
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Compute on the CPU using 1 thread
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << std::endl;
+    std::cout << "Computing on the CPU using 1 CPU thread" << std::endl;
+
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    // Calculate the reference to compare with the device result.
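+    // Note: the reduction is repeated NUM_TESTS times so the reported time is
+    // an average; the accumulator is reset at the start of each iteration.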
+    int sum = 0;
+    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+    {
+        sum = 0;
+        for( int i = 0 ; i < N ; ++i )
+            sum += a_host[i];
+    }
+
+    gpu_timer.Stop();
+
+    std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Compute on the CPU using several OpenMP threads
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << std::endl;
+    std::cout << "Computing on the CPU using " << omp_get_max_threads() << " OpenMP thread(s)" << std::endl;
+
+    gpu_timer.Start();
+
+    // Calculate the reference to compare with the device result.
+    int omp_sum = 0;
+    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+    {
+        omp_sum = 0;
+#pragma omp parallel shared(omp_sum)
+        {
+#pragma omp for reduction(+ : omp_sum)
+            for( int i = 0 ; i < N ; ++i )
+                omp_sum = omp_sum + a_host[i];
+        }
+    }
+
+    gpu_timer.Stop();
+
+    std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Copy the data to the GPU
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // The copy of A on the device (GPU).
+    int *a_device = NULL;
+
+    // Allocate A on the device.
+    CUDA_SAFE_CALL( cudaMalloc( (void **) &a_device, N*sizeof( int ) ) );
+
+    // Copy A from host (CPU) to device (GPU).
+    CUDA_SAFE_CALL( cudaMemcpy( a_device, a_host, N*sizeof( int ), cudaMemcpyHostToDevice ) );
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Compute on the GPU using Thrust
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << std::endl;
+    std::cout << "Computing on the GPU using Thrust (transfers excluded)" << std::endl;
+
+    gpu_timer.Start();
+
+    // Launch the reduction on the GPU.
+    int thrust_sum = 0;
+    thrust::device_ptr<int> aptr = thrust::device_pointer_cast(a_device);
+    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+    {
+        thrust_sum = thrust::reduce( aptr, aptr+N );
+    }
+
+    gpu_timer.Stop();
+
+    std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Compute on the GPU
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << std::endl;
+    std::cout << "Computing on the GPU (transfers excluded)" << std::endl;
+
+    gpu_timer.Start();
+
+    // Launch the kernel on the GPU.
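+    // Note: each iteration performs the full two-pass reduction, including the
+    // cudaMalloc/cudaFree of its temporary buffers, so that overhead is part
+    // of the averaged time.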
+    int gpu_sum = 0;
+    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+    {
+        gpu_sum = reduce_on_gpu( N, a_device );
+    }
+
+    gpu_timer.Stop();
+
+    std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Compute on the GPU (optimized version)
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << std::endl;
+    std::cout << "Computing on the GPU using a tuned version (transfers excluded)" << std::endl;
+
+    gpu_timer.Start();
+
+    const int BLOCK_DIM = 256;
+
+    // Launch the kernel on the GPU.
+    int optim_gpu_sum = 0;
+    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+    {
+        optim_gpu_sum = reduce_on_gpu_optimized<BLOCK_DIM>( N, a_device );
+    }
+
+    gpu_timer.Stop();
+
+    std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Compute on the GPU (optimized version without mysum+=)
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << std::endl;
+    std::cout << "Computing on the GPU using a tuned version without my_sum (transfers excluded)" << std::endl;
+
+    gpu_timer.Start();
+
+
+    // Launch the kernel on the GPU.
+    int optim_gpu_sum_wo_mysum = 0;
+    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+    {
+        optim_gpu_sum_wo_mysum = reduce_on_gpu_optimized_wo_mysum<BLOCK_DIM>( N, a_device );
+    }
+
+    gpu_timer.Stop();
+
+    std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Validate results
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << std::endl;
+    std::cout << std::endl;
+    std::cout << "OpenMP results:                ref= " << sum << " / sum= " << omp_sum << std::endl;
+    std::cout << "CUDA results:                  ref= " << sum << " / sum= " << gpu_sum << std::endl;
+    std::cout << "Thrust results:                ref= " << sum << " / sum= " << thrust_sum << std::endl;
+    std::cout << "Optim results:                 ref= " << sum << " / sum= " << optim_gpu_sum << std::endl;
+    std::cout << "Optim without mysum+= results: ref= " << sum << " / sum= " << optim_gpu_sum_wo_mysum << std::endl;
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Clean memory
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // Free device memory.
+    CUDA_SAFE_CALL( cudaFree( a_device ) );
+
+    // Free host memory.
+    delete[] a_host;
+
+    return 0;
+}
+
diff --git a/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/.suo b/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/.suo
new file mode 100644
index 0000000..b67b191
Binary files /dev/null and b/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/.suo differ
diff --git a/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/Browse.VC.db b/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/Browse.VC.db
new file mode 100644
index 0000000..4a4cc2f
Binary files /dev/null and b/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/Browse.VC.db differ
diff --git a/A4/TP_OMP_GPU/Seuillage/GpuTimer.h b/A4/TP_OMP_GPU/Seuillage/GpuTimer.h
new file mode 100644
index 0000000..de1fb0d
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/GpuTimer.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <cuda_runtime.h>
+
+class GpuTimer
+{
+    cudaEvent_t start, stop;
+
+public:
+    GpuTimer()
+    {
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+    }
+
+    ~GpuTimer()
+    {
+        cudaEventDestroy(stop);
+        cudaEventDestroy(start);
+    }
+
+    void Start()
+    {
+        cudaEventRecord(start);
+    }
+
+    void Stop()
+    {
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+    }
+
+    float Elapsed()
+    {
+        float elapsed;
+        cudaEventElapsedTime(&elapsed, start, stop);
+        return elapsed;
+    }
+};
+
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage.sln b/A4/TP_OMP_GPU/Seuillage/Seuillage.sln
new file mode 100644
index 0000000..6da52ab
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.32228.343
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Seuillage", "Seuillage\Seuillage.vcxproj", "{727252A0-B5D1-48AE-81A6-37E11733EBC2}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Debug|x64.ActiveCfg = Debug|x64
+		{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Debug|x64.Build.0 = Debug|x64
+		{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Release|x64.ActiveCfg = Release|x64
+		{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {5177E263-3BDE-4B8E-8A21-512DA5F23521}
+	EndGlobalSection
+EndGlobal
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.cpp b/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.cpp
new file mode 100644
index 0000000..c90252c
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.cpp
@@ -0,0 +1,7 @@
+#include "CpuTimer.h"
+
+// Initialize the resolution of the timer
+LARGE_INTEGER CpuTimer::m_freq = (QueryPerformanceFrequency(&CpuTimer::m_freq), CpuTimer::m_freq);
+
+// Calculate the overhead of the timer
+LONGLONG CpuTimer::m_overhead = CpuTimer::GetOverhead();
\ No newline at end of file
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.h b/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.h
new file mode 100644
index 0000000..a3add84
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.h
@@ -0,0 +1,37 @@
+#pragma once
+#include <windows.h>
+
+struct CpuTimer
+{
+    void Start()
+    {
+        QueryPerformanceCounter(&m_start);
+    }
+
+    void Stop()
+    {
+        QueryPerformanceCounter(&m_stop);
+    }
+
+    // Returns elapsed time in milliseconds (ms)
+    double Elapsed()
+    {
+        return (m_stop.QuadPart - m_start.QuadPart -
+                m_overhead) * 1000.0 / m_freq.QuadPart;
+    }
+
+private:
+
+    // Returns the overhead of the timer in ticks
+    static LONGLONG GetOverhead()
+    {
+        CpuTimer t;
+        t.Start();
+        t.Stop();
+        return t.m_stop.QuadPart - t.m_start.QuadPart;
+    }
+
+    LARGE_INTEGER m_start;
+    LARGE_INTEGER m_stop;
+    static LARGE_INTEGER m_freq;
+    static LONGLONG m_overhead;
+};
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/GpuTimer.h b/A4/TP_OMP_GPU/Seuillage/Seuillage/GpuTimer.h
new file mode 100644
index 0000000..de1fb0d
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/GpuTimer.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <cuda_runtime.h>
+
+class GpuTimer
+{
+    cudaEvent_t start, stop;
+
+public:
+    GpuTimer()
+    {
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+    }
+
+    ~GpuTimer()
+    {
+        cudaEventDestroy(stop);
+        cudaEventDestroy(start);
+    }
+
+    void Start()
+    {
+        cudaEventRecord(start);
+    }
+
+    void Stop()
+    {
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+    }
+
+    float Elapsed()
+    {
+        float elapsed;
+        cudaEventElapsedTime(&elapsed, start, stop);
+        return elapsed;
+    }
+};
+
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj b/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj
new file mode 100644
index 0000000..059fbf4
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{727252A0-B5D1-48AE-81A6-37E11733EBC2}</ProjectGuid>
+    <RootNamespace>Seuillage</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.8.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="kernel.cu" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.8.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj.user b/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj.user
new file mode 100644
index 0000000..88a5509
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj.user
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup />
+</Project>
\ No newline at end of file
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/kernel.cu b/A4/TP_OMP_GPU/Seuillage/Seuillage/kernel.cu
new file mode 100644
index 0000000..d2b1cf0
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/kernel.cu
@@ -0,0 +1,121 @@
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+
+cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
+
+__global__ void addKernel(int *c, const int *a, const int *b)
+{
+    int i = threadIdx.x;
+    c[i] = a[i] + b[i];
+}
+
+int main()
+{
+    const int arraySize = 5;
+    const int a[arraySize] = { 1, 2, 3, 4, 5 };
+    const int b[arraySize] = { 10, 20, 30, 40, 50 };
+    int c[arraySize] = { 0 };
+
+    // Add vectors in parallel.
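+    // Note: addWithCuda wraps device selection, buffer allocation, the
+    // host/device transfers and the kernel launch; it returns a cudaError_t
+    // that is checked below.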
+    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "addWithCuda failed!");
+        return 1;
+    }
+
+    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
+        c[0], c[1], c[2], c[3], c[4]);
+
+    // cudaDeviceReset must be called before exiting in order for profiling and
+    // tracing tools such as Nsight and Visual Profiler to show complete traces.
+    cudaStatus = cudaDeviceReset();
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaDeviceReset failed!");
+        return 1;
+    }
+
+    return 0;
+}
+
+// Helper function for using CUDA to add vectors in parallel.
+cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
+{
+    int *dev_a = 0;
+    int *dev_b = 0;
+    int *dev_c = 0;
+    cudaError_t cudaStatus;
+
+    // Choose which GPU to run on, change this on a multi-GPU system.
+    cudaStatus = cudaSetDevice(0);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
+        goto Error;
+    }
+
+    // Allocate GPU buffers for three vectors (two input, one output).
+    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+
+    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+
+    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+
+    // Copy input vectors from host memory to GPU buffers.
+    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+
+    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+
+    // Launch a kernel on the GPU with one thread for each element.
+    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
+
+    // Check for any errors launching the kernel.
+    cudaStatus = cudaGetLastError();
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
+        goto Error;
+    }
+
+    // cudaDeviceSynchronize waits for the kernel to finish, and returns
+    // any errors encountered during the launch.
+    cudaStatus = cudaDeviceSynchronize();
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
+        goto Error;
+    }
+
+    // Copy output vector from GPU buffer to host memory.
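+    // Note: the cudaDeviceSynchronize() above guarantees addKernel has
+    // finished, so this copy reads the final results.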
+    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+
+Error:
+    cudaFree(dev_c);
+    cudaFree(dev_a);
+    cudaFree(dev_b);
+
+    return cudaStatus;
+}
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.Build.CppClean.log b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.Build.CppClean.log
new file mode 100644
index 0000000..837bfb2
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.Build.CppClean.log
@@ -0,0 +1,4 @@
+c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\kernel.cu.obj
+c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\kernel.cu.cache
+c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\seuillage.tlog\cudacompile.read.1u.tlog
+c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\seuillage.tlog\cudacompile.write.1u.tlog
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.log b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.log
new file mode 100644
index 0000000..77ace8a
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.log
@@ -0,0 +1 @@
+C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 11.8.targets(606,9): error : The CUDA Toolkit v11.8 directory '' does not exist. Please verify the CUDA Toolkit is installed properly or define the CudaToolkitDir property to resolve this error.
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.read.1u.tlog b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.read.1u.tlog
new file mode 100644
index 0000000..700b875
Binary files /dev/null and b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.read.1u.tlog differ
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.write.1u.tlog b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.write.1u.tlog
new file mode 100644
index 0000000..88ff63c
Binary files /dev/null and b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.write.1u.tlog differ
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/Seuillage.lastbuildstate b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/Seuillage.lastbuildstate
new file mode 100644
index 0000000..ed2063c
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/Seuillage.lastbuildstate
@@ -0,0 +1,2 @@
+PlatformToolSet=v142:VCToolArchitecture=Native32Bit:VCToolsVersion=14.29.30133:VCServicingVersionCrtHeaders=14.29.30136:TargetPlatformVersion=10.0.22000.0:
+Debug|x64|C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_OMP_GPU\Seuillage\|
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/unsuccessfulbuild b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/unsuccessfulbuild
new file mode 100644
index 0000000..e69de29
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.vcxproj.FileListAbsolute.txt b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.vcxproj.FileListAbsolute.txt
new file mode 100644
index 0000000..e69de29
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu-686266873.deps b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu-686266873.deps
new file mode 100644
index 0000000..e69de29
diff --git a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu.cache b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu.cache
new file mode 100644
index 0000000..28b52a0
--- /dev/null
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu.cache
@@ -0,0 +1,55 @@
+Identity=kernel.cu
+AdditionalCompilerOptions=
+AdditionalCompilerOptions=
+AdditionalDependencies=
+AdditionalDeps=
+AdditionalLibraryDirectories=
+AdditionalOptions=
+AdditionalOptions=
+CodeGeneration=compute_52,sm_52
+CodeGeneration=compute_52,sm_52
+CompileOut=C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_OMP_GPU\Seuillage\Seuillage\x64\Debug\kernel.cu.obj
+CudaRuntime=Static
+CudaToolkitCustomDir=
+DebugInformationFormat=ProgramDatabase
+DebugInformationFormat=ProgramDatabase
+Defines=;WIN32;WIN64;_DEBUG;_CONSOLE;_MBCS;
+Emulation=false
+EnableVirtualArchInFatbin=true
+ExtensibleWholeProgramCompilation=false
+FastMath=false
+GenerateLineInfo=false
+GenerateRelocatableDeviceCode=false
+GPUDebugInfo=true
+GPUDebugInfo=true
+HostDebugInfo=true
+Include=;;include
+Inputs=
+InterleaveSourceInPTX=false
+Keep=false
+KeepDir=x64\Debug
+LinkOut=
+MaxRegCount=0
+NvccCompilation=compile
+NvccPath=
+Optimization=Od
+Optimization=Od
+PerformDeviceLink=
+ProgramDataBaseFileName=x64\Debug\vc142.pdb
+ProgramDataBaseFileName=x64\Debug\vc142.pdb
+PtxAsOptionV=false
+RequiredIncludes=
+Runtime=MDd
+Runtime=MDd
+RuntimeChecks=RTC1
+RuntimeChecks=RTC1
+TargetMachinePlatform=64
+TargetMachinePlatform=64
+TypeInfo=
+TypeInfo=
+UseHostDefines=true
+UseHostInclude=true
+UseHostLibraryDependencies=
+UseHostLibraryDirectories=
+Warning=W3
+Warning=W3
diff --git a/A4/TP_OMP_GPU/deviceQuery.PNG b/A4/TP_OMP_GPU/deviceQuery.PNG
new file mode 100644
index 0000000..6de4373
Binary files /dev/null and b/A4/TP_OMP_GPU/deviceQuery.PNG differ
diff --git a/IA/1708.06733.pdf b/IA/1708.06733.pdf
new file mode 100644
index 0000000..e1f960b
Binary files /dev/null and b/IA/1708.06733.pdf differ
diff --git a/IA/Analyse.docx b/IA/Analyse.docx
new file mode 100644
index 0000000..0d8204a
Binary files /dev/null and b/IA/Analyse.docx differ
diff --git a/IA/Master_2_SETI-IA_EmbarquéeDeConfiance-Examen-2023-02-24[9120].pdf b/IA/Master_2_SETI-IA_EmbarquéeDeConfiance-Examen-2023-02-24[9120].pdf
new file mode 100644
index 0000000..ae30fd0
Binary files /dev/null and b/IA/Master_2_SETI-IA_EmbarquéeDeConfiance-Examen-2023-02-24[9120].pdf differ
diff --git a/IA/TRUSWORTHY EMBEDDED AI Risk Analysis and Certification Frameworks for Critical Trusted AI Applications-PartI.pdf b/IA/TRUSWORTHY EMBEDDED AI Risk Analysis and Certification Frameworks for Critical Trusted AI Applications-PartI.pdf
new file mode 100644
index 0000000..14ee38c
Binary files /dev/null and b/IA/TRUSWORTHY EMBEDDED AI Risk Analysis and Certification Frameworks for Critical Trusted AI Applications-PartI.pdf differ
diff --git a/IA/TRUSWORTHY EMBEDDED AI Risk Analysis and Certification Frameworks for Critical Trusted AI Applications-PartII.pdf b/IA/TRUSWORTHY EMBEDDED AI Risk Analysis and Certification Frameworks for Critical Trusted AI Applications-PartII.pdf
new file mode 100644
index 0000000..b2b5ffd
Binary files /dev/null and b/IA/TRUSWORTHY EMBEDDED AI Risk Analysis and Certification Frameworks for Critical Trusted AI Applications-PartII.pdf differ
diff --git a/IA/cours_adversarial_20230203[9258].pdf b/IA/cours_adversarial_20230203[9258].pdf
new file mode 100644
index 0000000..c5a426b
Binary files /dev/null and b/IA/cours_adversarial_20230203[9258].pdf differ