IA

2023-02-22 11:23:45 +01:00 · 2023-02-22 11:23:45 +01:00 · 049e45e1b9
commit 049e45e1b9
parent 83239d487e
34 changed files with 1002 additions and 0 deletions
--- a/A2/Detection_cercle/A
+++ b/A2/Detection_cercle/A
--- a/A2/Detection_cercle/car.bmp
+++ b/A2/Detection_cercle/car.bmp
--- a/A2/Detection_cercle/car.jpg
+++ b/A2/Detection_cercle/car.jpg
--- a/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/.suo
+++ b/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/.suo
--- a/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/Browse.VC.db
+++ b/A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/Browse.VC.db
--- a/A4/TP_GPU-master/TP2_reduction/windows/Reduce.vcxproj.user
+++ b/A4/TP_GPU-master/TP2_reduction/windows/Reduce.vcxproj.user
@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup />
+</Project>
--- a/A4/TP_GPU-master/TP2_reduction/windows/x64/Release/Reduce.log
+++ b/A4/TP_GPU-master/TP2_reduction/windows/x64/Release/Reduce.log
@ -0,0 +1 @@
+C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_GPU-master\TP2_reduction\windows\Reduce.vcxproj(55,5): error MSB4019: le projet importé "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 4.2.props" est introuvable. Vérifiez que l'expression de la déclaration Import "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\\BuildCustomizations\CUDA 4.2.props" est correcte et que le fichier existe sur le disque.
--- a/A4/TP_OMP_GPU/Reduce_solution.cu
+++ b/A4/TP_OMP_GPU/Reduce_solution.cu
@ -0,0 +1,577 @@
+/*
+# Copyright (c) 2011-2012 NVIDIA CORPORATION. All Rights Reserved.
+#
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.   
+*/
+
+#include <iostream>
+#include <cuda_runtime_api.h>
+#include <omp.h>
+#include <thrust/reduce.h>
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include "GpuTimer.h"
+
+// Evaluates a CUDA runtime call exactly once. On failure it prints the source location and the
+// CUDA error string to std::cerr and makes the ENCLOSING FUNCTION execute `return 1`, so it can
+// only be used inside functions whose return type accepts the integer 1.
+// The do/while(0) wrapper makes the expansion a single statement (safe inside un-braced if/else),
+// and the argument is parenthesised so any expression can be passed.
+#define CUDA_SAFE_CALL(call) \
+  do { \
+    cudaError_t err_code = (call); \
+    if( err_code != cudaSuccess ) { std::cerr << "Error (" << __FILE__ << ":" << __LINE__ << "): " << cudaGetErrorString(err_code) << std::endl; return 1; } \
+  } while( 0 )
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// G P U   R E D U C T I O N
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Sums one slice of in_buffer per block and writes the block total to out_buffer[blockIdx.x].
+//
+// Launch requirements:
+//   - 1D blocks whose size is a power of two (the tree reduction halves the active range each step);
+//   - blockDim.x * sizeof(int) bytes of dynamic shared memory;
+//   - block_ranges[b] holds (first index, one-past-last index) of the slice handled by block b.
+// The parameter n is not read by the kernel; it is kept so the signature stays unchanged.
+// Requires CUDA 9 or newer for __syncwarp().
+__global__ void reduce_kernel( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
+{
+  // One int of dynamic shared memory per thread of the block.
+  extern __shared__ int s_mem[];
+
+  // The range of data to work with.
+  int2 range = block_ranges[blockIdx.x];
+
+  // Each thread accumulates a strided subset of the slice in a register.
+  int my_sum = 0;
+  for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += blockDim.x )
+    my_sum += in_buffer[idx];
+
+  // Publish the per-thread sum.
+  s_mem[threadIdx.x] = my_sum;
+
+  // Make sure all the threads have copied their value in shared memory.
+  __syncthreads();
+
+  int offset ;
+  // Tree reduction across warps: these steps need a block-wide barrier.
+  for(offset = blockDim.x / 2 ; offset > 16 ; offset /= 2 )
+  {
+    if( threadIdx.x < offset )
+      s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
+    __syncthreads( );
+  }
+
+  // From here on only threads 0..15 write and only indices 0..31 are read, i.e. everything
+  // happens inside warp 0, so a block-wide barrier is not needed. A warp-level barrier still is:
+  // s_mem is not volatile and lanes of a warp are not guaranteed to run in lockstep, so without
+  // it one step could read a value the previous step has not made visible yet.
+  for(; offset > 0 ; offset /= 2 )
+  {
+    if( threadIdx.x < offset )
+      s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
+    __syncwarp( );
+  }
+
+
+  // The first thread of the block stores its result.
+  if( threadIdx.x == 0 )
+    out_buffer[blockIdx.x] = s_mem[0];
+}
+
+// Sums the n ints stored at a_device (device memory) with two launches of reduce_kernel:
+// the first produces one partial sum per block, the second folds those partial sums in one block.
+//
+// Returns the sum; returns 0 when n <= 0.
+// Error handling: every CUDA call goes through CUDA_SAFE_CALL, which prints a message and makes
+// this function return 1. That value cannot be told apart from a genuine sum of 1, and device
+// buffers allocated before the failing call are not released on that path.
+int reduce_on_gpu( int n, const int *a_device )
+{
+  // Nothing to add up. This also avoids grid_dim == 0, which would make num_threads zero and
+  // turn the elements_per_block computation below into a division by zero.
+  if( n <= 0 )
+    return 0;
+
+  // Compute the size of the grid: at most BLOCK_DIM blocks, so that the second round can
+  // consume all partial sums with a single block.
+  const int BLOCK_DIM   = 256;
+  const int grid_dim    = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
+  const int num_threads = BLOCK_DIM * grid_dim;
+
+  // Compute the number of elements per block (a multiple of BLOCK_DIM).
+  const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
+
+  // Allocate memory for temporary buffers.
+  int  *partial_sums = NULL;
+  int2 *block_ranges = NULL;
+
+  CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
+  CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim  * sizeof(int2) ) );
+
+  // Compute the [first, last) range of every block, clamped to n.
+  int sum = 0;
+  int2 *block_ranges_on_host = new int2[grid_dim];
+  for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
+  {
+    block_ranges_on_host[block_idx].x = std::min( sum, n );
+    sum += elements_per_block;
+    block_ranges_on_host[block_idx].y = std::min( sum, n );
+  }
+
+  // Release the host staging buffer BEFORE checking the copy status, so that a failed upload
+  // does not leak it through the early return inside CUDA_SAFE_CALL.
+  const cudaError_t ranges_copy_status = cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice );
+  delete[] block_ranges_on_host;
+  CUDA_SAFE_CALL( ranges_copy_status );
+
+  // First round: Compute a partial sum for all blocks.
+  reduce_kernel<<<grid_dim, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( n, a_device, partial_sums, block_ranges );
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  // Set the ranges for the second kernel call: one block covering all grid_dim partial sums.
+  int2 block_range = make_int2( 0, grid_dim );
+  CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
+
+  // Second round: Compute the final sum by summing the partial results of all blocks.
+  // The result is written in place to partial_sums[0].
+  reduce_kernel<<<1, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( grid_dim, partial_sums, partial_sums, block_ranges );
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  // Read the result from device memory.
+  int result;
+  CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
+
+  // Free temporary memory.
+  CUDA_SAFE_CALL( cudaFree( block_ranges ) );
+  CUDA_SAFE_CALL( cudaFree( partial_sums ) );
+
+  return result;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// G P U   R E D U C T I O N :   O P T I M I Z E D   V E R S I O N
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define WARP_SIZE 32
+
+// Block-level sum that keeps the running total in a register (my_sum) and mirrors it to shared
+// memory so that neighbouring lanes can read it.
+//
+// Launch requirements:
+//   - 1D blocks of exactly BLOCK_DIM threads;
+//   - BLOCK_DIM is a multiple of WARP_SIZE, BLOCK_DIM / WARP_SIZE is a power of two and at most
+//     WARP_SIZE (the warp totals are folded by warp 0 with a halving tree);
+//   - block_ranges[b] holds (first index, one-past-last index) of the slice handled by block b.
+// The parameter n is not read by the kernel. Requires CUDA 9 or newer for __syncwarp().
+template< int BLOCK_DIM > 
+__global__ void reduce_kernel_optimized( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
+{
+  // The number of warps in the block.
+  const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
+
+  // Allocate shared memory inside the block.
+  __shared__ volatile int s_mem[BLOCK_DIM];
+
+  // The range of data to work with.
+  int2 range = block_ranges[blockIdx.x];
+
+  // Warp/lane IDs.
+  const int warp_id = threadIdx.x / WARP_SIZE;
+  const int lane_id = threadIdx.x % WARP_SIZE;
+
+  // Compute the sum of my elements.
+  int my_sum = 0;
+  for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += BLOCK_DIM )
+    my_sum += in_buffer[idx];
+
+  // Copy my sum in shared memory and make it visible to the other lanes of my warp.
+  s_mem[threadIdx.x] = my_sum;
+  __syncwarp();
+
+  // Compute the sum inside each warp. Lanes of a warp are not guaranteed to run in lockstep,
+  // so every step is followed by a warp-level barrier. After the loop lane 0 holds the warp total.
+  #pragma unroll
+  for( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 )
+  {
+    if( lane_id < offset )
+      s_mem[threadIdx.x] = my_sum += s_mem[threadIdx.x + offset];
+    __syncwarp();
+  }
+
+  // Every warp must be completely done reading its own slots before slots [0, NUM_WARPS) are
+  // reused for the warp totals. The warp leader already has its full total in a register, so it
+  // no longer reads s_mem[threadIdx.x+1] after this barrier (that slot may be overwritten by
+  // another warp's leader).
+  __syncthreads();
+  
+  // Each warp leader stores the result for the warp.
+  if( lane_id == 0 )
+    s_mem[warp_id] = my_sum;
+
+  __syncthreads();
+
+  if( warp_id == 0 )
+  {
+    // Read my warp total from shared memory; lanes without a matching warp contribute nothing.
+    my_sum = lane_id < NUM_WARPS ? s_mem[lane_id] : 0;
+  
+    // Sum the results of the warps.
+    #pragma unroll
+    for( int offset = NUM_WARPS / 2 ; offset > 0 ; offset >>= 1 )
+    {
+      if( lane_id < offset )
+        s_mem[lane_id] = my_sum += s_mem[lane_id + offset];
+      __syncwarp();
+    }
+  }
+
+  // The 1st thread stores the result of the block.
+  if( threadIdx.x == 0 )
+    out_buffer[blockIdx.x] = my_sum;
+}
+
+// Sums the n ints stored at a_device (device memory) with two launches of
+// reduce_kernel_optimized<BLOCK_DIM>: one partial sum per block, then a single block folding them.
+//
+// Returns the sum; returns 0 when n <= 0.
+// Error handling: every CUDA call goes through CUDA_SAFE_CALL, which prints a message and makes
+// this function return 1. That value cannot be told apart from a genuine sum of 1, and device
+// buffers allocated before the failing call are not released on that path.
+template< int BLOCK_DIM >
+int reduce_on_gpu_optimized( int n, const int *a_device )
+{
+  // Nothing to add up. This also avoids grid_dim == 0, which would make num_threads zero and
+  // turn the elements_per_block computation below into a division by zero.
+  if( n <= 0 )
+    return 0;
+
+  // Compute the size of the grid: at most BLOCK_DIM blocks, so that the second round can
+  // consume all partial sums with a single block.
+  const int grid_dim    = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
+  const int num_threads = BLOCK_DIM * grid_dim;
+
+  // Compute the number of elements per block (a multiple of BLOCK_DIM).
+  const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
+
+  // Allocate memory for temporary buffers.
+  int  *partial_sums = NULL;
+  int2 *block_ranges = NULL;
+
+  CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
+  CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim  * sizeof(int2) ) );
+
+  // Compute the [first, last) range of every block, clamped to n.
+  int sum = 0;
+  int2 *block_ranges_on_host = new int2[grid_dim];
+  for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
+  {
+    block_ranges_on_host[block_idx].x = std::min( sum, n );
+    sum += elements_per_block;
+    block_ranges_on_host[block_idx].y = std::min( sum, n );
+  }
+
+  // Release the host staging buffer BEFORE checking the copy status, so that a failed upload
+  // does not leak it through the early return inside CUDA_SAFE_CALL.
+  const cudaError_t ranges_copy_status = cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice );
+  delete[] block_ranges_on_host;
+  CUDA_SAFE_CALL( ranges_copy_status );
+
+  // First round: Compute a partial sum for all blocks.
+  reduce_kernel_optimized<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  // Set the ranges for the second kernel call: one block covering all grid_dim partial sums.
+  int2 block_range = make_int2( 0, grid_dim );
+  CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
+
+  // Second round: Compute the final sum by summing the partial results of all blocks.
+  // The result is written in place to partial_sums[0].
+  reduce_kernel_optimized<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  // Read the result from device memory.
+  int result;
+  CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
+
+  // Free temporary memory.
+  CUDA_SAFE_CALL( cudaFree( block_ranges ) );
+  CUDA_SAFE_CALL( cudaFree( partial_sums ) );
+
+  return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// G P U   R E D U C T I O N :   O P T I M I Z E D  WITHOUT MYSUM+=   V E R S I O N
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// Block-level sum that performs the in-warp and cross-warp reductions directly on shared memory
+// (read-modify-write of s_mem) instead of carrying the running total in a register.
+//
+// Launch requirements:
+//   - 1D blocks of exactly BLOCK_DIM threads;
+//   - BLOCK_DIM is a multiple of WARP_SIZE, BLOCK_DIM / WARP_SIZE is a power of two and at most
+//     WARP_SIZE (the warp totals are folded by warp 0 with a halving tree);
+//   - block_ranges[b] holds (first index, one-past-last index) of the slice handled by block b.
+// The parameter n is not read by the kernel. Requires CUDA 9 or newer for __syncwarp().
+template< int BLOCK_DIM > 
+__global__ void reduce_kernel_optimized_wo_mysum( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
+{
+  // The number of warps in the block.
+  const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
+
+  // Allocate shared memory inside the block.
+  __shared__ volatile int s_mem[BLOCK_DIM];
+
+  // The range of data to work with.
+  int2 range = block_ranges[blockIdx.x];
+
+  // Warp/lane IDs.
+  const int warp_id = threadIdx.x / WARP_SIZE;
+  const int lane_id = threadIdx.x % WARP_SIZE;
+
+  // Compute the sum of my elements.
+  int my_sum = 0;
+  for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += BLOCK_DIM )
+    my_sum += in_buffer[idx];
+
+  // Copy my sum in shared memory and make it visible to the other lanes of my warp.
+  s_mem[threadIdx.x] = my_sum;
+  __syncwarp();
+
+  // Compute the sum inside each warp. Lanes of a warp are not guaranteed to run in lockstep,
+  // so every step is followed by a warp-level barrier. After the loop the slot of lane 0
+  // holds the warp total.
+  #pragma unroll
+  for( int offset = 16 ; offset > 0 ; offset >>= 1 )
+  {
+    if( lane_id < offset )
+      s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
+    __syncwarp();
+  }
+
+  // Every warp must be done with its own slots before slots [0, NUM_WARPS) are reused.
+  __syncthreads();
+  
+  // Each warp leader stores the result for the warp.
+  if( lane_id == 0 )
+    s_mem[warp_id] = s_mem[threadIdx.x];
+
+  __syncthreads();
+
+  if( warp_id == 0 )
+  {
+   // Sum the results of the warps (same warp-level barrier discipline as above).
+    #pragma unroll
+    for( int offset = NUM_WARPS / 2 ; offset > 0 ; offset >>= 1 )
+    {
+      if( threadIdx.x < offset )
+        s_mem[threadIdx.x] += s_mem[threadIdx.x + offset]; 
+      __syncwarp();
+    }
+  }
+
+  // The 1st thread stores the result of the block.
+  if( threadIdx.x == 0 )
+    out_buffer[blockIdx.x] = s_mem[0];
+}
+
+// Sums the n ints stored at a_device (device memory) with two launches of
+// reduce_kernel_optimized_wo_mysum<BLOCK_DIM>: one partial sum per block, then a single block
+// folding them.
+//
+// Returns the sum; returns 0 when n <= 0.
+// Error handling: every CUDA call goes through CUDA_SAFE_CALL, which prints a message and makes
+// this function return 1. That value cannot be told apart from a genuine sum of 1, and device
+// buffers allocated before the failing call are not released on that path.
+template< int BLOCK_DIM >
+int reduce_on_gpu_optimized_wo_mysum( int n, const int *a_device )
+{
+  // Nothing to add up. This also avoids grid_dim == 0, which would make num_threads zero and
+  // turn the elements_per_block computation below into a division by zero.
+  if( n <= 0 )
+    return 0;
+
+  // Compute the size of the grid: at most BLOCK_DIM blocks, so that the second round can
+  // consume all partial sums with a single block.
+  const int grid_dim    = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
+  const int num_threads = BLOCK_DIM * grid_dim;
+
+  // Compute the number of elements per block (a multiple of BLOCK_DIM).
+  const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
+
+  // Allocate memory for temporary buffers.
+  int  *partial_sums = NULL;
+  int2 *block_ranges = NULL;
+
+  CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
+  CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim  * sizeof(int2) ) );
+
+  // Compute the [first, last) range of every block, clamped to n.
+  int sum = 0;
+  int2 *block_ranges_on_host = new int2[grid_dim];
+  for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
+  {
+    block_ranges_on_host[block_idx].x = std::min( sum, n );
+    sum += elements_per_block;
+    block_ranges_on_host[block_idx].y = std::min( sum, n );
+  }
+
+  // Release the host staging buffer BEFORE checking the copy status, so that a failed upload
+  // does not leak it through the early return inside CUDA_SAFE_CALL.
+  const cudaError_t ranges_copy_status = cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice );
+  delete[] block_ranges_on_host;
+  CUDA_SAFE_CALL( ranges_copy_status );
+
+  // First round: Compute a partial sum for all blocks.
+  reduce_kernel_optimized_wo_mysum<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  // Set the ranges for the second kernel call: one block covering all grid_dim partial sums.
+  int2 block_range = make_int2( 0, grid_dim );
+  CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
+
+  // Second round: Compute the final sum by summing the partial results of all blocks.
+  // The result is written in place to partial_sums[0].
+  reduce_kernel_optimized_wo_mysum<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  // Read the result from device memory.
+  int result;
+  CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
+
+  // Free temporary memory.
+  CUDA_SAFE_CALL( cudaFree( block_ranges ) );
+  CUDA_SAFE_CALL( cudaFree( partial_sums ) );
+
+  return result;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// M A I N
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main( int, char ** )
+{
+  const int NUM_TESTS = 10;
+
+  // The number of elements in the problem.
+  const int N = 256*256*1024;
+  //const int N = 256*256*8;
+
+  std::cout << "Computing a reduction on " << N << " elements" << std::endl;
+
+  // The input array A on the host (CPU); it is released with delete[] at the end of main.
+  int *a_host = new int[N];
+
+  // NOTE(review): a plain new-expression reports failure by throwing std::bad_alloc rather than returning NULL, so this check is not expected to fire.
+  if( a_host == NULL )
+  {
+    std::cerr << "ERROR: Couldn't allocate a_host" << std::endl;
+    return 1;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Generate data
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << "Filling with 1s" << std::endl;
+
+  // Every element is set to 1, so the expected sum equals N.
+  for( int i = 0 ; i < N ; ++i )
+    a_host[i] = 1;
+  
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Compute on the CPU using 1 thread (timed with the CUDA-event based GpuTimer, like every section below)
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << std::endl;
+  std::cout << "Computing on the CPU using 1 CPU thread" << std::endl;
+  
+  GpuTimer gpu_timer;
+  gpu_timer.Start();
+
+  // Sequential reference sum; repeated NUM_TESTS times so the printed time is an average.
+  int sum = 0;
+  for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+  {
+    sum = 0;
+    for( int i = 0 ; i < N ; ++i )
+      sum += a_host[i];
+  }
+
+  gpu_timer.Stop();
+  
+  std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Compute on the CPU using several OpenMP threads
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << std::endl;
+  std::cout << "Computing on the CPU using " << omp_get_max_threads() << " OpenMP thread(s)" << std::endl;
+  
+  gpu_timer.Start();
+
+  // Same sum, distributed over the OpenMP threads with a reduction(+) clause on omp_sum.
+  int omp_sum = 0;
+  for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+  {
+    omp_sum = 0;
+#pragma omp parallel shared(omp_sum)
+    {
+#pragma omp for reduction(+ : omp_sum)
+    for( int i = 0 ; i < N ; ++i )
+      omp_sum = omp_sum + a_host[i];
+    }
+  }
+
+  gpu_timer.Stop();
+  
+  std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+  
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Compute on the GPU
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // The copy of A on the device (GPU).
+  int *a_device = NULL;
+
+  // Allocate A on the device.
+  CUDA_SAFE_CALL( cudaMalloc( (void **) &a_device, N*sizeof( int ) ) );
+
+  // Copy A from host (CPU) to device (GPU); this transfer is outside every timed region.
+  CUDA_SAFE_CALL( cudaMemcpy( a_device, a_host, N*sizeof( int ), cudaMemcpyHostToDevice ) );
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Compute on the GPU using Thrust
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << std::endl;
+  std::cout << "Computing on the GPU using Thrust (transfers excluded)" << std::endl;
+  
+  gpu_timer.Start();
+
+  // Wrap the raw device pointer and reduce the range [aptr, aptr+N) with thrust::reduce.
+  int thrust_sum = 0;
+  thrust::device_ptr<int> aptr = thrust::device_pointer_cast(a_device);
+  for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+  {
+    thrust_sum = thrust::reduce( aptr, aptr+N );
+  }
+
+  gpu_timer.Stop();
+  
+  std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Compute on the GPU
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << std::endl;
+  std::cout << "Computing on the GPU (transfers excluded)" << std::endl;
+  
+  gpu_timer.Start();
+
+  // Baseline GPU reduction; the timed region includes the temporary buffer handling done inside reduce_on_gpu.
+  int gpu_sum = 0;
+  for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+  {
+    gpu_sum = reduce_on_gpu( N, a_device );
+  }
+
+  gpu_timer.Stop();
+  
+  std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Compute on the GPU (optimized version)
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << std::endl;
+  std::cout << "Computing on the GPU using a tuned version (transfers excluded)" << std::endl;
+  
+  gpu_timer.Start();
+
+  const int BLOCK_DIM = 256;
+  
+  // Tuned GPU reduction, instantiated for blocks of BLOCK_DIM threads.
+  int optim_gpu_sum = 0;
+  for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+  {
+    optim_gpu_sum = reduce_on_gpu_optimized<BLOCK_DIM>( N, a_device );
+  }
+  
+  gpu_timer.Stop();
+  
+  std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Compute on the GPU (optimized version without mysum+=)
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << std::endl;
+  std::cout << "Computing on the GPU using a tuned version  without my_sum (transfers excluded)" << std::endl;
+  
+  gpu_timer.Start();
+
+  
+  // Tuned GPU reduction that accumulates directly in shared memory, same BLOCK_DIM.
+  int optim_gpu_sum_wo_mysum = 0;
+  for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
+  {
+    optim_gpu_sum_wo_mysum = reduce_on_gpu_optimized_wo_mysum<BLOCK_DIM>( N, a_device );
+  }
+  
+  gpu_timer.Stop();
+  
+  std::cout << "  Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Report results: each sum is printed next to the CPU reference; nothing is compared automatically
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  std::cout << std::endl;
+  std::cout << std::endl;
+  std::cout << "OpenMP results: ref= " << sum << " / sum= " << omp_sum << std::endl;
+  std::cout << "CUDA   results: ref= " << sum << " / sum= " << gpu_sum << std::endl;
+  std::cout << "Thrust results: ref= " << sum << " / sum= " << thrust_sum << std::endl;
+  std::cout << "Optim  results: ref= " << sum << " / sum= " << optim_gpu_sum << std::endl;
+  std::cout << "Optim without mysum+=  results: ref= " << sum << " / sum= " << optim_gpu_sum_wo_mysum << std::endl;
+  
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Clean memory
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Free device memory.
+  CUDA_SAFE_CALL( cudaFree( a_device ) );
+  
+  // Free host memory.
+  delete[] a_host;
+
+  return 0;
+}
+
--- a/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/.suo
+++ b/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/.suo
--- a/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/Browse.VC.db
+++ b/A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/Browse.VC.db
--- a/A4/TP_OMP_GPU/Seuillage/GpuTimer.h
+++ b/A4/TP_OMP_GPU/Seuillage/GpuTimer.h
@ -0,0 +1,39 @@
+#pragma once
+#include <cuda_runtime_api.h>
+
+// Stopwatch built on a pair of CUDA events recorded with cudaEventRecord.
+//
+// Usage: Start(); ...work...; Stop(); Elapsed().
+// NOTE: the class owns its two event handles and defines no copy semantics of its own; copying
+// an instance would make both copies destroy the same events, so pass it by reference.
+class GpuTimer
+{
+  cudaEvent_t start, stop;
+
+public:
+  // Creates the two events. The handles are zero-initialised first so that a failed
+  // cudaEventCreate never leaves an indeterminate handle for the other members to use.
+  GpuTimer() : start(0), stop(0)
+  {
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+  }
+
+  // Destroys both events.
+  ~GpuTimer()
+  {
+    cudaEventDestroy(stop);
+    cudaEventDestroy(start);
+  }
+
+  // Records the start event.
+  void Start()
+  {
+    cudaEventRecord(start);
+  }
+
+  // Records the stop event and blocks the calling host thread until that event has completed.
+  void Stop()
+  {
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+  }
+
+  // Returns the value reported by cudaEventElapsedTime for the (start, stop) pair
+  // (milliseconds, per the CUDA runtime documentation). The result is initialised to 0 so a
+  // failed query that leaves the output untouched does not return an indeterminate value.
+  float Elapsed()
+  {
+    float elapsed = 0.0f;
+    cudaEventElapsedTime(&elapsed, start, stop);
+    return elapsed;
+  }
+};
+
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage.sln
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage.sln
@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.32228.343
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Seuillage", "Seuillage\Seuillage.vcxproj", "{727252A0-B5D1-48AE-81A6-37E11733EBC2}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Debug|x64.ActiveCfg = Debug|x64
+		{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Debug|x64.Build.0 = Debug|x64
+		{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Release|x64.ActiveCfg = Release|x64
+		{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {5177E263-3BDE-4B8E-8A21-512DA5F23521}
+	EndGlobalSection
+EndGlobal
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.cpp
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.cpp
@ -0,0 +1,7 @@
+#include "CpuTimer.h"
+
+// Timer resolution: the comma expression first lets QueryPerformanceFrequency fill m_freq in place, then yields that same value as the initializer.
+LARGE_INTEGER CpuTimer::m_freq = (QueryPerformanceFrequency(&CpuTimer::m_freq), CpuTimer::m_freq);
+
+// Timer overhead in ticks, measured once by GetOverhead() (a back-to-back Start()/Stop() pair); Elapsed() subtracts it from every reading.
+LONGLONG CpuTimer::m_overhead = CpuTimer::GetOverhead();
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.h
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.h
@ -0,0 +1,37 @@
+#pragma once
+#include <windows.h>
+
+// Host-side stopwatch based on the Windows performance counter
+// (QueryPerformanceCounter for the readings, the static m_freq for the tick rate).
+struct CpuTimer
+{
+  // Captures the counter value that marks the beginning of the measured region.
+  void Start() 
+  {
+    QueryPerformanceCounter(&m_start);
+  }
+
+  // Captures the counter value that marks the end of the measured region.
+  void Stop() 
+  {
+    QueryPerformanceCounter(&m_stop);
+  }
+
+  // Returns the time between the last Start() and Stop() in milliseconds (ms),
+  // with the measured timer overhead removed.
+  double Elapsed()
+  {
+    const LONGLONG net_ticks = m_stop.QuadPart - m_start.QuadPart - m_overhead;
+    return net_ticks * 1000.0 / m_freq.QuadPart;
+  }
+
+private:
+
+  // Measures the cost, in ticks, of an immediately consecutive Start()/Stop() pair.
+  static LONGLONG GetOverhead()
+  {
+    CpuTimer probe;
+    probe.Start();
+    probe.Stop();
+    const LONGLONG first_tick = probe.m_start.QuadPart;
+    const LONGLONG last_tick  = probe.m_stop.QuadPart;
+    return last_tick - first_tick;
+  }
+
+  LARGE_INTEGER m_start;          // counter value captured by Start()
+  LARGE_INTEGER m_stop;           // counter value captured by Stop()
+  static LARGE_INTEGER m_freq;    // counter frequency, defined in CpuTimer.cpp
+  static LONGLONG m_overhead;     // Start()/Stop() overhead in ticks, defined in CpuTimer.cpp
+};
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/GpuTimer.h
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/GpuTimer.h
@ -0,0 +1,39 @@
+#pragma once
+#include <cuda_runtime_api.h>
+
+// Stopwatch built on a pair of CUDA events recorded with cudaEventRecord.
+//
+// Usage: Start(); ...work...; Stop(); Elapsed().
+// NOTE: the class owns its two event handles and defines no copy semantics of its own; copying
+// an instance would make both copies destroy the same events, so pass it by reference.
+class GpuTimer
+{
+  cudaEvent_t start, stop;
+
+public:
+  // Creates the two events. The handles are zero-initialised first so that a failed
+  // cudaEventCreate never leaves an indeterminate handle for the other members to use.
+  GpuTimer() : start(0), stop(0)
+  {
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+  }
+
+  // Destroys both events.
+  ~GpuTimer()
+  {
+    cudaEventDestroy(stop);
+    cudaEventDestroy(start);
+  }
+
+  // Records the start event.
+  void Start()
+  {
+    cudaEventRecord(start);
+  }
+
+  // Records the stop event and blocks the calling host thread until that event has completed.
+  void Stop()
+  {
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+  }
+
+  // Returns the value reported by cudaEventElapsedTime for the (start, stop) pair
+  // (milliseconds, per the CUDA runtime documentation). The result is initialised to 0 so a
+  // failed query that leaves the output untouched does not return an indeterminate value.
+  float Elapsed()
+  {
+    float elapsed = 0.0f;
+    cudaEventElapsedTime(&elapsed, start, stop);
+    return elapsed;
+  }
+};
+
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj
@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{727252A0-B5D1-48AE-81A6-37E11733EBC2}</ProjectGuid>
+    <RootNamespace>Seuillage</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v142</PlatformToolset>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.8.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <CudaCompile>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="kernel.cu" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.8.targets" />
+  </ImportGroup>
+</Project>
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj.user
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj.user
@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup />
+</Project>
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/kernel.cu
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/kernel.cu
@ -0,0 +1,121 @@
+
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+
+// Forward declaration of the host-side helper defined further down in this
+// file, so that main() can call it before its definition appears.
+cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
+
+// Element-wise integer addition: each thread stores a[t] + b[t] into c[t],
+// where t is the thread's x-index inside its block (threadIdx.x).
+//
+// Only threadIdx.x is used for indexing and no bounds check is performed, so
+// the caller must not launch more threads per block than there are elements
+// in a, b and c.
+__global__ void addKernel(int *c, const int *a, const int *b)
+{
+    const int tid = threadIdx.x;
+    const int sum = a[tid] + b[tid];
+    c[tid] = sum;
+}
+
+// Demo entry point: adds two fixed 5-element vectors with addWithCuda, prints
+// the resulting vector, then resets the device.
+// Returns 0 on success and 1 if the addition or the device reset fails.
+int main()
+{
+    const int arraySize = 5;
+    const int lhs[arraySize] = { 1, 2, 3, 4, 5 };
+    const int rhs[arraySize] = { 10, 20, 30, 40, 50 };
+    int sum[arraySize] = { 0 };
+
+    // Run the element-wise addition on the GPU.
+    if (addWithCuda(sum, lhs, rhs, arraySize) != cudaSuccess) {
+        fprintf(stderr, "addWithCuda failed!");
+        return 1;
+    }
+
+    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
+        sum[0], sum[1], sum[2], sum[3], sum[4]);
+
+    // Profiling and tracing tools such as Nsight and Visual Profiler only show
+    // complete traces if cudaDeviceReset is called before the process exits.
+    if (cudaDeviceReset() != cudaSuccess) {
+        fprintf(stderr, "cudaDeviceReset failed!");
+        return 1;
+    }
+
+    return 0;
+}
+
+// Helper function for using CUDA to add vectors in parallel.
+//
+// Selects device 0, copies the host arrays a and b to the GPU, runs addKernel
+// to compute c[i] = a[i] + b[i] for i in [0, size), and copies the result back
+// into the host array c.
+//
+// Parameters:
+//   c    - host output buffer; size ints are written to it on success.
+//   a, b - host input buffers; size ints are read from each.
+//   size - number of elements to add. The kernel is launched as a single
+//          block of `size` threads, so a size above the device's per-block
+//          thread limit makes the launch fail; that failure is reported
+//          through the cudaGetLastError check below.
+//
+// Returns cudaSuccess, or the status of the first CUDA call that failed (a
+// message is also written to stderr). An empty input (size == 0) is a no-op
+// that returns cudaSuccess once the device has been selected.
+//
+// All device buffers are released before returning, on success and on error.
+cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
+{
+    int *dev_a = 0;
+    int *dev_b = 0;
+    int *dev_c = 0;
+    // Byte count shared by every allocation and copy below.
+    const size_t bytes = size * sizeof(int);
+    cudaError_t cudaStatus;
+
+    // Choose which GPU to run on, change this on a multi-GPU system.
+    cudaStatus = cudaSetDevice(0);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
+        goto Error;
+    }
+
+    // Nothing to add: launching a kernel with zero threads per block is an
+    // invalid configuration, so finish here with the (successful) status of
+    // cudaSetDevice instead of reporting a spurious launch failure.
+    if (size == 0) {
+        goto Error;
+    }
+
+    // Allocate GPU buffers for three vectors (two input, one output).
+    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+
+    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+
+    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+
+    // Copy input vectors from host memory to GPU buffers.
+    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+
+    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+
+    // Launch a kernel on the GPU with one thread for each element.
+    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
+
+    // Check for any errors launching the kernel (e.g. a block size the
+    // device cannot support).
+    cudaStatus = cudaGetLastError();
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
+        goto Error;
+    }
+
+    // cudaDeviceSynchronize waits for the kernel to finish, and returns
+    // any errors encountered during the launch.
+    cudaStatus = cudaDeviceSynchronize();
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
+        goto Error;
+    }
+
+    // Copy output vector from GPU buffer to host memory.
+    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+
+    // Shared cleanup for the success path and every failure path. Buffers that
+    // were never allocated are still null pointers here.
+Error:
+    cudaFree(dev_c);
+    cudaFree(dev_a);
+    cudaFree(dev_b);
+
+    return cudaStatus;
+}
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.Build.CppClean.log
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.Build.CppClean.log
@ -0,0 +1,4 @@
+c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\kernel.cu.obj
+c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\kernel.cu.cache
+c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\seuillage.tlog\cudacompile.read.1u.tlog
+c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\seuillage.tlog\cudacompile.write.1u.tlog
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.log
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.log
@ -0,0 +1 @@
+C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 11.8.targets(606,9): error : The CUDA Toolkit v11.8 directory '' does not exist.  Please verify the CUDA Toolkit is installed properly or define the CudaToolkitDir property to resolve this error.
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.read.1u.tlog
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.read.1u.tlog
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.write.1u.tlog
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/CudaCompile.write.1u.tlog
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/Seuillage.lastbuildstate
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/Seuillage.lastbuildstate
@ -0,0 +1,2 @@
+PlatformToolSet=v142:VCToolArchitecture=Native32Bit:VCToolsVersion=14.29.30133:VCServicingVersionCrtHeaders=14.29.30136:TargetPlatformVersion=10.0.22000.0:
+Debug|x64|C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_OMP_GPU\Seuillage\|
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/unsuccessfulbuild
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.tlog/unsuccessfulbuild
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.vcxproj.FileListAbsolute.txt
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/Seuillage.vcxproj.FileListAbsolute.txt
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu-686266873.deps
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu-686266873.deps
--- a/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu.cache
+++ b/A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu.cache
@ -0,0 +1,55 @@
+Identity=kernel.cu
+AdditionalCompilerOptions=
+AdditionalCompilerOptions=
+AdditionalDependencies=
+AdditionalDeps=
+AdditionalLibraryDirectories=
+AdditionalOptions=
+AdditionalOptions=
+CodeGeneration=compute_52,sm_52
+CodeGeneration=compute_52,sm_52
+CompileOut=C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_OMP_GPU\Seuillage\Seuillage\x64\Debug\kernel.cu.obj
+CudaRuntime=Static
+CudaToolkitCustomDir=
+DebugInformationFormat=ProgramDatabase
+DebugInformationFormat=ProgramDatabase
+Defines=;WIN32;WIN64;_DEBUG;_CONSOLE;_MBCS;
+Emulation=false
+EnableVirtualArchInFatbin=true
+ExtensibleWholeProgramCompilation=false
+FastMath=false
+GenerateLineInfo=false
+GenerateRelocatableDeviceCode=false
+GPUDebugInfo=true
+GPUDebugInfo=true
+HostDebugInfo=true
+Include=;;include
+Inputs=
+InterleaveSourceInPTX=false
+Keep=false
+KeepDir=x64\Debug
+LinkOut=
+MaxRegCount=0
+NvccCompilation=compile
+NvccPath=
+Optimization=Od
+Optimization=Od
+PerformDeviceLink=
+ProgramDataBaseFileName=x64\Debug\vc142.pdb
+ProgramDataBaseFileName=x64\Debug\vc142.pdb
+PtxAsOptionV=false
+RequiredIncludes=
+Runtime=MDd
+Runtime=MDd
+RuntimeChecks=RTC1
+RuntimeChecks=RTC1
+TargetMachinePlatform=64
+TargetMachinePlatform=64
+TypeInfo=
+TypeInfo=
+UseHostDefines=true
+UseHostInclude=true
+UseHostLibraryDependencies=
+UseHostLibraryDirectories=
+Warning=W3
+Warning=W3
--- a/A4/TP_OMP_GPU/deviceQuery.PNG
+++ b/A4/TP_OMP_GPU/deviceQuery.PNG
--- a/IA/1708.06733.pdf
+++ b/IA/1708.06733.pdf
--- a/IA/Analyse.docx
+++ b/IA/Analyse.docx
--- a/IA/Master_2_SETI-IA_EmbarquéeDeConfiance-Examen-2023-02-24[9120].pdf
+++ b/IA/Master_2_SETI-IA_EmbarquéeDeConfiance-Examen-2023-02-24[9120].pdf
--- a/Applications-PartI.pdf
+++ b/Applications-PartI.pdf
--- a/Applications-PartII.pdf
+++ b/Applications-PartII.pdf
--- a/IA/cours_adversarial_20230203[9258].pdf
+++ b/IA/cours_adversarial_20230203[9258].pdf
				`@ -0,0 +1 @@`
				`C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_GPU-master\TP2_reduction\windows\Reduce.vcxproj(55,5): error MSB4019: le projet importé "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 4.2.props" est introuvable. Vérifiez que l'expression de la déclaration Import "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\\BuildCustomizations\CUDA 4.2.props" est correcte et que le fichier existe sur le disque.`
				`@ -0,0 +1 @@`
				`C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 11.8.targets(606,9): error : The CUDA Toolkit v11.8 directory '' does not exist. Please verify the CUDA Toolkit is installed properly or define the CudaToolkitDir property to resolve this error.`