IA
This commit is contained in:
parent
83239d487e
commit
049e45e1b9
34 changed files with 1002 additions and 0 deletions
Binary file not shown.
BIN
A2/Detection_cercle/car.bmp
Normal file
BIN
A2/Detection_cercle/car.bmp
Normal file
Binary file not shown.
After Width: | Height: | Size: 5.9 MiB |
BIN
A2/Detection_cercle/car.jpg
Normal file
BIN
A2/Detection_cercle/car.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 370 KiB |
BIN
A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/.suo
Normal file
BIN
A4/TP_GPU-master/TP2_reduction/windows/.vs/Reduce/v16/.suo
Normal file
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<PropertyGroup />
|
||||
</Project>
|
|
@ -0,0 +1 @@
|
|||
C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_GPU-master\TP2_reduction\windows\Reduce.vcxproj(55,5): error MSB4019: le projet importé "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 4.2.props" est introuvable. Vérifiez que l'expression de la déclaration Import "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\\BuildCustomizations\CUDA 4.2.props" est correcte et que le fichier existe sur le disque.
|
577
A4/TP_OMP_GPU/Reduce_solution.cu
Normal file
577
A4/TP_OMP_GPU/Reduce_solution.cu
Normal file
|
@ -0,0 +1,577 @@
|
|||
/*
|
||||
# Copyright (c) 2011-2012 NVIDIA CORPORATION. All Rights Reserved.
|
||||
#
|
||||
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
||||
# and proprietary rights in and to this software, related documentation
|
||||
# and any modifications thereto. Any use, reproduction, disclosure or
|
||||
# distribution of this software and related documentation without an express
|
||||
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <algorithm>
#include <iostream>
#include <new>

#include <cuda_runtime_api.h>
#include <omp.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

#include "GpuTimer.h"
|
||||
|
||||
// Evaluates the CUDA runtime expression `call` exactly once. If it does not yield
// cudaSuccess, prints "Error (<file>:<line>): <CUDA error string>" to std::cerr and
// makes the ENCLOSING FUNCTION return 1, so it can only be used inside functions
// whose return type is convertible from int.
//
// The body is wrapped in do { ... } while( 0 ) so that `CUDA_SAFE_CALL( x );` is a
// single statement (safe inside un-braced if/else), and the argument is
// parenthesized so any expression can be passed.
#define CUDA_SAFE_CALL(call)                                                        \
    do {                                                                            \
        const cudaError_t err_code = (call);                                        \
        if( err_code != cudaSuccess ) {                                             \
            std::cerr << "Error (" << __FILE__ << ":" << __LINE__ << "): "          \
                      << cudaGetErrorString(err_code) << std::endl;                 \
            return 1;                                                               \
        }                                                                           \
    } while( 0 )
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// G P U R E D U C T I O N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Block-level sum reduction.
//
// Block b sums in_buffer[idx] for idx in [block_ranges[b].x, block_ranges[b].y) and
// writes that single partial sum to out_buffer[b].
//
// Launch requirements:
//   - 1D blocks; blockDim.x must be a power of two (the tree below halves it).
//   - blockDim.x * sizeof(int) bytes of dynamic shared memory (3rd launch argument).
//   - block_ranges must hold one int2 per launched block.
//   - __syncwarp() requires CUDA 9 or newer.
//
// `n` is not read by the kernel; it is kept for interface compatibility.
__global__ void reduce_kernel( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
{
    // One int of dynamic shared memory per thread of the block.
    extern __shared__ int s_mem[];

    // The range of data this block works with.
    const int2 range = block_ranges[blockIdx.x];

    // Each thread accumulates a strided subset of the block's range in a register.
    int my_sum = 0;
    for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += blockDim.x )
        my_sum += in_buffer[idx];

    // Publish the per-thread sum and wait until every thread of the block has done so.
    s_mem[threadIdx.x] = my_sum;
    __syncthreads();

    // Tree reduction across warps: a block-wide barrier is needed after each step
    // while the active threads still span more than one warp (offset >= 32).
    int offset;
    for( offset = blockDim.x / 2 ; offset > 16 ; offset /= 2 )
    {
        if( threadIdx.x < offset )
            s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
        __syncthreads();
    }

    // Remaining steps (offset <= 16) only involve threads of warp 0, so a block-wide
    // barrier is not needed. A warp-level barrier IS needed: threads of a warp are not
    // guaranteed to run in lockstep, and without it a thread may read s_mem before its
    // neighbour's write from the previous step is visible. The barrier sits outside
    // the `if` so every thread of each warp reaches it.
    for( ; offset > 0 ; offset /= 2 )
    {
        if( threadIdx.x < offset )
            s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
        __syncwarp();
    }

    // The first thread of the block stores the block's result.
    if( threadIdx.x == 0 )
        out_buffer[blockIdx.x] = s_mem[0];
}
|
||||
|
||||
// Sums the `n` ints stored at device pointer `a_device` using two launches of
// reduce_kernel: the first produces one partial sum per block, the second (a single
// block) adds those partial sums together.
//
// Returns the sum. For n <= 0 it returns 0 without touching the GPU.
// NOTE: on any CUDA error CUDA_SAFE_CALL prints a message and makes this function
// return 1, which callers cannot distinguish from a genuine sum of 1; the device
// scratch buffers are not released on that path.
int reduce_on_gpu( int n, const int *a_device )
{
    // Nothing to add up. This also avoids a zero-sized grid and the division by
    // num_threads == 0 below.
    if( n <= 0 )
        return 0;

    // Compute the size of the grid: at most BLOCK_DIM blocks, so that the second
    // round can reduce all partial sums with a single block.
    // Ceil-divisions are written as (n - 1) / d + 1 so they cannot overflow for large n.
    const int BLOCK_DIM = 256;
    const int grid_dim = std::min( BLOCK_DIM, (n - 1) / BLOCK_DIM + 1 );
    const int num_threads = BLOCK_DIM * grid_dim;

    // Compute the number of elements per block (a multiple of BLOCK_DIM).
    const int elements_per_block = BLOCK_DIM * ((n - 1) / num_threads + 1);

    // Allocate memory for temporary buffers.
    int *partial_sums = NULL;
    int2 *block_ranges = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
    CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );

    // Compute the half-open range [x, y) of every block on the host. `begin` never
    // exceeds n, so trailing blocks simply get the empty range [n, n).
    int begin = 0;
    int2 *block_ranges_on_host = new int2[grid_dim];
    for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
    {
        const int end = begin + std::min( elements_per_block, n - begin );
        block_ranges_on_host[block_idx].x = begin;
        block_ranges_on_host[block_idx].y = end;
        begin = end;
    }

    // Upload the ranges. The host copy is released before the status is checked so
    // that it is not leaked when the copy fails.
    const cudaError_t upload_status =
        cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice );
    delete[] block_ranges_on_host;
    CUDA_SAFE_CALL( upload_status );

    // First round: compute a partial sum for all blocks.
    reduce_kernel<<<grid_dim, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( n, a_device, partial_sums, block_ranges );
    CUDA_SAFE_CALL( cudaGetLastError() );

    // Set the range for the second kernel call: one block covering all partial sums.
    int2 block_range = make_int2( 0, grid_dim );
    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );

    // Second round: sum the partial results of all blocks into partial_sums[0].
    reduce_kernel<<<1, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( grid_dim, partial_sums, partial_sums, block_ranges );
    CUDA_SAFE_CALL( cudaGetLastError() );

    // Read the result from device memory.
    int result;
    CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );

    // Free temporary memory.
    CUDA_SAFE_CALL( cudaFree( block_ranges ) );
    CUDA_SAFE_CALL( cudaFree( partial_sums ) );

    return result;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// G P U R E D U C T I O N : O P T I M I Z E D V E R S I O N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define WARP_SIZE 32
|
||||
|
||||
// Block-level sum reduction, two-stage variant (per-warp reduction, then a reduction
// of the per-warp totals by warp 0), keeping the running sum in a register.
//
// Block b sums in_buffer[idx] for idx in [block_ranges[b].x, block_ranges[b].y) and
// writes that single partial sum to out_buffer[b].
//
// Launch requirements:
//   - blockDim.x must equal BLOCK_DIM (shared memory is sized statically from it).
//   - BLOCK_DIM must be a power of two and at least WARP_SIZE.
//   - block_ranges must hold one int2 per launched block.
//   - __syncwarp() requires CUDA 9 or newer.
//
// `n` is not read by the kernel; it is kept for interface compatibility.
template< int BLOCK_DIM >
__global__ void reduce_kernel_optimized( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
{
    // The number of warps in the block.
    const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;

    // Statically sized shared memory: one int per thread.
    __shared__ volatile int s_mem[BLOCK_DIM];

    // The range of data this block works with.
    const int2 range = block_ranges[blockIdx.x];

    // Warp/lane IDs.
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    // Each thread accumulates a strided subset of the block's range in a register.
    int my_sum = 0;
    for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += BLOCK_DIM )
        my_sum += in_buffer[idx];

    // Publish the per-thread sum to the other lanes of the warp.
    s_mem[threadIdx.x] = my_sum;
    __syncwarp();

    // Compute the sum inside each warp. Within one step the lanes read from
    // [offset, 2*offset) and write to [0, offset), so the only hazard is between
    // consecutive steps; the warp barrier orders them (lockstep execution of a warp
    // is not guaranteed). After the loop, lane 0 holds the warp total in my_sum.
    #pragma unroll
    for( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 )
    {
        if( lane_id < offset )
            s_mem[threadIdx.x] = my_sum += s_mem[threadIdx.x + offset];
        __syncwarp();
    }

    // Every warp must be done reading its own slots before the warp leaders overwrite
    // s_mem[0..NUM_WARPS), which lies inside warp 0's slots.
    __syncthreads();

    // Each warp leader stores the total of its warp. Leaders only write here; they do
    // not read shared memory, so no leader can observe another leader's store.
    if( lane_id == 0 )
        s_mem[warp_id] = my_sum;

    __syncthreads();

    if( warp_id == 0 )
    {
        // Read one per-warp total into a register. Lanes >= NUM_WARPS read values
        // that never take part in the additions below.
        my_sum = s_mem[lane_id];

        // Sum the results of the warps (runs zero times when NUM_WARPS == 1).
        #pragma unroll
        for( int offset = NUM_WARPS / 2 ; offset > 0 ; offset >>= 1 )
        {
            if( lane_id < offset )
                s_mem[lane_id] = my_sum += s_mem[lane_id + offset];
            __syncwarp();
        }
    }

    // The 1st thread stores the result of the block.
    if( threadIdx.x == 0 )
        out_buffer[blockIdx.x] = my_sum;
}
|
||||
|
||||
// Sums the `n` ints stored at device pointer `a_device` using two launches of
// reduce_kernel_optimized<BLOCK_DIM>: the first produces one partial sum per block,
// the second (a single block) adds those partial sums together.
//
// BLOCK_DIM is both the number of threads per block and the upper bound on the
// number of blocks.
//
// Returns the sum. For n <= 0 it returns 0 without touching the GPU.
// NOTE: on any CUDA error CUDA_SAFE_CALL prints a message and makes this function
// return 1, which callers cannot distinguish from a genuine sum of 1; the device
// scratch buffers are not released on that path.
template< int BLOCK_DIM >
int reduce_on_gpu_optimized( int n, const int *a_device )
{
    // Nothing to add up. This also avoids a zero-sized grid and the division by
    // num_threads == 0 below.
    if( n <= 0 )
        return 0;

    // Compute the size of the grid: at most BLOCK_DIM blocks, so that the second
    // round can reduce all partial sums with a single block.
    // Ceil-divisions are written as (n - 1) / d + 1 so they cannot overflow for large n.
    const int grid_dim = std::min( BLOCK_DIM, (n - 1) / BLOCK_DIM + 1 );
    const int num_threads = BLOCK_DIM * grid_dim;

    // Compute the number of elements per block (a multiple of BLOCK_DIM).
    const int elements_per_block = BLOCK_DIM * ((n - 1) / num_threads + 1);

    // Allocate memory for temporary buffers.
    int *partial_sums = NULL;
    int2 *block_ranges = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
    CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );

    // Compute the half-open range [x, y) of every block on the host. `begin` never
    // exceeds n, so trailing blocks simply get the empty range [n, n).
    int begin = 0;
    int2 *block_ranges_on_host = new int2[grid_dim];
    for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
    {
        const int end = begin + std::min( elements_per_block, n - begin );
        block_ranges_on_host[block_idx].x = begin;
        block_ranges_on_host[block_idx].y = end;
        begin = end;
    }

    // Upload the ranges. The host copy is released before the status is checked so
    // that it is not leaked when the copy fails.
    const cudaError_t upload_status =
        cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice );
    delete[] block_ranges_on_host;
    CUDA_SAFE_CALL( upload_status );

    // First round: compute a partial sum for all blocks.
    reduce_kernel_optimized<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
    CUDA_SAFE_CALL( cudaGetLastError() );

    // Set the range for the second kernel call: one block covering all partial sums.
    int2 block_range = make_int2( 0, grid_dim );
    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );

    // Second round: sum the partial results of all blocks into partial_sums[0].
    reduce_kernel_optimized<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
    CUDA_SAFE_CALL( cudaGetLastError() );

    // Read the result from device memory.
    int result;
    CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );

    // Free temporary memory.
    CUDA_SAFE_CALL( cudaFree( block_ranges ) );
    CUDA_SAFE_CALL( cudaFree( partial_sums ) );

    return result;
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// G P U R E D U C T I O N : O P T I M I Z E D WITHOUT MYSUM+= V E R S I O N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
// Block-level sum reduction, two-stage variant (per-warp reduction, then a reduction
// of the per-warp totals by warp 0) that accumulates directly in shared memory
// instead of carrying the running sum in a register.
//
// Block b sums in_buffer[idx] for idx in [block_ranges[b].x, block_ranges[b].y) and
// writes that single partial sum to out_buffer[b].
//
// Launch requirements:
//   - blockDim.x must equal BLOCK_DIM (shared memory is sized statically from it).
//   - BLOCK_DIM must be a power of two and at least WARP_SIZE.
//   - block_ranges must hold one int2 per launched block.
//   - __syncwarp() requires CUDA 9 or newer.
//
// `n` is not read by the kernel; it is kept for interface compatibility.
template< int BLOCK_DIM >
__global__ void reduce_kernel_optimized_wo_mysum( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
{
    // The number of warps in the block.
    const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;

    // Statically sized shared memory: one int per thread.
    __shared__ volatile int s_mem[BLOCK_DIM];

    // The range of data this block works with.
    const int2 range = block_ranges[blockIdx.x];

    // Warp/lane IDs.
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    // Each thread accumulates a strided subset of the block's range in a register.
    int my_sum = 0;
    for( int idx = range.x + threadIdx.x ; idx < range.y ; idx += BLOCK_DIM )
        my_sum += in_buffer[idx];

    // Publish the per-thread sum to the other lanes of the warp.
    s_mem[threadIdx.x] = my_sum;
    __syncwarp();

    // Compute the sum inside each warp, directly in shared memory. The warp barrier
    // orders consecutive steps (lockstep execution of a warp is not guaranteed).
    // After the loop, s_mem[warp_id * WARP_SIZE] holds the warp total.
    #pragma unroll
    for( int offset = WARP_SIZE / 2 ; offset > 0 ; offset >>= 1 )
    {
        if( lane_id < offset )
            s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
        __syncwarp();
    }

    // Every warp must be done with its own slots before the leaders write into
    // s_mem[0..NUM_WARPS), which lies inside warp 0's slots.
    __syncthreads();

    // Each warp leader copies its warp total to slot warp_id. Leaders read slot
    // warp_id * WARP_SIZE and write slot warp_id; the two sets only share slot 0,
    // which is read and written by the same thread.
    if( lane_id == 0 )
        s_mem[warp_id] = s_mem[threadIdx.x];

    __syncthreads();

    if( warp_id == 0 )
    {
        // Sum the results of the warps (runs zero times when NUM_WARPS == 1).
        #pragma unroll
        for( int offset = NUM_WARPS / 2 ; offset > 0 ; offset >>= 1 )
        {
            if( lane_id < offset )
                s_mem[lane_id] += s_mem[lane_id + offset];
            __syncwarp();
        }
    }

    // The 1st thread stores the result of the block.
    if( threadIdx.x == 0 )
        out_buffer[blockIdx.x] = s_mem[0];
}
|
||||
|
||||
// Sums the `n` ints stored at device pointer `a_device` using two launches of
// reduce_kernel_optimized_wo_mysum<BLOCK_DIM>: the first produces one partial sum
// per block, the second (a single block) adds those partial sums together.
//
// BLOCK_DIM is both the number of threads per block and the upper bound on the
// number of blocks.
//
// Returns the sum. For n <= 0 it returns 0 without touching the GPU.
// NOTE: on any CUDA error CUDA_SAFE_CALL prints a message and makes this function
// return 1, which callers cannot distinguish from a genuine sum of 1; the device
// scratch buffers are not released on that path.
template< int BLOCK_DIM >
int reduce_on_gpu_optimized_wo_mysum( int n, const int *a_device )
{
    // Nothing to add up. This also avoids a zero-sized grid and the division by
    // num_threads == 0 below.
    if( n <= 0 )
        return 0;

    // Compute the size of the grid: at most BLOCK_DIM blocks, so that the second
    // round can reduce all partial sums with a single block.
    // Ceil-divisions are written as (n - 1) / d + 1 so they cannot overflow for large n.
    const int grid_dim = std::min( BLOCK_DIM, (n - 1) / BLOCK_DIM + 1 );
    const int num_threads = BLOCK_DIM * grid_dim;

    // Compute the number of elements per block (a multiple of BLOCK_DIM).
    const int elements_per_block = BLOCK_DIM * ((n - 1) / num_threads + 1);

    // Allocate memory for temporary buffers.
    int *partial_sums = NULL;
    int2 *block_ranges = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
    CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );

    // Compute the half-open range [x, y) of every block on the host. `begin` never
    // exceeds n, so trailing blocks simply get the empty range [n, n).
    int begin = 0;
    int2 *block_ranges_on_host = new int2[grid_dim];
    for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
    {
        const int end = begin + std::min( elements_per_block, n - begin );
        block_ranges_on_host[block_idx].x = begin;
        block_ranges_on_host[block_idx].y = end;
        begin = end;
    }

    // Upload the ranges. The host copy is released before the status is checked so
    // that it is not leaked when the copy fails.
    const cudaError_t upload_status =
        cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice );
    delete[] block_ranges_on_host;
    CUDA_SAFE_CALL( upload_status );

    // First round: compute a partial sum for all blocks.
    reduce_kernel_optimized_wo_mysum<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
    CUDA_SAFE_CALL( cudaGetLastError() );

    // Set the range for the second kernel call: one block covering all partial sums.
    int2 block_range = make_int2( 0, grid_dim );
    CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );

    // Second round: sum the partial results of all blocks into partial_sums[0].
    reduce_kernel_optimized_wo_mysum<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
    CUDA_SAFE_CALL( cudaGetLastError() );

    // Read the result from device memory.
    int result;
    CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );

    // Free temporary memory.
    CUDA_SAFE_CALL( cudaFree( block_ranges ) );
    CUDA_SAFE_CALL( cudaFree( partial_sums ) );

    return result;
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// M A I N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Benchmark driver: sums an array of N ints (all set to 1) with a single CPU thread,
// OpenMP, Thrust and the three hand-written GPU reductions, printing the average
// time over NUM_TESTS runs of each and the results side by side.
//
// Returns 0 when every result equals the single-threaded CPU reference, 1 when a
// result differs, the host allocation fails, or a CUDA call fails (CUDA_SAFE_CALL
// returns 1 from main directly).
int main( int, char ** )
{
    const int NUM_TESTS = 10;

    // The number of elements in the problem.
    const int N = 256*256*1024;
    //const int N = 256*256*8;

    std::cout << "Computing a reduction on " << N << " elements" << std::endl;

    // The input array on the host (CPU). The nothrow form is used so that a failed
    // allocation yields NULL and the check below is reachable (plain `new` throws).
    int *a_host = new (std::nothrow) int[N];
    if( a_host == NULL )
    {
        std::cerr << "ERROR: Couldn't allocate a_host" << std::endl;
        return 1;
    }

    /////////////////////////////////////////////////////////////////////////////////
    // Generate data
    /////////////////////////////////////////////////////////////////////////////////

    std::cout << "Filling with 1s" << std::endl;

    // Every element is 1, so the expected sum is N.
    for( int i = 0 ; i < N ; ++i )
        a_host[i] = 1;

    /////////////////////////////////////////////////////////////////////////////////
    // Compute on the CPU using 1 thread
    /////////////////////////////////////////////////////////////////////////////////

    std::cout << std::endl;
    std::cout << "Computing on the CPU using 1 CPU thread" << std::endl;

    // The same event-based timer is used for the host loops and the GPU runs.
    GpuTimer gpu_timer;
    gpu_timer.Start();

    // Calculate the reference to compare the other results with.
    int sum = 0;
    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
    {
        sum = 0;
        for( int i = 0 ; i < N ; ++i )
            sum += a_host[i];
    }

    gpu_timer.Stop();

    std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;

    /////////////////////////////////////////////////////////////////////////////////
    // Compute on the CPU using several OpenMP threads
    /////////////////////////////////////////////////////////////////////////////////

    std::cout << std::endl;
    std::cout << "Computing on the CPU using " << omp_get_max_threads() << " OpenMP thread(s)" << std::endl;

    gpu_timer.Start();

    int omp_sum = 0;
    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
    {
        omp_sum = 0;
        #pragma omp parallel shared(omp_sum)
        {
            // The reduction clause combines the per-thread sums into omp_sum.
            #pragma omp for reduction(+ : omp_sum)
            for( int i = 0 ; i < N ; ++i )
                omp_sum = omp_sum + a_host[i];
        }
    }

    gpu_timer.Stop();

    std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;

    /////////////////////////////////////////////////////////////////////////////////
    // Compute on the GPU
    /////////////////////////////////////////////////////////////////////////////////

    // The copy of A on the device (GPU).
    int *a_device = NULL;

    // Allocate A on the device.
    CUDA_SAFE_CALL( cudaMalloc( (void **) &a_device, N*sizeof( int ) ) );

    // Copy A from host (CPU) to device (GPU).
    CUDA_SAFE_CALL( cudaMemcpy( a_device, a_host, N*sizeof( int ), cudaMemcpyHostToDevice ) );

    /////////////////////////////////////////////////////////////////////////////////
    // Compute on the GPU using Thrust
    /////////////////////////////////////////////////////////////////////////////////

    std::cout << std::endl;
    std::cout << "Computing on the GPU using Thrust (transfers excluded)" << std::endl;

    gpu_timer.Start();

    int thrust_sum = 0;
    thrust::device_ptr<int> aptr = thrust::device_pointer_cast(a_device);
    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
    {
        thrust_sum = thrust::reduce( aptr, aptr+N );
    }

    gpu_timer.Stop();

    std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;

    /////////////////////////////////////////////////////////////////////////////////
    // Compute on the GPU (basic kernel)
    /////////////////////////////////////////////////////////////////////////////////

    std::cout << std::endl;
    std::cout << "Computing on the GPU (transfers excluded)" << std::endl;

    gpu_timer.Start();

    int gpu_sum = 0;
    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
    {
        gpu_sum = reduce_on_gpu( N, a_device );
    }

    gpu_timer.Stop();

    std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;

    /////////////////////////////////////////////////////////////////////////////////
    // Compute on the GPU (optimized version)
    /////////////////////////////////////////////////////////////////////////////////

    std::cout << std::endl;
    std::cout << "Computing on the GPU using a tuned version (transfers excluded)" << std::endl;

    gpu_timer.Start();

    // Threads per block for the templated kernels.
    const int BLOCK_DIM = 256;

    int optim_gpu_sum = 0;
    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
    {
        optim_gpu_sum = reduce_on_gpu_optimized<BLOCK_DIM>( N, a_device );
    }

    gpu_timer.Stop();

    std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;

    /////////////////////////////////////////////////////////////////////////////////
    // Compute on the GPU (optimized version without mysum+=)
    /////////////////////////////////////////////////////////////////////////////////

    std::cout << std::endl;
    std::cout << "Computing on the GPU using a tuned version without my_sum (transfers excluded)" << std::endl;

    gpu_timer.Start();

    int optim_gpu_sum_wo_mysum = 0;
    for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
    {
        optim_gpu_sum_wo_mysum = reduce_on_gpu_optimized_wo_mysum<BLOCK_DIM>( N, a_device );
    }

    gpu_timer.Stop();

    std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;

    /////////////////////////////////////////////////////////////////////////////////
    // Validate results
    /////////////////////////////////////////////////////////////////////////////////

    std::cout << std::endl;
    std::cout << std::endl;
    std::cout << "OpenMP results: ref= " << sum << " / sum= " << omp_sum << std::endl;
    std::cout << "CUDA results: ref= " << sum << " / sum= " << gpu_sum << std::endl;
    std::cout << "Thrust results: ref= " << sum << " / sum= " << thrust_sum << std::endl;
    std::cout << "Optim results: ref= " << sum << " / sum= " << optim_gpu_sum << std::endl;
    std::cout << "Optim without mysum+= results: ref= " << sum << " / sum= " << optim_gpu_sum_wo_mysum << std::endl;

    // Actually compare against the reference so a wrong reduction is reflected in
    // the exit status instead of only being visible to someone reading the output.
    const bool all_match = omp_sum == sum
                        && gpu_sum == sum
                        && thrust_sum == sum
                        && optim_gpu_sum == sum
                        && optim_gpu_sum_wo_mysum == sum;
    if( !all_match )
        std::cerr << "ERROR: at least one result differs from the CPU reference" << std::endl;

    /////////////////////////////////////////////////////////////////////////////////
    // Clean memory
    /////////////////////////////////////////////////////////////////////////////////

    // Free device memory.
    CUDA_SAFE_CALL( cudaFree( a_device ) );

    // Free host memory.
    delete[] a_host;

    return all_match ? 0 : 1;
}
|
||||
|
BIN
A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/.suo
Normal file
BIN
A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/.suo
Normal file
Binary file not shown.
BIN
A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/Browse.VC.db
Normal file
BIN
A4/TP_OMP_GPU/Seuillage/.vs/Seuillage/v16/Browse.VC.db
Normal file
Binary file not shown.
39
A4/TP_OMP_GPU/Seuillage/GpuTimer.h
Normal file
39
A4/TP_OMP_GPU/Seuillage/GpuTimer.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
class GpuTimer
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
|
||||
public:
|
||||
GpuTimer()
|
||||
{
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
}
|
||||
|
||||
~GpuTimer()
|
||||
{
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventDestroy(start);
|
||||
}
|
||||
|
||||
void Start()
|
||||
{
|
||||
cudaEventRecord(start);
|
||||
}
|
||||
|
||||
void Stop()
|
||||
{
|
||||
cudaEventRecord(stop);
|
||||
cudaEventSynchronize(stop);
|
||||
}
|
||||
|
||||
float Elapsed()
|
||||
{
|
||||
float elapsed;
|
||||
cudaEventElapsedTime(&elapsed, start, stop);
|
||||
return elapsed;
|
||||
}
|
||||
};
|
||||
|
25
A4/TP_OMP_GPU/Seuillage/Seuillage.sln
Normal file
25
A4/TP_OMP_GPU/Seuillage/Seuillage.sln
Normal file
|
@ -0,0 +1,25 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio Version 16
|
||||
VisualStudioVersion = 16.0.32228.343
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Seuillage", "Seuillage\Seuillage.vcxproj", "{727252A0-B5D1-48AE-81A6-37E11733EBC2}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|x64 = Debug|x64
|
||||
Release|x64 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Debug|x64.Build.0 = Debug|x64
|
||||
{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Release|x64.ActiveCfg = Release|x64
|
||||
{727252A0-B5D1-48AE-81A6-37E11733EBC2}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {5177E263-3BDE-4B8E-8A21-512DA5F23521}
|
||||
EndGlobalSection
|
||||
EndGlobal
|
7
A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.cpp
Normal file
7
A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.cpp
Normal file
|
@ -0,0 +1,7 @@
|
|||
#include "CpuTimer.h"
|
||||
|
||||
// Initialize the resolution of the timer
|
||||
LARGE_INTEGER CpuTimer::m_freq = (QueryPerformanceFrequency(&CpuTimer::m_freq), CpuTimer::m_freq);
|
||||
|
||||
// Calculate the overhead of the timer
|
||||
LONGLONG CpuTimer::m_overhead = CpuTimer::GetOverhead();
|
37
A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.h
Normal file
37
A4/TP_OMP_GPU/Seuillage/Seuillage/CpuTimer.h
Normal file
|
@ -0,0 +1,37 @@
|
|||
#pragma once
|
||||
#include <windows.h>
|
||||
|
||||
struct CpuTimer
|
||||
{
|
||||
void Start()
|
||||
{
|
||||
QueryPerformanceCounter(&m_start);
|
||||
}
|
||||
|
||||
void Stop()
|
||||
{
|
||||
QueryPerformanceCounter(&m_stop);
|
||||
}
|
||||
|
||||
// Returns elapsed time in milliseconds (ms)
|
||||
double Elapsed()
|
||||
{
|
||||
return (m_stop.QuadPart - m_start.QuadPart - m_overhead) * 1000.0 / m_freq.QuadPart;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
// Returns the overhead of the timer in ticks
|
||||
static LONGLONG GetOverhead()
|
||||
{
|
||||
CpuTimer t;
|
||||
t.Start();
|
||||
t.Stop();
|
||||
return t.m_stop.QuadPart - t.m_start.QuadPart;
|
||||
}
|
||||
|
||||
LARGE_INTEGER m_start;
|
||||
LARGE_INTEGER m_stop;
|
||||
static LARGE_INTEGER m_freq;
|
||||
static LONGLONG m_overhead;
|
||||
};
|
39
A4/TP_OMP_GPU/Seuillage/Seuillage/GpuTimer.h
Normal file
39
A4/TP_OMP_GPU/Seuillage/Seuillage/GpuTimer.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
class GpuTimer
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
|
||||
public:
|
||||
GpuTimer()
|
||||
{
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
}
|
||||
|
||||
~GpuTimer()
|
||||
{
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventDestroy(start);
|
||||
}
|
||||
|
||||
void Start()
|
||||
{
|
||||
cudaEventRecord(start);
|
||||
}
|
||||
|
||||
void Stop()
|
||||
{
|
||||
cudaEventRecord(stop);
|
||||
cudaEventSynchronize(stop);
|
||||
}
|
||||
|
||||
float Elapsed()
|
||||
{
|
||||
float elapsed;
|
||||
cudaEventElapsedTime(&elapsed, start, stop);
|
||||
return elapsed;
|
||||
}
|
||||
};
|
||||
|
86
A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj
Normal file
86
A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj
Normal file
|
@ -0,0 +1,86 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{727252A0-B5D1-48AE-81A6-37E11733EBC2}</ProjectGuid>
|
||||
<RootNamespace>Seuillage</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v142</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>MultiByte</CharacterSet>
|
||||
<PlatformToolset>v142</PlatformToolset>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.8.props" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
<CudaCompile>
|
||||
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||
</CudaCompile>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;WIN64;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
</Link>
|
||||
<CudaCompile>
|
||||
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||
</CudaCompile>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<CudaCompile Include="kernel.cu" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.8.targets" />
|
||||
</ImportGroup>
|
||||
</Project>
|
4
A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj.user
Normal file
4
A4/TP_OMP_GPU/Seuillage/Seuillage/Seuillage.vcxproj.user
Normal file
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<PropertyGroup />
|
||||
</Project>
|
121
A4/TP_OMP_GPU/Seuillage/Seuillage/kernel.cu
Normal file
121
A4/TP_OMP_GPU/Seuillage/Seuillage/kernel.cu
Normal file
|
@ -0,0 +1,121 @@
|
|||
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
|
||||
|
||||
// Element-wise integer vector addition on the device: c[t] = a[t] + b[t].
//
// Launch contract:
//   - The element index is taken from threadIdx.x alone, so the kernel is
//     meant to be launched with a single block; with several blocks every
//     block would process the same leading elements.
//   - There is no bounds check: the launch must use exactly one thread per
//     element, and all three pointers must reference device-accessible
//     buffers holding at least blockDim.x ints.
//
// Parameters:
//   c - output buffer, receives the sums
//   a - first input buffer
//   b - second input buffer
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int tid = threadIdx.x;   // position of this thread inside its block

    const int lhs = a[tid];
    const int rhs = b[tid];
    c[tid] = lhs + rhs;
}
|
||||
|
||||
// Program entry point: adds two fixed 5-element integer vectors on the GPU
// through addWithCuda, prints the result, then resets the device.
//
// Returns 0 on success and 1 if either the addition or the device reset
// reports a CUDA error (a message is written to stderr in that case).
int main()
{
    const int kCount = 5;
    const int lhs[kCount] = { 1, 2, 3, 4, 5 };
    const int rhs[kCount] = { 10, 20, 30, 40, 50 };
    int sum[kCount] = { 0 };
    cudaError_t status;

    // Run the element-wise addition on the device.
    status = addWithCuda(sum, lhs, rhs, kCount);
    if (cudaSuccess != status) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           sum[0], sum[1], sum[2], sum[3], sum[4]);

    // Profiling and tracing tools (Nsight, Visual Profiler) only show
    // complete traces if the device is reset before the process exits.
    status = cudaDeviceReset();
    if (cudaSuccess != status) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}
|
||||
|
||||
// Helper function for using CUDA to add vectors in parallel.
//
// Copies the host arrays `a` and `b` (each `size` ints) to device 0, adds
// them element-wise with addKernel, and copies the sums back into the host
// array `c` (which must have room for `size` ints).
//
// Parameters:
//   c    - host output array, receives a[i] + b[i] for i in [0, size)
//   a, b - host input arrays
//   size - number of elements in each array; 0 is accepted and is a no-op
//
// Returns cudaSuccess on success, otherwise the first CUDA error that was
// encountered (a diagnostic is also written to stderr). The device buffers
// are released on every path.
//
// addKernel indexes by threadIdx.x only, so each launch uses a single block.
// To support vectors longer than the device's per-block thread limit, the
// work is split into consecutive single-block launches over offset pointers.
// This is correct for any size but is not tuned for large inputs.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Every local is declared before the first `goto Error`, so no jump ever
    // crosses an initialization.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    int maxThreadsPerBlock = 0;
    unsigned int blockLimit = 0;
    unsigned int offset = 0;
    unsigned int count = 0;
    const size_t bytes = (size_t)size * sizeof(int);
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Empty input: nothing to allocate, copy or launch. A launch with zero
    // threads would be rejected as an invalid configuration, so report
    // success directly (cudaStatus is cudaSuccess at this point).
    if (size == 0) {
        goto Error;
    }

    // A single block cannot hold more threads than this device limit.
    cudaStatus = cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceGetAttribute failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // Defensive clamp so the launch loop below always makes progress.
    blockLimit = (maxThreadsPerBlock > 0) ? (unsigned int)maxThreadsPerBlock : 1u;

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel with one thread per element, at most `blockLimit`
    // elements per launch. Each launch works on a disjoint slice, selected by
    // offsetting the three device pointers.
    for (offset = 0; offset < size; offset += count) {
        count = size - offset;
        if (count > blockLimit) {
            count = blockLimit;
        }

        addKernel<<<1, count>>>(dev_c + offset, dev_a + offset, dev_b + offset);

        // Check for any errors launching the kernel.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
    }

    // cudaDeviceSynchronize waits for the launched kernels to finish and
    // reports any error raised while they were executing.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared exit path for success and failure. Buffers that were never
    // allocated are still null, and cudaFree on a null pointer does nothing.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
|
|
@ -0,0 +1,4 @@
|
|||
c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\kernel.cu.obj
|
||||
c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\kernel.cu.cache
|
||||
c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\seuillage.tlog\cudacompile.read.1u.tlog
|
||||
c:\users\sasa\documents\m2r_seti\m2_seti\a4\tp_omp_gpu\seuillage\seuillage\x64\debug\seuillage.tlog\cudacompile.write.1u.tlog
|
|
@ -0,0 +1 @@
|
|||
C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 11.8.targets(606,9): error : The CUDA Toolkit v11.8 directory '' does not exist. Please verify the CUDA Toolkit is installed properly or define the CudaToolkitDir property to resolve this error.
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,2 @@
|
|||
PlatformToolSet=v142:VCToolArchitecture=Native32Bit:VCToolsVersion=14.29.30133:VCServicingVersionCrtHeaders=14.29.30136:TargetPlatformVersion=10.0.22000.0:
|
||||
Debug|x64|C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_OMP_GPU\Seuillage\|
|
55
A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu.cache
Normal file
55
A4/TP_OMP_GPU/Seuillage/Seuillage/x64/Debug/kernel.cu.cache
Normal file
|
@ -0,0 +1,55 @@
|
|||
Identity=kernel.cu
|
||||
AdditionalCompilerOptions=
|
||||
AdditionalCompilerOptions=
|
||||
AdditionalDependencies=
|
||||
AdditionalDeps=
|
||||
AdditionalLibraryDirectories=
|
||||
AdditionalOptions=
|
||||
AdditionalOptions=
|
||||
CodeGeneration=compute_52,sm_52
|
||||
CodeGeneration=compute_52,sm_52
|
||||
CompileOut=C:\Users\Sasa\Documents\M2R_SETI\M2_SETI\A4\TP_OMP_GPU\Seuillage\Seuillage\x64\Debug\kernel.cu.obj
|
||||
CudaRuntime=Static
|
||||
CudaToolkitCustomDir=
|
||||
DebugInformationFormat=ProgramDatabase
|
||||
DebugInformationFormat=ProgramDatabase
|
||||
Defines=;WIN32;WIN64;_DEBUG;_CONSOLE;_MBCS;
|
||||
Emulation=false
|
||||
EnableVirtualArchInFatbin=true
|
||||
ExtensibleWholeProgramCompilation=false
|
||||
FastMath=false
|
||||
GenerateLineInfo=false
|
||||
GenerateRelocatableDeviceCode=false
|
||||
GPUDebugInfo=true
|
||||
GPUDebugInfo=true
|
||||
HostDebugInfo=true
|
||||
Include=;;include
|
||||
Inputs=
|
||||
InterleaveSourceInPTX=false
|
||||
Keep=false
|
||||
KeepDir=x64\Debug
|
||||
LinkOut=
|
||||
MaxRegCount=0
|
||||
NvccCompilation=compile
|
||||
NvccPath=
|
||||
Optimization=Od
|
||||
Optimization=Od
|
||||
PerformDeviceLink=
|
||||
ProgramDataBaseFileName=x64\Debug\vc142.pdb
|
||||
ProgramDataBaseFileName=x64\Debug\vc142.pdb
|
||||
PtxAsOptionV=false
|
||||
RequiredIncludes=
|
||||
Runtime=MDd
|
||||
Runtime=MDd
|
||||
RuntimeChecks=RTC1
|
||||
RuntimeChecks=RTC1
|
||||
TargetMachinePlatform=64
|
||||
TargetMachinePlatform=64
|
||||
TypeInfo=
|
||||
TypeInfo=
|
||||
UseHostDefines=true
|
||||
UseHostInclude=true
|
||||
UseHostLibraryDependencies=
|
||||
UseHostLibraryDirectories=
|
||||
Warning=W3
|
||||
Warning=W3
|
BIN
A4/TP_OMP_GPU/deviceQuery.PNG
Normal file
BIN
A4/TP_OMP_GPU/deviceQuery.PNG
Normal file
Binary file not shown.
After Width: | Height: | Size: 50 KiB |
BIN
IA/1708.06733.pdf
Normal file
BIN
IA/1708.06733.pdf
Normal file
Binary file not shown.
BIN
IA/Analyse.docx
Normal file
BIN
IA/Analyse.docx
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
IA/cours_adversarial_20230203[9258].pdf
Normal file
BIN
IA/cours_adversarial_20230203[9258].pdf
Normal file
Binary file not shown.
Loading…
Reference in a new issue