BIN
A4/A4C4-C1.pdf
Normal file
BIN
A4/A4C4-TP1.pdf
Normal file
BIN
A4/TP_GPU-master.zip
Normal file
1
A4/TP_GPU-master/SAXPY/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
/Release/
34
A4/TP_GPU-master/SAXPY/CMakeLists.txt
Normal file
@@ -0,0 +1,34 @@
# CUDA as a first-class language (project(... LANGUAGES CXX CUDA)) requires CMake >= 3.8
cmake_minimum_required(VERSION 3.8)

project(saxpy LANGUAGES CXX CUDA)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CMAKE_C_COMPILER "gcc-8")
#set(CMAKE_CXX_COMPILER "g++-8")

#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")

list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")

file(GLOB CPP_FILES ${ROOT_DIR}/*.cpp)
file(GLOB CU_FILES ${ROOT_DIR}/*.cu)
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/*.cuh ${ROOT_DIR}/*.h)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")

set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")

add_executable(saxpy_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(saxpy_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(saxpy_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
34
A4/TP_GPU-master/SAXPY/CMakeLists_cmake28.txt
Normal file
@@ -0,0 +1,34 @@
cmake_minimum_required(VERSION 2.8)

project(matrice)

set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)

#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")

find_package(CUDA REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")

#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
set(CPP_FILES seuillage_C.cpp)
set(CU_FILES ${ROOT_DIR}/seuillage_main.cu)
#file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)

set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")

cuda_add_executable(seuillage_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(seuillage_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
17
A4/TP_GPU-master/SAXPY/CMakeLists_cmake38.txt
Normal file
@@ -0,0 +1,17 @@
#cmake -DCMAKE_CUDA_FLAGS="-arch=sm_30" ..
cmake_minimum_required(VERSION 3.8)
set(CUDACXX "/usr/local/cuda/bin/nvcc")
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
project(seuillage_CUDA LANGUAGES CXX CUDA)

set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
set(CMAKE_VERBOSE_MAKEFILE TRUE)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")

add_executable(seuillage_CUDA seuillage_C.cpp seuillage_main.cu)
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_features(seuillage_CUDA PUBLIC cxx_std_11)
target_include_directories(seuillage_CUDA PUBLIC ".")
24
A4/TP_GPU-master/SAXPY/saxpy.h
Normal file
@@ -0,0 +1,24 @@
//#define SEUILLAGE_H
//#ifndef SEUILLAGE_H

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#define BLOCK_SIZE 16

// Prototype
void runTest( int argc, char** argv);
extern "C" void saxpy_C(float *vector_SAXPY, float A, float *vector_X, float *vector_Y, int N);

//#endif
28
A4/TP_GPU-master/SAXPY/saxpy_C_student.cpp
Normal file
@@ -0,0 +1,28 @@
/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation and
 * any modifications thereto. Any use, reproduction, disclosure, or distribution
 * of this software and related documentation without an express license
 * agreement from NVIDIA Corporation is strictly prohibited.
 *
 */

/* Small matrix transpose with Cuda (Example for a 16x16 matrix)
 * Reference solution.
 */

#include "saxpy.h"

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
////////////////////////////////////////////////////////////////////////////////
void saxpy_C(float *vector_SAXPY, float A, float *vector_X, float *vector_Y, int N)
{
    // YOUR CODE HERE (A VOUS DE CODER)
}
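The body left blank above is the textbook SAXPY operation: vector_SAXPY[i] = A * vector_X[i] + vector_Y[i] for i in [0, N). A minimal CPU sketch of it (a hypothetical illustration, not the official course solution) could look like this:

    void saxpy_C(float *vector_SAXPY, float A, float *vector_X, float *vector_Y, int N)
    {
        // Plain sequential loop over the N elements.
        for (int i = 0; i < N; i++) {
            vector_SAXPY[i] = A * vector_X[i] + vector_Y[i];
        }
    }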
195
A4/TP_GPU-master/SAXPY/saxpy_main_student.cu
Normal file
@@ -0,0 +1,195 @@
/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/* Template project which demonstrates the basics on how to setup a project
 * example application.
 * Host code.
 */

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <iostream>
using namespace std;

// includes CUDA
#include <cuda_runtime.h>

#include "saxpy.h"

__global__ void saxpy_kernel(float *vector_SAXPY, float A, float *vector_X, float *vector_Y, int N)
{
    // YOUR CODE HERE (A VOUS DE CODER)
}

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
    runTest( argc, argv);
}

__host__ static int iDivUp(int a, int b) {
    return ((a % b != 0) ? (a / b + 1) : (a / b));
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
    cudaError_t error;

    unsigned long int N = 256*1024;

    const unsigned int mem_size = N*sizeof(float);
    // allocate host memory
    float* h_vector_X = (float*) malloc(mem_size);
    float* h_vector_Y = (float*) malloc(mem_size);

    // Initialization of the input data
    float A = 1.0;

    for (int i = 0; i < N; i++){
        h_vector_X[i] = (float)rand();
        h_vector_Y[i] = (float)rand();
    }

    ////////////////////////////////////////////////////////////////////////////////
    // EXECUTION ON THE CPU
    ///////////////////////////////////////////////////////////////////////

    // Result computed on the CPU
    float* h_vector_SAXPY_CPU = (float*) malloc( mem_size);

    printf("SAXPY CPU\n");

    cudaEvent_t start, stop;
    error = cudaEventCreate(&start);
    error = cudaEventCreate(&stop);

    // Record the start event
    error = cudaEventRecord(start, NULL);
    error = cudaEventSynchronize(start);
    // SAXPY on the CPU
    // YOUR CODE HERE (A VOUS DE CODER)

    // Record the stop event
    error = cudaEventRecord(stop, NULL);
    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);
    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    printf("CPU execution time %f\n", msecTotal);

    ////////////////////////////////////////////////////////////////////////////////
    // EXECUTION ON THE GPU
    ///////////////////////////////////////////////////////////////////////

    printf("SAXPY GPU\n");

    float* h_vector_SAXPY_GPU = (float*) malloc(mem_size);

    // vectors in device memory
    float* d_vector_X;
    float* d_vector_Y;
    float* d_vector_SAXPY;

    cudaEvent_t start_mem, stop_mem;
    error = cudaEventCreate(&start_mem);
    error = cudaEventCreate(&stop_mem);

    error = cudaEventRecord(start, NULL);
    error = cudaEventSynchronize(start);

    // Memory allocation of d_vector_X, d_vector_Y and d_vector_SAXPY on the GPU card
    // YOUR CODE HERE (A VOUS DE CODER)

    // copy host memory to device
    // YOUR CODE HERE (A VOUS DE CODER)

    error = cudaEventRecord(stop_mem, NULL);
    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop_mem);
    float msecMem = 0.0f;
    error = cudaEventElapsedTime(&msecMem, start, stop_mem);

    // setup execution parameters -> split the work into threads
    // YOUR CODE HERE (A VOUS DE CODER)

    // launch the threads executed on the GPU card
    // YOUR CODE HERE (A VOUS DE CODER)

    error = cudaEventRecord(start_mem, NULL);
    error = cudaEventSynchronize(start_mem);

    // copy result from device to host
    // YOUR CODE HERE (A VOUS DE CODER)

    // cleanup device memory
    // UNCOMMENT THESE LINES (COMMENTAIRES A ENLEVER)
    //cudaFree(d_vector_X);
    //cudaFree(d_vector_Y);
    //cudaFree(d_vector_SAXPY);

    error = cudaEventRecord(stop, NULL);
    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);
    msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);
    float msecMem2 = 0.0f;
    error = cudaEventElapsedTime(&msecMem2, start_mem, stop);
    msecMem += msecMem2;

    printf("GPU execution time %f ms (memory management %2.2f %%)\n", msecTotal, (msecMem)/(msecTotal)*100);

    float sum_diff = 0;
    for(int i = 0; i < N; i++)
        sum_diff += h_vector_SAXPY_GPU[i] - h_vector_SAXPY_CPU[i];

    printf("sum_diff = %f\n", sum_diff);

    // cleanup memory
    free(h_vector_X);
    free(h_vector_Y);
    free(h_vector_SAXPY_GPU);
    free(h_vector_SAXPY_CPU);
}
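The template above leaves the kernel body, the device allocations, the host-to-device transfers, the launch configuration and the device-to-host copy blank. A minimal sketch of those pieces (hypothetical, assuming a 1-D launch with an arbitrary 256 threads per block; not the official solution) could be:

    // Kernel: one thread per element, guarded against the last partial block.
    __global__ void saxpy_kernel(float *vector_SAXPY, float A, float *vector_X, float *vector_Y, int N)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < N)
            vector_SAXPY[i] = A * vector_X[i] + vector_Y[i];
    }

    // Host side, inside runTest():
    // allocate device memory
    cudaMalloc((void**)&d_vector_X, mem_size);
    cudaMalloc((void**)&d_vector_Y, mem_size);
    cudaMalloc((void**)&d_vector_SAXPY, mem_size);
    // copy host memory to device
    cudaMemcpy(d_vector_X, h_vector_X, mem_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector_Y, h_vector_Y, mem_size, cudaMemcpyHostToDevice);
    // setup execution parameters: 1-D grid of 1-D blocks
    dim3 threads(256);
    dim3 grid(iDivUp(N, threads.x));
    // launch the kernel on the GPU
    saxpy_kernel<<<grid, threads>>>(d_vector_SAXPY, A, d_vector_X, d_vector_Y, N);
    // copy result from device to host
    cudaMemcpy(h_vector_SAXPY_GPU, d_vector_SAXPY, mem_size, cudaMemcpyDeviceToHost);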
BIN
A4/TP_GPU-master/TP0.png
Normal file
(PNG image, 166 KiB)
94
A4/TP_GPU-master/TP0_matrice/.cproject
Normal file
@@ -0,0 +1,94 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="com.nvidia.cuda.ide.nine_zero.configuration.release.1804067204">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.nine_zero.configuration.release.1804067204" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release" cleanCommand="rm -rf" description="" id="com.nvidia.cuda.ide.nine_zero.configuration.release.1804067204" name="Release" parent="com.nvidia.cuda.ide.nine_zero.configuration.release">
<folderInfo id="com.nvidia.cuda.ide.nine_zero.configuration.release.1804067204." name="/" resourcePath="">
<toolChain id="com.nvidia.cuda.ide.toolchain.nine_zero.69243472" name="CUDA Toolkit 10.0" superClass="com.nvidia.cuda.ide.toolchain.nine_zero">
<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.718907530" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
<builder buildPath="${workspace_loc:/matrice}/Release" id="com.nvidia.cuda.ide.builder.960504214" keepEnvironmentInBuildfile="false" name="CUDA Toolkit 9.2 Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="com.nvidia.cuda.ide.builder"/>
<tool id="nvcc.compiler.base.376896027" name="NVCC Compiler" superClass="nvcc.compiler.base">
<option id="nvcc.compiler.include.paths.602427994" name="Include paths (-I)" superClass="nvcc.compiler.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/usr/local/cuda/samples/common/inc"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/${ProjName}}/inc&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/${ProjName}}/src/CUDA&quot;"/>
</option>
<option id="nvcc.compiler.deviceDebug.1868299221" name="Generate device debug information (-G)" superClass="nvcc.compiler.deviceDebug"/>
<option id="nvcc.compiler.option.level.1923776877" name="Generate host debug information (-g)" superClass="nvcc.compiler.option.level"/>
<option defaultValue="nvcc.compiler.optimization.level.most" id="nvcc.compiler.optimization.level.461195553" name="Optimization Level" superClass="nvcc.compiler.optimization.level" value="nvcc.compiler.optimization.level.most" valueType="enumerated"/>
<option id="nvcc.compiler.pic.2021621233" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic" value="true" valueType="boolean"/>
<inputType id="nvcc.compiler.input.cu.212130830" superClass="nvcc.compiler.input.cu"/>
<inputType id="nvcc.compiler.input.cpp.1544991377" superClass="nvcc.compiler.input.cpp"/>
<inputType id="nvcc.compiler.input.c.298636231" superClass="nvcc.compiler.input.c"/>
</tool>
<tool id="nvcc.linker.base.815597997" name="NVCC Linker" superClass="nvcc.linker.base">
<option id="nvcc.linker.option.paths.539821132" name="Library search path (-L)" superClass="nvcc.linker.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="/usr/local/cuda/lib"/>
</option>
<option id="nvcc.linker.option.libs.680956661" name="Libraries (-l)" superClass="nvcc.linker.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="cublas"/>
</option>
<inputType id="nvcc.linker.input.1893304424" superClass="nvcc.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="nvcc.archiver.base.24620523" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
<tool id="com.nvidia.host.assembler.1797002694" name="Host Assembler" superClass="com.nvidia.host.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.195427270" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
<sourceEntries>
<entry excluding="matrixMul_kernel.cuh" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name="src"/>
</sourceEntries>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration">
<executable devicelink="false">
<ptx major="5" minor="2"/>
<sass major="5" minor="2"/>
</executable>
<editor-arch major="5" minor="2"/>
</storageModule>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="matrice.com.nvidia.cuda.ide.nine_zero.exe.915560355" name="Executable" projectType="com.nvidia.cuda.ide.nine_zero.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243;com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243.;nvcc.compiler.base.1292094216;nvcc.compiler.input.cu.2074083432">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243;com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243.;nvcc.compiler.base.1292094216;nvcc.compiler.input.c.2064063779">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243;com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243.;nvcc.compiler.base.1292094216;nvcc.compiler.input.cpp.679408217">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/matrice"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>
27
A4/TP_GPU-master/TP0_matrice/.project
Normal file
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>matrice</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>
@@ -0,0 +1,3 @@
eclipse.preferences.version=1
project-sync-config=<?xml version\="1.0" encoding\="UTF-8"?>\n<sync-configs>\n<sync-config config-name\="Local" connection-name\="Local" location\="/home/gac/L2S_Programmation/TP_GPU/TP_matrice_CUDA_10" remote-services-id\="org.eclipse.ptp.remote.LocalServices" sync-on-postbuild\="true" sync-on-prebuild\="true" sync-on-save\="true" sync-provider-id\="org.eclipse.ptp.rdt.sync.git.core.synchronizeService">\n<config-properties com.nvidia.cuda.ide.build.core.Cpu\="Native"/>\n</sync-config>\n</sync-configs>
sync-mode=NONE
33
A4/TP_GPU-master/TP0_matrice/CMakeLists-cuda11.2.txt
Normal file
@@ -0,0 +1,33 @@
#cmake -DCMAKE_CUDA_FLAGS="-arch=sm_30" ..
cmake_minimum_required(VERSION 2.8)

project(matrice)

set(CUDA_TOOLKIT_ROOT_DIR "/partage/public/ngac/cuda-11.2")

set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)

#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")

find_package(CUDA REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/inc/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")

#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
set(CPP_FILES ${ROOT_DIR}/src/CPP/matrixMul_C.cpp)
set(CU_FILES ${ROOT_DIR}/src/CPP/matrixMul_cuda.cu ${ROOT_DIR}/src/CPP/matrixMul_CUBLAS.cu ${ROOT_DIR}/src/CPP/matrixMul_main.cu)
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)

set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/src/CUDA\" -I\"/usr/local/cuda/samples/common/inc/\"")

cuda_add_executable(matrice_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(matrice_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(matrice_CUDA PUBLIC ${ROOT_DIR}/inc/ ${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc)
target_link_libraries(matrice_CUDA ${CUDA_CUBLAS_LIBRARIES})
37
A4/TP_GPU-master/TP0_matrice/CMakeLists.txt
Normal file
@@ -0,0 +1,37 @@
#cmake -DCMAKE_CUDA_FLAGS="-arch=sm_30" ..
cmake_minimum_required(VERSION 3.8)
#set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
project(Matrice_project LANGUAGES CXX CUDA)

set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
set(CMAKE_VERBOSE_MAKEFILE TRUE)

set(ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(EXE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")

list(APPEND HEADER_REP ${ROOT_DIR}/inc ${ROOT_DIR}/src/CUDA ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")

file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")

# Build a library first, then the executable
#add_library(matrice ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})

#set_target_properties(matrice PROPERTIES POSITION_INDEPENDENT_CODE ON)
#target_compile_features(matrice PUBLIC cxx_std_11)
#target_include_directories(matrice PUBLIC "${HEADER_REP}")

#add_executable(matrice_exe "${EXE_DIR}/src/CPP/matrixMul_main.cu")
#target_link_libraries(matrice_exe PUBLIC matrice)

# Build the executable directly (without generating a library first => all source files are taken into account)
add_executable(matrice_exe ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})

set_target_properties(matrice_exe PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_features(matrice_exe PUBLIC cxx_std_11)
target_include_directories(matrice_exe PUBLIC "${HEADER_REP}")
target_link_libraries(matrice_exe PUBLIC cublas)
A4/TP_GPU-master/TP0_matrice/CMakeLists_cmake28.txt
Normal file
|
@ -0,0 +1,31 @@
|
|||
#cmake -DCMAKE_CUDA_FLAGS=”-arch=sm_30” ..
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(matrice)
|
||||
|
||||
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
|
||||
|
||||
find_package(CUDA REQUIRED)
|
||||
list(APPEND HEADER_REP ${ROOT_DIR}/inc/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
|
||||
|
||||
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
|
||||
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
|
||||
set(CPP_FILES ${ROOT_DIR}/src/CPP/matrixMul_C.cpp)
|
||||
set(CU_FILES ${ROOT_DIR}/src/CPP/matrixMul_cuda.cu ${ROOT_DIR}/src/CPP/matrixMul_CUBLAS.cu ${ROOT_DIR}/src/CPP/matrixMul_main.cu)
|
||||
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
|
||||
|
||||
|
||||
set(CUDA_SEPARABLE_COMPILATION ON)
|
||||
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/src/CUDA\" -I\"/usr/local/cuda/samples/common/inc/\"")
|
||||
|
||||
|
||||
cuda_add_executable(matrice_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
|
||||
set_target_properties(matrice_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(matrice_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
|
||||
target_link_libraries(matrice_CUDA ${CUDA_CUBLAS_LIBRARIES})
|
||||
|
||||
|
||||
|
||||
|
38
A4/TP_GPU-master/TP0_matrice/inc/matrixMul.h
Normal file
@@ -0,0 +1,38 @@
#ifndef _MATRIXMUL_H_
#define _MATRIXMUL_H_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cuda_runtime.h>
//#include <helper_functions.h>
//#include <helper_cuda.h>
#include "cublas.h"
// Thread block size
#define BLOCK_SIZE 16

typedef enum type_version_kernel {v0,v0_bis,v1} Type_version_kernel;

void computeGold( float*, const float*, const float*, unsigned int, unsigned int, unsigned int);
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
/*
#define WA (64 * BLOCK_SIZE) // Matrix A width
#define HA (64 * BLOCK_SIZE) // Matrix A height
#define WB (64 * BLOCK_SIZE) // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height
*/

#
void compute_matrixMul_C(int W);
void compute_matrixMul_cublas(int N);
void compute_matrixMul_cuda(int N,Type_version_kernel v);
void randomInit(float*, int);
void printDiff(float*, float*, int, int);

#endif // _MATRIXMUL_H_
0
A4/TP_GPU-master/TP0_matrice/src/.ptp-sync-folder
Normal file
137
A4/TP_GPU-master/TP0_matrice/src/CPP/matrixMul_C.cpp
Normal file
@@ -0,0 +1,137 @@
/*
 * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

#include "matrixMul.h"

////////////////////////////////////////////////////////////////////////////////
// export C interface

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
//! C = A * B
//! @param C          reference data, computed but preallocated
//! @param A          matrix A as provided to device
//! @param B          matrix B as provided to device
//! @param hA         height of matrix A
//! @param wB         width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
computeGold(float* C, const float* A, const float* B, unsigned int hA, unsigned int wA, unsigned int wB)
{
    for (unsigned int i = 0; i < hA; ++i)
        for (unsigned int j = 0; j < wB; ++j) {
            double sum = 0;
            for (unsigned int k = 0; k < wA; ++k) {
                double a = A[i * wA + k];
                double b = B[k * wB + j];
                sum += a * b;
            }
            C[i * wB + j] = (float)sum;
        }
}

void compute_matrixMul_C(int W){

    // allocate host memory for matrices A and B
    unsigned int size_A = W * W;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    unsigned int size_B = W * W;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);

    // initialize host memory
    //randomInit(h_A, size_A);
    //randomInit(h_B, size_B);

    // allocate host memory for the result
    unsigned int size_C = W * W;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // create and start timer
    cudaDeviceSynchronize();

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t event[2];

    cudaEventCreate(event+0);
    cudaEventCreate(event+1);

    cudaEventRecord(event[0], NULL);
    computeGold(h_C, h_A, h_B, W, W, W);
    cudaEventRecord(event[1], NULL);
    cudaEventSynchronize(event[1]);

    float msecTotal = 0.0f;
    cudaEventElapsedTime(&msecTotal, event[0], event[1]);

    printf("Time= \t\t\t\t %.3f msec\n",msecTotal);

    free(h_A);
    free(h_B);
    free(h_C);
}

// Allocates a matrix with random float entries.
void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

void printDiff(float *data1, float *data2, int width, int height)
{
    int i,j,k;
    int error_count=0;
    for (j=0; j<height; j++) {
        for (i=0; i<width; i++) {
            k = j*width+i;
            if (data1[k] != data2[k]) {
                printf("diff(%d,%d) CPU=%4.4f, GPU=%4.4f\n", i,j, data1[k], data2[k]);
                error_count++;
            }
        }
    }
    printf("\nTotal Errors = %d\n", error_count);
}
111
A4/TP_GPU-master/TP0_matrice/src/CPP/matrixMul_CUBLAS.cu
Normal file
@@ -0,0 +1,111 @@
#include "matrixMul.h"

void compute_matrixMul_cublas(int N)
{
    float alpha = 1.0f, beta = 0.0f;

    // allocate host memory for matrices A and B
    unsigned int size_A = N * N;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    //float* h_Abis = (float*) malloc(mem_size_A);
    unsigned int size_B = N * N;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);
    //float* h_Bbis = (float*) malloc(mem_size_B);

    // allocate host memory for the result
    unsigned int size_C = N * N;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    cublasInit();

    // set seed for rand()
    srand(2006);
    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t record_event[5];
    float time_msec[5];   // 4 measured intervals + total (was declared [4]: writing time_msec[4] overflowed the array)
    for (int i=0;i<5;i++){
        cudaEventCreate(record_event+i);
    }

    // Record the start event
    cudaDeviceSynchronize();
    cudaEventRecord(record_event[0], NULL);

    float* d_A;
    cublasAlloc(N*N, sizeof(float), (void **)&d_A);
    float* d_B;
    cublasAlloc(N*N, sizeof(float), (void **)&d_B);
    float* d_C;
    cublasAlloc(N*N, sizeof(float), (void **)&d_C);

    cudaEventRecord(record_event[1], NULL);
    cudaEventSynchronize(record_event[1]);

    // copy host memory to device
    cublasSetMatrix(N,N, sizeof(float), h_A, N, d_A, N);
    cublasSetMatrix(N,N, sizeof(float), h_B, N, d_B, N);

    cudaEventRecord(record_event[2], NULL);
    cudaEventSynchronize(record_event[2]);

    cublasSgemm('n', 'n', N, N, N, alpha, d_A, N,d_B, N, beta, d_C, N);

    cudaEventRecord(record_event[3], NULL);
    cudaEventSynchronize(record_event[3]);

    cublasGetMatrix(N,N, sizeof(float), d_C,N, h_C, N);

    cudaEventRecord(record_event[4], NULL);
    cudaEventSynchronize(record_event[4]);

    cudaEventElapsedTime(time_msec+0, record_event[0], record_event[1]);
    cudaEventElapsedTime(time_msec+1, record_event[1], record_event[2]);
    cudaEventElapsedTime(time_msec+2, record_event[2], record_event[3]);
    cudaEventElapsedTime(time_msec+3, record_event[3], record_event[4]);
    time_msec[4]=time_msec[0]+time_msec[1]+time_msec[2]+time_msec[3];

    printf("TOTAL : \t\t\t %f (ms) dont %.2f%% de gestion mémoire \n",time_msec[4],100*(time_msec[0]+time_msec[1]+time_msec[3])/time_msec[4]);

    cublasShutdown();

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    //free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
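compute_matrixMul_cublas() uses the legacy cuBLAS interface (cublasInit, cublasAlloc, the char-based cublasSgemm), which NVIDIA has deprecated in favour of the handle-based cublas_v2 API. A rough equivalent under that assumption (a sketch, not part of the TP sources) would be:

    #include <cublas_v2.h>
    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.0f, beta = 0.0f;
    // Column-major N x N product: d_C = alpha * d_A * d_B + beta * d_C
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
    cublasDestroy(handle);

with d_A, d_B and d_C allocated through cudaMalloc and filled with cudaMemcpy (or cublasSetMatrix, which the v2 API still provides).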
161
A4/TP_GPU-master/TP0_matrice/src/CPP/matrixMul_cuda.cu
Normal file
@@ -0,0 +1,161 @@
/*
 * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
 * (Same NVIDIA NOTICE TO USER as in matrixMul_C.cpp above.)
 */

/* Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication and is exactly the same as
 * Chapter 7 of the programming guide.
 * It has been written for clarity of exposition to illustrate various CUDA
 * programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication.
 *
 * CUBLAS provides high-performance matrix multiplication.
 */

// includes, system
#include "matrixMul.h"
// includes, kernels
#include "matrixMul_kernel.cuh"

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void compute_matrixMul_cuda(int N,Type_version_kernel version_kernel)
{
    // set seed for rand()
    //srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = N * N;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    unsigned int size_B = N * N;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);

    // allocate host memory for the result
    unsigned int size_C = N * N;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t record_event[5];
    float time_msec[5];   // 4 measured intervals + total (was declared [4], which overflowed on time_msec[4])
    for (int i=0;i<5;i++){
        cudaEventCreate(record_event+i);
    }

    // Record the start event
    cudaDeviceSynchronize();
    cudaEventRecord(record_event[0], NULL);

    // allocate device memory
    float* d_A;
    cudaMalloc((void**) &d_A, mem_size_A);
    float* d_B;
    cudaMalloc((void**) &d_B, mem_size_B);

    // allocate device memory for result
    float* d_C;
    cudaMalloc((void**) &d_C, mem_size_C);

    cudaEventRecord(record_event[1], NULL);
    cudaEventSynchronize(record_event[1]);

    // copy host memory to device
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice) ;
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice) ;

    cudaEventRecord(record_event[2], NULL);
    cudaEventSynchronize(record_event[2]);

    // setup execution parameters
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(N/threads.x, N/threads.y);

    // execute the kernel
    switch(version_kernel){
    case v0 :     matrixMul_v0<<< grid, threads >>>(d_C, d_A, d_B, N); break;
    case v0_bis : matrixMul_v0_bis<<< grid, threads >>>(d_C, d_A, d_B, N); break;
    case v1 :     matrixMul_v1<<< grid, threads >>>(d_C, d_A, d_B, N,N); break;
    }

    cudaEventRecord(record_event[3], NULL);
    cudaEventSynchronize(record_event[3]);

    // copy result from device to host
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost) ;

    cudaEventRecord(record_event[4], NULL);
    cudaEventSynchronize(record_event[4]);

    cudaEventElapsedTime(time_msec+0, record_event[0], record_event[1]);
    cudaEventElapsedTime(time_msec+1, record_event[1], record_event[2]);
    cudaEventElapsedTime(time_msec+2, record_event[2], record_event[3]);
    cudaEventElapsedTime(time_msec+3, record_event[3], record_event[4]);
    time_msec[4]=time_msec[0]+time_msec[1]+time_msec[2]+time_msec[3];
    printf("TOTAL : \t\t\t %f (ms) dont %.2f%% de gestion mémoire \n",time_msec[4],100*(time_msec[0]+time_msec[1]+time_msec[3])/time_msec[4]);

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
108
A4/TP_GPU-master/TP0_matrice/src/CPP/matrixMul_main.cu
Normal file
@@ -0,0 +1,108 @@
/*
 * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
 * (Same NVIDIA NOTICE TO USER as in matrixMul_C.cpp above.)
 */

/* Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication and is exactly the same as
 * Chapter 7 of the programming guide.
 * It has been written for clarity of exposition to illustrate various CUDA
 * programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication.
 *
 * CUBLAS provides high-performance matrix multiplication.
 */

// includes, system
#include "matrixMul.h"

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, const char** argv)
{
    //findCudaDevice(argc,argv);

    int n;

    printf("\nmatrix_Mul_CUDA_C\n\n");
    for (n=7;n<11;n++){
        unsigned int taille_matrice;
        taille_matrice=(unsigned int)pow((float)2.0,n);
        printf("MATRICE DE TAILLE %d\n",taille_matrice);
        compute_matrixMul_C(taille_matrice);
    }

    printf("\nmatrix_Mul_CUDA_v0\n\n");
    for (n=7;n<11;n++){
        unsigned int taille_matrice;
        taille_matrice=(unsigned int)pow((float)2.0,n);
        printf("MATRICE DE TAILLE %d\n",taille_matrice);
        compute_matrixMul_cuda(taille_matrice,v0);
    }

    printf("\nmatrix_Mul_CUDA_v0_bis\n\n");
    for (n=7;n<11;n++){
        unsigned int taille_matrice;
        taille_matrice=(unsigned int)pow((float)2.0,n);
        printf("MATRICE DE TAILLE %d\n",taille_matrice);
        compute_matrixMul_cuda(taille_matrice,v0_bis);
    }

    printf("\nmatrix_Mul_CUDA_v1\n\n");
    for (n=7;n<11;n++){
        unsigned int taille_matrice;
        taille_matrice=(unsigned int)pow((float)2.0,n);
        printf("MATRICE DE TAILLE %d\n",taille_matrice);
        compute_matrixMul_cuda(taille_matrice,v1);
    }

    printf("\nmatrix_Mul_CUBLAS\n\n");
    for (n=7;n<11;n++){
        unsigned int taille_matrice;
        taille_matrice=(unsigned int)pow((float)2.0,n);
        printf("MATRICE DE TAILLE %d\n",taille_matrice);
        compute_matrixMul_cublas(taille_matrice);
    }

    //CUT_EXIT(argc, argv);
}
198
A4/TP_GPU-master/TP0_matrice/src/CUDA/matrixMul_kernel.cuh
Normal file
@@ -0,0 +1,198 @@
/*
 * Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
 * (Same NVIDIA NOTICE TO USER as in matrixMul_C.cpp above.)
 */

/* Matrix multiplication: C = A * B.
 * Device code.
 */

#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_

#include <stdio.h>
#include "matrixMul.h"

#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif

////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! N is matrix dimension
////////////////////////////////////////////////////////////////////////////////
__global__ void
matrixMul_v0( float* C, float* A, float* B,int matrix_size)
{
    int k;
    float C_sum = 0;   // accumulator must start at zero (was left uninitialized)

    int i_0,j_0;
    int i,j;

    int a,b,c;

    i_0=blockIdx.x*BLOCK_SIZE;
    j_0=blockIdx.y*BLOCK_SIZE;

    i=i_0+threadIdx.x;
    j=j_0+threadIdx.y;

    a=i*matrix_size;
    b=j;
    c=j+i*matrix_size;

    for (k = 0; k < matrix_size; ++k){
        C_sum += A[a] * B[b];
        a++;
        b+=matrix_size;
    }

    C[c] = C_sum;
}

__global__ void
matrixMul_v0_bis( float* C, float* A, float* B,int matrix_size)
{
    int k;
    float C_sum = 0;   // accumulator must start at zero (was left uninitialized)

    int i_0,j_0;
    int i,j;

    int a,b,c;

    j_0=blockIdx.x*BLOCK_SIZE;
    i_0=blockIdx.y*BLOCK_SIZE;

    j=j_0+threadIdx.x;
    i=i_0+threadIdx.y;

    a=i*matrix_size;
    b=j;
    c=j+i*matrix_size;

    for (k = 0; k < matrix_size; ++k){
        C_sum += A[a] * B[b];
        a++;
        b+=matrix_size;
    }

    C[c] = C_sum;
}

__global__ void
matrixMul_v1( float* C, float* A, float* B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) {

        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

#endif // #ifndef _MATRIXMUL_KERNEL_H_
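matrixMul_v1 assigns one BLOCK_SIZE x BLOCK_SIZE thread block per tile of C and assumes every dimension is a multiple of BLOCK_SIZE. A typical launch (a usage sketch mirroring the call in matrixMul_cuda.cu, with hC standing for the height of C) looks like this:

    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);      // one thread per element of a 16x16 tile
    dim3 grid(wB / threads.x, hC / threads.y); // one block per output tile
    matrixMul_v1<<<grid, threads>>>(d_C, d_A, d_B, wA, wB);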
77
A4/TP_GPU-master/TP1_image/CUDA/.cproject
Normal file
@@ -0,0 +1,77 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367" name="Release" parent="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367">
<folderInfo id="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367." name="/" resourcePath="">
<toolChain id="com.nvidia.cuda.ide.toolchain.eight_zero.exe.release.1272863516" name="CUDA Toolkit 8.0">
<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.677042486" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
<builder buildPath="${workspace_loc:/seuillage_CUDA_8}/Release" id="com.nvidia.cuda.ide.builder.1452012632" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="CUDA Toolkit 8.0 Builder" superClass="com.nvidia.cuda.ide.builder"/>
<tool id="nvcc.compiler.base.1547368432" name="NVCC Compiler" superClass="nvcc.compiler.base">
<option id="nvcc.compiler.deviceDebug.1647831981" name="Generate device debug information (-G)" superClass="nvcc.compiler.deviceDebug"/>
<option id="nvcc.compiler.option.level.16726990" name="Generate host debug information (-g)" superClass="nvcc.compiler.option.level"/>
<option defaultValue="nvcc.compiler.optimization.level.most" id="nvcc.compiler.optimization.level.1222624320" name="Optimization Level" superClass="nvcc.compiler.optimization.level" valueType="enumerated"/>
<option id="nvcc.compiler.pic.1176654696" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic"/>
<option id="nvcc.compiler.include.paths.393958652" name="Include paths (-I)" superClass="nvcc.compiler.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/cuda/samples/common/inc"/>
</option>
<inputType id="nvcc.compiler.input.cu.1459843904" superClass="nvcc.compiler.input.cu"/>
<inputType id="nvcc.compiler.input.cpp.360700336" superClass="nvcc.compiler.input.cpp"/>
<inputType id="nvcc.compiler.input.c.1943821627" superClass="nvcc.compiler.input.c"/>
</tool>
<tool id="nvcc.linker.base.602397088" name="NVCC Linker" superClass="nvcc.linker.base">
<inputType id="nvcc.linker.input.862625666" superClass="nvcc.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="nvcc.archiver.base.342423531" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
<tool id="com.nvidia.host.assembler.1673437193" name="Host Assembler" superClass="com.nvidia.host.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1548523292" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration">
<executable devicelink="false">
<sass major="3" minor="0"/>
<ptx major="3" minor="0"/>
</executable>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="seuillage_CUDA_8.com.nvidia.cuda.ide.eight_zero.exe.270714933" name="Executable"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436;com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436.;nvcc.compiler.base.1174254289;nvcc.compiler.input.cpp.2134833342">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436;com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436.;nvcc.compiler.base.1174254289;nvcc.compiler.input.c.691715125">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436;com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436.;nvcc.compiler.base.1174254289;nvcc.compiler.input.cu.1571068709">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
|
||||
</scannerConfigBuildInfo>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
|
||||
<storageModule moduleId="refreshScope"/>
|
||||
</cproject>
|
1
A4/TP_GPU-master/TP1_image/CUDA/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
/Release/
|
27
A4/TP_GPU-master/TP1_image/CUDA/.project
Normal file
|
@ -0,0 +1,27 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<projectDescription>
|
||||
<name>seuillage_CUDA_8</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
|
||||
<triggers>clean,full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
|
||||
<triggers>full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.eclipse.cdt.core.cnature</nature>
|
||||
<nature>org.eclipse.cdt.core.ccnature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
|
||||
</natures>
|
||||
</projectDescription>
|
34
A4/TP_GPU-master/TP1_image/CUDA/CMakeLists.txt
Normal file
|
@ -0,0 +1,34 @@
|
|||
cmake_minimum_required(VERSION 3.8)
|
||||
|
||||
project(seuillage LANGUAGES CXX CUDA)
|
||||
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
#set(CMAKE_C_COMPILER "gcc-8")
|
||||
#set(CMAKE_CXX_COMPILER "g++-8")
|
||||
|
||||
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
|
||||
|
||||
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
|
||||
|
||||
file(GLOB CPP_FILES ${ROOT_DIR}/*.cpp)
|
||||
file(GLOB CU_FILES ${ROOT_DIR}/*.cu)
|
||||
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/*.cuh ${ROOT_DIR}/*.h)
|
||||
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
|
||||
|
||||
|
||||
set(CUDA_SEPARABLE_COMPILATION ON)
|
||||
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
|
||||
|
||||
|
||||
add_executable(seuillage_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
|
||||
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(seuillage_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
34
A4/TP_GPU-master/TP1_image/CUDA/CMakeLists_cmake28.txt
Normal file
|
@ -0,0 +1,34 @@
|
|||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(matrice)
|
||||
|
||||
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
|
||||
|
||||
find_package(CUDA REQUIRED)
|
||||
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
|
||||
|
||||
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
|
||||
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
|
||||
set(CPP_FILES seuillage_C.cpp)
|
||||
set(CU_FILES ${ROOT_DIR}/seuillage_main.cu)
|
||||
#file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
|
||||
|
||||
|
||||
set(CUDA_SEPARABLE_COMPILATION ON)
|
||||
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
|
||||
|
||||
|
||||
cuda_add_executable(seuillage_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
|
||||
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(seuillage_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
17
A4/TP_GPU-master/TP1_image/CUDA/CMakeLists_cmake38.txt
Normal file
|
@ -0,0 +1,17 @@
|
|||
#cmake -DCMAKE_CUDA_FLAGS="-arch=sm_30" ..
|
||||
cmake_minimum_required(VERSION 3.8)
|
||||
set(CUDACXX "/usr/local/cuda/bin/nvcc")
|
||||
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
|
||||
project(seuillage_CUDA LANGUAGES CXX CUDA)
|
||||
|
||||
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
|
||||
set(CMAKE_VERBOSE_MAKEFILE TRUE)
|
||||
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
|
||||
|
||||
|
||||
add_executable(seuillage_CUDA seuillage_C.cpp seuillage_main.cu)
|
||||
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_features(seuillage_CUDA PUBLIC cxx_std_11)
|
||||
target_include_directories(seuillage_CUDA PUBLIC ".")
|
||||
|
BIN
A4/TP_GPU-master/TP1_image/CUDA/build/.vs/cmake.db
Normal file
BIN
A4/TP_GPU-master/TP1_image/CUDA/build/.vs/seuillage/v16/.suo
Normal file
27
A4/TP_GPU-master/TP1_image/CUDA/seuillage.h
Normal file
|
@ -0,0 +1,27 @@
|
|||
|
||||
#ifndef SEUILLAGE_H
|
||||
#define SEUILLAGE_H
|
||||
|
||||
// includes, system
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
|
||||
|
||||
#define SIZE_I 960
|
||||
#define SIZE_J 1280
|
||||
|
||||
|
||||
#define BLOCK_SIZE 16
|
||||
|
||||
// Prototype
|
||||
void runTest( int argc, char** argv);
|
||||
extern "C" void seuillage_C( float reference[][SIZE_J][SIZE_I] , float idata[][SIZE_J][SIZE_I] );
|
||||
|
||||
|
||||
#endif // SEUILLAGE_H
|
||||
|
||||
|
||||
|
41
A4/TP_GPU-master/TP1_image/CUDA/seuillage_C_student.cpp
Normal file
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
 * NVIDIA Corporation and its licensors retain all intellectual property and
|
||||
 * proprietary rights in and to this software and related documentation and
|
||||
 * any modifications thereto. Any use, reproduction, disclosure, or distribution
|
||||
 * of this software and related documentation without an express license
|
||||
 * agreement from NVIDIA Corporation is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* Seuillage (thresholding) of a colour image on the CPU.
|
||||
* Reference solution.
|
||||
*/
|
||||
|
||||
#include "seuillage.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compute reference data set
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
void seuillage_C(float image_out[][SIZE_J][SIZE_I], float image_in[][SIZE_J][SIZE_I])
|
||||
{
|
||||
float r, g, b;
|
||||
for(int j=0; j<SIZE_J; j++){
|
||||
for(int i=0; i<SIZE_I; i++){
|
||||
r = image_in[0][j][i];
|
||||
g = image_in[1][j][i];
|
||||
b = image_in[2][j][i];
|
||||
|
||||
if(r/sqrt(r*r+g*g+b*b) > 0.7){
|
||||
image_out[0][j][i] = r;
|
||||
image_out[1][j][i] = g;
|
||||
image_out[2][j][i] = b;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
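
The function above only writes pixels that pass the red-ratio test; pixels that fail keep whatever was already in the output buffer, which is allocated with malloc and therefore indeterminate, whereas the Matlab reference zeroes them through the mask multiply. A minimal sketch of a variant that matches the Matlab behaviour (the name seuillage_C_zeroed is hypothetical, not part of the lab):

// Sketch only: identical to seuillage_C, but pixels that fail the test are set to 0.
void seuillage_C_zeroed(float image_out[][SIZE_J][SIZE_I], float image_in[][SIZE_J][SIZE_I])
{
    for (int j = 0; j < SIZE_J; j++) {
        for (int i = 0; i < SIZE_I; i++) {
            float r = image_in[0][j][i];
            float g = image_in[1][j][i];
            float b = image_in[2][j][i];
            // Keep the pixel only when the normalised red component dominates.
            bool keep = r / sqrtf(r * r + g * g + b * b) > 0.7f;
            image_out[0][j][i] = keep ? r : 0.0f;
            image_out[1][j][i] = keep ? g : 0.0f;
            image_out[2][j][i] = keep ? b : 0.0f;
        }
    }
}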
|
||||
|
||||
|
||||
|
||||
|
229
A4/TP_GPU-master/TP1_image/CUDA/seuillage_main_student.cu
Normal file
|
@ -0,0 +1,229 @@
|
|||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* Template project which demonstrates the basics on how to setup a project
|
||||
* example application.
|
||||
* Host code.
|
||||
*/
|
||||
|
||||
// includes, system
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
// includes CUDA
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "seuillage.h"
|
||||
|
||||
__global__ void seuillage_kernel(float d_image_in[][SIZE_J][SIZE_I],float d_image_out[][SIZE_J][SIZE_I])
|
||||
{
|
||||
|
||||
// YOUR CODE HERE
|
||||
int i,j;
|
||||
|
||||
j = blockIdx.x;
|
||||
i = threadIdx.x;
|
||||
|
||||
float r,g,b;
|
||||
|
||||
r = d_image_in[0][j][i];
|
||||
g = d_image_in[1][j][i];
|
||||
b = d_image_in[2][j][i];
|
||||
|
||||
if((r/sqrt(r*r+g*g+b*b))> 0.7){
|
||||
|
||||
d_image_out[0][j][i]=r;
|
||||
d_image_out[1][j][i]=g;
|
||||
d_image_out[2][j][i]=b;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// declaration, forward
|
||||
void runTest( int argc, char** argv);
|
||||
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Program main
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int
|
||||
main( int argc, char** argv)
|
||||
{
|
||||
|
||||
runTest( argc, argv);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Run a simple test for CUDA
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
void
|
||||
runTest( int argc, char** argv)
|
||||
{
|
||||
cudaError_t error;
|
||||
|
||||
if (argc<2)
|
||||
printf("indiquer le chemin du repertoire contenant les images\n");
|
||||
|
||||
const unsigned int mem_size = sizeof(float) * 3* SIZE_J * SIZE_I;
|
||||
// allocate host memory
|
||||
float* h_image_in = (float*) malloc(mem_size);
|
||||
|
||||
|
||||
//Initialize the input image volume
|
||||
FILE *file_ptr;
|
||||
char name_file_in[512];
|
||||
sprintf(name_file_in,"%s/ferrari.raw",argv[1]);
|
||||
printf("%s",name_file_in);
|
||||
file_ptr=fopen(name_file_in,"rb");
|
||||
fread(h_image_in,sizeof(float),3*SIZE_J*SIZE_I,file_ptr);
|
||||
fclose(file_ptr);
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// EXECUTION SUR LE CPU
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
// Image processed on the CPU
|
||||
float* h_image_out_CPU = (float*) malloc( mem_size);
|
||||
|
||||
printf("Seuillage CPU d'une image couleur \n");
|
||||
|
||||
cudaEvent_t start,stop;
|
||||
error = cudaEventCreate(&start);
|
||||
error = cudaEventCreate(&stop);
|
||||
|
||||
// Record the start event
|
||||
error = cudaEventRecord(start, NULL);
|
||||
error = cudaEventSynchronize(start);
|
||||
//Thresholding on the CPU
|
||||
seuillage_C( (float (*)[SIZE_J][SIZE_I])h_image_out_CPU, (float (*)[SIZE_J][SIZE_I])h_image_in);
|
||||
|
||||
// Record the stop event
|
||||
error = cudaEventRecord(stop, NULL);
|
||||
// Wait for the stop event to complete
|
||||
error = cudaEventSynchronize(stop);
|
||||
float msecTotal = 0.0f;
|
||||
error = cudaEventElapsedTime(&msecTotal, start, stop);
|
||||
|
||||
|
||||
printf("CPU execution time %f\n",msecTotal);
|
||||
|
||||
//Save the resulting image
|
||||
char name_file_out_CPU[512];
|
||||
sprintf(name_file_out_CPU,"%s/ferrari_out_CPU.raw",argv[1]);
|
||||
file_ptr=fopen(name_file_out_CPU,"wb");
|
||||
fwrite(h_image_out_CPU,sizeof(float),3*SIZE_J*SIZE_I,file_ptr);
|
||||
fclose(file_ptr);
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// EXECUTION SUR LE GPU
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
cudaEvent_t start_mem,stop_mem;
|
||||
error = cudaEventCreate(&start_mem);
|
||||
error = cudaEventCreate(&stop_mem);
|
||||
|
||||
error = cudaEventRecord(start, NULL);
|
||||
error = cudaEventSynchronize(start);
|
||||
|
||||
|
||||
float* h_image_out_GPU = (float*) malloc(mem_size);
|
||||
|
||||
// images on device memory
|
||||
float* d_image_in;
|
||||
float* d_image_out;
|
||||
|
||||
|
||||
|
||||
// Allocate d_image_in and d_image_out in GPU memory
|
||||
// YOUR CODE HERE
|
||||
cudaMalloc((void **) &d_image_in, mem_size);
|
||||
cudaMalloc((void **) &d_image_out, mem_size);
|
||||
|
||||
// copy host memory to device
|
||||
// YOUR CODE HERE
|
||||
cudaMemcpy(d_image_in, h_image_in, mem_size, cudaMemcpyHostToDevice);
|
||||
|
||||
error = cudaEventRecord(stop_mem, NULL);
|
||||
// Wait for the stop event to complete
|
||||
error = cudaEventSynchronize(stop_mem);
|
||||
float msecMem = 0.0f;
|
||||
error = cudaEventElapsedTime(&msecMem, start, stop_mem);
|
||||
|
||||
// setup execution parameters -> split the work into thread blocks
|
||||
// YOUR CODE HERE
|
||||
dim3 threads(960);
|
||||
dim3 grid(1280);
|
||||
|
||||
// launch the kernel threads on the GPU
|
||||
// YOUR CODE HERE
|
||||
|
||||
// HINT: for the parameters of the seuillage_kernel kernel, cast the (float *) pointers to (float (*)[SIZE_J][SIZE_I])
|
||||
// use the call to seuillage_C in main as a model.
|
||||
seuillage_kernel<<< grid,threads >>>((float (*)[SIZE_J][SIZE_I])d_image_in, (float (*)[SIZE_J][SIZE_I])d_image_out);
|
||||
// Record the start event
|
||||
|
||||
|
||||
|
||||
error = cudaEventRecord(start_mem, NULL);
|
||||
error = cudaEventSynchronize(start_mem);
|
||||
|
||||
// copy result from device to host
|
||||
// YOUR CODE HERE
|
||||
cudaMemcpy(h_image_out_GPU, d_image_out, mem_size, cudaMemcpyDeviceToHost);
|
||||
|
||||
// cleanup device memory
|
||||
//REMOVE THE COMMENTS FROM THE TWO LINES BELOW
|
||||
cudaFree(d_image_in);
|
||||
cudaFree(d_image_out);
|
||||
|
||||
|
||||
error = cudaEventRecord(stop, NULL);
|
||||
// Wait for the stop event to complete
|
||||
error = cudaEventSynchronize(stop);
|
||||
msecTotal = 0.0f;
|
||||
error = cudaEventElapsedTime(&msecTotal, start, stop);
|
||||
float msecMem2 =0.0f;
|
||||
error = cudaEventElapsedTime(&msecMem2, start_mem, stop);
|
||||
msecMem+=msecMem2;
|
||||
|
||||
printf("GPU execution time %f ms (memory management %2.2f \%)\n",msecTotal,(msecMem)/(msecTotal)*100);
|
||||
|
||||
// Write the output image to a file
|
||||
char name_file_out_GPU[512];
|
||||
sprintf(name_file_out_GPU,"%s/ferrari_out_GPU.raw",argv[1]);
|
||||
file_ptr=fopen(name_file_out_GPU,"wb");
|
||||
fwrite(h_image_out_GPU,sizeof(float),3*SIZE_J*SIZE_I,file_ptr);
|
||||
fclose(file_ptr);
|
||||
|
||||
|
||||
// cleanup memory
|
||||
free(h_image_in);
|
||||
free(h_image_out_GPU);
|
||||
free(h_image_out_CPU);
|
||||
|
||||
|
||||
|
||||
|
||||
}
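
The launch above uses one block per image row (1280 blocks of 960 threads), which works only because SIZE_I stays below the 1024-threads-per-block limit. A hedged sketch of a more conventional 2D decomposition built on the BLOCK_SIZE constant from seuillage.h, with explicit bounds checks; the kernel name seuillage_kernel_2d and the commented launch lines are illustrative, not part of the lab:

// Sketch: 16x16 thread blocks tiling the image, with guards for sizes that are
// not multiples of BLOCK_SIZE.
__global__ void seuillage_kernel_2d(float d_in[][SIZE_J][SIZE_I], float d_out[][SIZE_J][SIZE_I])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // column index, 0..SIZE_I-1
    int j = blockIdx.y * blockDim.y + threadIdx.y;   // row index,    0..SIZE_J-1
    if (i >= SIZE_I || j >= SIZE_J) return;

    float r = d_in[0][j][i], g = d_in[1][j][i], b = d_in[2][j][i];
    if (r / sqrtf(r * r + g * g + b * b) > 0.7f) {
        d_out[0][j][i] = r;
        d_out[1][j][i] = g;
        d_out[2][j][i] = b;
    }
}

// Matching launch, replacing dim3 threads(960); dim3 grid(1280);
//   dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
//   dim3 grid((SIZE_I + BLOCK_SIZE - 1) / BLOCK_SIZE, (SIZE_J + BLOCK_SIZE - 1) / BLOCK_SIZE);
//   seuillage_kernel_2d<<<grid, threads>>>((float (*)[SIZE_J][SIZE_I])d_image_in,
//                                          (float (*)[SIZE_J][SIZE_I])d_image_out);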
|
BIN
A4/TP_GPU-master/TP1_image/Image/.DS_Store
vendored
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/carre.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/carre_noise_gaussian.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/carre_noise_saltpepper.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/carre_out.jpg
Normal file
After Width: | Height: | Size: 4.8 KiB |
BIN
A4/TP_GPU-master/TP1_image/Image/carre_out.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari.jpg
Normal file
After Width: | Height: | Size: 154 KiB |
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari2.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_NB.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_NB_bruit_gaussien.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out.jpg
Normal file
After Width: | Height: | Size: 132 KiB |
13268
A4/TP_GPU-master/TP1_image/Image/ferrari_out.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out_CPU.jpg
Normal file
After Width: | Height: | Size: 19 KiB |
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out_CPU.png
Normal file
After Width: | Height: | Size: 162 KiB |
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out_CPU.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out_GPU.jpg
Normal file
After Width: | Height: | Size: 19 KiB |
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out_GPU.raw
Normal file
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out_v1.jpg
Normal file
After Width: | Height: | Size: 132 KiB |
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out_v2.jpg
Normal file
After Width: | Height: | Size: 132 KiB |
BIN
A4/TP_GPU-master/TP1_image/Image/ferrari_out_v3.jpg
Normal file
After Width: | Height: | Size: 132 KiB |
BIN
A4/TP_GPU-master/TP1_image/Matlab/.DS_Store
vendored
Normal file
51
A4/TP_GPU-master/TP1_image/Matlab/Contours_Matlab.m
Normal file
|
@ -0,0 +1,51 @@
|
|||
|
||||
%Ouverture d'une image au format raw
|
||||
fid = fopen('../Image/carre.raw', 'rb');
|
||||
image_in=fread(fid, 512*512, 'single');
|
||||
image_in=reshape(image_in,512,512);
|
||||
fclose(fid);
|
||||
|
||||
|
||||
%Affichage d'une image couleur avec image
|
||||
figure('name','Image in','numbertitle','off');imagesc(image_in);colormap(gray);
|
||||
|
||||
%Taille d'une image
|
||||
taille=size(image_in);
|
||||
display(taille);
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%tic toc pour mesurer le temps de calcul
|
||||
tic;
|
||||
hx=[-1 0 1];
|
||||
hy=[1;0;-1];
|
||||
|
||||
h=conv2(hx,hy)
|
||||
h2D=[0 1 0;-1 0 1;0 -1 0];
|
||||
|
||||
image_out=conv2(image_in,h2D,'same');
|
||||
image_out=abs(image_out);
|
||||
|
||||
%pour la convolution regardez l'aide pour conv2
|
||||
|
||||
%pour le filtre median regardez l'aide de medfilt2
|
||||
|
||||
|
||||
toc;
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
|
||||
|
||||
|
||||
%Affichage d'une image avec imagesc
|
||||
figure('name','Image out','numbertitle','off');imagesc(image_out);colormap gray;
|
||||
|
||||
|
||||
|
||||
%Sauvegarde d'une image au format jpg
|
||||
imwrite(image_out,'../Image/carre_out.jpg','jpg');
|
||||
|
||||
|
||||
%Sauvegarde d'une image au format raw
|
||||
fid = fopen('../Image/carre_out.raw', 'w');
|
||||
fwrite(fid, image_out, 'single');
|
||||
fclose(fid);
|
14
A4/TP_GPU-master/TP1_image/Matlab/raw2jpg.m
Normal file
|
@ -0,0 +1,14 @@
|
|||
function raw2jpg(name_proc)
|
||||
%fid = fopen('ferrari.raw', 'r');
|
||||
file_name_in=['../Image/ferrari_out_' name_proc '.raw'];
|
||||
fid = fopen(file_name_in, 'r');
|
||||
ima=single(fread(fid,1280*960*3, 'single'));
|
||||
fclose(fid);
|
||||
|
||||
ima=reshape(ima,960,1280,3);
|
||||
ima=ima./255;
|
||||
figure('name',name_proc,'numbertitle','off');image(ima);
|
||||
|
||||
file_name_out=['../Image/ferrari_out_' name_proc '.jpg'];
|
||||
imwrite(ima,file_name_out,'jpg');
|
||||
end
|
69
A4/TP_GPU-master/TP1_image/Matlab/seuil_Matlab.m
Normal file
|
@ -0,0 +1,69 @@
|
|||
close all;
|
||||
|
||||
%Ouverture d'une image au format couleur
|
||||
ima=single(imread('../Image/ferrari.jpg'));
|
||||
ima=ima./255;
|
||||
|
||||
%Affichage d'une image couleur avec image
|
||||
figure('name','RGB in','numbertitle','off');image(ima);
|
||||
|
||||
|
||||
%Taille d'une image
|
||||
taille=size(ima);
|
||||
display(taille);
|
||||
|
||||
ima_r=ima(:,:,1);
|
||||
ima_g=ima(:,:,2);
|
||||
ima_b=ima(:,:,3);
|
||||
|
||||
%Affichage d'un niveau de couleur de l'image
|
||||
figure('name','R','numbertitle','off');imagesc(ima_r);colormap gray %Niveau de rouge
|
||||
figure('name','G','numbertitle','off');imagesc(ima_g);colormap gray %Niveau de vert
|
||||
figure('name','B','numbertitle','off');imagesc(ima_b);colormap gray %Niveau de bleu
|
||||
|
||||
%Taille d'une image
|
||||
taille=size(ima);
|
||||
display(taille);
|
||||
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%tic toc pour mesurer le temps de calcul
|
||||
disp("Seuil :");
|
||||
tic;
|
||||
|
||||
ima_out=ima;
|
||||
|
||||
nr = ima_r./sqrt(ima_r.^2 + ima_b.^2 + ima_g.^2);
|
||||
|
||||
ima_seuil = ima.*(nr>0.7);
|
||||
toc;
|
||||
figure('name','RGB out','numbertitle','off');image(ima_seuil);
|
||||
|
||||
disp("Jaune :")
|
||||
tic;
|
||||
ima_jaune = ima_seuil;
|
||||
ima_jaune(:,:,2) = ima_seuil(:,:,1);
|
||||
toc;
|
||||
figure('name','RGB out','numbertitle','off');image(ima_jaune);
|
||||
|
||||
disp("Reinsertion dans l'image :")
|
||||
tic;
|
||||
ima_out = ima - ima_seuil + ima_jaune;
|
||||
figure('name','RGB out','numbertitle','off');image(ima_out);
|
||||
toc;
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
|
||||
|
||||
|
||||
%Sauvegarde d'une image au format jpg
|
||||
imwrite(ima_out,'../Image/ferrari_out.jpg','jpg');
|
||||
|
||||
|
||||
|
||||
%Sauvegarde d'une image au format raw
|
||||
fid = fopen('../Image/ferrari_out.raw', 'w');
|
||||
fwrite(fid, ima_out, 'single');
|
||||
fclose(fid);
|
||||
|
108
A4/TP_GPU-master/TP1_image/Matlab/seuil_Matlab_solution.m
Normal file
|
@ -0,0 +1,108 @@
|
|||
close all;
|
||||
|
||||
%Ouverture d'une image au format couleur
|
||||
ima=single(imread('../Image/ferrari.jpg'));
|
||||
ima=ima./255;
|
||||
|
||||
%Affichage d'une image couleur avec image
|
||||
figure('name','RGB in','numbertitle','off');image(ima);
|
||||
|
||||
|
||||
%Taille d'une image
|
||||
taille=size(ima);
|
||||
display(taille);
|
||||
|
||||
ima_r=ima(:,:,1);
|
||||
ima_g=ima(:,:,2);
|
||||
ima_b=ima(:,:,3);
|
||||
|
||||
%Affichage d'un niveau de couleur de l'image
|
||||
figure('name','R','numbertitle','off');imagesc(ima_r);colormap gray %Niveau de rouge
|
||||
figure('name','G','numbertitle','off');imagesc(ima_g);colormap gray %Niveau de vert
|
||||
figure('name','B','numbertitle','off');imagesc(ima_b);colormap gray %Niveau de bleu
|
||||
|
||||
%Taille d'une image
|
||||
taille=size(ima);
|
||||
display(taille);
|
||||
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%tic toc pour mesurer le temps de calcul
|
||||
display('VERSION 1');
|
||||
tic;
|
||||
|
||||
ima_out_v1=ima;
|
||||
%A vous de coder !
|
||||
|
||||
|
||||
for j=1:taille(2)
|
||||
for i=1:taille(1)
|
||||
|
||||
|
||||
|
||||
nr(i,j)=ima(i,j,1)/sqrt(ima(i,j,1)*ima(i,j,1)+ima(i,j,2)*ima(i,j,2)+ima(i,j,3)*ima(i,j,3));
|
||||
|
||||
if (nr(i,j)>0.7)
|
||||
ima_out_v1(i,j,2)=ima(i,j,1);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
toc;
|
||||
|
||||
|
||||
figure('name','ratio rouge','numbertitle','off');imagesc(nr);colormap gray %Niveau de rouge
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
display('VERSION 2');
|
||||
tic;
|
||||
|
||||
|
||||
ima_out_v2=ima;
|
||||
|
||||
nr=ima_r./sqrt(ima_r.^2+ima_g.^2+ima_b.^2);
|
||||
%A vous de coder !
|
||||
|
||||
for j=1:taille(2)
|
||||
for i=1:taille(1)
|
||||
if (nr(i,j)>0.7)
|
||||
ima_out_v2(i,j,2)=ima(i,j,1);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
toc;
|
||||
|
||||
|
||||
display('VERSION 3');
|
||||
tic;
|
||||
ima_out_v3=ima;
|
||||
|
||||
nr=ima_r./sqrt(ima_r.^2+ima_g.^2+ima_b.^2);
|
||||
image_test=nr>0.7;
|
||||
|
||||
image_tmp=ima_g;
|
||||
image_tmp(image_test)=ima_r(image_test);
|
||||
ima_out_v3(:,:,2)=image_tmp();
|
||||
toc;
|
||||
figure('name','test','numbertitle','off');imagesc(image_test);colormap gray %Niveau de rouge
|
||||
|
||||
figure('name','RGB out version 1','numbertitle','off');image(ima_out_v1);
|
||||
figure('name','RGB out version 2','numbertitle','off');image(ima_out_v2);
|
||||
figure('name','RGB out version 3','numbertitle','off');image(ima_out_v3);
|
||||
|
||||
%Sauvegarde d'une image au format jpg
|
||||
imwrite(ima_out_v1,'../Image/ferrari_out_v1.jpg','jpg');
|
||||
imwrite(ima_out_v2,'../Image/ferrari_out_v2.jpg','jpg');
|
||||
imwrite(ima_out_v3,'../Image/ferrari_out_v3.jpg','jpg');
|
||||
|
||||
|
||||
%Sauvegarde d'une image au format raw
|
||||
fid = fopen('../Image/ferrari_out.raw', 'w');
|
||||
fwrite(fid, ima_out_v3, 'single');
|
||||
fclose(fid);
|
||||
|
77
A4/TP_GPU-master/TP2_reduction/.cproject
Normal file
|
@ -0,0 +1,77 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
|
||||
<storageModule moduleId="org.eclipse.cdt.core.settings">
|
||||
<cconfiguration id="com.nvidia.cuda.ide.nine_zero.configuration.release.1433346787">
|
||||
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.nine_zero.configuration.release.1433346787" moduleId="org.eclipse.cdt.core.settings" name="Release">
|
||||
<externalSettings/>
|
||||
<extensions>
|
||||
<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
|
||||
</extensions>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release" cleanCommand="rm -rf" description="" id="com.nvidia.cuda.ide.nine_zero.configuration.release.1433346787" name="Release" parent="com.nvidia.cuda.ide.nine_zero.configuration.release">
|
||||
<folderInfo id="com.nvidia.cuda.ide.nine_zero.configuration.release.1433346787." name="/" resourcePath="">
|
||||
<toolChain id="com.nvidia.cuda.ide.toolchain.nine_zero.exe.release.698197512" name="CUDA Toolkit 10.0" superClass="com.nvidia.cuda.ide.toolchain.nine_zero.exe.release">
|
||||
<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.1887109166" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
|
||||
<builder buildPath="${workspace_loc:/TP2}/Release" id="com.nvidia.cuda.ide.builder.996479146" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="CUDA Toolkit 10.0 Builder" superClass="com.nvidia.cuda.ide.builder"/>
|
||||
<tool id="nvcc.compiler.base.970531586" name="NVCC Compiler" superClass="nvcc.compiler.base">
|
||||
<option id="nvcc.compiler.deviceDebug.130815577" name="Generate device debug information (-G)" superClass="nvcc.compiler.deviceDebug"/>
|
||||
<option id="nvcc.compiler.option.level.1535441624" name="Generate host debug information (-g)" superClass="nvcc.compiler.option.level"/>
|
||||
<option defaultValue="nvcc.compiler.optimization.level.most" id="nvcc.compiler.optimization.level.1934934131" name="Optimization Level" superClass="nvcc.compiler.optimization.level" valueType="enumerated"/>
|
||||
<option id="nvcc.compiler.pic.84159010" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic"/>
|
||||
<inputType id="nvcc.compiler.input.cu.2057104910" superClass="nvcc.compiler.input.cu"/>
|
||||
<inputType id="nvcc.compiler.input.cpp.1440437331" superClass="nvcc.compiler.input.cpp"/>
|
||||
<inputType id="nvcc.compiler.input.c.1110197327" superClass="nvcc.compiler.input.c"/>
|
||||
</tool>
|
||||
<tool id="nvcc.linker.base.1084287072" name="NVCC Linker" superClass="nvcc.linker.base">
|
||||
<option id="nvcc.linker.option.libs.1571841227" name="Libraries (-l)" superClass="nvcc.linker.option.libs" valueType="libs">
|
||||
<listOptionValue builtIn="false" value="gomp"/>
|
||||
</option>
|
||||
<inputType id="nvcc.linker.input.1818364586" superClass="nvcc.linker.input">
|
||||
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
|
||||
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
|
||||
</inputType>
|
||||
</tool>
|
||||
<tool id="nvcc.archiver.base.1485801156" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
|
||||
<tool id="com.nvidia.host.assembler.1159491752" name="Host Assembler" superClass="com.nvidia.host.assembler">
|
||||
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.174374770" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
|
||||
</tool>
|
||||
</toolChain>
|
||||
</folderInfo>
|
||||
</configuration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration">
|
||||
<executable devicelink="false">
|
||||
<sass major="3" minor="0"/>
|
||||
<ptx major="3" minor="0"/>
|
||||
</executable>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
|
||||
</cconfiguration>
|
||||
</storageModule>
|
||||
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
|
||||
<project id="TP2.com.nvidia.cuda.ide.nine_zero.exe.235863883" name="Executable" projectType="com.nvidia.cuda.ide.nine_zero.exe"/>
|
||||
</storageModule>
|
||||
<storageModule moduleId="scannerConfiguration">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
|
||||
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793;com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793.;nvcc.compiler.base.1363154216;nvcc.compiler.input.cu.75245776">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793;com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793.;nvcc.compiler.base.1363154216;nvcc.compiler.input.cpp.1264206044">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
|
||||
</scannerConfigBuildInfo>
|
||||
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793;com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793.;nvcc.compiler.base.1363154216;nvcc.compiler.input.c.1376543207">
|
||||
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
|
||||
</scannerConfigBuildInfo>
|
||||
</storageModule>
|
||||
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
|
||||
<storageModule moduleId="refreshScope"/>
|
||||
</cproject>
|
0
A4/TP_GPU-master/TP2_reduction/.gitkeep
Normal file
27
A4/TP_GPU-master/TP2_reduction/.project
Normal file
|
@ -0,0 +1,27 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<projectDescription>
|
||||
<name>TP2</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
|
||||
<triggers>clean,full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
|
||||
<triggers>full,incremental,</triggers>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.eclipse.cdt.core.cnature</nature>
|
||||
<nature>org.eclipse.cdt.core.ccnature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
|
||||
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
|
||||
</natures>
|
||||
</projectDescription>
|
34
A4/TP_GPU-master/TP2_reduction/CMakeLists.txt
Normal file
|
@ -0,0 +1,34 @@
|
|||
#cmake -DCMAKE_CUDA_FLAGS="-arch=sm_30" ..
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(reduce)
|
||||
|
||||
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
|
||||
|
||||
find_package(CUDA REQUIRED)
|
||||
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
|
||||
|
||||
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
|
||||
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
|
||||
set(CU_FILES ${ROOT_DIR}/Reduce_solution.cu)
|
||||
#file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
|
||||
|
||||
|
||||
set(CUDA_SEPARABLE_COMPILATION ON)
|
||||
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\" -std=c++14")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
|
||||
|
||||
cuda_add_executable(reduce ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
|
||||
set_target_properties(reduce PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(reduce PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
34
A4/TP_GPU-master/TP2_reduction/CMakeLists_cmake28.txt
Normal file
|
@ -0,0 +1,34 @@
|
|||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(matrice)
|
||||
|
||||
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
|
||||
|
||||
find_package(CUDA REQUIRED)
|
||||
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
|
||||
|
||||
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
|
||||
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
|
||||
set(CPP_FILES seuillage_C.cpp)
|
||||
set(CU_FILES ${ROOT_DIR}/seuillage_main.cu)
|
||||
#file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
|
||||
|
||||
|
||||
set(CUDA_SEPARABLE_COMPILATION ON)
|
||||
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
|
||||
|
||||
|
||||
cuda_add_executable(seuillage_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
|
||||
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(seuillage_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
17
A4/TP_GPU-master/TP2_reduction/CMakeLists_cmake38.txt
Normal file
|
@ -0,0 +1,17 @@
|
|||
#cmake -DCMAKE_CUDA_FLAGS="-arch=sm_30" ..
|
||||
cmake_minimum_required(VERSION 3.8)
|
||||
set(CUDACXX "/usr/local/cuda/bin/nvcc")
|
||||
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
|
||||
project(seuillage_CUDA LANGUAGES CXX CUDA)
|
||||
|
||||
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
|
||||
set(CMAKE_VERBOSE_MAKEFILE TRUE)
|
||||
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
|
||||
|
||||
|
||||
add_executable(seuillage_CUDA seuillage_C.cpp seuillage_main.cu)
|
||||
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_features(seuillage_CUDA PUBLIC cxx_std_11)
|
||||
target_include_directories(seuillage_CUDA PUBLIC ".")
|
||||
|
39
A4/TP_GPU-master/TP2_reduction/GpuTimer.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
class GpuTimer
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
|
||||
public:
|
||||
GpuTimer()
|
||||
{
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
}
|
||||
|
||||
~GpuTimer()
|
||||
{
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventDestroy(start);
|
||||
}
|
||||
|
||||
void Start()
|
||||
{
|
||||
cudaEventRecord(start);
|
||||
}
|
||||
|
||||
void Stop()
|
||||
{
|
||||
cudaEventRecord(stop);
|
||||
cudaEventSynchronize(stop);
|
||||
}
|
||||
|
||||
float Elapsed()
|
||||
{
|
||||
float elapsed;
|
||||
cudaEventElapsedTime(&elapsed, start, stop);
|
||||
return elapsed;
|
||||
}
|
||||
};
|
||||
|
458
A4/TP_GPU-master/TP2_reduction/Reduce.cu
Normal file
|
@ -0,0 +1,458 @@
|
|||
/*
|
||||
# Copyright (c) 2011-2012 NVIDIA CORPORATION. All Rights Reserved.
|
||||
#
|
||||
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
||||
# and proprietary rights in and to this software, related documentation
|
||||
# and any modifications thereto. Any use, reproduction, disclosure or
|
||||
# distribution of this software and related documentation without an express
|
||||
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <omp.h>
|
||||
|
||||
#include <thrust/host_vector.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/generate.h>
|
||||
#include <thrust/reduce.h>
|
||||
#include <thrust/functional.h>
|
||||
|
||||
//#include <cub/cub.cuh>
|
||||
#include "GpuTimer.h"
|
||||
|
||||
#define CUDA_SAFE_CALL(call) \
|
||||
{ \
|
||||
cudaError_t err_code = call; \
|
||||
if( err_code != cudaSuccess ) { std::cerr << "Error (" << __FILE__ << ":" << __LINE__ << "): " << cudaGetErrorString(err_code) << std::endl; return 1; } \
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// G P U R E D U C T I O N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__global__ void reduce_kernel( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
|
||||
{
|
||||
// Allocate shared memory inside the block.
|
||||
extern __shared__ int s_mem[];
|
||||
int my_sum=0; // accumulate in int: in_buffer and out_buffer hold ints
|
||||
// The range of data to work with.
|
||||
int2 range = block_ranges[blockIdx.x];
|
||||
|
||||
// Compute the sum of my elements.
|
||||
|
||||
// TODO: fill-in that section of the code
|
||||
|
||||
// Copy my sum in shared memory.
|
||||
s_mem[threadIdx.x] = my_sum;
|
||||
|
||||
// Make sure all the threads have copied their value in shared memory.
|
||||
__syncthreads();
|
||||
|
||||
// Compute the sum inside the block.
|
||||
|
||||
// TODO: fill-in that section of the code
|
||||
|
||||
// The first thread of the block stores its result.
|
||||
if( threadIdx.x == 0 )
|
||||
out_buffer[blockIdx.x] = s_mem[0];
|
||||
}
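
The two // TODO sections above are deliberately left blank for the exercise. For orientation only, here is one possible way to complete them, written as a self-contained kernel under the hypothetical name reduce_kernel_sketch (a block-strided partial sum followed by a shared-memory tree reduction); it is not the official solution:

__global__ void reduce_kernel_sketch( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
{
    extern __shared__ int s_mem[];
    int2 range = block_ranges[blockIdx.x];

    // Per-thread partial sum over this block's range, striding by the block size.
    int my_sum = 0;
    for( int i = range.x + threadIdx.x ; i < range.y ; i += blockDim.x )
        my_sum += in_buffer[i];
    s_mem[threadIdx.x] = my_sum;
    __syncthreads();

    // Tree reduction in shared memory (assumes blockDim.x is a power of two).
    for( int offset = blockDim.x / 2 ; offset > 0 ; offset /= 2 )
    {
        if( threadIdx.x < offset )
            s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
        __syncthreads();
    }

    // The first thread of the block stores its result.
    if( threadIdx.x == 0 )
        out_buffer[blockIdx.x] = s_mem[0];
}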
|
||||
|
||||
int reduce_on_gpu( int n, const int *a_device )
|
||||
{
|
||||
// Compute the size of the grid.
|
||||
const int BLOCK_DIM = 256;
|
||||
const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
|
||||
const int num_threads = BLOCK_DIM * grid_dim;
|
||||
|
||||
// Compute the number of elements per block.
|
||||
const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
|
||||
|
||||
// Allocate memory for temporary buffers.
|
||||
int *partial_sums = NULL;
|
||||
int2 *block_ranges = NULL;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
|
||||
|
||||
// Compute the ranges for the blocks.
|
||||
int sum = 0;
|
||||
int2 *block_ranges_on_host = new int2[grid_dim];
|
||||
for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
|
||||
{
|
||||
block_ranges_on_host[block_idx].x = sum;
|
||||
block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
|
||||
}
|
||||
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
|
||||
delete[] block_ranges_on_host;
|
||||
|
||||
// First round: Compute a partial sum for all blocks.
|
||||
reduce_kernel<<<grid_dim, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( n, a_device, partial_sums, block_ranges );
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
// Set the ranges for the second kernel call.
|
||||
int2 block_range = make_int2( 0, grid_dim );
|
||||
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
|
||||
|
||||
// Second round: Compute the final sum by summing the partial results of all blocks.
|
||||
reduce_kernel<<<1, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( grid_dim, partial_sums, partial_sums, block_ranges );
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
// Read the result from device memory.
|
||||
int result;
|
||||
CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
|
||||
|
||||
// Free temporary memory.
|
||||
CUDA_SAFE_CALL( cudaFree( block_ranges ) );
|
||||
CUDA_SAFE_CALL( cudaFree( partial_sums ) );
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// G P U R E D U C T I O N : O P T I M I Z E D V E R S I O N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define WARP_SIZE 32
|
||||
|
||||
template< int BLOCK_DIM >
|
||||
__global__ void reduce_kernel_optimized( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
|
||||
{
|
||||
// The number of warps in the block.
|
||||
const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
|
||||
int my_sum=0; // accumulate in int: in_buffer and out_buffer hold ints
|
||||
// Allocate shared memory inside the block.
|
||||
__shared__ volatile int s_mem[BLOCK_DIM];
|
||||
|
||||
// The range of data to work with.
|
||||
int2 range = block_ranges[blockIdx.x];
|
||||
|
||||
// Warp/lane IDs.
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
// Compute the sum of my elements.
|
||||
|
||||
// TODO: fill-in that section of the code
|
||||
|
||||
// Copy my sum in shared memory.
|
||||
s_mem[threadIdx.x] = my_sum;
|
||||
|
||||
// Compute the sum inside each warp.
|
||||
|
||||
// TODO: fill-in that section of the code
|
||||
|
||||
// Each warp leader stores the result for the warp.
|
||||
if( lane_id == 0 )
|
||||
// TODO: fill-in that section of the code
|
||||
__syncthreads();
|
||||
|
||||
if( warp_id == 0 )
|
||||
{
|
||||
// Read my value from shared memory and store it in a register.
|
||||
my_sum = s_mem[lane_id];
|
||||
|
||||
// Sum the results of the warps.
|
||||
|
||||
// TODO: fill-in that section of the code
|
||||
}
|
||||
|
||||
// The 1st thread stores the result of the block.
|
||||
if( threadIdx.x == 0 )
|
||||
out_buffer[blockIdx.x] = my_sum += s_mem[1];
|
||||
}
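
As above, the TODO sections are left for the exercise. A hedged sketch of one way to complete the optimized kernel, assuming CUDA 9 or later for __shfl_down_sync, is given below under the hypothetical name reduce_kernel_optimized_sketch. Two details worth noting: the warp-leader store must be the only statement governed by if( lane_id == 0 ) so that the following __syncthreads() is reached by every thread, and with this warp-shuffle scheme the block total already sits in my_sum on thread 0, so the final store differs slightly from the skeleton's final line (my_sum += s_mem[1]):

template< int BLOCK_DIM >
__global__ void reduce_kernel_optimized_sketch( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
{
    const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
    __shared__ int s_warp_sums[BLOCK_DIM / WARP_SIZE];

    int2 range = block_ranges[blockIdx.x];
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    // Per-thread partial sum over this block's range.
    int my_sum = 0;
    for( int i = range.x + threadIdx.x ; i < range.y ; i += BLOCK_DIM )
        my_sum += in_buffer[i];

    // Reduce inside each warp with shuffles (no shared memory or sync needed here).
    for( int offset = WARP_SIZE / 2 ; offset > 0 ; offset /= 2 )
        my_sum += __shfl_down_sync( 0xffffffffu, my_sum, offset );

    // Each warp leader stores the sum of its warp.
    if( lane_id == 0 )
        s_warp_sums[warp_id] = my_sum;
    __syncthreads();

    // The first warp reduces the per-warp results.
    if( warp_id == 0 )
    {
        my_sum = lane_id < NUM_WARPS ? s_warp_sums[lane_id] : 0;
        for( int offset = WARP_SIZE / 2 ; offset > 0 ; offset /= 2 )
            my_sum += __shfl_down_sync( 0xffffffffu, my_sum, offset );
    }

    // The 1st thread stores the result of the block.
    if( threadIdx.x == 0 )
        out_buffer[blockIdx.x] = my_sum;
}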
|
||||
|
||||
template< int BLOCK_DIM >
|
||||
int reduce_on_gpu_optimized( int n, const int *a_device )
|
||||
{
|
||||
// Compute the size of the grid.
|
||||
const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
|
||||
const int num_threads = BLOCK_DIM * grid_dim;
|
||||
|
||||
// Compute the number of elements per block.
|
||||
const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
|
||||
|
||||
// Allocate memory for temporary buffers.
|
||||
int *partial_sums = NULL;
|
||||
int2 *block_ranges = NULL;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
|
||||
|
||||
// Compute the ranges for the blocks.
|
||||
int sum = 0;
|
||||
int2 *block_ranges_on_host = new int2[grid_dim];
|
||||
for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
|
||||
{
|
||||
block_ranges_on_host[block_idx].x = sum;
|
||||
block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
|
||||
}
|
||||
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
|
||||
delete[] block_ranges_on_host;
|
||||
|
||||
// First round: Compute a partial sum for all blocks.
|
||||
reduce_kernel_optimized<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
// Set the ranges for the second kernel call.
|
||||
int2 block_range = make_int2( 0, grid_dim );
|
||||
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
|
||||
|
||||
// Second round: Compute the final sum by summing the partial results of all blocks.
|
||||
reduce_kernel_optimized<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
// Read the result from device memory.
|
||||
int result;
|
||||
CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
|
||||
|
||||
// Free temporary memory.
|
||||
CUDA_SAFE_CALL( cudaFree( block_ranges ) );
|
||||
CUDA_SAFE_CALL( cudaFree( partial_sums ) );
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// M A I N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main( int, char ** )
|
||||
{
|
||||
const int NUM_TESTS = 10;
|
||||
|
||||
// The number of elements in the problem.
|
||||
const int N = 512 * 131072;
|
||||
|
||||
std::cout << "Computing a reduction on " << N << " elements" << std::endl;
|
||||
|
||||
// X and Y on the host (CPU).
|
||||
int *a_host = new int[N];
|
||||
|
||||
// Make sure the memory got allocated. TODO: free memory.
|
||||
if( a_host == NULL )
|
||||
{
|
||||
std::cerr << "ERROR: Couldn't allocate a_host" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Generate data
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << "Filling with 1s" << std::endl;
|
||||
|
||||
// Fill the array with 1s so the expected sum is exactly N.
|
||||
for( int i = 0 ; i < N ; ++i )
|
||||
a_host[i] = 1;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the CPU using 1 thread
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the CPU using 1 CPU thread" << std::endl;
|
||||
|
||||
GpuTimer gpu_timer;
|
||||
gpu_timer.Start();
|
||||
|
||||
// Calculate the reference to compare with the device result.
|
||||
int sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
sum = 0;
|
||||
for( int i = 0 ; i < N ; ++i )
|
||||
sum += a_host[i];
|
||||
}
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the CPU using several OpenMP threads
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the CPU using " << omp_get_max_threads() << " OpenMP thread(s)" << std::endl;
|
||||
|
||||
gpu_timer.Start();
|
||||
|
||||
// Calculate the reference to compare with the device result.
|
||||
int omp_sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
omp_sum = 0;
|
||||
#pragma omp parallel shared(omp_sum)
|
||||
{
|
||||
#pragma omp for reduction(+ : omp_sum)
|
||||
for( int i = 0 ; i < N ; ++i )
|
||||
omp_sum = omp_sum + a_host[i];
|
||||
}
|
||||
}
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// The copy of A on the device (GPU).
|
||||
int *a_device = NULL;
|
||||
|
||||
// Allocate A on the device.
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &a_device, N*sizeof( int ) ) );
|
||||
|
||||
// Copy A from host (CPU) to device (GPU).
|
||||
CUDA_SAFE_CALL( cudaMemcpy( a_device, a_host, N*sizeof( int ), cudaMemcpyHostToDevice ) );
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU using Thrust
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the GPU using Thrust (transfers excluded)" << std::endl;
|
||||
|
||||
|
||||
gpu_timer.Start();
|
||||
|
||||
// Launch the kernel on the GPU.
|
||||
int thrust_sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
thrust_sum = thrust::reduce( thrust::device_ptr<int>(a_device), thrust::device_ptr<int>(a_device+N) );
|
||||
}
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU using CUB
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/*int cub_sum=0;
|
||||
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the GPU using CUB (transfers excluded)" << std::endl;
|
||||
|
||||
int * sum_device=NULL;
|
||||
cudaMalloc(&sum_device, sizeof(int));
|
||||
|
||||
void *temp_storage_device = NULL;
|
||||
size_t temp_storage_bytes = 0;
|
||||
cub::DeviceReduce::Sum(temp_storage_device, temp_storage_bytes, a_device ,sum_device, N);
|
||||
// Allocate temporary storage
|
||||
cudaMalloc(&temp_storage_device, temp_storage_bytes);
|
||||
|
||||
|
||||
gpu_timer.Start();
|
||||
|
||||
// Run reduction
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
cub::DeviceReduce::Sum(temp_storage_device, temp_storage_bytes, a_device, sum_device,N);
|
||||
}
|
||||
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
CUDA_SAFE_CALL( cudaMemcpy( &cub_sum, sum_device, sizeof(int), cudaMemcpyDeviceToHost ) );
|
||||
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
*/
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the GPU (transfers excluded)" << std::endl;
|
||||
|
||||
gpu_timer.Start();
|
||||
|
||||
// Launch the kernel on the GPU.
|
||||
int gpu_sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
gpu_sum = reduce_on_gpu( N, a_device );
|
||||
}
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU (optimized version)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the GPU using a tuned version (transfers excluded)" << std::endl;
|
||||
|
||||
gpu_timer.Start();
|
||||
|
||||
const int BLOCK_DIM = 256;
|
||||
|
||||
// Launch the kernel on the GPU.
|
||||
int optim_gpu_sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
optim_gpu_sum = reduce_on_gpu_optimized<BLOCK_DIM>( N, a_device );
|
||||
}
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Validate results
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "OpenMP results: ref= " << sum << " / sum= " << omp_sum << std::endl;
|
||||
std::cout << "Thrust results: ref= " << sum << " / sum= " << thrust_sum << std::endl;
|
||||
//std::cout << "CUB results: ref= " << sum << " / sum= " << cub_sum << std::endl;
|
||||
std::cout << "CUDA results: ref= " << sum << " / sum= " << gpu_sum << std::endl;
|
||||
std::cout << "CUDA Optim results: ref= " << sum << " / sum= " << optim_gpu_sum << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Clean memory
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Free device memory.
|
||||
CUDA_SAFE_CALL( cudaFree( a_device ) );
|
||||
|
||||
// Free host memory.
|
||||
delete[] a_host;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
7
A4/TP_GPU-master/TP2_reduction/windows/CpuTimer.cpp
Normal file
|
@ -0,0 +1,7 @@
|
|||
#include "CpuTimer.h"
|
||||
|
||||
// Initialize the resolution of the timer
|
||||
LARGE_INTEGER CpuTimer::m_freq = (QueryPerformanceFrequency(&CpuTimer::m_freq), CpuTimer::m_freq);
|
||||
|
||||
// Calculate the overhead of the timer
|
||||
LONGLONG CpuTimer::m_overhead = CpuTimer::GetOverhead();
|
37
A4/TP_GPU-master/TP2_reduction/windows/CpuTimer.h
Normal file
|
@ -0,0 +1,37 @@
|
|||
#pragma once
|
||||
#include <windows.h>
|
||||
|
||||
struct CpuTimer
|
||||
{
|
||||
void Start()
|
||||
{
|
||||
QueryPerformanceCounter(&m_start);
|
||||
}
|
||||
|
||||
void Stop()
|
||||
{
|
||||
QueryPerformanceCounter(&m_stop);
|
||||
}
|
||||
|
||||
// Returns elapsed time in milliseconds (ms)
|
||||
double Elapsed()
|
||||
{
|
||||
return (m_stop.QuadPart - m_start.QuadPart - m_overhead) * 1000.0 / m_freq.QuadPart;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
// Returns the overhead of the timer in ticks
|
||||
static LONGLONG GetOverhead()
|
||||
{
|
||||
CpuTimer t;
|
||||
t.Start();
|
||||
t.Stop();
|
||||
return t.m_stop.QuadPart - t.m_start.QuadPart;
|
||||
}
|
||||
|
||||
LARGE_INTEGER m_start;
|
||||
LARGE_INTEGER m_stop;
|
||||
static LARGE_INTEGER m_freq;
|
||||
static LONGLONG m_overhead;
|
||||
};
|
39
A4/TP_GPU-master/TP2_reduction/windows/GpuTimer.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
class GpuTimer
|
||||
{
|
||||
cudaEvent_t start, stop;
|
||||
|
||||
public:
|
||||
GpuTimer()
|
||||
{
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
}
|
||||
|
||||
~GpuTimer()
|
||||
{
|
||||
cudaEventDestroy(stop);
|
||||
cudaEventDestroy(start);
|
||||
}
|
||||
|
||||
void Start()
|
||||
{
|
||||
cudaEventRecord(start);
|
||||
}
|
||||
|
||||
void Stop()
|
||||
{
|
||||
cudaEventRecord(stop);
|
||||
cudaEventSynchronize(stop);
|
||||
}
|
||||
|
||||
float Elapsed()
|
||||
{
|
||||
float elapsed;
|
||||
cudaEventElapsedTime(&elapsed, start, stop);
|
||||
return elapsed;
|
||||
}
|
||||
};
|
||||
|
415
A4/TP_GPU-master/TP2_reduction/windows/Reduce.cu
Normal file
|
@ -0,0 +1,415 @@
|
|||
/*
|
||||
# Copyright (c) 2011-2012 NVIDIA CORPORATION. All Rights Reserved.
|
||||
#
|
||||
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
||||
# and proprietary rights in and to this software, related documentation
|
||||
# and any modifications thereto. Any use, reproduction, disclosure or
|
||||
# distribution of this software and related documentation without an express
|
||||
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <omp.h>
|
||||
#include <thrust/reduce.h>
|
||||
#include "GpuTimer.h"
|
||||
#include "CpuTimer.h"
|
||||
|
||||
#define CUDA_SAFE_CALL(call) \
|
||||
{ \
|
||||
cudaError_t err_code = call; \
|
||||
if( err_code != cudaSuccess ) { std::cerr << "Error (" << __FILE__ << ":" << __LINE__ << "): " << cudaGetErrorString(err_code) << std::endl; return 1; } \
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// G P U R E D U C T I O N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__global__ void reduce_kernel( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
|
||||
{
|
||||
// Allocate shared memory inside the block.
|
||||
extern __shared__ int s_mem[];
|
||||
|
||||
// The range of data to work with.
|
||||
int2 range = block_ranges[blockIdx.x];
|
||||
|
||||
// Compute the sum of my elements.
|
||||
|
||||
// TODO: fill-in that section of the code
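// A possible completion (sketch, not necessarily the intended solution):
// each thread accumulates a strided slice of the block's range [range.x, range.y).
int my_sum = 0;
for( int i = range.x + threadIdx.x ; i < range.y ; i += blockDim.x )
my_sum += in_buffer[i];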
|
||||
|
||||
// Copy my sum in shared memory.
|
||||
s_mem[threadIdx.x] = my_sum;
|
||||
|
||||
// Make sure all the threads have copied their value in shared memory.
|
||||
__syncthreads();
|
||||
|
||||
// Compute the sum inside the block.
|
||||
|
||||
// TODO: fill-in that section of the code
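// Possible completion (sketch): tree reduction in shared memory; assumes
// blockDim.x is a power of two (256 in this TP).
for( int offset = blockDim.x / 2 ; offset > 0 ; offset /= 2 )
{
if( threadIdx.x < offset )
s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
__syncthreads();
}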
|
||||
|
||||
// The first thread of the block stores its result.
|
||||
if( threadIdx.x == 0 )
|
||||
out_buffer[blockIdx.x] = s_mem[0];
|
||||
}
|
||||
|
||||
int reduce_on_gpu( int n, const int *a_device )
|
||||
{
|
||||
// Compute the size of the grid.
|
||||
const int BLOCK_DIM = 256;
|
||||
const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
|
||||
const int num_threads = BLOCK_DIM * grid_dim;
|
||||
|
||||
// Compute the number of elements per block.
|
||||
const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
|
||||
|
||||
// Allocate memory for temporary buffers.
|
||||
int *partial_sums = NULL;
|
||||
int2 *block_ranges = NULL;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
|
||||
|
||||
// Compute the ranges for the blocks.
|
||||
int sum = 0;
|
||||
int2 *block_ranges_on_host = new int2[grid_dim];
|
||||
for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
|
||||
{
|
||||
block_ranges_on_host[block_idx].x = sum;
|
||||
block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
|
||||
}
|
||||
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
|
||||
delete[] block_ranges_on_host;
|
||||
|
||||
// First round: Compute a partial sum for all blocks.
|
||||
reduce_kernel<<<grid_dim, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( n, a_device, partial_sums, block_ranges );
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
// Set the ranges for the second kernel call.
|
||||
int2 block_range = make_int2( 0, grid_dim );
|
||||
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
|
||||
|
||||
// Second round: Compute the final sum by summing the partial results of all blocks.
|
||||
reduce_kernel<<<1, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( grid_dim, partial_sums, partial_sums, block_ranges );
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
// Read the result from device memory.
|
||||
int result;
|
||||
CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
|
||||
|
||||
// Free temporary memory.
|
||||
CUDA_SAFE_CALL( cudaFree( block_ranges ) );
|
||||
CUDA_SAFE_CALL( cudaFree( partial_sums ) );
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// G P U R E D U C T I O N : O P T I M I Z E D V E R S I O N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define WARP_SIZE 32
|
||||
|
||||
template< int BLOCK_DIM >
|
||||
__global__ void reduce_kernel_optimized( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
|
||||
{
|
||||
// The number of warps in the block.
|
||||
const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
|
||||
|
||||
// Allocate shared memory inside the block.
|
||||
__shared__ volatile int s_mem[BLOCK_DIM];
|
||||
|
||||
// The range of data to work with.
|
||||
int2 range = block_ranges[blockIdx.x];
|
||||
|
||||
// Warp/lane IDs.
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
// Compute the sum of my elements.
|
||||
|
||||
// TODO: fill-in that section of the code
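// Possible completion (sketch): strided accumulation over [range.x, range.y).
int my_sum = 0;
for( int i = range.x + threadIdx.x ; i < range.y ; i += BLOCK_DIM )
my_sum += in_buffer[i];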
|
||||
|
||||
// Copy my sum in shared memory.
|
||||
s_mem[threadIdx.x] = my_sum;
|
||||
|
||||
// Compute the sum inside each warp.
|
||||
|
||||
// TODO: fill-in that section of the code
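// Possible completion (sketch): warp-synchronous tree reduction on the volatile
// shared buffer (valid for the sm_2x model targeted here; newer architectures
// would need __syncwarp()). The trailing __syncthreads() makes every warp's sum
// visible before the warp leaders overwrite s_mem[0..NUM_WARPS-1] below.
for( int offset = WARP_SIZE / 2 ; offset > 0 ; offset /= 2 )
if( lane_id < offset )
s_mem[threadIdx.x] += s_mem[threadIdx.x + offset];
__syncthreads();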
|
||||
|
||||
// Each warp leader stores the result for the warp.
|
||||
if( lane_id == 0 )
|
||||
// TODO: fill-in that section of the code
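// Possible completion (sketch): the warp leader publishes its warp's sum.
s_mem[warp_id] = s_mem[threadIdx.x];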
|
||||
__syncthreads();
|
||||
|
||||
if( warp_id == 0 )
|
||||
{
|
||||
// Read my value from shared memory and store it in a register.
|
||||
my_sum = s_mem[lane_id];
|
||||
|
||||
// Sum the results of the warps.
|
||||
|
||||
// TODO: fill-in that section of the code
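// Possible completion (sketch): pairwise reduction of the NUM_WARPS warp sums,
// deliberately stopping at two partial values so that the final line below
// (my_sum += s_mem[1]) produces the block total (assumes NUM_WARPS >= 2).
for( int offset = NUM_WARPS / 2 ; offset > 1 ; offset /= 2 )
if( lane_id < offset )
s_mem[lane_id] += s_mem[lane_id + offset];
my_sum = s_mem[lane_id];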
|
||||
}
|
||||
|
||||
// The 1st thread stores the result of the block.
|
||||
if( threadIdx.x == 0 )
|
||||
out_buffer[blockIdx.x] = my_sum += s_mem[1];
|
||||
}
|
||||
|
||||
template< int BLOCK_DIM >
|
||||
int reduce_on_gpu_optimized( int n, const int *a_device )
|
||||
{
|
||||
// Compute the size of the grid.
|
||||
const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
|
||||
const int num_threads = BLOCK_DIM * grid_dim;
|
||||
|
||||
// Compute the number of elements per block.
|
||||
const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
|
||||
|
||||
// Allocate memory for temporary buffers.
|
||||
int *partial_sums = NULL;
|
||||
int2 *block_ranges = NULL;
|
||||
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
|
||||
|
||||
// Compute the ranges for the blocks.
|
||||
int sum = 0;
|
||||
int2 *block_ranges_on_host = new int2[grid_dim];
|
||||
for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
|
||||
{
|
||||
block_ranges_on_host[block_idx].x = sum;
|
||||
block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
|
||||
}
|
||||
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
|
||||
delete[] block_ranges_on_host;
|
||||
|
||||
// First round: Compute a partial sum for all blocks.
|
||||
reduce_kernel_optimized<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
// Set the ranges for the second kernel call.
|
||||
int2 block_range = make_int2( 0, grid_dim );
|
||||
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
|
||||
|
||||
// Second round: Compute the final sum by summing the partial results of all blocks.
|
||||
reduce_kernel_optimized<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
|
||||
// Read the result from device memory.
|
||||
int result;
|
||||
CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
|
||||
|
||||
// Free temporary memory.
|
||||
CUDA_SAFE_CALL( cudaFree( block_ranges ) );
|
||||
CUDA_SAFE_CALL( cudaFree( partial_sums ) );
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// M A I N
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main( int, char ** )
|
||||
{
|
||||
const int NUM_TESTS = 10;
|
||||
|
||||
// The number of elements in the problem.
|
||||
const int N = 512 * 131072;
|
||||
|
||||
std::cout << "Computing a reduction on " << N << " elements" << std::endl;
|
||||
|
||||
// X and Y on the host (CPU).
|
||||
int *a_host = new int[N];
|
||||
|
||||
// Make sure the memory got allocated. TODO: free memory.
|
||||
if( a_host == NULL )
|
||||
{
|
||||
std::cerr << "ERROR: Couldn't allocate a_host" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Generate data
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << "Filling with 1s" << std::endl;
|
||||
|
||||
// Generate pseudo-random data.
|
||||
for( int i = 0 ; i < N ; ++i )
|
||||
a_host[i] = 1;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the CPU using 1 thread
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the CPU using 1 CPU thread" << std::endl;
|
||||
|
||||
CpuTimer cpu_timer;
|
||||
cpu_timer.Start();
|
||||
|
||||
// Calculate the reference to compare with the device result.
|
||||
int sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
sum = 0;
|
||||
for( int i = 0 ; i < N ; ++i )
|
||||
sum += a_host[i];
|
||||
}
|
||||
|
||||
cpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << cpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the CPU using several OpenMP threads
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the CPU using " << omp_get_max_threads() << " OpenMP thread(s)" << std::endl;
|
||||
|
||||
cpu_timer.Start();
|
||||
|
||||
// Calculate the reference to compare with the device result.
|
||||
int omp_sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
omp_sum = 0;
|
||||
#pragma omp parallel shared(omp_sum)
|
||||
{
|
||||
#pragma omp for reduction(+ : omp_sum)
|
||||
for( int i = 0 ; i < N ; ++i )
|
||||
omp_sum = omp_sum + a_host[i];
|
||||
}
|
||||
}
|
||||
|
||||
cpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << cpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// The copy of A on the device (GPU).
|
||||
int *a_device = NULL;
|
||||
|
||||
// Allocate A on the device.
|
||||
CUDA_SAFE_CALL( cudaMalloc( (void **) &a_device, N*sizeof( int ) ) );
|
||||
|
||||
// Copy A from host (CPU) to device (GPU).
|
||||
CUDA_SAFE_CALL( cudaMemcpy( a_device, a_host, N*sizeof( int ), cudaMemcpyHostToDevice ) );
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU using Thrust
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the GPU using Thrust (transfers excluded)" << std::endl;
|
||||
|
||||
GpuTimer gpu_timer;
|
||||
gpu_timer.Start();
|
||||
|
||||
// Launch the kernel on the GPU.
|
||||
int thrust_sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
thrust_sum = thrust::reduce( thrust::device_ptr<int>(a_device), thrust::device_ptr<int>(a_device+N) );
|
||||
}
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the GPU (transfers excluded)" << std::endl;
|
||||
|
||||
gpu_timer.Start();
|
||||
|
||||
// Launch the kernel on the GPU.
|
||||
int gpu_sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
gpu_sum = reduce_on_gpu( N, a_device );
|
||||
}
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Compute on the GPU (optimized version)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << "Computing on the GPU using a tuned version (transfers excluded)" << std::endl;
|
||||
|
||||
gpu_timer.Start();
|
||||
|
||||
const int BLOCK_DIM = 256;
|
||||
|
||||
// Launch the kernel on the GPU.
|
||||
int optim_gpu_sum = 0;
|
||||
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
|
||||
{
|
||||
optim_gpu_sum = reduce_on_gpu_optimized<BLOCK_DIM>( N, a_device );
|
||||
}
|
||||
|
||||
gpu_timer.Stop();
|
||||
|
||||
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Validate results
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::cout << std::endl;
|
||||
std::cout << std::endl;
|
||||
std::cout << "OpenMP results: ref= " << sum << " / sum= " << omp_sum << std::endl;
|
||||
std::cout << "CUDA results: ref= " << sum << " / sum= " << gpu_sum << std::endl;
|
||||
std::cout << "Thrust results: ref= " << sum << " / sum= " << thrust_sum << std::endl;
|
||||
std::cout << "Optim results: ref= " << sum << " / sum= " << optim_gpu_sum << std::endl;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Clean memory
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Free device memory.
|
||||
CUDA_SAFE_CALL( cudaFree( a_device ) );
|
||||
|
||||
// Free host memory.
|
||||
delete[] a_host;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
20
A4/TP_GPU-master/TP2_reduction/windows/Reduce.sln
Normal file
|
@ -0,0 +1,20 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 11.00
|
||||
# Visual Studio 2010
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Reduce", "Reduce.vcxproj", "{037A2CAC-CD22-421A-9BB3-4E39043D36C0}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|x64 = Debug|x64
|
||||
Release|x64 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{037A2CAC-CD22-421A-9BB3-4E39043D36C0}.Debug|x64.ActiveCfg = Debug|x64
|
||||
{037A2CAC-CD22-421A-9BB3-4E39043D36C0}.Debug|x64.Build.0 = Debug|x64
|
||||
{037A2CAC-CD22-421A-9BB3-4E39043D36C0}.Release|x64.ActiveCfg = Release|x64
|
||||
{037A2CAC-CD22-421A-9BB3-4E39043D36C0}.Release|x64.Build.0 = Release|x64
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
EndGlobalSection
|
||||
EndGlobal
|
BIN
A4/TP_GPU-master/TP2_reduction/windows/Reduce.suo
Normal file
172
A4/TP_GPU-master/TP2_reduction/windows/Reduce.vcxproj
Normal file
|
@ -0,0 +1,172 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{037A2CAC-CD22-421A-9BB3-4E39043D36C0}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>Reduce</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 4.2.props" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<OpenMPSupport>true</OpenMPSupport>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);cudart.lib</AdditionalDependencies>
|
||||
</Link>
|
||||
<CudaCompile>
|
||||
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
||||
<AdditionalOptions>-Xcompiler "/openmp" %(AdditionalOptions)</AdditionalOptions>
|
||||
</CudaCompile>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<OpenMPSupport>true</OpenMPSupport>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
<AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);cudart.lib</AdditionalDependencies>
|
||||
</Link>
|
||||
<CudaCompile>
|
||||
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
||||
<AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
|
||||
<Optimization>O3</Optimization>
|
||||
<AdditionalOptions>-Xcompiler "/openmp" %(AdditionalOptions)</AdditionalOptions>
|
||||
</CudaCompile>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="GpuTimer.h" />
|
||||
<ClInclude Include="CpuTimer.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="CpuTimer.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CudaCompile Include="Reduce.cu">
|
||||
<TargetMachinePlatform Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">64</TargetMachinePlatform>
|
||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_20,sm_20</CodeGeneration>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xcompiler "/openmp" %(AdditionalOptions)</AdditionalOptions>
|
||||
</CudaCompile>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 4.2.targets" />
|
||||
</ImportGroup>
|
||||
</Project>
|
115
A4/TP_GPU-master/TP3_TomoGPI/TP_TomoGPI_one_gradient.m
Normal file
|
@ -0,0 +1,115 @@
|
|||
clear all;close all;
|
||||
N=128;
|
||||
P=128;
|
||||
rep_name=['Projections_',num2str(P),'/'];
|
||||
iter = Iter3D(rep_name); % Create a class instance
|
||||
|
||||
iter.save_file=1;
|
||||
iter.save_volume=2;
|
||||
iter.fc=1.0;%cutoff frequency for the filtered backprojection filter
|
||||
|
||||
f_real=CreateVolumeReal(iter);
|
||||
g_real=getSinoReal(iter);
|
||||
|
||||
f_estimated=CreateVolumeInit(iter);
|
||||
normdJProjReg=CreateVolumeInit(iter);
|
||||
|
||||
[g_real,rsb_in]=addNoise(iter,g_real);
|
||||
file_name=sprintf('%s/P_ER_GPU_NOISE_%2.1fdB.s',iter.workdirectory,rsb_in);
|
||||
fid = fopen(file_name, 'wb');
|
||||
fwrite(fid,g_real ,'float');
|
||||
fclose(fid);
|
||||
|
||||
%temp=zeros(size(f_real));%doLaplacian(iter,f_real,temp);%figure(2);imagesc(temp(:,:,N/2));title('laplacien df');colorbar;colormap(gray);drawnow;%figure(3);imagesc(f_real(:,:,N/2));title('f_real');colorbar;colormap(gray);drawnow;
|
||||
|
||||
% disp('****************************')
|
||||
% disp('Gradient descent... ')
|
||||
% disp('****************************')
|
||||
|
||||
f_estimated_n=f_estimated;
|
||||
for num_iter_gradient_n=1:1:getGradientIterationNb(iter)
|
||||
fprintf("iter = %d\n",num_iter_gradient_n);
|
||||
iter.num_iter=num_iter_gradient_n;
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% COMPUTE J AND dJ
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
%g=H*f
|
||||
g_estimated=doProjection(iter,f_estimated_n);%figure(4);imagesc(g_estimated(:,:,N/2));title('g_estimated');colorbar;colormap(gray);drawnow;
|
||||
dg=g_real-g_estimated;%figure(5);imagesc(g_real(:,:,N/2));title('g_real');colorbar;colormap(gray);drawnow;%figure(6);imagesc(dg(:,:,N/2));title('dg');colorbar;colormap(gray);drawnow;
|
||||
|
||||
%df=Ht*(g-Hf)
|
||||
df=doBackprojection(iter,dg);%figure(7);imagesc(df(:,:,N/2));title('df');colorbar;colormap(gray);drawnow;
|
||||
|
||||
%dJ=-2*Ht*(g-Hf)
|
||||
dJ_MC=-2*df;
|
||||
dJ=dJ_MC;
|
||||
|
||||
%dJ+=Dt*D*f with D the Laplacian operator
|
||||
%J_reg=0;
|
||||
%ApplyLaplacianRegularization_to_dJ(iter,f_estimated_n,dJ,getLambda(iter),J_reg,normdJProjReg,getGradientIterationNb(iter),getOptimalStepIterationNb(iter));
|
||||
%dJ_reg=zeros(size(dJ)); temp=zeros(size(dJ));%doLaplacian(iter,f_estimated_n,temp);%doLaplacian(iter,temp,dJ_reg);clear temp;
|
||||
%dJ=dJ+2*dJ_reg;
|
||||
%dJ_reg=dJ-dJ_MC;figure(8);imagesc(dJ_reg(:,:,N/2));title('dJ reg');colorbar;colormap(gray);drawnow;%figure(9);imagesc(dg(:,:,N/2));title('dg');colorbar;colormap(gray);drawnow;
|
||||
|
||||
%Update of the gradient dJ
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% COMPUTE THE STEP SIZE
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
num_alpha=sum(dJ(:).^2);
|
||||
proj_dJ=doProjection(iter,dJ);
|
||||
denum_alpha=2*sum(proj_dJ(:).^2);
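% Exact line search for the quadratic data-fit term J(f)=||g-H*f||^2:
% alpha = ||dJ||^2 / (2*||H*dJ||^2), with proj_dJ = H*dJ computed just above.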
|
||||
clear proj_dJ;
|
||||
|
||||
%IF REGULARIZATION IS USED
|
||||
%if (getLambda(iter) ~= 0)
|
||||
% s=zeros(size(dJ));
|
||||
% doLaplacian(iter,dJ,s);
|
||||
% s=sum(s(:).^2);
|
||||
% denum_alpha=denum_alpha+getLambda(iter)*s;
|
||||
%end
|
||||
|
||||
alpha=num_alpha/denum_alpha;
|
||||
|
||||
%iter.alpha(iter.num_iter)=alpha; %figure(10);plot(iter.alpha);title('pas');xlabel('iter');ylabel('alpha');drawnow;
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% UPDATE f
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
f_estimated_n=f_estimated_n-alpha.*dJ;
|
||||
clear dJ;
|
||||
|
||||
% SAVE THE RECONSTRUCTED VOLUME EVERY iter.save_file ITERATIONS
|
||||
iter=sauvegarde_volume_TOMO8(f_estimated_n,f_real,iter);
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%COMPUTE THE CRITERION (OPTIONAL)
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
||||
J_MC=sum(dg(:).^2);
|
||||
iter.J(iter.num_iter)=J_MC;%+J_reg
|
||||
|
||||
niter_done=size(iter.J,2);
|
||||
if (niter_done > 1)
|
||||
figure(11);plot(iter.J(2:niter_done));title('J');xlabel('iter');ylabel('J');drawnow;
|
||||
end
|
||||
figure(12);imagesc(f_estimated_n(:,:,N/2));title('gradient');colorbar;colormap(gray);drawnow;
|
||||
end
|
||||
|
||||
% disp('****************************')
|
||||
% disp('Gradient descent done !!!!')
|
||||
% disp('****************************')
|
||||
|
||||
|
||||
figure(11);
|
||||
plot(f_estimated_n(:,N/2,N/2),'b','LineWidth',1.5,'Marker','+');hold on;
|
||||
plot(f_real(:,N/2,N/2),'k','LineWidth',1.5);hold on;
|
||||
legend('gradient','real');
|
||||
|
||||
|
||||
figure(13);
|
||||
imagesc(f_real(:,:,N/2));title('real');colorbar;colormap(gray);drawnow;
|
||||
|
73
A4/TP_GPU-master/TP3_TomoGPI/TP_TomoGPI_several_gradients.m
Normal file
|
@ -0,0 +1,73 @@
|
|||
clear all;close all;
|
||||
N=128;
|
||||
P=128;
|
||||
rep_name=['Projections_',num2str(P),'/'];
|
||||
iter = Iter3D(rep_name); % Create a class instance
|
||||
|
||||
iter.save_file=1;
|
||||
iter.save_volume=2;
|
||||
iter.fc=1.0;%cutoff frequency for the filtered backprojection filter
|
||||
|
||||
f_real=CreateVolumeReal(iter);
|
||||
g_real=getSinoReal(iter);
|
||||
|
||||
|
||||
[g_real,rsb_in]=addNoise(iter,g_real);
|
||||
file_name=sprintf('%s/P_ER_GPU_NOISE_%2.1fdB.s',iter.workdirectory,rsb_in);
|
||||
fid = fopen(file_name, 'wb');
|
||||
fwrite(fid,g_real ,'float');
|
||||
fclose(fid);
|
||||
|
||||
file_name=sprintf('%s/FDK_NOISE_%2.1fdB.v',iter.workdirectory,rsb_in);
|
||||
fid = fopen(file_name, 'rb');
|
||||
f_FDK_32=fread(fid,N*N*N ,'float');
|
||||
f_FDK_32=reshape(f_FDK_32,N,N,N);
|
||||
fclose(fid);
|
||||
|
||||
|
||||
|
||||
% ITERATIVE LEAST-SQUARES RECONSTRUCTION WITH QUADRATIC REGULARIZATION
|
||||
setPositivity(iter,1);
|
||||
setLambda(iter,100);
|
||||
l1=getLambda(iter);
|
||||
f1_32=CreateVolumeInit(iter);
|
||||
title1_32='lambda 100 P 32';
|
||||
doGradient(iter,f1_32,g_real,f_real);
|
||||
|
||||
setLambda(iter,0.1);
|
||||
l2=getLambda(iter);
|
||||
f2_32=CreateVolumeInit(iter);
|
||||
title2_32='lambda 0.1 P 32';
|
||||
doGradient(iter,f2_32,g_real,f_real);
|
||||
|
||||
|
||||
setLambda(iter,0);
|
||||
l3=getLambda(iter);
|
||||
f3_32=CreateVolumeInit(iter);
|
||||
title3_32='lambda 0 P 32';
|
||||
doGradient(iter,f3_32,g_real,f_real);
|
||||
|
||||
|
||||
figure(11);
|
||||
plot(f1_32(:,N/2,N/2),'b','LineWidth',1.5,'Marker','+');hold on;
|
||||
plot(f2_32(:,N/2,N/2),'g','LineWidth',1.5,'Marker','x');hold on;
|
||||
plot(f3_32(:,N/2,N/2),'c','LineWidth',1.5,'Marker','*');hold on;
|
||||
plot(f_real(:,N/2,N/2),'k','LineWidth',1.5);hold on;
|
||||
plot(f_FDK_32(:,N/2,N/2),'r','LineWidth',1.5,'Marker','o');hold on;
|
||||
legend(title1_32,title2_32,title3_32,'real','fdk');
|
||||
%legend(title1_32,'real','fdk');
|
||||
|
||||
|
||||
figure(12);
|
||||
imagesc(f_real(:,:,N/2));title('real');colorbar;colormap(gray);drawnow;
|
||||
figure(13);
|
||||
imagesc(f1_32(:,:,N/2));title(title1_32);colorbar;colormap(gray);drawnow;
|
||||
figure(14);
|
||||
imagesc(f_FDK_32(:,:,N/2));title('FDK');colorbar;colormap(gray);drawnow;
|
||||
figure(15);
|
||||
imagesc(f2_32(:,:,N/2));title(title2_32);colorbar;colormap(gray);drawnow;
|
||||
figure(16);
|
||||
imagesc(f3_32(:,:,N/2));title(title3_32);colorbar;colormap(gray);drawnow;
|
||||
|
||||
|
||||
|
16
A4/TP_GPU-master/TP3_TomoGPI/install_TomoGPI.sh
Normal file
|
@ -0,0 +1,16 @@
|
|||
#!/bin/bash
|
||||
export DATA_TP_TOMOGPI="/partage/public/ngac/TomoGPI/data_TP_TomoGPI/"
|
||||
if [ ! -d "/tmp/data_TP_TomoGPI/data3D_0256" ]
|
||||
then
|
||||
cp -r "$DATA_TP_TOMOGPI" /tmp
|
||||
fi
|
||||
export CURRENT_DIR=$(pwd)
|
||||
export TOMO_GPI=/partage/public/ngac/TomoGPI
|
||||
export MATLABPATH=$TOMO_GPI/Matlab/Tomo8:$MATLABPATH
|
||||
export MATLABPATH=$TOMO_GPI/build:$MATLABPATH
|
||||
export MATLABPATH=$CURRENT_DIR:$MATLABPATH
|
||||
export PATH=$TOMO_GPI/build/:$PATH
|
||||
alias matlab_TomoGPI='LD_PRELOAD=/usr/lib/gcc/x86_64-linux-gnu/8/libstdc++.so matlab'
|
||||
cd /tmp/data_TP_TomoGPI/data3D_0128/phantom3D_0006_shepp/
|
||||
matlab_TomoGPI &
|
||||
#matlab &
|
32
A4/TP_GPU-master/TP5_computer-vision/CMakeLists.txt
Normal file
|
@ -0,0 +1,32 @@
|
|||
# Project declaration
|
||||
project(my_project)
|
||||
|
||||
#link_directories(/usr/local/lib)
|
||||
|
||||
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
|
||||
|
||||
find_package(CUDA REQUIRED)
|
||||
#set(OpenCV_DIR "/usr/local/lib/OpenCV/")
|
||||
find_package(OpenCV REQUIRED)
|
||||
include_directories(${OpenCV_INCLUDE_DIRS} ".")
|
||||
|
||||
find_package(Qt5 COMPONENTS Core Gui Widgets REQUIRED)
|
||||
|
||||
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
|
||||
|
||||
|
||||
set(CPP_FILES ${ROOT_DIR}/mylib.cpp)
|
||||
set(CU_FILES ${ROOT_DIR}/main_nano.cu ${ROOT_DIR}/mylib.cu)
|
||||
|
||||
|
||||
set(CUDA_SEPARABLE_COMPILATION ON)
|
||||
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
|
||||
|
||||
|
||||
cuda_add_executable(my_project ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
|
||||
|
||||
set_target_properties(my_project PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_include_directories(my_project PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
|
||||
target_link_libraries(my_project Qt5::Widgets)
|
||||
target_link_libraries(my_project Qt5::Core)
|
||||
target_link_libraries(my_project opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs opencv_video opencv_videoio)
|
47
A4/TP_GPU-master/TP5_computer-vision/main_PC.cu
Normal file
|
@ -0,0 +1,47 @@
|
|||
#include "mylib.h"
|
||||
#include "mylib.cuh"
|
||||
#include "opencv2/opencv.hpp"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
|
||||
int main(int, char**)
|
||||
{
|
||||
VideoCapture cap(0); // open the default camera
|
||||
if(!cap.isOpened()) // check if we succeeded
|
||||
return -1;
|
||||
|
||||
while(1){
|
||||
Mat frame;
|
||||
cap >> frame;
|
||||
char c=(char)waitKey(25);
|
||||
if(c == '1'){ // if '1' is pressed
|
||||
Mat NB = noirBlanc(frame);
|
||||
imshow("NoirEtBlanc", NB);
|
||||
}
|
||||
else if(c == '2'){ // if '2' is pressed
|
||||
Mat seuil = seuillage(frame);
|
||||
imshow("seuillage", seuil);
|
||||
}
|
||||
else if (c == '3'){ // if '3' is pressed
|
||||
Mat cont = contour(frame);
|
||||
imshow("contour", cont);
|
||||
}
|
||||
else if (c == '4'){ // if '4' is pressed
|
||||
Mat seuilgpu = seuillageGPU(frame);
|
||||
imshow("seuillage GPU",seuilgpu);
|
||||
}
|
||||
else if(c == '0') destroyAllWindows(); // if '0' is pressed
|
||||
|
||||
else imshow("frame", frame);
|
||||
|
||||
if(c==27) // if 'Esc' is pressed
|
||||
break;
|
||||
}
|
||||
// When everything done, release the video capture object
|
||||
cap.release();
|
||||
|
||||
// Closes all the frames
|
||||
destroyAllWindows();
|
||||
|
||||
return 0;
|
||||
}
|
92
A4/TP_GPU-master/TP5_computer-vision/main_nano.cu
Normal file
|
@ -0,0 +1,92 @@
|
|||
#include "mylib.cuh"
|
||||
#include "mylib.h"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// access the camera stream
|
||||
std::string gstreamer_pipeline(int capture_width, int capture_height,
|
||||
int display_width, int display_height,
|
||||
int framerate, int flip_method) {
|
||||
return "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)" +
|
||||
std::to_string(capture_width) + ", height=(int)" +
|
||||
std::to_string(capture_height) +
|
||||
", format=(string)NV12, framerate=(fraction)" +
|
||||
std::to_string(framerate) +
|
||||
"/1 ! nvvidconv flip-method=" + std::to_string(flip_method) +
|
||||
" ! video/x-raw, width=(int)" + std::to_string(display_width) +
|
||||
", height=(int)" + std::to_string(display_height) +
|
||||
", format=(string)BGRx ! videoconvert ! video/x-raw, "
|
||||
"format=(string)BGR ! appsink";
|
||||
}
|
||||
|
||||
int main(int, char **) {
|
||||
int capture_width = 1280;
|
||||
int capture_height = 720;
|
||||
int display_width = 640;
|
||||
int display_height = 360;
|
||||
int framerate = 60;
|
||||
int flip_method = 0;
|
||||
char c = '1';
|
||||
|
||||
std::string pipeline =
|
||||
gstreamer_pipeline(capture_width, capture_height, display_width,
|
||||
display_height, framerate, flip_method);
|
||||
std::cout << "Using pipeline: \n\t" << pipeline << "\n";
|
||||
|
||||
cv::VideoCapture cap(pipeline, cv::CAP_GSTREAMER);
|
||||
|
||||
if (!cap.isOpened()) // check if we succeeded
|
||||
return -1;
|
||||
|
||||
while (1) {
|
||||
Mat frame;
|
||||
cap >> frame;
|
||||
int c_new;
|
||||
c_new = cv::waitKey(10);
|
||||
if (c_new != -1)
|
||||
c = c_new;
|
||||
|
||||
switch (c) {
|
||||
case '1': {
|
||||
imshow("frame", frame);
|
||||
break;
|
||||
}
|
||||
case '2': {
|
||||
Mat NB = noirBlanc(frame);
|
||||
imshow("NoirEtBlanc", NB);
|
||||
break;
|
||||
}
|
||||
case '3': {
|
||||
Mat seuil = seuillage(frame);
|
||||
imshow("seuillage", seuil);
|
||||
break;
|
||||
}
|
||||
case '4': {
|
||||
Mat cont = contour(frame);
|
||||
imshow("contour", cont);
|
||||
break;
|
||||
}
|
||||
case '5': {
|
||||
Mat seuilgpu = seuillageGPU(frame);
|
||||
imshow("seuillage GPU", seuilgpu);
|
||||
break;
|
||||
}
|
||||
|
||||
case '0': {
|
||||
destroyAllWindows();
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
if (c == 27) // if 'Esc' is pressed
|
||||
break;
|
||||
}
|
||||
// When everything done, release the video capture object
|
||||
cap.release();
|
||||
|
||||
// Closes all the frames
|
||||
destroyAllWindows();
|
||||
|
||||
return 0;
|
||||
}
|
41
A4/TP_GPU-master/TP5_computer-vision/main_rasp.cpp
Normal file
|
@ -0,0 +1,41 @@
|
|||
#include "mylib.h"
|
||||
#include "opencv2/opencv.hpp"
|
||||
|
||||
|
||||
int main(int, char**)
|
||||
{
|
||||
VideoCapture cap(0); // open the default camera
|
||||
if(!cap.isOpened()) // check if we succeeded
|
||||
return -1;
|
||||
|
||||
while(1){
|
||||
Mat frame;
|
||||
cap >> frame;
|
||||
char c=(char)waitKey(25);
|
||||
if(c == '1'){ // if '1' is pressed
|
||||
Mat NB = noirBlanc(frame);
|
||||
imshow("NoirEtBlanc", NB);
|
||||
}
|
||||
else if(c == '2'){ // if '2' is pressed
|
||||
Mat seuil = seuillage(frame);
|
||||
imshow("seuillage", seuil);
|
||||
}
|
||||
else if (c == '3'){ // if '3' is pressed
|
||||
Mat cont = contour(frame);
|
||||
imshow("contour", cont);
|
||||
}
|
||||
else if(c == '0') destroyAllWindows(); // if '0' is pressed
|
||||
|
||||
else imshow("frame", frame);
|
||||
|
||||
if(c==27) // if 'Esc' is pressed
|
||||
break;
|
||||
}
|
||||
// When everything done, release the video capture object
|
||||
cap.release();
|
||||
|
||||
// Closes all the frames
|
||||
destroyAllWindows();
|
||||
|
||||
return 0;
|
||||
}
|
90
A4/TP_GPU-master/TP5_computer-vision/mylib.cpp
Normal file
|
@ -0,0 +1,90 @@
|
|||
#include "mylib.h"
|
||||
//---------------------noirBlanc-----------------------
|
||||
|
||||
Mat noirBlanc(Mat frame)
|
||||
{
|
||||
Mat im_gray_out;
|
||||
|
||||
if (frame.empty())
|
||||
exit(0);
|
||||
|
||||
cvtColor(frame,im_gray_out,COLOR_RGB2GRAY);
|
||||
return im_gray_out;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//---------------------get_frame-----------------------
|
||||
|
||||
Mat get_frame(Mat frame)
|
||||
{
|
||||
if (frame.empty())
|
||||
exit(0);
|
||||
|
||||
return frame;
|
||||
}
|
||||
|
||||
//---------------------seuillage------------------------
|
||||
|
||||
Mat seuillage(Mat frame)
|
||||
{
|
||||
float nr;
|
||||
uchar r,v,b;
|
||||
Mat frame_out;
|
||||
frame_out.create(frame.rows,frame.cols,CV_8UC3);
|
||||
|
||||
if (frame.empty())
|
||||
exit(0);
|
||||
|
||||
for(int i = 0; i < frame.rows; i++)
|
||||
{
|
||||
for(int j = 0; j < frame.cols; j++)
|
||||
{
|
||||
|
||||
b = frame.at<Vec3b>(i,j)[0];
|
||||
v = frame.at<Vec3b>(i,j)[1];
|
||||
r = frame.at<Vec3b>(i,j)[2];
|
||||
|
||||
nr = r/sqrt(r*r+b*b+v*v);
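// nr is the normalized red component r/||(r,v,b)||; for pixels with nr > 0.7
// the branch below copies the red value into the green channel (a simple "red" highlight).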
|
||||
|
||||
if (nr > 0.7){
|
||||
frame_out.at<Vec3b>(i,j)[0] = b;
|
||||
frame_out.at<Vec3b>(i,j)[1] = r;
|
||||
frame_out.at<Vec3b>(i,j)[2] = r;
|
||||
}
|
||||
else{
|
||||
frame_out.at<Vec3b>(i,j)[0] = b;
|
||||
frame_out.at<Vec3b>(i,j)[1] = v;
|
||||
frame_out.at<Vec3b>(i,j)[2] = r;
|
||||
}
|
||||
}
|
||||
}
|
||||
return frame_out;
|
||||
}
|
||||
|
||||
|
||||
//---------------------contour------------------------
|
||||
|
||||
Mat contour(Mat frame)
|
||||
{
|
||||
|
||||
Mat frame_out,frame_grayt;
|
||||
|
||||
cvtColor(frame,frame_grayt,COLOR_RGB2GRAY);
|
||||
frame_out.create(frame.rows,frame.cols,CV_8UC1);
|
||||
|
||||
if (frame.empty())
|
||||
exit(0);
|
||||
|
||||
for (int i=1;i<frame.rows-1;i++){ // stop one row early: the stencil reads row i+1
|
||||
for (int j=1;j<frame.cols-1;j++){ // stop one column early: the stencil reads column j+1
|
||||
short temp;
|
||||
temp = (-1)*(short)frame_grayt.at<uchar>(i,j-1)+(-1)*(short)frame_grayt.at<uchar>(i-1,j)+(-1)*(short)frame_grayt.at<uchar>(i,j+1)+(-1)*(short)frame_grayt.at<uchar>(i+1,j)+4*(short)frame_grayt.at<uchar>(i,j);
|
||||
|
||||
frame_out.at<uchar>(i,j)=(uchar)abs(temp);
|
||||
}
|
||||
}
|
||||
return frame_out;
|
||||
}
|
||||
|
||||
|
93
A4/TP_GPU-master/TP5_computer-vision/mylib.cu
Normal file
|
@ -0,0 +1,93 @@
|
|||
#include "mylib.h"
|
||||
#include "mylib.cuh"
|
||||
|
||||
|
||||
__global__ void kernel_seuillageGPU(unsigned char *d_image_in, unsigned char *d_image_out,int size_j)
|
||||
{
|
||||
float Csum;
|
||||
int i, j, k, iFirst, jFirst;
|
||||
|
||||
iFirst = blockIdx.x*BLOCK_SIZE; // block offset within the grid of blocks
|
||||
jFirst = blockIdx.y*BLOCK_SIZE;
|
||||
|
||||
i = iFirst + threadIdx.x;// get the thread's index within the block
|
||||
j = jFirst + threadIdx.y;
|
||||
|
||||
float nr = 0;
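// The BGR frame is stored row-major as interleaved bytes, so channel c of pixel
// (i,j) sits at index c + 3*j + 3*size_j*i (size_j being the number of columns).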
|
||||
|
||||
nr=d_image_in[2+j*3+i*3*size_j]/sqrtf(d_image_in[0+j*3+i*3*size_j]*d_image_in[0+j*3+i*3*size_j]+d_image_in[1+j*3+i*3*size_j]*d_image_in[1+j*3+i*3*size_j]+d_image_in[2+j*3+i*3*size_j]*d_image_in[2+j*3+i*3*size_j]);
|
||||
|
||||
if(nr > 0.7)
|
||||
d_image_out[1+j*3+i*3*size_j] = d_image_in[2+j*3+i*3*size_j];
|
||||
else
|
||||
d_image_out[1+j*3+i*3*size_j] = d_image_in[1+j*3+i*3*size_j];
|
||||
|
||||
d_image_out[0+j*3+i*3*size_j] = d_image_in[0+j*3+i*3*size_j];
|
||||
d_image_out[2+j*3+i*3*size_j] = d_image_in[2+j*3+i*3*size_j];
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
Mat seuillageGPU( Mat in)
|
||||
{
|
||||
cudaError_t error;
|
||||
Mat out;
|
||||
out.create(in.rows,in.cols,CV_8UC3);
|
||||
|
||||
// allocate host memory
|
||||
unsigned char *h_image_in_GPU ;
|
||||
h_image_in_GPU=in.data;
|
||||
|
||||
/*cudaEvent_t start,stop,start_mem,stop_mem;
|
||||
error = cudaEventCreate(&start_mem);
|
||||
error = cudaEventCreate(&stop_mem);
|
||||
|
||||
error = cudaEventRecord(start, NULL);
|
||||
error = cudaEventSynchronize(start);*/
|
||||
|
||||
// images on device memory
|
||||
unsigned char *d_image_in_GPU;
|
||||
unsigned char *d_image_out_GPU;
|
||||
|
||||
const unsigned long int mem_size=in.cols*in.rows*3*sizeof(unsigned char);
|
||||
|
||||
// Allocate d_image_in and d_image_out on the GPU
|
||||
cudaMalloc((void**) &d_image_in_GPU,mem_size );
|
||||
cudaMalloc((void**) &d_image_out_GPU, mem_size);
|
||||
|
||||
// copy host memory to device
|
||||
cudaMemcpy(d_image_in_GPU, h_image_in_GPU,mem_size ,cudaMemcpyHostToDevice);
|
||||
|
||||
//error = cudaEventRecord(stop_mem, NULL);
|
||||
|
||||
// Wait for the stop event to complete
|
||||
//error = cudaEventSynchronize(stop_mem);
|
||||
//float msecMem = 0.0f;
|
||||
//error = cudaEventElapsedTime(&msecMem, start, stop_mem);
|
||||
|
||||
// setup execution parameters -> decomposition into threads
|
||||
dim3 threads(BLOCK_SIZE,BLOCK_SIZE);
|
||||
dim3 grid(in.rows/BLOCK_SIZE,in.cols/BLOCK_SIZE);
|
||||
|
||||
// launch the threads executed on the GPU
|
||||
kernel_seuillageGPU<<< grid, threads >>>(d_image_in_GPU, d_image_out_GPU,in.cols);
|
||||
|
||||
// Record the start event
|
||||
//error = cudaEventRecord(start_mem, NULL);
|
||||
//error = cudaEventSynchronize(start_mem);
|
||||
|
||||
// copy result from device to host
|
||||
cudaMemcpy(out.data, d_image_out_GPU, mem_size,cudaMemcpyDeviceToHost);
|
||||
cudaFree(d_image_in_GPU);
|
||||
cudaFree(d_image_out_GPU);
|
||||
/*
|
||||
float msecTotal,msecMem2;
|
||||
error = cudaEventRecord(stop, NULL);
|
||||
error = cudaEventSynchronize(stop);
|
||||
error = cudaEventElapsedTime(&msecTotal, start, stop);
|
||||
error = cudaEventElapsedTime(&msecMem2, start_mem, stop);
|
||||
*/
|
||||
return out;
|
||||
}
|
||||
|
13
A4/TP_GPU-master/TP5_computer-vision/mylib.cuh
Normal file
|
@ -0,0 +1,13 @@
|
|||
#ifndef MYLIB_CUH
|
||||
#define MYLIB_CUH
|
||||
|
||||
#include "mylib.h"
|
||||
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
|
||||
Mat seuillageGPU( Mat in);
|
||||
|
||||
|
||||
#endif
|
36
A4/TP_GPU-master/TP5_computer-vision/mylib.h
Normal file
|
@ -0,0 +1,36 @@
|
|||
#ifndef MYLIB_H
|
||||
#define MYLIB_H
|
||||
|
||||
#define SIZE_I 960
|
||||
#define SIZE_J 1280
|
||||
#define BLOCK_SIZE 16
|
||||
|
||||
#include "opencv2/opencv.hpp"
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
using namespace cv;
|
||||
|
||||
//---------------------get_frame---------------------
|
||||
|
||||
Mat get_frame(Mat frame);
|
||||
|
||||
//---------------------noirBlanc------------------
|
||||
|
||||
Mat noirBlanc(Mat frame);
|
||||
|
||||
//---------------------seuillage------------------
|
||||
|
||||
Mat seuillage(Mat frame);
|
||||
|
||||
//---------------------contour--------------------
|
||||
|
||||
Mat contour(Mat frame);
|
||||
|
||||
|
||||
#endif
|