Compare commits

...

2 Commits

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -0,0 +1 @@
/Release/

@ -0,0 +1,34 @@
cmake_minimum_required(VERSION 2.8)
project(saxpy LANGUAGES CXX CUDA)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CMAKE_C_COMPILER "gcc-8")
#set(CMAKE_CXX_COMPILER "g++-8")
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
file(GLOB CPP_FILES ${ROOT_DIR}/*.cpp)
file(GLOB CU_FILES ${ROOT_DIR}/*.cu)
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/*.cuh ${ROOT_DIR}/*.h)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
add_executable(saxpy_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(saxpy_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(saxpy_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)

@ -0,0 +1,34 @@
cmake_minimum_required(VERSION 2.8)
project(matrice)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
find_package(CUDA REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
set(CPP_FILES seuillage_C.cpp)
set(CU_FILES ${ROOT_DIR}/seuillage_main.cu)
#file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
cuda_add_executable(seuillage_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(seuillage_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)

@ -0,0 +1,17 @@
#cmake -DCMAKE_CUDA_FLAGS=”-arch=sm_30” ..
cmake_minimum_required(VERSION 3.8)
set(CUDACXX "/usr/local/cuda/bin/nvcc")
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
project(seuillage_CUDA LANGUAGES CXX CUDA)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
set(CMAKE_VERBOSE_MAKEFILE TRUE)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
add_executable(seuillage_CUDA seuillage_C.cpp seuillage_main.cu)
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_features(seuillage_CUDA PUBLIC cxx_std_11)
target_include_directories(seuillage_CUDA PUBLIC ".")

@ -0,0 +1,24 @@
//#define SEUILLAGE_H
//#ifndef SEUILLAGE_H
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#define BLOCK_SIZE 16
// Prototype
void runTest( int argc, char** argv);
extern "C" void saxpy_C(float *vector_SAXPY, float A, float *vector_X, float *vector_Y, int N);
//#endif

@ -0,0 +1,28 @@
/*
* Copiright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual properti and
* proprietari rights in and to this software and related documentation and
* ani modifications thereto. Ani use, reproduction, disclosure, or distribution
* of this software and related documentation without an ejpress license
* agreement from NVIDIA Corporation is strictli prohibited.
*
*/
/* Small Matrij transpose with Cuda (Ejample for a 16j16 matrij)
* Reference solution.
*/
#include "saxpy.h"
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
////////////////////////////////////////////////////////////////////////////////
void saxpy_C(float *vector_SAXPY, float A, float *vector_X, float *vector_Y, int N)
{
// A VOUS DE CODER
}

@ -0,0 +1,195 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Template project which demonstrates the basics on how to setup a project
* example application.
* Host code.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
using namespace std;
// includes CUDA
#include <cuda_runtime.h>
#include "saxpy.h"
__global__ void saxpy_kernel(float *vector_SAXPY, float A, float *vector_X, float *vector_Y, int N)
{
// A VOUS DE CODER
}
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
runTest( argc, argv);
}
__host__ static int iDivUp(int a, int b) {
return ((a % b != 0) ? (a / b + 1): (a/b));
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
cudaError_t error;
unsigned long int N=256*1024;
const unsigned int mem_size = N*sizeof(float);
// allocate host memory
float* h_vector_X = (float*) malloc(mem_size);
float* h_vector_Y = (float*) malloc(mem_size);
//Initilaisation des données d'entrée
float A=1.0;
for (int i=0;i<N;i++){
h_vector_X[i]=(float)rand();
h_vector_Y[i]=(float)rand();
}
////////////////////////////////////////////////////////////////////////////////
// EXECUTION SUR LE CPU
///////////////////////////////////////////////////////////////////////
// Image trait<69>e sur le CPU
float* h_vector_SAXPY_CPU = (float*) malloc( mem_size);
printf("SAXPY CPU\n");
cudaEvent_t start,stop;
error = cudaEventCreate(&start);
error = cudaEventCreate(&stop);
// Record the start event
error = cudaEventRecord(start, NULL);
error = cudaEventSynchronize(start);
//Seuillage sur CPU
// A VOUS DE CODER
// Record the start event
error = cudaEventRecord(stop, NULL);
// Wait for the stop event to complete
error = cudaEventSynchronize(stop);
float msecTotal = 0.0f;
error = cudaEventElapsedTime(&msecTotal, start, stop);
printf("CPU execution time %f\n",msecTotal);
////////////////////////////////////////////////////////////////////////////////
// EXECUTION SUR LE GPU
///////////////////////////////////////////////////////////////////////
printf("SAXPY GPU\n");
float* h_vector_SAXPY_GPU = (float*) malloc(mem_size);
// images on device memory
float* d_vector_X;
float* d_vector_Y;
float* d_vector_SAXPY;
cudaEvent_t start_mem,stop_mem;
error = cudaEventCreate(&start_mem);
error = cudaEventCreate(&stop_mem);
error = cudaEventRecord(start, NULL);
error = cudaEventSynchronize(start);
// Alocation mémoire de d_vector_X, d_vector_Y et d_vector_SAXPY sur la carte GPU
// A VOUS DE CODER
// copy host memory to device
// A VOUS DE CODER
error = cudaEventRecord(stop_mem, NULL);
// Wait for the stop event to complete
error = cudaEventSynchronize(stop_mem);
float msecMem = 0.0f;
error = cudaEventElapsedTime(&msecMem, start, stop_mem);
// setup execution parameters -> découpage en threads
// A VOUS DE CODER
// lancement des threads executé sur la carte GPU
// A VOUS DE CODER
error = cudaEventRecord(start_mem, NULL);
error = cudaEventSynchronize(start_mem);
// copy result from device to host
// A VOUS DE CODER
// cleanup device memory
// COMMENTAIRES A ENLEVER
//cudaFree(d_vector_X);
//cudaFree(d_vector_Y);
//cudaFree(d_vector_SAXPY);
error = cudaEventRecord(stop, NULL);
// Wait for the stop event to complete
error = cudaEventSynchronize(stop);
msecTotal = 0.0f;
error = cudaEventElapsedTime(&msecTotal, start, stop);
float msecMem2 =0.0f;
error = cudaEventElapsedTime(&msecMem2, start_mem, stop);
msecMem+=msecMem2;
printf("GPU execution time %f ms (memory management %2.2f \%)\n",msecTotal,(msecMem)/(msecTotal)*100);
float sum_diff=0;
for(int i=0;i<N;i++)
sum_diff+= h_vector_SAXPY_GPU[i]-h_vector_SAXPY_CPU[i];
printf("sum_diff = %f\n",sum_diff);
// cleanup memory
free(h_vector_X);
free(h_vector_Y);
free(h_vector_SAXPY_GPU);
free(h_vector_SAXPY_CPU);
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 166 KiB

@ -0,0 +1,94 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="com.nvidia.cuda.ide.nine_zero.configuration.release.1804067204">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.nine_zero.configuration.release.1804067204" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release" cleanCommand="rm -rf" description="" id="com.nvidia.cuda.ide.nine_zero.configuration.release.1804067204" name="Release" parent="com.nvidia.cuda.ide.nine_zero.configuration.release">
<folderInfo id="com.nvidia.cuda.ide.nine_zero.configuration.release.1804067204." name="/" resourcePath="">
<toolChain id="com.nvidia.cuda.ide.toolchain.nine_zero.69243472" name="CUDA Toolkit 10.0" superClass="com.nvidia.cuda.ide.toolchain.nine_zero">
<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.718907530" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
<builder buildPath="${workspace_loc:/matrice}/Release" id="com.nvidia.cuda.ide.builder.960504214" keepEnvironmentInBuildfile="false" name="CUDA Toolkit 9.2 Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="com.nvidia.cuda.ide.builder"/>
<tool id="nvcc.compiler.base.376896027" name="NVCC Compiler" superClass="nvcc.compiler.base">
<option id="nvcc.compiler.include.paths.602427994" name="Include paths (-I)" superClass="nvcc.compiler.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/usr/local/cuda/samples/common/inc"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/${ProjName}}/inc&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/${ProjName}}/src/CUDA&quot;"/>
</option>
<option id="nvcc.compiler.deviceDebug.1868299221" name="Generate device debug information (-G)" superClass="nvcc.compiler.deviceDebug"/>
<option id="nvcc.compiler.option.level.1923776877" name="Generate host debug information (-g)" superClass="nvcc.compiler.option.level"/>
<option defaultValue="nvcc.compiler.optimization.level.most" id="nvcc.compiler.optimization.level.461195553" name="Optimization Level" superClass="nvcc.compiler.optimization.level" value="nvcc.compiler.optimization.level.most" valueType="enumerated"/>
<option id="nvcc.compiler.pic.2021621233" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic" value="true" valueType="boolean"/>
<inputType id="nvcc.compiler.input.cu.212130830" superClass="nvcc.compiler.input.cu"/>
<inputType id="nvcc.compiler.input.cpp.1544991377" superClass="nvcc.compiler.input.cpp"/>
<inputType id="nvcc.compiler.input.c.298636231" superClass="nvcc.compiler.input.c"/>
</tool>
<tool id="nvcc.linker.base.815597997" name="NVCC Linker" superClass="nvcc.linker.base">
<option id="nvcc.linker.option.paths.539821132" name="Library search path (-L)" superClass="nvcc.linker.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="/usr/local/cuda/lib"/>
</option>
<option id="nvcc.linker.option.libs.680956661" name="Libraries (-l)" superClass="nvcc.linker.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="cublas"/>
</option>
<inputType id="nvcc.linker.input.1893304424" superClass="nvcc.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="nvcc.archiver.base.24620523" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
<tool id="com.nvidia.host.assembler.1797002694" name="Host Assembler" superClass="com.nvidia.host.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.195427270" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
<sourceEntries>
<entry excluding="matrixMul_kernel.cuh" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name="src"/>
</sourceEntries>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration">
<executable devicelink="false">
<ptx major="5" minor="2"/>
<sass major="5" minor="2"/>
</executable>
<editor-arch major="5" minor="2"/>
</storageModule>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="matrice.com.nvidia.cuda.ide.nine_zero.exe.915560355" name="Executable" projectType="com.nvidia.cuda.ide.nine_zero.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243;com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243.;nvcc.compiler.base.1292094216;nvcc.compiler.input.cu.2074083432">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243;com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243.;nvcc.compiler.base.1292094216;nvcc.compiler.input.c.2064063779">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243;com.nvidia.cuda.ide.nine_zero.configuration.debug.2018016243.;nvcc.compiler.base.1292094216;nvcc.compiler.input.cpp.679408217">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/matrice"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>matrice</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

@ -0,0 +1,3 @@
eclipse.preferences.version=1
project-sync-config=<?xml version\="1.0" encoding\="UTF-8"?>\n<sync-configs>\n<sync-config config-name\="Local" connection-name\="Local" location\="/home/gac/L2S_Programmation/TP_GPU/TP_matrice_CUDA_10" remote-services-id\="org.eclipse.ptp.remote.LocalServices" sync-on-postbuild\="true" sync-on-prebuild\="true" sync-on-save\="true" sync-provider-id\="org.eclipse.ptp.rdt.sync.git.core.synchronizeService">\n<config-properties com.nvidia.cuda.ide.build.core.Cpu\="Native"/>\n</sync-config>\n</sync-configs>
sync-mode=NONE

@ -0,0 +1,33 @@
#cmake -DCMAKE_CUDA_FLAGS=”-arch=sm_30” ..
cmake_minimum_required(VERSION 2.8)
project(matrice)
set(CUDA_TOOLKIT_ROOT_DIR "/partage/public/ngac/cuda-11.2")
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
find_package(CUDA REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/inc/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
set(CPP_FILES ${ROOT_DIR}/src/CPP/matrixMul_C.cpp)
set(CU_FILES ${ROOT_DIR}/src/CPP/matrixMul_cuda.cu ${ROOT_DIR}/src/CPP/matrixMul_CUBLAS.cu ${ROOT_DIR}/src/CPP/matrixMul_main.cu)
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/src/CUDA\" -I\"/usr/local/cuda/samples/common/inc/\"")
cuda_add_executable(matrice_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(matrice_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(matrice_CUDA PUBLIC ${ROOT_DIR}/inc/ ${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc)
target_link_libraries(matrice_CUDA ${CUDA_CUBLAS_LIBRARIES})

@ -0,0 +1,37 @@
#cmake -DCMAKE_CUDA_FLAGS=-arch=sm_30 ..
cmake_minimum_required(VERSION 3.8)
#set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
project(Matrice_project LANGUAGES CXX CUDA)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
set(CMAKE_VERBOSE_MAKEFILE TRUE)
set(ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
set(EXE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
list(APPEND HEADER_REP ${ROOT_DIR}/inc ${ROOT_DIR}/src/CUDA ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
#COMPILATION LIBRAIRIE PUIS EXECUTABLE
#add_library(matrice ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
#set_target_properties(matrice PROPERTIES POSITION_INDEPENDENT_CODE ON)
#target_compile_features(matrice PUBLIC cxx_std_11)
#target_include_directories(matrice PUBLIC "${HEADER_REP}")
#add_executable(matrice_exe "${EXE_DIR}/src/CPP/matrixMul_main.cu")
#target_link_libraries(matrice_exe PUBLIC matrice)
#COMPILATION EXECUTABLE (sans generer avant une librairie => tous les fichers sources sont pris en compte)
add_executable(matrice_exe ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(matrice_exe PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_features(matrice_exe PUBLIC cxx_std_11)
target_include_directories(matrice_exe PUBLIC "${HEADER_REP}")
target_link_libraries(matrice_exe PUBLIC cublas)

@ -0,0 +1,31 @@
#cmake -DCMAKE_CUDA_FLAGS=”-arch=sm_30” ..
cmake_minimum_required(VERSION 2.8)
project(matrice)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
find_package(CUDA REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/inc/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
set(CPP_FILES ${ROOT_DIR}/src/CPP/matrixMul_C.cpp)
set(CU_FILES ${ROOT_DIR}/src/CPP/matrixMul_cuda.cu ${ROOT_DIR}/src/CPP/matrixMul_CUBLAS.cu ${ROOT_DIR}/src/CPP/matrixMul_main.cu)
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/src/CUDA\" -I\"/usr/local/cuda/samples/common/inc/\"")
cuda_add_executable(matrice_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(matrice_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(matrice_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
target_link_libraries(matrice_CUDA ${CUDA_CUBLAS_LIBRARIES})

@ -0,0 +1,38 @@
#ifndef _MATRIXMUL_H_
#define _MATRIXMUL_H_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cuda_runtime.h>
//#include <helper_functions.h>
//#include <helper_cuda.h>
#include "cublas.h"
// Thread block size
#define BLOCK_SIZE 16
typedef enum type_version_kernel {v0,v0_bis,v1} Type_version_kernel;
void computeGold( float*, const float*, const float*, unsigned int, unsigned int, unsigned int);
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
/*
#define WA (64 * BLOCK_SIZE) // Matrix A width
#define HA (64 * BLOCK_SIZE) // Matrix A height
#define WB (64 * BLOCK_SIZE) // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
*/
#
void compute_matrixMul_C(int W);
void compute_matrixMul_cublas(int N);
void compute_matrixMul_cuda(int N,Type_version_kernel v);
void randomInit(float*, int);
void printDiff(float*, float*, int, int);
#endif // _MATRIXMUL_H_

@ -0,0 +1,137 @@
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
#include "matrixMul.h"
////////////////////////////////////////////////////////////////////////////////
// export C interface
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
//! C = A * B
//! @param C reference data, computed but preallocated
//! @param A matrix A as provided to device
//! @param B matrix B as provided to device
//! @param hA height of matrix A
//! @param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
computeGold(float* C, const float* A, const float* B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j) {
double sum = 0;
for (unsigned int k = 0; k < wA; ++k) {
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;
}
C[i * wB + j] = (float)sum;
}
}
void compute_matrixMul_C(int W){
// allocate host memory for matrices A and B
unsigned int size_A = W * W;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = W * W;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
// initialize host memory
//randomInit(h_A, size_A);
//randomInit(h_B, size_B);
// allocate host memory for the result
unsigned int size_C = W * W;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
// create and start timer
cudaDeviceSynchronize();
// Allocate CUDA events that we'll use for timing
cudaEvent_t event[2];
cudaEventCreate(event+0);
cudaEventCreate(event+1);
cudaEventRecord(event[0], NULL);
computeGold(h_C, h_A, h_B, W, W, W);
cudaEventRecord(event[1], NULL);
cudaEventSynchronize(event[1]);
float msecTotal = 0.0f;
cudaEventElapsedTime(&msecTotal, event[0], event[1]);
printf("Time= \t\t\t\t %.3f msec\n",msecTotal);
free(h_A);
free(h_B);
free(h_C);
}
// Allocates a matrix with random float entries.
void randomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void printDiff(float *data1, float *data2, int width, int height)
{
int i,j,k;
int error_count=0;
for (j=0; j<height; j++) {
for (i=0; i<width; i++) {
k = j*width+i;
if (data1[k] != data2[k]) {
printf("diff(%d,%d) CPU=%4.4f, GPU=%4.4f n", i,j, data1[k], data2[k]);
error_count++;
}
}
}
printf(" nTotal Errors = %d n", error_count);
}

@ -0,0 +1,111 @@
#include "matrixMul.h"
void compute_matrixMul_cublas(int N)
{
float alpha = 1.0f, beta = 0.0f;
// allocate host memory for matrices A and B
unsigned int size_A = N * N;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
//float* h_Abis = (float*) malloc(mem_size_A);
unsigned int size_B = N * N;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
//float* h_Bbis = (float*) malloc(mem_size_B);
// allocate host memory for the result
unsigned int size_C = N * N;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
cublasInit();
// set seed for rand()
srand(2006);
// initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// Allocate CUDA events that we'll use for timing
cudaEvent_t record_event[5];
float time_msec[4];
for (int i=0;i<5;i++){
cudaEventCreate(record_event+i);
}
// Record the start event
cudaDeviceSynchronize();
cudaEventRecord(record_event[0], NULL);
float* d_A;
cublasAlloc(N*N, sizeof(float), (void **)&d_A);
float* d_B;
cublasAlloc(N*N, sizeof(float), (void **)&d_B);
float* d_C;
cublasAlloc(N*N, sizeof(float), (void **)&d_C);
cudaEventRecord(record_event[1], NULL);
cudaEventSynchronize(record_event[1]);
// copy host memory to device
cublasSetMatrix(N,N, sizeof(float), h_A, N, d_A, N);
cublasSetMatrix(N,N, sizeof(float), h_B, N, d_B, N);
cudaEventRecord(record_event[2], NULL);
cudaEventSynchronize(record_event[2]);
cublasSgemm('n', 'n', N, N, N, alpha, d_A, N,d_B, N, beta, d_C, N);
cudaEventRecord(record_event[3], NULL);
cudaEventSynchronize(record_event[3]);
cublasGetMatrix(N,N, sizeof(float), d_C,N, h_C, N);
cudaEventRecord(record_event[4], NULL);
cudaEventSynchronize(record_event[4]);
cudaEventElapsedTime(time_msec+0, record_event[0], record_event[1]);
cudaEventElapsedTime(time_msec+1, record_event[1], record_event[2]);
cudaEventElapsedTime(time_msec+2, record_event[2], record_event[3]);
cudaEventElapsedTime(time_msec+3, record_event[3], record_event[4]);
time_msec[4]=time_msec[0]+time_msec[1]+time_msec[2]+time_msec[3];
printf("TOTAL : \t\t\t %f (ms) dont %.2f%% de gestion mémoire \n",time_msec[4],100*(time_msec[0]+time_msec[1]+time_msec[3])/time_msec[4]);
cublasShutdown();
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}

@ -0,0 +1,161 @@
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/* Matrix multiplication: C = A * B.
* Host code.
*
* This sample implements matrix multiplication and is exactly the same as
* Chapter 7 of the programming guide.
* It has been written for clarity of exposition to illustrate various CUDA
* programming principles, not with the goal of providing the most
* performant generic kernel for matrix multiplication.
*
* CUBLAS provides high-performance matrix multiplication.
*/
// includes, system
#include "matrixMul.h"
// includes, kernels
#include "matrixMul_kernel.cuh"
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void compute_matrixMul_cuda(int N,Type_version_kernel version_kernel)
{
// set seed for rand()
//srand(2006);
// allocate host memory for matrices A and B
unsigned int size_A = N * N;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = N * N;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
// allocate host memory for the result
unsigned int size_C = N * N;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
// Allocate CUDA events that we'll use for timing
cudaEvent_t record_event[5];
float time_msec[4];
for (int i=0;i<5;i++){
cudaEventCreate(record_event+i);
}
// Record the start event
cudaDeviceSynchronize();
cudaEventRecord(record_event[0], NULL);
// allocate device memory
float* d_A;
cudaMalloc((void**) &d_A, mem_size_A);
float* d_B;
cudaMalloc((void**) &d_B, mem_size_B);
// allocate device memory for result
float* d_C;
cudaMalloc((void**) &d_C, mem_size_C);
cudaEventRecord(record_event[1], NULL);
cudaEventSynchronize(record_event[1]);
// copy host memory to device
cudaMemcpy(d_A, h_A, mem_size_A,
cudaMemcpyHostToDevice) ;
cudaMemcpy(d_B, h_B, mem_size_B,
cudaMemcpyHostToDevice) ;
cudaEventRecord(record_event[2], NULL);
cudaEventSynchronize(record_event[2]);
// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(N/threads.x, N/threads.y);
// execute the kernel
switch(version_kernel){
case v0 : matrixMul_v0<<< grid, threads >>>(d_C, d_A, d_B, N); break;
case v0_bis : matrixMul_v0_bis<<< grid, threads >>>(d_C, d_A, d_B, N); break;
case v1 : matrixMul_v1<<< grid, threads >>>(d_C, d_A, d_B, N,N); break;
}
cudaEventRecord(record_event[3], NULL);
cudaEventSynchronize(record_event[3]);
// copy result from device to host
cudaMemcpy(h_C, d_C, mem_size_C,
cudaMemcpyDeviceToHost) ;
cudaEventRecord(record_event[4], NULL);
cudaEventSynchronize(record_event[4]);
cudaEventElapsedTime(time_msec+0, record_event[0], record_event[1]);
cudaEventElapsedTime(time_msec+1, record_event[1], record_event[2]);
cudaEventElapsedTime(time_msec+2, record_event[2], record_event[3]);
cudaEventElapsedTime(time_msec+3, record_event[3], record_event[4]);
time_msec[4]=time_msec[0]+time_msec[1]+time_msec[2]+time_msec[3];
printf("TOTAL : \t\t\t %f (ms) dont %.2f%% de gestion mémoire \n",time_msec[4],100*(time_msec[0]+time_msec[1]+time_msec[3])/time_msec[4]);
// clean up memory
free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}

@ -0,0 +1,108 @@
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/* Matrix multiplication: C = A * B.
* Host code.
*
* This sample implements matrix multiplication and is exactly the same as
* Chapter 7 of the programming guide.
* It has been written for clarity of exposition to illustrate various CUDA
* programming principles, not with the goal of providing the most
* performant generic kernel for matrix multiplication.
*
* CUBLAS provides high-performance matrix multiplication.
*/
// includes, system
#include "matrixMul.h"
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, const char** argv)
{
//findCudaDevice(argc,argv);
int n;
printf("\nmatrix_Mul_CUDA_C\n\n");
for (n=7;n<11;n++){
unsigned int taille_matrice;
taille_matrice=(unsigned int)pow((float)2.0,n);
printf("MATRICE DE TAILLE %d\n",taille_matrice);
compute_matrixMul_C(taille_matrice);
}
printf("\nmatrix_Mul_CUDA_v0\n\n");
for (n=7;n<11;n++){
unsigned int taille_matrice;
taille_matrice=(unsigned int)pow((float)2.0,n);
printf("MATRICE DE TAILLE %d\n",taille_matrice);
compute_matrixMul_cuda(taille_matrice,v0);
}
printf("\nmatrix_Mul_CUDA_v0_bis\n\n");
for (n=7;n<11;n++){
unsigned int taille_matrice;
taille_matrice=(unsigned int)pow((float)2.0,n);
printf("MATRICE DE TAILLE %d\n",taille_matrice);
compute_matrixMul_cuda(taille_matrice,v0_bis);
}
printf("\nmatrix_Mul_CUDA_v1\n\n");
for (n=7;n<11;n++){
unsigned int taille_matrice;
taille_matrice=(unsigned int)pow((float)2.0,n);
printf("MATRICE DE TAILLE %d\n",taille_matrice);
compute_matrixMul_cuda(taille_matrice,v1);
}
printf("\nmatrix_Mul_CUBLAS\n\n");
for (n=7;n<11;n++){
unsigned int taille_matrice;
taille_matrice=(unsigned int)pow((float)2.0,n);
printf("MATRICE DE TAILLE %d\n",taille_matrice);
compute_matrixMul_cublas(taille_matrice);
}
//CUT_EXIT(argc, argv);
}

@ -0,0 +1,198 @@
/*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/* Matrix multiplication: C = A * B.
* Device code.
*/
#ifndef _MATRIXMUL_KERNEL_H_
#define _MATRIXMUL_KERNEL_H_
#include <stdio.h>
#include "matrixMul.h"
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) CUT_BANK_CHECKER(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) CUT_BANK_CHECKER(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
////////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! N is matrix dimension
////////////////////////////////////////////////////////////////////////////////
__global__ void
matrixMul_v0( float* C, float* A, float* B,int matrix_size)
{
int k;
float C_sum;
int i_0,j_0;
int i,j;
int a,b,c;
i_0=blockIdx.x*BLOCK_SIZE;
j_0=blockIdx.y*BLOCK_SIZE;
i=i_0+threadIdx.x;
j=j_0+threadIdx.y;
a=i*matrix_size;
b=j;
c=j+i*matrix_size;
for (k = 0; k < matrix_size; ++k){
C_sum += A[a] * B[b];
a++;
b+=matrix_size;
}
C[c] = C_sum;
}
__global__ void
matrixMul_v0_bis( float* C, float* A, float* B,int matrix_size)
{
int k;
float C_sum;
int i_0,j_0;
int i,j;
int a,b,c;
j_0=blockIdx.x*BLOCK_SIZE;
i_0=blockIdx.y*BLOCK_SIZE;
j=j_0+threadIdx.x;
i=i_0+threadIdx.y;
a=i*matrix_size;
b=j;
c=j+i*matrix_size;
for (k = 0; k < matrix_size; ++k){
C_sum += A[a] * B[b];
a++;
b+=matrix_size;
}
C[c] = C_sum;
}
__global__ void
matrixMul_v1( float* C, float* A, float* B, int wA, int wB)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + wA * ty + tx];
BS(ty, tx) = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_

@ -0,0 +1,77 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367" name="Release" parent="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367">
<folderInfo id="com.nvidia.cuda.ide.eight_zero.configuration.release.381529367." name="/" resourcePath="">
<toolChain id="com.nvidia.cuda.ide.toolchain.eight_zero.exe.release.1272863516" name="CUDA Toolkit 8.0">
<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.677042486" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
<builder buildPath="${workspace_loc:/seuillage_CUDA_8}/Release" id="com.nvidia.cuda.ide.builder.1452012632" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="CUDA Toolkit 8.0 Builder" superClass="com.nvidia.cuda.ide.builder"/>
<tool id="nvcc.compiler.base.1547368432" name="NVCC Compiler" superClass="nvcc.compiler.base">
<option id="nvcc.compiler.deviceDebug.1647831981" name="Generate device debug information (-G)" superClass="nvcc.compiler.deviceDebug"/>
<option id="nvcc.compiler.option.level.16726990" name="Generate host debug information (-g)" superClass="nvcc.compiler.option.level"/>
<option defaultValue="nvcc.compiler.optimization.level.most" id="nvcc.compiler.optimization.level.1222624320" name="Optimization Level" superClass="nvcc.compiler.optimization.level" valueType="enumerated"/>
<option id="nvcc.compiler.pic.1176654696" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic"/>
<option id="nvcc.compiler.include.paths.393958652" name="Include paths (-I)" superClass="nvcc.compiler.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="/opt/cuda/samples/common/inc"/>
</option>
<inputType id="nvcc.compiler.input.cu.1459843904" superClass="nvcc.compiler.input.cu"/>
<inputType id="nvcc.compiler.input.cpp.360700336" superClass="nvcc.compiler.input.cpp"/>
<inputType id="nvcc.compiler.input.c.1943821627" superClass="nvcc.compiler.input.c"/>
</tool>
<tool id="nvcc.linker.base.602397088" name="NVCC Linker" superClass="nvcc.linker.base">
<inputType id="nvcc.linker.input.862625666" superClass="nvcc.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="nvcc.archiver.base.342423531" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
<tool id="com.nvidia.host.assembler.1673437193" name="Host Assembler" superClass="com.nvidia.host.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.1548523292" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration">
<executable devicelink="false">
<sass major="3" minor="0"/>
<ptx major="3" minor="0"/>
</executable>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="seuillage_CUDA_8.com.nvidia.cuda.ide.eight_zero.exe.270714933" name="Executable"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436;com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436.;nvcc.compiler.base.1174254289;nvcc.compiler.input.cpp.2134833342">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436;com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436.;nvcc.compiler.base.1174254289;nvcc.compiler.input.c.691715125">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436;com.nvidia.cuda.ide.eight_zero.configuration.debug.823155436.;nvcc.compiler.base.1174254289;nvcc.compiler.input.cu.1571068709">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope"/>
</cproject>

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>seuillage_CUDA_8</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

@ -0,0 +1,34 @@
cmake_minimum_required(VERSION 2.8)
project(seuillage LANGUAGES CXX CUDA)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CMAKE_C_COMPILER "gcc-8")
#set(CMAKE_CXX_COMPILER "g++-8")
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
file(GLOB CPP_FILES ${ROOT_DIR}/*.cpp)
file(GLOB CU_FILES ${ROOT_DIR}/*.cu)
file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/*.cuh ${ROOT_DIR}/*.h)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
add_executable(seuillage_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(seuillage_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)

@ -0,0 +1,34 @@
cmake_minimum_required(VERSION 2.8)
project(matrice)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
find_package(CUDA REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
set(CPP_FILES seuillage_C.cpp)
set(CU_FILES ${ROOT_DIR}/seuillage_main.cu)
#file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
cuda_add_executable(seuillage_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(seuillage_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)

@ -0,0 +1,17 @@
#cmake -DCMAKE_CUDA_FLAGS=”-arch=sm_30” ..
cmake_minimum_required(VERSION 3.8)
set(CUDACXX "/usr/local/cuda/bin/nvcc")
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
project(seuillage_CUDA LANGUAGES CXX CUDA)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
set(CMAKE_VERBOSE_MAKEFILE TRUE)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
add_executable(seuillage_CUDA seuillage_C.cpp seuillage_main.cu)
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_features(seuillage_CUDA PUBLIC cxx_std_11)
target_include_directories(seuillage_CUDA PUBLIC ".")

@ -0,0 +1,27 @@
//#define SEUILLAGE_H
//#ifndef SEUILLAGE_H
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#define SIZE_I 960
#define SIZE_J 1280
#define BLOCK_SIZE 16
// Prototype
void runTest( int argc, char** argv);
extern "C" void seuillage_C( float reference[][SIZE_J][SIZE_I] , float idata[][SIZE_J][SIZE_I] );
//#endif

@ -0,0 +1,41 @@
/*
* Copiright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual properti and
* proprietari rights in and to this software and related documentation and
* ani modifications thereto. Ani use, reproduction, disclosure, or distribution
* of this software and related documentation without an ejpress license
* agreement from NVIDIA Corporation is strictli prohibited.
*
*/
/* Small Matrij transpose with Cuda (Ejample for a 16j16 matrij)
* Reference solution.
*/
#include "seuillage.h"
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
////////////////////////////////////////////////////////////////////////////////
void seuillage_C(float image_out[][SIZE_J][SIZE_I], float image_in[][SIZE_J][SIZE_I])
{
float r, g, b;
for(int j=0; j<SIZE_J; j++){
for(int i=0; i<SIZE_I; i++){
r = image_in[0][j][i];
g = image_in[1][j][i];
b = image_in[2][j][i];
if(r/sqrt(r*r+g*g+b*b) > 0.7){
image_out[0][j][i] = r;
image_out[1][j][i] = g;
image_out[2][j][i] = b;
}
}
}
}

@ -0,0 +1,229 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Template project which demonstrates the basics on how to setup a project
* example application.
* Host code.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
using namespace std;
// includes CUDA
#include <cuda_runtime.h>
#include "seuillage.h"
__global__ void seuillage_kernel(float d_image_in[][SIZE_J][SIZE_I],float d_image_out[][SIZE_J][SIZE_I])
{
// A VOUS DE CODER
int i,j;
j = blockIdx.x;
i = threadIdx.x;
float r,g,b;
r = d_image_in[0][j][i];
g = d_image_in[1][j][i];
b = d_image_in[2][j][i];
if((r/sqrt(r*r+g*g+b*b))> 0.7){
d_image_out[0][j][i]=r;
d_image_out[1][j][i]=g;
d_image_out[2][j][i]=b;
}
}
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
runTest( argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
cudaError_t error;
if (argc<2)
printf("indiquer le chemin du repertoire contenant les images\n");
const unsigned int mem_size = sizeof(float) * 3* SIZE_J * SIZE_I;
// allocate host memory
float* h_image_in = (float*) malloc(mem_size);
//Initilaisation du volume d'entr<74>e
FILE *file_ptr;
char name_file_in[512];
sprintf(name_file_in,"%s/ferrari.raw",argv[1]);
printf("%s",name_file_in);
file_ptr=fopen(name_file_in,"rb");
fread(h_image_in,sizeof(float),3*SIZE_J*SIZE_I,file_ptr);
fclose(file_ptr);
////////////////////////////////////////////////////////////////////////////////
// EXECUTION SUR LE CPU
///////////////////////////////////////////////////////////////////////
// Image trait<69>e sur le CPU
float* h_image_out_CPU = (float*) malloc( mem_size);
printf("Seuillage CPU d'une image couleur \n");
cudaEvent_t start,stop;
error = cudaEventCreate(&start);
error = cudaEventCreate(&stop);
// Record the start event
error = cudaEventRecord(start, NULL);
error = cudaEventSynchronize(start);
//Seuillage sur CPU
seuillage_C( (float (*)[SIZE_J][SIZE_I])h_image_out_CPU, (float (*)[SIZE_J][SIZE_I])h_image_in);
// Record the start event
error = cudaEventRecord(stop, NULL);
// Wait for the stop event to complete
error = cudaEventSynchronize(stop);
float msecTotal = 0.0f;
error = cudaEventElapsedTime(&msecTotal, start, stop);
printf("CPU execution time %f\n",msecTotal);
//Sauvegarde de l'image resultat
char name_file_out_CPU[512];
sprintf(name_file_out_CPU,"%s/ferrari_out_CPU.raw",argv[1]);
file_ptr=fopen(name_file_out_CPU,"wb");
fwrite(h_image_out_CPU,sizeof(float),3*SIZE_J*SIZE_I,file_ptr);
fclose(file_ptr);
////////////////////////////////////////////////////////////////////////////////
// EXECUTION SUR LE GPU
///////////////////////////////////////////////////////////////////////
cudaEvent_t start_mem,stop_mem;
error = cudaEventCreate(&start_mem);
error = cudaEventCreate(&stop_mem);
error = cudaEventRecord(start, NULL);
error = cudaEventSynchronize(start);
float* h_image_out_GPU = (float*) malloc(mem_size);
// images on device memory
float* d_image_in;
float* d_image_out;
// Alocation mémoire de d_image_in et d_image_out sur la carte GPU
// A VOUS DE CODER
cudaMalloc((void **) &d_image_in, mem_size);
cudaMalloc((void **) &d_image_out, mem_size);
// copy host memory to device
// A VOUS DE CODER
cudaMemcpy(d_image_in, h_image_in, mem_size, cudaMemcpyHostToDevice);
error = cudaEventRecord(stop_mem, NULL);
// Wait for the stop event to complete
error = cudaEventSynchronize(stop_mem);
float msecMem = 0.0f;
error = cudaEventElapsedTime(&msecMem, start, stop_mem);
// setup execution parameters -> découpage en threads
// A VOUS DE CODER
dim3 threads(960);
dim3 grid(1280);
// lancement des threads executé sur la carte GPU
// A VOUS DE CODER
// INDICATION : pour les parametres de la fonction kernel seuillage_kernel, vous ferez un changement de type (float *) vers (float (*)[SIZE_J][SIZE_I])
// inspirez vous du lancement de la fonction seuillage_C dans le main.
seuillage_kernel<<< grid,threads >>>((float (*)[SIZE_J][SIZE_I])d_image_in, (float (*)[SIZE_J][SIZE_I])d_image_out);
// Record the start event
error = cudaEventRecord(start_mem, NULL);
error = cudaEventSynchronize(start_mem);
// copy result from device to host
// A VOUS DE CODER
cudaMemcpy(h_image_out_GPU, d_image_out, mem_size, cudaMemcpyDeviceToHost);
// cleanup device memory
//ENLEVEZ LES COMMENTAIRES
cudaFree(d_image_in);
cudaFree(d_image_out);
error = cudaEventRecord(stop, NULL);
// Wait for the stop event to complete
error = cudaEventSynchronize(stop);
msecTotal = 0.0f;
error = cudaEventElapsedTime(&msecTotal, start, stop);
float msecMem2 =0.0f;
error = cudaEventElapsedTime(&msecMem2, start_mem, stop);
msecMem+=msecMem2;
printf("GPU execution time %f ms (memory management %2.2f \%)\n",msecTotal,(msecMem)/(msecTotal)*100);
// Enregistrement de l'image de sortie sur un fichier
char name_file_out_GPU[512];
sprintf(name_file_out_GPU,"%s/ferrari_out_GPU.raw",argv[1]);
file_ptr=fopen(name_file_out_GPU,"wb");
fwrite(h_image_out_GPU,sizeof(float),3*SIZE_J*SIZE_I,file_ptr);
fclose(file_ptr);
// cleanup memory
free(h_image_in);
free(h_image_out_GPU);
free(h_image_out_CPU);
}

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 154 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 132 KiB

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 162 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 132 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 132 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 132 KiB

Binary file not shown.

@ -0,0 +1,51 @@
%Ouverture d'une image au format raw
fid = fopen('../Image/carre.raw', 'rb');
image_in=fread(fid, 512*512, 'single');
image_in=reshape(image_in,512,512);
fclose(fid);
%Affichage d'une image couleur avec image
figure('name','Image in','numbertitle','off');imagesc(image_in);colormap(gray);
%Taille d'une image
taille=size(image_in);
display(taille);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%tic toc pour mesurer le temps de calcul
tic;
hx=[-1 0 1];
hy=[1;0;-1];
h=conv2(hx,hy)
h2D=[0 1 0;-1 0 1;0 -1 0];
image_out=conv2(image_in,h2D,'same');
image_out=abs(image_out);
%pour la convolution regardez l'aide pour conv2
%pour le filtre median regardez l'aide de medfilt2
toc;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Affichage d'une image avec imagesc
figure('name','Image out','numbertitle','off');imagesc(image_out);colormap gray;
%Sauvegarde d'une image au format jpg
imwrite(image_out,'../Image/carre_out.jpg','jpg');
%Sauvegarde d'une image au format raw
fid = fopen('../Image/carre_out.raw', 'w');
fwrite(fid, image_out, 'single');
fclose(fid);

@ -0,0 +1,14 @@
function raw2jpg(name_proc)
%fid = fopen('ferrari.raw', 'r');
file_name_in=['../Image/ferrari_out_' name_proc '.raw'];
fid = fopen(file_name_in, 'r');
ima=single(fread(fid,1280*960*3, 'single'));
fclose(fid);
ima=reshape(ima,960,1280,3);
ima=ima./255;
figure('name',name_proc,'numbertitle','off');image(ima);
file_name_out=['../Image/ferrari_out_' name_proc '.jpg'];
imwrite(ima,file_name_out,'jpg');
end

@ -0,0 +1,69 @@
close all;
%Ouverture d'une image au format couleur
ima=single(imread('../Image/ferrari.jpg'));
ima=ima./255;
%Affichage d'une image couleur avec image
figure('name','RGB in','numbertitle','off');image(ima);
%Taille d'une image
taille=size(ima);
display(taille);
ima_r=ima(:,:,1);
ima_g=ima(:,:,2);
ima_b=ima(:,:,3);
%Affichage d'un niveau de couleur de l'image
figure('name','R','numbertitle','off');imagesc(ima_r);colormap gray %Niveau de rouge
figure('name','G','numbertitle','off');imagesc(ima_g);colormap gray %Niveau de vert
figure('name','B','numbertitle','off');imagesc(ima_b);colormap gray %Niveau de bleu
%Taille d'une image
taille=size(ima);
display(taille);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%tic toc pour mesurer le temps de calcul
disp("Seuil :");
tic;
ima_out=ima;
nr = ima_r./sqrt(ima_r.^2 + ima_b.^2 + ima_g.^2);
ima_seuil = ima.*(nr>0.7);
toc;
figure('name','RGB out','numbertitle','off');image(ima_seuil);
disp("Jaune :")
tic;
ima_jaune = ima_seuil;
ima_jaune(:,:,2) = ima_seuil(:,:,1);
toc;
figure('name','RGB out','numbertitle','off');image(ima_jaune);
disp("Reinsertion dans l'image :")
tic;
ima_out = ima - ima_seuil + ima_jaune;
figure('name','RGB out','numbertitle','off');image(ima_out);
toc;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Sauvegarde d'une image au format jpg
imwrite(ima_out,'../Image/ferrari_out.jpg','jpg');
%Sauvegarde d'une image au format raw
fid = fopen('../Image/ferrari_out.raw', 'w');
fwrite(fid, ima_out, 'single');
fclose(fid);

@ -0,0 +1,108 @@
close all;
%Ouverture d'une image au format couleur
ima=single(imread('../Image/ferrari.jpg'));
ima=ima./255;
%Affichage d'une image couleur avec image
figure('name','RGB in','numbertitle','off');image(ima);
%Taille d'une image
taille=size(ima);
display(taille);
ima_r=ima(:,:,1);
ima_g=ima(:,:,2);
ima_b=ima(:,:,3);
%Affichage d'un niveau de couleur de l'image
figure('name','R','numbertitle','off');imagesc(ima_r);colormap gray %Niveau de rouge
figure('name','G','numbertitle','off');imagesc(ima_g);colormap gray %Niveau de vert
figure('name','B','numbertitle','off');imagesc(ima_b);colormap gray %Niveau de bleu
%Taille d'une image
taille=size(ima);
display(taille);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%tic toc pour mesurer le temps de calcul
display('VERSION 1');
tic;
ima_out_v1=ima;
%A vous de coder !
for j=1:taille(2)
for i=1:taille(1)
nr(i,j)=ima(i,j,1)/sqrt(ima(i,j,1)*ima(i,j,1)+ima(i,j,2)*ima(i,j,2)+ima(i,j,3)*ima(i,j,3));
if (nr(i,j)>0.7)
ima_out_v1(i,j,2)=ima(i,j,1);
end
end
end
toc;
figure('name','ratio rouge','numbertitle','off');imagesc(nr);colormap gray %Niveau de rouge
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
display('VERSION 2');
tic;
ima_out_v2=ima;
nr=ima_r./sqrt(ima_r.^2+ima_g.^2+ima_b.^2);
%A vous de coder !
for j=1:taille(2)
for i=1:taille(1)
if (nr(i,j)>0.7)
ima_out_v2(i,j,2)=ima(i,j,1);
end
end
end
toc;
display('VERSION 3');
tic;
ima_out_v3=ima;
nr=ima_r./sqrt(ima_r.^2+ima_g.^2+ima_b.^2);
image_test=nr>0.7;
image_tmp=ima_g;
image_tmp(image_test)=ima_r(image_test);
ima_out_v3(:,:,2)=image_tmp();
toc;
figure('name','test','numbertitle','off');imagesc(image_test);colormap gray %Niveau de rouge
figure('name','RGB out version 1','numbertitle','off');image(ima_out_v1);
figure('name','RGB out version 2','numbertitle','off');image(ima_out_v2);
figure('name','RGB out version 3','numbertitle','off');image(ima_out_v3);
%Sauvegarde d'une image au format jpg
imwrite(ima_out_v1,'../Image/ferrari_out_v1.jpg','jpg');
imwrite(ima_out_v2,'../Image/ferrari_out_v2.jpg','jpg');
imwrite(ima_out_v3,'../Image/ferrari_out_v3.jpg','jpg');
%Sauvegarde d'une image au format raw
fid = fopen('../Image/ferrari_out.raw', 'w');
fwrite(fid, ima_out_v3, 'single');
fclose(fid);

@ -0,0 +1,77 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="com.nvidia.cuda.ide.nine_zero.configuration.release.1433346787">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.nine_zero.configuration.release.1433346787" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release" cleanCommand="rm -rf" description="" id="com.nvidia.cuda.ide.nine_zero.configuration.release.1433346787" name="Release" parent="com.nvidia.cuda.ide.nine_zero.configuration.release">
<folderInfo id="com.nvidia.cuda.ide.nine_zero.configuration.release.1433346787." name="/" resourcePath="">
<toolChain id="com.nvidia.cuda.ide.toolchain.nine_zero.exe.release.698197512" name="CUDA Toolkit 10.0" superClass="com.nvidia.cuda.ide.toolchain.nine_zero.exe.release">
<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.1887109166" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
<builder buildPath="${workspace_loc:/TP2}/Release" id="com.nvidia.cuda.ide.builder.996479146" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="CUDA Toolkit 10.0 Builder" superClass="com.nvidia.cuda.ide.builder"/>
<tool id="nvcc.compiler.base.970531586" name="NVCC Compiler" superClass="nvcc.compiler.base">
<option id="nvcc.compiler.deviceDebug.130815577" name="Generate device debug information (-G)" superClass="nvcc.compiler.deviceDebug"/>
<option id="nvcc.compiler.option.level.1535441624" name="Generate host debug information (-g)" superClass="nvcc.compiler.option.level"/>
<option defaultValue="nvcc.compiler.optimization.level.most" id="nvcc.compiler.optimization.level.1934934131" name="Optimization Level" superClass="nvcc.compiler.optimization.level" valueType="enumerated"/>
<option id="nvcc.compiler.pic.84159010" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic"/>
<inputType id="nvcc.compiler.input.cu.2057104910" superClass="nvcc.compiler.input.cu"/>
<inputType id="nvcc.compiler.input.cpp.1440437331" superClass="nvcc.compiler.input.cpp"/>
<inputType id="nvcc.compiler.input.c.1110197327" superClass="nvcc.compiler.input.c"/>
</tool>
<tool id="nvcc.linker.base.1084287072" name="NVCC Linker" superClass="nvcc.linker.base">
<option id="nvcc.linker.option.libs.1571841227" name="Libraries (-l)" superClass="nvcc.linker.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="gomp"/>
</option>
<inputType id="nvcc.linker.input.1818364586" superClass="nvcc.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="nvcc.archiver.base.1485801156" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
<tool id="com.nvidia.host.assembler.1159491752" name="Host Assembler" superClass="com.nvidia.host.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.174374770" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration">
<executable devicelink="false">
<sass major="3" minor="0"/>
<ptx major="3" minor="0"/>
</executable>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="TP2.com.nvidia.cuda.ide.nine_zero.exe.235863883" name="Executable" projectType="com.nvidia.cuda.ide.nine_zero.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793;com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793.;nvcc.compiler.base.1363154216;nvcc.compiler.input.cu.75245776">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793;com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793.;nvcc.compiler.base.1363154216;nvcc.compiler.input.cpp.1264206044">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793;com.nvidia.cuda.ide.nine_zero.configuration.debug.9454793.;nvcc.compiler.base.1363154216;nvcc.compiler.input.c.1376543207">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope"/>
</cproject>

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>TP2</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
</projectDescription>

@ -0,0 +1,34 @@
#cmake -DCMAKE_CUDA_FLAGS=-arch=sm_30 ..
cmake_minimum_required(VERSION 2.8)
project(reduce)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
find_package(CUDA REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
set(CU_FILES ${ROOT_DIR}/Reduce_solution.cu)
#file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\" -std=c++14")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
cuda_add_executable(reduce ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(reduce PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(reduce PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)

@ -0,0 +1,34 @@
cmake_minimum_required(VERSION 2.8)
project(matrice)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
#set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
find_package(CUDA REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
#file(GLOB CPP_FILES ${ROOT_DIR}/src/CPP/*.cpp)
#file(GLOB CU_FILES ${ROOT_DIR}/src/CPP/*.cu)
set(CPP_FILES seuillage_C.cpp)
set(CU_FILES ${ROOT_DIR}/seuillage_main.cu)
#file(GLOB_RECURSE HEADER_FILES ${ROOT_DIR}/src/CUDA/*.cuh ${ROOT_DIR}/inc/*.h)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
cuda_add_executable(seuillage_CUDA ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(seuillage_CUDA PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)

@ -0,0 +1,17 @@
#cmake -DCMAKE_CUDA_FLAGS=”-arch=sm_30” ..
cmake_minimum_required(VERSION 3.8)
set(CUDACXX "/usr/local/cuda/bin/nvcc")
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
project(seuillage_CUDA LANGUAGES CXX CUDA)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
set(CMAKE_VERBOSE_MAKEFILE TRUE)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
add_executable(seuillage_CUDA seuillage_C.cpp seuillage_main.cu)
set_target_properties(seuillage_CUDA PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_features(seuillage_CUDA PUBLIC cxx_std_11)
target_include_directories(seuillage_CUDA PUBLIC ".")

@ -0,0 +1,39 @@
#pragma once
#include <cuda_runtime_api.h>
class GpuTimer
{
cudaEvent_t start, stop;
public:
GpuTimer()
{
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
~GpuTimer()
{
cudaEventDestroy(stop);
cudaEventDestroy(start);
}
void Start()
{
cudaEventRecord(start);
}
void Stop()
{
cudaEventRecord(stop);
cudaEventSynchronize(stop);
}
float Elapsed()
{
float elapsed;
cudaEventElapsedTime(&elapsed, start, stop);
return elapsed;
}
};

@ -0,0 +1,458 @@
/*
# Copyright (c) 2011-2012 NVIDIA CORPORATION. All Rights Reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
*/
#include <iostream>
#include <cuda_runtime_api.h>
#include <omp.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
//#include <cub/cub.cuh>
#include "GpuTimer.h"
#define CUDA_SAFE_CALL(call) \
{ \
cudaError_t err_code = call; \
if( err_code != cudaSuccess ) { std::cerr << "Error (" << __FILE__ << ":" << __LINE__ << "): " << cudaGetErrorString(err_code) << std::endl; return 1; } \
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// G P U R E D U C T I O N
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void reduce_kernel( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
{
// Allocate shared memory inside the block.
extern __shared__ int s_mem[];
float my_sum=0;
// The range of data to work with.
int2 range = block_ranges[blockIdx.x];
// Compute the sum of my elements.
// TODO: fill-in that section of the code
// Copy my sum in shared memory.
s_mem[threadIdx.x] = my_sum;
// Make sure all the threads have copied their value in shared memory.
__syncthreads();
// Compute the sum inside the block.
// TODO: fill-in that section of the code
// The first thread of the block stores its result.
if( threadIdx.x == 0 )
out_buffer[blockIdx.x] = s_mem[0];
}
int reduce_on_gpu( int n, const int *a_device )
{
// Compute the size of the grid.
const int BLOCK_DIM = 256;
const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
const int num_threads = BLOCK_DIM * grid_dim;
// Compute the number of elements per block.
const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
// Allocate memory for temporary buffers.
int *partial_sums = NULL;
int2 *block_ranges = NULL;
CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
// Compute the ranges for the blocks.
int sum = 0;
int2 *block_ranges_on_host = new int2[grid_dim];
for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
{
block_ranges_on_host[block_idx].x = sum;
block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
}
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
delete[] block_ranges_on_host;
// First round: Compute a partial sum for all blocks.
reduce_kernel<<<grid_dim, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( n, a_device, partial_sums, block_ranges );
CUDA_SAFE_CALL( cudaGetLastError() );
// Set the ranges for the second kernel call.
int2 block_range = make_int2( 0, grid_dim );
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
// Second round: Compute the final sum by summing the partial results of all blocks.
reduce_kernel<<<1, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( grid_dim, partial_sums, partial_sums, block_ranges );
CUDA_SAFE_CALL( cudaGetLastError() );
// Read the result from device memory.
int result;
CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
// Free temporary memory.
CUDA_SAFE_CALL( cudaFree( block_ranges ) );
CUDA_SAFE_CALL( cudaFree( partial_sums ) );
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// G P U R E D U C T I O N : O P T I M I Z E D V E R S I O N
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define WARP_SIZE 32
template< int BLOCK_DIM >
__global__ void reduce_kernel_optimized( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
{
// The number of warps in the block.
const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
float my_sum=0;
// Allocate shared memory inside the block.
__shared__ volatile int s_mem[BLOCK_DIM];
// The range of data to work with.
int2 range = block_ranges[blockIdx.x];
// Warp/lane IDs.
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
// Compute the sum of my elements.
// TODO: fill-in that section of the code
// Copy my sum in shared memory.
s_mem[threadIdx.x] = my_sum;
// Compute the sum inside each warp.
// TODO: fill-in that section of the code
// Each warp leader stores the result for the warp.
if( lane_id == 0 )
// TODO: fill-in that section of the code
__syncthreads();
if( warp_id == 0 )
{
// Read my value from shared memory and store it in a register.
my_sum = s_mem[lane_id];
// Sum the results of the warps.
// TODO: fill-in that section of the code
}
// The 1st thread stores the result of the block.
if( threadIdx.x == 0 )
out_buffer[blockIdx.x] = my_sum += s_mem[1];
}
template< int BLOCK_DIM >
int reduce_on_gpu_optimized( int n, const int *a_device )
{
// Compute the size of the grid.
const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
const int num_threads = BLOCK_DIM * grid_dim;
// Compute the number of elements per block.
const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
// Allocate memory for temporary buffers.
int *partial_sums = NULL;
int2 *block_ranges = NULL;
CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
// Compute the ranges for the blocks.
int sum = 0;
int2 *block_ranges_on_host = new int2[grid_dim];
for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
{
block_ranges_on_host[block_idx].x = sum;
block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
}
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
delete[] block_ranges_on_host;
// First round: Compute a partial sum for all blocks.
reduce_kernel_optimized<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
CUDA_SAFE_CALL( cudaGetLastError() );
// Set the ranges for the second kernel call.
int2 block_range = make_int2( 0, grid_dim );
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
// Second round: Compute the final sum by summing the partial results of all blocks.
reduce_kernel_optimized<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
CUDA_SAFE_CALL( cudaGetLastError() );
// Read the result from device memory.
int result;
CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
// Free temporary memory.
CUDA_SAFE_CALL( cudaFree( block_ranges ) );
CUDA_SAFE_CALL( cudaFree( partial_sums ) );
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// M A I N
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int main( int, char ** )
{
const int NUM_TESTS = 10;
// The number of elements in the problem.
const int N = 512 * 131072;
std::cout << "Computing a reduction on " << N << " elements" << std::endl;
// X and Y on the host (CPU).
int *a_host = new int[N];
// Make sure the memory got allocated. TODO: free memory.
if( a_host == NULL )
{
std::cerr << "ERROR: Couldn't allocate a_host" << std::endl;
return 1;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Generate data
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << "Filling with 1s" << std::endl;
// Generate pseudo-random data.
for( int i = 0 ; i < N ; ++i )
a_host[i] = 1;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the CPU using 1 thread
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the CPU using 1 CPU thread" << std::endl;
GpuTimer gpu_timer;
gpu_timer.Start();
// Calculate the reference to compare with the device result.
int sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
sum = 0;
for( int i = 0 ; i < N ; ++i )
sum += a_host[i];
}
gpu_timer.Stop();
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the CPU using several OpenMP threads
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the CPU using " << omp_get_max_threads() << " OpenMP thread(s)" << std::endl;
gpu_timer.Start();
// Calculate the reference to compare with the device result.
int omp_sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
omp_sum = 0;
#pragma omp parallel shared(omp_sum)
{
#pragma omp for reduction(+ : omp_sum)
for( int i = 0 ; i < N ; ++i )
omp_sum = omp_sum + a_host[i];
}
}
gpu_timer.Stop();
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The copy of A on the device (GPU).
int *a_device = NULL;
// Allocate A on the device.
CUDA_SAFE_CALL( cudaMalloc( (void **) &a_device, N*sizeof( int ) ) );
// Copy A from host (CPU) to device (GPU).
CUDA_SAFE_CALL( cudaMemcpy( a_device, a_host, N*sizeof( int ), cudaMemcpyHostToDevice ) );
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU using Thrust
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the GPU using Thrust (transfers excluded)" << std::endl;
gpu_timer.Start();
// Launch the kernel on the GPU.
int thrust_sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
thrust_sum = thrust::reduce( thrust::device_ptr<int>(a_device), thrust::device_ptr<int>(a_device+N) );
}
gpu_timer.Stop();
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU using CUB
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/*int cub_sum=0;
std::cout << std::endl;
std::cout << "Computing on the GPU using CUB (transfers excluded)" << std::endl;
int * sum_device=NULL;
cudaMalloc(&sum_device, sizeof(int));
void *temp_storage_device = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Sum(temp_storage_device, temp_storage_bytes, a_device ,sum_device, N);
// Allocate temporary storage
cudaMalloc(&temp_storage_device, temp_storage_bytes);
gpu_timer.Start();
// Run reduction
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
cub::DeviceReduce::Sum(temp_storage_device, temp_storage_bytes, a_device, sum_device,N);
}
gpu_timer.Stop();
CUDA_SAFE_CALL( cudaMemcpy( &cub_sum, sum_device, sizeof(int), cudaMemcpyDeviceToHost ) );
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
*/
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the GPU (transfers excluded)" << std::endl;
gpu_timer.Start();
// Launch the kernel on the GPU.
int gpu_sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
gpu_sum = reduce_on_gpu( N, a_device );
}
gpu_timer.Stop();
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU (optimized version)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the GPU using a tuned version (transfers excluded)" << std::endl;
gpu_timer.Start();
const int BLOCK_DIM = 256;
// Launch the kernel on the GPU.
int optim_gpu_sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
optim_gpu_sum = reduce_on_gpu_optimized<BLOCK_DIM>( N, a_device );
}
gpu_timer.Stop();
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Validate results
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << std::endl;
std::cout << "OpenMP results: ref= " << sum << " / sum= " << omp_sum << std::endl;
std::cout << "Thrust results: ref= " << sum << " / sum= " << thrust_sum << std::endl;
//std::cout << "CUB results: ref= " << sum << " / sum= " << cub_sum << std::endl;
std::cout << "CUDA results: ref= " << sum << " / sum= " << gpu_sum << std::endl;
std::cout << "CUDA Optim results: ref= " << sum << " / sum= " << optim_gpu_sum << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Clean memory
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Free device memory.
CUDA_SAFE_CALL( cudaFree( a_device ) );
// Free host memory.
delete[] a_host;
return 0;
}

@ -0,0 +1,7 @@
#include "CpuTimer.h"
// Initialize the resolution of the timer
LARGE_INTEGER CpuTimer::m_freq = (QueryPerformanceFrequency(&CpuTimer::m_freq), CpuTimer::m_freq);
// Calculate the overhead of the timer
LONGLONG CpuTimer::m_overhead = CpuTimer::GetOverhead();

@ -0,0 +1,37 @@
#pragma once
#include <windows.h>
struct CpuTimer
{
void Start()
{
QueryPerformanceCounter(&m_start);
}
void Stop()
{
QueryPerformanceCounter(&m_stop);
}
// Returns elapsed time in milliseconds (ms)
double Elapsed()
{
return (m_stop.QuadPart - m_start.QuadPart - m_overhead) * 1000.0 / m_freq.QuadPart;
}
private:
// Returns the overhead of the timer in ticks
static LONGLONG GetOverhead()
{
CpuTimer t;
t.Start();
t.Stop();
return t.m_stop.QuadPart - t.m_start.QuadPart;
}
LARGE_INTEGER m_start;
LARGE_INTEGER m_stop;
static LARGE_INTEGER m_freq;
static LONGLONG m_overhead;
};

@ -0,0 +1,39 @@
#pragma once
#include <cuda_runtime_api.h>
class GpuTimer
{
cudaEvent_t start, stop;
public:
GpuTimer()
{
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
~GpuTimer()
{
cudaEventDestroy(stop);
cudaEventDestroy(start);
}
void Start()
{
cudaEventRecord(start);
}
void Stop()
{
cudaEventRecord(stop);
cudaEventSynchronize(stop);
}
float Elapsed()
{
float elapsed;
cudaEventElapsedTime(&elapsed, start, stop);
return elapsed;
}
};

@ -0,0 +1,415 @@
/*
# Copyright (c) 2011-2012 NVIDIA CORPORATION. All Rights Reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
*/
#include <iostream>
#include <cuda_runtime_api.h>
#include <omp.h>
#include <thrust/reduce.h>
#include "GpuTimer.h"
#include "CpuTimer.h"
#define CUDA_SAFE_CALL(call) \
{ \
cudaError_t err_code = call; \
if( err_code != cudaSuccess ) { std::cerr << "Error (" << __FILE__ << ":" << __LINE__ << "): " << cudaGetErrorString(err_code) << std::endl; return 1; } \
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// G P U R E D U C T I O N
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void reduce_kernel( int n, const int *in_buffer, int *out_buffer, const int2 *block_ranges )
{
// Allocate shared memory inside the block.
extern __shared__ int s_mem[];
// The range of data to work with.
int2 range = block_ranges[blockIdx.x];
// Compute the sum of my elements.
// TODO: fill-in that section of the code
// Copy my sum in shared memory.
s_mem[threadIdx.x] = my_sum;
// Make sure all the threads have copied their value in shared memory.
__syncthreads();
// Compute the sum inside the block.
// TODO: fill-in that section of the code
// The first thread of the block stores its result.
if( threadIdx.x == 0 )
out_buffer[blockIdx.x] = s_mem[0];
}
int reduce_on_gpu( int n, const int *a_device )
{
// Compute the size of the grid.
const int BLOCK_DIM = 256;
const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
const int num_threads = BLOCK_DIM * grid_dim;
// Compute the number of elements per block.
const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
// Allocate memory for temporary buffers.
int *partial_sums = NULL;
int2 *block_ranges = NULL;
CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
// Compute the ranges for the blocks.
int sum = 0;
int2 *block_ranges_on_host = new int2[grid_dim];
for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
{
block_ranges_on_host[block_idx].x = sum;
block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
}
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
delete[] block_ranges_on_host;
// First round: Compute a partial sum for all blocks.
reduce_kernel<<<grid_dim, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( n, a_device, partial_sums, block_ranges );
CUDA_SAFE_CALL( cudaGetLastError() );
// Set the ranges for the second kernel call.
int2 block_range = make_int2( 0, grid_dim );
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
// Second round: Compute the final sum by summing the partial results of all blocks.
reduce_kernel<<<1, BLOCK_DIM, BLOCK_DIM*sizeof(int)>>>( grid_dim, partial_sums, partial_sums, block_ranges );
CUDA_SAFE_CALL( cudaGetLastError() );
// Read the result from device memory.
int result;
CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
// Free temporary memory.
CUDA_SAFE_CALL( cudaFree( block_ranges ) );
CUDA_SAFE_CALL( cudaFree( partial_sums ) );
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// G P U R E D U C T I O N : O P T I M I Z E D V E R S I O N
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#define WARP_SIZE 32
template< int BLOCK_DIM >
__global__ void reduce_kernel_optimized( int n, const int *in_buffer, int *out_buffer, const int2 *__restrict block_ranges )
{
// The number of warps in the block.
const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
// Allocate shared memory inside the block.
__shared__ volatile int s_mem[BLOCK_DIM];
// The range of data to work with.
int2 range = block_ranges[blockIdx.x];
// Warp/lane IDs.
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
// Compute the sum of my elements.
// TODO: fill-in that section of the code
// Copy my sum in shared memory.
s_mem[threadIdx.x] = my_sum;
// Compute the sum inside each warp.
// TODO: fill-in that section of the code
// Each warp leader stores the result for the warp.
if( lane_id == 0 )
// TODO: fill-in that section of the code
__syncthreads();
if( warp_id == 0 )
{
// Read my value from shared memory and store it in a register.
my_sum = s_mem[lane_id];
// Sum the results of the warps.
// TODO: fill-in that section of the code
}
// The 1st thread stores the result of the block.
if( threadIdx.x == 0 )
out_buffer[blockIdx.x] = my_sum += s_mem[1];
}
template< int BLOCK_DIM >
int reduce_on_gpu_optimized( int n, const int *a_device )
{
// Compute the size of the grid.
const int grid_dim = std::min( BLOCK_DIM, (n + BLOCK_DIM-1) / BLOCK_DIM );
const int num_threads = BLOCK_DIM * grid_dim;
// Compute the number of elements per block.
const int elements_per_block = BLOCK_DIM * ((n + num_threads - 1) / num_threads);
// Allocate memory for temporary buffers.
int *partial_sums = NULL;
int2 *block_ranges = NULL;
CUDA_SAFE_CALL( cudaMalloc( (void **) &partial_sums, BLOCK_DIM * sizeof(int ) ) );
CUDA_SAFE_CALL( cudaMalloc( (void **) &block_ranges, grid_dim * sizeof(int2) ) );
// Compute the ranges for the blocks.
int sum = 0;
int2 *block_ranges_on_host = new int2[grid_dim];
for( int block_idx = 0 ; block_idx < grid_dim ; ++block_idx )
{
block_ranges_on_host[block_idx].x = sum;
block_ranges_on_host[block_idx].y = std::min( sum += elements_per_block, n );
}
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, block_ranges_on_host, grid_dim * sizeof(int2), cudaMemcpyHostToDevice ) );
delete[] block_ranges_on_host;
// First round: Compute a partial sum for all blocks.
reduce_kernel_optimized<BLOCK_DIM><<<grid_dim, BLOCK_DIM>>>( n, a_device, partial_sums, block_ranges );
CUDA_SAFE_CALL( cudaGetLastError() );
// Set the ranges for the second kernel call.
int2 block_range = make_int2( 0, grid_dim );
CUDA_SAFE_CALL( cudaMemcpy( block_ranges, &block_range, sizeof(int2), cudaMemcpyHostToDevice ) );
// Second round: Compute the final sum by summing the partial results of all blocks.
reduce_kernel_optimized<BLOCK_DIM><<<1, BLOCK_DIM>>>( grid_dim, partial_sums, partial_sums, block_ranges );
CUDA_SAFE_CALL( cudaGetLastError() );
// Read the result from device memory.
int result;
CUDA_SAFE_CALL( cudaMemcpy( &result, partial_sums, sizeof(int), cudaMemcpyDeviceToHost ) );
// Free temporary memory.
CUDA_SAFE_CALL( cudaFree( block_ranges ) );
CUDA_SAFE_CALL( cudaFree( partial_sums ) );
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// M A I N
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int main( int, char ** )
{
const int NUM_TESTS = 10;
// The number of elements in the problem.
const int N = 512 * 131072;
std::cout << "Computing a reduction on " << N << " elements" << std::endl;
// X and Y on the host (CPU).
int *a_host = new int[N];
// Make sure the memory got allocated. TODO: free memory.
if( a_host == NULL )
{
std::cerr << "ERROR: Couldn't allocate a_host" << std::endl;
return 1;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Generate data
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << "Filling with 1s" << std::endl;
// Generate pseudo-random data.
for( int i = 0 ; i < N ; ++i )
a_host[i] = 1;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the CPU using 1 thread
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the CPU using 1 CPU thread" << std::endl;
CpuTimer cpu_timer;
cpu_timer.Start();
// Calculate the reference to compare with the device result.
int sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
sum = 0;
for( int i = 0 ; i < N ; ++i )
sum += a_host[i];
}
cpu_timer.Stop();
std::cout << " Elapsed time: " << cpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the CPU using several OpenMP threads
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the CPU using " << omp_get_max_threads() << " OpenMP thread(s)" << std::endl;
cpu_timer.Start();
// Calculate the reference to compare with the device result.
int omp_sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
omp_sum = 0;
#pragma omp parallel shared(omp_sum)
{
#pragma omp for reduction(+ : omp_sum)
for( int i = 0 ; i < N ; ++i )
omp_sum = omp_sum + a_host[i];
}
}
cpu_timer.Stop();
std::cout << " Elapsed time: " << cpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The copy of A on the device (GPU).
int *a_device = NULL;
// Allocate A on the device.
CUDA_SAFE_CALL( cudaMalloc( (void **) &a_device, N*sizeof( int ) ) );
// Copy A from host (CPU) to device (GPU).
CUDA_SAFE_CALL( cudaMemcpy( a_device, a_host, N*sizeof( int ), cudaMemcpyHostToDevice ) );
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU using Thrust
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the GPU using Thrust (transfers excluded)" << std::endl;
GpuTimer gpu_timer;
gpu_timer.Start();
// Launch the kernel on the GPU.
int thrust_sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
thrust_sum = thrust::reduce( thrust::device_ptr<int>(a_device), thrust::device_ptr<int>(a_device+N) );
}
gpu_timer.Stop();
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the GPU (transfers excluded)" << std::endl;
gpu_timer.Start();
// Launch the kernel on the GPU.
int gpu_sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
gpu_sum = reduce_on_gpu( N, a_device );
}
gpu_timer.Stop();
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Compute on the GPU (optimized version)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << "Computing on the GPU using a tuned version (transfers excluded)" << std::endl;
gpu_timer.Start();
const int BLOCK_DIM = 256;
// Launch the kernel on the GPU.
int optim_gpu_sum = 0;
for( int i_test = 0 ; i_test < NUM_TESTS ; ++i_test )
{
optim_gpu_sum = reduce_on_gpu_optimized<BLOCK_DIM>( N, a_device );
}
gpu_timer.Stop();
std::cout << " Elapsed time: " << gpu_timer.Elapsed() / NUM_TESTS << "ms" << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Validate results
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << std::endl;
std::cout << std::endl;
std::cout << "OpenMP results: ref= " << sum << " / sum= " << omp_sum << std::endl;
std::cout << "CUDA results: ref= " << sum << " / sum= " << gpu_sum << std::endl;
std::cout << "Thrust results: ref= " << sum << " / sum= " << thrust_sum << std::endl;
std::cout << "Optim results: ref= " << sum << " / sum= " << optim_gpu_sum << std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Clean memory
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Free device memory.
CUDA_SAFE_CALL( cudaFree( a_device ) );
// Free host memory.
delete[] a_host;
return 0;
}

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio 2010
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Reduce", "Reduce.vcxproj", "{037A2CAC-CD22-421A-9BB3-4E39043D36C0}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{037A2CAC-CD22-421A-9BB3-4E39043D36C0}.Debug|x64.ActiveCfg = Debug|x64
{037A2CAC-CD22-421A-9BB3-4E39043D36C0}.Debug|x64.Build.0 = Debug|x64
{037A2CAC-CD22-421A-9BB3-4E39043D36C0}.Release|x64.ActiveCfg = Release|x64
{037A2CAC-CD22-421A-9BB3-4E39043D36C0}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@ -0,0 +1,172 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{037A2CAC-CD22-421A-9BB3-4E39043D36C0}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>Reduce</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 4.2.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);cudart.lib</AdditionalDependencies>
</Link>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<AdditionalOptions>-Xcompiler "/openmp" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies);cudart.lib</AdditionalDependencies>
</Link>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
<Optimization>O3</Optimization>
<AdditionalOptions>-Xcompiler "/openmp" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="GpuTimer.h" />
<ClInclude Include="CpuTimer.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="CpuTimer.cpp" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="Reduce.cu">
<TargetMachinePlatform Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">64</TargetMachinePlatform>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_20,sm_20</CodeGeneration>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xcompiler "/openmp" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 4.2.targets" />
</ImportGroup>
</Project>

@ -0,0 +1,115 @@
clear all;close all;
N=128;
P=128;
rep_name=['Projections_',num2str(P),'/'];
iter = Iter3D(rep_name); % Create a class instance
iter.save_file=1;
iter.save_volume=2;
iter.fc=1.0;%frequence de coupure pour filtre de rétroporjection filtrée
f_real=CreateVolumeReal(iter);
g_real=getSinoReal(iter);
f_estimated=CreateVolumeInit(iter);
normdJProjReg=CreateVolumeInit(iter);
[g_real,rsb_in]=addNoise(iter,g_real);
file_name=sprintf('%s/P_ER_GPU_NOISE_%2.1fdB.s',iter.workdirectory,rsb_in);
fid = fopen(file_name, 'wb');
fwrite(fid,g_real ,'float');
fclose(fid);
%temp=zeros(size(f_real));%doLaplacian(iter,f_real,temp);%figure(2);imagesc(temp(:,:,N/2));title('laplacien df');colorbar;colormap(gray);drawnow;%figure(3);imagesc(f_real(:,:,N/2));title('f_real');colorbar;colormap(gray);drawnow;
% disp('****************************')
% disp('Descente de gradient... ')
% disp('****************************')
f_estimated_n=f_estimated;
for num_iter_gradient_n=1:1:getGradientIterationNb(iter)
fprintf("iter = %d\n",num_iter_gradient_n);
iter.num_iter=num_iter_gradient_n;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% CALCUL DES J et dJ
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%g=H*f
g_estimated=doProjection(iter,f_estimated_n);%figure(4);imagesc(g_estimated(:,:,N/2));title('g_estimated');colorbar;colormap(gray);drawnow;
dg=g_real-g_estimated;%figure(5);imagesc(g_real(:,:,N/2));title('g_real');colorbar;colormap(gray);drawnow;%figure(6);imagesc(dg(:,:,N/2));title('dg');colorbar;colormap(gray);drawnow;
%df=Ht*(g-Hf)
df=doBackprojection(iter,dg);%figure(7);imagesc(df(:,:,N/2));title('df');colorbar;colormap(gray);drawnow;
%dJ=-2*Ht*(g-Hf)
dJ_MC=-2*df;
dJ=dJ_MC;
%dJ+=Dt*D*f avec D laplacien
%J_reg=0;
%ApplyLaplacianRegularization_to_dJ(iter,f_estimated_n,dJ,getLambda(iter),J_reg,normdJProjReg,getGradientIterationNb(iter),getOptimalStepIterationNb(iter));
%dJ_reg=zeros(size(dJ)); temp=zeros(size(dJ));%doLaplacian(iter,f_estimated_n,temp);%doLaplacian(iter,temp,dJ_reg);clear temp;
%dJ=dJ+2*dJ_reg;
%dJ_reg=dJ-dJ_MC;figure(8);imagesc(dJ_reg(:,:,N/2));title('dJ reg');colorbar;colormap(gray);drawnow;%figure(9);imagesc(dg(:,:,N/2));title('dg');colorbar;colormap(gray);drawnow;
%Gradient dJ mise à jour
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% CALCUL DU PAS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
num_alpha=sum(dJ(:).^2);
proj_dJ=doProjection(iter,dJ);
denum_alpha=2*sum(proj_dJ(:).^2);
clear proj_dJ;
%SI REGULARISATION
%if (getLambda(iter) ~= 0)
% s=zeros(size(dJ));
% doLaplacian(iter,dJ,s);
% s=sum(s(:).^2);
% denum_alpha=denum_alpha+getLambda(iter)*s;
%end
alpha=num_alpha/denum_alpha;
%iter.alpha(iter.num_iter)=alpha; %figure(10);plot(iter.alpha);title('pas');xlabel('iter');ylabel('alpha');drawnow;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% MISE A JOUR DE f
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
f_estimated_n=f_estimated_n-alpha.*dJ;
clear dJ;
% SAUVEGARDE DU VOLUME RECONSTRUIT TOUS LES iter.save_file
iter=sauvegarde_volume_TOMO8(f_estimated_n,f_real,iter);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%CALCUL DU CRITERE (FACULTATIF)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
J_MC=sum(dg(:).^2);
iter.J(iter.num_iter)=J_MC;%+J_reg
niter_done=size(iter.J,2);
if (niter_done > 1)
figure(11);plot(iter.J(2:niter_done));title('J');xlabel('iter');ylabel('J');drawnow;
end
figure(12);imagesc(f_estimated_n(:,:,N/2));title('gradient');colorbar;colormap(gray);drawnow;
end
% disp('****************************')
% disp('Descente de gradient OK !!!!')
% disp('****************************')
figure(11);
plot(f_estimated_n(:,N/2,N/2),'b','LineWidth',1.5,'Marker','+');hold on;
plot(f_real(:,N/2,N/2),'k','LineWidth',1.5);hold on;
legend('gradient','real');
figure(13);
imagesc(f_real(:,:,N/2));title('real');colorbar;colormap(gray);drawnow;

@ -0,0 +1,73 @@
clear all;close all;
N=128;
P=128;
rep_name=['Projections_',num2str(P),'/'];
iter = Iter3D(rep_name); % Create a class instance
iter.save_file=1;
iter.save_volume=2;
iter.fc=1.0;%frequence de coupure pour filtre de rétroporjection filtrée
f_real=CreateVolumeReal(iter);
g_real=getSinoReal(iter);
[g_real,rsb_in]=addNoise(iter,g_real);
file_name=sprintf('%s/P_ER_GPU_NOISE_%2.1fdB.s',iter.workdirectory,rsb_in);
fid = fopen(file_name, 'wb');
fwrite(fid,g_real ,'float');
fclose(fid);
file_name=sprintf('%s/FDK_NOISE_%2.1fdB.v',iter.workdirectory,rsb_in);
fid = fopen(file_name, 'rb');
f_FDK_32=fread(fid,N*N*N ,'float');
f_FDK_32=reshape(f_FDK_32,N,N,N);
fclose(fid);
% RECONSTRUCTION ITERATIVE MOINDRE CARRE AVEC REGULARISATION QUADRATIQUE
setPositivity(iter,1);
setLambda(iter,100);
l1=getLambda(iter);
f1_32=CreateVolumeInit(iter);
title1_32='lambda 100 P 32';
doGradient(iter,f1_32,g_real,f_real);
setLambda(iter,0.1);
l2=getLambda(iter);
f2_32=CreateVolumeInit(iter);
title2_32='lambda 0.1 P 32';
doGradient(iter,f2_32,g_real,f_real);
setLambda(iter,0);
l3=getLambda(iter);
f3_32=CreateVolumeInit(iter);
title3_32='lambda 0 P 32';
doGradient(iter,f3_32,g_real,f_real);
figure(11);
plot(f1_32(:,N/2,N/2),'b','LineWidth',1.5,'Marker','+');hold on;
plot(f2_32(:,N/2,N/2),'g','LineWidth',1.5,'Marker','x');hold on;
plot(f3_32(:,N/2,N/2),'c','LineWidth',1.5,'Marker','*');hold on;
plot(f_real(:,N/2,N/2),'k','LineWidth',1.5);hold on;
plot(f_FDK_32(:,N/2,N/2),'r','LineWidth',1.5,'Marker','o');hold on;
legend(title1_32,title2_32,title3_32,'real','fdk');
%legend(title1_32,'real','fdk');
figure(12);
imagesc(f_real(:,:,N/2));title('real');colorbar;colormap(gray);drawnow;
figure(13);
imagesc(f1_32(:,:,N/2));title(title1_32);colorbar;colormap(gray);drawnow;
figure(14);
imagesc(f_FDK_32(:,:,N/2));title('FDK');colorbar;colormap(gray);drawnow;
figure(15);
imagesc(f2_32(:,:,N/2));title(title2_32);colorbar;colormap(gray);drawnow;
figure(16);
imagesc(f3_32(:,:,N/2));title(title3_32);colorbar;colormap(gray);drawnow;

@ -0,0 +1,16 @@
#!/bin/bash
export DATA_TP_TOMOGPI="/partage/public/ngac/TomoGPI/data_TP_TomoGPI/"
if [ ! -d "/tmp/data_TP_TomoGPI/data3D_0256" ]
then
cp -r "$DATA_TP_TOMOGPI" /tmp
fi
export CURRENTDIR='pwd'
export TOMO_GPI=/partage/public/ngac/TomoGPI
export MATLABPATH=$TOMO_GPI/Matlab/Tomo8:$MATLABPATH
export MATLABPATH=$TOMO_GPI/build:$MATLABPATH
export MATLABPATH=$CURRENT_DIR:$MATLABPATH
export PATH=$TOMO_GPI/build/:$PATH
alias matlab_TomoGPI='LD_PRELOAD=/usr/lib/gcc/x86_64-linux-gnu/8/libstdc++.so matlab'
cd /tmp/data_TP_TomoGPI/data3D_0128/phantom3D_0006_shepp/
matlab_TomoGPI &
#matlab &

@ -0,0 +1,32 @@
#Déclaration du projet
project(my_project)
#link_directories(/usr/local/lib)
set(ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/)
find_package(CUDA REQUIRED)
#set(OpenCV_DIR "/usr/local/lib/OpenCV/")
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS} ".")
find_package(Qt5 COMPONENTS Core Gui Widgets REQUIRED)
list(APPEND HEADER_REP ${ROOT_DIR}/ ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} "/usr/local/cuda/samples/common/inc/" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/../samples/common/inc")
set(CPP_FILES ${ROOT_DIR}/mylib.cpp)
set(CU_FILES ${ROOT_DIR}/main_nano.cu ${ROOT_DIR}/mylib.cu)
set(CUDA_SEPARABLE_COMPILATION ON)
set(CUDA_NVCC_FLAGS "-lineinfo;-I\"${ROOT_DIR}/inc\" -I\"${ROOT_DIR}/\" -I\"/usr/local/cuda/samples/common/inc/\"")
cuda_add_executable(my_project ${CPP_FILES} ${CU_FILES} ${HEADER_FILES})
set_target_properties(my_project PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(my_project PUBLIC ${ROOT_DIR}/inc/ /usr/local/cuda/samples/common/inc/)
target_link_libraries(my_project Qt5::Widgets)
target_link_libraries(my_project Qt5::Core)
target_link_libraries(my_project opencv_core opencv_highgui opencv_imgproc opencv_imgcodecs opencv_video opencv_videoio)

@ -0,0 +1,47 @@
#include "mylib.h"
#include "mylib.cuh"
#include "opencv2/opencv.hpp"
#include <cuda_runtime.h>
int main(int, char**)
{
VideoCapture cap(0); // open the default camera
if(!cap.isOpened()) // check if we succeeded
return -1;
while(1){
Mat frame;
cap >> frame;
char c=(char)waitKey(25);
if(c == '1'){ // if '1' est appuye
Mat NB = noirBlanc(frame);
imshow("NoirEtBlanc", NB);
}
else if(c == '2'){ // if '2' est appuye
Mat seuil = seuillage(frame);
imshow("seuillage", seuil);
}
else if (c == '3'){ // if '3' est appuye
Mat cont = contour(frame);
imshow("contour", cont);
}
else if (c == '4'){ // if '4' est appuye
Mat seuilgpu = seuillageGPU(frame);
imshow("seuillage GPU",seuilgpu);
}
else if(c == '0') destroyAllWindows(); // if '0' est appuye
else imshow("frame", frame);
if(c==27) // if 'esc' est appuye
break;
}
// When everything done, release the video capture object
cap.release();
// Closes all the frames
destroyAllWindows();
return 0;
}

@ -0,0 +1,92 @@
#include "mylib.cuh"
#include "mylib.h"
#include <cuda_runtime.h>
// acces au flux de la camera
std::string gstreamer_pipeline(int capture_width, int capture_height,
int display_width, int display_height,
int framerate, int flip_method) {
return "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)" +
std::to_string(capture_width) + ", height=(int)" +
std::to_string(capture_height) +
", format=(string)NV12, framerate=(fraction)" +
std::to_string(framerate) +
"/1 ! nvvidconv flip-method=" + std::to_string(flip_method) +
" ! video/x-raw, width=(int)" + std::to_string(display_width) +
", height=(int)" + std::to_string(display_height) +
", format=(string)BGRx ! videoconvert ! video/x-raw, "
"format=(string)BGR ! appsink";
}
int main(int, char **) {
int capture_width = 1280;
int capture_height = 720;
int display_width = 640;
int display_height = 360;
int framerate = 60;
int flip_method = 0;
char c = '1';
std::string pipeline =
gstreamer_pipeline(capture_width, capture_height, display_width,
display_height, framerate, flip_method);
std::cout << "Using pipeline: \n\t" << pipeline << "\n";
cv::VideoCapture cap(pipeline, cv::CAP_GSTREAMER);
if (!cap.isOpened()) // check if we succeeded
return -1;
while (1) {
Mat frame;
cap >> frame;
int c_new;
c_new = cv::waitKey(10);
if (c_new != -1)
c = c_new;
switch (c) {
case '1': {
imshow("frame", frame);
break;
}
case '2': {
Mat NB = noirBlanc(frame);
imshow("NoirEtBlanc", NB);
break;
}
case '3': {
Mat seuil = seuillage(frame);
imshow("seuillage", seuil);
break;
}
case '4': {
Mat cont = contour(frame);
imshow("contour", cont);
break;
}
case '5': {
Mat seuilgpu = seuillageGPU(frame);
imshow("seuillage GPU", seuilgpu);
break;
}
case '0': {
destroyAllWindows();
break;
}
default:
break;
}
if (c == '\e')
break;
}
// When everything done, release the video capture object
cap.release();
// Closes all the frames
destroyAllWindows();
return 0;
}

@ -0,0 +1,41 @@
#include "mylib.h"
#include "opencv2/opencv.hpp"
int main(int, char**)
{
VideoCapture cap(0); // open the default camera
if(!cap.isOpened()) // check if we succeeded
return -1;
while(1){
Mat frame;
cap >> frame;
char c=(char)waitKey(25);
if(c == '1'){ // if '1' est appuye
Mat NB = noirBlanc(frame);
imshow("NoirEtBlanc", NB);
}
else if(c == '2'){ // if '2' est appuye
Mat seuil = seuillage(frame);
imshow("seuillage", seuil);
}
else if (c == '3'){ // if '3' est appuye
Mat cont = contour(frame);
imshow("contour", cont);
}
else if(c == '0') destroyAllWindows(); // if '0' est appuye
else imshow("frame", frame);
if(c==27) // if 'esc' est appuye
break;
}
// When everything done, release the video capture object
cap.release();
// Closes all the frames
destroyAllWindows();
return 0;
}

@ -0,0 +1,90 @@
#include "mylib.h"
//---------------------noirBlanc-----------------------
Mat noirBlanc(Mat frame)
{
Mat im_gray_out;
if (frame.empty())
exit(0);
cvtColor(frame,im_gray_out,COLOR_RGB2GRAY);
return im_gray_out;
}
//---------------------get_frame-----------------------
Mat get_frame(Mat frame)
{
if (frame.empty())
exit(0);
return frame;
}
//---------------------seuillage------------------------
Mat seuillage(Mat frame)
{
float nr;
uchar r,v,b;
Mat frame_out;
frame_out.create(frame.rows,frame.cols,CV_8UC3);
if (frame.empty())
exit(0);
for(int i = 0; i < frame.rows; i++)
{
for(int j = 0; j < frame.cols; j++)
{
b = frame.at<Vec3b>(i,j)[0];
v = frame.at<Vec3b>(i,j)[1];
r = frame.at<Vec3b>(i,j)[2];
nr = r/sqrt(r*r+b*b+v*v);
if (nr > 0.7){
frame_out.at<Vec3b>(i,j)[0] = b;
frame_out.at<Vec3b>(i,j)[1] = r;
frame_out.at<Vec3b>(i,j)[2] = r;
}
else{
frame_out.at<Vec3b>(i,j)[0] = b;
frame_out.at<Vec3b>(i,j)[1] = v;
frame_out.at<Vec3b>(i,j)[2] = r;
}
}
}
return frame_out;
}
//---------------------contour------------------------
Mat contour(Mat frame)
{
Mat frame_out,frame_grayt;
cvtColor(frame,frame_grayt,COLOR_RGB2GRAY);
frame_out.create(frame.rows,frame.cols,CV_8UC1);
if (frame.empty())
exit(0);
for (int i=1;i<frame.rows;i++){
for (int j=1;j<frame.cols;j++){
short temp;
temp = (-1)*(short)frame_grayt.at<uchar>(i,j-1)+(-1)*(short)frame_grayt.at<uchar>(i-1,j)+(-1)*(char)frame_grayt.at<uchar>(i,j+1)+(-1)*(short)frame_grayt.at<uchar>(i+1,j)+4*(short)frame_grayt.at<uchar>(i,j);
frame_out.at<uchar>(i,j)=(uchar)abs(temp);
}
}
return frame_out;
}

@ -0,0 +1,93 @@
#include "mylib.h"
#include "mylib.cuh"
__global__ void kernel_seuillageGPU(unsigned char *d_image_in, unsigned char *d_image_out,int size_j)
{
float Csum;
int i, j, k, iFirst, jFirst;
iFirst = blockIdx.x*BLOCK_SIZE; // num de block dans la grille de block
jFirst = blockIdx.y*BLOCK_SIZE;
i = iFirst + threadIdx.x;// recuperer l'identifiant d'un thread dans les blocs
j = jFirst + threadIdx.y;
float nr = 0;
nr=d_image_in[2+j*3+i*3*size_j]/sqrtf(d_image_in[0+j*3+i*3*size_j]*d_image_in[0+j*3+i*3*size_j]+d_image_in[1+j*3+i*3*size_j]*d_image_in[1+j*3+i*3*size_j]+d_image_in[2+j*3+i*3*size_j]*d_image_in[2+j*3+i*3*size_j]);
if(nr > 0.7)
d_image_out[1+j*3+i*3*size_j] = d_image_in[2+j*3+i*3*size_j];
else
d_image_out[1+j*3+i*3*size_j] = d_image_in[1+j*3+i*3*size_j];
d_image_out[0+j*3+i*3*size_j] = d_image_in[0+j*3+i*3*size_j];
d_image_out[2+j*3+i*3*size_j] = d_image_in[2+j*3+i*3*size_j];
}
Mat seuillageGPU( Mat in)
{
cudaError_t error;
Mat out;
out.create(in.rows,in.cols,CV_8UC3);
// allocate host memory
unsigned char *h_image_in_GPU ;
h_image_in_GPU=in.data;
/*cudaEvent_t start,stop,start_mem,stop_mem;
error = cudaEventCreate(&start_mem);
error = cudaEventCreate(&stop_mem);
error = cudaEventRecord(start, NULL);
error = cudaEventSynchronize(start);*/
// images on device memoryÍÍÍ
unsigned char *d_image_in_GPU;
unsigned char *d_image_out_GPU;
const unsigned long int mem_size=in.cols*in.rows*3*sizeof(unsigned char);
// Alocation mémoire de d_image_in et d_image_out sur la carte GPU
cudaMalloc((void**) &d_image_in_GPU,mem_size );
cudaMalloc((void**) &d_image_out_GPU, mem_size);
// copy host memory to device
cudaMemcpy(d_image_in_GPU, h_image_in_GPU,mem_size ,cudaMemcpyHostToDevice);
//error = cudaEventRecord(stop_mem, NULL);
// Wait for the stop event to complete
//error = cudaEventSynchronize(stop_mem);
//float msecMem = 0.0f;
//error = cudaEventElapsedTime(&msecMem, start, stop_mem);
// setup execution parameters -> découpage en threads
dim3 threads(BLOCK_SIZE,BLOCK_SIZE);
dim3 grid(in.rows/BLOCK_SIZE,in.cols/BLOCK_SIZE);
// lancement des threads executé sur la carte GPU
kernel_seuillageGPU<<< grid, threads >>>(d_image_in_GPU, d_image_out_GPU,in.cols);
// Record the start event
//error = cudaEventRecord(start_mem, NULL);
//error = cudaEventSynchronize(start_mem);
// copy result from device to host
cudaMemcpy(out.data, d_image_out_GPU, mem_size,cudaMemcpyDeviceToHost);
cudaFree(d_image_in_GPU);
cudaFree(d_image_out_GPU);
/*
float msecTotal,msecMem2;
error = cudaEventRecord(stop, NULL);
error = cudaEventSynchronize(stop);
error = cudaEventElapsedTime(&msecTotal, start, stop);
error = cudaEventElapsedTime(&msecMem2, start_mem, stop);
*/
return out;
}

@ -0,0 +1,13 @@
#ifndef MYLIB_CUH
#define MYLIB_CUH
#include "mylib.h"
#include <cuda_runtime.h>
Mat seuillageGPU( Mat in);
#endif

@ -0,0 +1,36 @@
#ifndef MYLIB_H
#define MYLIB_H
#define SIZE_I 960
#define SIZE_J 1280
#define BLOCK_SIZE 16
#include "opencv2/opencv.hpp"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
using namespace std;
using namespace cv;
//---------------------get_frame---------------------
Mat get_frame(Mat frame);
//---------------------noirBlanc------------------
Mat noirBlanc(Mat frame);
//---------------------seuillage------------------
Mat seuillage(Mat frame);
//---------------------contour--------------------
Mat contour(Mat frame);
#endif
Loading…
Cancel
Save