1128 lines
38 KiB
C++
1128 lines
38 KiB
C++
/*
|
|
Copyright (c) 2014-2016 Intel Corporation. All Rights Reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions
|
|
are met:
|
|
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
* Neither the name of Intel Corporation nor the names of its
|
|
contributors may be used to endorse or promote products derived
|
|
from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
|
|
#include "offload_engine.h"
|
|
#include <signal.h>
|
|
#include <errno.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/types.h>
|
|
|
|
#include <algorithm>
|
|
#include <vector>
|
|
|
|
#include "offload_host.h"
|
|
#include "offload_table.h"
|
|
#include "offload_iterator.h"
|
|
|
|
#if defined(HOST_WINNT)
|
|
#define PATH_SEPARATOR ";"
|
|
#else
|
|
#define PATH_SEPARATOR ":"
|
|
#endif
|
|
|
|
// Static members of Stream class must be described somewhere.
|
|
// This members describe the list of all streams defined in programm
|
|
// via call to _Offload_stream_create.
|
|
uint64_t Stream::m_streams_count = 0;
|
|
StreamMap Stream::all_streams;
|
|
mutex_t Stream::m_stream_lock;
|
|
char* mic_library_path = 0;
|
|
|
|
// Names of the target-side entry points. Engine::init_process() passes this
// table to COI::ProcessGetFunctionHandles() to fill m_funcs, which is then
// indexed with the c_func_* constants.
// NOTE(review): the order here presumably has to match the c_func_* enum
// declared in the header - confirm before reordering.
const char* Engine::m_func_names[Engine::c_funcs_total] = {
    "server_compute",
#ifdef MYO_SUPPORT
    "server_myoinit",
    "server_myofini",
#endif // MYO_SUPPORT
    "server_init",
    "server_var_table_size",
    "server_var_table_copy",
    "server_set_stream_affinity"
};
|
|
|
|
// Symbolic representation of system signals. Fix for CQ233593
|
|
// Table mapping a signal number (used as the index) to its symbolic name.
// Slot 0 holds the fallback text that Engine::fini_process() prints for
// signal numbers outside the table.
const char* Engine::c_signal_names[Engine::c_signal_max] = {
    "Unknown SIGNAL",
    "SIGHUP",     /*  1, Hangup (POSIX).  */
    "SIGINT",     /*  2, Interrupt (ANSI).  */
    "SIGQUIT",    /*  3, Quit (POSIX).  */
    "SIGILL",     /*  4, Illegal instruction (ANSI).  */
    "SIGTRAP",    /*  5, Trace trap (POSIX).  */
    "SIGABRT",    /*  6, Abort (ANSI).  */
    "SIGBUS",     /*  7, BUS error (4.2 BSD).  */
    "SIGFPE",     /*  8, Floating-point exception (ANSI).  */
    "SIGKILL",    /*  9, Kill, unblockable (POSIX).  */
    "SIGUSR1",    /* 10, User-defined signal 1 (POSIX).  */
    "SIGSEGV",    /* 11, Segmentation violation (ANSI).  */
    "SIGUSR2",    /* 12, User-defined signal 2 (POSIX).  */
    "SIGPIPE",    /* 13, Broken pipe (POSIX).  */
    "SIGALRM",    /* 14, Alarm clock (POSIX).  */
    "SIGTERM",    /* 15, Termination (ANSI).  */
    "SIGSTKFLT",  /* 16, Stack fault.  */
    "SIGCHLD",    /* 17, Child status has changed (POSIX).  */
    "SIGCONT",    /* 18, Continue (POSIX).  */
    "SIGSTOP",    /* 19, Stop, unblockable (POSIX).  */
    "SIGTSTP",    /* 20, Keyboard stop (POSIX).  */
    "SIGTTIN",    /* 21, Background read from tty (POSIX).  */
    "SIGTTOU",    /* 22, Background write to tty (POSIX).  */
    "SIGURG",     /* 23, Urgent condition on socket (4.2 BSD).  */
    "SIGXCPU",    /* 24, CPU limit exceeded (4.2 BSD).  */
    "SIGXFSZ",    /* 25, File size limit exceeded (4.2 BSD).  */
    "SIGVTALRM",  /* 26, Virtual alarm clock (4.2 BSD).  */
    "SIGPROF",    /* 27, Profiling alarm clock (4.2 BSD).  */
    "SIGWINCH",   /* 28, Window size change (4.3 BSD, Sun).  */
    "SIGIO",      /* 29, I/O now possible (4.2 BSD).  */
    "SIGPWR",     /* 30, Power failure restart (System V).  */
    "SIGSYS"      /* 31, Bad system call.  */
};
|
|
|
|
void Engine::init(void)
|
|
{
|
|
if (!m_ready) {
|
|
mutex_locker_t locker(m_lock);
|
|
|
|
if (!m_ready) {
|
|
// start process if not done yet
|
|
if (m_process == 0) {
|
|
init_process();
|
|
}
|
|
|
|
// load penging images
|
|
load_libraries();
|
|
|
|
// and (re)build pointer table
|
|
init_ptr_data();
|
|
|
|
// it is ready now
|
|
m_ready = true;
|
|
|
|
// Inform the debugger
|
|
if (__dbg_is_attached) {
|
|
__dbg_target_so_loaded();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Engine::print_stream_cpu_list(const char * str)
|
|
{
|
|
int count = 0;
|
|
char buffer[1024];
|
|
CpuEl* cpu_el = m_cpu_head;
|
|
|
|
OFFLOAD_DEBUG_TRACE(3,
|
|
"%s : cpu list as Index(Count) for the streams is :\n", str);
|
|
buffer[0] = 0;
|
|
for (int i = 0; i < m_num_threads; i++) {
|
|
cpu_el = m_cpus + i;
|
|
if (m_assigned_cpus == 0 || (*m_assigned_cpus)[i]) {
|
|
count++;
|
|
sprintf(buffer + strlen(buffer), "%d(%d) ", CPU_INDEX(cpu_el), cpu_el->count);
|
|
if (count % 20 == 0) {
|
|
OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer);
|
|
buffer[0] = 0;
|
|
}
|
|
}
|
|
}
|
|
if (count % 20 != 0) {
|
|
OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer);
|
|
}
|
|
}
|
|
|
|
void Engine::init_process(void)
|
|
{
|
|
COIENGINE engine;
|
|
COIRESULT res;
|
|
const char **environ;
|
|
char buf[4096]; // For exe path name
|
|
char* mic_device_main = 0;
|
|
|
|
// create environment for the target process
|
|
environ = (const char**) mic_env_vars.create_environ_for_card(m_index);
|
|
if (environ != 0) {
|
|
for (const char **p = environ; *p != 0; p++) {
|
|
OFFLOAD_DEBUG_TRACE(3, "Env Var for card %d: %s\n", m_index, *p);
|
|
}
|
|
}
|
|
|
|
// Create execution context in the specified device
|
|
OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index,
|
|
m_physical_index);
|
|
res = COI::EngineGetHandle(COI_ISA_MIC, m_physical_index, &engine);
|
|
check_result(res, c_get_engine_handle, m_index, res);
|
|
|
|
// Get engine info on threads and cores.
|
|
// The values of core number and thread number will be used later at stream
|
|
// creation by call to _Offload_stream_create(device,number_of_cpus).
|
|
|
|
COI_ENGINE_INFO engine_info;
|
|
|
|
res = COI::EngineGetInfo(engine, sizeof(COI_ENGINE_INFO), &engine_info);
|
|
check_result(res, c_get_engine_info, m_index, res);
|
|
if (mic_library_path == 0 ) {
|
|
if (engine_info.ISA == COI_DEVICE_KNC) {
|
|
mic_library_path = knc_library_path;
|
|
}
|
|
else if (engine_info.ISA == COI_DEVICE_KNL) {
|
|
mic_library_path = knl_library_path;
|
|
}
|
|
else {
|
|
LIBOFFLOAD_ERROR(c_unknown_mic_device_type);
|
|
}
|
|
}
|
|
|
|
// m_cpus is the list of all available threads.
|
|
// At the begining all threads made available through OFFLOAD_DEVICES
|
|
// or all threads existed at the engine if OFFLOAD_DEVICES isn't set.
|
|
// m_cpu_head points to the head of the m_cpus list.
|
|
// m_cpus is ordered by number of streams using the thread.
|
|
// m_cpu_head points to the least used thread.
|
|
// After creating and destroying a stream the m_cpus list must be fixed
|
|
// to be ordered.
|
|
|
|
m_cpus = (CpuEl*)malloc(engine_info.NumThreads * sizeof(CpuEl));
|
|
if (m_cpus == NULL)
|
|
LIBOFFLOAD_ERROR(c_malloc);
|
|
memset(m_cpus, 0, engine_info.NumThreads * sizeof(CpuEl));
|
|
CpuEl* prev_cpu = NULL;
|
|
|
|
for (int i = 0; i < engine_info.NumThreads; i++) {
|
|
if (m_assigned_cpus == 0 || (*m_assigned_cpus)[i]) {
|
|
if (prev_cpu) {
|
|
prev_cpu->next = m_cpus + i;
|
|
}
|
|
else {
|
|
m_cpu_head = m_cpus + i;
|
|
}
|
|
m_cpus[i].prev = prev_cpu;
|
|
m_cpus[i].count = 0;
|
|
prev_cpu = m_cpus + i;
|
|
}
|
|
}
|
|
|
|
// The following values will be used at pipeline creation for streams
|
|
m_num_cores = engine_info.NumCores;
|
|
m_num_threads = engine_info.NumThreads;
|
|
|
|
print_stream_cpu_list("init_process");
|
|
|
|
// Check if OFFLOAD_DMA_CHANNEL_COUNT is set to 2
|
|
// Only the value 2 is supported in 16.0
|
|
if (mic_dma_channel_count == 2) {
|
|
if (COI::ProcessConfigureDMA) {
|
|
// Set DMA channels using COI API
|
|
COI::ProcessConfigureDMA(2, COI::DMA_MODE_READ_WRITE);
|
|
}
|
|
else {
|
|
// Set environment variable COI_DMA_CHANNEL_COUNT
|
|
// use putenv instead of setenv as Windows has no setenv.
|
|
// Note: putenv requires its argument can't be freed or modified.
|
|
// So no free after call to putenv or elsewhere.
|
|
char * env_var = strdup("COI_DMA_CHANNEL_COUNT=2");
|
|
if (env_var == NULL)
|
|
LIBOFFLOAD_ERROR(c_malloc);
|
|
putenv(env_var);
|
|
}
|
|
}
|
|
|
|
// Target executable is not available then use compiler provided offload_main
|
|
if (__target_exe == 0) {
|
|
// find target executable to be used if main application is not an
|
|
// offload build application.
|
|
const char *base_name = "offload_main";
|
|
if (mic_library_path != 0) {
|
|
char *buf = strdup(mic_library_path);
|
|
if (buf == NULL)
|
|
LIBOFFLOAD_ERROR(c_malloc);
|
|
char *try_name = (char*) alloca(strlen(mic_library_path) +
|
|
strlen(base_name) + 2);
|
|
char *dir, *ptr;
|
|
|
|
for (dir = strtok_r(buf, PATH_SEPARATOR, &ptr); dir != 0;
|
|
dir = strtok_r(0, PATH_SEPARATOR, &ptr)) {
|
|
// compose a full path
|
|
sprintf(try_name, "%s/%s", dir, base_name);
|
|
|
|
// check if such file exists
|
|
struct stat st;
|
|
if (stat(try_name, &st) == 0 && S_ISREG(st.st_mode)) {
|
|
mic_device_main = strdup(try_name);
|
|
if (mic_device_main == NULL)
|
|
LIBOFFLOAD_ERROR(c_malloc);
|
|
break;
|
|
}
|
|
}
|
|
free(buf);
|
|
}
|
|
if (mic_device_main == 0) {
|
|
LIBOFFLOAD_ERROR(c_report_no_target_exe, "offload_main");
|
|
exit(1);
|
|
}
|
|
|
|
OFFLOAD_DEBUG_TRACE(2,
|
|
"Loading target executable %s\n",mic_device_main);
|
|
|
|
res = COI::ProcessCreateFromFile(
|
|
engine, // in_Engine
|
|
mic_device_main, // in_pBinaryName
|
|
0, // in_Argc
|
|
0, // in_ppArgv
|
|
environ == 0, // in_DupEnv
|
|
environ, // in_ppAdditionalEnv
|
|
mic_proxy_io, // in_ProxyActive
|
|
mic_proxy_fs_root, // in_ProxyfsRoot
|
|
mic_buffer_size, // in_BufferSpace
|
|
mic_library_path, // in_LibrarySearchPath
|
|
&m_process // out_pProcess
|
|
);
|
|
}
|
|
else {
|
|
// Target executable should be available by the time when we
|
|
// attempt to initialize the device
|
|
|
|
// Need the full path of the FAT exe for VTUNE
|
|
{
|
|
#ifndef TARGET_WINNT
|
|
ssize_t len = readlink("/proc/self/exe", buf,1000);
|
|
#else
|
|
int len = GetModuleFileName(NULL, buf,1000);
|
|
#endif // TARGET_WINNT
|
|
if (len == -1) {
|
|
LIBOFFLOAD_ERROR(c_report_no_host_exe);
|
|
exit(1);
|
|
}
|
|
else if (len > 999) {
|
|
LIBOFFLOAD_ERROR(c_report_path_buff_overflow);
|
|
exit(1);
|
|
}
|
|
buf[len] = '\0';
|
|
}
|
|
|
|
OFFLOAD_DEBUG_TRACE(2,
|
|
"Loading target executable \"%s\" from %p, size %lld, host file %s\n",
|
|
__target_exe->name, __target_exe->data, __target_exe->size,
|
|
buf);
|
|
|
|
res = COI::ProcessCreateFromMemory(
|
|
engine, // in_Engine
|
|
__target_exe->name, // in_pBinaryName
|
|
__target_exe->data, // in_pBinaryBuffer
|
|
__target_exe->size, // in_BinaryBufferLength,
|
|
0, // in_Argc
|
|
0, // in_ppArgv
|
|
environ == 0, // in_DupEnv
|
|
environ, // in_ppAdditionalEnv
|
|
mic_proxy_io, // in_ProxyActive
|
|
mic_proxy_fs_root, // in_ProxyfsRoot
|
|
mic_buffer_size, // in_BufferSpace
|
|
mic_library_path, // in_LibrarySearchPath
|
|
buf, // in_FileOfOrigin
|
|
-1, // in_FileOfOriginOffset use -1 to indicate to
|
|
// COI that is is a FAT binary
|
|
&m_process // out_pProcess
|
|
);
|
|
}
|
|
check_result(res, c_process_create, m_index, res);
|
|
|
|
if ((mic_4k_buffer_size != 0) || (mic_2m_buffer_size !=0)) {
|
|
// available only in MPSS 4.2 and greater
|
|
if (COI::ProcessSetCacheSize != 0 ) {
|
|
int flags;
|
|
// Need compiler to use MPSS 3.2 or greater to get these
|
|
// definition so currently hardcoding it
|
|
// COI_CACHE_ACTION_GROW_NOW && COI_CACHE_MODE_ONDEMAND_SYNC;
|
|
flags = 0x00020002;
|
|
res = COI::ProcessSetCacheSize(
|
|
m_process, // in_Process
|
|
mic_2m_buffer_size, // in_HugePagePoolSize
|
|
flags, // inHugeFlags
|
|
mic_4k_buffer_size, // in_SmallPagePoolSize
|
|
flags, // inSmallFlags
|
|
0, // in_NumDependencies
|
|
0, // in_pDependencies
|
|
0 // out_PCompletion
|
|
);
|
|
OFFLOAD_DEBUG_TRACE(2,
|
|
"Reserve target buffers 4K pages = %d 2M pages = %d\n",
|
|
mic_4k_buffer_size, mic_2m_buffer_size);
|
|
check_result(res, c_process_set_cache_size, m_index, res);
|
|
}
|
|
else {
|
|
OFFLOAD_DEBUG_TRACE(2,
|
|
"Reserve target buffers not supported in current MPSS\n");
|
|
}
|
|
}
|
|
|
|
// get function handles
|
|
res = COI::ProcessGetFunctionHandles(m_process, c_funcs_total,
|
|
m_func_names, m_funcs);
|
|
check_result(res, c_process_get_func_handles, m_index, res);
|
|
|
|
// initialize device side
|
|
pid_t pid = init_device();
|
|
|
|
// For IDB
|
|
if (__dbg_is_attached) {
|
|
// TODO: we have in-memory executable now.
|
|
// Check with IDB team what should we provide them now?
|
|
if (__target_exe == 0) {
|
|
strcpy(__dbg_target_exe_name, "offload_main");
|
|
}
|
|
else {
|
|
if (strlen(__target_exe->name) < MAX_TARGET_NAME) {
|
|
strcpy(__dbg_target_exe_name, __target_exe->name);
|
|
}
|
|
}
|
|
__dbg_target_so_pid = pid;
|
|
__dbg_target_id = m_physical_index;
|
|
// The call to __dbg_target_so_loaded() is moved
|
|
// to Engine:init so all the libraries are loaded before
|
|
// informing debugger so debugger can access them.
|
|
// __dbg_target_so_loaded();
|
|
}
|
|
}
|
|
|
|
void Engine::fini_process(bool verbose)
|
|
{
|
|
if (m_process != 0) {
|
|
uint32_t sig;
|
|
int8_t ret;
|
|
|
|
// destroy target process
|
|
OFFLOAD_DEBUG_TRACE(2, "Destroying process on the device %d\n",
|
|
m_index);
|
|
|
|
COIRESULT res = COI::ProcessDestroy(m_process, -1, 0, &ret, &sig);
|
|
m_process = 0;
|
|
|
|
if (res == COI_SUCCESS) {
|
|
OFFLOAD_DEBUG_TRACE(3, "Device process: signal %d, exit code %d\n",
|
|
sig, ret);
|
|
if (verbose) {
|
|
if (sig != 0) {
|
|
LIBOFFLOAD_ERROR(
|
|
c_mic_process_exit_sig, m_index, sig,
|
|
c_signal_names[sig >= c_signal_max ? 0 : sig]);
|
|
}
|
|
else {
|
|
LIBOFFLOAD_ERROR(c_mic_process_exit_ret, m_index, ret);
|
|
}
|
|
}
|
|
|
|
// for idb
|
|
if (__dbg_is_attached) {
|
|
__dbg_target_so_unloaded();
|
|
}
|
|
}
|
|
else {
|
|
if (verbose) {
|
|
LIBOFFLOAD_ERROR(c_mic_process_exit, m_index);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Engine::load_libraries()
|
|
{
|
|
// load libraries collected so far
|
|
for (TargetImageList::iterator it = m_images.begin();
|
|
it != m_images.end(); it++) {
|
|
OFFLOAD_DEBUG_TRACE(2,
|
|
"Loading library \"%s\" from %p, size %llu, host file %s\n",
|
|
it->name, it->data, it->size, it->origin);
|
|
|
|
// load library to the device
|
|
COILIBRARY lib;
|
|
COIRESULT res;
|
|
res = COI::ProcessLoadLibraryFromMemory(m_process,
|
|
it->data,
|
|
it->size,
|
|
it->name,
|
|
mic_library_path,
|
|
it->origin,
|
|
(it->origin) ? -1 : 0,
|
|
COI_LOADLIBRARY_V1_FLAGS,
|
|
&lib);
|
|
m_dyn_libs.push_front(DynLib(it->name, it->data, lib));
|
|
|
|
if (res != COI_SUCCESS && res != COI_ALREADY_EXISTS) {
|
|
check_result(res, c_load_library, it->origin, m_index, res);
|
|
}
|
|
}
|
|
m_images.clear();
|
|
}
|
|
|
|
void Engine::unload_library(const void *data, const char *name)
|
|
{
|
|
if (m_process == 0) {
|
|
return;
|
|
}
|
|
for (DynLibList::iterator it = m_dyn_libs.begin();
|
|
it != m_dyn_libs.end(); it++) {
|
|
if (it->data == data) {
|
|
COIRESULT res;
|
|
OFFLOAD_DEBUG_TRACE(2,
|
|
"Unloading library \"%s\"\n",name);
|
|
res = COI::ProcessUnloadLibrary(m_process,it->lib);
|
|
m_dyn_libs.erase(it);
|
|
if (res != COI_SUCCESS) {
|
|
check_result(res, c_unload_library, m_index, res);
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool target_entry_cmp(
|
|
const VarList::BufEntry &l,
|
|
const VarList::BufEntry &r
|
|
)
|
|
{
|
|
const char *l_name = reinterpret_cast<const char*>(l.name);
|
|
const char *r_name = reinterpret_cast<const char*>(r.name);
|
|
return strcmp(l_name, r_name) < 0;
|
|
}
|
|
|
|
static bool host_entry_cmp(
|
|
const VarTable::Entry *l,
|
|
const VarTable::Entry *r
|
|
)
|
|
{
|
|
return strcmp(l->name, r->name) < 0;
|
|
}
|
|
|
|
void Engine::init_ptr_data(void)
|
|
{
|
|
COIRESULT res;
|
|
COIEVENT event;
|
|
|
|
// Prepare table of host entries
|
|
std::vector<const VarTable::Entry*> host_table(
|
|
Iterator(__offload_vars.get_head()),
|
|
Iterator());
|
|
|
|
// no need to do anything further is host table is empty
|
|
if (host_table.size() <= 0) {
|
|
return;
|
|
}
|
|
|
|
// Get var table entries from the target.
|
|
// First we need to get size for the buffer to copy data
|
|
struct {
|
|
int64_t nelems;
|
|
int64_t length;
|
|
} params;
|
|
|
|
res = COI::PipelineRunFunction(get_pipeline(),
|
|
m_funcs[c_func_var_table_size],
|
|
0, 0, 0,
|
|
0, 0,
|
|
0, 0,
|
|
¶ms, sizeof(params),
|
|
&event);
|
|
check_result(res, c_pipeline_run_func, m_index, res);
|
|
|
|
res = COI::EventWait(1, &event, -1, 1, 0, 0);
|
|
check_result(res, c_event_wait, res);
|
|
|
|
if (params.length == 0) {
|
|
return;
|
|
}
|
|
|
|
// create buffer for target entries and copy data to host
|
|
COIBUFFER buffer;
|
|
res = COI::BufferCreate(params.length, COI_BUFFER_NORMAL, 0, 0, 1,
|
|
&m_process, &buffer);
|
|
check_result(res, c_buf_create, m_index, res);
|
|
|
|
COI_ACCESS_FLAGS flags = COI_SINK_WRITE;
|
|
res = COI::PipelineRunFunction(get_pipeline(),
|
|
m_funcs[c_func_var_table_copy],
|
|
1, &buffer, &flags,
|
|
0, 0,
|
|
¶ms.nelems, sizeof(params.nelems),
|
|
0, 0,
|
|
&event);
|
|
check_result(res, c_pipeline_run_func, m_index, res);
|
|
|
|
res = COI::EventWait(1, &event, -1, 1, 0, 0);
|
|
check_result(res, c_event_wait, res);
|
|
|
|
// patch names in target data
|
|
VarList::BufEntry *target_table;
|
|
COIMAPINSTANCE map_inst;
|
|
res = COI::BufferMap(buffer, 0, params.length, COI_MAP_READ_ONLY, 0, 0,
|
|
0, &map_inst,
|
|
reinterpret_cast<void**>(&target_table));
|
|
check_result(res, c_buf_map, res);
|
|
|
|
VarList::table_patch_names(target_table, params.nelems);
|
|
|
|
// and sort entries
|
|
std::sort(target_table, target_table + params.nelems, target_entry_cmp);
|
|
std::sort(host_table.begin(), host_table.end(), host_entry_cmp);
|
|
|
|
// merge host and target entries and enter matching vars map
|
|
std::vector<const VarTable::Entry*>::const_iterator hi =
|
|
host_table.begin();
|
|
std::vector<const VarTable::Entry*>::const_iterator he =
|
|
host_table.end();
|
|
const VarList::BufEntry *ti = target_table;
|
|
const VarList::BufEntry *te = target_table + params.nelems;
|
|
|
|
while (hi != he && ti != te) {
|
|
int res = strcmp((*hi)->name, reinterpret_cast<const char*>(ti->name));
|
|
if (res == 0) {
|
|
bool is_new;
|
|
// add matching entry to var map
|
|
PtrData *ptr = insert_ptr_data((*hi)->addr, (*hi)->size, is_new);
|
|
|
|
// store address for new entries
|
|
if (is_new) {
|
|
ptr->mic_addr = ti->addr;
|
|
ptr->is_static = true;
|
|
ptr->var_alloc_type = (*hi)->var_alloc_type;
|
|
}
|
|
ptr->alloc_ptr_data_lock.unlock();
|
|
hi++;
|
|
ti++;
|
|
}
|
|
else if (res < 0) {
|
|
hi++;
|
|
}
|
|
else {
|
|
ti++;
|
|
}
|
|
}
|
|
|
|
// cleanup
|
|
res = COI::BufferUnmap(map_inst, 0, 0, 0);
|
|
check_result(res, c_buf_unmap, res);
|
|
|
|
res = COI::BufferDestroy(buffer);
|
|
check_result(res, c_buf_destroy, res);
|
|
}
|
|
|
|
// Starts the target "compute" function on a pipeline.
//   stream    - stream whose pipeline is used; no_stream selects the
//               pipeline of the calling thread.
//   buffers   - COI buffers passed to the function (all with
//               COI_SINK_WRITE access).
//   data      - misc data passed to the function, data_size bytes.
//   ret       - storage for the function's return value, ret_size bytes.
//   num_deps  - number of events in deps.
//   deps      - events the run depends on.
//   event     - completion event passed through to COI.
// Returns the result of COI::PipelineRunFunction.
COIRESULT Engine::compute(
    _Offload_stream stream,
    const std::list<COIBUFFER> &buffers,
    const void*         data,
    uint16_t            data_size,
    void*               ret,
    uint16_t            ret_size,
    uint32_t            num_deps,
    const COIEVENT*     deps,
    COIEVENT*           event
) /* const */
{
    COIBUFFER *bufs = 0;
    COI_ACCESS_FLAGS *flags = 0;

    // convert buffers list to array
    int num_bufs = buffers.size();
    if (num_bufs > 0) {
        // alloca storage stays valid until this function returns.
        bufs = (COIBUFFER*) alloca(num_bufs * sizeof(COIBUFFER));
        flags = (COI_ACCESS_FLAGS*) alloca(num_bufs *
                                           sizeof(COI_ACCESS_FLAGS));

        int slot = 0;
        std::list<COIBUFFER>::const_iterator it = buffers.begin();
        while (it != buffers.end()) {
            bufs[slot] = *it;

            // TODO: this should be fixed
            flags[slot] = COI_SINK_WRITE;

            slot++;
            it++;
        }
    }

    COIPIPELINE pipeline;
    if (stream == no_stream) {
        pipeline = get_pipeline();
    }
    else {
        pipeline = get_pipeline(stream);
    }

    // start computation
    return COI::PipelineRunFunction(pipeline,
                                    m_funcs[c_func_compute],
                                    num_bufs, bufs, flags,
                                    num_deps, deps,
                                    data, data_size,
                                    ret, ret_size,
                                    event);
}
|
|
|
|
pid_t Engine::init_device(void)
|
|
{
|
|
struct init_data {
|
|
int device_index;
|
|
int devices_total;
|
|
int console_level;
|
|
int offload_report_level;
|
|
} data;
|
|
COIRESULT res;
|
|
COIEVENT event;
|
|
pid_t pid;
|
|
|
|
OFFLOAD_DEBUG_TRACE_1(2, 0, c_offload_init,
|
|
"Initializing device with logical index %d "
|
|
"and physical index %d\n",
|
|
m_index, m_physical_index);
|
|
|
|
// setup misc data
|
|
data.device_index = m_index;
|
|
data.devices_total = mic_engines_total;
|
|
data.console_level = console_enabled;
|
|
data.offload_report_level = offload_report_level;
|
|
|
|
res = COI::PipelineRunFunction(get_pipeline(),
|
|
m_funcs[c_func_init],
|
|
0, 0, 0, 0, 0,
|
|
&data, sizeof(data),
|
|
&pid, sizeof(pid),
|
|
&event);
|
|
check_result(res, c_pipeline_run_func, m_index, res);
|
|
|
|
res = COI::EventWait(1, &event, -1, 1, 0, 0);
|
|
check_result(res, c_event_wait, res);
|
|
|
|
OFFLOAD_DEBUG_TRACE(2, "Device process pid is %d\n", pid);
|
|
|
|
return pid;
|
|
}
|
|
|
|
// data associated with each thread
|
|
struct Thread {
|
|
Thread(long* addr_coipipe_counter) {
|
|
m_addr_coipipe_counter = addr_coipipe_counter;
|
|
memset(m_pipelines, 0, sizeof(m_pipelines));
|
|
}
|
|
|
|
~Thread() {
|
|
#ifndef TARGET_WINNT
|
|
__sync_sub_and_fetch(m_addr_coipipe_counter, 1);
|
|
#else // TARGET_WINNT
|
|
_InterlockedDecrement(m_addr_coipipe_counter);
|
|
#endif // TARGET_WINNT
|
|
for (int i = 0; i < mic_engines_total; i++) {
|
|
if (m_pipelines[i] != 0) {
|
|
COI::PipelineDestroy(m_pipelines[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
COIPIPELINE get_pipeline(int index) const {
|
|
return m_pipelines[index];
|
|
}
|
|
|
|
void set_pipeline(int index, COIPIPELINE pipeline) {
|
|
m_pipelines[index] = pipeline;
|
|
}
|
|
|
|
AutoSet& get_auto_vars() {
|
|
return m_auto_vars;
|
|
}
|
|
|
|
private:
|
|
long* m_addr_coipipe_counter;
|
|
AutoSet m_auto_vars;
|
|
COIPIPELINE m_pipelines[MIC_ENGINES_MAX];
|
|
};
|
|
|
|
// Returns the pipeline the calling thread uses on this engine, creating
// the per-thread data and the pipeline on first use.
// When m_assigned_cpus is null the pipeline is created without a CPU mask;
// otherwise the mask contains every assigned thread except thread 0.
// Aborts when the pipeline counter exceeds COI_PIPELINE_MAX_PIPELINES.
COIPIPELINE Engine::get_pipeline(void)
{
    Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
    if (thread == 0) {
        thread = new Thread(&m_proc_number);
        thread_setspecific(mic_thread_key, thread);
    }

    COIPIPELINE pipeline = thread->get_pipeline(m_index);
    if (pipeline == 0) {
        COIRESULT res;
        int proc_num;

        // NOTE(review): __sync_fetch_and_add yields the value before the
        // increment while _InterlockedIncrement yields the value after it,
        // so the limit check below may differ by one between platforms -
        // confirm which is intended.
#ifndef TARGET_WINNT
        proc_num = __sync_fetch_and_add(&m_proc_number, 1);
#else // TARGET_WINNT
        proc_num = _InterlockedIncrement(&m_proc_number);
#endif // TARGET_WINNT

        if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
            LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
            LIBOFFLOAD_ABORT;
        }

        // Create pipeline for this thread
        if (m_assigned_cpus == 0) {
            // If m_assigned_cpus is NULL, it implies all threads
            // Create the pipeline with no CPU mask
            res = COI::PipelineCreate(m_process, 0, mic_stack_size, &pipeline);
        } else {
            // Create COI CPU mask
            COI_CPU_MASK  in_Mask;
            res = COI::PipelineClearCPUMask(in_Mask);
            check_result(res, c_clear_cpu_mask, m_index, res);

            // Available threads are defined by examining of m_assigned_cpus bitset.
            // We skip thread 0.
            for (int i = 1; i < m_num_threads; i++) {
                // For available thread i m_assigned_cpus[i] is equal to 1
                if ((*m_assigned_cpus)[i]) {
                    COI_CPU_MASK_SET(i, in_Mask);
                }
            }
            OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this CPU thread\n"
               "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
               "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
               in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3],
               in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7],
               in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11],
               in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]);

            // Create the pipeline with allowable CPUs
            res = COI::PipelineCreate(m_process, in_Mask, mic_stack_size, &pipeline);
        }
        check_result(res, c_pipeline_create, m_index, res);
        // Remember the pipeline so later calls from this thread reuse it.
        thread->set_pipeline(m_index, pipeline);
    }
    return pipeline;
}
|
|
|
|
// Looks up a stream by handle in all_streams while holding m_stream_lock.
//   handle - handle of the stream to find.
//   remove - when true, a found stream is also erased from the map (the
//            Stream object itself is not deleted here).
// Returns the stream, or 0 when the handle is unknown.
Stream* Stream::find_stream(uint64_t handle, bool remove)
{
    Stream *found = 0;

    m_stream_lock.lock();

    StreamMap::iterator pos = all_streams.find(handle);
    const bool known = (pos != all_streams.end());
    if (known) {
        found = pos->second;
    }
    if (known && remove) {
        all_streams.erase(pos);
    }

    m_stream_lock.unlock();

    return found;
}
|
|
|
|
// Moves the list element cpu_what so that it directly follows cpu_after in
// the doubly linked cpu list, updating m_cpu_head when cpu_what was the
// head. Calling it with cpu_what == cpu_after does nothing.
void Engine::move_cpu_el_after(CpuEl* cpu_what, CpuEl* cpu_after)
{
    if (cpu_what == cpu_after) {
        return;
    }

    CpuEl* before = cpu_what->prev;
    CpuEl* behind = cpu_what->next;

    // unlink cpu_what from its current position
    if (before) {
        before->next = behind;
    }
    else {
        m_cpu_head = behind;
    }
    if (behind) {
        behind->prev = before;
    }

    // link cpu_what in right after cpu_after; the successor is read only
    // now because unlinking may just have changed cpu_after->next
    CpuEl* successor = cpu_after->next;
    cpu_what->prev = cpu_after;
    cpu_what->next = successor;
    if (successor) {
        successor->prev = cpu_what;
    }
    cpu_after->next = cpu_what;
}
|
|
|
|
// Returns the pipeline of the stream identified by `handle`, creating it
// on first use. Creation, done while holding m_stream_lock:
//   - starts the target process if needed,
//   - picks the stream's threads from the head of the usage-ordered cpu
//     list and records them in the stream,
//   - increments the usage count of the picked threads and re-sorts them
//     within the list,
//   - creates the COI pipeline with the resulting CPU mask,
//   - sends the affinity specification (type taken from the
//     OFFLOAD_STREAM_AFFINITY environment variable, "compact" by default)
//     to the target.
// Aborts when the handle is unknown or the pipeline counter exceeds
// COI_PIPELINE_MAX_PIPELINES.
COIPIPELINE Engine::get_pipeline(_Offload_stream handle)
{
    Stream * stream = Stream::find_stream(handle, false);

    if (!stream) {
        LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
        LIBOFFLOAD_ABORT;
    }

    COIPIPELINE pipeline = stream->get_pipeline();

    if (pipeline == 0) {
        COIRESULT     res;
        int           proc_num;
        COI_CPU_MASK  in_Mask ;

#ifndef TARGET_WINNT
        proc_num = __sync_fetch_and_add(&m_proc_number, 1);
#else // TARGET_WINNT
        proc_num = _InterlockedIncrement(&m_proc_number);
#endif // TARGET_WINNT

        if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
            LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
            LIBOFFLOAD_ABORT;
        }

        m_stream_lock.lock();

        // start process if not done yet
        if (m_process == 0) {
            init_process();
        }

        // create CPUmask
        res = COI::PipelineClearCPUMask(in_Mask);
        check_result(res, c_clear_cpu_mask, m_index, res);

        int stream_cpu_num = stream->get_cpu_number();

        stream->m_stream_cpus.reset();

        // Available threads is taken from m_cpus list.
        // m_cpu_head points to the head of m_cpus.
        // the elements of m_cpus is ordered by the number of usage in streams.

        CpuEl *cpu_el = m_cpu_head;
        CpuEl *cpu_used_el, *cpu_used_prev, *cpu_prev;

        for (int i = 0; i < stream_cpu_num; i++) {
            COI_CPU_MASK_SET(CPU_INDEX(cpu_el), in_Mask);
            stream->m_stream_cpus.set(CPU_INDEX(cpu_el));
            // If the number of available threads is less than stream_cpu_num,
            // the stream_cpu_num is restricted to this number.
            if (!cpu_el->next) {
                break;
            }
            // Do not advance past the last thread that was picked.
            if (i + 1 < stream_cpu_num) {
                cpu_el = cpu_el->next;
            }
        }

        // assertion : cpu_el points to the last used thread
        // Walk back from the last picked thread towards the head, bumping
        // each usage count and moving the element to its sorted position.
        cpu_used_el = cpu_el;
        while (cpu_used_el) {
            cpu_used_el->count++;
            cpu_el = cpu_prev = cpu_used_el;
            // Remember the predecessor before the element may be moved.
            cpu_used_prev = cpu_used_el->prev;
            if (!cpu_el->next) {
                // Already the last element: nothing to reorder.
                cpu_used_el = cpu_used_prev;
                continue;
            }

            while (cpu_el) {
                if (cpu_used_el->count < cpu_el->count) {
                    break;
                }
                // Equal used threads are ordered by thread number to
                // assign to a stream as contiguous threads as possible.
                else if (cpu_used_el->count == cpu_el->count &&
                         CPU_INDEX(cpu_used_el) < CPU_INDEX(cpu_el)) {
                    break;
                }
                cpu_prev = cpu_el;
                cpu_el = cpu_el->next;
            }
            if (cpu_used_el != cpu_prev) {
                move_cpu_el_after(cpu_used_el, cpu_prev);
            }
            cpu_used_el = cpu_used_prev;
        }
        print_stream_cpu_list("get_pipeline");

        // create pipeline for this thread
        OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this Stream\n"
               "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
               "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
               in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3],
               in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7],
               in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11],
               in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]);
        res = COI::PipelineCreate(m_process, in_Mask,
                                  mic_stack_size, &pipeline);
        check_result(res, c_pipeline_create, m_index, res);

        // Set stream's affinities
        {
            struct affinity_spec affinity_spec;
            char* affinity_type;
            int i;

            // "compact" by default
            affinity_spec.affinity_type = affinity_compact;

            // Check if user has specified type of affinity
            if ((affinity_type = getenv("OFFLOAD_STREAM_AFFINITY")) !=
                                        NULL)
            {
                char affinity_str[16];
                int affinity_str_len;

                OFFLOAD_DEBUG_TRACE(2,
                    "User has specified OFFLOAD_STREAM_AFFINITY=%s\n",
                    affinity_type);

                // Set type of affinity requested
                // Lower-case at most 15 characters into the local buffer.
                affinity_str_len = strlen(affinity_type);
                for (i=0; i<affinity_str_len && i<15; i++)
                {
                    // tolower() is only defined for unsigned char values
                    // (and EOF), so convert before the call.
                    affinity_str[i] =
                        tolower((unsigned char) affinity_type[i]);
                }
                affinity_str[i] = '\0';
                if (strcmp(affinity_str, "compact") == 0) {
                    affinity_spec.affinity_type = affinity_compact;
                    OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
                } else if (strcmp(affinity_str, "scatter") == 0) {
                    affinity_spec.affinity_type = affinity_scatter;
                    OFFLOAD_DEBUG_TRACE(2, "Setting affinity=scatter\n");
                } else {
                    // Unknown value: report it and fall back to compact.
                    LIBOFFLOAD_ERROR(c_incorrect_affinity, affinity_str);
                    affinity_spec.affinity_type = affinity_compact;
                    OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
                }
            }
            // Make flat copy of sink mask because COI's mask is opaque
            for (i=0; i<16; i++) {
                affinity_spec.sink_mask[i] = in_Mask[i];
            }
            // Set number of cores and threads
            affinity_spec.num_cores = m_num_cores;
            affinity_spec.num_threads = m_num_threads;

            COIEVENT event;
            res = COI::PipelineRunFunction(pipeline,
                                   m_funcs[c_func_set_stream_affinity],
                                   0, 0, 0,
                                   0, 0,
                                   &affinity_spec, sizeof(affinity_spec),
                                   0, 0,
                                   &event);
            check_result(res, c_pipeline_run_func, m_index, res);

            res = COI::EventWait(1, &event, -1, 1, 0, 0);
            check_result(res, c_event_wait, res);
        }

        m_stream_lock.unlock();
        stream->set_pipeline(pipeline);
    }
    return pipeline;
}
|
|
|
|
// Destroys the stream identified by `handle`: removes it from the stream
// map, gives its threads back by decrementing their usage counts and
// re-sorting them within the cpu list, then deletes the Stream object.
// Reports an error and aborts when the handle is unknown.
void Engine::stream_destroy(_Offload_stream handle)
{
    // get stream
    Stream * stream = Stream::find_stream(handle, true);

    if (!stream) {
        LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
        LIBOFFLOAD_ABORT;
        return;
    }

    // return cpus for future use
    for (int i = 0; i < m_num_threads; i++) {
        if (!stream->m_stream_cpus.test(i)) {
            continue;
        }

        // decrease count of thread "i" and move its CpuEl to the
        // proper place into the ordered list
        CpuEl *released = m_cpus + i;
        released->count--;

        // Walk towards the head until an element is found that must stay
        // in front of the released one: a smaller count, or the same count
        // with a smaller thread index.
        CpuEl *probe = released;
        while (probe->prev) {
            CpuEl *before = probe->prev;
            if (released->count > before->count) {
                break;
            }
            if (released->count == before->count &&
                CPU_INDEX(released) > CPU_INDEX(before)) {
                break;
            }
            probe = before;
        }

        // The element the released one has to follow; null means it
        // belongs at the head of the list.
        CpuEl *anchor = probe->prev;
        if (released == anchor) {
            continue;
        }

        if (anchor) {
            move_cpu_el_after(released, anchor);
            continue;
        }

        // Thread "i" is used the least times. It must be set as
        // the m_cpu_head - unless it already has no predecessor.
        if (!released->prev) {
            continue;
        }

        // unlink the released element
        released->prev->next = released->next;
        if (released->next) {
            released->next->prev = released->prev;
        }

        // and make it the new m_cpu_head
        released->prev = NULL;
        released->next = m_cpu_head;
        m_cpu_head->prev = released;
        m_cpu_head = released;
    }

    print_stream_cpu_list("stream_destroy");
    delete stream;
}
|
|
|
|
uint64_t Engine::get_thread_id(void)
|
|
{
|
|
Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
|
|
if (thread == 0) {
|
|
thread = new Thread(&m_proc_number);
|
|
thread_setspecific(mic_thread_key, thread);
|
|
}
|
|
|
|
return reinterpret_cast<uint64_t>(thread);
|
|
}
|
|
|
|
// Returns the calling thread's set of automatic variables, creating the
// per-thread Thread object on first use.
AutoSet& Engine::get_auto_vars(void)
{
    Thread* self = (Thread*) thread_getspecific(mic_thread_key);
    if (!self) {
        // First call from this thread: create and register its data.
        self = new Thread(&m_proc_number);
        thread_setspecific(mic_thread_key, self);
    }

    AutoSet& vars = self->get_auto_vars();
    return vars;
}
|
|
|
|
void Engine::destroy_thread_data(void *data)
|
|
{
|
|
delete static_cast<Thread*>(data);
|
|
}
|