reference, declarationdefinition
definition → references, declarations, derived classes, virtual overrides
reference to multiple definitions → definitions
unreferenced
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
  121
  122
  123
  124
  125
  126
  127
  128
  129
  130
  131
  132
  133
  134
  135
  136
  137
  138
  139
  140
  141
  142
  143
  144
  145
  146
  147
  148
  149
  150
  151
  152
  153
  154
  155
  156
  157
  158
  159
  160
  161
  162
  163
  164
  165
  166
  167
  168
  169
  170
  171
  172
  173
  174
  175
  176
  177
  178
  179
  180
  181
  182
  183
  184
  185
  186
  187
  188
  189
  190
  191
  192
  193
  194
  195
  196
  197
  198
  199
  200
  201
  202
  203
  204
  205
  206
  207
  208
  209
  210
  211
  212
  213
  214
  215
  216
  217
  218
  219
  220
  221
  222
  223
  224
  225
  226
//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//

////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
  // sched starts from 1..4; encode it as 0..3; so add 1 here
  uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
  return (omp_sched_t)rc;
}

INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
  // sched starts from 1..4; encode it as 0..3; so sub 1 here
  uint8_t val = ((uint8_t)sched) - 1;
  // clear current sched
  items.flags &= ~TaskDescr_SchedMask;
  // set new sched
  items.flags |= val;
}

INLINE void
omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
  // slow method
  // flag:
  //   default sched is static,
  //   dyn is off (unused now anyway, but may need to sample from host ?)
  //   not in parallel

  items.flags = 0;
  items.threadId = 0;         // is master
  items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
}

// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // slow method
  // flag:
  //   default sched is static,
  //   dyn is off (unused now anyway, but may need to sample from host ?)
  //   in L1 parallel

  items.flags =
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
  items.threadId =
      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
  items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
  prev = parentTaskDescr;
}

INLINE void omptarget_nvptx_TaskDescr::CopyData(
    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  items = sourceTaskDescr->items;
}

INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  CopyData(sourceTaskDescr);
  prev = sourceTaskDescr->prev;
}

INLINE void omptarget_nvptx_TaskDescr::CopyParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyData(parentTaskDescr);
  prev = parentTaskDescr;
}

INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyParent(parentTaskDescr);
  items.flags = items.flags & ~TaskDescr_IsParConstr;
  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}

INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
    omptarget_nvptx_TaskDescr *masterTaskDescr) {
  CopyParent(masterTaskDescr);
  // overrwrite specific items;
  items.flags |=
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
}

INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
    omptarget_nvptx_TaskDescr *workTaskDescr) {
  Copy(workTaskDescr);
  //
  // overrwrite specific items;
  //
  // The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
  // This is so that the serial master (first lane in the master warp)
  // gets a threadId of 0.
  // However, we know that this function is always called in a parallel
  // region where only workers are active.  The serial master thread
  // never enters this region.  When a parallel region is executed serially,
  // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
  // are called, which never activate this region.
  items.threadId =
      GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
}

INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
  CopyParent(parentTaskDescr);
  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
  items.threadId = tid;
}

INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
  loopData.loopUpperBound =
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
  loopData.nextLowerBound =
      omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
  loopData.schedule =
      omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
  loopData.stride =
      omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
}

INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
  omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
  omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
      loopData.loopUpperBound;
  omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
      loopData.nextLowerBound;
  omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
      loopData.stride;
  omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
      loopData.schedule;
}

////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
  ASSERT0(
      LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
      "Getting top level, tid is larger than allocated data structure size");
  return topTaskDescr[tid];
}

INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
  // levelOneTaskDescr is init when starting the parallel region
  // top task descr is NULL (team master version will be fixed separately)
  topTaskDescr[tid] = NULL;
  // no num threads value has been pushed
  nextRegion.tnum[tid] = 0;
  // the following don't need to be init here; they are init when using dyn
  // sched
  // current_Event, events_Number, chunk, num_Iterations, schedule
}

////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
  levelZeroTaskDescr.InitLevelZeroTaskDescr();
}

////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////

// Utility routines for CUDA threads
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
  return omptarget_nvptx_threadPrivateContext->TeamContext();
}

INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
  return currTeamDescr.WorkDescr();
}

INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
  return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
}

INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
}

////////////////////////////////////////////////////////////////////////////////
// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
          "MemIdx is too big or uninitialized.");
  MemDataTy &MD = MemData[usedSlotIdx];
  atomicExch((unsigned *)&MD.keys[usedMemIdx], 0);
}

INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
                                                                size_t size) {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  const unsigned sm = usedSlotIdx;
  MemDataTy &MD = MemData[sm];
  unsigned i = hash(GetBlockIdInKernel());
  while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) {
    i = hash(i + 1);
  }
  usedSlotIdx = sm;
  usedMemIdx = i;
  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
}