1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
//===------------ sync.h - NVPTX OpenMP synchronizations --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Include all synchronization.
//
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// KMP Ordered calls
////////////////////////////////////////////////////////////////////////////////
// Runtime entry point paired with __kmpc_end_ordered.
// As implemented here it only emits a debug trace; neither `loc` nor `tid`
// is consulted and no synchronization is performed.
EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
  (void)loc;
  (void)tid;
  PRINT0(LD_IO, "call kmpc_ordered\n");
}
// Runtime entry point paired with __kmpc_ordered.
// As implemented here it only emits a debug trace; neither `loc` nor `tid`
// is consulted and no synchronization is performed.
EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
  (void)loc;
  (void)tid;
  PRINT0(LD_IO, "call kmpc_end_ordered\n");
}
////////////////////////////////////////////////////////////////////////////////
// KMP Barriers
////////////////////////////////////////////////////////////////////////////////
// a team is a block: we can use CUDA native synchronization mechanism
// FIXME: what if not all threads (warps) participate in the barrier?
// We may need to implement it differently
// Cancellation-flavoured barrier entry point.
//
// Forwards to __kmpc_barrier with the same arguments and then returns 0
// unconditionally; this implementation never produces any other value.
// NOTE(review): 0 presumably means "no cancellation observed" -- confirm
// against the KMP interface definition.
//
//   loc_ref  source-location descriptor, passed through to __kmpc_barrier.
//   tid      thread id, passed through to __kmpc_barrier.
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_cancel_barrier\n");

  // All of the actual synchronization is delegated to the plain barrier.
  __kmpc_barrier(loc_ref, tid);

  PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
  return 0;
}
// General barrier entry point.
//
// Dispatch, in order:
//   1. Runtime uninitialized: SPMD mode is asserted and the simple SPMD
//      barrier is used. No "completed" trace is emitted on this path.
//   2. At most one OpenMP thread: no barrier is needed, only a memory flush.
//   3. SPMD mode: the simple SPMD barrier is used.
//   4. Otherwise: a named barrier (L1_BARRIER) over the OpenMP thread count
//      rounded up to a multiple of WARPSIZE.
//
//   loc_ref  source-location descriptor; used for the runtime/SPMD checks.
//   tid      thread id; replaced by the logical thread id in the block once
//            the runtime is known to be initialized.
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
  if (checkRuntimeUninitialized(loc_ref)) {
    ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref),
            "Expected SPMD mode with uninitialized runtime.");
    __kmpc_barrier_simple_spmd(loc_ref, tid);
    return;
  }

  tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
  const int ompThreadCount = GetNumberOfOmpThreads(checkSPMDMode(loc_ref));

  if (ompThreadCount <= 1) {
    // Nothing to wait for, but the standard still requires a memory flush.
    __kmpc_flush(loc_ref);
  } else if (checkSPMDMode(loc_ref)) {
    __kmpc_barrier_simple_spmd(loc_ref, tid);
  } else {
    // The thread count handed to the named barrier must be a multiple of
    // WARPSIZE, so round the active thread count up to whole warps.
    const int warpCount = (ompThreadCount + WARPSIZE - 1) / WARPSIZE;
    const int syncCount = WARPSIZE * warpCount;
    PRINT(LD_SYNC,
          "call kmpc_barrier with %d omp threads, sync parameter %d\n",
          (int)ompThreadCount, (int)syncCount);
    // Barrier #1 synchronizes the active threads.
    named_sync(L1_BARRIER, syncCount);
  }

  PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
// Simple barrier for SPMD mode.
//
// Synchronizes through __kmpc_impl_syncthreads(), bracketed by debug traces.
// The caller is assumed to be in an L0 parallel region with every worker
// thread participating. Neither `loc_ref` nor `tid` is read.
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
  (void)loc_ref;
  (void)tid;

  PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
  __kmpc_impl_syncthreads();
  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}
// Simple barrier for Generic mode.
//
// Issues a named barrier (L1_BARRIER) across the worker threads. The caller
// is assumed to be in an L0 parallel region with every worker thread
// participating. Neither `loc_ref` nor `tid` is read.
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
  (void)loc_ref;
  (void)tid;

  // One warp's worth of threads is left out of the count.
  // NOTE(review): presumably this is the master warp -- confirm.
  const int workerCount = GetNumberOfThreadsInBlock() - WARPSIZE;

  // The thread count handed to the named barrier must be a multiple of
  // WARPSIZE, so round up to whole warps.
  const int warpCount = (workerCount + WARPSIZE - 1) / WARPSIZE;
  const int syncCount = WARPSIZE * warpCount;

  PRINT(LD_SYNC,
        "call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
        "%d\n",
        (int)workerCount, (int)syncCount);

  // Barrier #1 synchronizes the active threads.
  named_sync(L1_BARRIER, syncCount);

  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
}
////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////
// Entry check for a master region.
//
// Returns the result of IsTeamMaster(global_tid) converted to int32_t.
// `loc` is not used.
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
  (void)loc;
  PRINT0(LD_IO, "call kmpc_master\n");

  const int32_t isMaster = IsTeamMaster(global_tid);
  return isMaster;
}
// Exit hook for a master region.
//
// Emits a debug trace and asserts (LT_FUSSY level) that the calling thread
// is the team master. `loc` is not used.
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
  (void)loc;
  PRINT0(LD_IO, "call kmpc_end_master\n");

  // Only the thread that entered the region is expected to leave it.
  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
}
////////////////////////////////////////////////////////////////////////////////
// KMP SINGLE
////////////////////////////////////////////////////////////////////////////////
// Entry check for a single region.
//
// `single` is implemented on top of `master`: the team master is the thread
// that gets the single region, so this returns IsTeamMaster(global_tid)
// converted to int32_t. `loc` is not used.
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
  (void)loc;
  PRINT0(LD_IO, "call kmpc_single\n");

  const int32_t getsSingle = IsTeamMaster(global_tid);
  return getsSingle;
}
// Exit hook for a single region.
//
// Emits a debug trace and asserts (LT_FUSSY level) that the calling thread
// is the team master, matching the master-based implementation of `single`.
// `loc` is not used.
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
  (void)loc;
  PRINT0(LD_IO, "call kmpc_end_single\n");

  // Since single is implemented with master, only the master may get here.
  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");

  // No barrier is issued here: the synchronizing barrier is called
  // explicitly elsewhere, so its absence is not a problem.
}
////////////////////////////////////////////////////////////////////////////////
// Flush
////////////////////////////////////////////////////////////////////////////////
// Memory flush entry point.
//
// Emits a debug trace and then issues __threadfence(). `loc` is not used.
EXTERN void __kmpc_flush(kmp_Ident *loc) {
  (void)loc;
  PRINT0(LD_IO, "call kmpc_flush\n");
  __threadfence();
}
////////////////////////////////////////////////////////////////////////////////
// Vote
////////////////////////////////////////////////////////////////////////////////
// Vote entry point.
//
// Emits a debug trace and returns whatever __kmpc_impl_activemask() reports
// at the point of the call.
EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
  PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");

  const __kmpc_impl_lanemask_t activeLanes = __kmpc_impl_activemask();
  return activeLanes;
}
////////////////////////////////////////////////////////////////////////////////
// Syncwarp
////////////////////////////////////////////////////////////////////////////////
// Warp-level synchronization entry point.
//
// Emits a debug trace and forwards `Mask` unchanged to
// __kmpc_impl_syncwarp().
//
//   Mask  lane mask handed to the underlying implementation.
EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) {
  PRINT0(LD_IO, "call __kmpc_syncwarp\n");
  __kmpc_impl_syncwarp(Mask);
}
|